Full text
52,414 characters
· extracted from
preprint-html
· click to expand
A reproducible open-source framework for defining type 1 and type 2 diabetes research cohorts in routinely collected electronic health record data | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search A reproducible open-source framework for defining type 1 and type 2 diabetes research cohorts in routinely collected electronic health record data View ORCID Profile Rhian Hopkins , Pedro Cardoso , Laura M Güdemann , Andrew P McGovern , John M Dennis , Beverley M Shields , Katherine G Young doi: https://doi.org/10.1101/2025.03.26.25324678 Rhian Hopkins 1 Department of Clinical and Biomedical Sciences, University of Exeter Medical School , RILD Building, Royal Devon & Exeter Hospital , Barrack Road, Exeter, EX2 5DW, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Rhian Hopkins For correspondence: rh530{at}exeter.ac.uk Pedro Cardoso 1 Department of Clinical and Biomedical Sciences, University of Exeter Medical School , RILD Building, Royal Devon & Exeter Hospital , Barrack Road, Exeter, EX2 5DW, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Laura M Güdemann 1 Department of Clinical and Biomedical Sciences, University of Exeter Medical School , RILD Building, Royal Devon & Exeter Hospital , Barrack Road, Exeter, EX2 5DW, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Andrew P McGovern 1 Department of Clinical and Biomedical Sciences, University of Exeter Medical School , RILD Building, Royal Devon & Exeter Hospital , Barrack Road, Exeter, EX2 5DW, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site John M Dennis 1 Department of Clinical and Biomedical Sciences, University of Exeter Medical School , RILD Building, Royal Devon & Exeter Hospital , Barrack Road, Exeter, EX2 5DW, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Beverley M Shields 1 Department of Clinical and Biomedical Sciences, University of Exeter Medical School , RILD Building, Royal Devon & Exeter Hospital , Barrack Road, Exeter, EX2 5DW, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Katherine G Young 1 Department of Clinical and Biomedical Sciences, University of Exeter Medical School , RILD Building, Royal Devon & Exeter Hospital , Barrack Road, Exeter, EX2 5DW, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Preview PDF Abstract Background Electronic health record data (EHR) data provide an increasingly important resource for studying people living with diabetes and their clinical outcomes, but robustly coding reproducible datasets is challenging. We aimed to develop a standardised data-processing framework for defining cohorts of people with type 1 and type 2 diabetes using EHR data. Methods We initially provide a standardised, generalisable procedure to develop clinically reviewed code lists to robustly define variables in EHR data. Using UK population-based data from primary care linked to hospital admission records (Clinical Practice Research Datalink [CPRD]), we develop and demonstrate a data-processing pipeline applicable to raw EHR data, using clinical code lists to define a population of individuals with diabetes and defining their diabetes diagnosis dates using the earliest recorded observation of diabetes (clinical code, high HbA1c test result, or prescription for glucose lowering therapy). Using a previously validated approach, we classify diabetes type (gold standard type 1, type 2) based on insulin prescriptions, diabetes type specific clinical codes, and age at diagnosis. Finally, we demonstrate how multiple research cohorts can be defined from this diabetes population based on a specific index date, including a range of baseline features (sociodemographic and lifestyle factors, biomarkers, comorbidities, medications) and key outcomes relevant to the research question. Results Application of the framework identified an incident cohort at diabetes diagnosis (type 1 diabetes (T1D): N = 10,480, mean age at diagnosis [SD] = 10.4 [4.8]; type 2 diabetes (T2D): N = 726,800, mean age at diagnosis [SD] = 60.5 [13.4]), a prevalent cohort actively registered with their GP practice on 01/02/2020 (T1D: N = 9,514, T2D: N = 559,905), and a T2D cohort initiating treatment with glucose- lowering therapies (N = 769,394 treatment initiations, considering 7 major medication classes). We publicly share our code lists and data processing code, making our research as transparent and reproducible as possible ( https://github.com/Exeter-Diabetes/CPRD-Cohort-scripts , https://github.com/Exeter-Diabetes/CPRD-Codelists/ ). Conclusions We have developed a flexible and reproducible framework to generate analysis-ready diabetes research cohorts in EHR data. The concepts of this framework are applicable to any EHR dataset and have been shared for use by other researchers. This approach could improve the quality and reproducibility of the diverse epidemiological and clinical diabetes studies using EHR worldwide. Introduction Diabetes is a major increasing health concern, currently affecting around 830 million people worldwide ( 1 , 2 ). The condition can result in serious complications such as kidney disease, heart attacks, and amputation ( 1 ). These outcomes can often be delayed or prevented through intervention and treatment ( 1 ), but the global burden of diabetes is rapidly increasing, and glycaemic control remains suboptimal for large proportions of patients ( 3 – 5 ). This highlights the need for more research. Electronic health records (EHR) are observational data that are generated and collected as part of routine clinical care, for example from consultation with a GP, or a hospital admission. EHR data are increasingly used in diabetes research and provide an invaluable resource as the data reflects real- world clinical practice. The size of EHR datasets means researchers can define very large cohorts of people with type 1 and type 2 diabetes and allows the study of less common diabetes subtypes. EHR data are representative of the broader population and provide long-term follow-up of patients ( 6 ). Therefore, they are ideal for studying clinically relevant outcomes to aid our understanding and management of diabetes. This type of real-world data also enables the study of outcomes and groups of people not covered by clinical trials ( 7 ). Transforming raw EHR data into a dataset ready for analysis requires a large amount of processing and phenotyping, which is complex and time consuming ( 8 ). These steps are commonly repeated for each individual study published, even within research groups, with different researchers often using different rules to define the same target population. EHR data processing approaches are usually not standardised, resulting in heterogeneity in cohorts and phenotypes. Full descriptions of data processing are rarely provided in research papers, limiting the reproducibility of their results ( 8 , 9 ). EHR resources, such as the Clinical Practice Research Datalink (CPRD), often publish data profiles to give researchers information on the structure and contents of the data ( 10 ) but guides on how to define specific cohorts are usually not available. As EHR data are becoming increasingly popular in observational diabetes research, it is important to make the data-processing and phenotyping involved as reproducible as possible and to use validated approaches. We aimed to develop a flexible and reproducible framework for defining standardised type 1 and type 2 diabetes cohorts in EHR data with baseline features and outcomes data. The framework we describe has been informed by over 10 years of experience analysing EHR data as part of the MASTERMIND diabetes precision medicine consortium ( 11 – 19 ). Methods Transforming raw EHR data into a diabetes cohort ready for analysis requires several processing steps. Figure 1 summarises the key steps of our flexible framework to create reproducible diabetes cohorts, which can be adapted and applied to any EHR dataset. Download figure Open in new tab Figure 1: A framework to define diabetes research cohorts using EHR data. Data source We have used EHR data from the Clinical Practice Research Datalink (CPRD) in our diabetes studies. CPRD Aurum is a large database of longitudinal, routinely collected medical records from primary care practices in the UK and contains information on patients’ demographics, diagnoses, prescriptions, lifestyle factors, and test results ( 10 ). CPRD Aurum covers over 13% of the UK population and is largely representative of the broader population ( 10 , 20 ). We obtained data from CPRD for people who had a record of diabetes between 01/01/2004 and 06/11/2020. The primary care data was linked to other datasets: Hospital Episode Statistics (HES) data which contain information on all admissions to the National Health Service (NHS) secondary care providers, index of multiple deprivation (IMD) data which is the national measure of deprivation, and Office for National Statistics (ONS) Death Registration Data which cover dates and causes of death and is considered the gold standard mortality data in the UK. Code lists EHR data are commonly stored in the form of codes. Therefore, to define each variable of interest in our dataset, we need to generate a list of the codes that could be used to record that variable (for example, a medical condition). These code lists are subsequently used to define the study population, baseline features, and outcomes. While reproducible methods for developing code lists have been suggested ( 21 ), standardised approaches are often not used. Published code lists are available for use from online repositories, or they can be generated for the study using a standardised pipeline such as the one we have defined in Figure 2 . Download figure Open in new tab Figure 2: A) A standardised pipeline to generate clinically reviewed code lists. B) Implementation of pipeline to define Read, SNOMED and CPRD med code lists. The pipeline we have developed generates comprehensive code lists that can be used to identify any condition, biomarker, or sociodemographic features. We primarily use existing code lists as inputs, sourced from online repositories (for example, the HDRUK Phenotype library ( 22 ) or OpenCodelists ( 23 )), as this improves the efficiency of our approach and reduces research waste. When existing code lists are not available, or to ensure comprehensiveness, term searching the target coding system or mapping from one coding system to another can also provide further codes. Codes from different sources are then combined and reviewed by a clinician. The clinician review is a key step in our process, ensuring our definitions are as robust as possible. This pipeline can easily be applied to any EHR coding system. We implemented our pipeline using SNOMED codes, the most comprehensive coding system used worldwide ( 24 ), and Read codes, historically used in GP systems before switching to SNOMED. Read and SNOMED code lists are initially compiled using existing publicly available code lists where available or term searching the respective coding systems. SNOMED codes are additionally mapped from Read codes and term-search of a SNOMED dictionary to be as comprehensive as possible. As we use CPRD Aurum data for our research, we use CPRD’s Medical Dictionary to map the Read and SNOMED codes to medcodes (CPRD’s own coding format). The generation of Read and SNOMED code lists additionally means this approach is generalisable to other EHR datasets using the same underlying coding system. To develop code lists for prescriptions, we generated a list of generic and brand names for each medication and searched for these terms in CPRD’s Product Dictionary. A clinician then reviewed the resulting code list. ICD-10 (for hospital diagnoses in HES and causes of death in ONS) and OPCS-4 (for procedures in HES) code lists were put together using publicly available existing code lists (where available) and adjusted to fit our variable definitions with input from a clinician. All the code lists we have developed are available online in a GitHub repository ( https://github.com/Exeter-Diabetes/CPRD-Codelists ). Defining a diabetes cohort and initial data quality checks A cohort of people with diabetes can be defined by selecting those with clinical codes for diabetes in their records. NHS England developed the Quality Outcomes Framework (QOF) ( 25 ) to provide financial remuneration for good practice for data collection and coding. We included patients who had a diabetes diagnosis code included in the Quality and Outcomes Framework (QOF) list of recommended diabetes codes, recorded with a valid date. This was validated in UK Biobank data( 26 ) using diabetes QOF codes, where our approach had 99.9% specificity and 89.5% sensitivity for self- reported diabetes at nurse interview (personal communication, K Young 2025). Our code lists used to define diabetes in CPRD are available in our GitHub repository: https://github.com/Exeter-Diabetes/CPRD-Codelists/ . CPRD have their own quality standards for data, which include identifying patients with non- continuous follow-up or poor data recording issues (for example: missing year of birth, GP registration date before year of birth, age greater than 115 at end of follow-up) whose records are deemed not ‘acceptable’ for research ( 27 ). We therefore excluded these patients from our diabetes cohort. Valid dates When cleaning the data, we only considered records with a valid date, which we defined as being no earlier than the patient’s date of birth (or month of birth if the date was not available) and no later than the end of the patient’s practice registration, last collection date from the practice, or date of death (when present). If a patient’s full date of birth was not available, we defined their date of birth as the 15 th of their month of birth if the birth month and year were available, the 1 st July of their birth year if only the birth year was available, or the date of their earliest primary care observation if this was earlier (excluding dates earlier than the month/year of birth). Defining diabetes diagnosis dates Exact diagnosis dates often cannot be directly extracted from EHR data. Therefore, we defined dates of diabetes diagnosis as the earliest observation of diabetes in a person’s record: first diabetes diagnosis code, first HbA1c result of 48 mmol/mol or higher, or first prescription for glucose lowering therapy. If an individual’s diabetes diagnosis date was very close to their GP registration start date (diagnosis within 90 days before or 30 days after registration) we considered this likely unreliable and excluded these individuals from our cohort. We also exclude those with gestational diabetes as calculating their date of diagnosis is more complicated. Classifying diabetes type Classifying diabetes type can be challenging, and in order to distinguish between different types of diabetes in EHR data, robust and validated classification approaches should be used. These can be based on clinical codes, prescriptions, and features such as age at diabetes diagnosis. Figure 3 details a full algorithm of classification approaches that can be used to classify a diabetes type for everyone in a diabetes cohort using EHR data. When classifying type 1 and type 2 diabetes, we defined those with no prescriptions for insulin in their records as type 2. In insulin-treated people, we then evaluated clinical codes specific to a diabetes type. In the majority of cases a person’s diabetes type is clearly defined (over 95% of individuals in our CPRD cohort have codes for solely type 1 diabetes or type 2 diabetes). For the small proportion where a person has clinical codes for multiple types of diabetes, we used the ratio of different type codes to determine diabetes type (based on previously validated studies ( 28 , 29 )). Where a patient has no type-specific codes (1.4% of cases), additional approaches may need to be considered, for example age at diabetes diagnosis, time to insulin, and other diabetes medication prescriptions. This approach broadly classifies all individuals by which type of diabetes their clinician likely thought they had. If an individual had a clinical code for another type of diabetes (for example, monogenic diabetes or secondary diabetes) at any date in their records, we classified their diabetes type as ‘other’. Additional work is needed to define the specific diabetes subtype of these individuals (for example, type 3c diabetes ( 30 )). Download figure Open in new tab Figure 3: A validated algorithm for classifying type 1 and type 2 diabetes. In some cases, classifying all patients may not be necessary, and particularly for type 1 diabetes research, it may be more important to define a more specific cohort using stricter criteria. Therefore, we defined a ‘gold standard’ cohort where we are more confident that they have type 1 diabetes by including only individuals broadly classified as type 1, diagnosed aged 20 or under, and treatment with insulin from within 1 year of diagnosis. Defining an index date Different diabetes cohorts can flexibly be defined using different chosen index dates: Incident diabetes cohort: the date of diabetes diagnosis Prevalent cohort: A specific time of interest (e.g. 01/02/2020, pre-Covid19) Treatment response cohort: The date of initiation of a glucose-lowering medication Defining baseline features Baseline features such as sociodemographics, biomarkers, clinical measurements, and comorbidities can be defined for a cohort relative to the chosen index date. Additional data cleaning and processing steps may be required to define each variable. Biomarkers and clinical measurements We defined a range of biomarkers for our diabetes cohorts, including: glycated haemoglobin (HbA1c), high-density lipoprotein (HDL), triglycerides, blood creatinine, low-density lipoprotein (LDL), alanine transaminase (ALT), aspartate transferase (AST), total cholesterol, and albumin-creatinine ratio (ACR), as well as the clinical measurements weight, height, body mass index (BMI), systolic blood pressure (SBP), and diastolic blood pressure (DBP). The following data processing steps could also be extended to other biomarkers or clinical measurements. For all biomarkers we removed any values outside of clinically plausible limits. These clinically plausible limits were chosen based on discussion with a clinician and values outside this range were assumed to be errors. We also dropped any observations that did not have acceptable units. For units that made up more than 0.1% of observations, including missing units, we examined the distribution of values to decide if they were acceptable. Where multiple different biomarker values were recorded on the same day, we took the mean of these values. For HbA1c, we additionally removed any observations before 1990 (HbA1c was not widely used before then) and converted all values less than 20 (assumed to be in % units) into mmol/mol. We also calculated estimated glomerular filtration rate (eGFR) from serum creatinine readings using the 2021 CKD-EPI Creatinine Equation ( 31 ). Comorbidities We also defined a range of comorbidities (any condition in addition to diabetes) based on clinical codes, including cardiovascular (hypertension, atrial fibrillation, angina, myocardial infarction, cardiac revascularisation, ischaemic heart disease, heart failure, peripheral vascular disease), respiratory (asthma, chronic obstructive pulmonary disease), neurological (transient ischaemic attack, stroke, dementia, other neurological conditions), oncological (haematological cancer, solid cancer), and other conditions (chronic kidney disease/end-stage renal disease, chronic liver disease, solid organ transplant, rheumatoid arthritis, cystic fibrosis). To define a diagnosis date for a comorbidity we used the earliest date of any record of the condition (in the primary care or hospital data). We also defined microvascular diabetes complications (diabetic nephropathy, neuropathy, retinopathy) using the same method. Sociodemographic and lifestyle factors We defined ethnicity based on clinical codes in the primary care records and categorised using the same method as Mathur et al in a previous study ( 32 ), grouping ethnicity codes into 16 categories that can be collapsed into 5 higher level categories (White, South Asian, Black, Mixed, Other) based on UK census groups. We took the most commonly recorded ethnicity category, and if there was more than one equally most common, we took the most recently recorded. For anyone who we could not define an ethnicity from the primary care records, we used their ethnicity recorded in the hospital data (HES) where it was available. Using this method, we could define an ethnicity for over 97% of our diabetes cohort. To define deprivation, we used a linked Index of Multiple Deprivation dataset. This is the official measure used in England and assigns each patient a decile representing relative deprivation based on a range of factors ( 33 ). For smoking, we grouped codes into 3 categories (non-smoker, ex-smoker, active smoker) and defined a category for each patient by taking the most recently recorded. If the most recently recorded code is in the ‘non-smoker’ category, but the patient has previously been recorded as a smoker, they are categorised as an ‘ex-smoker’. Similarly for alcohol consumption we categorised codes into 4 groups (none, within recommended limits, excess, harmful) and took the most recently recorded observation. Example cohorts and outcomes Using code lists and a flexible data-processing pipeline as detailed above, we can define various cohorts of people with diabetes for a specific index date, including their baseline features and key outcomes relevant to the research question, for example: Incident cohort An incident cohort of individuals at the date of diagnosis of their diabetes (diagnosis dates defined as above). We excluded anyone with a diagnosis date before or within 3 months of their date of GP practice registration, reducing the likelihood of including people with an existing diabetes diagnosis transferred to a new GP practice. For this cohort, we can define outcomes such as the first incidence of a comorbidity after diabetes diagnosis, including cardiovascular disease, stroke, and changes in chronic kidney disease stage (in primary care or hospital records), mortality-related outcomes including the date and cause of death defined from the linked ONS data, and time to treatment intensification for example time from diabetes diagnosis to the initiation of insulin treatment. Prevalent cohort (1 st February 2020) A cross-sectional prevalent cohort of individuals with diabetes actively registered with their GP practice on 01/02/2020. We excluded anyone who deregistered or died before 01/02/2020 and anyone whose diabetes was diagnosed after this date. Using the predefined index date, we can then define future outcomes (such as the first incidence of a medical condition, hospitalisation, or mortality). Treatment outcomes A cohort of all individuals with diabetes initiating a glucose-lowering therapy class at the date of initiation of that medication. For this cohort, we updated the approach previously published by Rodgers et al . ( 34 ), which detailed the processing steps to define a type 2 treatment cohort using an older version of CPRD data (CPRD GOLD). The drug classes included were metformin, sulphonylureas, thiazolidinediones, DPP4-inhibitors, SGLT2-inhibitors, GLP1-receptor agonists, and insulin. We also extracted acarbose and glinide prescriptions, but due to low numbers prescribed in the UK, these have not been included in our research to date. Individuals could be included more than once if they initiated multiple therapy classes. We defined a range of short- and long-term treatment outcomes for this cohort. We defined glycaemic response as the change from baseline HbA1c 12 months after drug initiation (the closest measure to 12 months after initiation within 3-15 months) with no addition or cessation of other glucose-lowering medications and continued prescription of the drug of interest. We similarly defined weight change 12 months from initiation on stable unchanged therapy. We also defined early treatment discontinuation as a therapy discontinued within 6 months of initiation. As described by Rodgers et al ., gaps between prescriptions had to be at least 6 months to be considered as a drug being stopped, and the availability of at least 3 months follow-up time after discontinuation was required to confirm the drug was discontinued ( 34 ). We additionally defined glycaemic failure (confirmed HbA1c ≥69 mmol/mol), time to treatment change, and time to insulin initiation. We also defined a range of long-term outcomes, including all-cause mortality, major cardiovascular events or heart failure, renal progression (>40% decrease in eGFR or end-stage kidney disease), and microvascular complications. Scripts to define these example cohorts and generic template scripts that can be adapted to define other cohorts are available online in our GitHub repository ( https://github.com/Exeter-Diabetes/CPRD-Cohort-scripts ). Results From CPRD, we obtained an October 2020 download including primary care data on 1,480,985 individuals with a record of diabetes between 01/01/2004 and 06/11/2020 (1,138,193 after applying quality control measures), and any linked hospital, mortality and deprivation data where available. Figure 4 describes the inclusion criteria and resulting numbers in our diabetes dataset and three example cohorts. Download figure Open in new tab Figure 4: Diabetes research cohorts (incident cohort, prevalent cohort, and treatment outcomes cohort) derived from CPRD primary care records and linked datasets. Cohorts The incident cohort, with data covering diabetes diagnosis, consisted of 10,480 people with type 1 diabetes ‘gold standard’, 726,800 with type 2 diabetes, and 9,753 with other types of diabetes. Table 1 describes the key baseline characteristics of this cohort. View this table: View inline View popup Download powerpoint Table 1: Baseline characteristics of an incident cohort of gold standard people with type 1 diabetes and type 2 diabetes at diagnosis. The prevalent cohort, actively registered with their GP practice on 01/02/2020, consisted of 9,514 people with type 1 diabetes ‘gold standard’, 559,905 with type 2 diabetes, and 12,309 with other types of diabetes). Table 2 describes the key baseline characteristics of this cohort. View this table: View inline View popup Download powerpoint Table 2: Baseline characteristics of a prevalent (2020) cohort of people with gold standard type 1 diabetes and type 2 diabetes. The type 2 diabetes treatment response cohort consisted of 769,394 people initiating a class of glucose-lowering medication: metformin (n= 636,001), sulphonylureas (n= 352,671), thiazolidinediones (n= 109,249), DPP4-inhibitors (n= 219,755), GLP1-receptor agonists (n= 60,358), SGLT2-inhibitors (n= 99,994), and insulin (n= 162,187). Table 3 describes key baseline characteristics of individuals with type 2 diabetes at initiation of a glucose-lowering therapy by drug class. There were also 1,042 people with type 1 diabetes ‘gold standard’ initiating a class of non-insulin glucose- lowering medication (all concurrently insulin-treated): metformin (n= 985), sulphonylureas (n= 16), thiazolidinediones (n= 7), DPP4-inhibitors (n= 21), GLP1-receptor agonists (n= 54), SGLT2-inhibitors (n= 59). View this table: View inline View popup Download powerpoint Table 3: Baseline characteristics for a type2 diabetes treatment outcomes cohort at drug initiation, by major drug classes. Open research We have shared our R code for defining different diabetes cohorts and our code lists online in GitHub repositories ( https://github.com/Exeter-Diabetes/CPRD-Cohort-scripts , https://github.com/Exeter-Diabetes/CPRD-Codelists/ ), alongside descriptive documentation to help others to understand and replicate our work. Making our work publicly available means our research is as transparent and reproducible as possible, adhering to the principles of open research and FAIR data ( 35 ). Discussion We have developed a reproducible framework for defining standardised cohorts of people with type 1 and type 2 diabetes in coded EHR data. The data-processing pipeline can flexibly generate diabetes cohorts from raw datasets that include information about a person’s diabetes, as well as sociodemographic features, comorbidities, biomarkers, and medications. This framework has been implemented in the definition of cohorts for analysis in several published studies ( 30 , 36 – 40 ), mainly focussed on glucose-lowering therapy in type 2 diabetes, and has ensured our work is highly reusable and coding is robust and consistent across our studies. EHR datasets are a greatly valuable research resource internationally, and the wide application of our open-source framework could enhance the quality and reproducibility of diabetes epidemiological and clinical studies. Research using EHR data is becoming increasingly popular, and data sources are becoming more readily available, both in the UK through initiatives such as NHS England’s Secure Data Environment ( 41 ) and internationally. Therefore, promoting open research practices and ensuring research is reproducible using standardised data-processing approaches will be increasingly important. While the framework we have outlined focuses on diabetes, the general approaches could be applied to other medical conditions and areas of health research. Limitations EHR data are not collected for research purposes. Therefore, robustly defining variables can be difficult ( 42 ), and misclassification is often possible. While our approaches will not be perfect, we have aimed to make them as robust as possible given the constraints of the data, developing thorough algorithms for defining key variables through discussion with clinicians involved in this research. Limitations of using EHR data for research include data completeness and missing data, the reliance on the accuracy of the data recorded in clinical records, and variation in coding between practices over time ( 6 ). Exact dates of diabetes diagnosis are not available in CPRD primary care data. We used a combination of diabetes diagnosis codes, HbA1c test results, and prescriptions for glucose lowering therapies to define a diagnosis date as comprehensively as possible, but there still may be cases where this does not accurately reflect the initial diagnosis date. Similarly, an individual’s diabetes type can be difficult to determine in healthcare records ( 7 ), where classification biomarker tests (for example, C-peptide) are not commonly available. To deal with this, we used an approach based on previous studies which were validated against clinical assessment ( 28 , 29 ). We also included both a broad definition of type 1 diabetes that likely reflects a clinical diagnosis and further criteria to define a more restricted cohort where we are more confident that these individuals have type 1. Additionally, determining whether an individual has certain comorbidities can be challenging and relies on the clinical codes used to define the conditions ( 8 ). We used comprehensive clinically reviewed code lists (utilising both the primary care data and hospital records) to define each medical condition as robustly as possible. When defining medications, we may miss some prescriptions issued in a hospital or private healthcare, however prescription data are generally well coded in UK primary care records. These highlighted problems may limit the accuracy of some coding but are likely to lead to noise rather than biases in the data, which will be mitigated by the large sample sizes available. Conclusion EHR data can be a highly valuable resource for studying diabetes, but robustly coding reproducible datasets is challenging. Therefore, we developed a flexible and reproducible framework to generate standardised diabetes cohorts using EHR data that we have made publicly available for use worldwide. Whilst we have used UK EHR data from CPRD in this study, our framework is applicable to any coded EHR dataset. Funding This research was funded by the Medical Research Council (UK) (MR/N00633X/1). The funder had no role in any part of the study or in any decision about publication. BMS is supported by the NIHR Exeter Clinical Research Facility. JMD is supported by a Wellcome Trust Early Career award (227070/Z/23/Z). Competing interests APM received prior research funding from Eli Lilly and Company, Pfizer, and AstraZeneca outside of the submitted work. All other authors declare no other relationships or activities that could appear to have influenced the submitted work. Author contributions The study concept and design were developed by KGY, BMS, APM, JMD, and RH. KGY and RH prepared and process the data and defined the study cohorts. All authors critically revised the manuscript and saw and approved the final article. KGY and BMS attest that all listed authors meet authorship criteria, that no others meeting the criteria have been omitted. KGY and BMS are responsible for the decision to submit for publication, and are the guarantors of this work and, as such, had full access to all the data in the study and takes responsibility for the integrity of the data and the accuracy of the data analysis. Ethics/ data approval The study protocol was approved by the CPRD Independent Scientific Advisory Committee (eRAP protocol numbers: 22_002000). Data availability No additional data are available from the authors although CPRD data are available by application to CPRD Independent Scientific Advisory Committee. Rights Retention For the purpose of open access, the author has applied a Creative Commons Attribution (CC BY) licence to any Author Accepted Manuscript version arising from this submission. Acknowledgements This article is based on data from the Clinical Practice Research Datalink obtained under licence from the UK Medicines and Healthcare products Regulatory Agency. CPRD data are provided by patients and collected by the NHS as part of their care and support. Approval for CPRD data access and the study protocol was granted by the CPRD Independent Scientific Advisory Committee (eRAP protocol number: 22_002000). This study was supported by the National Institute for Health and Care Research Exeter Biomedical Research Centre. The views expressed are those of the authors and not necessarily those of the NHS, NIHR or the Department of Health and Social Care. Footnotes ↵ * Joint senior authors References 1. ↵ World Health Organisation . Diabetes. Available from: https://www.who.int/news-room/fact-sheets/detail/diabetes . Accessed 7 January 2025 2. ↵ N. C. D. Risk Factor Collaboration . Worldwide trends in diabetes prevalence and treatment from 1990 to 2022: a pooled analysis of 1108 population-representative studies with 141 million participants . Lancet . 2024 ; 404 ( 10467 ): 2077 – 93 . OpenUrl PubMed 3. ↵ Bain SC , Feher M , Russell-Jones D , Khunti K . Management of type 2 diabetes: the current situation and key opportunities to improve care in the UK . Diabetes Obes Metab . 2016 ; 18 ( 12 ): 1157 – 66 . OpenUrl PubMed 4. Edelman SV , Polonsky WH . Type 2 Diabetes in the Real World: The Elusive Nature of Glycemic Control . Diabetes Care . 2017 ; 40 ( 11 ): 1425 – 32 . OpenUrl Abstract / FREE Full Text 5. ↵ Mannucci E , Monami M , Dicembrini I , Piselli A , Porta M . Achieving HbA1c targets in clinical trials and in the real world: a systematic review and meta-analysis . J Endocrinol Invest . 2014 ; 37 ( 5 ): 477 – 95 . OpenUrl CrossRef PubMed 6. ↵ Shephard E , Stapley S , Hamilton W . The use of electronic databases in primary care research . Fam Pract . 2011 ; 28 ( 4 ): 352 – 4 . OpenUrl CrossRef PubMed 7. ↵ Farmer R , Mathur R , Bhaskaran K , Eastwood SV , Chaturvedi N , Smeeth L . Promises and pitfalls of electronic health record analysis . Diabetologia . 2018 ; 61 ( 6 ): 1241 – 8 . OpenUrl PubMed 8. ↵ Denaxas SC , Morley KI . Big biomedical data and cardiovascular disease research: opportunities and challenges . Eur Heart J Qual Care Clin Outcomes . 2015 ; 1 ( 1 ): 9 – 16 . OpenUrl PubMed 9. ↵ Denaxas S , Direk K , Gonzalez-Izquierdo A , Pikoula M , Cakiroglu A , Moore J , et al. Methods for enhancing the reproducibility of biomedical research findings using electronic health records . BioData Min . 2017 ; 10 : 31 . 10. ↵ Wolf A , Dedman D , Campbell J , Booth H , Lunn D , Chapman J , et al. Data resource profile: Clinical Practice Research Datalink (CPRD) Aurum . Int J Epidemiol . 2019 ; 48 ( 6 ): 1740 -g. OpenUrl CrossRef PubMed 11. ↵ Dennis JM , Shields BM , Hill AV , Knight BA , McDonald TJ , Rodgers LR , et al. Precision Medicine in Type 2 Diabetes: Clinical Markers of Insulin Resistance Are Associated With Altered Short- and Long-term Glycemic Response to DPP-4 Inhibitor Therapy . Diabetes Care . 2018 ; 41 ( 4 ): 705 – 12 . OpenUrl Abstract / FREE Full Text 12. Dennis JM , Henley WE , Weedon MN , Lonergan M , Rodgers LR , Jones AG , et al. Sex and BMI Alter the Benefits and Risks of Sulfonylureas and Thiazolidinediones in Type 2 Diabetes: A Framework for Evaluating Stratification Using Routine Clinical and Individual Trial Data . Diabetes Care . 2018 ; 41 ( 9 ): 1844 – 53 . OpenUrl Abstract / FREE Full Text 13. Curtis HJ , Dennis JM , Shields BM , Walker AJ , Bacon S , Hattersley AT , et al. Time trends and geographical variation in prescribing of drugs for diabetes in England from 1998 to 2017 . Diabetes Obes Metab . 2018 ; 20 ( 9 ): 2159 – 68 . OpenUrl CrossRef PubMed 14. McGovern AP , Dennis JM , Shields BM , Hattersley AT , Pearson ER , Jones AG , et al. What to do with diabetes therapies when HbA1c lowering is inadequate: add, switch, or continue? A MASTERMIND study . BMC Med . 2019 ; 17 ( 1 ): 79 . OpenUrl PubMed 15. Dennis JM , Henley WE , McGovern AP , Farmer AJ , Sattar N , Holman RR , et al. Time trends in prescribing of type 2 diabetes drugs, glycaemic response and risk factors: A retrospective analysis of primary care data, 2010-2017 . Diabetes Obes Metab . 2019 ;21(7):1576-84. 16. Rodgers LR , Dennis JM , Shields BM , Mounce L , Fisher I , Hattersley AT , et al. Prior event rate ratio adjustment produced estimates consistent with randomized trial: a diabetes case study . J Clin Epidemiol . 2020 ; 122 : 78 – 86 . OpenUrl PubMed 17. McGovern AP , Hogg M , Shields BM , Sattar NA , Holman RR , Pearson ER , et al. Risk factors for genital infections in people initiating SGLT2 inhibitors and their impact on discontinuation . BMJ Open Diabetes Res Care . 2020 ; 8 ( 1 ). 18. Dennis JM , Young KG , McGovern AP , Mateen BA , Vollmer SJ , Simpson MD , et al. Development of a treatment selection algorithm for SGLT2 and DPP-4 inhibitor therapies in people with type 2 diabetes: a retrospective cohort study . Lancet Digit Health . 2022 ; 4 ( 12 ): e873 – e83 . OpenUrl 19. ↵ Cardoso P , Dennis JM , Bowden J , Shields BM , McKinley TJ, the MC . Dirichlet process mixture models to impute missing predictor data in counterfactual prediction models: an application to predict optimal type 2 diabetes therapy. BMC Med Inform Decis Mak. 2024;24(1):12. 20. ↵ Clinical Practice Research Datalink. CPRD Data . Available from: https://cprd.com/data . Accessed 7 January 2025 21. ↵ Watson J , Nicholson BD , Hamilton W , Price S . Identifying clinical features in primary care electronic health record studies: methods for codelist development . BMJ Open . 2017 ; 7 ( 11 ): e019637 . OpenUrl Abstract / FREE Full Text 22. ↵ HDRUK. HDR UK Phenotype Library . Available from: https://phenotypes.healthdatagateway.org/ . Accessed 7 January 2025 23. ↵ OpenSAFELY. OpenCodelists . Available from: https://www.opencodelists.org/ . Accessed 3 February 2025 24. ↵ NHS Digital. SNOMED CT . Available from: https://digital.nhs.uk/services/terminology-and-classifications/snomed-ct . Accessed 3 February 2025 25. ↵ NHS Digital. Quality and Outcomes Framework (QOF). Available from: https://digital.nhs.uk/data-and-information/data-tools-and-services/data-services/general-practice-data-hub/quality-outcomes-framework-qof . Accessed 7 January 2025 26. ↵ Sudlow C , Gallacher J , Allen N , Beral V , Burton P , Danesh J , et al. UK biobank: an open access resource for identifying the causes of a wide range of complex diseases of middle and old age . PLoS Med . 2015 ; 12 ( 3 ): e1001779 . OpenUrl CrossRef PubMed 27. ↵ Clinical Practice Research Datalink. Primary care data for public health research . Available from: https://www.cprd.com/primary-care-data-public-health-research . Accessed 7 January 2025 28. ↵ Ke C , Stukel TA , Luk A , Shah BR , Jha P , Lau E , et al. Development and validation of algorithms to classify type 1 and 2 diabetes according to age at diagnosis using electronic health records . BMC Med Res Methodol . 2020 ; 20 ( 1 ): 35 . OpenUrl CrossRef PubMed 29. ↵ Schroeder EB , Donahoo WT , Goodrich GK , Raebel MA . Validation of an algorithm for identifying type 1 diabetes in adults based on electronic health record data . Pharmacoepidemiol Drug Saf . 2018 ; 27 ( 10 ): 1053 – 9 . OpenUrl CrossRef PubMed 30. ↵ Hopkins R , Young KG , Thomas NJ , Jones AG , Hattersley AT , Shields BM , et al. Treatment outcomes with oral anti-hyperglycaemic therapies in people with diabetes secondary to a pancreatic condition (type 3c diabetes): A population-based cohort study . Diabetes Obes Metab . 2025 . 31. ↵ Inker LA , Eneanya ND , Coresh J , Tighiouart H , Wang D , Sang Y , et al. New Creatinine- and Cystatin C-Based Equations to Estimate GFR without Race . N Engl J Med . 2021 ; 385 ( 19 ): 1737 – 49 . OpenUrl CrossRef PubMed 32. ↵ Mathur R , Bhaskaran K , Chaturvedi N , Leon DA , vanStaa T, Grundy E, et al. Completeness and usability of ethnicity data in UK-based primary care and hospital databases . J Public Health (Oxf ). 2014 ; 36 ( 4 ): 684 – 92 . OpenUrl CrossRef PubMed 33. ↵ English indices of deprivation 2015 . Available from: https://www.gov.uk/government/statistics/english-indices-of-deprivation-2015 . Accessed 7 January 2025 34. ↵ Rodgers LR , Weedon MN , Henley WE , Hattersley AT , Shields BM . Cohort profile for the MASTERMIND study: using the Clinical Practice Research Datalink (CPRD) to investigate stratification of response to treatment in patients with type 2 diabetes . BMJ Open . 2017 ; 7 ( 10 ): e017989 . OpenUrl Abstract / FREE Full Text 35. ↵ Wilkinson MD , Dumontier M , Aalbersberg IJ , Appleton G , Axton M , Baak A , et al. The FAIR Guiding Principles for scientific data management and stewardship . Sci Data . 2016 ; 3 : 160018 . 36. ↵ Dennis JM , Young KG , Cardoso P , Gudemann LM , McGovern AP , Farmer A , et al. A five-drug class model using routinely available clinical features to optimise prescribing in type 2 diabetes: a prediction model development and validation study . Lancet . 2025 ; 405 ( 10480 ): 701 – 14 . OpenUrl PubMed 37. Hopkins R , Young KG , Thomas NJ , Godwin J , Raja D , Mateen BA , et al. Risk factor associations for severe COVID-19, influenza and pneumonia in people with diabetes to inform future pandemic preparations: UK population-based cohort study . BMJ Open . 2024 ; 14 ( 1 ): e078135 . OpenUrl Abstract / FREE Full Text 38. Gudemann LM , Young KG , Thomas NJM , Hopkins R , Challen R , Jones AG , et al. Safety and effectiveness of SGLT2 inhibitors in a UK population with type 2 diabetes and aged over 70 years: an instrumental variable approach . Diabetologia . 2024 ; 67 ( 9 ): 1817 – 27 . OpenUrl PubMed 39. Cardoso P , Young KG , Nair ATN , Hopkins R , McGovern AP , Haider E , et al. Phenotype-based targeted treatment of SGLT2 inhibitors and GLP-1 receptor agonists in type 2 diabetes . Diabetologia . 2024 ; 67 ( 5 ): 822 – 36 . OpenUrl PubMed 40. ↵ Young KG , Hopkins R , Shields BM , Thomas NJ , McGovern AP , Dennis JM , et al. Recent UK type 2 diabetes treatment guidance represents a near whole population indication for SGLT2- inhibitor therapy . Cardiovasc Diabetol . 2023 ; 22 ( 1 ): 302 . OpenUrl PubMed 41. ↵ NHS Digital. NHS England Secure Data Environment . Available from: https://digital.nhs.uk/services/secure-data-environment-service . Accessed 7 January 2025 42. ↵ Sauer CM , Chen LC , Hyland SL , Girbes A , Elbers P , Celi LA . Leveraging electronic health records for data science: common pitfalls and how to avoid them . Lancet Digit Health . 2022 ; 4 ( 12 ): e893 – e8 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted March 26, 2025. Download PDF Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A reproducible open-source framework for defining type 1 and type 2 diabetes research cohorts in routinely collected electronic health record data Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A reproducible open-source framework for defining type 1 and type 2 diabetes research cohorts in routinely collected electronic health record data Rhian Hopkins , Pedro Cardoso , Laura M Güdemann , Andrew P McGovern , John M Dennis , Beverley M Shields , Katherine G Young medRxiv 2025.03.26.25324678; doi: https://doi.org/10.1101/2025.03.26.25324678 Share This Article: Copy Citation Tools A reproducible open-source framework for defining type 1 and type 2 diabetes research cohorts in routinely collected electronic health record data Rhian Hopkins , Pedro Cardoso , Laura M Güdemann , Andrew P McGovern , John M Dennis , Beverley M Shields , Katherine G Young medRxiv 2025.03.26.25324678; doi: https://doi.org/10.1101/2025.03.26.25324678 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Endocrinology (including Diabetes Mellitus and Metabolic Disease) Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4421) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15212) Forensic Medicine (30) Gastroenterology (1121) Genetic and Genomic Medicine (6581) Geriatric Medicine (667) Health Economics (996) Health Informatics (4520) Health Policy (1366) Health Systems and Quality Improvement (1611) Hematology (539) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15906) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (667) Neurology (6580) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1141) Occupational and Environmental Health (956) Oncology (3324) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5433) Public and Global Health (9212) Radiology and Imaging (2193) Rehabilitation Medicine and Physical Therapy (1368) Respiratory Medicine (1194) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ff550a18921300f',t:'MTc3OTM4NDY4OA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.