Full text
50,943 characters
· extracted from
preprint-html
· click to expand
Machine learning prediction algorithms for 2- , 5- and 10-year risk of Alzheimer’s, Parkinson’s and dementia at age 65: a study using medical records from France and the UK General Practitioners | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Machine learning prediction algorithms for 2- , 5- and 10-year risk of Alzheimer’s, Parkinson’s and dementia at age 65: a study using medical records from France and the UK General Practitioners Thomas Nedelec , Karim Zaidi , Charlotte Montaud , Octave Guinebretiere , Pyry Sipilä , Dang Wei , Fen Yang , Anna Freydenzon , Antoine Belloir , Nemo Fournier , Nadine Hamieh , Beranger Lekens , Yanis Slaouti , Allan McRae , Baptiste Couvy-Duchesne , Yulin Hswen , Fang Fang , Mika Kivimäki , Manon Ansart , Stanley Durrleman doi: https://doi.org/10.1101/2025.01.22.25320969 Thomas Nedelec 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: thomas.nedelec{at}icm-institute.org Karim Zaidi 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Charlotte Montaud 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Octave Guinebretiere 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Pyry Sipilä 2 Clinicum, University of Helsinki , Helsinki, Finland MD, PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dang Wei 3 Institute of Environmental Medicine, Karolinska Institutet , Stockholm, Sweden MD, PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Fen Yang 3 Institute of Environmental Medicine, Karolinska Institutet , Stockholm, Sweden MD, PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Anna Freydenzon 4 Institute for Molecular Bioscience, the University of Queensland , St Lucia, Queensland, Australia PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Antoine Belloir 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nemo Fournier 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nadine Hamieh 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Beranger Lekens 5 THIN & GERS Data , Boulogne-Billancourt, France MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yanis Slaouti 5 THIN & GERS Data , Boulogne-Billancourt, France MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Allan McRae 3 Institute of Environmental Medicine, Karolinska Institutet , Stockholm, Sweden PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Baptiste Couvy-Duchesne 3 Institute of Environmental Medicine, Karolinska Institutet , Stockholm, Sweden 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yulin Hswen 6 University of California San Francisco , San Francisco, California, USA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Fang Fang 4 Institute for Molecular Bioscience, the University of Queensland , St Lucia, Queensland, Australia MD, PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mika Kivimäki 2 Clinicum, University of Helsinki , Helsinki, Finland 7 Brain Sciences, University College London , London, UK PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Manon Ansart 8 Université de Bourgogne Franche-Comté , LEAD, CNRS UMR 5022, Université de Bourgogne , Pole AAFE, 2100 Dijon, France PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Stanley Durrleman 1 Paris Brain Institute , ICM, Inserm U 1127, CNRS UMR 7225, Sorbonne Université , Inria, Aramis project team, F-75013, Paris, France PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Structured abstract Background Leveraging machine learning on electronic health records offers a promising method for early identification of individuals at risk for dementia and neurodegenerative diseases. Current risk algorithms heavily rely on age, highlighting the need for alternative models with strong predictive power, especially at age 65, a crucial time for early screening and prevention. Methods This prospective study analyzed electronic health records (EHR) from 76,427 adults (age 65, 52.1% women) using the THIN database. A general risk algorithm for Alzheimer’s disease, Parkinson’s disease, and dementia was developed using machine learning to select predictors from diagnoses, and medications. Results Medications (e.g., laxatives, urological drugs, antidepressants), along with sex, BMI, and comorbidities, were key predictors. The algorithm achieved a 38.4% detection rate at a 5% false-positive rate for 2-year dementia prediction. Conclusion The validated prediction algorithms, easy to implement in primary care, identify high-risk 65-year-olds using medication records. Further refinement and broader validation are needed. Background Neurodegenerative diseases (NDDs) represent a major public health challenge in Western societies and one of the greatest drug development challenges. Although there are currently no curative treatments for the majority of NDDs, management of cardiometabolic risks and lifestyle intervention may delay the onset and enhance patients’ quality of life. An essential element in implementing timely primary and secondary prevention measures involves identifying at-risk patients, ideally at the general practitioner settings, well before the onset of the disease. In contrast to vascular diseases, where cost-effective and non-invasive methods such as cardiac ultrasound or supra-aortic trunk examinations are employed, performing of MRI or PET scans, or lumbar punctures to assess risk of NDDs for the total population aged 65 or older is not feasible, ethical or cost-effective. To enable targeted prevention, various multifactorial risk prediction models have been developed to distinguish individuals at a high risk of dementia, who may benefit most from preventive measures, from those at low risk. Widely used prediction models in research and clinical trials include Cardiovascular Risk Factors, Ageing and Dementia [CAIDE-Clinical], 2 CAIDE supplemented with APOE status, 3 Brief Dementia Screening Indicator [BDSI], 4 and Australian National University Alzheimer Disease Risk Index [ANU-ADRI]. 5 However, recent findings from a study on the UK Biobank revealed that these models missed between 84% to 91% of participants with incident dementia when calibrated to achieve a 5% false-positive rate. 6 Thus, current predictive models are not capable of identifying people at high risk of dementia at a given age (e.g. 65) when screening of risk groups for chronic conditions is typically implemented. One approach to improve cost-effective prediction of NDDs is to develop and validate risk algorithms for same-aged individuals by leveraging machine learning on electronic health records. Successful use of machine learning to develop predictive models has already been reported for the risk assessment of pancreatic cancer, 7 and for the QRISK score, 8 a cardiovascular disease prediction risk score that complements the established Framingham risk score. Concerning dementia and specific neurodegenerative diseases, several papers have attempted to predict dementia including Alzheimer’s disease (AD) 9 , 10 and Parkinson’s disease (PD) 11 using electronic health records, but these efforts have been restricted to a single country and incorporating age as a primary feature in the model. This approach presents challenges in interpreting the Area Under the Curve (AUC), as age often drives the model. To address this issue, we propose a framework for estimating the risk of dementia at specific age. Drawing inspiration from the French Minister of Health’s initiative to reimburse preventive visits at age 65, with a specific emphasis on NDDs prevention, we aim to develop and validate algorithms estimating the risk of AD, PD, and dementia at this critical age. By harnessing readily available electronic medication data, accessible to general practitioners (GPs), we identified medication patterns and health conditions associated with an elevated risk of AD, PD and dementia and developed algorithms that can be derived without laboratory measurements and easily integrated into primary care settings for efficient risk assessment by GPs. These algorithms could serve as a pre-screening tool, identifying a subgroup of individuals eligible for further, more costly diagnostic tests. This includes regular administration of the Mini-Mental State Examination (MMSE) during consultations or referrals to memory clinics for comprehensive evaluations. 12 Methods 2.1 Data We investigated health conditions and prescription of drugs associated with subsequent AD, PD and dementia in France and the UK using the THIN database. 13 THIN is a large standardized European database of non-extrapolated electronic health records collected at the physician level. The THIN database contains medical records, prescriptions, and diagnoses from 2,500 GPs in France and 532 general practices in the UK and represents 8 million patients in total. 14 The UK database, accounting for around 6% of the UK population, is representative of the UK general practice population in terms of demographics and type of consultation. 16 – 18 The French cohort is representative of the French general population in terms of age, sex, and residence. 15 For each patient and visit, the diagnosis and prescription established during the visit, as well as all ongoing prescriptions and associated diagnoses, are available. Information in the database also includes records of health indicators, such as weight and height. 2.2 Study Participants For both UK and French THIN cohorts, we extracted all patients born in 1940 and 1945 (aged 65 and 70 in 2010). Patients must be active, i.e. followed by the general practitioner, in 2008, the start of the exposure period from January 1 st , 2008, to December 31 st , 2010 ( Figure 1 ). We excluded patients with no diagnostic records before December 31 st , 2010, or with no diagnostic records after 2011 (eFigure1), or patients who became inactive, i.e. not followed by the general practitioner anymore or died before 2011. Patients who had already been diagnosed with one of the three outcomes before 2011 or with undefined sex were additionally excluded (for a flow chart, see eFigure1 in supplementary materials). Download figure Open in new tab Figure 1: Summary of study design: Extraction of medical prescriptions between 2008 and 2010 for patients who were either 65 or 70 years old in 2010 in France and UK 2.3 Outcome The outcomes of interest were AD, PD, and all-cause dementia occurring between January 1 st , 2011 and January 1 st , 2023. We defined the outcome according to diagnostic codes in the International Classification of Diseases, 10 th Revision (ICD-10) which were provided directly in the French database, with the exact list of codes available in the supplementary materials. For the UK database, we converted Read codes into ICD10 codes according to the correspondence provided by the UK Clinical Terminology Centre. 16 2.4 Predictors Exposure parameters included BMI, Charlson Comorbidity Index (CCI), prescribed medications (categorized using the first 3 characters of the ATC code), and medical diagnoses (categorized by ICD-10 codes). To select medications as exposure features for our model, we employed a Fine and Gray survival model to address the competing risk of death. We analyzed patients born in 1940 and 1945 together. Drug families (determined by the first 3 characters of the ATC codes) with a prevalence of at least 5% in the UK cohort were investigated for their association with the onset of the three diseases of interest ( Figure 2 ). We computed Hazard Ratios (HRs) for each medication’s association with Alzheimer’s disease (AD), Parkinson’s disease (PD), and dementia using the Fine and Gray model, adjusted for sex, BMI, CCI, and age group ( Figure 3 , panel B and eTable 1). Multiple comparisons were addressed using Bonferroni correction (N = 43). We then recalculated HRs in a multivariable model that included all medications associated with the outcomes in the univariate analysis, along with the adjustment variables. To identify the precise medication subclasses responsible for the associations, we performed a more detailed analysis of the ATC hierarchy, exploring two additional levels (first 5 characters of the ATC codes) for medications with a minimum 5% prevalence ( Figure 3 , panel A). Download figure Open in new tab Figure 2: Overview of the modeling strategy Download figure Open in new tab Figure 3: Feature selection based on associations of medication families with the onset of AD, PD and dementia Panel A: Classes of medications (depth 3) significantly associated with AD, PD and dementia are shown. For each class, we also show subfamilies of treatments. Bars correspond to 95% CIs after correction for multiple comparisons. Panel B: Fine Gray models with considering death as a competing risk adjusted for age class, sex, Charlson Index and body mass index with 95% confidence interval. 2.6 Prediction models – training and validation We focused on predicting the first occurrence of each neurodegenerative disease over three periods, i.e., within 2, 5, and 10 years, using a logistic regression model ( Figure 2 ) trained for each time period. We used the above-mentioned exposure time-window regardless of the prediction task (e.g. 2, 5, or 10 years). Patients were considered diseased if they experienced the specified outcome during the diagnostic period, namely between 2011, January 1 st and 2015, December 31 st , when the prediction task was configured for a 5-year span ( figure 1 and 2). Patients who died during the prediction period were regarded as non-cases of the neurodegenerative outcome in the prediction task. Prediction phase For the prediction phase, we investigated two primary models: i) a model that incorporates sex, BMI, Charlson Comorbidity Index (CCI), and the medications chosen in the feature selection stage, as exposure parameters; ii) a model that incorporates the same parameters as the first model, but also includes ten specific disease diagnoses corresponding to ICD10 codes of interest that have been previously demonstrated to be significantly positively associated with AD in a prior study (see supplementary materials for the exact list). 20 Training and validation process We defined the training set as comprising 75% of the UK data, with the remaining 25% serving as the UK test set, and used a separate external French test set. A 5-fold cross-validation was performed on the training set to select the optimal method for subsampling the majority class. We report several performance metrics, including the area under the receiver operating characteristic curve (AUROC), the detection rate for a fixed false positive rate of 5%, and the false positive rate for a fixed detection rate of 50%. As a sensitivity analysis, we excluded patients with Mild Cognitive Impairment (MCI) (codes provided in the supplementary materials) during the exposure period. We calculated 95% confidence intervals (95% CIs) using 1000 bootstrap replications from the test data. The survival and cmprsk libraries in R were used for the survival model, while the sklearn package in Python version 3.6 was employed for the logistic regression model and model evaluation.Role of the funding source: The funder of the study had no role in study design, data collection, data analysis, data interpretation, or writing of the report. Results Table 1 presents the age and sex distributions of all participants, patients with AD, PD, or dementia, and those who died. The study included 99,131 UK participants (45,062 born in 1940; 54,069 born in 1945), eFigure 1. During follow-up, 1,810 patients were diagnosed with AD, 660 with PD, 3,061 with dementia, and 10,030 died. Women slightly outnumbered men, especially among AD and dementia patients, while men predominated in PD and deceased patients. The average diagnosis year was 2015 for PD and 2017 for AD and dementia. Patients diagnosed after 2011 took more medications and had more healthcare visits, particularly consuming more laxatives, antidiabetics, lipid-modifying agents, antidepressants, and anti-dementia drugs compared to the overall population. View this table: View inline View popup Download powerpoint Table 1: Characteristics of patients with Azheimer’s disease, Parkinson’s disease and dementia in the UK In total, 43 drug classes were analyzed in the UK database. Only three drug classes—laxatives (A06), urological drugs (G04), and antidepressants (N06)—were significantly associated with all three outcomes (AD, PD, and dementia) in the multivariate Fine and Gray model (eTable 1, Figure 3 , Panel B). Further analysis of drug subclasses revealed significant associations with the three diseases: contact laxatives (A06AB), osmotic laxatives (A06AD), drugs for urinary frequency and incontinence (G04BD), selective serotonin reuptake inhibitors (N06AB), and other antidepressants (N06AX) ( Figure 3 , Panel A). The logistic regression model trained on UK patients born in 1945 showed the strongest discriminatory power for predicting dementia at 5 years (AUROC 0.67, 95% CI 0.64-0.69), followed by AD (0.66, 0.63-0.69) and PD (0.58, 0.53-0.61). Prediction improved over shorter time spans, with AUROCs of 0.74 (AD), 0.73 (dementia), and 0.56 (PD) for 2-year predictions. At a 5% false-positive rate, the algorithm detected 38.4%, 26.4%, and 18.3% of dementia cases and 39.6%, 26.0%, and 17.2% of AD cases over 2, 5, and 10 years, respectively. Adding health conditions did not enhance performance (Model 2 in eTable 5). The model performed better for women in AD and dementia predictions, and for men in PD predictions ( Figure 4 , Table 2 , eTable6). Download figure Open in new tab Figure 4: Change in Area Under Curve (AUC) for the prediction algorithms by duration of prediction and before and after the exclusion of patients diagnosed with MCI View this table: View inline View popup Download powerpoint Table 2: Performance of the predictive algorithms to estimate 2, 5 and 10-year disease risk for AD, PD and dementia at the age of 65 years Performance is analyzed using the model trained on the entire training population, with training, validation, and test Area Under the Curve (AUC) computed, alongside detection rates at a false positive rate of 5% and false positive rates at a detection rate of 50%. Among UK patients, 1,587 were diagnosed with MCI between 2008 and 2010 ( Table 1 ), with 10.8% later diagnosed with Alzheimer’s and 15.9% with dementia post-2011, rates 5 to 6 times higher than the overall prevalence (1.83% for Alzheimer’s and 3.09% for dementia). Excluding these MCI patients reduced the model’s performance: AUROCs for AD dropped from 0.67 (0.61-0.75) to 0.62 (0.55-0.67), for dementia from 0.68 (0.63-0.74) to 0.64 (0.59-0.70), and for PD from 0.59 (0.52-0.67) to 0.56 (0.48-0.65) ( Figure 4 .b, eTable 7). Discussion Our study employs a methodology that integrates survival analysis and predictive modeling to investigate neurodegenerative disease outcomes using electronic health record (EHR) data. We found that we can reach high AUC for predicting dementias in the following 2 years, and to a lesser extent in the following 5 and 10 years, for individuals aged 65, using simple predictors. The strengths of this study lie in its utilization of routinely collected observational data, which allowed for the collection of a substantial number of exposures (ATC codes), not often feasible in randomized trials. Furthermore, by leveraging data from two distinct countries, we confirmed the robustness of our findings and enhanced the generalizability of our results. Prevention strategies for neurodegenerative diseases are crucial, with early identification of at-risk individuals being a key component. 12 However, conventional methods such as MRIs or PET scans for every individual over 65 are neither practical nor cost-effective. Various multifactorial risk prediction models have been developed to identify individuals at high risk of dementia, but their accuracy in same-aged populations remains limited. 6 Recent findings indicate that existing models miss a significant portion of participants with incident dementia. Age alone shows equal promise as a predictor, highlighting the need for improvement in existing risk scores. One avenue for enhancing prediction models involves leveraging machine learning on electronic health records. Despite current limitations, machine learning algorithms offer a cost-effective means of predicting dementia risk from longitudinal clinical records. Previous studies have attempted to predict dementia using electronic health records, but these efforts have been constrained to single countries and often rely heavily on age as a primary feature, 4 , 10 , 21 , 22 posing challenges in interpretation. When analyzing models incorporating age as a predictive feature, reported classification performance across all ages may not offer a clear assessment of effectiveness due to inflated specificity from straightforwardly classifying younger patients as not at risk. When used in a same-aged group the AUC for the current risk scores (CAIDE, BDSI, ANU-ADRI) drop near 0.5. 6 Additionally, while age often acts as a dominant predictor, other variables’ influences on predictions are typically more modest, when not included as cross-features with age, leading to an aggregated estimation of their impacts across all ages. In this perspective, we proposed a framework focused on predicting dementia onset at specific ages, notably at age 65, which aligns with the recommendation for preventive health check-ups in France. Our algorithm aims to serve as a pre-screening tool, identifying patients eligible for further diagnostic tests or interventions. Our prediction algorithm demonstrates promising performance in detecting incident dementia, particularly when calibrated to achieve a 5% false-positive rate. Specifically, at this threshold, the algorithm detects 38% of incident dementia cases in a same-aged population at 2 years, 26% at 5 years and 18% at 10 years. This outperforms several widely used multifactorial 10-year risk prediction models, including the CAIDE score, CAIDE-APOE score, BDSI, and ANU-ADRI, which are to a large extent driven by age and detect only 9% to 16% of incident dementia cases, according to a recent study, even if all approaches have to be compared on the same patients before concluding to the superiority of a given model. 6 Unlike those risk scores, our algorithm does not require clinical examination or responding to a questionnaire, highlighting its potential utility as a more cost-effective tool for identifying individuals at risk of dementia, thereby facilitating early intervention and management strategies. Our analysis reveals heterogeneity in the predictive performance of the algorithm across different neurodegenerative diseases. Specifically, the algorithm demonstrates stronger discriminatory power for predicting dementia compared to AD and PD. This discrepancy may stem from differences in the underlying pathophysiology, risk factors, and disease progression trajectories among these conditions. Further research is warranted to elucidate these differences and refine prediction models tailored to each specific neurodegenerative disease. Sex-specific differences in the progression and diagnosis of neurodegenerative diseases may influence the predictive accuracy of machine learning models. Women exhibit distinct hormonal and genetic risk factors for AD and dementia, which may enhance model predictions in women. 23 – 25 Conversely, PD presents differently in men, which may explain the better prediction accuracy for males due to unique clinical factors. 26 The decision to employ a logistic regression model for our prediction analysis was informed by recent research demonstrating its efficacy in leveraging electronic health records for predictive modeling. 10 Logistic regression allows also for straightforward interpretation of coefficients, facilitating the identification of significant predictors and their impact on disease risk. Its simplicity and transparency also makes it particularly appealing for clinical applications, where interpretability and ease of implementation are required. Depression is recognized as a potential risk factor for neurodegenerative diseases. 27 – 29 Previous studies have suggested a bidirectional relationship between depression and dementia, with depression increasing the risk of developing dementia and vice versa. 30 Our findings align with this notion, as antidepressants (ATC code N06) which includes all anti-depressant drugs, showed significant associations with neurodegenerative disease outcomes. Furthermore, medications related to urinary incontinence (ATC code G04) and laxatives (ATC code A06) also displayed significant associations with neurodegenerative diseases. These associations might stem from the dual roles of these medications: facilitating the onset of neurodegenerative disease directly and/or administered to address prodromal symptoms of these diseases, which is the case for both anticholinergics and laxatives.. 14 , 20 For instance, anticholinergic medications commonly prescribed for incontinence management have been implicated in the acceleration of dementia onset. 31 Prolonged use of these medications may disrupt cholinergic pathways in the brain, contributing to cognitive decline over time. 32 , 33 Similarly, laxatives, frequently utilized to alleviate constipation, which is an established prodromal factor of neurodegenerative diseases, have also raised concerns regarding their potential role in neurodegenerative diseases. 34 We observed sex differences in the predictive performance of the algorithm, with implications for risk stratification and intervention strategies. While the overall performance of the algorithm was robust, there were notable variations in prediction accuracy between male and female cohorts. These differences may reflect sex-specific risk factors, biological mechanisms, or healthcare-seeking behaviors that influence disease progression and detection. 25 Understanding these disparities is essential for developing personalized approaches to dementia prevention and management that account for sex-specific considerations. Our analysis also revealed temporal variations in the predictive performance of the algorithm, suggesting dynamic changes in disease risk profiles over time. Specifically, the algorithm exhibited varying accuracy levels when predicting neurodegenerative disease onset over different time spans, such as 2-year vs 5-year versus 10-year horizons. These findings underscore the importance of considering temporal dynamics in risk prediction modeling 25 and highlight the need for adaptable and responsive healthcare strategies that accommodate evolving disease trajectories and individual risk profiles. Excluding patients diagnosed with MCI from the analysis had a noticeable impact on the algorithm’s predictive performance. MCI is often considered a transitional stage between normal aging and dementia, serving as an early indicator of neurodegenerative disease risk. 35 Our findings suggest that incorporating MCI diagnoses improves prediction accuracy, emphasizing the importance of capturing subtle cognitive changes in early disease detection efforts. Further research into the predictive value of MCI and its implications for risk stratification is warranted to optimize dementia prevention and intervention strategies. 36 The strengths of this study lie in its utilization of real-world observational data, which allowed for the collection of a substantial number of exposures (ATC codes), not often feasible in randomized trials. Furthermore, by leveraging data from two distinct countries, we improved the robustness of our findings, enhancing the generalizability of our results. Despite its strengths, our approach has certain limitations. The feature selection process of ATC codes carries the potential to introduce bias, given that it was executed on the entire UK dataset for computational efficiency. This procedure could lead to data leakage; however, this impact appears to have been mitigated as we were able to replicate the results in an independent study population. Additionally, the selection of medications based on Fine-Gray models might inadvertently overlook medications with potential relevance. Importantly, primary care data are characterized by two inherent challenges: the absence of key predictive variables and the reliability of ground truth. Primary care data inherently lacks crucial confounding variables such as education level, ethnicity, socio-economic status, and genetics, which could potentially introduce bias in hazard ratio estimations. Adding these variables could also help improve the performance of the different algorithms in the future. Additionally, it is imperative to acknowledge the potential for uncertainty in disease diagnoses made by general practitioners. It might result in a later diagnosis than in prospective cohorts in which each patient follows a protocol with planned visits with a neurologist. Conclusion Leveraging machine learning on electronic health records presents a promising approach for the early identification of individuals at risk of neurodegenerative diseases. Our algorithm expands on the existing risk prediction models, reaching comparable predictive capacity and potentially providing a cost-effective and easily implementable solution for detecting incident dementia and other neurodegenerative diseases. We propose a new standard for training risk algorithms using electronic health records across multiple European countries and emphasize the importance of evaluating these algorithms without relying on age as a factor, particularly for screening purposes at specific ages. Data Availability The data used in the preparation of this Article are available from the Cegedim company upon reasonable request (info@the-health- improvement-network.co.uk). Conflict of interests YS, and BE are full time employees of Cegedim. All other authors declare no competing interests. Role of the funding source The funder of the study had no role in study design, data collection, data analysis, data interpretation, or writing of the report. Ethical approval and consent statement Ethical approval was obtained before carrying out this study (and for the use of datasets). The study was a retrospective analysis of secondary pseudo-anonymised patient data only and do not involve direct patient intervention. The study was approved by the THIN Scientific Research Committee (SRC; SRC reference 22-008-R2). There was no need for written informed consent from participants. No photographs, videos, or other information of recognizable persons is used in this article. Authorization for disclosure was, therefore, not necessary. For the UK database, data are only available to researchers carrying out approved medical research. Ethical approval was granted by the NHS South-East Multicentre Research Ethics Committee in 2003 (reference 03/01/073) for the establishment of the THIN database, and was updated in 2011. A further update was carried out and approval was granted in 2020 by the NHS South Central, Oxford C Research Ethics Committee (reference 20/SC/0011). For the French database, several audits were done by the Commission Nationale de l’Informatique et des Libertés, the French authority responsible for the protection of personal data. Data are available from GERS SAS for researchers who meet the criteria for access to confidential data. Contributors KZ, CM, SD, MK and TN conceived and designed the study. YS and BL extracted the data from the THIN database. KZ,CM,MA,TN analyzed the data and ensured data quality. KZ and CM generated the figures. All authors interpreted the results and contributed to the writing of the final version of the manuscript. Data sharing The data used in the preparation of this article are available from the Cegedim company upon reasonable request (info{at}the-health-improvement-network.co.uk). Acknowledgements The research leading to these results has received funding from the joint program in neurodegenerative diseases (JPND) ANR-21-JPW2-0002-01 (LeMeReND) and the program “Investissements d’avenir” ANR-10-IAIHU-06. This work was funded in part by the French government under management of Agence Nationale de la Recherche as part of the “Investissements d’avenir” program, reference ANR-19-P3IA-0001 (PRAIRIE 3IA Institute). PNS was supported by the Emil Aaltonen Foundation and the Finnish Medical Foundation. MK was supported by Wellcome Trust, UK (221854/Z/20/Z), National Institute on Aging (NIH), US (R01AG056477), Medical Research Council, UK (MR/R024227/1, MR/Y014154/1) and Academy of Finland (350426). BCD is supported by Inria, and a NHMRC CJ Martin fellowship (1161356). References 1. Deuschl G , Beghi E , Fazekas F , Varga T , Christoforidi KA , Sipido E , et al. The burden of neurological diseases in Europe: an analysis for the Global Burden of Disease Study 2017 . Lancet Public Health . 2020 ; 5 ( 10 ): e551lJ67 . OpenUrl 2. ↵ Sindi S , Calov E , Fokkens J , Ngandu T , Soininen H , Tuomilehto J , et al. The CAIDE dementia risk score app: the development of an evidence-based mobile application to predict the risk of dementia . Alzheimers Dement Diagn Assess Dis Monit . 2015 ; 1 ( 3 ): 328lJ33 . OpenUrl 3. ↵ Kivipelto M , Ngandu T , Laatikainen T , Winblad B , Soininen H , Tuomilehto J . Risk score for the prediction of dementia risk in 20 years among middle aged people: a longitudinal, population-based study . Lancet Neurol . 2006 ; 5 ( 9 ): 735lJ41 . OpenUrl 4. ↵ Barnes DE , Beiser AS , Lee A , Langa KM , Koyama A , Preis SR , et al. Development and validation of a brief dementia screening indicator for primary care . Alzheimers Dement . 2014 ; 10 ( 6 ): 656lJ65 . OpenUrl 5. ↵ Anstey KJ , Cherbuin N , Herath PM , Qiu C , Kuller LH , Lopez OL , et al. A self-report risk index to predict occurrence of dementia in three independent cohorts of older adults: the ANU-ADRI . PloS One . 2014 ; 9 ( 1 ): e86141 . OpenUrl CrossRef PubMed 6. ↵ Kivimäki M , Livingston G , Singh-Manoux A , Mars N , Lindbohm JV , Pentti J , et al. Estimating Dementia Risk Using Multifactorial Prediction Models . JAMA Netw Open . 2023 ; 6 ( 6 ): e2318132lJe2318132 . OpenUrl 7. ↵ Placido D , Yuan B , Hjaltelin JX , Zheng C , Haue AD , Chmura PJ , et al. A deep learning algorithm to predict risk of pancreatic cancer from disease trajectories . Nat Med . 2023 ; 1lJ10 . 8. ↵ Hippisley-Cox J , Coupland C , Vinogradova Y , Robson J , May M , Brindle P . Derivation and validation of QRISK, a new cardiovascular disease risk score for the United Kingdom: prospective open cohort study . Bmj . 2007 ; 335 ( 7611 ): 136 . OpenUrl Abstract / FREE Full Text 9. ↵ Fukunishi H , Nishiyama M , Luo Y , Kubo M , Kobayashi Y . Alzheimer-type dementia prediction by sparse logistic regression using claim data . Comput Methods Programs Biomed . 2020 ; 196 : 105582 . OpenUrl PubMed 10. ↵ Reinke C , Doblhammer G , Schmid M , Welchowski T . Dementia risk predictions from German claims data using methods of machine learning . Alzheimers Dement . 2023 ; 19 ( 2 ): 477lJ86 . OpenUrl 11. ↵ Park YH , Suh JH , Kim YW , Kang DR , Shin J , Yang SN , et al. Machine learning based risk prediction for Parkinson’s disease with nationwide health screening data . Sci Rep . 2022 ; 12 ( 1 ): 19499 . OpenUrl PubMed 12. ↵ Frisoni GB , Molinuevo JL , Altomare D , Carrera E , Barkhof F , Berkhof J , et al. Precision prevention of Alzheimer’s and other dementias: Anticipating future needs in the control of risk factors and implementation of disease-modifying therapies . Alzheimers Dement . 2020 ; 16 ( 10 ): 1457lJ68 . OpenUrl 13. ↵ Blak BT , Thompson M , Dattani H , Bourke A . Generalisability of The Health Improvement Network (THIN) database: demographics, chronic disease prevalence and mortality rates . Inform Prim Care . 2011 ; 19 ( 4 ). 14. ↵ Schrag A , Horsfall L , Walters K , Noyce A , Petersen I . Prediagnostic presentations of Parkinson’s disease in primary care: a case-control study . Lancet Neurol . 2015 ; 14 ( 1 ): 57lJ64 . OpenUrl 15. ↵ Iordache L , Strazzulla A , Lekens B , Bergmann J , Diamantis S . Évaluation du suivi des recommandations de prise en charge des infections urinaires en médecine de ville en France . Médecine Mal Infect . 2018 ; 48 ( 4 ): S101lJ2 . OpenUrl 16. ↵ Langley TE , Szatkowski L , Gibson J , Huang Y , McNeill A , Coleman T , et al. Validation of The Health Improvement Network (THIN) primary care database for monitoring prescriptions for smoking cessation medications . Pharmacoepidemiol Drug Saf . 2010 ; 19 ( 6 ): 586lJ90 . OpenUrl 17. Lewis JD , Schinnar R , Bilker WB , Wang X , Strom BL . Validation studies of the health improvement network (THIN) database for pharmacoepidemiology research . Pharmacoepidemiol Drug Saf . 2007 ; 16 ( 4 ): 393lJ401 . OpenUrl 18. ↵ Seminara N , Abuabara K , Shin D , Langan S , Kimmel S , Margolis D , et al. Validity of The Health Improvement Network (THIN) for the study of psoriasis . Br J Dermatol . 2011 ; 164 ( 3 ): 602lJ9 . OpenUrl 19. Quan H , Li B , Couris CM , Fushimi K , Graham P , Hider P , et al. Updating and validating the Charlson comorbidity index and score for risk adjustment in hospital discharge abstracts using data from 6 countries . Am J Epidemiol . 2011 ; 173 ( 6 ): 676lJ82 . OpenUrl 20. ↵ Nedelec T , Couvy-Duchesne B , Monnet F , Daly T , Ansart M , Gantzer L , et al. Identifying health conditions associated with Alzheimer’s disease up to 15 years before diagnosis: an agnostic study of French and British health records . Lancet Digit Health . 2022 ; 4 ( 3 ): e169lJ78 . OpenUrl 21. ↵ McCoy Jr TH , Han L , Pellegrini AM , Tanzi RE , Berretta S , Perlis RH . Stratifying risk for dementia onset using large-scale electronic health record data: a retrospective cohort study . Alzheimers Dement . 2019 ; 22. ↵ Halpern R , Seare J , Tong J , Hartry A , Olaoye A , Aigbogun MS . Using electronic health records to estimate the prevalence of agitation in Alzheimer disease/dementia . Int J Geriatr Psychiatry . 2019 ; 34 ( 3 ): 420lJ31 . OpenUrl 23. ↵ Podcasy JL , Epperson CN . Considering sex and gender in Alzheimer disease and other dementias . Dialogues Clin Neurosci . 2016 ; 18 ( 4 ): 437lJ46 . OpenUrl 24. Ferretti MT , others. Sex differences in Alzheimer disease—the gateway to precision medicine . Nat Rev Neurol . 2018 ; 14 ( 8 ): 457lJ69 . OpenUrl 25. ↵ Tang AS , Rankin KP , Cerono G , Miramontes S , Mills H , Roger J , et al. Leveraging electronic health records and knowledge networks for Alzheimer’s disease prediction and sex-specific biological insights . Nat Aging . 2024 ; 1lJ17 . 26. ↵ Cerri S , Mus L , Blandini F . Parkinson’s disease in women and men: what’s the difference? J Park Dis . 2019 ; 9 ( 3 ): 501lJ15 . OpenUrl 27. ↵ Livingston G , Huntley J , Sommerlad A , Ames D , Ballard C , Banerjee S , et al. Dementia prevention, intervention, and care: 2020 report of the Lancet Commission . The Lancet . 2020 ; 396 ( 10248 ): 413lJ46 . OpenUrl 28. Byers AL , Yaffe K . Depression and risk of developing dementia . Nat Rev Neurol . 2011 ; 7 ( 6 ): 323lJ31 . OpenUrl 29. ↵ Schweitzer I , Tuckwell V , O’Brien J , Ames D . Is late onset depression a prodrome to dementia? Int J Geriatr Psychiatry . 2002 ; 17 ( 11 ): 997lJ1005 . OpenUrl 30. ↵ Singh-Manoux A , Dugravot A , Fournier A , Abell J , Ebmeier K , Kivimäki M , et al. Trajectories of depressive symptoms before diagnosis of dementia: a 28-year follow-up study . JAMA Psychiatry . 2017 ; 74 ( 7 ): 712lJ8 . OpenUrl 31. ↵ Cai X , Campbell N , Khan B , Callahan C , Boustani M . Long-term anticholinergic use and the aging brain . Alzheimers Dement . 2013 ; 9 ( 4 ): 377lJ85 . OpenUrl 32. ↵ Gray SL , Anderson ML , Dublin S , Hanlon JT , Hubbard R , Walker R , et al. Cumulative use of strong anticholinergics and incident dementia: a prospective cohort study . JAMA Intern Med . 2015 ; 175 ( 3 ): 401lJ7 . OpenUrl 33. ↵ Richardson K , Fox C , Maidment I , Steel N , Loke YK , Arthur A , et al. Anticholinergic drugs and risk of dementia: case-control study . bmj . 2018 ; 361 . 34. ↵ Yang Z , Wei C , Li X , Yuan J , Gao X , Li B , et al. Association between regular laxative use and incident dementia in UK Biobank participants . Neurology . 2023 ; 100 ( 16 ): e1702lJ11 . OpenUrl 35. ↵ Dickerson BC , Sperling RA , Hyman BT , Albert MS , Blacker D . Clinical prediction of Alzheimer disease dementia across the spectrum of mild cognitive impairment . Arch Gen Psychiatry . 2007 ; 64 ( 12 ): 1443lJ50 . OpenUrl 36. ↵ Ansart M , Epelbaum S , Bassignana G , Bône A , Bottani S , Cattai T , et al. Predicting the progression of mild cognitive impairment using machine learning: a systematic, quantitative and critical review . Med Image Anal . 2021 ; 67 : 101848 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted January 25, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Machine learning prediction algorithms for 2- , 5- and 10-year risk of Alzheimer’s, Parkinson’s and dementia at age 65: a study using medical records from France and the UK General Practitioners Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Machine learning prediction algorithms for 2- , 5- and 10-year risk of Alzheimer’s, Parkinson’s and dementia at age 65: a study using medical records from France and the UK General Practitioners Thomas Nedelec , Karim Zaidi , Charlotte Montaud , Octave Guinebretiere , Pyry Sipilä , Dang Wei , Fen Yang , Anna Freydenzon , Antoine Belloir , Nemo Fournier , Nadine Hamieh , Beranger Lekens , Yanis Slaouti , Allan McRae , Baptiste Couvy-Duchesne , Yulin Hswen , Fang Fang , Mika Kivimäki , Manon Ansart , Stanley Durrleman medRxiv 2025.01.22.25320969; doi: https://doi.org/10.1101/2025.01.22.25320969 Share This Article: Copy Citation Tools Machine learning prediction algorithms for 2- , 5- and 10-year risk of Alzheimer’s, Parkinson’s and dementia at age 65: a study using medical records from France and the UK General Practitioners Thomas Nedelec , Karim Zaidi , Charlotte Montaud , Octave Guinebretiere , Pyry Sipilä , Dang Wei , Fen Yang , Anna Freydenzon , Antoine Belloir , Nemo Fournier , Nadine Hamieh , Beranger Lekens , Yanis Slaouti , Allan McRae , Baptiste Couvy-Duchesne , Yulin Hswen , Fang Fang , Mika Kivimäki , Manon Ansart , Stanley Durrleman medRxiv 2025.01.22.25320969; doi: https://doi.org/10.1101/2025.01.22.25320969 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neurology Subject Areas All Articles Addiction Medicine (569) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1511) Epidemiology (15230) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6610) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15923) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6607) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3338) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary Care Research (712) Psychiatry and Clinical Psychology (5448) Public and Global Health (9239) Radiology and Imaging (2202) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a01e2ef618028e2e',t:'MTc3OTgxMzIyNQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.