Full text
42,694 characters
· extracted from
preprint-html
· click to expand
Machine learning models for the prediction of COVID-19 prognosis in the primary health care setting | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Machine learning models for the prediction of COVID-19 prognosis in the primary health care setting Joan Barrot , View ORCID Profile Joan A. Caylà , View ORCID Profile Manel Mata-Cases , Jordi Real , View ORCID Profile Bogdan Vlacho , View ORCID Profile Josep Franch-Nadal , Didac Mauricio , the COVID-19 Working Group in Primary Health Care doi: https://doi.org/10.1101/2025.05.08.25327245 Joan Barrot 1 ABS Jordi Nadal, Salt. 
Institut Català de la Salut Girona, Departament de Salut , Generalitat de Catalunya 2 DAP-Cat group, Fundació Institut Universitari per a la recerca a l’Atenció Primària de Salut Jordi Gol i Gurina (IDIAPJGol) , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site Joan A. Caylà 3 Barcelona Tuberculosis Research Unit Foundation. Barcelona , Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joan A. Caylà Manel Mata-Cases 2 DAP-Cat group, Fundació Institut Universitari per a la recerca a l’Atenció Primària de Salut Jordi Gol i Gurina (IDIAPJGol) , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Manel Mata-Cases Jordi Real 2 DAP-Cat group, Fundació Institut Universitari per a la recerca a l’Atenció Primària de Salut Jordi Gol i Gurina (IDIAPJGol) , Barcelona, Spain 4 Digital Health Validation Center, Hospital de la Santa Creu i Sant Pau, Sant Pau Campus Salut Barcelona , Barcelona, Catalonia, Spain 5 Institut d’Investigació Biomèdica Sant Pau (IIB Sant Pau) , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site Bogdan Vlacho 2 DAP-Cat group, Fundació Institut Universitari per a la recerca a l’Atenció Primària de Salut Jordi Gol i Gurina (IDIAPJGol) , Barcelona, Spain 5 Institut d’Investigació Biomèdica Sant Pau (IIB Sant Pau) , Barcelona, Spain 6 CIBER of Diabetes and Associated Metabolic Diseases (CIBERDEM), Instituto de Salud Carlos III (ISCIII) , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Bogdan Vlacho For correspondence: bogdan.vlacho{at}gmail.com josep.franch{at}gmail.com Josep Franch-Nadal 2 DAP-Cat group, Fundació Institut Universitari per a la recerca a l’Atenció Primària de Salut Jordi Gol i Gurina 
(IDIAPJGol) , Barcelona, Spain 6 CIBER of Diabetes and Associated Metabolic Diseases (CIBERDEM), Instituto de Salud Carlos III (ISCIII) , Barcelona, Spain 7 Department of Endocrinology & Nutrition, Hospital de la Santa Creu i Sant Pau , Barcelona, Spain 8 Primary Health Care Center Raval Sud, Gerència d’Atenció Primària Barcelona Ciutat, Institut Català de la Salut , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Josep Franch-Nadal For correspondence: bogdan.vlacho{at}gmail.com josep.franch{at}gmail.com Didac Mauricio 2 DAP-Cat group, Fundació Institut Universitari per a la recerca a l’Atenció Primària de Salut Jordi Gol i Gurina (IDIAPJGol) , Barcelona, Spain 5 Institut d’Investigació Biomèdica Sant Pau (IIB Sant Pau) , Barcelona, Spain 6 CIBER of Diabetes and Associated Metabolic Diseases (CIBERDEM), Instituto de Salud Carlos III (ISCIII) , Barcelona, Spain 9 Department of Medicine, University of Barcelona – Spain 10 Department of Medicine, University of Vic - Central University of Catalonia , Vic, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Preview PDF Abstract text Objective This study aimed to identify prognostic factors associated with poor outcomes of COVID-19 at diagnosis in Primary Health Care (PHC). Methods We conducted a retrospective, longitudinal study using the SIDIAP database, part of the PHC Information System of Catalonia. The analysis included COVID-19 cases diagnosed in patients aged 18 and older from March 2020 to September 2022. Follow-up was conducted for 90 days post-diagnosis or until death. Various machine learning models of differing complexities were used to predict short-term events, including mortality and hospital complications. 
Each model was tailored to maximize the predictive accuracy for poor outcomes, exploring algorithms such as Generalized Linear Models, flexible GLMs with Lasso, Gradient Boosting Models, and Support Vector Machines, with the model demonstrating the highest Area Under the Curve (AUC) selected for optimal performance. Results A total of 2,162,187 COVID-19 cases were identified across five epidemic waves. Key predictors of short-term complications included age and the epidemic wave. Additional significant factors encompassed social deprivation (MEDEA), blood pressure, cardiovascular history, chronic obstructive pulmonary disease (COPD), obesity, and diabetes mellitus. The models exhibited high performance, with AUC values ranging from 0.73 to 0.95. A web application was developed to estimate the risk of adverse outcomes based on individual patient profiles ( https://dapcat.shinyapps.io/CovidScore ). Conclusions In addition to age and epidemic wave, predictors such as social deprivation, diabetes mellitus, obesity, COPD, cardiovascular disease, high blood pressure, and dyslipidemia significantly indicate poor prognosis in COVID-19 patients diagnosed in PHC, and the developed application facilitates risk quantification for individual patients. Introduction The COVID-19 pandemic has posed unprecedented challenges to healthcare systems globally. Since its identification in December 2019 in Wuhan, China, this virus has spread rapidly, affecting millions and causing a global health crisis [ 1 ]. By December 2022, more than 704 million cases and more than 6.7 million deaths had been reported [ 2 ]. In this context, the best strategy to control this pandemic has been discussed (Zero COVID-19 vs. mitigation) [ 3 ]. However, in most healthcare settings, primary healthcare (PHC) has played a crucial role in the response to the pandemic [ 4 ]. 
In Europe, the first cases of COVID-19 were detected in January 2020, reaching a peak in this first wave on March 31, when Spain had a cumulative incidence of 178.03/1 million inhabitants, followed by Italy with 92.66 and Austria with 61.14 [ 2 ]. The first cases in Spain were detected in February 2020 [ 5 ], and a state of alarm was declared on March 15, 2020, which lasted until June 21, 2020. During this period, the Ministry of Health was in charge of the protocols and organization of healthcare related to COVID-19 nationwide [ 6 ]. PHC was instrumental in detecting, monitoring, and treating patients and their contacts worldwide. The role of PHC varied significantly between countries, and the different health system models influenced it. There were two main challenges for PHC: 1) to clearly define its role in providing an effective contribution to the prevention, diagnosis, and treatment of COVID-19 and, thus, reorganize accordingly; 2) to identify new ways of providing regular health services and maintaining the quality of care for non-COVID-19 patients [ 7 ]. The evolution of COVID-19 has shown considerable variability in the risk of severe complications and mortality, depending on the individual characteristics of the patients [ 8 ]. Known risk factors include high blood pressure, diabetes mellitus, chronic lung diseases, and obesity. Other prognostic factors may vary depending on the epidemic’s different phases and the population’s vaccination status [ 8 , 9 ]. Some factors are present during diagnosis, while others can be identified through complementary analysis or tests. Guidance to support general practitioners in managing future waves of COVID-19 or other health emergencies should be tailored to general practice from the outset. Establishing risk factors associated with severity and prognosis in the early stages of the disease is important to identify patients who need specialized care as a priority. 
For this reason, creating new clinical tools to improve health decisions and outcomes in the population is essential. Our study aimed to develop machine learning models to identify prognostic factors and predict the outcome in subjects with COVID-19 during diagnosis in PHC settings. Material and Methods General Design We conducted a retrospective cohort study using data from the PHC Information System of Catalonia (SIDIAP) database, which includes the healthcare data from individuals from all PHC centers and hospitals within the Catalan Health System (CatSalut), the main healthcare provider in Catalonia, Northeast Spain. The study was conducted according to the guidelines of the Declaration of Helsinki, and approved by the Institutional Review Board (or Ethics Committee) of IDIAP Jordi Gol i Gurina Foundation (code 21/271-PCV). Data were accessed for research purposes on June 30th 2021. Authors did not have access to information that could identify individual participants during or after data collection. Study selection criteria During the observational period (the initial two and a half years of the pandemic, 03/2020-09/2022), we included all individuals in the database with a positive COVID-19 diagnostic test or diagnostic code (ICD-10: B34.2; B97.2; B97.21; B97.29; J12.81; J12.89; U07.1; Z20.828; J12.89; J20.8; J22; J40; J80; J98.8). We only selected subjects aged 18 years or older with at least one year of data as a registered user in the SIDIAP database. During the study, each episode of COVID-19 infection in the same patient was considered as an independent COVID-19 case. Study variables Variables available in PHC at the time of identification and/or diagnosis of each COVID-19 patient were evaluated as potential predictors. 
These included sociodemographic factors: age, sex, the MEDEA deprivation index (Mortality in small Spanish areas and Socioeconomic and Environmental Inequalities) [ 10 ], toxic habits (e.g., smoking), vaccination status, and the infection period (pandemic wave). Clinically relevant medical conditions were identified by the presence of ICD-10 diagnostic codes for diabetes mellitus, cardiovascular disease, obesity, ischemic heart disease, stroke, heart failure, peripheral arterial disease, chronic kidney disease (CKD), hypertension, dyslipidemia, diabetic retinopathy, diabetic neuropathy, dementia, asthma, COPD, obstructive sleep apnea syndrome and/or use of CPAP, deep vein thrombosis, HIV and malignancies. We collected the following clinical variables closest to the inclusion date, within a window of one year prior to inclusion: HbA1c, body mass index (BMI), blood pressure, lipid profile, aspartate transaminase (AST) and alanine transaminase (also known as alanine aminotransferase) (ALT), estimated glomerular filtration rate (eGFR), urine albumin to creatinine ratio (ACR), C-reactive protein (CRP), leukocytes, leukocyte formula, hemoglobin, platelets, lactate dehydrogenase (LDH), and ferritin levels. Study events The follow-up period was defined as the time from the date of COVID-19 diagnosis to a maximum of 90 days after inclusion or until death. We assessed the following events: death from any cause, hospitalization, admission to the intensive care unit (ICU), and complications attributed to COVID-19, including the need for mechanical ventilation, as well as respiratory, neurological, thrombotic, and cardiovascular complications. Statistical analysis A descriptive analysis was performed on the 2,162,187 identified COVID-19 cases across five epidemic waves. To optimize the modeling process and avoid unnecessary use of computational resources, a random sample of 100,000 cases was selected from the initial dataset for training and testing. 
The learning curves for the selected models were generated [ 11 ], showing that the AUC for the test and training samples stabilized and converged after approximately 40,000 cases (see learning curves in Supplementary Figure 1). The model construction process included: data preprocessing, selection of training (75%) and validation (25%) samples, and selection of the machine learning (ML) algorithm with the best performance among ML types of different levels of complexity. The types of algorithms tested included generalized linear models (GLM), flexible GLM with Lasso (elastic net regularization), gradient boosting models (GBM), and support vector machine (SVM) models. The model chosen was the one that demonstrated the best performance, based on the AUC criteria and following the parsimony principle by selecting only the most important features for each model. The process also involved adjusting, validating, optimizing, and selecting the optimal model, followed by its implementation within an integrated application. The initial variables included for the models followed semi-agnostic criteria, prioritizing those with known clinical associations with the outcomes under study. All models were designed to include sociodemographic variables such as age, sex, and epidemic wave. For continuous variables with potential missing values (excluding age), quantile-based categorization strategies that included a ‘missing’ category were used to account for incomplete data. Finally, the selected model was implemented in a Shiny-based APP ( https://dapcat.shinyapps.io/CovidScore ), where the risk of each event was calculated for each associated model, with graphical representations of risk levels and the importance of each variable. The entire process of data management, statistical analysis, and model construction was carried out using the R Core Team (2022) software [ 13 ], along with the shiny R package [ 12 ] for the implementation of the models. 
The XAI (Explainable AI) methodology was applied to improve the transparency and interpretability of the machine learning models using the DALEX (v2.4.3) R package [ 14 ]. This package offers graphical tools that provide model-agnostic explainers for exploring and understanding complex predictive models, such as neural networks and ensembles. These explainers facilitate the decomposition of predictions, performance assessment, and model comparison, thereby aiding in the validation and interpretation of predictive ‘black boxes’. The main R packages utilized for the modeling were stats [ 15 ], glmnet [ 16 ], gbm [ 17 ], e1071 [ 18 ], and caret [ 11 ]. Results Evolution of COVID-19 cases and epidemic waves A total of 2,162,187 cases of COVID-19 were identified between March 2020 and September 2022 in 5 epidemic waves. A total of 242,692 (11.2%) cases were diagnosed in the first wave, 736,020 (34.0%) in the second, 329,230 (15.2%) in the third, 199,606 (9.2%) in the fourth, and 654,639 (30.3%) in the fifth ( Figure 1 ). Download figure Open in new tab Figure 1. Graphical representation of the evolution of COVID-19 cases in five epidemic waves in Catalonia from March 2020 to September 2022. Characteristics of COVID-19 cases The study population had a mean age of 46.5 years (standard deviation [SD] = 18.5), and 56.5% were female. Among the participants, 24.6% were smokers, and 32.8% had received at least one dose of a COVID-19 vaccine. The proportion of vaccinated individuals increased across successive waves. Additional sociodemographic characteristics and clinical variables, overall and by wave, are detailed in Table 1 . 
View this table: View inline View popup Table 1: Characteristics of the population in each epidemic wave Incidence of complications post COVID-19 The descriptive analysis of incidence rates per event, particularly in the context of waves, shows a progressive reduction in case fatality rate (CFR) (overall and hospital), hospital admission, and different complications over time ( Table 2 ). The highest incidence rate was for respiratory complications, at 7.4% (excluding composite endpoint non-fatal complications), while the lowest was for thrombotic complications, affecting only 0.15% of the population. View this table: View inline View popup Download powerpoint Table 2: Complications for each epidemic wave from March 2020 to September 2022. Predictors according to the selected model The predictor variables selected for different events and the type of model are summarized in Supplementary Table 1 . The models showed high performance, with the AUC ranging from 0.73 to 0.95. We found that the predictors for short-term complications with greater weight common to all models were age and epidemic wave. Other common predictors in our analysis were the deprivation index (MEDEA), blood pressure, presence of cardiovascular disease, COPD, diabetes, obesity, or chronic kidney disease. The mortality model had the highest performance with an AUC of 0.95, while the respiratory model had the lowest at 0.73. This and other performance metrics for each event model can be seen in Supplementary Table 2 . Implementation of models The models integrated into the APP-Web model can be accessed through this link: ( https://dapcat.shinyapps.io/CovidScore ). Based on a patient’s profile, this app estimates the risk for each endpoint (e.g., mortality, hospital admission, cardiovascular complications, etc., as shown in Supplementary Figure 2 ). 
Supplementary Figure 2 presents the app’s main screen, illustrating an example patient profile and the corresponding output, which includes a bar plot showing the risk estimations for each endpoint. Additionally, the app visually displays the breakdown profile, overall model performance, feature importance, and the local performance of each model for individual patient profiles. Breakdown profile For each risk estimate (e.g., mortality, hospital admission, cardiovascular complications, etc.), the app provides details of the contribution of each factor to the final probability calculation (see Breakdown profile in https://dapcat.shinyapps.io/CovidScore , with an example for the CV outcome shown in Supplementary Figure 3 ). The breakdown profile plot decomposes the estimated risk for each prediction. For example, for the risk estimate of cardiovascular complications, the plot provides detailed insights into the contribution of each factor to the final probability calculation. For a 97-year-old male with chronic kidney disease, being 97 years old increases the risk by 9.5%, being male increases the risk by 1.7%, and the presence of diabetes increases the risk by 4.2%, contributing to the final estimated risk of 34%. Feature-importance and ROC curve by model In Supplementary Figure 4 , predictors for the mortality model are displayed and ranked according to their relative importance (feature importance) as determined by the model, along with the ROC curve comparing the training sample to the test sample. In the mortality model, the most important features were the COVID wave, sex, and neoplasia condition. The ROC curve showed good discrimination in both the test and training samples. The results of feature importance and ROC curves for the other endpoints can be seen in the web app. 
Discussion This retrospective study analyzed over 2 million COVID-19 cases from March 2020 to September 2022 across multiple COVID-19 waves, with follow-up 90 days post-diagnosis or until death. Our study underscores the significant impact of COVID-19 on PHC in the initial years of this pandemic in Catalonia. Furthermore, using machine learning models (GLMs, Lasso, Gradient Boosting, SVMs), we identified key predictors of poor outcomes, such as age, social deprivation (MEDEA), blood pressure, and a history of either diabetes, COPD, cardiovascular disease, or obesity. The models showed strong predictive accuracy (AUC: 0.73–0.95). Finally, using these models, an interactive web app was developed for personalized risk estimation ( https://dapcat.shinyapps.io/CovidScore ). We found that the CFR was highest during the first wave of the pandemic, gradually decreasing in subsequent waves, with the second wave showing the highest incidence of cases. The decrease in the CFR may be attributed to a combination of increased immunity (due to vaccination and SARS-CoV-2 infections), better identification of more severe cases, and the lower pathogenicity of recent variants like Omicron. Overall, these findings suggest changes in the SARS-CoV-2 virus, an adaptive response in healthcare, and improvements in the prevention (i.e., via vaccination) and treatment of complications as the pandemic progressed [ 19 ]. Regarding predictors of poor prognosis at the time of COVID-19 diagnosis in PHC, we identified older age, epidemic wave, social deprivation, and a history of diabetes, obesity, chronic obstructive pulmonary disease (COPD), cardiovascular disease, hypertension, and dyslipidemia. While study results vary across different populations, numerous studies have identified advanced age and comorbidities such as hypertension, cardiovascular disease, COPD, and diabetes as predictors of increased COVID-19 severity [ 20 – 22 ]. 
Additionally, social deprivation indicators, such as the MEDEA index, have been associated with poorer outcomes in terms of severity and mortality, underscoring the multifactorial nature of COVID-19 outcomes. Understanding and addressing these predictive factors is crucial for improving management and outcomes in affected patients. Conditions associated with low-grade chronic inflammation, such as obesity and diabetes mellitus, are also relevant at the metabolic level [ 23 ]. Several systematic reviews have provided consistent evidence that diabetes and obesity are associated with poorer COVID-19 outcomes, which agrees with our study [ 24 ]. Although the reasons for this association are not entirely clear, these conditions could exacerbate respiratory problems and/or affect immune responses. A systematic review and meta-analyses on high-risk phenotypes in people with diabetes determined that individuals with a more severe course of diabetes and pre-existing comorbidities had a poorer prognosis of COVID-19 than individuals with a milder course of the disease, highlighting the need for individualized and proactive management strategies for high-risk patients [ 25 ]. At the hospital level, predictive models like the ISARIC 4C have been developed to anticipate clinical deterioration (including mortality, ICU admission, or intubation), assessing age, gender, comorbidities, and nosocomial infection [ 26 ]. A similar study in the United Kingdom, utilizing computerized PHC medical records, developed predictive algorithms for COVID-19 mortality and hospital admission risk. Factors such as age, body weight, ethnicity, and social risk explained 73% of COVID-19 deaths and 58% of hospital admissions, suggesting periodic recalibration of these models to reflect the evolving nature of the pandemic [ 27 ]. 
An early pandemic study on PHC identified key risk factors for ICU admission and mortality, including advanced age, male gender, autoimmune disease, bilateral pulmonary infiltrates, and elevated LDH, D-dimer, and C-reactive protein. Protective factors included myalgias, arthralgias, and anosmia [ 28 ]. Recent advances in AI and machine learning have significantly contributed to managing the COVID-19 pandemic by aiding in detection, treatment, mortality prediction, and infection modeling to reduce virus spread [ 29 – 31 ]. In this study, we used machine learning to develop predictive models and then used these models to develop an app. The app provides comprehensive information to estimate the risk of COVID-19 prognosis outcomes for individuals based on their risk factors (e.g., age, sex, comorbidities, vaccination status, COVID-19 wave). This approach could be used in PHC to identify individuals needing closer monitoring and interventions to prevent serious complications and hospitalization. Our study has limitations inherent to its retrospective design. Outcomes depend on the quality of existing clinical records not specifically collected for this research and have yet to undergo individual validation. A notable limitation is the potential impact of vaccine implementation on epidemiology and prognosis, which underscores the need to recalibrate predictive models with post-vaccination data to maintain accuracy. Periodic updates with the latest available data are essential to ensure the continued relevance of these models. Although no predictive model is perfect for COVID-19 patients, our models serve as valuable tools to estimate the risk of complications, helping to identify patients who require closer monitoring. However, the accuracy of these models can be affected by variations in the detection and recording of symptoms and risk factors by different healthcare professionals. 
Moreover, mild COVID-19 cases may have gone unrecorded in PHC, potentially leading to their exclusion from our study population. However, the study benefits from using the SIDIAP database, which includes a substantial patient cohort and is a well-validated source for epidemiological and pharmaco-epidemiological studies within the Catalan primary care setting. This database not only provides standardized clinical data (including health issues, physical exams, lab results, and medication records) from pseudo-anonymized electronic health records but was also specifically updated to include COVID-19-related variables (such as diagnostic tests and procedures), enabling researchers to conduct targeted epidemiological studies. Conclusions This study highlights the importance of different prognostic factors, including age, epidemic wave, and comorbidities, in assessing the risk of mortality and complications in patients with COVID-19 treated in PHC. Identifying these predictors is crucial to optimizing medical care and highlights the need for further research and recalibration of predictive models as epidemiological circumstances evolve, such as vaccination. Integrating these models into clinical practice will enable healthcare professionals to make informed and personalized decisions, thereby improving outcomes for patients affected by COVID-19. The developed application allows a fast risk quantification for each patient seen in PHC centers. Statements Funding statement This study was funded by the Fondo de Investigaciones Sanitarias (FIS), Instituto de Salud Carlos III (Spain), under project number [PI21/01318]. Use of artificial intelligence tools None declared. Data availability The data analyzed in this study is subject to the following licenses/restrictions: restrictions apply to the availability of some, or all data generated or analyzed during this study because they were used under license. 
The corresponding author will, on request, detail the restrictions and any conditions under which access to some data may be provided. Requests to access these datasets should be directed to Dr Bogdan Vlacho PharmD, MSc, PhD, at bogdan.vlacho{at}gmail.com . Preprint None Conflict of interest None declared. Authors’ contributions J.B; J.A.C, M.M-C, D. M, J. F-N and J.R conceptualized and designed the study ; J.R conducted statistical analysis and data management; B. V contributed to data acquisition ; J.B; J.A.C, D. M, J. F-N, B.V and J.R edited and cross-reviewed the manuscript. All authors approved the final version of the manuscript. J.B and J.A.C, contributed equally to this work and share first authorship. References 1. ↵ WHO . COVID-19 Public Health Emergency of International Concern (PHEIC) Global research and innovation forum . Available from: https://www.who.int/publications/m/item/covid-19-public-health-emergency-of-internationalconcern-(pheic)-global-research-and-innovation-forum 2. ↵ OurWorld in Data . Confirmed deaths and cases: our data source . Available from: https://ourworldindata.org/covid-cases 3. ↵ Oliu-Barton M , Pradelski BSR , Aghion P , Artus P , Kickbusch I , Lazarus JV , et al. SARS-CoV-2 elimination, not mitigation, creates best outcomes for health, the economy, and civil liberties . Lancet . 2021 Jun 12; 397 ( 10291 ): 2234 – 2236 . OpenUrl PubMed 4. ↵ Guisado-Clavero M , Ares-Blanco S , Serafini A , Del Rio LR , Larrondo IG , Fitzgerald L , et al. The role of primary health care in long-term care facilities during the COVID-19 pandemic in 30 European countries: a retrospective descriptive study (Eurodata study) . Prim Health Care Res Dev . 2023 October 24; 24 : e60 . OpenUrl PubMed 5. ↵ Pérez-Gómez B , Pastor-Barriuso R , Fernández-de-Larrea N , Hernán MA , Pérez-Olmeda M , Oteo-Iglesias J , et al. SARS-CoV-2 Infection During the First and Second Pandemic Waves in Spain: the ENE-COVID Study . Am J Public Health . 
2023 May ; 113 ( 5 ): 533 – 544 . OpenUrl CrossRef PubMed 6. ↵ Guisado-Clavero M , Ares-Blanco S , Serafini A , Del Rio LR , Larrondo IG , Fitzgerald L , et al. The role of primary health care in long-term care facilities during the COVID-19 pandemic in 30 European countries: a retrospective descriptive study (Eurodata study) . Prim Health Care Res Dev . 2023 October 24; 24 : e60 . OpenUrl PubMed 7. ↵ Pilbeam C , Edwards G , Tonkin-Crine S , Raymond M , Van Hecke O , Gobat N . Primary care preparedness for the SARS-CoV-2 pandemic: a survey of NHS GPs . Fam Pract . 2022 May 28; 39 ( 3 ): 332 – 339 . OpenUrl PubMed 8. ↵ Rosero PA , Realpe JS , Farinango CD , Restrepo DS , Salazar-Cabrera R , Lopez DM . Risk Factors for COVID-19: A Systematic Mapping Study . Stud Health Technol Inform . 2022 Nov 3; 299 : 63 – 74 . OpenUrl PubMed 9. ↵ Monye IN , Makinde MT , Oseni TIA , Adelowo AB , Nyirenda S . Covid-19 and Pre-Morbid Lifestyle-Related Risk Factors-A Review . Health Serv Insights . 2023 Nov 30; 16 : 11786329231215049 . OpenUrl PubMed 10. ↵ Domínguez-Berjón MF , Borrell C , Cano-Serral G , Esnaola S , Nolasco A , Pasarín MI , et al. [Constructing a deprivation index based on census data in large Spanish cities (the MEDEA project)] . Gac Sanit . 2008 May-Jun ; 22 ( 3 ): 179 – 87 . Spanish . OpenUrl CrossRef PubMed Web of Science 11. ↵ Kuhn M ( 2022 ). _caret: Classification and Regression Training_ . R package version 6.0-93 , https://CRAN.R-project.org/package=caret . 12. ↵ Chang W , Cheng J , Allaire J , Sievert C , Schloerke B , Xie Y , Allen J , McPherson J , Dipert A , Borges B ( 2022 ). _shiny: Web Application Framework for R_ . R package version 1.7.4 , https://CRAN.R-project.org/package=shiny . 13. ↵ R Core Team ( 2022 ). R: A language and environment for statistical computing . R Foundation for Statistical Computing , Vienna, Austria . URL https://www.R-project.org/ . 14. ↵ Biecek P ( 2018 ). “ DALEX: Explainers for Complex Predictive Models in R .” Journal of Machine Learning Research , 19 ( 84 ), 1 – 5 . 
https://jmlr.org/papers/v19/18-416.html . OpenUrl 15. ↵ Friedman J , Hastie T , Tibshirani R . Regularization Paths for Generalized Linear Models via Coordinate Descent . J Stat Softw . 2010 ; 33 ( 1 ): 1 – 22 . OpenUrl CrossRef PubMed Web of Science 16. ↵ Tay JK , Narasimhan B , Hastie T . Elastic Net Regularization Paths for All Generalized Linear Models . J Stat Softw . 2023 ; 106 : 1 . OpenUrl CrossRef PubMed 17. ↵ Greenwell B , Boehmke B , Cunningham J , Developers G ( 2022 ). gbm: Generalized Boosted Regression Models . R package version 2.1.8.1 , https://CRAN.R-project.org/package=gbm . 18. ↵ Meyer D , Dimitriadou E , Hornik K , Weingessel A , Leisch F ( 2022 ). e1071: Misc Functions of the Department of Statistics, Probability Theory Group (Formerly: E1071), TU Wien . R package version 1.7-12 . 19. ↵ Docherty AB , Mulholland RH , Lone NI , Cheyne CP , De Angelis D , Diaz-Ordaz K , et al. ISARIC4C Investigators. Changes in in-hospital mortality in the first wave of COVID-19: a multicentre prospective observational cohort study using the WHO Clinical Characterisation Protocol UK . Lancet Respir Med . 2021 Jul; 9 ( 7 ): 773 – 785 . OpenUrl PubMed 20. ↵ Kawaji H , Kishimoto N , Muguruma N , Kozai H , Horiuchi N . Risk Factors Related to Severity in COVID-19 Patients: A Real-world Retrospective Cohort Study . Intern Med . 2023 Sep 15; 62 ( 18 ): 2627 – 2634 . OpenUrl PubMed 21. Djorwé S , Bousfiha A , Nzoyikorera N , Nyandwi J , Kawthar B , Malki A . Impact and prevalence of comorbidities and complications on the severity of COVID-19 in association with age, gender, obesity, and pre-existing smoking: A meta-analysis . Biomedicine (Taipei) . 2024 Mar 1; 14 ( 1 ): 20 – 38 . OpenUrl PubMed 22. ↵ Zaki N , Alashwal H , Ibrahim S . Association of hypertension, diabetes, stroke, cancer, kidney disease, and high-cholesterol with COVID-19 disease severity and fatality: A systematic review . Diabetes Metab Syndr . 2020 Sep -Oct; 14 ( 5 ): 1133 – 1142 . 
OpenUrl CrossRef PubMed 23. ↵ Feldman EL , Savelieff MG , Hayek SS , Pennathur S , Kretzler M , Pop-Busui R . COVID-19 and Diabetes: A Collision and Collusion of Two Diseases . Diabetes . 2020 Dec ; 69 ( 12 ): 2549 – 2565 . OpenUrl Abstract / FREE Full Text 24. ↵ Moazzami B , Chaichian S , Kasaeian A , Djalalinia S , Akhlaghdoust M , Eslami M , et al. Metabolic risk factors and risk of Covid-19: A systematic review and meta-analysis . PLoS One . 2020 Dec 15; 15 ( 12 ): e0243600 . OpenUrl CrossRef PubMed 25. ↵ Schlesinger S , Lang A , Christodoulou N , Linnerz P , Pafili K , Kuss O , Herder C , Neuenschwander M , Barbaresko J , Roden M . Risk phenotypes of diabetes and association with COVID-19 severity and death: an update of a living systematic review and meta-analysis . Diabetologia . 2023 Aug ; 66 ( 8 ): 1395 – 1412 . doi: 10.1007/s00125-023-05928-1 . OpenUrl CrossRef PubMed 26. ↵ Gupta RK , Harrison EM , Ho A , Docherty AB , Knight SR , van Smeden M , et al. ISARIC4C Investigators. Development and validation of the ISARIC 4C Deterioration model for adults hospitalised with COVID-19: a prospective cohort study . Lancet Respir Med . 2021 Apr; 9 ( 4 ): 349 – 359 . OpenUrl PubMed 27. ↵ Clift AK , Coupland CAC , Keogh RH , Diaz-Ordaz K , Williamson E , Harrison EM , et al. Living risk prediction algorithm (QCOVID) for risk of hospital admission and mortality from coronavirus 19 in adults: national derivation and validation cohort study . BMJ . 2020 October 20; 371 : m3731 . OpenUrl Abstract / FREE Full Text 28. ↵ Sisó-Almirall A , Kostov B , Mas-Heredia M , Vilanova-Rotllan S , Sequeira-Aymar E , Sans-Corrales M , et al. Prognostic factors in Spanish COVID-19 patients: A case series from Barcelona . PLoS One . 2020 Aug 21; 15 ( 8 ): e0237960 . OpenUrl CrossRef PubMed 29. ↵ Avila-Ponce de León U , Vazquez-Jimenez A , Cervera A , Resendis-González G , Neri-Rosario D , Resendis-Antonio O . Machine Learning and COVID-19: Lessons from SARS-CoV-2 . Adv Exp Med Biol . 
2023 ; 1412 : 311 – 335 . OpenUrl PubMed 30. Chen R , Chen J , Yang S , Luo S , Xiao Z , Lu L , et al. Prediction of prognosis in COVID-19 patients using machine learning: A systematic review and meta-analysis . Int J Med Inform . 2023 Sep ; 177 : 105151 . OpenUrl PubMed 31. ↵ Ahmadi Marzaleh M , Peyravi M , Mousavi S , Sarpourian F , Seyedi M , Shalyari N . Artificial Intelligence Functionalities During the COVID-19 Pandemic . Disaster Med Public Health Prep . 2023 February 27; 17 : e336 . OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted May 09, 2025. Download PDF Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Machine learning models for the prediction of COVID-19 prognosis in the primary health care setting Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Machine learning models for the prediction of COVID-19 prognosis in the primary health care setting Joan Barrot , Joan A. Caylà , Manel Mata-Cases , Jordi Real , Bogdan Vlacho , Josep Franch-Nadal , Didac Mauricio , the COVID-19 Working Group in Primary Health Care medRxiv 2025.05.08.25327245; doi: https://doi.org/10.1101/2025.05.08.25327245 Share This Article: Copy Citation Tools Machine learning models for the prediction of COVID-19 prognosis in the primary health care setting Joan Barrot , Joan A. 
Caylà , Manel Mata-Cases , Jordi Real , Bogdan Vlacho , Josep Franch-Nadal , Didac Mauricio , the COVID-19 Working Group in Primary Health Care medRxiv 2025.05.08.25327245; doi: https://doi.org/10.1101/2025.05.08.25327245 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Primary Care Research Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4421) Dentistry and Oral Medicine (443) Dermatology (381) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15212) Forensic Medicine (30) Gastroenterology (1121) Genetic and Genomic Medicine (6581) Geriatric Medicine (667) Health Economics (996) Health Informatics (4520) Health Policy (1366) Health Systems and Quality Improvement (1611) Hematology (539) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15906) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (667) Neurology (6580) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1141) Occupational and Environmental Health (956) Oncology (3324) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5431) Public and Global Health (9212) Radiology and Imaging (2193) Rehabilitation Medicine and Physical Therapy (1368) Respiratory Medicine (1194) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var 
d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ff003eecf204807',t:'MTc3OTMyOTExOA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.