Full text
53,830 characters
· extracted from
preprint-html
· click to expand
Not the Models You Are Looking For: Traditional ML Outperforms LLMs in Clinical Prediction Tasks | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Not the Models You Are Looking For: Traditional ML Outperforms LLMs in Clinical Prediction Tasks View ORCID Profile Katherine E. Brown , View ORCID Profile Chao Yan , View ORCID Profile Zhuohang Li , View ORCID Profile Xinmeng Zhang , View ORCID Profile Benjamin X. Collins , View ORCID Profile You Chen , View ORCID Profile Ellen Wright Clayton , View ORCID Profile Murat Kantarcioglu , View ORCID Profile Yevgeniy Vorobeychik , View ORCID Profile Bradley A. 
Malin doi: https://doi.org/10.1101/2024.12.03.24318400 Katherine E. Brown 1 Department of Biomedical Informatics, Vanderbilt University Medical Center , Nashville, Tennessee PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Katherine E. Brown For correspondence: katherine.brown{at}vumc.org Chao Yan 1 Department of Biomedical Informatics, Vanderbilt University Medical Center , Nashville, Tennessee PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Chao Yan Zhuohang Li 2 Department of Computer Science, Vanderbilt University , Nashville, Tennessee MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zhuohang Li Xinmeng Zhang 2 Department of Computer Science, Vanderbilt University , Nashville, Tennessee BS Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Xinmeng Zhang Benjamin X. Collins 1 Department of Biomedical Informatics, Vanderbilt University Medical Center , Nashville, Tennessee MDMS Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Benjamin X. 
Collins You Chen 1 Department of Biomedical Informatics, Vanderbilt University Medical Center , Nashville, Tennessee 2 Department of Computer Science, Vanderbilt University , Nashville, Tennessee PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for You Chen Ellen Wright Clayton 3 Law School, Vanderbilt University , Nashville, Tennessee, USA 4 Department of Health Policy, Vanderbilt University Medical Center , Nashville, Tennessee 5 Department of Pediatrics, Vanderbilt University Medical Center , Nashville, Tennessee, USA MDJD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ellen Wright Clayton Murat Kantarcioglu 6 Department of Computer Science , Virginia Tech, Blacksburg, Virginia PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Murat Kantarcioglu Yevgeniy Vorobeychik 7 Department of Computer Science, Washington University , St. Louis, Missouri PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Yevgeniy Vorobeychik Bradley A. Malin 1 Department of Biomedical Informatics, Vanderbilt University Medical Center , Nashville, Tennessee 2 Department of Computer Science, Vanderbilt University , Nashville, Tennessee 8 Department of Biostatistics, Vanderbilt University Medical Center , Nashville, Tennessee PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Bradley A. 
Malin Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF ABSTRACT Objectives To determine the extent to which current Large Language Models (LLMs) can serve as substitutes for traditional machine learning (ML) as clinical predictors using data from electronic health records (EHRs), we investigated various factors that can impact their adoption, including overall performance, calibration, fairness, and resilience to privacy protections that reduce data fidelity. Materials and Methods We evaluated GPT-3.5, GPT-4, and ML (as gradient-boosting trees) on clinical prediction tasks in EHR data from Vanderbilt University Medical Center and MIMIC IV. We measured predictive performance with AUROC and model calibration using Brier Score. To evaluate the impact of data privacy protections, we assessed AUROC when demographic variables are generalized. We evaluated algorithmic fairness using equalized odds and statistical parity across race, sex, and age of patients. We also considered the impact of using in-context learning by incorporating labeled examples within the prompt. Results Traditional ML (AUROC: 0.847, 0.894 (VUMC, MIMIC)) substantially outperformed GPT-3.5 (AUROC: 0.537, 0.517) and GPT-4 (AUROC: 0.629, 0.602) (with and without in-context learning) in predictive performance and output probability calibration (Brier Score (ML vs GPT-3.5 vs GPT-4): 0.134 versus 0.384 versus 0.251, 0.042 versus 0.06 versus 0.219). Traditional ML is more robust than GPT-3.5 and GPT-4 to generalizing demographic information to protect privacy. GPT-4 is the fairest model according to our selected metrics but at the cost of poor model performance. Conclusion These findings suggest that LLMs are much less effective and robust than locally-trained ML for clinical prediction tasks, but they are getting better over time. 
INTRODUCTION Making predictions is a vital part of healthcare, with common applications including estimating the likelihood of a specific diagnosis, assessing the suitability of a medication, and determining a patient’s readiness for discharge. Over the past several decades, the practice of medicine has evolved from clinical intuition to incorporate data-driven guidance, which is increasingly aided by artificial intelligence (AI) and particularly machine learning (ML). While model development has advanced, it is widely recognized that implementing ML models successfully often requires training on or fine-tuning (e.g., via transfer learning ( 3 )) with local, representative data ( 1 , 2 ). However, not all institutions have sufficient resources to implement ML effectively. For instance, community hospitals may lack the volume of ML-ready data, computational power, and personnel that academic medical centers possess ( 4 ). More recently developed ML technologies, such as large language models (LLMs), have hinted at a potential to mitigate these challenges and fundamentally change the integration of ML in medicine. Cloud-based LLMs, such as GPT-3.5 ( 5 ), GPT-4 ( 6 ) and Claude ( 7 ), are pre-trained and can be interacted with conversationally, characteristics that reduce technical friction required to create or use ML (or more broadly, AI) for healthcare settings. While there have been recent investigations into the performance ( 8 – 11 ) and fairness ( 12 ) of LLMs versus traditional ML, as well as other diagnostic aids ( 9 , 13 ), a number of critical aspects, including reliability and robustness (e.g., model calibration) of these tools, remain unknown. Given the rapid pace at which LLMs are growing in their production ( 14 ), it is critical to quantify their strengths and weaknesses to ensure responsible use. To learn more about the efficacy of LLMs for clinical prediction, several aspects of real-world implementation need to be considered. 
First, we need to represent the type of users and constraints that may limit how LLMs are applied in a healthcare setting. In this study, we consider two distinct types of users: 1) the everyday user, whose goal is to retrieve an answer using a single LLM query (i.e., zero-shot prompting), and 2) the advanced user who utilizes retrieval augmented generation (RAG) to contextually present the LLM with similar cases or additional relevant examples in the prompt to augment its decision-making capabilities for better results. In-context learning is such a strategy that helps guide LLM inference ( 8 ). Second, privacy concerns can limit the amount of patient information that can be supplied to an LLM under the control of a third party, such as OpenAI, as these technologies risk exposing protected health information (PHI) to organizations that are neither covered entities nor their business associates as required by the Health Insurance Portability and Accountability Act (HIPAA). Thus, it is likely that patient data would need to be de-identified before submission to the LLM. Moreover, while this may address the privacy requirements of federal regulations, such as the HIPAA Privacy Rule, evidence suggests that as data detail is reduced, the predictive power of the ML is weakened ( 16 ). The impact on LLMs, however, remains unclear. Third, many commercial LLMs do not provide detailed documentation of their architecture or training data. As a result, reproducibility and the design of formal evaluation frameworks are limited for prediction tasks in healthcare. 
In this study, we evaluate the utility, privacy, and fairness of LLMs compared to traditional ML, using electronic health record (EHR) data from Vanderbilt University Medical Center (VUMC) ( 17 ) to predict the likelihood of patient discharge from the hospital within 24 hours and the public use MIMIC-IV and MIMIC-IV ED datasets from Beth Israel Deaconess Medical Center (BIDMC) ( 18 , 19 ) to predict the likelihood of transfer to the ICU within 24 hours after triage in the emergency department. As our investigation shows, traditional ML models significantly outperform LLMs (both with and without in-context learning) in terms of both classification efficacy and calibration. From the privacy perspective, we further find that traditional ML is generally more robust to the reduction in detail of a patient’s demographic information. We also observe that GPT-4 is the fairest model, which may be inappropriately reassuring since GPT-4’s apparent advantage in fairness seems largely a consequence of its poor performance across all subgroups rather than improved performance on disadvantaged subgroups. We did note improvement in performance from GPT-3.5 to GPT-4. MATERIALS AND METHODS This study was approved by VUMC’s Institutional Review Board (IRB# 191892). Datasets This study relied on several EHR-derived datasets from VUMC and BIDMC. The MIMIC-IV and MIMIC-IV ED datasets (n = 393,576) contain hospital admission records and emergency department records, respectively, for BIDMC ( 18 – 20 ). From these, we derived a dataset to predict if a patient will be transferred to an intensive care unit (ICU) within 24 hours of hospital admission based on their presentation during Emergency Department (ED) triage. In this dataset, the predictors include race, age, arrival transport type, triage vital signs, and the number of ED, ICU, and inpatient hospital stays for the past 30, 90, and 365 days. 
The VUMC dataset (n = 28,880) was developed to predict the likelihood of patient discharge from the hospital within 24 hours( 17 ). We ran preliminary experiments using gradient-boosted trees implemented with CatBoost ( 21 ). We predicted likelihood of discharge using the demographic and audit log data features. We then calculated feature importance based on change in the loss function (implemented within the CatBoost library) and selected the 15 most important audit log features. Those features and the demographic features – race, age, current day of week, insurance type, and Area Deprivation Index ( 22 ) of residence – were relied upon in this portion of our study. GPT Our experiments were run on a HIPAA-compliant Azure instantiation of GPT to guard PHI. We utilized GPT-3.5 and GPT-4 with temperature set to 0 (to reduce likelihood of extraneous text being generated) and the maximum number of tokens set to 1024. We randomly selected 1,500 data points from each dataset to query LLMs. The features for each dataset were represented in JSON. Supplementary Table S2 specifies specific prompt structures for each dataset. We considered both zero-shot prompting (not providing additional examples) and few-shot examples (see below). Retrieval Augmented Generation To implement RAG, we selected likely relevant examples that would inform LLM inference on the datapoint under consideration. To select instances that are likely to serve as useful in-context examples, we partitioned the dataset D into disjoint subsets D 1 and D 2 . Let x be the data point under consideration by the LLM and, without loss of generality, let x ∈ D 1 . Then, we performed k -means clustering on the points in D 2 . To select the number of clusters, we measured the inertia (i.e., the sum of squared distances of samples to their closest cluster center ( 23 )) for each value of k starting at 2 and continuing until the change in inertia is less than 0.001. 
We selected the value of k based on the Kneedle algorithm, which identifies the point where inertia shows diminishing returns ( 24 ). This defined clusters C 1 , C 2 , …, C k for D 2 . We repeated this process to define clusters for D 1 as well such that k-means is executed once for D 1 and once for D 2 . With C i defined as the cluster to which x belongs, the data points and their corresponding true labels in this cluster were used as relevant illustrating examples to present to the LLM. Such examples are known as few-shot (or n-shot) examples. If C i had fewer than five members, then the entire cluster was used as few-shot examples; otherwise, we selected five points in C i with the closest Euclidean distance to x (i.e., 5-shot examples). We denote the use of in-context examples as GPT-X RAG Closest Within Cluster (or GPT-X RAG CWC). Traditional ML We used a traditional ML benchmark of a gradient-boosting tree (GBT) implemented with CatBoost ( 21 ). Categorical data were replaced with an integer corresponding to the index when sorting the possible values alphanumerically. Missing categorical features were replaced with a “?” and included in the sort. Missing numeric features were replaced with −1. We opted for this strategy for missing values since we assume that a clinician interfacing with an LLM may not use statistical imputation procedures. We used default settings for the GBT and performed 10-fold cross-validation to collect the predictions for each data point per test set (i.e., test-set predictions for each data point in the dataset). Statistical Analysis Unless otherwise noted, we produced a total of 30 bootstrapped samples of the predictions and report performance metrics as average and 95% confidence intervals. When applicable, we used Welch’s t-test and one-way analysis of variance (ANOVA) test and considered p < 0.05 as significant for all statistical tests. 
For ROC curves and calibration curves, we performed vertical averaging to aggregate results from the bootstrap samples ( 25 ). To perform vertical averaging of the graphs, we defined a comprehensive x -axis and averaged the corresponding y -values (interpolating y -values if necessary). All analysis was performed using Python version 3.11.5, and the ANOVA test was calculated using SciPy library 1.11.4. Design of Performance and Calibration Experiments To measure prediction performance, we used the area under the Receiver Operating Characteristic (ROC) curve (AUROC). We performed vertical averaging ( 25 ) across the thirty bootstrap samples to construct the ROC curves. We compared the average AUROC using the bootstrapped samples (not the AUROC of the vertically averaged curves, since vertical averaging requires interpolation of y -values to maintain a consistent x -axis). Currently, accessing the LLM architecture or training data to assess model reliability or interpretability is not possible. Therefore, evaluating the reliability and applicability of predictions is crucial to ensuring interpretability and trustworthiness. Since closed-source LLMs such as GPT-3.5 and GPT-4 are black box classifiers, their primary point of interaction is via prompt-based querying and the generated output. In our case, the generated output was the probability of a specific phenomenon (e.g., likelihood of discharge, likelihood of ICU admittance). In light of this, we used calibration curves ( 26 ) and Brier Score ( 27 ) to determine how well-calibrated output probabilities are from each model. Calibration curves plot the fraction of positive predictions given a predicted probability. A perfectly calibrated classifier should produce a calibration curve of y = x . This would indicate that, for all of the predictions assigned a predicted probability p , a fraction p of these predictions corresponds to the positive class. 
The Brier Score corresponds to the squared-loss of predicted probabilities to output labels with a range of [0,1], where a lower value indicates a better-calibrated classifier - with 0 indicating perfect calibration and 1 indicating the worst possible calibration. The calibration curves were constructed by vertically averaging ( 25 ) calibration curves of the thirty bootstrap samples. Average Brier Score was calculated with the bootstrap samples, using the same technique employed with ROC curves. We performed Welch’s t-test to determine if there were statistically significant differences in both performance and calibration. Consider $\text{model}_1$ and $\text{model}_2$. The null hypothesis was $\mathcal{H}_0: \mu_{\text{model}_1} = \mu_{\text{model}_2}$. For AUROC, $\mathcal{H}_1: \mu_{\text{model}_1} > \mu_{\text{model}_2}$, whereas for Brier Score, $\mathcal{H}_1: \mu_{\text{model}_1} < \mu_{\text{model}_2}$. Thus, these hypothesis tests determined if $\text{model}_1$ performs better than $\text{model}_2$ for the appropriate metric. Design of Privacy-Utility Tradeoff Experiments To evaluate the privacy-utility tradeoff associated with LLMs, we considered the following patient data obfuscation procedure. Data was organized along four levels of detail as documented in Supplementary Figure S3. Level 0 indicates no data obfuscation (i.e., the original data); Level 1 indicates a mild amount of obfuscation, where each feature was divided into 4-8 categories; and Level 2 indicates moderate obfuscation where each feature was in a binary category (e.g., for race, white/not white). Level 3 indicates all demographic data were suppressed. Real-valued features were partitioned into ranges in Levels 1-3, while categorical features at Levels 1-3 were grouped into semantically similar categories. Sex was suppressed for Levels 1-3. To determine if there was a statistically significant difference in LLM and ML performance between these levels, we performed a one-way analysis of variance (ANOVA) test with a p -value threshold of 0.05. 
Design of Algorithmic Fairness Experiments To assess the fairness of ML and LLMs, we considered subgroups (majority/minority) based on race (white/not white), age (18-49 years old/50 years or older), and sex (male/female). We measured algorithmic fairness in two ways: 1) average absolute odds difference (AAOD) and 2) statistical parity difference (SPD). AAOD is a measure based on the equalized odds (EO) measure of algorithmic fairness ( 28 ). EO defines a fair classifier as one for which $\mathrm{TPR}_{s_{maj}} = \mathrm{TPR}_{s_{min}}$ and $\mathrm{FPR}_{s_{maj}} = \mathrm{FPR}_{s_{min}}$ for each possible pair of distinct demographic subgroups $s_{maj}$ (majority) and $s_{min}$ (minority). Then, we can calculate AAOD as follows: $\mathrm{AAOD} = \frac{1}{2}\left(\left|\mathrm{FPR}_{s_{maj}} - \mathrm{FPR}_{s_{min}}\right| + \left|\mathrm{TPR}_{s_{maj}} - \mathrm{TPR}_{s_{min}}\right|\right)$. Thus, a perfectly fair classifier will have an AAOD of 0. SPD measures the difference in the rate that each sensitive group receives a positive prediction, denoted as $\mathrm{SPD} = P(\hat{Y} = 1 \mid S = s_{maj}) - P(\hat{Y} = 1 \mid S = s_{min})$ ( 29 ). An SPD of 0 corresponds to a perfectly fair classifier. A negative SPD indicates a preference for the minority class, whereas a positive SPD favors the majority class. Since SPD can range from −1 to 1, in addition to calculating the magnitude of statistical parity, we also calculated the absolute value of SPD (|SPD|). For Welch’s t-test, we then have $\mathcal{H}_0: \mu_{\text{model}_1} = \mu_{\text{model}_2}$ and $\mathcal{H}_1: \mu_{\text{model}_1} < \mu_{\text{model}_2}$. Since AAOD and |SPD| depict the magnitude of unfairness, smaller metric values indicate higher fairness. We report the sum of the fairness metrics per demographic subgroup. RESULTS Performance and Calibration Table 1 summarizes the predictive performance of the LLMs and traditional ML model. As shown in Figure 1 , the areas under the ROC curves for traditional ML were much larger than those for the LLMs, both with and without in-context learning. Moreover, the p-values in Figure 2 indicated this difference in performance between traditional ML and LLMs was statistically significant. 
Thus, in terms of measures of discrimination, traditional ML appeared to be more capable at identifying different classes of patients than LLMs at the given clinical tasks. This was not surprising because traditional ML learns a classification function based on a subset of representative data, and it has defined knowledge of expected feature distributions. In-context examples improved the ROC of GPT for all dataset-model combinations except for GPT-3.5 on VUMC data. Nonetheless, traditional ML still outperformed GPT with in-context examples. Download figure Open in new tab Figure 1. ROC curves for (A) zero-shot prompting on the VUMC dataset, (B) RAG-based few-shot prompting on the VUMC dataset, (C) zero-shot prompting on the MIMIC-IV dataset, and (D) RAG-based few-shot prompting on the MIMIC-IV dataset. Note that the AUC given in the legends may not match those given in Table 1 due to numerical interpolation of the curves required for vertical curve averaging (AUC is of the final averaged curve). Download figure Open in new tab Figure 2. Let $\text{model}_1$ be the model given by the row and $\text{model}_2$ be the model given by the column. Then, $\mathcal{H}_0: \mu_{\text{model}_1} = \mu_{\text{model}_2}$. For AUROC, $\mathcal{H}_1: \mu_{\text{model}_1} > \mu_{\text{model}_2}$. For Brier Score, $\mathcal{H}_1: \mu_{\text{model}_1} < \mu_{\text{model}_2}$. Thus, these hypothesis tests determine if $\text{model}_1$ performs better than $\text{model}_2$ for the appropriate metric. (A) AUROC from zero-shot prompting on VUMC dataset, (B) AUROC from zero-shot prompting on MIMIC-IV dataset, (C) AUROC from RAG-based few-shot prompting on VUMC dataset, (D) AUROC from RAG-based few-shot prompting on MIMIC-IV dataset, (E) Brier Score from zero-shot prompting on VUMC dataset, (F) Brier Score from zero-shot prompting on MIMIC-IV dataset, (G) Brier Score from RAG-based few-shot prompting on VUMC dataset, (H) Brier Score from RAG-based few-shot prompting on MIMIC-IV dataset View this table: View inline View popup Download powerpoint Table 1. Average AUROC and Brier Scores and 95% confidence intervals. 
Arrow indicates better direction. Best in bold. (↑ = higher is better; ↓ = lower is better) Figure 3 depicts the calibration curves for traditional ML, GPT-3.5, and GPT-4. The calibration curve for traditional ML nearly perfectly overlaid the y = x curve. The calibration curves for GPT-3.5 and GPT-4, however, were heavily miscalibrated. These findings were corroborated by the Brier Score, which can be interpreted as a calibration loss, such that lower Brier scores imply better calibrated classifiers. Traditional ML was better calibrated than GPT-3.5 or GPT-4, a result that was statistically significant, as shown in Figure 2 . In-context examples improved calibration for GPT-4 on both datasets, but GPT-3.5 exhibited a decrease in probability calibration for both datasets. Download figure Open in new tab Figure 3. Calibration plots for (A) zero-shot prompting on the VUMC dataset, (B) RAG-based few-shot prompting on the VUMC dataset, (C) zero-shot prompting on the MIMIC-IV dataset, and (D) RAG-based few-shot prompting on the MIMIC-IV dataset. Privacy-Utility Tradeoff Table 2 presents the mean performance (with 95% confidence intervals in parentheses) of each level of generalization per model. There was less than a 0.01 difference among the standard deviations of performance among different levels of generalization/suppression per model. The ANOVA analysis was statistically significant across all combinations of datasets and models ( Table 3 ), indicating all models were affected, to some degree, by the generalization and suppression of demographic information. View this table: View inline View popup Download powerpoint Table 2. Average AUROC and 95% CI for each model and level of data obfuscation. View this table: View inline View popup Download powerpoint Table 3. 
Results of one-way ANOVA analysis to compare the effect of data generalization/suppression on LLM AUROC. Figure 4 displays the privacy-utility tradeoff across four levels of demographic generalization and suppression. We observed that traditional ML performance was generally consistent across the levels of demographic generalization and suppression, while GPT-3.5 and GPT-4 displayed varying degrees of performance but no overall directional trend. Next, we considered how LLMs compare to traditional ML as we reduced the granularity of (generalized) and removed (suppressed) demographic information. By visual inspection, traditional ML was relatively more robust to generalization and suppression than LLMs. LLMs varied more in performance, with little noticeable trend in movement. Still, when demographic information was removed, LLMs (and particularly LLMs with in-context learning) dropped in performance. This raises the concern that LLMs may have some level of reliance on demographic information when performing zero-shot predictions. Download figure Open in new tab Figure 4. Bar charts depicting the privacy-utility tradeoff for (A) zero-shot prompting and (B) RAG-based few-shot prompting. Algorithmic Fairness Figures 5 and 6 plot model overall performance and fairness on an XY-plane. It was observed that traditional ML was not unambiguously the fairest model. For the VUMC data, GPT-4 with in-context examples was the fairest model across both fairness metrics, while in-context examples did not improve GPT-3.5 fairness. For the MIMIC dataset, GPT-4 was the fairest model, but in-context examples improved the fairness of GPT-3.5. Download figure Open in new tab Figure 5. Two-dimensional fairness plots for the VUMC dataset using (A) average absolute odds difference, (B) statistical parity difference, or (C) absolute value of statistical parity difference as the fairness metric, respectively. Download figure Open in new tab Figure 6. 
Two-dimensional fairness plots for the MIMIC-IV dataset using (A) average absolute odds difference, (B) statistical parity difference, or (C) absolute value of statistical parity difference as the fairness metric, respectively. We further evaluated the statistical significance of differences in fairness metrics between different models by subgroup. Supplementary Figure S4 provides heatmaps of p-values from the corresponding Welch’s t-tests determining statistical significance of the results. There are twelve possible scenarios where one model could be statistically significantly fairer than another (based on demographic variable, fairness metric, and dataset). For all twelve cases, GPT-4 was statistically significantly fairer than traditional ML; however, incorporating in-context examples reduced this number to ten of twelve cases. For GPT-3.5, zero-shot learning was statistically significantly fairer than traditional ML for only six of twelve cases. In the VUMC dataset, GPT-3.5 struggled with maintaining fairness for race, but in the MIMIC dataset, GPT-3.5 was only fairer for subgroups defined by sex. In-context learning did not help the fairness for the VUMC data; however, in-context examples improved GPT-3.5’s fairness for the race and age variables in the MIMIC dataset. When comparing GPT-3.5 and GPT-4, we note that there is no unambiguously fairer model. Without in-context examples, GPT-4 surpassed GPT-3.5’s fairness for all demographics in the MIMIC dataset but none in the VUMC dataset. With in-context examples, GPT-4 was fairer on the VUMC dataset, yet GPT-3.5 still surpassed GPT-4’s fairness on the MIMIC dataset. We note that this fine-grained analysis may differ somewhat from the overall trends presented in Figures 5 and 6 , since these figures consider aggregate fairness across all demographic subgroups considered. As a result, the small, yet significant shifts may be cancelled out in the aggregate. 
While GPT-4 was the fairest model for both datasets, the fairer model between GPT-3.5 and traditional ML depended upon dataset and fairness metric considered. Traditional ML was trained and evaluated on data from the same location. Thus, whatever biases are present in the data source will be present in the resulting model. We evaluated GPT-3.5 and GPT-4 using zero-shot prompting and without knowledge of the evaluation datasets. Thus, these models were not exposed to the local dataset biases that may influence decisions. This led us to hypothesize that traditional ML would be the most unfair model, which was validated by our findings. The use of in-context examples also impacted fairness; however, this impact was not unanimously positive or negative but depended upon dataset and metric. Moreover, despite being the fairer models, GPT-3.5 and GPT-4 were also the least capable predictors. Thus, it seems that overall improved fairness comes at the cost of reduced subgroup performance; but future work is necessary to confirm. DISCUSSION This investigation suggests that LLMs are not yet ready to serve as predictive, analytic models, although GPT-4 does surpass GPT-3.5 in most of our experiments, indicating that GPT is improving with new releases. In our experiments, LLMs have a significantly lower AUROC compared to traditional ML. This implies that LLMs simply do not match the discriminative capabilities of traditional ML for these tasks. Moreover, inspecting the calibration curves and Brier Scores reveals that LLMs are poorly calibrated when compared to the traditional ML comparison. The weak calibration of the LLM predictions in clinical scenarios calls into question their utility and reliability as a clinical classifier. 
This is not to say LLMs do not have advantages as a classifier, as there is little model tuning required out-of-the-box to be minimally functional (whereas traditional ML requires training data and computation time) and the conversational interaction may allow for chain-of-thought reasoning to support usability and interpretability ( 30 ). These advantages, however, are difficult to realize when the output probabilities do not indicate reliability. Unlike several traditional ML algorithms ( 31 , 32 ), LLMs have limited capability to quantify their uncertainty post-hoc ( 33 ). Thus, if LLMs are going to be useful as accurate clinical predictors, further research into improving the reliability and interpretability of LLMs is a must. For LLMs to be useful as classifiers in lower resource settings, for example, where extensive data may not be available to train models, strategies such as in-context learning and RAG are necessary to produce good performance. We expected the incorporation of intelligently selected in-context examples ( 8 , 34 ) would improve LLM performance to levels near that of traditional ML. Instead, we find that in-context examples do not always improve LLM discriminative performance. Table 1 reveals that for the VUMC data, the usage of in-context examples resulted in either a slight increase in performance (GPT-4) or a decrease in performance (GPT-3.5). Further research is needed to improve selection of RAG-based in-context examples for biomedical classification. In the event that LLMs are the most accessible or applicable model for a situation, there are still concerns with respect to patient privacy when using these models. As we alluded to, organizations may apply some form of de-identification before sending data to the LLM, but our analysis indicates that LLMs are much more sensitive to the level of detail in demographic information compared to traditional ML. 
This suggests that a strategy that generalizes patient information may result in a reduction in predictive performance. The apparent sensitivity of LLMs to the completeness of patient demographic information raises questions about the biases and unfairness present in these models based on their existing training. We find that GPT-4 is the fairest model evaluated; however, we note that LLMs are generally the lowest performing models evaluated. By definition, an unfair model is one that results in a disparate outcome for at least one subgroup ( 35 ). Typically, mitigating fairness concerns means improving the performance of the disadvantaged subgroup(s). This route ensures equity of performance with reduced harm ( 36 ). For the LLMs, increased fairness seems to come at reduced performance for all subgroups. This is evidenced by the overall degradation of performance of LLMs compared to traditional ML. Thus, it may be difficult to justify the use of LLMs on the basis of apparent fairness in the context of subpar performance. Moreover, we find that there is not a consistently fair model. Fairness is conditioned on dataset and prediction task. For example, work from Liu et al. ( 12 ) also compared the fairness of GPT-3.5 to traditional ML. In their analysis, traditional ML was fairer than GPT-3.5 in two of the three evaluated datasets. In our evaluation, traditional ML is fairer than GPT-3.5 for age and gender subgroups in the MIMIC dataset; however, this does not hold for the VUMC dataset or race in the MIMIC dataset. Liu et al. also found that fine-tuning using few-shot examples improves the fairness gap for GPT-3.5, but there is little discussion on the performance impact. Our analysis indicates that few-shot examples can improve fairness of GPT-3.5 and GPT-4, but whether or not this occurs is dependent on model and dataset. 
Though we conducted a large number of experiments, we acknowledge several limitations of this investigation. First, other facets of evaluation beyond the scope of this work, such as interpretability, were not evaluated and could also differ between model types. Second, we did not consider the costs of implementing and maintaining these models and the associated environmental factors. Operational costs of AI may prohibit lower-resourced medical centers from utilizing these advances, and the energy required may have a downstream negative impact both economically and environmentally. Third, we only consider two clinical tasks from datasets derived from two medical centers. Additional studies that consider a wider variety of tasks across multiple medical centers are crucial for generalizability of these findings. Finally, we note that the privacy-utility tradeoff reported does not match the tradeoff expected. Ultimately, this calls into question the reliability of the datasets used for this privacy-utility tradeoff analysis. We cannot rule out the possibility that information captured in other predictors in these datasets is serving as an appropriate proxy for demographic information. CONCLUSION This study reported on a multifaceted comparison of LLMs to traditional ML in terms of model performance, calibration, privacy, and fairness as the points of evaluation. Even when supplemented with in-context learning, we find that LLMs are not as effective as traditional ML for clinical prediction tasks due to their poor discriminative performance and lack of reliability, particularly when demographic factors are variables in the prediction model. While LLMs are the fairest technologies, they are also those with poor model performance. This indicates that performance across subgroups degrades to ensure fairness but this needs to be confirmed. Still, we note that LLMs have potential. 
Since March 2023, LLMs have been used to aid with drafting patient messages ( 37 ), optimizing clinical decision support ( 38 ), developing phenotyping algorithms ( 39 ), and summarizing patient encounters ( 40 ). In the next eighteen to twenty-four months, LLMs may be better optimized to serve as clinical prediction models. To reach this goal, research is needed to optimize in-context learning for LLM performance, understand the effect of randomization-based privacy preservation strategies, and understand the internal biases of LLMs to mitigate fairness concerns. Data Availability The datasets generated and/or analyzed during the current study are not publicly available due to patient private information investigated but are available from the corresponding authors on reasonable request. AUTHOR CONTRIBUTIONS Using the CRediT Taxonomy, the following are the roles of the authors on the manuscript: KB: Conceptualization, Formal analysis, Investigation, Writing (original draft, review & editing) CY: Conceptualization, Writing (original draft, review & editing) ZL: Writing (review & editing) XZ: Data curation, Writing (review & editing) BC: Writing (review & editing) YC: Data curation, Writing (review & editing) EC: Writing (review & editing) MK: Conceptualization, Writing (review & editing) YV: Conceptualization, Writing (review & editing) BM: Conceptualization, Resources, Supervision, Writing (original draft, review & editing) ACKNOWLEDGEMENTS This research was sponsored, in part, from NIH grants U54HG012510 and T15LM007450, from NSF grants IIS-1905558 Footnotes chao.yan.1{at}vumc.org , zhuohang.li{at}vanderbilt.edu , xinmeng.zhang{at}vanderbilt.edu , Benjamin.collins{at}vumc.org , you.chen{at}vumc.org , ellen.clayton{at}vumc.org , muratk{at}vt.edu , yvorobeychik{at}wustl.edu , b.malin{at}vumc.org REFERENCES 1. ↵ Bennett T , Russell S , King J , Schilling L , Voong C , Rogers N , et al. 
Accuracy of the Epic Sepsis Prediction Model in a Regional Health System [Internet] . arXiv ; 2019 [cited 2024 Jan 15]. Available from: http://arxiv.org/abs/1902.07276 2. ↵ Habib AR , Lin AL , Grant RW . The Epic Sepsis Model Falls Short-The Importance of External Validation . JAMA Intern Med . 2021 Aug 1; 181 ( 8 ): 1040 – 1 . OpenUrl PubMed 3. ↵ Weiss K , Khoshgoftaar TM , Wang D . A survey of transfer learning . J Big Data . 2016 Dec ; 3 ( 1 ): 9 . OpenUrl 4. ↵ Davis SE , Walsh CG , Matheny ME. Open questions and research gaps for monitoring and updating AI-enabled tools in clinical settings . Front Digit Health [Internet] . 2022 [cited 2023 Nov 7];4. Available from: https://www.frontiersin.org/articles/10.3389/fdgth.2022.958284 5. ↵ Floridi L , Chiriatti M . GPT-3: Its Nature, Scope, Limits, and Consequences . Minds Mach . 2020 Dec 1; 30 ( 4 ): 681 – 94 . OpenUrl CrossRef 6. ↵ OpenAI , Achiam J , Adler S , Agarwal S , Ahmad L , Akkaya I , et al. GPT-4 Technical Report [Internet] . arXiv ; 2023 [cited 2024 Jan 24]. Available from: http://arxiv.org/abs/2303.08774 7. ↵ Anthropic. Meet Claude [Internet] . [cited 2024 Oct 22]. Available from: https://www.anthropic.com/claude 8. ↵ Glicksberg BS , Timsina P , Patel D , Sawant A , Vaid A , Raut G , et al. Evaluating the accuracy of a state-of-the-art large language model for prediction of admissions from the emergency room . J Am Med Inform Assoc . 2024 May 21;ocae103. 9. ↵ Sandmann S , Riepenhausen S , Plagwitz L , Varghese J . Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks . Nat Commun . 2024 Mar 6; 15 ( 1 ): 2050 . OpenUrl PubMed 10. Zhang J , Sun K , Jagadeesh A , Falakaflaki P , Kayayan E , Tao G , et al. The potential and pitfalls of using a large language model such as ChatGPT, GPT-4, or LLaMA as a clinical assistant . J Am Med Inform Assoc . 2024 Sep 1; 31 ( 9 ): 1884 – 91 . OpenUrl PubMed 11. 
↵ Acharya A , Shrestha S , Chen A , Conte J , Avramovic S , Sikdar S , et al. Clinical risk prediction using language models: benefits and considerations . J Am Med Inform Assoc . 2024 Sep 1; 31 ( 9 ): 1856 – 64 . OpenUrl PubMed 12. ↵ Liu Y , Gautam S , Ma J , Lakkaraju H. Confronting LLMs with Traditional ML: Rethinking the Fairness of Large Language Models in Tabular Classifications . In: Duh K , Gomez H , Bethard S , editors. Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers) [Internet] . Mexico City, Mexico : Association for Computational Linguistics ; 2024 [cited 2024 Nov 8]. p. 3603 – 20 . Available from: https://aclanthology.org/2024.naacl-long.198 13. ↵ Andreadis K , Newman DR , Twan C , Shunk A , Mann DM , Stevens ER . Mixed methods assessment of the influence of demographics on medical advice of ChatGPT . J Am Med Inform Assoc . 2024 Sep 1; 31 ( 9 ): 2002 – 9 . OpenUrl PubMed 14. ↵ Databricks [Internet] . 2023 [cited 2024 Oct 23]. State of Data + AI. Available from: https://www.databricks.com/discover/state-of-data-ai 15. Microsoft . Azure OpenAI Service documentation - Quickstarts, Tutorials, API Reference - Azure AI services [Internet] . [cited 2024 Sep 10]. Available from: https://learn.microsoft.com/en-us/azure/ai-services/openai/ 16. ↵ Brickell J , Shmatikov V. The cost of privacy: destruction of data-mining utility in anonymized data publishing . In: Proceedings of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining [Internet] . Las Vegas Nevada USA : ACM ; 2008 [cited 2024 Jun 11]. p. 70 – 8 . Available from: https://dl.acm.org/doi/10.1145/1401890.1401904 17. ↵ Zhang X , Yan C , Malin BA , Patel MB , Chen Y . Predicting next-day discharge via electronic health record access logs . J Am Med Inform Assoc . 2021 Nov 25; 28 ( 12 ): 2670 – 80 . OpenUrl PubMed 18. 
↵ Johnson A , Bulgarelli L , Pollard T , Celi LA , Mark R , Horng S . MIMIC-IV-ED . PhysioNet . 2021 ; 19. ↵ Johnson AEW , Bulgarelli L , Shen L , Gayles A , Shammout A , Horng S , et al. MIMIC-IV, a freely accessible electronic health record dataset . Sci Data . 2023 Jan 3; 10 ( 1 ): 1 . OpenUrl PubMed 20. ↵ Goldberger AL , Amaral LA , Glass L , Hausdorff JM , Ivanov PC , Mark RG , et al. PhysioBank, PhysioToolkit, and PhysioNet: components of a new research resource for complex physiologic signals . Circulation . 2000 ; 101 ( 23 ): e215 – 20 . OpenUrl Abstract / FREE Full Text 21. ↵ Dorogush AV , Ershov V , Gulin A. CatBoost: gradient boosting with categorical features support . 22. ↵ Singh GK. Area Deprivation and Widening Inequalities in US Mortality, 1969–1998 . Am J Public Health . 2003 Jul; 93 ( 7 ): 1137 – 43 . OpenUrl CrossRef PubMed Web of Science 23. ↵ scikit-learn [Internet] . [cited 2024 Jul 9]. KMeans. Available from: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html 24. ↵ Satopaa V , Albrecht J , Irwin D , Raghavan B. Finding a “kneedle” in a haystack: Detecting knee points in system behavior . In: 2011 31st international conference on distributed computing systems workshops . IEEE ; 2011 . p. 166 – 71 . 25. ↵ Hogan J , Adams NM. On Averaging ROC Curves . Trans Mach Learn Res [Internet] . 2023 Feb 8 [cited 2024 Aug 28]; Available from: https://openreview.net/forum?id=FByH3qL87G 26. ↵ Wilks DS . On the Combination of Forecast Probabilities for Consecutive Precipitation Periods . Weather Forecast . 1990 Dec 1; 5 ( 4 ): 640 – 50 . OpenUrl 27. ↵ Murphy AH . A New Vector Partition of the Probability Score . J Appl Meteorol Climatol . 1973 Jun 1; 12 ( 4 ): 595 – 600 . OpenUrl 28. ↵ Hardt M , Price E , Srebro N. Equality of Opportunity in Supervised Learning . In: Advances in Neural Information Processing Systems [Internet] . Curran Associates, Inc. ; 2016 [cited 2024 Mar 8]. 
Available from: https://proceedings.neurips.cc/paper/2016/hash/9d2682367c3935defcb1f9e247a97c0d-Abstract.html 29. ↵ Besse P , del Barrio E , Gordaliza P , Loubes JM , Risser L. A Survey of Bias in Machine Learning Through the Prism of Statistical Parity . Am Stat . 2022 Apr 3; 76 ( 2 ): 188 – 98 . OpenUrl 30. ↵ Gramopadhye O , Nachane SS , Chanda P , Ramakrishnan G , Jadhav KS , Nandwani Y , et al. Few shot chain-of-thought driven reasoning to prompt LLMs for open ended medical question answering [Internet] . arXiv ; 2024 [cited 2024 Aug 20]. Available from: http://arxiv.org/abs/2403.04890 31. ↵ Malinin A , Prokhorenkova L , Ustimenko A. Uncertainty in Gradient Boosting via Ensembles . In 2020 [cited 2024 Nov 8]. Available from: https://openreview.net/forum?id=1Jv6b0Zq3qi 32. ↵ Gal Y , Ghahramani Z. Dropout as a Bayesian Approximation: Representing Model Uncertainty in Deep Learning . In: Proceedings of The 33rd International Conference on Machine Learning . PMLR ; 2016 . p. 1050 – 9 . 33. ↵ Zhang J , Li Z , Das K , Malin B , Kumar S. SAC^3: Reliable Hallucination Detection in Black-Box Language Models via Semantic-aware Cross-check Consistency . In: Bouamor H , Pino J , Bali K , editors. Findings of the Association for Computational Linguistics: EMNLP 2023 [Internet] . Singapore : Association for Computational Linguistics ; 2023 [cited 2024 Nov 8]. p. 15445 – 58 . Available from: https://aclanthology.org/2023.findings-emnlp.1032 34. ↵ Hegselmann S , Buendia A , Lang H , Agrawal M , Jiang X , Sontag D. TabLLM: Few-shot Classification of Tabular Data with Large Language Models . In: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics [Internet] . PMLR ; 2023 [cited 2023 Nov 2]. p. 5549 – 81 . Available from: https://proceedings.mlr.press/v206/hegselmann23a.html 35. ↵ Ferrara E . Fairness and Bias in Artificial Intelligence: A Brief Survey of Sources, Impacts, and Mitigation Strategies. Sci . 
2023 Dec 26; 6 ( 1 ): 3 . OpenUrl 36. ↵ Varkey B . Principles of Clinical Ethics and Their Application to Practice . Med Princ Pract . 2021 ; 30 ( 1 ): 17 – 28 . OpenUrl CrossRef PubMed 37. ↵ Liu S , McCoy AB , Wright AP , Carew B , Genkins JZ , Huang SS , et al. Leveraging Large Language Models for Generating Responses to Patient Messages [Internet] . Health Informatics ; 2023 Jul [cited 2023 Nov 27]. Available from: http://medrxiv.org/lookup/doi/10.1101/2023.07.14.23292669 38. ↵ Liu S , Wright AP , Patterson BL , Wanderer JP , Turer RW , Nelson SD , et al. Using AI-generated suggestions from ChatGPT to optimize clinical decision support . J Am Med Inform Assoc . 2023 Jul 1; 30 ( 7 ): 1237 – 45 . OpenUrl PubMed 39. ↵ Yan C , Ong HH , Grabowska ME , Krantz MS , Su WC , Dickson AL , et al. Large language models facilitate the generation of electronic health record phenotyping algorithms . J Am Med Inform Assoc . 2024 Sep 1; 31 ( 9 ): 1994 – 2001 . OpenUrl PubMed 40. ↵ Genes N , Simon G , Koziatek C , Koesmahargyo V , Mbachu C , Wiesenfeld B , et al. Generative AI Summaries to Facilitate Emergency Department Handoff . Ann Emerg Med . 2024 ; 84 ( 4 ): S3 . OpenUrl View the discussion thread. Back to top Previous Next Posted December 05, 2024. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Not the Models You Are Looking For: Traditional ML Outperforms LLMs in Clinical Prediction Tasks Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Not the Models You Are Looking For: Traditional ML Outperforms LLMs in Clinical Prediction Tasks Katherine E. Brown , Chao Yan , Zhuohang Li , Xinmeng Zhang , Benjamin X. Collins , You Chen , Ellen Wright Clayton , Murat Kantarcioglu , Yevgeniy Vorobeychik , Bradley A. Malin medRxiv 2024.12.03.24318400; doi: https://doi.org/10.1101/2024.12.03.24318400 Share This Article: Copy Citation Tools Not the Models You Are Looking For: Traditional ML Outperforms LLMs in Clinical Prediction Tasks Katherine E. Brown , Chao Yan , Zhuohang Li , Xinmeng Zhang , Benjamin X. Collins , You Chen , Ellen Wright Clayton , Murat Kantarcioglu , Yevgeniy Vorobeychik , Bradley A. Malin medRxiv 2024.12.03.24318400; doi: https://doi.org/10.1101/2024.12.03.24318400 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (574) Allergy and Immunology (865) Anesthesia (304) Cardiovascular Medicine (4462) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (611) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15251) Forensic Medicine (31) Gastroenterology (1132) Genetic and Genomic Medicine (6621) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4564) Health Policy (1372) Health Systems and Quality Improvement (1617) Hematology (544) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15938) Intensive Care and Critical Care Medicine (1107) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6643) Nursing (346) Nutrition (1001) Obstetrics and Gynecology (1149) Occupational and Environmental Health (957) Oncology (3350) Ophthalmology (981) 
Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1698) Pharmacology and Therapeutics (694) Primary Care Research (714) Psychiatry and Clinical Psychology (5465) Public and Global Health (9259) Radiology and Imaging (2212) Rehabilitation Medicine and Physical Therapy (1372) Respiratory Medicine (1198) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (533) Surgery (715) Toxicology (100) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a03c4d99cfce4193',t:'MTc4MDEyOTA1Mw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.