Beyond Metrics to Methods: A Scoping Review of Large Language Models for Detection of Social Drivers of Health in Clinical Notes

preprint OA: closed Public-Domain
📄 Open PDF Full text JSON View at publisher
Full text 81,488 characters · extracted from preprint-html · click to expand
Beyond Metrics to Methods: A Scoping Review of Large Language Models for Detection of Social Drivers of Health in Clinical Notes | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Beyond Metrics to Methods: A Scoping Review of Large Language Models for Detection of Social Drivers of Health in Clinical Notes View ORCID Profile Ahmed Farrag , View ORCID Profile Ahmed Soliman , View ORCID Profile Elham Hatef , View ORCID Profile Amie Goodin , View ORCID Profile Masoud Rouhizadeh doi: https://doi.org/10.1101/2025.07.04.25330866 Ahmed Farrag 1 Department of Pharmaceutical Outcomes and Policy, College of Pharmacy, University of Florida , Gainesville, FL 32611, United 
States BPharm Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ahmed Farrag For correspondence: ahmed.farrag{at}ufl.edu Ahmed Soliman 1 Department of Pharmaceutical Outcomes and Policy, College of Pharmacy, University of Florida , Gainesville, FL 32611, United States MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ahmed Soliman Elham Hatef 2 Division of General Internal Medicine, Department of Medicine, Johns Hopkins School of Medicine , Baltimore, MD, United States 3 Center for Population Health Information Technology, Department of Health Policy and Management, Johns Hopkins Bloomberg School of Public Health , Baltimore, MD, United States MD, MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Elham Hatef Amie Goodin 1 Department of Pharmaceutical Outcomes and Policy, College of Pharmacy, University of Florida , Gainesville, FL 32611, United States 4 Center for Drug Evaluation and Safety (CoDES), University of Florida , Gainesville, FL 32611, United States PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Amie Goodin Masoud Rouhizadeh 1 Department of Pharmaceutical Outcomes and Policy, College of Pharmacy, University of Florida , Gainesville, FL 32611, United States 2 Division of General Internal Medicine, Department of Medicine, Johns Hopkins School of Medicine , Baltimore, MD, United States 4 Center for Drug Evaluation and Safety (CoDES), University of Florida , Gainesville, FL 32611, United States PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Masoud Rouhizadeh Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF ABSTRACT Objective This scoping review aimed to map current applications of 
large language models (LLMs) for extracting social drivers of health (SDoH), benchmark model performance across domains to define the state of the field, and evaluate methodological approaches to identify research gaps and guide clinical deployment. Materials and Methods We searched PubMed, Web of Science, Embase, Scopus, and IEEE Xplore for studies applying LLMs in the detection of SDoH. We applied a novel methodological framework integrating: (1) a hierarchical classification system for SDoH domains and LLM architectures; (2) a systematic approach for synthesizing performance metrics; and (3) a custom seven-domain instrument to assess methodological rigor. Results Forty-two studies met inclusion criteria. Behavioral Factors had the highest median F1-score (0.87), while Health Care Access and Quality showed the lowest and most variable performance (median F1 = 0.59). Research was concentrated in the United States (85.7%) and relied largely on private institutional datasets (69%), often focused on critical care populations (45.2%). Methodological assessment revealed that only 29% of studies provided annotation guidelines, 24% assessed fairness across demographic groups, and 21% validated models externally. Discussion and Conclusion The progress of using LLMs for SDoH extraction is limited by performance variability, weak methodological rigor in the conducted studies, and minimal attention given to fairness and generalizability. Methodological gaps include a lack of published annotation guidelines, fairness assessment, and external model validation. LLMs show strong potential for extracting SDoH from clinical text. However, moving forward requires addressing the current limitations through more standardized, transparent, and robust research. 1. BACKGROUND AND SIGNIFICANCE Social drivers of health (SDoH)—defined by the World Health Organization (WHO) as the non-medical factors influencing health outcomes—determine an estimated 70–80% of modifiable health contributions across populations. 
1 – 3 A robust body of research, including longitudinal cohort studies and health disparities analyses, consistently demonstrates that socioeconomic status, environmental exposures, and psychosocial factors exert a greater impact on population health than clinical care alone. 4 – 9 Despite these well-established associations, healthcare systems continue to face major challenges in systematically capturing, documenting, and integrating these critical determinants into routine clinical practice, creating a fundamental disconnect between epidemiological evidence and care delivery. 10 Although electronic health records (EHRs) have transformed healthcare documentation, they remain limited in their capacity to capture SDoH. 11 Analyses of national Medicare claims indicate that structured SDoH codes (ICD-10-CM Z-codes Z55–Z65) are used in fewer than 1.5% of patient encounters, largely due to clinician time constraints, a lack of standardized screening protocols, and limited integration into care pathways. 12 , 13 In contrast, natural language processing (NLP) studies estimate that 83–91% of SDoH documentation resides within unstructured clinical narratives—representing both a rich source of untapped information and a significant informatics challenge. 14 Three fundamental barriers impede effective SDoH extraction from clinical text: (1) documentation heterogeneity, characterized by low prevalence and high linguistic variability of SDoH mentions across providers and settings; (2) architectural constraints within EHR systems that limit SDoH data capture and interoperability; and (3) analytical limitations of traditional NLP techniques, which lack the contextual understanding required to identify implicit social determinants. 15 – 18 Early rule-based and conventional machine learning approaches, though interpretable and relatively simple to deploy, failed to capture the subtle, context-dependent language pervasive in clinical documentation. 
19 , 20 Identifying housing instability, for instance, requires systems to recognize that phrases like “patient sleeps on friend’s couch” or “lives on streets” signal homelessness—inferences that require contextual understanding beyond simple keyword matching. 21 Large language models (LLMs) represent a paradigmatic shift in clinical NLP capabilities for SDoH extraction. These transformer-based neural architectures leverage massive pretraining corpora to develop nuanced linguistic understanding, enabling superior performance on complex extraction tasks despite limited labeled data. 22 – 25 Encoder-based architectures (BERT, ClinicalBERT, BioBERT) excel at classification through bidirectional context modeling, while decoder-based models (GPT variants) demonstrate few-shot learning capabilities that address data scarcity challenges endemic to SDoH research. 22 – 25 Recent studies suggest that models such as BERT and GPT-4 outperform earlier methods—and, in some cases, even human reviewers—in detecting implicit or rare SDoH within clinical narratives. 26 – 28 Despite these advances, critical methodological and ethical challenges constrain the clinical deployment of LLMs for SDoH extraction. Current implementations are often fragmented with inconsistent performance metrics, varied validation practices, and limited methodological transparency. This methodological heterogeneity prevents meaningful cross-study comparisons and evidence synthesis needed to establish clinical best practices. More concerning, pre-trained models inherit societal biases from their training corpora, potentially perpetuating or amplifying health inequities when applied to marginalized populations—a fundamental ethical challenge given that SDoH applications specifically target vulnerable groups requiring equitable representation and accurate identification. 29 – 32 2. 
OBJECTIVE This scoping review aimed to: (1) systematically map the SDoH domains and LLM architectures investigated to identify research gaps; (2) establish a standardized benchmark of LLM performance across SDoH domains and subdomains to serve as a field-level reference for future evaluations; (3) compare model performance using quantitative metrics; and (4) assess methodological rigor across internal validity, external validity, and reporting transparency. By integrating both performance outcomes and methodological appraisal, this review provides a dual-level benchmark to guide the development of robust and clinically meaningful LLM applications for SDoH extraction. 3. MATERIALS AND METHODS 3.1. Study Design We conducted a scoping review following the Preferred Reporting Items for Systematic Reviews and Meta-Analyses Extension for Scoping Reviews (PRISMA-ScR) checklist and the Enhancing Transparency in Reporting the Synthesis of Qualitative Research (ENTREQ) guidelines. 33 , 34 Full documentation of these frameworks is provided in the Supplementary Materials; PRISMA-ScR Checklist and Appendix A. 3.2. Databases and Search Strategy To comprehensively capture literature at the intersection of clinical medicine, informatics, and machine learning, we searched five electronic databases: PubMed/MEDLINE, Web of Science, Embase, Scopus, and IEEE Xplore. Search strategies were collaboratively developed by a medical informaticist and an information specialist experienced in systematic reviews. 
Queries were tailored for each database using both controlled vocabulary (e.g., Medical Subject Headings [MeSH]) and free-text terms and structured around three core concepts: (1) large language models (LLMs), such as “GPT,” “BERT,” and “transformer architecture”; (2) social determinants or drivers of health (SDoH), including “health equity,” “food insecurity,” and “housing instability”; and (3) clinical free-text sources, such as “electronic health record (EHR) narratives” and “clinical notes.” The initial search was conducted on January 15, 2025, and updated on March 1, 2025. No publication date restrictions were applied. Complete search strategies for all databases are available in Supplementary Appendix B, Section 1 . 3.3. Eligibility Criteria We included studies that met four criteria: (1) analyzed clinical free-text data—including EHR notes or synthetic clinical narratives simulating real-world documentation—for SDoH as defined by the World Health Organization (WHO), U.S. Department of Health and Human Services (HHS), Centers for Disease Control and Prevention (CDC), or Centers for Medicare & Medicaid Services (CMS); 2 , 35 , 36 (2) applied LLMs—whether pre-trained, fine-tuned, or instruction-tuned models such as GPT, BERT, or Flan-T5—for SDoH information extraction tasks; (3) reported quantitative performance metrics (e.g., F1-score, precision, recall) relevant to clinical natural language processing tasks such as named entity recognition, concept extraction, or phenotyping; and (4) presented original research with sufficient methodological detail, including peer-reviewed publications or high-quality preprints. 
We excluded studies that met any of the following criteria: (1) used only traditional machine learning methods (e.g., logistic regression, support vector machines) or rule-based systems without comparison to an LLM; (2) focused solely on structured data (e.g., International Classification of Diseases [ICD] codes) or non-clinical sources (e.g., surveys, social media); (3) consisted of review articles, editorials, opinion pieces, or preprints lacking methodological transparency; (4) did not report SDoH-specific performance metrics or presented only qualitative findings; and (5) reported performance as aggregated macro or micro averages across SDoH categories without disaggregated domain-level metrics. 3.4. Study Selection Process All citations were imported into Mendeley (version 1.19.8) and deduplicated. Before screening, a random subset of 10 studies was independently reviewed by all team members to calibrate decision criteria, achieving strong inter-rater agreement (Cohen’s kappa > 0.8). Two reviewers (A.F. and A.S.) independently screened all titles and abstracts using Covidence (Veritas Health Innovation). Conflicts were resolved through discussion. Articles that passed the initial screening underwent full-text review by the same reviewers, with final inclusion decisions made by consensus. The selection process was documented using the PRISMA 2020 flow diagram. 3.5. Data Extraction and Synthesis To analyze and compare heterogeneous LLM-based approaches to SDoH extraction, we developed a structured methodological framework ( Figure 1 ; described in Sections 3.5.1 to 3.5.3 ). This framework supported standardized extraction of five core data categories: study characteristics, model architecture and implementation, dataset attributes, SDoH classification, and performance metrics. Two reviewers (A.F. and A.S.) piloted the extraction form on 10 studies and independently extracted data from all included articles. A third reviewer (M.R.) 
validated a random 20% subset, stratified by publication year, achieving substantial agreement (Cohen’s kappa = 0.92). Discrepancies were resolved through structured adjudication or senior arbitration (M.R.). Download figure Open in new tab Figure 1. Hierarchical Framework for Standardized Evaluation of LLMs in SDoH Extraction from Clinical Text. This flowchart presents the dual-pronged methodological framework applied in the review. The first component is a Methodological Appraisal Framework used to evaluate study quality across three domains: internal validity, external validity, and reporting transparency. The second is an Evidence Mapping & Synthesis Framework , which standardizes comparison of studies by hierarchically classifying both Large Language Models (LLMs) and Social Drivers of Health (SDoH), and systematically aggregating reported performance metrics. Together, these components support a comprehensive synthesis of LLM performance and methodological rigor across the field. 3.5.1 SDoH Classification and Performance Framework We adopted the U.S. HHS Healthy People 2030 framework as the primary taxonomy for SDoH, supplemented by the WHO Commission on Social Determinants of Health and CDC domains. 2 , 3 , 35 These frameworks organize SDoH into five domains: Economic Stability, Education Access and Quality, Health Care Access and Quality, Neighborhood and Built Environment, and Social and Community Context. Additionally, we included Behavioral Factors (e.g., substance use, physical activity, and nutrition) as a sixth independent domain to reflect their clinical importance, modifiability, and documentation prevalence in EHR notes. 2 , 3 , 35 , 37 – 39 SDoH performance metrics were aggregated at the level-2 (subcategory) hierarchy. 40 This allowed us to balance analytic feasibility with granularity and to harmonize reporting across studies with variable annotation detail. 
When studies reported multiple attributes for the same factor (e.g., “current smoker” vs. “former smoker”), we prioritized extraction of current patient-specific status for consistency and clinical relevance. Broader attributes such as duration, severity, and family history were also recorded and documented in the Supplementary Materials (Appendix B) for comprehensive analysis. 3.5.2 LLMs Categorization and Performance Metrics We classified each model into a three-tier hierarchy: base architecture (e.g., T5), model family (e.g., Flan-T5), and specific variant (e.g., Flan-T5-Large). 41 For studies that evaluated multiple models, we adopted the more robust approach of selecting a single, overall best-performing model and extracting all its associated metrics. To this end, from each study presenting multiple models, we identified the ‘best-performing model’ as the one achieving the highest reported F1-score, given that the F1-score comprehensively balances recall and precision. All reported metrics (recall, precision, and F1-score) for this single, selected model were then recorded for our analysis. For example, consider a study comparing three models: Model A: Recall = 0.70, Precision = 0.80, F1-score = 0.75; Model B: Recall = 0.85, Precision = 0.75, F1-score = 0.79; Model C: Recall = 0.80, Precision = 0.82, F1-score = 0.81. In this scenario, Model C would be selected as the best-performing model due to its superior F1 score (0.81). Consequently, we would record Model C’s recall (0.80), Model C’s precision (0.82), and Model C’s F1-score (0.81) for our analysis. This method ensured that our evaluation reflected the holistic performance capabilities of a single, optimized model from each study, rather than an amalgam of potentially unrelated peak scores across different models for individual metrics. 
42 To reflect practical clinical utility, we favored lenient over strict span-matching metrics when available, acknowledging the variability of real-world SDoH documentation in clinical notes. 43 We also applied a validation hierarchy to contextualize generalizability: external validation on independent datasets was considered the gold standard, followed by held-out testing, and cross-validation. 44 Domain-level summary statistics (e.g., median, interquartile range) were calculated for each SDoH category, with finer-grained metrics reported in the Supplementary Materials - Appendix B. 3.6. Methodological Assessment Recognizing the absence of established guidelines for evaluating LLMs in SDoH extraction, we developed a structured assessment framework spanning three key dimensions: internal validity, external validity, and reporting transparency. These dimensions were informed by patterns identified during preliminary review and are designed to reflect both technical rigor and clinical relevance ( Table 1 ). 45 – 52 View this table: View inline View popup Table 1. Methodological Domains Assessed in this Review Internal validity was assessed through three domains: annotation guidelines, error analysis, and subgroup fairness. We examined whether studies documented clear labeling criteria, conducted error analyses to identify model limitations (e.g., conflation of couch surfing with literal homelessness), and evaluated model performance across demographic subgroups. 29 , 53 External validity focused on whether models were tested on independent datasets and the clinical specificity of the documentation analyzed. 44 Reporting transparency included availability of code, prompts, and datasets, which are essential for reproducibility. 56 Full rationale for domain selection is provided in Supplementary Appendix C. Although this framework is not a validated instrument, it offers a pragmatic structure for assessing the current evidence base and identifying methodological gaps. 
It supports consistent evaluation of LLM-based approaches to SDoH extraction and fosters methodological refinement in this emerging field. 4. RESULTS From 254 records retrieved across Scopus, Embase, PubMed, and IEEE Xplore (through March 1, 2025), 159 remained after 95 duplicates were removed. Eighteen additional studies were identified via reference snowballing, yielding 177 titles and abstracts for screening. Of the 132 full texts assessed, 42 met inclusion criteria 26 – 28 , 57 – 95 ( Supplementary Table ). The main reasons for exclusion were reliance on structured/non-clinical text, use of traditional ML only, or unrelated clinical focus ( Figure 2 ). Download figure Open in new tab Figure 2. PRISMA 2020 Flow Diagram for Study Selection. This diagram outlines the study identification and selection process. A total of 254 records were retrieved from four databases, with 159 remaining after duplicate removal. An additional 18 records were identified through reference snowballing. Following full-text screening of 132 articles, 42 studies met the inclusion criteria and were retained for qualitative synthesis. Abbreviations: EHR, electronic health records; ML, machine learning; LLM, large language model; SDoH, Social Drivers of Health . 4.1. Data Sources and Patient Characteristics We observed considerable data heterogeneity across the included studies ( Appendix B, Section 2 ). Most studies (33/42; 79% ) used private institutional datasets, which limits reproducibility. Public datasets appeared in 9 studies ( 21% ), with MIMIC-III or MIMIC-IV utilized in 8 out of 9 public cases, reflecting a pronounced bias toward critical care contexts. Dataset sizes ranged widely, from as few as 100 notes to as many as 50,000, underscoring substantial variation in training data magnitude. 
This spectrum also illustrates the evolution from BERT-like models, which often require large annotated corpora, to GPT-based or retrieval-augmented models—many of which achieve competitive results with relatively limited examples. Geographically, the studies were overwhelmingly U.S.-based ( 83% , 35/42), with limited representation from Korea, Spain, France, Austria, and the UK. This U.S.-centrism further constrains the global generalizability of model findings. Sampling was predominantly convenience-based. Nearly half of studies ( 19/42; 45% ) focused on critical care populations, largely driven by the widespread use of MIMIC datasets. Oncology (5/42, 12%), cardiology (4/42, 10%), and psychiatric cohorts (4/42, 10%) were also examined, while pediatric, transplant, and other subpopulations were infrequently represented. As a result, generalizability across disease areas and patient groups remains limited and poorly characterized. A full description of datasets and study populations is available in Appendix B ( Section 2 ). 4.2. Performance of LLM Variants Across SDoH Categories and Subcategories Across the six major SDoH domains, LLM performance exhibited considerable variability, with marked differences in effectiveness depending on the specific subdomain and metric evaluated ( Figure 3 ). Behavioral and social domains yielded the strongest and most consistent results, while health care access and mental health remained persistent challenges. Download figure Open in new tab Figure 3. Landscape of LLMs Performance across SDoH Categories and Subcategories. Dot plots display reported performance metrics—Recall, Precision, and F1-score—across studies, grouped by SDoH domain: (A) Behavioral Factors, (B) Economic Stability, (C) Neighborhood and Built Environment, (D) Social and Community Context, (E) Health Care Access and Quality, and (F) Education Access and Quality. 
Each point represents a metric reported by a single study; point shape and color correspond to the LLM used, as indicated in the in-figure legend. The red dashed line marks the 0.80 performance threshold commonly referenced in clinical benchmarks. Gu (2024) reported only accuracy, and Robitschek (2024) reported F1 < 0.2 across all 27 subcategories. Abbreviations: LLM, large language model; SDoH, Social Drivers of Health . Behavioral Factors represented the most consistently high-performing domain ( Figure 3A ). Substance Use extraction achieved outstanding results across several studies, most notably with Petit-Jean 2024’s EDS-CamemBERT (F1=0.972) and Shah-Mohammadi 2024b’s GPT-3.5-turbo (F1=0.95); many models reported F1-scores above the 0.80 clinical benchmark ( Figure 4C ). In contrast, performance in the Mental Health subdomain was significantly lower, with Fu 2024’s Flan-T5-Large (F1=0.38) illustrating the challenges of extracting nuanced psychological concepts. Download figure Open in new tab Figure 4. Landscape of LLM Applications in SDoH Extraction. (A) Stacked bar chart showing the number of studies (n = 42) addressing each of the six primary SDoH domains. Bars are color-segmented to represent subdomain-level analysis. (B) Horizontal bar chart displaying the frequency of unique LLM variants used across studies, grouped by model family (e.g., BERT, GPT). (C) Boxplots summarizing Recall, Precision, and F1-score distributions by SDoH domain. Each metric was collected independently, as studies varied in which metrics they reported and which level-2 SDoH subcategories they targeted. As a result, distributions reflect separate collective summaries rather than directly linked values. Boxes represent interquartile ranges (IQR), central lines indicate medians, and whiskers extend to 1.5×IQR. A horizontal red dashed line denotes the 0.80 clinical performance benchmark. 
Above each domain label, a color-coded donut chart illustrates the proportional frequency of LLM families contributing to the performance data for that domain. Abbreviations: LLM, large language model; SDoH, Social Drivers of Health . Economic Stability ( Figure 3 B ; Figure 4 A ) displayed notable intra-domain variability. Employment Status extraction was among the most successful subdomains, with Consoli 2024’s GPT-3.5 (F1=0.94), Roy 2024’s GPT-4 (F1=0.90), and Torii 2023’s Bio_Discharge_Summary_BERT (F1=0.9538) all demonstrating high precision and recall. In contrast, Financial Issues showed pronounced performance asymmetry, such as Roy 2024’s GPT-4 achieving perfect precision (1.00) but low recall (0.50), resulting in overall moderate F1 (0.60). Food Insecurity displayed the widest variation, with Lituiev 2022’s RoBERTa yielding poor performance (F1=0.46) and Goel 2024’s Yi-34B-Chat achieving near-perfect extraction (F1=0.979). Neighborhood and Built Environment results were mixed ( Figure 3 C ). Living Status frequently achieved strong F1-scores with Yu 2024’s GatorTron (F1=0.914), Richie 2023’s BioClinicalBERT (F1=0.91), and Torii 2023’s Bio_Discharge_Summary_BERT (F1=0.8702), although results ranged from as low as F1=0.53 to above 0.91. Housing Instability showed extreme variability, with F1-values from Han 2022’s BERT (0.552) to Goel 2024’s Nous-Hermes-2-Yi-34B (0.984). Transportation exhibited pronounced precision-recall tradeoffs (e.g., Han 2022: recall=0.829, precision=0.352), raising concerns about false positive rates. Social and Community Context was one of the best-performing and most frequently evaluated domains ( Figure 3 D; Figure 4 A ). Social Support extraction achieved consistent and high F1-scores across multiple models, such as Yu 2024’s GatorTron (F1=0.946), Goel 2024’s Llama-2-13B-chat-hf (F1=0.963), and Wang 2023’s BioClinicalBERT (F1=0.937). 
Other subdomains, including Relationship Status (F1=0.946–0.958) and Bereavement (F1=0.87), also exhibited reliably strong performance. However, Interpersonal Safety presented greater extraction challenges, often with models achieving high precision (e.g., Roy 2024, Gabriel 2024: precision=1.00) but only moderate recall (Roy 2024: 0.73). Health Care Access and Quality was underrepresented and showed the weakest performance overall ( Figure 3 E; Figure 4 A ). Insurance Status extraction was particularly poor (Lituiev 2022’s RoBERTa: F1=0.225), and Health Crisis detection varied widely, ranging from Yu 2024’s GatorTron (F1=1.00) to Scherbakov 2025’s Mixtral 8×7B (F1=0.59). Education Access and Quality was evaluated in relatively few studies but included examples of strong model performance ( Figure 3 F; Figure 4 A ). Yu 2024’s GatorTron achieved the best results for Education Status (F1=0.963, recall=0.967). Boxplots in Figure 4 C summarize broader domain-level trends, highlighting Substance Use, Living Status, and Environmental Exposure as high-performing subdomains (median F1 > 0.85), while Mental Health and Insurance Status remain challenging. Figure 4B illustrates the diversity and prevalence of LLM architectures within each domain. Overall, these results ( Figure 3 and Figure 4 ) demonstrate a heterogeneous LLM performance landscape in SDoH extraction. Domains such as behavioral factors and social context yield the most reliable results, while healthcare access and mental health subdomains continue to present distinct challenges. 4.3. Methodological Assessment Our methodological assessment revealed substantial variability in how current studies address internal validity, external validity, and reporting transparency in LLM-based extraction of social determinants of health ( Figure 5 ). Download figure Open in new tab Figure 5. Methodological Evaluation of LLM-Based SDoH Studies. 
(A) Heatmap showing domain-level methodological assessment for each of the 42 included studies. Green dots indicate the domain was addressed; red dots indicate it was not. (B) Horizontal bar chart summarizing the proportion and count of studies meeting criteria for each of the seven methodological domains, grouped by category: Internal Validity, External Validity, and Reporting Transparency. (C) Correlogram depicting Pearson correlation coefficients between methodological domains, with warmer colors (green) reflecting positive correlations and cooler colors (purple) indicating negative correlations in reporting practices. Abbreviations: LLM, large language model Internal validity —which assesses model rigor in identifying limitations and subgroup performance—was partially addressed in most studies. While error analysis was the most commonly fulfilled domain (76%, 32/42), annotation guidelines were documented in only 29% (12/42) of studies, and fairness assessments across demographic subgroups were reported in just 24% (10/42). This underreporting limits insight into both label consistency and equity in model behavior—core elements for extracting nuanced and context-dependent SDoH information. External validity was less consistently demonstrated. Although clinical context or patient population was specified in 69% (29/42) of studies, indicating some attention to where models were applied, only 21% (9/42) conducted external dataset validation using independent datasets— undermining confidence in model generalizability across institutions or care settings. Reporting transparency showed mixed adoption. Code or prompt availability was reported in 55% (23/42) of studies, supporting some reproducibility. However, only 29% (12/42) made annotated datasets publicly accessible, limiting opportunities for external benchmarking and comparative evaluation—particularly relevant for studies focused on housing instability, employment insecurity, or food access. 
Correlational patterns revealed structural tensions and opportunities for synergy. Dataset availability was negatively correlated with both fairness assessment (r = –0.23) and error analysis (r = –0.18), suggesting that privacy concerns or proprietary constraints may hinder both data sharing and rigorous performance evaluation. Conversely, the availability of annotation guidelines was moderately associated with external validation (r = 0.31), implying that better documentation may support transferability and generalization testing. A few studies—such as Sushil 2024 and Guevara 2024 —met most criteria across internal and external validity dimensions, reflecting high methodological rigor. Yet even among these exemplars, reporting transparency features were not uniformly applied. This inconsistency underscores a broader issue: predictive performance alone is insufficient. Reliable and equitable SDoH extraction depends on comprehensive methodological practices that enhance generalizability, transparency, and fairness. 5. DISCUSSION The influence of SDoH on clinical outcomes is well established, yet health systems continue to face barriers in systematically integrating these factors into routine care. 96 – 98 A key limitation is the lack of structured SDoH data—less than 1.5% of content in EHRs is coded, with the majority embedded in free-text narratives. 99 This review systematically examined how LLMs have been applied to extract SDoH from clinical text and assessed their readiness for clinical deployment using a seven-domain methodological framework that evaluates internal validity, external validity, and reporting transparency. To our knowledge, this is the first review to systematically evaluate LLMs for SDoH extraction from a methodological perspective. 
Prior systematic reviews, such as Patra et al., 2021 primarily examined traditional machine learning and rule-based approaches, focusing on a narrow set of SDoH categories and conducted before the emergence of powerful generative models. 100 More recent scoping reviews, including Li et al., 2024 , offered broader overviews of the SDoH data pipeline—from data collection to intervention—but did not appraise LLM-based extraction methods in depth. 101 Domain-specific reviews, such as those by McNeill et al., 2023 in cardiovascular medicine and Abbott et al., 2024 in emergency care, provided valuable insights yet relied on literature searches completed before key developments in LLM research. 102 , 103 Meanwhile, works like Wu et al., 2023 are limited to single-method evaluations of pre-transformer models. 104 In contrast, our review provides an updated synthesis of LLM performance across domains and introduces a standardized, domain-level evaluation framework tailored to SDoH extraction— accounting for challenges such as linguistic ambiguity, fairness, and contextual nuance. By integrating WHO- and HHS-endorsed taxonomies with a hierarchical model classification (e.g., GPT, LLaMA, Flan-T5), we enabled structured comparisons across architectures and domains. Our March 2025 search cutoff ensured coverage of recent trends such as open-source LLMs, hybrid prompting strategies, and interpretability methods. Furthermore, current evaluation tools (e.g., HELM, TRIPOD-AI) offer partial guidance but are not optimized for the specific challenges of SDoH extraction. 52 , 105 – 107 Our domain-level framework addresses this gap by providing structured, interpretable evaluation criteria that emphasize transparency, generalizability, and fairness. As such, this work offers the most comprehensive and methodologically rigorous synthesis to date, establishing a foundation for equitable, reliable clinical applications. 
LLMs offered substantial advantages over manual annotation methods, which are costly and time-intensive. Ralevski et al. estimated that annotating 25,000 clinical notes cost ∼$9,400 manually compared to <$150 with GPT-3.5. 28 Consoli et al. similarly showed their SDoH-GPT system to be over 20× cheaper and 10× faster than human annotation, while achieving greater consistency. 94 These developments paralleled a broader transition from BERT-based models toward encoder-decoder and decoder-only architectures, including privacy-preserving open-source options like LLaMA and Mixtral. 108 , 109 Importantly, larger models did not always confer performance advantages. For instance, a fine-tuned LLaMA-2 7B model achieved an F1-score of 0.885 in extracting financial issues—rivaling GPT-4. 94 Techniques such as domain routing, exemplified by Goel et al.’s Oracle Router, which sends clinical notes to the best-performing model per SDoH domain, further enhanced efficiency and specialization. 93 However, advanced methods like retrieval-augmented generation and knowledge distillation remain underutilized in this space. 110 , 111 Despite these gains, many studies (26%) reported only macro-averaged F1-scores without accompanying recall, precision, or class distribution metrics, limiting interpretability and clinical trust. Low recall may result in missed opportunities for intervention, while low precision could lead to unnecessary screening. Future studies should include class-level metrics, prevalence data, and precision–recall curves to better capture model behavior in real-world scenarios. 112 Methodological inconsistencies also hindered comparability. Few studies systematically tested prompting strategies; Consoli et al. was an exception, showing variable performance across zero-, two-, and eight-shot prompts. 94 Additionally, chain-of-thought prompting and model confidence assessments were rare, with Scherbakov et al.’s study a notable outlier. 
113 They introduced a self-consistency method—accepting outputs only if generated in ≥2 of 4 LLM runs—enhancing reliability without sacrificing efficiency. 59 Although open-source models are increasingly used, few studies conducted head-to-head comparisons with high-performing proprietary LLMs such as Gemini and Claude. Including these models in future evaluations could enhance generalizability and better reflect real-world clinical utility. 114 , 115 Across the three dimensions of internal validity, external validity, and reporting transparency, we observed substantial and recurrent gaps that undermine the reproducibility, equity, and clinical applicability of LLM-based SDoH extraction. Only four studies satisfied all internal validity criteria, and just 29% reported annotation guidelines—highlighting the urgent need for tailored reporting standards specific to this domain. While open science practices are gradually improving, with 55% of studies sharing code or prompts, dataset sharing remains limited (29%), likely constrained by privacy regulations. Interestingly, we observed that studies sharing datasets were less likely to include fairness assessments (r = –0.23) or detailed error analyses (r = –0.18). This inverse association may reflect the tension between data transparency and privacy-preserving documentation practices, where concerns about re-identification or institutional review constraints limit the granularity of performance reporting. Although the majority of studies included some form of error analysis (76%), only 24% incorporated fairness assessments, and fewer still performed subgroup analyses. These omissions are especially concerning in the context of SDoH, where model predictions may reflect—and potentially reinforce—existing disparities tied to race, gender, socioeconomic status, or insurance type. 29 – 32 , 53 , 54 Without systematic equity evaluations, such models risk perpetuating structural inequities under the guise of automation. 
External validation was also limited. While 69% of studies stated the medical condition evaluated, only 21% tested external datasets, and just four met both criteria. The heavy reliance on private, U.S.-based ICU datasets further limits generalizability. A model trained on discharge notes from urban ICUs may not transfer to outpatient or community settings. 44 , 116 This challenge is compounded by the lack of high-quality, publicly available SDoH datasets. Existing corpora are narrow in scope and often rely on surface-level named entity recognition, which fails to capture contextually rich and implicit social information. 117 Without granular annotations, models lack semantic depth and struggle with nuanced understanding, limiting cross-setting applicability. 118 , 119 Additionally, the absence of standardized SDoH taxonomies impedes precision. Most studies used broad domains or basic NER tags that fail to capture conceptually distinct yet clinically relevant distinctions (e.g., eviction risk vs. homelessness). 118 , 119 ICD-10 Z-codes, while useful, are too coarse for nuanced modeling. 120 This lack of shared vocabulary affects interoperability and intervention targeting. 39 , 121 To address these challenges, future work should prioritize: (1) standardizing annotation and fairness reporting practices; (2) promoting secure data-sharing infrastructures that preserve patient privacy; and (3) investing in diverse, real-world benchmark datasets spanning care settings, patient populations, and healthcare systems. These efforts are essential for developing trustworthy, generalizable SDoH tools that can be responsibly integrated into clinical workflows. This review has several limitations. First, the substantial heterogeneity across studies—in annotation scope, modeling strategies, and outcome definitions—precluded the use of formal meta-analytic techniques. 
To address this, we applied a structured qualitative synthesis supported by a standardized classification framework. Second, due to inconsistent reporting of class distributions, we relied on macro-averaged F1-scores, which may obscure model performance on rare SDoH subcategories. To mitigate this, we provide detailed subdomain-level results in the supplementary materials. Third, for consistency in evaluation, we extracted the best-performing model from each study, which may underrepresent within-study performance variability. Fourth, our search was restricted to English-language studies published prior to March 2025, though we included five major biomedical and technical databases to maximize coverage. Finally, while our methodological assessment framework was designed to be practical and domain-relevant, it remains preliminary and unvalidated. It does not yet weight criteria based on clinical impact—an important area for future refinement, particularly with respect to fairness and generalizability. 6. CONCLUSION LLMs showed strong promise for scalable SDoH extraction from clinical text, but progress is hindered by methodological shortcomings and inconsistent practices. Performance disparities across domains highlight the need for improved rigor, transparency, and contextual sensitivity. Advancing clinically meaningful and generalizable systems will require not just technical innovation but a shift in research culture. 7. DECLARATIONS Funding : This research received no funding from any source. Ethics approval : Not applicable. Consent to participate: Not applicable. Consent for publication: Not applicable. Data availability: Data sharing is not applicable to this article as no datasets were generated or analyzed during the current study. Competing interests: The authors declare no competing interests. 
Data Availability All data produced in the present work are contained in the manuscript. Disclosure of interest statement The authors declare they have no conflict of interest. Author Contributions M.R. conceptualized the study. A.F. and A.G. designed the methodology. A.F. and A.S. conducted the data collection and formal analysis. M.R., A.G., and E.H. supervised the work and contributed to writing (review and editing). A.F. wrote the original draft and created the visualizations. All authors reviewed and edited the final manuscript. Acknowledgments None. REFERENCES ↵ Hood CM , Gennuso KP , Swain GR , Catlin BB . County Health Rankings: Relationships between Determinant Factors and Health Outcomes . Am J Prev Med . 2016 ; 50 ( 2 ): 129 – 135 . doi: 10.1016/j.amepre.2015.08.024 OpenUrl CrossRef PubMed ↵ U.S. Department of Health and Human Services . Social Determinants of Health - Healthy People 2030 | health.gov. U.S. Department of Health and Human Services . Published 2024 . Accessed March 29, 2025. https://odphp.health.gov/healthypeople/priority-areas/social-determinants-health ↵ World Health Organization (WHO) . Social determinants of health . Published 2024 . Accessed March 30, 2025. https://www.who.int/health-topics/social-determinants-of-health ↵ Enard KR , Coleman AM , Yakubu RA , Butcher BC , Tao D , Hauptman PJ . Influence of Social Determinants of Health on Heart Failure Outcomes: A Systematic Review . J Am Heart Assoc . 2023 ; 12 ( 3 ): 26590 . doi: 10.1161/JAHA.122.026590 OpenUrl CrossRef Khatib R , Glowacki N , Byrne J , Brady P . Impact of social determinants of health on anticoagulant use among patients with atrial fibrillation: Systemic review and meta-analysis . Med (United States ) . 2022 ; 101 ( 35 ): E29997 . doi: 10.1097/MD.0000000000029997 OpenUrl CrossRef Garg S , Sweet N , Boderman B , et al. Multiplicative Impact of Adverse Social Determinants of Health on Outcomes in Lupus Nephritis: A Meta-analysis and Systematic Review . Arthritis Care Res (Hoboken ) . 
2024 ; 76 ( 9 ): 1232 – 1245 . doi: 10.1002/acr.25359 OpenUrl CrossRef PubMed Amjad S , MacDonald I , Chambers T , et al. Social determinants of health and adverse maternal and birth outcomes in adolescent pregnancies: A systematic review and meta-analysis . Paediatr Perinat Epidemiol . 2019 ; 33 ( 1 ): 88 – 99 . doi: 10.1111/ppe.12529 OpenUrl CrossRef PubMed Wilder ME , Kulie P , Jensen C , et al. The Impact of Social Determinants of Health on Medication Adherence: a Systematic Review and Meta-analysis . J Gen Intern Med . 2021 ; 36 ( 5 ): 1359 – 1370 . doi: 10.1007/S11606-020-06447-0/FIGURES/3 OpenUrl CrossRef PubMed ↵ Lam JR , Tyler J , Scurrah KJ , Reavley NJ , Dite GS . The Association between Socioeconomic Status and Psychological Distress: A Within and Between Twin Study . Twin Res Hum Genet . 2019 ; 22 ( 5 ): 312 – 320 . doi: 10.1017/THG.2019.91 OpenUrl CrossRef PubMed ↵ Wang M , Pantell MS , Gottlieb LM , Adler-Milstein J . Documentation and review of social determinants of health data in the EHR: Measures and associated insights . J Am Med Informatics Assoc . 2021 ; 28 ( 12 ): 2608 – 2616 . doi: 10.1093/jamia/ocab194 OpenUrl CrossRef ↵ Baker KM , Hill MA , Goldberg DG , et al. Using Z Codes to Document Social Risk Factors in the Electronic Health Record: A Scoping Review . Med Care . 2024 ; 63 ( 3 ). doi: 10.1097/MLR.0000000000002101 OpenUrl CrossRef ↵ Lee JS , MacLeod KE , Kuklina E V. , Tong X , Jackson SL . Social Determinants of Health–Related Z Codes and Health Care Among Patients With Hypertension . AJPM Focus . 2023 ; 2 ( 2 ): 100089 . doi: 10.1016/j.focus.2023.100089 OpenUrl CrossRef PubMed ↵ Torres JM , Lawlor J , Colvin JD , et al. ICD Social Codes: An underutilized resource for tracking social needs . Med Care . 2017 ; 55 ( 9 ): 810 – 816 . doi: 10.1097/MLR.0000000000000764 OpenUrl CrossRef PubMed ↵ Dorr D , Bejan CA , Pizzimenti C , Singh S , Storer M , Quinones A . 
Identifying patients with significant problems related to social determinants of health with natural language processing . In: Studies in Health Technology and Informatics . Vol 264 . IOS Press; 2019 :1456-1457. doi: 10.3233/SHTI190482 OpenUrl CrossRef PubMed ↵ Lituiev DS , Lacar B , Pak S , Abramowitsch PL , De Marchis EH , Peterson TA . Automatic extraction of social determinants of health from medical notes of chronic lower back pain patients . J Am Med Inform Assoc . 2023 ; 30 ( 8 ): 1438 – 1447 . doi: 10.1093/jamia/ocad054 OpenUrl CrossRef PubMed Hatef E , Rouhizadeh M , Tia I , et al. Assessing the availability of data on social and behavioral determinants in structured and unstructured electronic health records: A retrospective analysis of a multilevel health care system . JMIR Med Informatics . 2019 ; 7 ( 3 ). doi: 10.2196/13802 OpenUrl CrossRef Kepper MM , Walsh-Bailey C , Prusaczyk B , Zhao M , Herrick C , Foraker R . The adoption of social determinants of health documentation in clinical settings . Health Serv Res . 2023 ; 58 ( 1 ): 67 – 77 . doi: 10.1111/1475-6773.14039 OpenUrl CrossRef ↵ Feller DJ , Bear Don’t Walk Iv OJ, Zucker J, Yin MT, Gordon P , Elhadad N. Detecting Social and Behavioral Determinants of Health with Structured and Free-Text Clinical Data. Appl Clin Inform . 2020 ; 11 ( 1 ): 172 – 181 . doi: 10.1055/s-0040-1702214 OpenUrl CrossRef PubMed ↵ Gundlapalli A V. , Carter ME , Palmer M , et al. Using natural language processing on the free text of clinical documents to screen for evidence of homelessness among US veterans . AMIA Annu Symp Proc . 2013 ; 2013 : 537 – 546 . Accessed March 29, 2025. https://pmc.ncbi.nlm.nih.gov/articles/PMC3900197/ OpenUrl PubMed ↵ Bejan CA , Angiolillo J , Conway D , et al. Mining 100 million notes to find homelessness and adverse childhood experiences: 2 case studies of rare and severe social determinants of health in electronic health records . J Am Med Informatics Assoc . 2018 ; 25 ( 1 ): 61 – 71 . 
doi: 10.1093/jamia/ocx059 OpenUrl CrossRef PubMed ↵ Kim MH , Miramontes S , Mehta S , et al. Extracting Housing and Food Insecurity Information From Clinical Notes Using cTAKES . Health Serv Res. Published online 2025 . doi: 10.1111/1475-6773.14440 OpenUrl CrossRef ↵ Devlin J , Chang MW , Lee K , Toutanova K. BERT: Pre-training of deep bidirectional transformers for language understanding. In: NAACL HLT 2019 - 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies - Proceedings of the Conference. Vol 1. Association for Computational Linguistics (ACL) ; 2019 :4171-4186. Accessed March 2, 2025. https://arxiv.org/abs/1810.04805v2 Alsentzer E , Murphy JR , Boag W , et al. Publicly Available Clinical BERT Embeddings . Published online April 5, 2019 . Accessed March 29, 2025. https://arxiv.org/abs/1904.03323v3 Lee J , Yoon W , Kim S , et al. BioBERT: a pre-trained biomedical language representation model for biomedical text mining. Wren J, ed . Bioinformatics . 2020 ; 36 ( 4 ): 1234 – 1240 . doi: 10.1093/bioinformatics/btz682 OpenUrl CrossRef PubMed ↵ Radford A , Narasimhan K , Salimans T , Sutskever I . Improving Language Understanding by Generative Pre-Training . Published online 2018 . http://arxiv.org/abs/math/0511056 ↵ Han S , Zhang RF , Shi L , et al. Classifying social determinants of health from unstructured electronic health records using deep learning-based natural language processing . J Biomed Inform . 2022 ; 127 : 103984 . doi: 10.1016/j.jbi.2021.103984 OpenUrl CrossRef PubMed ↵ Roy S , Morrell S , Zhao L , Homayouni R . Large-scale identification of social and behavioral determinants of health from clinical notes: comparison of Latent Semantic Indexing and Generative Pretrained Transformer (GPT) models . BMC Med Inform Decis Mak . 2024 ; 24 ( 1 ): 296 . doi: 10.1186/s12911-024-02705-x OpenUrl CrossRef PubMed ↵ Ralevski A , Taiyab N , Nossal M , Mico L , Piekos S , Hadlock J . 
Using Large Language Models to Abstract Complex Social Determinants of Health From Original and Deidentified Medical Notes: Development and Validation Study . J Med Internet Res . 2024 ; 26 : e63445 . doi: 10.2196/63445 OpenUrl CrossRef PubMed ↵ Ferrara E . Fairness and Bias in Artificial Intelligence: A Brief Survey of Sources , Impacts, and Mitigation Strategies. Sci . 2024 ; 6 ( 1 ): 3 . doi: 10.3390/sci6010003 OpenUrl CrossRef Guo Y , Guo M , Su J , et al. Bias in Large Language Models: Origin , Evaluation, and Mitigation. Published online November 16 , 2024 . Accessed April 1, 2025. http://arxiv.org/abs/2411.10915 Ayoub NF , Balakrishnan K , Ayoub MS , Barrett TF , David AP , Gray ST . Inherent Bias in Large Language Models: A Random Sampling Analysis . Mayo Clin Proc Digit Heal . 2024 ; 2 ( 2 ): 186 – 191 . doi: 10.1016/j.mcpdig.2024.03.003 OpenUrl CrossRef ↵ Kumar CV , Urlana A , Kanumolu G , Garlapati BM , Mishra P , Hyderabad I. No LLM is Free From Bias: A Comprehensive Study of Bias Evaluation in Large Language models . Published online March 15, 2025 . Accessed April 1, 2025. https://arxiv.org/abs/2503.11985v1 ↵ Tricco AC , Lillie E , Zarin W , et al. PRISMA extension for scoping reviews (PRISMA-ScR): Checklist and explanation . Ann Intern Med . 2018 ; 169 ( 7 ): 467 – 473 . doi: 10.7326/M18-0850 OpenUrl CrossRef PubMed ↵ Tong A , Flemming K , McInnes E , Oliver S , Craig J . Enhancing transparency in reporting the synthesis of qualitative research: ENTREQ . BMC Med Res Methodol . 2012 ; 12 ( 1 ): 1 – 8 . doi: 10.1186/1471-2288-12-181 OpenUrl CrossRef PubMed ↵ Social Determinants of Health (SDOH) | About CDC | CDC . Accessed March 30, 2025. https://www.cdc.gov/about/priorities/why-is-addressing-sdoh-important.html ↵ Social Drivers of Health and Health-Related Social Needs | CMS . Accessed March 30, 2025. 
https://www.cms.gov/priorities/innovation/key-concepts/social-drivers-health-and-health-related-social-needs ↵ Capturing Social and Behavioral Domains and Measures in Electronic Health Records: Phase 2 . National Academies Press ; 2015 . doi: 10.17226/18951 OpenUrl CrossRef Social Determinants of Health (SDoH) - white paper - Patient Care - Confluence . Accessed May 27, 2025. https://confluence.hl7.org/spaces/PC/pages/46891893/Social+Determinants+of+Health+SDoH+-+white+paper ↵ Billioux A , Verlander K , Anthony S , Alley D . Standardized Screening for Health-Related Social Needs in Clinical Settings: The Accountable Health Communities Screening Tool . Nationa Acad Med Perspect . 2017 ; 7 ( 5 ). doi: 10.31478/201705b OpenUrl CrossRef ↵ Kollapally NM , Chen Y , Xu J , Geller J . An Ontology for the Social Determinants of Health Domain. In: Proceedings - 2022 IEEE International Conference on Bioinformatics and Biomedicine, BIBM 2022 . Institute of Electrical and Electronics Engineers Inc .; 2022 : 2403 – 2410 . doi: 10.1109/BIBM55620.2022.9995544 OpenUrl CrossRef ↵ Kucharavy A . Overview of Existing LLM Families . In: Large Language Models in Cybersecurity. Springer Nature Switzerland ; 2024 : 31 – 44 . doi: 10.1007/978-3-031-54827-7_3 OpenUrl CrossRef ↵ Tharwat A . Classification assessment methods . Appl Comput Informatics . 2018 ; 17 ( 1 ): 168 – 192 . OpenUrl ↵ Mahajan D , Liang JJ , Tsou CH , Uzuner Ö. Overview of the 2022 n2c2 shared task on contextualized medication event extraction in clinical notes . J Biomed Inform . 2023 ; 144 : 104432 . doi: 10.1016/j.jbi.2023.104432 OpenUrl CrossRef ↵ Collins GS , Dhiman P , Ma J , et al. Evaluation of clinical prediction models (part 1): from development to external validation . BMJ . 2024 ; 384 : e074819 . doi: 10.1136/bmj-2023-074819 OpenUrl FREE Full Text ↵ Viswanathan M , Patnode CD , Berkman ND , et al. Assessing the Risk of Bias in Systematic Reviews of Health Care Interventions. Methods Guid Eff Comp Eff Rev . 
Published online December 13, 2017. Accessed May 11, 2025. http://europepmc.org/books/NBK519366 Deeks JJ , Dinnes J , D’Amico R , et al. Evaluating non-randomised intervention studies . Health Technol Assess (Rockv ) . 2003 ; 7 ( 27 ). doi: 10.3310/HTA7270 OpenUrl CrossRef Higgins JPT , Thomas J , Chandler J , et al. Cochrane handbook for systematic reviews of interventions . Cochrane Handb Syst Rev Interv. Published online January 1 , 2019 : 1 – 694 . doi: 10.1002/9781119536604 OpenUrl CrossRef Institute of Medicine . Standards For Initiating A Systematic Review . Find What Work Heal Care Stand Syst Rev. Published online 2011 : 45 – 80 . Accessed May 11, 2025. https://www.nap.edu/catalog/13059/finding-what-works-in-health-care-standards-for-systematic-reviews Cochrane . RoB 2: A revised Cochrane risk-of-bias tool for randomized trials | Cochrane Bias. British Medical Journal . Published 2022 . Accessed May 11, 2025. https://methods.cochrane.org/bias/resources/rob-2-revised-cochrane-risk-bias-tool-randomized-trials Wolff RF , Moons KGM , Riley RD , et al. PROBAST: A tool to assess the risk of bias and applicability of prediction model studies . Ann Intern Med . 2019 ; 170 ( 1 ): 51 – 58 . doi: 10.7326/M18-1376 OpenUrl CrossRef PubMed Collins GS , Moons KGM , Dhiman P , et al. TRIPOD+AI statement: Updated guidance for reporting clinical prediction models that use regression or machine learning methods . BMJ . 2024 ; 385 . doi: 10.1136/bmj-2023-078378 OpenUrl FREE Full Text ↵ Mitchell M , Wu S , Zaldivar A , et al. Model cards for model reporting. In: FAT* 2019 - Proceedings of the 2019 Conference on Fairness, Accountability, and Transparency. Association for Computing Machinery, Inc ; 2019 : 220 – 229 . doi: 10.1145/3287560.3287596 OpenUrl CrossRef ↵ Ji Y , Ma W , Sivarajkumar S , et al. Mitigating the risk of health inequity exacerbated by large language models . npj Digit Med . 2025 ; 8 ( 1 ): 1 – 11 . 
doi: 10.1038/s41746-025-01576-4 OpenUrl CrossRef PubMed ↵ Wissler L , Almashraee M , Dagmar M , Paschke A . The Gold Standard in Corpus Annotation . J Polit Econ . 2014 ; 7 ( 4 ): 551 . Accessed May 11, 2025. http://www.journals.uchicago.edu/doi/abs/10.1086/250616 OpenUrl Kamoi R , Snigdha S , Das S , et al. Evaluating LLMs at Detecting Errors in LLM Responses . Published online April 4, 2024. Accessed May 11, 2025. https://arxiv.org/pdf/2404.03602v1 ↵ Haibe-Kains B , Adam GA , Hosny A , et al. Transparency and reproducibility in artificial intelligence . Nature . 2020 ; 586 ( 7829 ):E14-E16. doi: 10.1038/s41586-020-2766-y OpenUrl CrossRef PubMed ↵ Patra BG , Lepow LA , Kasi Reddy Jagadeesh Kumar P , et al. Extracting social support and social isolation information from clinical psychiatry notes: comparing a rule-based natural language processing system and a large language model . J Am Med Informatics Assoc . 2025 ; 32 ( 1 ): 218 – 226 . doi: 10.1093/jamia/ocae260 OpenUrl CrossRef PubMed ↵ Kim HK , Park Y , Kim YJ , et al. EILEEN: A Multi-modal Framework for Extracting Alcohol Consumption Patterns from Bilingual Clinical Notes . IEEE Access . 2025 ; 13 : 25741 – 25751 . doi: 10.1109/ACCESS.2025.3538803 OpenUrl CrossRef ↵ Scherbakov D , Heider PM , Wehbe R , Alekseyenko A V. , Lenert LA , Obeid JS . Using large language models for extracting stressful life events to assess their impact on preventive colon cancer screening adherence . BMC Public Health . 2025 ; 25 ( 1 ): 12 . doi: 10.1186/s12889-024-21123-2 OpenUrl CrossRef PubMed Rabbani N , Brown C , Bedgood M , et al. Evaluation of a Large Language Model to Identify Confidential Content in Adolescent Encounter Notes . medRxiv . 2023 ; 178 ( 3 ): 308 – 310 . doi: 10.1101/2023.08.25.23294372 OpenUrl Abstract / FREE Full Text Shah-Mohammadi F , Finkelstein J . Utilizing RAG and GPT-4 for Extraction of Substance Use Information from Clinical Notes . In: Studies in Health Technology and Informatics . 
Vol 321 .; 2024 : 94 – 98 . doi: 10.3233/SHTI241070 OpenUrl CrossRef Gu B , Shao V , Liao Z , et al. Scalable information extraction from free text electronic health records using large language models . BMC Med Res Methodol . 2025 ; 25 ( 1 ): 23 . doi: 10.1186/s12874-025-02470-z OpenUrl CrossRef PubMed ↵ Shah-Mohammadi F , Finkelstein J . Extraction of Substance Use Information From Clinical Notes: Generative Pretrained Transformer–Based Investigation . JMIR Med Informatics . 2024 ; 12 : e56243 . doi: 10.2196/56243 OpenUrl CrossRef ↵ Huang T , Socrates V , Gilson A , et al. Identifying incarceration status in the electronic health record using large language models in emergency department settings . J Clin Transl Sci . 2024 ; 8 ( 1 ): e53 . doi: 10.1017/cts.2024.496 OpenUrl CrossRef Roosan D , Chok J , Li Y , Khou T . Utilizing Quantum Computing-based Large Language Transformer Models to Identify Social Determinants of Health from Electronic Health Records . In: 2024 International Conference on Electrical, Computer and Energy Technologies (ICECET). IEEE ; 2024 : 1 – 6 . doi: 10.1109/ICECET61485.2024.10698600 OpenUrl CrossRef Fu Y , Ramachandran GK , Dobbins NJ , et al. Extracting Social Determinants of Health from Pediatric Patient Notes Using Large Language Models: Novel Corpus and Methods. In: 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, LREC-COLING 2024 - Main Conference Proceedings .; 2024 : 7045 – 7056 . https://www.scopus.com/inward/record.uri?eid=2-s2.0-85195890705&partnerID=40&md5=cf5954b0252da4b1031401eb2710fe73 Guevara M , Chen S , Thomas S , et al. Large language models to identify social determinants of health in electronic health records. npj Digit Med . 2024 ; 7 ( 1 ): 6 . doi: 10.1038/s41746-023-00970-0 OpenUrl CrossRef PubMed Madrid-García A , Pérez-Sancristóbal I , Leticia-Leon , Lydia-Abásolo , Fernández-Gutiérrez B , Rodríguez-Rodríguez L . 
Occupation Recognition and Exploitation in Rheumatology Clinical Notes: Employing Deep Learning Models for Named Entity Recognition and Knowledge Discovery in Electronic Health Records . medRxiv. Published online May 8 , 2024 . doi: 10.1101/2024.05.08.24306389 OpenUrl Abstract / FREE Full Text Yu Z , Peng C , Yang X , et al. Identifying social determinants of health from clinical narratives: A study of performance, documentation ratio, and potential bias . J Biomed Inform . 2024 ; 153 . doi: 10.1016/j.jbi.2024.104642 OpenUrl CrossRef PubMed Peng C , Yang X , Chen A , et al. Generative large language models are all-purpose text analytics engines: Text-to-text learning is all your need . J Am Med Informatics Assoc . 2024 ; 31 ( 9 ): 1892 – 1903 . doi: 10.1093/jamia/ocae078 OpenUrl CrossRef PubMed Sushil M , Butte AJ , Schuit E , van Smeden M , Leeuwenberg AM . Cross-institution natural language processing for reliable clinical association studies: a methodological exploration . J Clin Epidemiol . 2024 ; 167 : 111258 . doi: 10.1016/j.jclinepi.2024.111258 OpenUrl CrossRef Keloth VK , Selek S , Chen Q , et al. Large Language Models for Social Determinants of Health Information Extraction from Clinical Notes – A Generalizable Approach across Institutions . medRxiv. Published online May 22 , 2024 . doi: 10.1101/2024.05.21.24307726 OpenUrl Abstract / FREE Full Text ↵ Kwon S , Wang X , Liu W , et al. ODD: A Benchmark Dataset for the Natural Language Processing Based Opioid Related Aberrant Behavior Detection . Proc 2024 Conf North Am Chapter Assoc Comput Linguist Hum Lang Technol NAACL 2024 . 2024 ;1: 4338 – 4359 . doi: 10.18653/v1/2024.naacl-long.244 OpenUrl CrossRef ↵ Holmes B , Raymer M , Banerjee T . Extraction of patients subpopulations with psychiatric symptoms using a transformer architecture . In: 2024 46th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC) .; 2024 : 1 – 4 . 
doi: 10.1109/EMBC53108.2024.10781648 OpenUrl CrossRef Roosan D , Wu Y , Chok J , et al. Artificial Intelligence-Powered Large Language Transformer Models for Opioid Abuse and Social Determinants of Health Detection for the Underserved Population . In: Proceedings of the 13th International Conference on Data Science, Technology and Applications, DATA 2024. SCITEPRESS - Science and Technology Publications ; 2024 : 15 – 26 . doi: 10.5220/0012717200003756 OpenUrl CrossRef Petit-Jean T , Gérardin C , Berthelot E , et al. Collaborative and privacy-enhancing workflows on a clinical data warehouse: an example developing natural language processing pipelines to detect medical conditions . J Am Med Informatics Assoc . 2024 ; 31 ( 6 ): 1280 – 1290 . doi: 10.1093/jamia/ocae069 OpenUrl CrossRef PubMed Gabriel RA , Litake O , Simpson S , Burton BN , Waterman RS , Macias AA . On the development and validation of large language model-based classifiers for identifying social determinants of health . Proc Natl Acad Sci U S A . 2024 ; 121 ( 39 ): e2320716121 . doi: 10.1073/pnas.2320716121 OpenUrl CrossRef PubMed Robitschek E , Sordean S , Horwath K , et al. The Liver Meeting: San Diego, California, Nov 15-19, 2024 . Hepatology . 2024 ; 80 (S1):S1-S2011. doi: 10.1097/HEP.0000000000001077 OpenUrl CrossRef Yao Z , Tsai J , Liu W , et al. Automated identification of eviction status from electronic health record notes . J Am Med Inform Assoc . 2023 ; 30 ( 8 ): 1429 – 1437 . doi: 10.1093/jamia/ocad081 OpenUrl CrossRef PubMed Ramachandran GK , Fu Y , Han B , et al. Prompt-based Extraction of Social Determinants of Health Using Few-shot Learning . In: Proceedings of the Annual Meeting of the Association for Computational Linguistics .; 2023 : 385 – 393 . doi: 10.18653/v1/2023.clinicalnlp-1.41 OpenUrl CrossRef Turchin A , Masharsky S , Zitnik M . Comparison of BERT implementations for natural language processing of narrative medical documents . Informatics Med Unlocked . 2023 ; 36 : 101139 . 
doi: 10.1016/j.imu.2022.101139 OpenUrl CrossRef Wang X , Gupta D , Killian M , He Z . Benchmarking Transformer-Based Models for Identifying Social Determinants of Health in Clinical Notes . In: 2023 IEEE 11th International Conference on Healthcare Informatics (ICHI). IEEE ; 2023 : 570 – 574 . doi: 10.1109/ICHI57859.2023.00102 OpenUrl CrossRef ↵ Richie R , Ruiz VM , Han S , Shi L , Tsui FR . Extracting social determinants of health events with transformer-based multitask, multilabel named entity recognition . J Am Med Inform Assoc . 2023 ; 30 ( 8 ): 1379 – 1388 . doi: 10.1093/jamia/ocad046 OpenUrl CrossRef ↵ Bhate NJ , Mittal A , He Z , Luo X . Zero-shot Learning with Minimum Instruction to Extract Social Determinants and Family History from Clinical Notes using GPT Model . In: Proceedings - 2023 IEEE International Conference on Big Data, BigData 2023. IEEE ; 2023 : 1476 – 1480 . doi: 10.1109/BigData59044.2023.10386811 OpenUrl CrossRef Kim HK , Park Y , Park Y , et al. Identifying Alcohol-Related Information From Unstructured Bilingual Clinical Notes With Multilingual Transformers . IEEE Access . 2023 ; 11 : 16066 – 16075 . doi: 10.1109/ACCESS.2023.3245523 OpenUrl CrossRef Gray GM , Zirikly A , Ahumada LM , et al. Application of natural language processing to identify social needs from patient medical notes: development and assessment of a scalable, performant, and rule-based model in an integrated healthcare delivery system . JAMIA Open . 2023 ; 6 ( 4 ): 85 . doi: 10.1093/jamiaopen/ooad085 OpenUrl CrossRef Sajdeya R , Mardini MT , Tighe PJ , et al. Developing and validating a natural language processing algorithm to extract preoperative cannabis use status documentation from unstructured narrative clinical notes . J Am Med Inform Assoc . 2023 ; 30 ( 8 ): 1418 – 1428 . doi: 10.1093/jamia/ocad080 OpenUrl CrossRef Lituiev D , Lacar B , Pak S , Abramowitsch PL , De Marchis E , Peterson T . 
Automatic Extraction of Social Determinants of Health from Medical Notes of Chronic Lower Back Pain Patients . medRxiv. Published online March 8 , 2022 . doi: 10.1101/2022.03.04.22271541 OpenUrl Abstract / FREE Full Text Kugic A , Pojian LM , Hammer LM , Schulz S , Kreuzthaler M . Alcohol Status Standardization from Clinical Real World Data with Transformer Architectures . In: Proceedings - 2022 IEEE 10th International Conference on Healthcare Informatics, ICHI 2022 . IEEE; 2022 : 233 – 238 . doi: 10.1109/ICHI54592.2022.00043 OpenUrl CrossRef Botelle R , Bhavsar V , Kadra-Scalzo G , et al. Can natural language processing models extract and classify instances of interpersonal violence in mental healthcare electronic records: an applied evaluative study . BMJ Open . 2022 ; 12 ( 2 ): e052911 . doi: 10.1136/bmjopen-2021-052911 OpenUrl Abstract / FREE Full Text Lybarger K , Dobbins NJ , Long R , et al. Leveraging natural language processing to augment structured social determinants of health data in the electronic health record . J Am Med Inform Assoc . 2023 ; 30 ( 8 ): 1389 – 1397 . doi: 10.1093/jamia/ocad073 OpenUrl CrossRef PubMed Gong L , Bresnick J , Zhang A , Wu C , Jha K . Boosting Social Determinants of Health Extraction with Semantic Knowledge Augmented Large Language Model . AMIA. Annu Symp proceedings AMIA Symp . 2024 ; 2024 : 453 – 462 . Accessed May 30, 2025. https://pmc.ncbi.nlm.nih.gov/articles/PMC12099417/ OpenUrl ↵ Goel A , Hari SN , Waltman B , Thomson M. Leveraging Open-Source Large Language Models for encoding Social Determinants of Health using an Intelligent Router . Published online May 30, 2024 . Accessed May 31, 2025. https://arxiv.org/pdf/2405.19631 ↵ Consoli B , Wu X , Wang S , et al. SDoH-GPT: Using Large Language Models to Extract Social Determinants of Health (SDoH) . Published online July 24, 2024 . Accessed June 6, 2025. https://arxiv.org/pdf/2407.17126 ↵ Torii M , Finn IM , Doan S , Wang P , Yang EW , Zisook DS. 
Task formulation for Extracting Social Determinants of Health from Clinical Narratives . Published online January 26, 2023 . Accessed June 6, 2025. https://arxiv.org/pdf/2301.11386 ↵ Seligman HK , Tschann J , Jacobs EA , Fernandez A , López A . Food insecurity and glycemic control among low-income patients with type 2 diabetes . Diabetes Care . 2012 ; 35 ( 2 ): 233 – 238 . doi: 10.2337/dc11-1627 OpenUrl Abstract / FREE Full Text Najibi N , Firoozi R , Shahrezaee S , Eshraghian M , Daneshi-Maskooni M , Dorosty-Motlagh A . Food insecurity is an important risk factor for type 2 diabetes: A case-control study of new referrals to the University clinics, Shiraz, Southern Iran . BMC Public Health . 2019 ; 19 ( 1 ): 1 – 8 . doi: 10.1186/s12889-019-7236-9 OpenUrl CrossRef PubMed ↵ Kushel MB , Gupta R , Gee L , Haas JS . Housing instability and food insecurity as barriers to health care among low-income Americans . J Gen Intern Med . 2006 ; 21 ( 1 ): 71 – 77 . doi: 10.1111/j.1525-1497.2005.00278.x OpenUrl CrossRef PubMed Web of Science ↵ Mehta S , Lyles CR , Rubinsky AD , et al. Social Determinants of Health Documentation in Structured and Unstructured Clinical Data of Patients With Diabetes: Comparative Analysis . JMIR Med Informatics . 2023 ; 11 : e46159 . doi: 10.2196/46159 OpenUrl CrossRef ↵ Patra BG , Sharma MM , Vekaria V , et al. Extracting social determinants of health from electronic health records using natural language processing: A systematic review . J Am Med Informatics Assoc . 2021 ; 28 ( 12 ): 2716 – 2727 . doi: 10.1093/jamia/ocab170 OpenUrl CrossRef PubMed ↵ Li C , Mowery DL , Ma X , et al. Realizing the potential of social determinants data in EHR systems: A scoping review of approaches for screening, linkage, extraction, analysis, and interventions . J Clin Transl Sci . 2024 ; 8 ( 1 ): e147 . doi: 10.1017/cts.2024.571 OpenUrl CrossRef PubMed ↵ McNeill E , Lindenfeld Z , Mostafa L , et al. 
Uses of Social Determinants of Health Data to Address Cardiovascular Disease and Health Equity: A Scoping Review . J Am Heart Assoc . 2023 ; 12 ( 21 ): 30571 . doi: 10.1161/JAHA.123.030571 OpenUrl CrossRef ↵ Abbott EE , Apakama D , Richardson LD , Chan L , Nadkarni GN . Leveraging Artificial Intelligence and Data Science for Integration of Social Determinants of Health in Emergency Medicine: Scoping Review . JMIR Med Informatics . 2024 ; 12 : e57124 . doi: 10.2196/57124 OpenUrl CrossRef ↵ Wu W , Holkeboer KJ , Kolawole TO , Carbone L , Mahmoudi E . Natural language processing to identify social determinants of health in Alzheimer’s disease and related dementia from electronic health records . Health Serv Res . 2023 ; 58 ( 6 ): 1292 – 1302 . doi: 10.1111/1475-6773.14210 OpenUrl CrossRef ↵ Bommasani R , Liang P , Lee T . Holistic Evaluation of Language Models . Ann N Y Acad Sci . 2023 ; 1525 ( 1 ): 140 – 146 . doi: 10.1111/nyas.15007 OpenUrl CrossRef Lin S , Hilton J , Evans O. TruthfulQA: Measuring How Models Mimic Human Falsehoods. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Vol 1. Association for Computational Linguistics ; 2022 :3214-3252. doi: 10.18653/v1/2022.acl-long.229 OpenUrl CrossRef ↵ Gehman S , Gururangan S , Sap M , Choi Y , Smith NA. RealToxicityPrompts: Evaluating Neural Toxic Degeneration in Language Models . In: Findings of the Association for Computational Linguistics: EMNLP 2020. Association for Computational Linguistics ; 2020 : 3356 – 3369 . doi: 10.18653/v1/2020.findings-emnlp.301 OpenUrl CrossRef ↵ DeepSeek-AI , Guo D , Yang D , et al. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning . Published online January 22, 2025 . Accessed May 13, 2025. https://arxiv.org/pdf/2501.12948 ↵ Touvron H , Lavril T , Izacard G , et al. LLaMA: Open and Efficient Foundation Language Models . Published online February 27, 2023 . 
Accessed May 13, 2025. https://arxiv.org/pdf/2302.13971 ↵ Lewis P , Perez E , Piktus A , et al. Retrieval-augmented generation for knowledge-intensive NLP tasks. In: Advances in Neural Information Processing Systems. Vol 2020-December. Neural information processing systems foundation ; 2020 . Accessed April 8, 2025. https://arxiv.org/abs/2005.11401v4 ↵ Hinton G , Vinyals O , Dean J. Distilling the Knowledge in a Neural Network . Published online March 9, 2015 . Accessed April 8, 2025. https://arxiv.org/abs/1503.02531v1 ↵ Powers DMW. Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation . Published online October 10, 2020 . Accessed May 27, 2025. https://arxiv.org/pdf/2010.16061 ↵ Wei J , Wang X , Schuurmans D , et al. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In: Advances in Neural Information Processing Systems. Vol 35. Neural information processing systems foundation ; 2022 . Accessed May 28, 2025. https://arxiv.org/pdf/2201.11903 ↵ Team G , Anil R , Borgeaud S , et al. Gemini: A Family of Highly Capable Multimodal Models . Published online December 19, 2023 . Accessed May 19, 2025. https://arxiv.org/pdf/2312.11805 ↵ Anthropic . The Claude 3 Model Family: Opus, Sonnet, Haiku Anthropic . 2024 ;(August 2023). https://docs.anthropic.com/ ↵ Johnson A , Pollard T , Mark R. MIMIC-III Clinical Database . 2023 ;(June). doi: 10.13026/C2XW26 OpenUrl CrossRef ↵ Ahsan H , Ohnuki E , Mitra A , Yu H. MIMIC-SBDH: A Dataset for Social and Behavioral Determinants of Health. In: Proceedings of Machine Learning Research. Vol 149. ML Research Press ; 2021 : 391 – 413 . Accessed May 21, 2025. https://pmc.ncbi.nlm.nih.gov/articles/PMC8734043/ ↵ Carrell DS , Schoen RE , Leffler DA , et al. Challenges in adapting existing clinical natural language processing systems to multiple, diverse health care settings . J Am Med Informatics Assoc . 2017 ; 24 ( 5 ): 986 . 
doi: 10.1093/jamia/ocx039 OpenUrl CrossRef PubMed ↵ Ahmed A , Abbasi A , Eickhoff C . Benchmarking Modern Named Entity Recognition Techniques for Free-text Health Record Deidentification . AMIA. Annu Symp proceedings AMIA Symp . 2021 ; 2021 : 102 – 111 . Accessed June 8, 2025. https://pmc.ncbi.nlm.nih.gov/articles/PMC8378656/ OpenUrl ↵ Jacobs ZG . Codifying Social Determinants of Health: a Gap in the ICD-10-CM . J Gen Intern Med . 2021 ; 36 ( 10 ): 3205 – 3207 . doi: 10.1007/s11606-021-06742-4 OpenUrl CrossRef PubMed ↵ Li C , Mowery DL , Ma X , et al. Realizing the potential of social determinants data in EHR systems: A scoping review of approaches for screening, linkage, extraction, analysis, and interventions . J Clin Transl Sci . 2024 ; 8 ( 1 ): e147 . doi: 10.1017/cts.2024.571 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted July 05, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Beyond Metrics to Methods: A Scoping Review of Large Language Models for Detection of Social Drivers of Health in Clinical Notes Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Beyond Metrics to Methods: A Scoping Review of Large Language Models for Detection of Social Drivers of Health in Clinical Notes Ahmed Farrag , Ahmed Soliman , Elham Hatef , Amie Goodin , Masoud Rouhizadeh medRxiv 2025.07.04.25330866; doi: https://doi.org/10.1101/2025.07.04.25330866 Share This Article: Copy Citation Tools Beyond Metrics to Methods: A Scoping Review of Large Language Models for Detection of Social Drivers of Health in Clinical Notes Ahmed Farrag , Ahmed Soliman , Elham Hatef , Amie Goodin , Masoud Rouhizadeh medRxiv 2025.07.04.25330866; doi: https://doi.org/10.1101/2025.07.04.25330866 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Policy Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9219) Radiology and Imaging (2195) 
Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffd9a86e89c52ad',t:'MTc3OTQ3MTYwMQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-23T02:00:01.238055+00:00
License: Public-Domain