Full text
58,099 characters
· extracted from
preprint-html
· click to expand
Predicting Hospital Admissions Using Pretrained EHR Embeddings: External Evaluation and Insights on Local Vocabulary Adaptation | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Predicting Hospital Admissions Using Pretrained EHR Embeddings: External Evaluation and Insights on Local Vocabulary Adaptation View ORCID Profile Bernardo Neves , Jorge Cerejo , Simão Gonçalves , Inês Mota , José M. Moreira , Nuno A. Silva , Francisca Leite , Michael Wornow , Mário J. Silva doi: https://doi.org/10.1101/2025.09.08.25335368 Bernardo Neves a Hospital da Luz Learning Health , Luz Sauúde, Portugal Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Bernardo Neves For correspondence: bernardoneves{at}tecnico.ulisboa.pt Jorge Cerejo a Hospital da Luz Learning Health , Luz Sauúde, Portugal Find this author on Google Scholar Find this author on PubMed Search for this author on this site Simão Gonçalves a Hospital da Luz Learning Health , Luz Sauúde, Portugal Find this author on Google Scholar Find this author on PubMed Search for this author on this site Inês Mota a Hospital da Luz Learning Health , Luz Sauúde, Portugal Find this author on Google Scholar Find this author on PubMed Search for this author on this site José M. Moreira a Hospital da Luz Learning Health , Luz Sauúde, Portugal Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nuno A. Silva a Hospital da Luz Learning Health , Luz Sauúde, Portugal Find this author on Google Scholar Find this author on PubMed Search for this author on this site Francisca Leite a Hospital da Luz Learning Health , Luz Sauúde, Portugal Find this author on Google Scholar Find this author on PubMed Search for this author on this site Michael Wornow b Stanford University , CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mário J. Silva c Instituto Superior Técnico, Universidade de Lisboa , Portugal Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Purpose Unplanned hospital admissions impose substantial strain on healthcare systems, yet predictive models for these events remain underexplored in practice. This study evaluates whether publicly available pretrained transformer-based embeddings, developed on an external health system, can improve prediction of hospital admissions—including unplanned cases—when applied to a different institution with sparser data and a distinct medical vocabulary. Methods We performed a retrospective cohort study using structured EHR data from 200,000 adult patients (2007–2023) at a Portuguese hospital, standardized to the OMOP Common Data Model. Four 30-day outcomes were predicted: emergency department visits, hospital admissions, unplanned admissions, and readmissions. Three modeling approaches were compared: (1) clinically curated handcrafted features, (2) frequency-based representations of all recorded OMOP concepts, and (3) pretrained CLMBR-T embeddings generated from longitudinal OMOP data of 2.57 million patients in a U.S. hospital system. Performance was assessed on held-out patients using AUROC, AUPRC, and calibration metrics, with additional analysis of the impact of vocabulary overlap between pretraining and local datasets. Results Pretrained embeddings achieved the highest discrimination for all outcomes, particularly for unplanned admissions (AUROC 0.877 vs. 0.770 for counts). Gains were greatest for rarer outcomes and patients with richer clinical histories. Despite only 58% overlap with local vocabulary and substantially fewer events per patient than in pretraining, embeddings transferred effectively, indicating generalizable temporal patterns. Calibration was poorer than simpler models, necessitating post-hoc recalibration before deployment. Conclusion Pretrained OMOP-based EHR embeddings can substantially improve prediction of hospital and unplanned admissions in data- and resource-limited settings, even with partial vocabulary overlap. These findings support their use for rapid, cost-effective deployment of clinically meaningful predictive models, provided local recalibration and workflow integration are addressed. Highlights Superior cross-institution performance – Pretrained EHR embeddings from Stanford Medicine achieved AUROC 0.877 for unplanned admissions, 0.814 for hospital admissions, 0.782 for ED visits, and 0.923 for readmissions in a Portuguese hospital, outperforming count-based (0.770, 0.767, 0.744, 0.922) and handcrafted feature models across all tasks. Largest gains for rare, unpredictable events – For unplanned admissions (0.4% prevalence), embeddings nearly tripled AUPRC compared to counts (0.037 vs. 0.011) and improved AUROC by 0.107, with performance continuing to scale with more training data, unlike baselines. Effective under substantial domain shift – Strong transferability observed despite only 58% vocabulary overlap and markedly different patient populations, coding distributions, and event density (707 vs. 71 events per patient). Benefit increases with richer patient histories – Performance advantage of embeddings widened in patients with higher code volumes; AUROC for hospital admissions rose from 0.694 in the lowest quartile (Q1) to 0.870 in the highest (Q4), outperforming counts by up to 0.105 in Q4. Actionable guidance for adoption – Hospitals with sparse data can achieve rapid, cost-effective deployment of predictive models using external embeddings, especially for rare outcomes, if paired with local fine-tuning and post-hoc recalibration to ensure accurate risk estimation before clinical use. 1. Introduction Unplanned hospital visits, including emergency department presentations and non-elective hospitalizations, impose substantial burdens on healthcare systems worldwide ( Buja et al., 2020 ). Despite routine clinical risk assessment, healthcare providers typically rely on implicit judgment rather than validated, systematic tools for identifying high-risk patients during clinical encounters ( Klunder et al., 2024 ). Traditional risk stratification instruments demonstrate limited predictive performance, with discriminative ability ranging from 0.61 to 0.78 AUC ( Klunder et al., 2022 ), highlighting the need for more effective approaches. Electronic Health Records (EHRs) contain rich longitudinal patient data with significant potential for dynamic risk prediction ( Knevel and Liao, 2023 ). Recent advances in deep learning have demonstrated superior performance compared to traditional approaches, with transformer architectures emerging as the state-of-the-art for patient representation due to their ability to capture long-range dependencies in patient timelines ( Si et al., 2021 ). Self-supervised learning has proven particularly valuable given the scarcity of labeled clinical data ( Krishnan et al., 2022 ). Foundational models—large-scale architectures trained on vast datasets through self-supervision—have emerged as powerful tools for EHR analysis that can adapt to multiple downstream tasks with minimal fine-tuning ( Wornow et al., 2023b ). These models offer improved performance, enhanced sample efficiency, and the ability to transfer learned representations across diverse healthcare settings, enabling external validation and improving prediction in data-constrained environments ( Rasmy et al., 2021 ; Renc et al., 2024 ). Transfer learning from foundational models addresses the significant costs and privacy barriers associated with developing models from scratch at each institution, with studies demonstrating that pretrained models can maintain 90% of optimal performance using only 30% of site-specific training data ( Guo et al., 2024 ). While the benefits of foundational models for clinical prediction have been established, their effectiveness for predicting unplanned visits has yet to be systematically evaluated, particularly regarding their ability to transfer knowledge from large datasets rich in vocabulary to smaller institutional settings. To address this gap, we investigate CLMBR-T-base, an autoregressive foundational model containing 141 million parameters pretrained on 2.57 million standardized patient records from the Observational Medical Outcomes Partnership (OMOP) Common Data Model ( Wornow et al., 2023a ). CLMBR-T-base was selected because it is currently the only publicly available foundational model with open weights specifically pretrained on longitudinal EHR data, making it particularly suitable for systematic evaluation across diverse healthcare institutions. Our study aims to determine whether the learned representations from this model can successfully adapt to the distinct challenge of predicting unplanned visits in healthcare environments with significantly different data volumes and vocabulary coverage. This scenario reflects real-world constraints faced by many healthcare institutions, where foundational models hold substantial promise for enhancing predictive capabilities. 2. Methods This retrospective cohort study aimed to develop and compare prediction models for different types of hospital visits using historic EHR data from a single hospital. We compared three distinct visit-level feature strategies: 1) few simple handcrafted engineered features; 2) counts of all documented codes and 3) fixed vector representation derived from a pre-trained transformer model, (CLMBR-T-base) ( Wornow et al., 2023a ). All these approaches use all existing data about the patient up to the prediction time, only differing in the number of features and complexity of feature representation. The study diagram is depicted in Figure 1 . Download figure Open in new tab Figure 1: Graphical abstract of the study. Transfer learning workflow from large-scale pretrained CLMBR-T model (2.57M patients) to an external institutional dataset (200K patients) with 58% vocabulary overlap, led to superior performance across hospital visit prediction tasks compared to traditional feature-based approaches. 2.1. Data Source and Preprocessing The data source for this study is a single hospital EHR database. Data were initially extracted from various clinical systems and transformed into the OMOP CDM version 5.4 ( Reich et al., 2024 ). Specifically, this study utilized structured data from the person, visit occurrence, drug exposure, measurement, procedure occurrence, and condition occurrence tables. The dataset encompasses all hospital data from 2007 to 2023. A preliminary data quality assessment was conducted, followed by a data cleaning step where patients with missing identifiers, implausible demographics or temporal inconsistencies in their records were excluded. 2.2. Cohort Selection Inclusion criteria required patients to be aged 18 years or older at their first recorded event and to have at least two documented hospital episodes of any type (outpatient, inpatient, or emergency department visits) within the study period. Each patient visit represented a potential prediction opportunity for forecasting future visits, with the exception of each patient’s final visit due to censoring. We found 650,586 adult patients with at least two visits, and from this population we randomly sampled 200,000 patients for this study. This approach generated 2,714,826 prediction points across the entire dataset. We performed patient-level data splitting to ensure no patient appeared in multiple partitions, allocating 100,000 patients for model training, 50,000 for validation, and 50,000 for testing. Patient characteristics and split distributions summarized in Tables 1 and B.6 , respectively. 2.3. Outcomes At each prediction point, we predicted the occurrence of the following outcomes in the subsequent 30 days: 1) Emergency Department (ED) visit; 2) Any hospital admission; 3) Unplanned hospital admission: defined as an hospital admission that was preceded by an ED visit in the previous 48 hours; 4) Hospital readmission: defined as a second hospital admission after a first index admission. 2.4. Patient representations At each prediction point, three distinct representations of patients patients were created and compared. Handcrafted features : Based on prior literature ( Askar et al., 2024 ), we developed handcrafted features relevant for predicting unplanned hospital admissions. We transformed OMOP CDM data into a visit-level matrix of dimensions n ′ × d , where n ′ represents all patient visits excluding each patient’s final visit, and d is the number of features. Features were engineered using information available up to each visit and included: 1) patient demographics (gender and age at visit); 2) healthcare utilization metrics (cumulative counts of outpatient visits, ED visits, total and distinct prescribed drugs); and 3) multimorbidity encoded as the Charlson Comorbidity Index ( Lee et al., 2021 ), calculated using locally developed phenotyping rules that mapped clinical concepts to diseases ( Neves et al., 2025 ). Counts of previous concepts : We implemented count-based featurization as a competitive baseline following established best practices ( Rajkomar et al., 2018 ). Our OMOP CDM data was first converted to the Medical Event Data Standard (MEDS) format to enable standardized processing. We then used the FEMR (Framework for Electronic Medical Records) 1 package to generate count-based features, which transforms each patient into a feature vector where each element represents the frequency of a specific clinical concept occurring in the patient’s timeline prior to the prediction time point. The FEMR CountFeaturizer was configured to include string value combinations (incorporating laboratory results and other coded values) and age normalization features, creating a comprehensive representation of each patient’s historical clinical encounters. CLMBR-T representations : We utilized CLMBR-T-base as our foundational model to generate patient representations from our data. This publicly available 141-million-parameter autoregressive Transformer model was pre-trained on sequences of clinical codes (e.g., diagnosis, procedure, medication, lab result code) to predict the next medical code, enabling it to learn rich representations of patient trajectories. The model was originally pre-trained on the large-scale, de-identified Stanford Medicine Research Data Repository (STARR) dataset ( Datta et al., 2020 ), which is also in the OMOP CDM format and contains structured EHR data. Detailed model training specifications can be found in the original model paper ( Wornow et al., 2023a ). To create these representations we first converted data in all OMOP tables into the Medical Event Data Standard (MEDS) format, a data schema for storing streams of medical events that is increasingly being employed for EHR predictive modeling ( Arnrich et al., 2024 ). Given that converting the whole dataset would require expensive computational and storage costs we sampled 100,000 patients from each of the train and test splits using stratified sampling to maintain the original outcome distribution. We used the FEMR tokenizer and FEMR labeler from FEMR package to convert patient data into token sequences and to create the desired labels for each prediction. These were given as input to the CLMBR-T-base model that generated fixed 768-dimensional vector representations (embeddings) for each patient at the specified prediction timepoints. These embeddings encapsulate the patient’s clinical history as learned by the transformer through its pre-training on longitudinal medical data, incorporating both the temporal sequence of medical events and continuous values such as laboratory results Wornow et al. (2023a) . A comparison of the codes used for creating representations in pretrained model and in our dataset are provided in Tables 5 and B.10 . 2.5. Modeling For the handcrafted feature and count-based representations, we implemented a machine learning pipeline using LightGBM, a gradient boosting framework selected for its demonstrated effectiveness on tabular data, built-in class imbalance handling capabilities, and use as the primary baseline in the original CLMBR-T paper, ensuring direct methodological comparability ( Wornow et al., 2023a ). Our dataset exhibited considerable class imbalance across outcomes at 30 days (ED visits: 5.87%, hospital admissions: 8.32%, unplanned admissions: 0.47%, readmissions: 24.18%). Therefore we employed LightGBM with automatic class imbalance handling and hyper-parameters optimized through grid search over learning rates of [0.02, 0.1, 0.2-0.5], maximum depths of [3, 5-6, 7], and number of leaves [5-10, 15-25, 31-100], with early stopping (50 rounds) and other parameters including feature fraction (0.8), bagging fraction (0.8), and bagging frequency (5) to address the severe class imbalance typical in healthcare prediction tasks. For count-based representations, we conducted experiments across multiple training sample sizes (5k, 10k, 25k, 50k, and all 100k available patients) using a patient-centric sampling approach where all visits for each selected patient were included, maintaining longitudinal trajectory information while preserving original outcome prevalence through stratified sampling. To mitigate the influence of high-utilization patients, we implemented a 32-visit cap per patient, randomly sampling visits when this threshold was exceeded to preserve average patient information. For handcrafted feature experiments, we utilized all available pre-assigned training patients without sub-sampling to provide the strongest possible baseline for representation comparison. Data preprocessing included LightGBM’s native missing value handling supplemented by domain-informed imputation strategies, with consistent application of the visit cap across both representation types to ensure fair comparison. For the pretrained feature representation strategy, we followed the original CLMBR-T methodology by fine-tuning CLMBR-T-base models using few-shot learning for each prediction task Wornow et al. (2023a) . The finetuning process employed outcome-stratified patient sampling with varying training sizes (5k, 10k, 25k, 50k, and all available training patients), mirroring the count-based approach to enable direct comparison. Patient-centric sampling was implemented to preserve longitudinal trajectory information, where all visits for each selected patient were included up to the 32-visit cap, maintaining the temporal sequence crucial for clinical prediction. After fine-tuning, the resulting CLMBR-T representations (768-dimensional vectors) were frozen and used as fixed feature inputs to train downstream logistic regression classifiers with L2 regularization (C=1.0) and balanced class weights to address the natural class imbalance in healthcare prediction tasks. The same data split was employed, with final evaluation on 50k held-out test patients, ensuring direct comparability with handcrafted and count-based results ( Figure 1 ). 2.6. Evaluation Framework All models were evaluated using a comprehensive set of metrics appropriate for imbalanced healthcare prediction tasks. Discrimination performance was assessed via AUROC (Area Under Receiver Operating Characteristic curve) and AUPRC (Area Under Precision-Recall Curve). Classification performance was evaluated through F1 score, precision, recall, and specificity at an optimal threshold determined by maximizing F1 score on the validation set. The optimal threshold was identified by calculating F1 scores across all possible thresholds derived from the precision-recall curve and selecting the threshold that achieved the highest F1 score, ensuring balanced performance across both positive and negative classes—a critical consideration given the severe class imbalance typical in clinical prediction scenarios ( Hicks et al., 2022 ). Model calibration was quantified using Expected Calibration Error (ECE, computed as the average absolute difference between predicted probabilities and actual frequencies across 10 probability bins), Maximum Calibration Error (MCE, the maximum absolute difference across all bins), and Brier score (mean squared difference between predicted probabilities and actual binary outcomes). All threshold optimization was performed exclusively on the validation set (50k patients), with final performance assessment conducted on the completely held-out test set (50k patients) to prevent overfitting and ensure unbiased evaluation. To quantify uncertainty while respecting the study’s hierarchical design, we computed 95% confidence intervals for AUROC and AUPRC using a patient-level cluster bootstrap with 1,000 replicates. In each iteration we resampled whole patients with replacement, retained all of their encounters, recalculated the metric, and formed the interval from the 2.5th and 97.5th percentiles of the empirical distribution. By resampling patients rather than visits, this cluster bootstrap approach preserves within-patient correlation, avoids anticonservative bias, remains assumption-lean, and achieves near-nominal coverage across diverse hierarchical data settings, making it well suited for our multi-visit clinical dataset ( Ren et al., 2010 ). 2.7. Software libraries All data preprocessing, feature engineering, modeling, and evaluation tasks were conducted using python , leveraging specialized packages: pandas, polars, and scikit-learn. For using CLMBR-T-base we converted OMOP data to MEDS format using meds-etl package 2 . The FEMR package was used to handle data, more specifically create labels and tokenize, before data was processed by the model. Model weights were downloaded from the Huggingface repository 3 . 3. Results 3.1. Dataset Description and Outcome Prevalence The study cohort comprised 200,000 adult patients with a mean age of 48.5 years (SD 17.7), of whom 57.8% were female. The population demonstrated moderate healthcare utilization patterns, with nearly half (45.8%) having at least one emergency department visit and approximately one-quarter (23.8%) experiencing hospital admissions during the study period. Outpatient care was nearly universal, with 95.8% of patients having at least one outpatient visit (median 5 visits per patient). Acute care events requiring readmission or unplanned admission were relatively uncommon, affecting only 2.1% and 2.8% of patients respectively. The burden of chronic conditions was moderate, with 39.3 of patients having at least one documented chronic condition and 17.5% exhibiting multimorbidity (two or more conditions). Four distinct 30-day outcome prediction tasks were evaluated: ED visits, hospital admissions, readmissions at 30 days, and unplanned admissions. Each patient generated multiple prediction points at the end of each clinical encounter, except for the last observable one. For readmissions, predictions were made only at hospital discharge events. This approach yielded 8,361,646 total prediction opportunities across all prediction tasks. The prevalence of positive outcomes varied substantially across tasks, ranging from 0.4% for unplanned admissions to 24.8% for readmissions at 30 days, reflecting the different risk profiles and clinical contexts of each prediction scenario ( Table 1 ). View this table: View inline View popup Download powerpoint Table 1: Population Characteristics and Prediction Task Overview. 3.2. Performance Comparison Between Model Approaches Performance varied substantially across the three modeling approaches, with CLMBR-T consistently outperforming count-based features, which in turn exceeded handcrafted features across all prediction tasks ( Table 2 and Figure 2 ). Download figure Open in new tab Figure 2: Performance comparison of three modeling approaches across four clinical prediction tasks. The left panel shows Area Under the Receiver Operating Characteristic Curve (AUROC) values, while the right panel displays Area Under the Precision-Recall Curve (AUPRC) values for each outcome. Three modeling strategies are compared: hand-crafted features (light blue), counts-based features (orange), and CLMBR-T deep learning embeddings (green). Red dashed lines in the AUPRC panel indicate outcome prevalence baselines. View this table: View inline View popup Download powerpoint Table 2: Performance of Handcrafted, Counts, and CLMBR-T Models on Four Outcomes (Test Set). CLMBR-T demonstrated superior discriminative performance, achieving the highest AUROC values for all outcomes: 0.782 for ED visits, 0.814 for hospital admissions, 0.923 for readmissions, and 0.877 for unplanned admissions. Count-based features showed intermediate performance with AU-ROCs of 0.744, 0.767, 0.922, and 0.770, respectively, while handcrafted features achieved the lowest discrimination with AUROCs ranging from 0.586 to 0.788. The performance advantages of CLMBR-T were particularly pronounced for precision-recall metrics and rare events. For unplanned admissions (0.4% prevalence), CLMBR-T achieved an AUPRC of 0.037 compared to 0.011 for counts and 0.007 for handcrafted features. Readmissions showed the smallest performance gaps, with count-based and CLMBR-T models achieving nearly identical AUROC values (0.922 vs 0.923). Model calibration revealed a trade-off between discriminative performance and probability calibration. Handcrafted features demonstrated excellent calibration (ECE geq 0.075), while CLMBR-T models showed substantial miscalibration (ECE: 0.193-0.486), and count-based models exhibited intermediate calibration performance (ECE: 0.002-0.356). Detailed metrics in Table B.7 . 3.3. Train Sample Size Efficiency Analysis We systematically evaluated the effect of training sample size on model performance by training both count-based and CLMBR-T models with varying numbers of patients (5k, 10k, 25k, 50k, and 100k). The results demonstrate distinct scaling patterns between the two approaches across all prediction tasks ( Table 3 and Figure 3 ). Download figure Open in new tab Figure 3: Performance comparison between CLMBR-T and counts-based models across different patient sample sizes. Top row shows ROC AUC performance, bottom row shows AUPRC performance. CLMBR-T consistently outperforms counts-based models across most tasks, with the largest improvements observed for unplanned admissions. View this table: View inline View popup Download powerpoint Table 3: Impact of Training Sample Size on Model Performance Across Prediction Tasks. Count-based features showed consistent but modest improvements with increased training data across most outcomes. For ED visits, AUROC improved from 0.713 (5k patients) to 0.744 (100k patients), representing a 4.3% relative improvement. Hospital admissions demonstrated the largest gains, with AUROC increasing from 0.699 to 0.767 (9.7% improvement). Readmissions showed steady but smaller improvements (0.900 to 0.922, 2.4% gain), while unplanned admissions reached a performance plateau at 25k patients, with minimal improvement beyond this point (0.767 to 0.770). The transformer-based approach exhibited more pronounced scaling benefits, particularly for precision-recall performance. ED visits showed substantial improvement from 0.731 to 0.782 AUROC (7.0% gain), with AUPRC increasing from 0.154 to 0.222 (44% relative improvement). Hospital admissions demonstrated strong scaling from 0.774 to 0.814 AUROC, while AUPRC improved from 0.283 to 0.356 (26% gain). Notably, CLMBR-T showed the most dramatic scaling for unplanned admissions, with AUROC improving from 0.751 to 0.877 (16.8% gain) and AUPRC nearly tripling from 0.014 to 0.037. Different prediction tasks exhibited varying sensitivity to training sample size. Readmissions, the most balanced outcome (22% prevalence), showed modest improvements for both approaches, with diminishing returns beyond 25k patients. In contrast, unplanned admissions (0.4% prevalence) demonstrated continued improvement with larger training sets, particularly for CLMBR-T models. This suggests that rare events benefit more substantially from increased training data, especially when using representation learning approaches. Count-based models generally reached performance plateaus earlier, with most improvements achieved by 25-50k patients. CLMBR-T models showed more linear scaling patterns, suggesting potential for further improvement with additional training data. The gap between approaches widened with larger training sets, indicating that transformer-based models can better exploit increased data availability. 3.4. Performance by Code Volume and Vocabulary Transfer Analysis To evaluate the relationship between available historical information and model performance, we stratified test patients into quartiles based on the total number of clinical codes (including diagnoses, procedures, medications, and laboratory measurements) accumulated in their EHR up to each prediction timepoint (Q1: 2-5 codes, Q2: 6-27 codes, Q3: 28-79 codes, Q4: 80+ codes). The number of available codes varied dynamically across prediction instances for the same patient, reflecting the cumulative nature of clinical information available at different timepoints throughout their care trajectory ( Table 4 and Figure 4 ). Download figure Open in new tab Figure 4: CLMBR-T and Count-based Models Performance Across Code Volume Quartiles in Test Set. Q1: 2-5, Q2: 6-27, Q3: 28-79, Q4: 80+ codes. Both approaches improve with higher code volumes, but CLMBR-T’s advantage widens substantially in higher quartiles. Performance gains are most pronounced for hospital admissions and readmissions, while ED visits and unplanned admissions show modest improvements. The expanding gap demonstrates CLMBR-T’s superior ability to leverage rich clinical histories. View this table: View inline View popup Download powerpoint Table 4: Model Performance Across Code Volume. Outcome prevalence increased substantially with higher historical code volume for most prediction tasks. Readmissions demonstrated the strongest gradient, with prevalence increasing from 8.12% (Q1) to 47.68% (Q4). Hospital admissions showed a similar pattern, rising from 5.36% to 9.35% across quartiles. ED visits increased modestly from 4.45% to 7.35%, while unplanned admissions remained relatively stable (0.28-0.54%) across code volume levels. CLMBR-T performance improved substantially with increasing historical code volume. For hospital admissions, AUROC increased from 0.694 (Q1) to 0.870 (Q4), while AUPRC improved from 0.131 to 0.533. Readmissions showed strong performance across all quartiles, with AUROC ranging from 0.834 to 0.932 and AUPRC from 0.396 to 0.893. ED visits maintained consistent discrimination (AUROC: 0.773-0.801) across code volume levels. Unplanned admissions achieved excellent discrimination across all quartiles (AUROC: 0.857-0.882). The performance advantage of CLMBR-T over count-based features increased with higher code volume. For hospital admissions, the AUROC difference widened from 0.049 (Q1) to 0.105 (Q4). Count-based features showed superior performance for readmissions in lower volume quartiles (Q1-Q2), with CLMBR-T gaining advantage only in higher volume patients (Q3-Q4). Vocabulary analysis revealed substantial differences between the CLMBR-T pretraining dataset and our institutional data ( Table 5 ). The pretrained model utilized 26,145 unique codes compared to 7,712 in our dataset, with 4,489 codes shared between datasets. This overlap represented 58.2% of our vocabulary but only 17.2% of the pretraining vocabulary. Shared codes consisted primarily of SNOMED concepts (90.5%), LOINC laboratory codes (6.9%), and RxNorm medication codes (2.5%). Despite this limited vocabulary overlap and substantially lower event density (71 vs 707 events per patient), CLMBR-T maintained strong transfer learning performance across all prediction tasks. View this table: View inline View popup Download powerpoint Table 5: Dataset Characteristics and Vocabulary Overview. 4. Discussion In this study, we evaluated the effectiveness of transfer learning using a publicly available foundational model (CLMBR-T-base) for predicting hospital visits in a new institutional setting characterized by significantly less comprehensive data compared to the model’s original training environment. Our analysis showed that despite notable differences in data density and vocabulary coverage, CLMBR-T-base with a logistic regression classifier consistently outperformed LightGBM models trained on either handcrafted features or counts of the entire data vocabulary across all evaluated outcomes. Our primary finding is that pretrained transformer models can generalize effectively across healthcare institutions, even when significant disparities exist in vocabulary size, overlap, and patient event density. Specifically, despite our institutional dataset containing approximately one-tenth of the event density per patient compared to the CLMBR-T-base training dataset and sharing only around 58% of medical codes, the pretrained model achieved superior discriminative performance (AUROC ranging from 0.782 for ED visits to 0.923 for readmissions). Notably, for rare events such as unplanned admissions (0.4% prevalence), CLMBR-T-base substantially improved prediction accuracy, achieving nearly triple the AUPRC compared to the count-based feature strategy (0.037 vs. 0.011). The robustness of CLMBR-T-base in our study suggests that pretrained transformer models capture generalizable longitudinal clinical patterns beyond institution-specific coding idiosyncrasies. Our findings illustrate clear scalability benefits, as shown by greater improvements in predictive performance with increased training sample size compared to count-based models, especially for infrequent outcomes. For example, predicting unplanned admissions showed a 16.8 percentage point improvement in AUROC from 5,000 to 100,000 patients using CLMBR-T-base, whereas the improvement plateaued early for count-based features. This result aligns with previous findings that showed the superiority of pretrained models over count-based methods in few-shot learning scenarios, albeit using smaller training data and different outcomes within the same clinical setting( Wornow et al., 2023a ). We compared CLMBR-T-base against two established baseline methods, each chosen for distinct methodological reasons. First, we evaluated against handcrafted features including comorbidity, prior drug usage, and surrogate metrics of healthcare utilization. These features represent clinically interpretable variables with established relevance to the prediction task, allowing us to assess whether our embeddings capture meaningful clinical signals beyond what domain experts would naturally select ( Askar et al., 2024 ). Second, we compared against cumulative count features that aggregate all codes in patient histories, following not only the original CLMBR-T paper methodology, but also other works that have established this as a standard benchmark for evaluating medical embeddings ( Wornow et al., 2023a ; Rajkomar et al., 2018 ). Our results show that richer embedding representations, even when trained in a different healthcare setting with distinct care pathways and coding practices, outperform both comparison methods. The hand-crafted clinically interpretable features showed the lowest performance, while the cumulative count features achieved intermediate results, with pretrained embeddings proving most effective for predicting hospital visits. This performance hierarchy aligns with established literature showing that machine learning models using larger, less interpretable feature sets typically outperform those relying on smaller sets of interpretable features, while extending this finding to cross-institutional transfer scenarios ( Rajkomar et al., 2018 ). Although CLMBR-T-base showed superior discriminative performance, it did not provide reliable calibration directly from initial implementation. The model consistently showed higher expected calibration errors than comparison approaches, indicating that post-hoc recalibration methods, such as temperature scaling or isotonic regression, should be applied before clinical deployment. Accurate calibration is critical, as it ensures that predicted risks closely align with observed outcomes, directly impacting clinical decision-making and resource allocation Hicks et al. (2022) . Our findings offer practical guidance for healthcare institutions considering pretrained model adoption. CLMBR-T-base’s predictive advantage increased with patients’ longitudinal record depth, as the transformer’s superiority over simpler baselines expanded substantially with growing historical code counts ( Figure 4 ). This advantage was most pronounced for ED visits and unplanned hospitalizations, which occurred less frequently and are less clinically predictable than hospital admissions and readmissions in our dataset. For patients with sparse histories (fewer than 30 codes), interpretable tree-based models maintained competitive accuracy and may be preferable in resource-constrained environments. These results indicate that institutions seeking to predict rare, costly events should fine-tune pretrained transformers, even with minimal local data, while simpler approaches suffice for common outcomes. Furthermore, strong performance was achieved with partial rather than complete alignment between local coding systems and the model’s vocabulary, supporting recent evidence that CLMBR-T models transfer effectively across institutions using less than 1% local training data and incomplete code mapping ( Guo et al., 2024 ). While the strengths of this study include a rigorous comparative design using a standardized data model (OMOP) and direct evaluation of transfer learning in a very different clinical context, several limitations must be considered. First, our study utilized data from a single institution, potentially limiting generalizability. Replication across diverse settings is essential to validate our findings broadly. Second, our operational definitions of hospital visits are subject to bias. Unplanned admissions relied on antecedent ED visits as a proxy, potentially underestimating directly admitted unplanned hospitalizations. Meanwhile, many hospital admissions and readmissions could have been entirely elective and therefore too easy to predict using available data, artificially inflating predictive performance. Future work should address these definitional ambiguities through better integration of clinical workflows and admission documentation. Third, we only analyzed structured EHR data; integrating other data modalities such as clinical notes and imaging data would certainly enrich patient information and could interfere with reported predictive accuracy. Fourth, the visit-level prediction design allows patients to contribute a different number of predictions during model training, which can introduce bias toward high users. While we acknowledge this issue, we believe the approach is appropriate given the intended use of the models. These models are meant to dynamically update risk predictions at each visit in order to provide actionable clinical insights. To minimize this bias, we split the data by patient, limited the number of visits each patient could contribute to model training, and reported cluster bootstrap confidence intervals, which are known to produce less optimistic estimates. Finally, the inherent “black-box” nature of transformer-based models poses interpretability challenges critical for clinical adoption. In this study the tension between interpretability and performance was clear by our comparison of few hand-crafted features to transformer vector representations ( Bouwmeester et al., 2013 ). Future research should focus into explainability methods of such representations, such as sparse attention mechanisms or concept-level feature attribution. 5. Conclusion We show that foundational models pretrained on extensive EHR data can effectively enable transfer knowledge to institutions with smaller and sparser datasets, delivering superior predictive performance for clinically important outcomes such as ED visits and hospital admissions. However, the performance and practical utility of pretrained models vary significantly depending on outcome prevalence, patient record richness, and calibration needs. Institutions aiming to predict ED visits and unplanned visits will benefit from adopting pretrained transformer models and investing in local fine-tuning and recalibration. Future research should address interpretability and clinical integration challenges, incorporate additional data modalities, and validate these findings across diverse healthcare environments to maximize the clinical impact of advanced predictive modeling. Appendix A. Supplementary Figures Download figure Open in new tab Figure A.5: AUROC and AUPRC across CLMBR-T and Counts-based models on the four evaluated outcomes, over different sizes of traning data. Download figure Open in new tab Figure A.6: AUROC and AUPRC across CLMBR-T and Counts-based models on the four evaluated outcomes, over different quartiles of number of codes in test data. Download figure Open in new tab Figure A.7: F1-score on each outcome best threshold and Estimated Calibration error (ECE) across CLMBR-T and Counts-based models on the four evaluated outcomes, over different quartiles of number of codes in test data. Download figure Open in new tab Figure A.8: Outcome prevalence over different quartiles of number of codes in test data. Download figure Open in new tab Figure A.9: Calibration plot of ED visit prediction of Counts-based LightGBM and CLMBR-T. Download figure Open in new tab Figure A.10: Calibration plot of Hospital Admission prediction of Counts-based LightGBM and CLMBR-T. Download figure Open in new tab Figure A.11: Calibration plot of Hospital Readmission prediction of Counts-based Light-GBM and CLMBR-T. Download figure Open in new tab Figure A.12: Calibration plot of Unplanned Admission prediction of Counts-based Light-GBM and CLMBR-T Appendix B. Supplementary Tables View this table: View inline View popup Download powerpoint Table B.6: Study Population and Outcome Prevalence by Prediction Task Across Train/-Validation/Test Splits. View this table: View inline View popup Download powerpoint Table B.7: Extended Performance Metrics and Model Characteristics (Test Set). View this table: View inline View popup Download powerpoint Table B.8: Comprehensive Performance Metrics Across Training Sample Sizes for All Prediction Tasks. View this table: View inline View popup Download powerpoint Table B.9: Model Calibration and Additional Performance Metrics Across Training Sample Sizes. View this table: View inline View popup Download powerpoint Table B.10: Detailed Vocabulary Distribution Analysis. Data Availability The data used in this study, while obtained from anonymized Electronic Health Records, are subject to strict confidentiality and privacy regulations. Data access requests may be considered on a case-by-case basis and will require approval from the relevant institutional review boards and data custodians. Researchers interested in obtaining access to the data for the purpose of validating or extending the findings presented in this paper should contact the corresponding author for further information on the data access process and the necessary legal and ethical requirements. All code developed for this study will be shared publicly. Footnotes ↵ 1 https://github.com/som-shahlab/femr ↵ 2 https://github.com/Medical-Event-Data-Standard/medsetl ↵ 3 https://huggingface.co/StanfordShahLab/clmbr-t-base References ↵ Arnrich , B. , Choi , E. , Fries , J.A. , McDermott , M.B. , Oh , J. , Pollard , T. , Shah , N. , Steinberg , E. , Wornow , M. , van de Water , R. , 2024 . Medical event data standard (MEDS): Facilitating machine learning for health , 03 – 08 . ↵ Askar , M. , Tafavvoghi , M. , Småbrekke , L. , Bongo , L.A. , Svendsen , K. , 2024 . Using machine learning methods to predict all-cause somatic hospitalizations in adults: A systematic review . PLOS ONE 19 , e0309175 . doi: 10.1371/journal.pone.0309175 . OpenUrl CrossRef PubMed ↵ Bouwmeester , W. , Moons , K.G.M. , Kappen , T.H. , van Klei , W.A. , Twisk , J.W.R. , Eijkemans , M.J.C. , Vergouwe , Y. , 2013 . Internal Validation of Risk Models in Clustered Data: A Comparison of Bootstrap Schemes . American Journal of Epidemiology 177 , 1209 – 1217 . URL: 10.1093/aje/kws396 , doi: 10.1093/aje/kws396 . OpenUrl CrossRef PubMed ↵ Buja , A. , Rivera , M. , De Battisti , E. , Corti , M.C. , Avossa , F. , Schievano , E. , Rigon , S. , Baldo , V. , Boccuzzo , G. , Ebell , M.H. , 2020 . Multimorbidity and Hospital Admissions in High-Need, High-Cost Elderly Patients . Journal of Aging and Health 32 , 259 – 268 . doi: 10.1177/0898264318817091 . OpenUrl CrossRef PubMed ↵ Datta , S. , Posada , J. , Olson , G. , Li , W. , O’Reilly , C. , Balraj , D. , Mesterhazy , J. , Pallas , J. , Desai , P. , Shah , N. , 2020 . A new paradigm for accelerating clinical data science at Stanford Medicine . doi: 10.48550/arXiv.2003.10534 , arXiv:2003.10534 . OpenUrl CrossRef ↵ Guo , L.L. , Fries , J. , Steinberg , E. , Fleming , S.L. , Morse , K. , Aftandilian , C. , Posada , J. , Shah , N. , Sung , L ., 2024 . A multi-center study on the adaptability of a shared foundation model for electronic health records . npj Digital Medicine 7 , 1 – 9 . doi: 10.1038/s41746-024-01166-w . OpenUrl CrossRef PubMed ↵ Hicks , S.A. , Strümke , I. , Thambawita , V. , Hammou , M. , Riegler , M.A. , Halvorsen , P. , Parasa , S. , 2022 . On evaluation metrics for medical applications of artificial intelligence . Scientific Reports 12 , 5979 . doi: 10.1038/s41598-022-09954-8 . OpenUrl CrossRef PubMed ↵ Klunder , J.H. , Heymans , M.W. , van der Heide , I. , Verheij , R.A. , Maarsingh , O.R. , van Hout , H.P. , Joling , K.J. , 2024 . Predicting unplanned admissions to hospital in older adults using routinely recorded general practice data: Development and validation of a prediction model . British Journal of General Practice 74 , e628 – e636 . doi: 10.3399/BJGP.2023.0350 . OpenUrl Abstract / FREE Full Text ↵ Klunder , J.H. , Panneman , S.L. , Wallace , E. , de Vries , R. , Joling , K.J. , Maarsingh , O.R. , van Hout , H.P.J. , 2022 . Prediction models for the prediction of unplanned hospital admissions in communitydwelling older adults: A systematic review . PLOS ONE 17 , e0275116 . doi: 10.1371/journal.pone.0275116 . OpenUrl CrossRef PubMed ↵ Knevel , R. , Liao , K.P ., 2023 . From real-world electronic health record data to real-world results using artificial intelligence . Annals of the Rheumatic Diseases 82 , 306 – 311 . doi: 10.1136/ard-2022-222626 . OpenUrl Abstract / FREE Full Text ↵ Krishnan , R. , Rajpurkar , P. , Topol , E.J ., 2022 . Self-supervised learning in medicine and healthcare . Nature Biomedical Engineering , 1 – 7 doi: 10.1038/s41551-022-00914-1 . OpenUrl CrossRef ↵ Lee , E.S. , Koh , H.L. , Ho , E.Q.Y. , Teo , S.H. , Wong , F.Y. , Ryan , B.L. , Fortin , M. , Stewart , M ., 2021 . Systematic review on the instruments used for measuring the association of the level of multimorbidity and clinically important outcomes . BMJ Open 11 , e041219 . doi: 10.1136/bmjopen-2020-041219 . OpenUrl Abstract / FREE Full Text ↵ Neves , B. , Moreira , J.M. , Goņcalves , S. , Cerejo , J. , da Silva , N.A. , Leite , F. , Silva , M.J. , 2025 . Zero-shot learning for clinical phenotyping: Comparing LLMs and rule-based methods . Computers in Biology and Medicine 192 , 110181 . URL: https://www.sciencedirect.com/science/article/pii/S0010482525005323 , doi: 10.1016/j.compbiomed.2025.110181 . OpenUrl CrossRef PubMed ↵ Rajkomar , A. , Oren , E. , Chen , K. , Dai , A.M. , Hajaj , N. , Hardt , M. , Liu , P.J. , Liu , X. , Marcus , J. , Sun , M. , Sundberg , P. , Yee , H. , Zhang , K. , Zhang , Y. , Flores , G. , Duggan , G.E. , Irvine , J. , Le , Q. , Litsch , K. , Mossin , A. , Tansuwan , J. , Wang , D. , Wexler , J. , Wilson , J. , Ludwig , D. , Volchenboum , S.L. , Chou , K. , Pearson , M. , Madabushi , S. , Shah , N.H. , Butte , A.J. , Howell , M.D. , Cui , C. , Corrado , G.S. , Dean , J ., 2018 . Scalable and accurate deep learning with electronic health records . npj Digital Medicine 1 , 1 – 10 . URL: https://www.nature.com/articles/s41746-018-0029-1 , doi: 10.1038/s41746-018-0029-1 . number: 1 Publisher: Nature Publishing Group . OpenUrl CrossRef PubMed ↵ Rasmy , L. , Xiang , Y. , Xie , Z. , Tao , C. , Zhi , D ., 2021 . Med-BERT: Pre-trained contextualized embeddings on large-scale structured electronic health records for disease prediction. npj Digital Medicine 4 , 1 – 13 . doi: 10.1038/s41746-021-00455-y . OpenUrl CrossRef PubMed ↵ Reich , C. , Ostropolets , A. , Ryan , P. , Rijnbeek , P. , Schuemie , M. , Davydov , A. , Dymshyts , D. , Hripcsak , G ., 2024 . OHDSI Standardized Vocabularies—a large-scale centralized reference ontology for international data harmonization . Journal of the American Medical Informatics Association 31 , 583 – 590 . doi: 10.1093/jamia/ocad247 . OpenUrl CrossRef PubMed ↵ Ren , S. , Lai , H. , Tong , W. , Aminzadeh , M. , Hou , X. , Lai , S ., 2010 . Nonparametric bootstrapping for hierarchical data . Journal of Applied Statistics 37 , 1487 – 1498 . URL: http://www.tandfonline.com/doi/full/10.1080/02664760903046102 , doi: 10.1080/02664760903046102 . OpenUrl CrossRef ↵ Renc , P. , Jia , Y. , Samir , A.E. , Was , J. , Li , Q. , Bates , D.W. , Sitek , A ., 2024 . Zero shot health trajectory prediction using transformer. npj Digital Medicine 7 , 1 – 10 . doi: 10.1038/s41746-024-01235-0 . OpenUrl CrossRef ↵ Si , Y. , Du , J. , Li , Z. , Jiang , X. , Miller , T. , Wang , F. , Zheng , W.J. , Roberts , K ., 2021 . Deep Representation Learning of Patient Data from Electronic Health Records (EHR): A Systematic Review . Journal of Biomedical Informatics 115 , 103671 . doi: 10.1016/j.jbi.2020.103671 , arXiv:2010.02809 . OpenUrl CrossRef PubMed ↵ Wornow , M. , Thapa , R. , Steinberg , E. , Fries , J.A. , Shah , N.H. , 2023a . EHRSHOT: An EHR Benchmark for Few-Shot Evaluation of Foundation Models. arXiv:2307.02028 . ↵ Wornow , M. , Xu , Y. , Thapa , R. , Patel , B. , Steinberg , E. , Fleming , S. , Pfeffer , M.A. , Fries , J. , Shah , N.H ., 2023b . The shaky foundations of large language models and foundation models for electronic health records. npj Digital Medicine 6 , 1 – 10 . doi: 10.1038/s41746-023-00879-8 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted September 12, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Predicting Hospital Admissions Using Pretrained EHR Embeddings: External Evaluation and Insights on Local Vocabulary Adaptation Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Predicting Hospital Admissions Using Pretrained EHR Embeddings: External Evaluation and Insights on Local Vocabulary Adaptation Bernardo Neves , Jorge Cerejo , Simão Gonçalves , Inês Mota , José M. Moreira , Nuno A. Silva , Francisca Leite , Michael Wornow , Mário J. Silva medRxiv 2025.09.08.25335368; doi: https://doi.org/10.1101/2025.09.08.25335368 Share This Article: Copy Citation Tools Predicting Hospital Admissions Using Pretrained EHR Embeddings: External Evaluation and Insights on Local Vocabulary Adaptation Bernardo Neves , Jorge Cerejo , Simão Gonçalves , Inês Mota , José M. Moreira , Nuno A. Silva , Francisca Leite , Michael Wornow , Mário J. Silva medRxiv 2025.09.08.25335368; doi: https://doi.org/10.1101/2025.09.08.25335368 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4411) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15205) Forensic Medicine (30) Gastroenterology (1119) Genetic and Genomic Medicine (6574) Geriatric Medicine (666) Health Economics (994) Health Informatics (4511) Health Policy (1365) Health Systems and Quality Improvement (1608) Hematology (537) HIV/AIDS (1263) Infectious Diseases (except HIV/AIDS) (15903) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6573) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1139) Occupational and Environmental Health (954) Oncology (3319) Ophthalmology (968) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (662) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5422) Public and Global Health (9205) Radiology and Imaging (2191) Rehabilitation Medicine and Physical Therapy (1367) Respiratory Medicine (1191) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9feb035c7c230db4',t:'MTc3OTI3NjY2Ng=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.