Full text
51,343 characters
· extracted from
preprint-html
· click to expand
Comparison of Multimodal Deep Learning Approaches for Predicting Clinical Deterioration in Ward Patients | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Comparison of Multimodal Deep Learning Approaches for Predicting Clinical Deterioration in Ward Patients View ORCID Profile Charles A. Kotula , Jennifer Martin , Kyle A. Carey , Dana P. Edelson , View ORCID Profile Dmitriy Dligach , View ORCID Profile Anoop Mayampurath , View ORCID Profile Majid Afshar , View ORCID Profile Matthew M. Churpek doi: https://doi.org/10.1101/2025.03.06.25322855 Charles A. Kotula 1 Department of Medicine, University of Wisconsin-Madison , Madison, WI 53792, United States , BS Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Charles A. Kotula Jennifer Martin 1 Department of Medicine, University of Wisconsin-Madison , Madison, WI 53792, United States , MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kyle A. Carey 2 Department of Medicine, University of Chicago , Chicago, IL 60637, United States , MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dana P. Edelson 2 Department of Medicine, University of Chicago , Chicago, IL 60637, United States , MD, MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dmitriy Dligach 3 Department of Computer Science, Loyola University Chicago , Chicago, IL 60626, United States , PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Dmitriy Dligach Anoop Mayampurath 1 Department of Medicine, University of Wisconsin-Madison , Madison, WI 53792, United States , 4 Department of Biostatistics and Medical Informatics, University of Wisconsin-Madison , Madison, WI 53726, United States PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Anoop Mayampurath Majid Afshar 1 Department of Medicine, University of Wisconsin-Madison , Madison, WI 53792, United States , 4 Department of Biostatistics and Medical Informatics, University of Wisconsin-Madison , Madison, WI 53726, United States MD, MSCR Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Majid Afshar Matthew M. Churpek 1 Department of Medicine, University of Wisconsin-Madison , Madison, WI 53792, United States , 4 Department of Biostatistics and Medical Informatics, University of Wisconsin-Madison , Madison, WI 53726, United States MD, MPH, PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Matthew M. Churpek For correspondence: mchurpek{at}medicine.wisc.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF ABSTRACT Objective Implementing machine learning models to identify clinical deterioration on the wards is associated with improved outcomes. However, these models have high false positive rates and only use structured data. Therefore, we aim to compare models with and without information from clinical notes for predicting deterioration. Materials and Methods Adults admitted to the wards at the University of Chicago (development cohort) and University of Wisconsin-Madison (external validation cohort) were included. Predictors consisted of structured and unstructured variables extracted from notes as Concept Unique Identifiers (CUIs). We parameterized CUIs in five ways: Standard Tokenization (ST), ICD Rollup using Tokenization (ICDR-T), ICD Rollup using Binary Variables (ICDR-BV), CUIs as SapBERT Embeddings (SE), and CUI Clustering using SapBERT embeddings (CC). Each parameterization method combined with structured data and structured data-only were compared for predicting intensive care unit transfer or death in the next 24 hours using deep recurrent neural networks. Results The study included 506,076 ward patients, 4.9% of whom experienced the outcome. The SE model achieved the highest AUPRC (0.208), followed by CC (0.199) and the structured-only model (0.199), ICDR-BV (0.194), ICDR-T (0.166), and ST (0.158). The CC and structured-only models achieved the highest AUROC (0.870), followed by ICDR-T (0.867), ICDR-BV (0.866), ST (0.860), and SE (0.859). Discussion A multimodal model combining structured data with embeddings using SapBERT had the highest AUPRC, but performance was similar between models with and without CUIs. Conclusion The addition of CUIs from notes to structured data did not meaningfully improve model performance for predicting clinical deterioration. Lay Summary Implementing machine learning models to identify clinical deterioration on the wards is associated with improved outcomes. However, these models have high rates of false positives and only use structured electronic health record (EHR) data as predictor variables. Therefore, we aimed to determine if information from clinical notes improves performance and compare methods of combining information from clinical notes with structured data. Model features consisted of variables from structured EHR data (e.g., vital signs, laboratory results) and Concept Unique Identifiers (CUIs) extracted from notes. We parameterized the CUIs in five ways, each of which was combined with the structured data to predict intensive care unit transfer or death in the next 24 hours with a multimodal deep recurrent neural network. The study cohort included 506,076 ward patients across two health systems, 4.9% of whom experienced the outcome. During external validation, the models using both CUIs and structured data performed similarly to the model using only structured data. Of the CUI parameterization approaches, methods that used SapBERT embeddings had the highest discrimination. Our findings show that, while adding medical concept variables from clinical notes could provide additional clinical context for clinicians, they do not enhance model performance compared to structured data alone. BACKGROUND AND SIGNIFICANCE Clinical deterioration, defined as “an acute worsening of a patient’s clinical status that poses a substantial increase to an individual’s short-term risk of death or serious harm,” occurs in up to 5% of hospitalized patients.[ 1 , 2 ] Early detection of clinical deterioration is essential for minimizing preventable death, as delayed intensive care unit (ICU) transfers and rapid response team activations lead to increased morbidity and mortality.[ 3 – 6 ] Recently, machine learning models developed using large electronic health record (EHR) datasets have been developed to predict clinical deterioration and were shown to have superior performance compared to commonly used tools, such as the Modified Early Warning Score (MEWS) and the National Early Warning Score (NEWS).[ 1 , 7 – 11 ] Importantly, implementing machine-learning models for clinical deterioration has been associated with decreased mortality.[ 12 , 13 ] Although machine-learning-based early warning scores are more accurate than simpler scores, they still suffer from high false positive rates (FPR), and their input is often restricted to structured EHR variables like vital signs and laboratory results.[ 1 , 11 ] Consequently, the majority of EHR data, comprised of unstructured text, are not utilized for predicting clinical deterioration. Incorporating information from unstructured text could improve model performance and provide additional clinical context for clinicians by highlighting medical terms that increase deterioration risk. Several methods of combining information from structured data and unstructured clinical notes have been shown to enhance performance in other medical domains.[ 14 – 21 ] However, it is unknown which approach works best when predicting clinical deterioration in hospitalized patients outside the ICU. OBJECTIVE We aim to compare the performance of methods that utilize information from unstructured clinical notes for use in multimodal modeling of clinical deterioration. We also aim to compare these methods to a model that uses only structured data to determine if information for clinical notes improves performance. MATERIALS & METHODS Patient Population and Data Collection Adults (age ≥ 18) admitted to the medical-surgical wards at the University of Chicago (UC) from 2016-2022 and the University of Wisconsin-Madison (UW) from 2009-2020 with clinical notes data were eligible for inclusion. Our primary outcome was clinical deterioration, defined as death or direct ward to ICU transfer within 24 hours of each observation, with UC used for model development and UW for external validation. Patient data were retrieved from the enterprise data warehouses of each university’s health system, which included demographics, vital signs, laboratory values, and clinical notes. All data from UC were de-identified in accordance with the Health Insurance Portability and Accountability Act and transferred to UW for analysis. This study was approved by the Institutional Review Boards of UW (#2019-1258) and UC (#18-0447). Structured Features Fifty-five structured predictor variables were used to develop our models, including demographics, vital signs, laboratory results, and nurse documentation (Supplementary Table 1 ) . For most predictor variables, missing values were imputed by carrying forward in time the last known value, and the remaining missing values were imputed in the development and external validation cohorts using the variable’s median value across all patients in the development cohort. The last known value was only carried forward 24 hours before their missing values were imputed for lactate and blood gas labs. Piecewise-linear encoding (PLE) transformation, a method demonstrated to improve the performance of deep learning models in large, numerical datasets, was used to create a higher-dimensional representation of the data.[ 22 ] An additional “hours since admission” input feature was created to capture temporal information. Unstructured Input Parameterization Clinical notes data were pre-processed using the Apache clinical Text Analysis and Knowledge Extraction System (cTAKES), a tool that maps medical terms from the National Library of Medicine’s Unified Medical Language System (UMLS) to Concept Unique Identifiers (CUIs).[ 23 , 24 ] For example, the term “headache” maps to the CUI “C0018681.” CUIs enable us to focus on the medical terms contained within the notes and create a standard structure to represent notes from distinct health systems. To keep the CUI vocabularies consistent between the UC and the UW data, only CUIs present in the UC development data were kept in the UW data. Furthermore, these CUIs were subset to those appearing in at least five admissions in the UC data. The CUIs were timestamped with the datetimes from their clinical note of origin, then matched with the structured data by pulling forward the most recent 360 unique CUIs that appeared in the clinical notes prior to each structured timestep. 360 was chosen due to computational limitations. Deep learning models cannot operate on CUIs themselves because they are strings. Thus, we investigated various approaches for parameterizing CUIs into model-ready numerical representations ( Figure 1 ). Download figure Open in new tab Figure 1. Overview of unstructured data processing for a single timestep. CUIs are extracted from clinical notes by cTAKES and assembled into a usable data structure that maps encounter ID to a list of CUIs associated with the timestep. A table of CUI-associated data is created, independent of patient data, containing preferred text, ICD Code, SapBERT embedding, and cluster information. The CUI-associated data is transformed according to the form required for each model, denoted by the labels beneath each neural network. Note: The data shown are meant to illustrate the processing pipeline and do not have the exact values of patient or CUI-associated data. Abbreviations: cTAKES = Apache clinical Text Analysis and Knowledge Extraction System; CUI = concept unique identifier; ST = standard tokenization; SE = CUIs as SapBERT embedding; CC = CUI clustering using SapBERT embeddings; ICDR-T = ICD rollup using tokenization; ICDR-BV = ICD rollup using binary variables. Standard Tokenization (ST ) The first method of CUI parameterization used a Keras TextVectorization layer to tokenize a list of CUIs into a vector of integers, where there was a unique one-to-one mapping from CUI to integer. These integers get mapped to dense vectors. In total, 31,418 unique CUIs were shared between the UC and the UW data. ICD Rollup using Tokenization (ICDR-T) In a second strategy designed to make the CUI inputs more parsimonious, we grouped CUIs capturing similar medical concepts based on the ICD codes associated with them in the UMLS Metathesaurus.[ 24 ] Because ICD codes are hierarchical in nature, we used them to “roll up” CUIs whose ICD codes share the first three characters into a unique ICD category. All CUIs without an associated ICD code were assigned to the same category. Mirroring the ST approach, each grouping of CUIs by ICD codes was then mapped to a unique integer. The ICD Rollup groupings were first computed on the UC CUI data and then applied to the UW CUI data. Compared to the ST approach, this process reduced the number of unique tokens from 31,418 to 1,232. ICD Rollup using Binary Variables (ICDR-BV) Using the tokenized CUI data created in the ICDR-T approach, we converted each vector of tokens to a 1,232-dimensional sparse binary vector, where each index of the sparse vectors represents the presence or absence of a CUI contained by the ICD Rollup grouping corresponding to that index. CUIs as SapBERT Embeddings (SE) Our next approach to CUI parameterization extended our standard tokenization (ST). We extracted the preferred text strings associated with each CUI (e.g. CUI: “C0000737” → Preferred Text: “Unspecified abdominal pain”) and obtained a 768-dimensional embedding for each string using Hugging Face’s SapBERT model, a transformer model trained on UMLS medical terms.[ 25 , 26 ] We then generated a mapping from Token → SapBERT Embedding that served as the basis for a pre-trained embedding matrix. When using this embedding approach in our models, the vector of tokens at each timestep is transformed into a matrix of SapBERT embeddings, where there is a one-to-one correspondence between the token value and the row of the embedding matrix that contains that token’s SapBERT embedding ( Figure 2 ). Download figure Open in new tab Figure 2. Model architectures for tokenized vs. binary variable (B.V.) CUIs. Each timestep has three inputs: structured, temporal, and unstructured (CUI). The structured and temporal inputs undergo the same transformations in models using tokenized and binary variable CUIs. The structured input is passed through a PLE layer followed by a dense linear layer. The temporal input is vectorized by a Time2Vec layer, before being concatenated with the output of the structural input’s dense layer. The concatenated structured and temporal data are then passed through an LSTM/GRU layer. For models using tokenized CUIs, the list of tokenized CUIs is embedded using either randomly initialized embeddings or SapBERT embeddings, pooled into a single embedding, then passed through an LSTM/GRU layer. For models using binary variable CUIs, the vector of binary CUI variables is directly passed through an LSTM/GRU layer. The LSTM/GRU outputs from the structured + temporal and CUI pipelines are concatenated and then passed through a final dense layer to get the model’s predictions. Dropout layers are excluded for simplicity. Abbreviations: CUI = concept unique identifier; B.V. = binary variables; ST = standard tokenization; SE = CUIs as SapBERT embedding; CC = CUI clustering using SapBERT embeddings; ICDR-T = ICD rollup using tokenization; ICDR-BV = ICD rollup using binary variables; HR = heart rate; RR = respiratory rate; PLE = piecewise linear encoding; LSTM = long short-term memory; GRU = gated recurrent unit. CUI Clustering using SapBERT Embeddings (CC) Our final approach to CUI parameterization utilized the CUI’s SapBERT embeddings to cluster the CUIs. First, we performed PCA on the embeddings to reduce their dimensionality from 768 to 100. Then, we calculated the pairwise cosine distances between the 100-dimensional embeddings. Lastly, we performed hierarchical clustering with a distance threshold to obtain 1,077 clusters. With these embedding clusters, we created clusters of tokenized CUIs. These token clusters enabled us to transform the vectors of tokenized CUIs from the ST CUI data into a 1,077-dimensional sparse vector. Like the ICDR-BV approach, each index of the embedding represents the presence or absence of a CUI contained within the cluster associated with that index. Model Development For each CUI parameterization method, a model was developed using an intermediate fusion architecture with long short-term memory (LSTM) or gated recurrent unit (GRU) layers that learned representations of the structured, unstructured (CUI), and temporal inputs separately before combining them to learn their joint interactions. Both the number and type of recurrent layers were hyperparameters. In all models, the structured data were first passed through a fully connected layer, followed by a dropout layer. The temporal data were passed through a Time2Vec layer, a layer for vectorizing time shown to improve LSTM performance. Time2Vec represents time in a neural network by encoding temporal information using a combination of learned linear and sinusoidal functions, capturing both periodic and non-periodic patterns.[ 27 ] The structured and temporal data were then concatenated and passed through the LSTM/GRU layer(s). The CUI data were processed according to their parameterization method. For the ST and ICDR-T approaches, the tokens were transformed into randomly initialized embeddings, which were then passed through a dropout layer. The resulting embeddings were collapsed into a one-dimensional embedding by average or max pooling, passed through a dropout layer, then passed through LSTM/GRU layer(s). A similar process was also applied to the SE approach, with the difference being that the tokens were mapped to their corresponding SapBERT embedding rather than to a randomly initialized one. For the methods parameterizing CUI data as sparse binary vectors (ICDR-BV & CC), the CUI data were passed directly through a dropout layer, then through LSTM/GRU layer(s). The learned representations of the structured, temporal, and unstructured (CUI) data were fused into a single joint representation. This joint representation was passed through a final fully connected layer with a sigmoid activation function to produce the model’s predictions. A separate, structured data-only model was fit in the same manner but without the additional CUI data. For hyperparameter optimization, we used the Keras Bayesian Optimization (BO) Tuner with an 80/20 training/validation split on the model development data (UC). We ran twenty trials with early stopping if the validation Area Under the Receiver Operating Characteristic (AUROC) did not improve by ≥0.005 for five consecutive epochs. Batches of size 32 were used for all models, except for the SE model, whose batch size was 16 to not exhaust GPU resources. The external validation data (UW) was not used during model development. Model Evaluation Model discrimination was assessed using the Area Under the Precision-Recall Curve (AUPRC) and the Area Under the Receiver Operating Characteristic (AUROC) as primary (due to the low outcome prevalence) and secondary performance metrics, respectively. AUPRCs with 95% confidence intervals were calculated using bootstrapping with 1000 iterations, where each bootstrapping iteration was done using a sample size equal to 20% of the population size of each subgroup. AUROCs with 95% confidence intervals were calculated using R (version 4.4.0) with the pROC (version 1.18.5) package. Sensitivity, specificity, positive predictive value, and negative predictive value were calculated across a range of cut points. Subgroup analyses were conducted to assess model performance across sex, race, ethnicity, and age. Differences in patient characteristics were assessed using Chi-squared tests for categorical variables and Mann-Whitney U tests for age. Initial data cleaning was done using Stata (version 16.1). Preprocessing, descriptive analysis, and model development were done using Python (version 3.9.18), with data analysis and machine learning libraries such as TensorFlow (version 2.12.0), Keras (version 2.12.0), KerasTuner (version 1.3.5), scikit-learn (version 1.2.0), SciPy (version, 1.10.1), and Pandas (version 1.5.3). RESULTS Cohort Characteristics The model development cohort (UC) included 284,302 patients, with 14,954 (5.3%) deteriorating during their hospitalization. The external validation cohort (UW) included 248,055 patients, with 11,327 (4.6%) deteriorating during their hospitalization. Across both cohorts (UC and UW), patients who experienced deterioration were more likely to be male (54.7% vs. 48.4% female), older, and have a greater median length of stay than those who did not experience deterioration (Supplementary Table 2). Other demographic variables were statistically significant between those who did and did not experience deterioration, but the magnitudes of these differences were small. Patient characteristics also varied between sites ( Table 1 ). The model development site (UC) had a higher proportion of females (57.9% vs. 48.4%) and a lower proportion of white patients (39.3% vs. 90.8%). Other differences were statistically significant due to the large sample size but were numerically small. View this table: View inline View popup Table 1. Comparison of patient characteristics between the model development (UC) and external validation (UW) cohorts. Model Performance The AUPRC scores with their 95% CI for predicting clinical deterioration are presented for each model in Supplementary Table 3. The SE model had the highest AUPRC (0.208), followed by the structured-only (0.199) and CC (0.199) models, ICDR-BV (0.194), ICDR-T (0.166), and ST (0.158). The structured-only and CC models achieved the highest AUROC (0.870), followed by ICDR-T (0.867), ICDR-BV (0.866), ST (0.860), and SE (0.859) ( Table 2 , Figure 3 ). Download figure Open in new tab Figure 3. Area under the precision-recall curves (AUPRCs) for the multimodal and structured-only models in the external validation cohort. View this table: View inline View popup Download powerpoint Table 2. Model AUROCs on external validation cohort (UW) across subgroups. Some variation of model performance existed across subgroups. Across all models, the AUPRC scores for Asian/Mideast-Indian (A/MI) patients were the greatest on average (0.224), while they were the lowest on average for patients between the ages of 18-30 (0.156). AUROC scores were more tightly clustered than AUPRC scores across subgroups. Still, the average AUROC score across models was greatest for patients in the age range of 34-48 (0.875), while it was the lowest for patients of age >= 79 (0.849). The SE model had the highest performance across all subgroups for AUPRC, while the best-performing model differed across subgroups for AUROC. Supplementary Table 4 depicts the sensitivity, specificity, positive, and negative predictive values for models assessed along a range of probability cutoffs corresponding to the highest-risk 15%, 10%, 5%, and 1% of observations for clinical deterioration. At the 5% cutoff, the CC model achieved the greatest positive predictive value (12.53%) and sensitivity (52.15%). At the 15% cutoff, the ICDR-T, CC, and ICDR-BV models tied for the highest positive predictive value at 5.67%, while their sensitivities were 70.95%, 70.92%, and 70.86%, respectively. The average positive predictive values across models at the 5% and 15% cutoffs were 12.30% and 5.63%, respectively, while the average sensitivities were 51.23% and 70.27%. DISCUSSION In this study, we compared five methods of CUI parameterization for use in multimodal deep learning models as well as a structured-only model to predict clinical deterioration in non-ICU inpatients. The performance of all models was strong, and discrimination was similar overall, especially with the AUROC metric. We found no meaningful differences between models that utilized tokenized + embedded CUI data, like the SE, ICDR-T, and ST models, versus those that included CUI data as a vector of binary variables, like ICDR-BV and CC models. However, our method utilizing SapBERT embeddings to represent CUIs (SE) achieved the highest AUPRC overall and across all subgroups. However, this performance was only slightly better than the structured data-only model. Our results suggest that the addition of medical terms as CUIs in multimodal models does not meaningfully improve performance beyond models using structured data alone. To our knowledge, this is the first paper to investigate multimodal integration of information from structured and unstructured data in deep learning models to predict clinical deterioration in ward patients. A recent systematic review by Van Der Vegt et al. examined the current use of artificial intelligence for predicting clinical deterioration. They found that while many groups have developed promising machine learning models using methods such as logistic regression, XGBoost, and random forests, few have developed deep learning models.[ 28 ] Deep learning models have, on average, outperformed non-deep learning models.[ 29 – 31 ] Also, groups have developed deep learning models for predicting deterioration using ICU patients, patients from the emergency department, or patients with COVID-19.[ 10 , 21 , 32 – 35 ] However, few have developed models using ward patients.[ 11 , 29 ] Fewer still have utilized multiple input modalities, none of which focused on ward deterioration.[ 36 ] Because predicting clinical deterioration early and accurately leads to improved outcomes, efforts to increase model accuracy like ours could lead to enhanced detection and decrease false alarms. Furthermore, the incorporation of information from clinical notes could provide additional context for clinicians related to factors that increase a patient’s risk of the event. When including CUIs in multimodal models, our results suggest a benefit in restricting the number of CUI-related variables in the input space. The CC, ICDR-T, and ICDR-BV models restrict the number of CUI-related variables in their unstructured input from 31,418 to 1,077, 1,232, and 1,232, respectively, while the ST and SE models use the full 31,418. The models restricting the CUI-related variable space achieved higher AUROCs, positive predictive values, and sensitivities at the 15% cutoff relative to SE and ST models. However, this pattern does not hold as strongly for AUPRC scores. The ST model, which does not restrict the number of CUI-related variables, achieved a relatively low AUPRC, as it did for AUROC, but the SE model, which achieved the lowest AUROC, achieved the highest AUPRC score. When considering all performance metrics together, the CC model may have a slight edge above the other CUI models, as it achieved the highest AUROC, positive predictive values, and sensitivities at the 5% cutoff, the highest positive predictive value (tie) at the 15% cutoff, and the second highest AUPRC. However, all differences in performance metrics were small. Importantly, the optimal model for clinical implementation would be based on the cutoff threshold picked that would prompt specific clinical actions or alerts needed for clinical practice. We also found that the structured-only model performed just as well and often better than most of the models that used CUIs. Thus, the higher complexity of implementing and interpreting the CUI models must be weighed against the additional clinical context information that could be provided by these models (e.g., using explainable artificial intelligence techniques) when considering whether to use these models in practice. Our methods also demonstrate the ability to make effective and fair predictions in ward patients. Existing papers developing deep learning models in ward patients did not provide analyses on the performance of their models across patient subgroups, which is critical given the diverse nature of ward patients.[ 11 , 29 , 36 ] Our models obtained strong performance on patients with diverse demographics coming from variable hospital settings. AUROCs were slightly higher on average for patients aged 34-48 when compared to other subgroups, and slightly worse for patients near the tail ends of the age distribution (18-33 and ≥79). Furthermore, the model discrimination was superior for black patients when compared to other racial subgroups. This might be explained by the high proportion (52.0%) of black patients in the development (UC) cohort. Additionally, the AUPRC scores for patients aged 18-33 were uniformly the lowest across all models. A likely cause for this may be that the proportion (7.3%) of these patients was lower than any other subgroup. Our study has several strengths. First, incorporating multimodal data into deep learning models is rare within the ward deterioration literature, as most groups who have developed deep learning models have used exclusively structured EHR data.[ 10 , 28 – 30 , 33 , 34 , 37 – 39 ] Our study provides a diverse set of methods to inspire further exploration of multimodal modeling of clinical deterioration. Second, each CUI parameterization method and its associated model’s performance were externally validated in a separate health system. Large discrepancies existed between the development and external validation cohorts, and note-taking practices can differ greatly between hospital systems, which could increase variability in the CUIs for similar patients.[ 40 – 42 ] Despite these challenges, all models exhibited strong performance, suggesting the robustness and generalizability of our methods to diverse settings. Our study is limited in that we did not explore all possible CUI parameterization methods, and we only focused on predicting a single outcome. Also, this study did not use the raw text that constitutes clinical notes. Thus, we cannot draw strong conclusions about the overall utility of clinical notes in clinical prediction models. Finally, this was a retrospective study, and the usefulness of a clinical prediction model needs to be tested in a prospective clinical trial, which is outside the scope of this work. CONCLUSION In conclusion, we demonstrated the feasibility of combining structured and unstructured EHR data in deep learning models to predict clinical deterioration. While the best-performing model varied by metric, those whose method of CUI parameterization utilized SapBERT embeddings may have an edge over others, as the SE model performed best with respect to our primary metric (AUPRC), while the CC model seemed to perform best when considering all metrics in aggregate, achieving the highest AUROC, second highest AUPRC, and highest positive predictive value at multiple cutoffs. However, the structured-only model performed similarly to the models that included CUIs, suggesting that the marginal gains in performance may not outweigh the increased complexity and difficulties in interpreting CUI models. This work adds to the field of predicting clinical deterioration, which has historically only focused on structured data. Data Availability The data utilized in this article cannot be shared publicly because of legal and regulatory restrictions. These data were obtained from four hospital systems after our research protocol was reviewed by IRBs from each hospital, and our data use agreements do not permit sharing due to the granular nature of the data. Author Contributions Matthew M. Churpek, Majid Afshar, and Anoop Mayampurath conceptualized the study. Matthew M. Churpek, Majid Afshar, and Dana P. Edelson managed data collection and secured IRB approvals. Kyle A. Carey, Jennifer Martin, and Charles A. Kotula carried out the data preprocessing. Charles A. Kotula, Jennifer Martin, Matthew M. Churpek, Majid Afshar, Anoop Mayampurath, and Dmitriy Dligach are responsible for data analysis and methodology. Charles A. Kotula prepared the original draft, and all authors reviewed and edited the manuscript. Funding This work was supported by the National Institutes of Health (NIH), NIH National Heart, Lung, and Blood Institute grant numbers R01HL157262 (M.M.C., C.A.K., K.A.K., J.M., D.P.E., and D.D.) and R01HL173037 (A.M. and K.A.K.), and the NIH National Library of Medicine grant number R01HL173037 (M.A. and D.D.). Institutional Review Boards Ethical approval of this work was given by the University of Wisconsin-Madison Minimal Risk Research IRB (#2019-1258) and the University of Chicago Biological Sciences Division IRB (#18-0447). Conflicts of Interest Drs. Churpek and Edelson have a patent issued (#11,410,777) for a risk stratification algorithm for hospitalized patients, which was not included in this study. Dr. Edelson is employed by and has an equity stake in AgileMD, which markets and distributes the risk stratification algorithm patented by Drs. Churpek and Edelson. They receive royalties for this intellectual property. The other authors have declared no potential conflict of interest. Data Sharing Statement The data used in this study were acquired from two hospital systems following approval from the IRBs. The data use agreements prohibit sharing data due to regulatory and legal constraints, and therefore, the data cannot be shared publicly. Acknowledgments None References ↵ Churpek MM , Carey KA , Snyder A , et al. Multicenter Development and Prospective Validation of eCARTv5: A Gradient Boosted Machine Learning Early Warning Score . 2024 ;2024.03.18.24304462. ↵ Mitchell OJL , Dewan M , Wolfe HA , et al. Defining Physiological Decompensation: An Expert Consensus and Retrospective Outcome Validation . Crit Care Explor . 2022 ; 4 : e0677 . doi: 10.1097/CCE.0000000000000677 OpenUrl CrossRef ↵ Barwise A , Thongprayoon C , Gajic O , et al. Delayed Rapid Response Team Activation Is Associated With Increased Hospital Mortality, Morbidity, and Length of Stay in a Tertiary Care Institution* : Critical Care Medicine . 2016 ; 44 : 54 – 63 . doi: 10.1097/CCM.0000000000001346 OpenUrl CrossRef PubMed Buist MD , Burton PR , Bernard SA , et al. Recognising clinical instability in hospital patients before cardiac arrest or unplanned admission to intensive care: A pilot study in a tertiary-care hospital . Medical Journal of Australia . 1999 ; 171 : 22 – 5 . doi: 10.5694/j.1326-5377.1999.tb123492.x OpenUrl CrossRef PubMed Web of Science Churpek MM , Wendlandt B , Zadravecz FJ , et al. Association Between ICU Transfer Delay and Hospital Mortality: A Multicenter Investigation . J Hosp Med . 2016 ; 11 : 757 – 62 . Doi: 10.1002/jhm.2630 OpenUrl CrossRef PubMed ↵ Gupta S , Green C , Subramaniam A , et al. The impact of delayed rapid response call activation on patient outcomes . Journal of Critical Care . 2017 ; 41 : 86 – 90 . doi: 10.1016/j.jcrc.2017.05.006 OpenUrl CrossRef PubMed ↵ Subbe CP , Kruger M , Rutherford P , et al. Validation of a modified Early Warning Score in medical admissions . QJM: An International Journal of Medicine . 2001 ; 94 : 521 – 6 . doi: 10.1093/qjmed/94.10.521 OpenUrl CrossRef PubMed Web of Science Escobar GJ , Turk BJ , Ragins A , et al. Piloting Electronic Medical Record–Based Early Detection of Inpatient Deterioration in Community Hospitals . J Hosp Med . 2016 ; 11 : S18 – 24 . doi: 10.1002/jhm.2652 OpenUrl CrossRef PubMed Jang D-H , Kim J , Jo YH , et al. Developing neural network models for early detection of cardiac arrest in emergency department . The American Journal of Emergency Medicine . 2020 ; 38 : 43 – 9 . doi: 10.1016/j.ajem.2019.04.006 OpenUrl CrossRef PubMed ↵ Kim J , Chae M , Chang H-J , et al. Predicting Cardiac Arrest and Respiratory Failure Using Feasible Artificial Intelligence with Simple Trajectories of Patient Data . J Clin Med . 2019 ; 8 : 1336 . doi: 10.3390/jcm8091336 OpenUrl CrossRef PubMed ↵ Lee YJ , Cho K-J , Kwon O , et al. A multicentre validation study of the deep learning-based early warning score for predicting in-hospital cardiac arrest in patients admitted to general wards . Resuscitation . 2021 ; 163 : 78 – 85 . doi: 10.1016/j.resuscitation.2021.04.013 OpenUrl CrossRef PubMed ↵ Escobar Gabriel J. , Liu Vincent X. , Schuler Alejandro , et al. Automated Identification of Adults at Risk for In-Hospital Clinical Deterioration . New England Journal of Medicine . 2020 ; 383 : 1951 – 60 . doi: 10.1056/NEJMsa2001090 OpenUrl CrossRef PubMed ↵ Winslow CJ , Edelson DP , Churpek MM , et al. The Impact of a Machine Learning Early Warning Score on Hospital Mortality: A Multicenter Clinical Intervention Trial . Critical Care Medicine . 2022 ; 50 : 1339 . doi: 10.1097/CCM.0000000000005492 OpenUrl CrossRef ↵ Yan MY , Gustad LT , Nytrø Ø . Sepsis prediction, early detection, and identification using clinical text for machine learning: a systematic review . Journal of the American Medical Informatics Association . 2022 ; 29 : 559 – 75 . doi: 10.1093/jamia/ocab236 OpenUrl CrossRef PubMed Gao Z , Liu X , Kang Y , et al. Improving the Prognostic Evaluation Precision of Hospital Outcomes for Heart Failure Using Admission Notes and Clinical Tabular Data: Multimodal Deep Learning Model . Journal of Medical Internet Research . 2024 ; 26 : e54363 . doi: 10.2196/54363 OpenUrl CrossRef PubMed McCoy TH Jr , Castro VM , Roberson AM , et al. Improving Prediction of Suicide and Accidental Death After Discharge From General Hospitals With Natural Language Processing . JAMA Psychiatry . 2016 ; 73 : 1064 – 71 . doi: 10.1001/jamapsychiatry.2016.2172 OpenUrl CrossRef PubMed Clapp MA , Kim E , James KE , et al. Comparison of Natural Language Processing of Clinical Notes With a Validated Risk-Stratification Tool to Predict Severe Maternal Morbidity . JAMA Network Open . 2022 ; 5 : e2234924 . doi: 10.1001/jamanetworkopen.2022.34924 OpenUrl CrossRef Garriga R , Buda TS , Guerreiro J , et al. Combining clinical notes with structured electronic health records enhances the prediction of mental health crises . CR Med . 2023 ; 4 . doi: 10.1016/j.xcrm.2023.101260 OpenUrl CrossRef Wang Y , Yin C , Zhang P . Multimodal risk prediction with physiological signals, medical images and clinical notes . Heliyon . 2024 ; 10 . doi: 10.1016/j.heliyon.2024.e26772 OpenUrl CrossRef Karway GK , Koyner JL , Caskey J , et al. Development and external validation of multimodal postoperative acute kidney injury risk machine learning models . JAMIA Open . 2023 ; 6 :ooad109. doi: 10.1093/jamiaopen/ooad109 OpenUrl CrossRef ↵ Joseph JW , Leventhal EL , Grossestreuer AV , et al. Deep learning approaches to identify critically Ill patients at emergency department triage using limited information . J Am Coll Emerg Physicians Open . 2020 ; 1 : 773 – 81 . doi: 10.1002/emp2.12218 OpenUrl CrossRef PubMed ↵ Gorishniy Y , Rubachev I , Babenko A. On Embeddings for Numerical Features in Tabular Deep Learning . ↵ Savova GK , Masanz JJ , Ogren PV , et al. Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications . J Am Med Inform Assoc . 2010 ; 17 : 507 – 13 . doi: 10.1136/jamia.2009.001560 OpenUrl CrossRef PubMed ↵ UMLS Knowledge Sources [dataset on the Internet]. Release 2024AA. Bethesda (MD) : National Library of Medicine (US) ; 2024 May 6 [cited 2024 Jul 15]. Available from: http://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html ↵ cambridgeltl/SapBERT-from-PubMedBERT-fulltext · Hugging Face . https://huggingface.co/cambridgeltl/SapBERT-from-PubMedBERT-fulltext ↵ Liu F , Shareghi E , Meng Z , et al. Self-Alignment Pretraining for Biomedical Entity Representations . 2021 . ↵ Kazemi SM , Goel R , Eghbali S , et al. Time2Vec: Learning a Vector Representation of Time . 2019 . ↵ Van der Vegt AH , Campbell V , Mitchell I , et al. Systematic review and longitudinal analysis of implementing Artificial Intelligence to predict clinical deterioration in adult hospitals: what is known and what remains uncertain . J Am Med Inform Assoc . 2023 ; 31 : 509 – 24 . doi: 10.1093/jamia/ocad220 OpenUrl CrossRef ↵ Kwon J , Lee Y , Lee Y , et al. An Algorithm Based on Deep Learning for Predicting In Hospital Cardiac Arrest . J Am Heart Assoc . 2018 ; 7 : e008678 . doi: 10.1161/JAHA.118.008678 OpenUrl Abstract / FREE Full Text ↵ Shah PK , Ginestra JC , Ungar LH , et al. A Simulated Prospective Evaluation of a Deep Learning Model for Real-Time Prediction of Clinical Deterioration Among Ward Patients* . Critical Care Medicine . 2021 ; 49 : 1312 . doi: 10.1097/CCM.0000000000004966 OpenUrl CrossRef PubMed ↵ Shamout FE , Zhu T , Sharma P , et al. Deep Interpretable Early Warning System for the Detection of Clinical Deterioration . IEEE Journal of Biomedical and Health Informatics . 2020 ; 24 : 437 – 46 . doi: 10.1109/JBHI.2019.2937803 OpenUrl CrossRef ↵ Che Z , Purushotham S , Khemani R , et al. Interpretable Deep Models for ICU Outcome Prediction . AMIA Annu Symp Proc . 2017 ; 2016 : 371 – 80 . OpenUrl PubMed ↵ Raita Y , Goto T , Faridi MK , et al. Emergency department triage prediction of clinical outcomes using machine learning models . Crit Care . 2019 ; 23 : 64 . doi: 10.1186/s13054-019-2351-7 OpenUrl CrossRef PubMed ↵ Mehrdad S , Shamout FE , Wang Y , et al. Deep learning for deterioration prediction of COVID-19 patients based on time-series of three vital signs . Sci Rep . 2023 ; 13 : 9968 . doi: 10.1038/s41598-023-37013-3 OpenUrl CrossRef PubMed ↵ Yu JY , Jeong GY , Jeong OS , et al. Machine Learning and Initial Nursing Assessment-Based Triage System for Emergency Department . Healthc Inform Res . 2020 ; 26 : 13 – 9 . doi: 10.4258/hir.2020.26.1.13 OpenUrl CrossRef PubMed ↵ Rajkomar A , Oren E , Chen K , et al. Scalable and accurate deep learning for electronic health records . npj Digital Med . 2018 ; 1 : 18 . doi: 10.1038/s41746-018-0029-1 OpenUrl CrossRef PubMed ↵ Choi E , Schuetz A , Stewart WF , et al. Using recurrent neural network models for early detection of heart failure onset . J Am Med Inform Assoc . 2017 ; 24 : 361 – 70 . doi: 10.1093/jamia/ocw112 OpenUrl CrossRef PubMed Goto T , Camargo CA , Faridi MK , et al. Machine learning approaches for predicting disposition of asthma and COPD exacerbations in the ED . The American Journal of Emergency Medicine . 2018 ; 36 : 1650 – 4 . doi: 10.1016/j.ajem.2018.06.062 OpenUrl CrossRef PubMed ↵ da Silva DB , Schmidt D , da Costa CA , et al. DeepSigns: A predictive model based on Deep Learning for the early detection of patient health deterioration . Expert Systems with Applications . 2021 ; 165 : 113905 . doi: 10.1016/j.eswa.2020.113905 OpenUrl CrossRef ↵ Sohn S , Wang Y , Wi C-I , et al. Clinical documentation variations and NLP system portability: a case study in asthma birth cohorts across institutions . Journal of the American Medical Informatics Association . 2018 ; 25 : 353 – 9 . doi: 10.1093/jamia/ocx138 OpenUrl CrossRef PubMed Hultman GM , Marquard JL , Lindemann E , et al. Challenges and Opportunities to Improve the Clinician Experience Reviewing Electronic Progress Notes . Appl Clin Inform . 2019 ; 10 : 446 – 53 . doi: 10.1055/s-0039-1692164 OpenUrl CrossRef PubMed ↵ Rosenbloom ST , Denny JC , Xu H , et al. Data from clinical notes: a perspective on the tension between structure and flexible documentation . J Am Med Inform Assoc . 2011 ; 18 : 181 – 6 . doi: 10.1136/jamia.2010.007237 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted March 07, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Comparison of Multimodal Deep Learning Approaches for Predicting Clinical Deterioration in Ward Patients Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Comparison of Multimodal Deep Learning Approaches for Predicting Clinical Deterioration in Ward Patients Charles A. Kotula , Jennifer Martin , Kyle A. Carey , Dana P. Edelson , Dmitriy Dligach , Anoop Mayampurath , Majid Afshar , Matthew M. Churpek medRxiv 2025.03.06.25322855; doi: https://doi.org/10.1101/2025.03.06.25322855 Share This Article: Copy Citation Tools Comparison of Multimodal Deep Learning Approaches for Predicting Clinical Deterioration in Ward Patients Charles A. Kotula , Jennifer Martin , Kyle A. Carey , Dana P. Edelson , Dmitriy Dligach , Anoop Mayampurath , Majid Afshar , Matthew M. Churpek medRxiv 2025.03.06.25322855; doi: https://doi.org/10.1101/2025.03.06.25322855 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Intensive Care and Critical Care Medicine Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4439) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6602) Geriatric Medicine (668) Health Economics (998) Health Informatics (4539) Health Policy (1369) Health Systems and Quality Improvement (1613) Hematology (542) HIV/AIDS (1265) Infectious Diseases (except HIV/AIDS) (15919) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (667) Neurology (6600) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1145) Occupational and Environmental Health (957) Oncology (3333) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9233) Radiology and Imaging (2199) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a0108de67bc173ad',t:'MTc3OTY3MDMxMw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.