The impact of evaluation strategy on sepsis prediction model performance metrics in intensive care data

doi:10.1101/2025.02.20.25322509

The impact of evaluation strategy on sepsis prediction model performance metrics in intensive care data

2025 · doi:10.1101/2025.02.20.25322509

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 67,163 characters · extracted from preprint-html · click to expand

The impact of evaluation strategy on sepsis prediction model performance metrics in intensive care data | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search The impact of evaluation strategy on sepsis prediction model performance metrics in intensive care data Dang-Khoa Do , View ORCID Profile Patrick Rockenschaub , View ORCID Profile Sebastian Boie , View ORCID Profile Oliver Kumpf , View ORCID Profile Hans-Dieter Volk , View ORCID Profile Felix Balzer , View ORCID Profile Falk von Dincklage , View ORCID Profile Gregor Lichtner doi: https://doi.org/10.1101/2025.02.20.25322509 Dang-Khoa Do 1 Institute of Medical Informatics, Charité – 
Universitätsmedizin Berlin , Berlin, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Patrick Rockenschaub 2 Institute of Clinical Epidemiology, Public Health, Health Economics, Medical Statistics and Informatics, Medical University of Innsbruck , Innsbruck, Austria Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Patrick Rockenschaub Sebastian Boie 1 Institute of Medical Informatics, Charité – Universitätsmedizin Berlin , Berlin, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sebastian Boie Oliver Kumpf 3 Department of Anesthesiology and Intensive Care Medicine, Charité – Universitätsmedizin Berlin , Berlin, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Oliver Kumpf Hans-Dieter Volk 4 Institute of Medical Immunology, Charité – Universitätsmedizin Berlin , Berlin, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hans-Dieter Volk Felix Balzer 1 Institute of Medical Informatics, Charité – Universitätsmedizin Berlin , Berlin, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Felix Balzer Falk von Dincklage 5 Department of Anesthesia, Critical Care, Emergency and Pain Medicine, Universitätsmedizin Greifswald , Greifswald, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Falk von Dincklage Gregor Lichtner 1 Institute of Medical Informatics, Charité – Universitätsmedizin Berlin , Berlin, Germany 5 Department of Anesthesia, Critical Care, Emergency and Pain Medicine, Universitätsmedizin Greifswald , Greifswald, Germany Find this author on Google Scholar Find this author on PubMed 
Search for this author on this site ORCID record for Gregor Lichtner For correspondence: gregor.lichtner{at}med.uni-greifswald.de Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background The prediction of the onset of sepsis, a life-threatening condition resulting from a dysregulated response to an infection, is one of the most common prediction tasks in intensive care-related machine learning research. To assess the performance of such models, different evaluation strategies (fixed horizon, peak score and continuous evaluation) are commonly employed, but there is no clear consensus on which approach should be used in order to provide clinically meaningful performance evaluation. Objective To assess different evaluation approaches of sepsis prediction models trained on a public intensive care dataset applied to German intensive care data. Methods In this retrospective, observational cohort study, we assessed the efficacy of machine learning models, pre-trained on the MIMIC-IV dataset, when applied to BerlinICU, a multi-site German intensive care dataset. To understand the real-world impact of implementing these models, we examined the performance variability across various evaluation strategies. Results The BerlinICU dataset includes 40,132 intensive care admissions spanning 10 years (2012-2021). Using the latest Sepsis-3 definition, we identified 4,134 septic admissions (prevalence 10.3%). Application of a temporal convolution network model to BerlinICU yielded an area under the receiver operating characteristic curve (AUROC) of 0.67 (95% CI: 0.66–0.68) for continuous evaluation with a 6-hour prediction horizon, compared to 0.84 (95% CI: 0.83–0.85) on the test set of MIMIC-IV. On BerlinICU, peak score evaluation showed a similar AUROC compared to continuous evaluation, while fixed horizon evaluation showed a reduced AUROC of 0.61 (95% CI: 0.60–0.62). 
Onset matching had minimal impact on performance estimates using continuous evaluation or fixed horizon evaluation but increased estimates for peak score evaluation. Performance metrics improved with shorter prediction horizons across all strategies. Conclusion Our results demonstrate that the choice of evaluation strategy has a significant impact on the performance metrics of intensive care prediction models. The same model applied to the same dataset yields markedly different performance metrics depending on the evaluation approach. Therefore, careful selection of the evaluation approach is essential to ensure that the interpretation of performance metrics aligns with clinical intentions and enables meaningful comparisons between studies. In our view, the continuous evaluation approach best reflects the continual monitoring of patients that is performed in real-world clinical practice. In contrast, fixed horizon and peak score evaluation approaches may produce skewed results when not properly matching the length of stay distributions between sepsis cases and control cases. Especially for peak score evaluation, longer visits tend to produce higher maximum scores because sampling from more values increases the likelihood of capturing higher values purely by chance. Introduction Sepsis is a life-threatening condition resulting from a dysregulated response to an infection that leads to organ dysfunction ( 1 ) and is responsible for up to 20% of all deaths worldwide ( 2 ). Beyond its significant mortality rate ( 3 ), sepsis represents a tremendous financial burden, totaling almost $24 billion in the US in 2013 ( 4 ). Those who survive may experience long-term health consequences, including a diminished quality of life ( 5 ). As each hour of delayed therapy increases mortality ( 6 – 8 ), sepsis treatment guidelines continue to recommend early therapy within the first hour of sepsis recognition ( 9 – 11 ). Therefore, early recognition of sepsis is essential. 
Numerous studies have investigated machine learning to aid early prediction of sepsis in the intensive care unit (ICU) ( 12 , 13 ), including prospective application ( 14 , 15 ) and external validation on data of different hospitals ( 15 – 17 ). Previous studies have employed different evaluation strategies, which vary in the criteria for data selection and lead to differences in the timing and scope of data considered relative to sepsis onset. These strategies can be broadly categorized into three groups: (I) Fixed Horizon Evaluation This approach evaluates a model’s ability to predict the onset of sepsis at a predefined time point before the actual onset — the so-called prediction horizon — using all available data up to that time ( 13 , 18 ). Varying the length of the prediction horizon allows for assessing the earliness of the model’s predictive ability. In this approach, it is particularly important to employ a case-control matching procedure, where control patients who do not develop sepsis are evaluated at a time point in their ICU stay that corresponds to the sepsis onset in sepsis patients. Without appropriate matching, control patients are typically assessed in the improved health conditions of their final pre-discharge hours. This comparison can oversimplify the analysis and bias results, as these conditions distinctly contrast with those of sepsis patients ( 19 ). (II) Peak Score Evaluation Like fixed horizon evaluation, this approach evaluates a single prediction for each patient. However, instead of considering a pre-defined time interval before sepsis onset, this strategy utilizes the peak prediction score generated by the model across all time points during the patient’s stay before the actual onset ( 16 ). 
It has been reasoned that this approach better aligns with clinical practice, based on the assumption that a prediction model in a clinical setting operates with a fixed threshold and once any prediction score surpasses this threshold, an alarm for that particular patient is triggered ( 16 , 20 ). Performance metrics derived under this evaluation strategy are considered to reflect the model performance in clinical practice. While excelling in simplicity, this strategy disregards temporal proximity of the prediction to the actual sepsis onset and favors extreme values, which might not be representative of the patient’s overall condition. (III) Continuous Evaluation This approach evaluates a series of predictions made continuously for each patient over time ( 13 ). Specifically, predictions for all time points before sepsis onset are considered. In contrast to fixed horizon or peak score evaluation, which use a single prediction per patient with a label indicating whether the patient eventually develops sepsis, continuous evaluation assesses each time point individually, resulting in multiple predictions for every patient. All time points within the defined prediction window leading up to the onset are labeled “positive”, while time points before that window are labeled “negative”. The resulting performance metrics from this approach represent the model’s predictive efficiency across all time points, simulating its use for a randomly selected patient at a random time point. This view aligns well with clinical practice, where patients are continuously monitored without prior knowledge of when sepsis might occur ( 13 ). In summary, fixed horizon evaluation assesses the model’s ability to predict sepsis onset at a specific, pre-defined time point before its actual occurrence. 
The peak score evaluation measures the model’s performance at the time point when the model outputs the highest probability for an upcoming sepsis event, reflecting a threshold-based clinical decision-making approach. The continuous evaluation approach evaluates the model’s predictive performance continually across all time points, reflecting the real-world scenario of continuous patient monitoring. It remains uncertain how the performance varies across these approaches and which approach most accurately reflects clinical practice. We aimed to investigate how the choice of the evaluation approach influences the estimated performance of sepsis prediction models in an external validation context. Specifically, we compared the performance of models trained on a public intensive care dataset from the United States and subsequently applied to German intensive care data. In doing so, we also address the lack of studies involving large-scale ICU cohorts from the German healthcare context, where the efficacy of sepsis prediction models—given the distinct patient demographics, treatment approaches, and healthcare policies—remains largely uninvestigated. Methods Datasets We pre-trained models on the MIMIC-IV dataset, a well-known ICU dataset from the United States that is commonly employed to develop clinical prediction models ( 12 , 13 ). We applied these models to BerlinICU, a comprehensive dataset from eight ICUs from Charité – Universitätsmedizin Berlin in Germany, one of Europe’s largest university hospitals. The ICUs were distributed across two different sites (Campus Virchow Klinikum and Campus Charité Mitte). BerlinICU was extracted from the intensive care data management systems and the hospital information system present at the Charité – Universitätsmedizin Berlin. 
Specifically, we extracted the same clinical features that we used to pre-train the models, resulting in a total of 48 time-varying features, including vital parameters and laboratory results, along with 4 static features (age, sex, height and weight; Table S1 ). All clinical feature values outside of physiologically plausible ranges, as defined by the ricu package ( 21 ), were removed. The remaining data was aggregated into intervals of one hour using their median value. Missing values were imputed using a forward fill strategy. If no previous values were available, the training mean across all patients for that feature was used. Data were excluded from the dataset if they originated from non-intensive care wards, involved patients younger than 18 years, had a sepsis onset within 4 hours after admission, contained fewer than 6 hours of data or included gaps exceeding 12 hours. The dataset was restricted to data from the years 2012-2021 and limited to at most seven days following admission to the ICU ( Figure 1 ), following the training setup ( 17 ). Download figure Open in new tab Fig. 1. Preprocessing Pipeline. The BerlinICU dataset was extracted from the hospital information system and cleaned. We inferred sepsis onset according to the Sepsis-3-definition. For the logistic regression baseline models, we created additional temporal features. The final dataset was then passed to the models and evaluated. SOFA: Sequential Organ Failure Assessment score. MAP: Mean Arterial Pressure. ICU: Intensive Care Unit. Outcome definition To identify the onset of sepsis, we used the latest Sepsis-3-definition criteria, defined as the co-occurrence of a suspicion of infection and an increase of the SOFA score by at least 2 points ( 1 ). A suspicion of infection (SI) is present when antibiotic treatment is coupled with microbiological culture tests. 
As the BerlinICU dataset did not include microbiological data, we employed an alternative definition for SI similar to that followed in previous studies ( 16 , 17 ): We inferred the SI from antibiotic treatment administered at least once every 24 hours for at least 3 days, with the SI start marked by the beginning of this treatment ( Figure 1 ). We assumed the onset of sepsis if an increase of at least 2 points of the SOFA score occurred within a 48-hour window preceding and a 24-hour window following the start of SI. The sepsis onset time point was defined as the time point of the recorded SOFA increase. Model training We employed temporal convolutional network (TCN) models trained on the MIMIC-IV dataset, following the architecture and implementation described in ( 17 ). This choice is guided by a comprehensive study by Bai et al. ( 22 ), which demonstrated the superior performance of the TCNs in sequential data in various tests and domains. Our models were trained to predict, for each hourly bin, the individual risk of sepsis onset in the next 6 hours. We used a 10-times repeated random split scheme for hyperparameter tuning, resulting in 10 trained models for the hyperparameter set with the lowest average validation loss. For each repetition in this scheme, 20% of the data was held out as a test set. The remaining data was split into training (64%) and validation (16%). Training was performed using a binary cross-entropy loss and an Adam optimizer for a maximum of 1000 epochs with early stopping. To enhance model performance, binary missingness indicators were added for the 48 time-varying and 4 static features used as inputs. For establishing baselines in our analyses, we utilized logistic regression models. These were trained following a 5-fold cross-validation scheme in which 80% of the data was used for training and 20% for testing. To better capture the temporal dynamics, we additionally engineered temporal features for the 48 time-varying features. 
These temporal features represent the trajectory of the measurements over time and include the minimum, maximum, mean, median and variance of the last 4, 8 and 16 hours for the 48 laboratory/vital features, totaling 772 features (48 time-varying features from the hourly bins, their 720 temporal features, 4 static features). To account for the class imbalance, we used class weights inversely proportional to class frequencies. Training and evaluation were performed using Python 3.10.8, PyTorch 1.12.1 and scikit-learn 1.1.3. Model evaluation To focus on evaluating the real-world applicability of the models in a clinical setting, we employed three different evaluation strategies that scrutinized different aspects of the models’ performance ( Figure 2 ). For all strategies, we report the mean area under the receiver operating characteristic curve (AUROC). To estimate variability of performance metrics, we applied bootstrap resampling (1000 samples). For each evaluation setting that included a prediction horizon, we only included those patients whose length of stay matched or exceeded the defined horizon. Download figure Open in new tab Fig. 2. Overview of the evaluation strategies. For the fixed horizon evaluation , a prediction horizon was defined, where predictions were made for a time point that preceded the onset (or control onset) by a pre-defined number of hours (6 hours in this example). Only the data from admission up to this time point was used to make the prediction (green rectangle). Labels were assigned as “positive” (marked as orange ‘1’s) for sepsis patients or “negative” (marked as blue ‘0’s) for control patients. In the peak score evaluation , the maximum prediction score across all time points prior to onset was determined (purple rectangle). Sepsis patients were labeled “positive”, while control patients were labeled “negative”. 
In the continuous evaluation (brown rectangle), for sepsis patients, time points within a specific time window before the onset (6 hours in this example) were labeled “positive”, while earlier time points – and all time points for control patients – were labeled “negative”. All time points before the onset (or control onset) were considered for evaluation. For sepsis patients (top) , the onset was the actual sepsis onset. For control patients (bottom) , the onset was defined either as a hypothetical control onset matched to the onset time of a sepsis patient (“matched onset”) or as the last available data point, either at discharge or at 7 days after admission, whichever occurred first. The rectangular frames represent the data that was ultimately included for each evaluation strategy. Shown here are the default settings for each evaluation strategy: the fixed horizon and peak score evaluations use matched sepsis onset times, while the continuous evaluation is depicted without onset matching. A key aspect of the evaluation strategies is the handling of time series data for non-septic patients (controls): While septic patients (cases) have a clearly defined sepsis onset that marks the endpoint of their time series for analysis, non-septic patients lack such a reference point. Therefore, we employed two commonly used approaches to define the end-point of their time series for analysis: No Onset Matching The hypothetical onset for control patients is defined as the last available data point, either at discharge or at 7 days after admission, whichever occurred first. Onset Matching Onsets of control patients are aligned with those of septic patients by randomly pairing each control patient with a sepsis patient whose time to sepsis onset does not exceed that of the control patient. The onset time of the matched sepsis patient is then used as the hypothetical onset time for the control patient ( 16 , 19 , 23 ). 
Fixed horizon evaluation Fixed horizon evaluation uses all data up to a certain time point before sepsis onset, known as the prediction horizon ($h$), to predict the upcoming onset of sepsis. Here, the predicted score $y_{\text{pred}}$, which is based on all available data for the patient up to $h$ hours before the sepsis onset for sepsis patients or up to $h$ hours before a hypothetical onset for control patients ($t_{\text{onset}}$), and the true label $y_{\text{true}}$ are given by: $$y_{\text{pred}} = f\left(X_{1:t_{\text{onset}}-h}\right), \qquad y_{\text{true}} = \begin{cases} 1 & \text{if the patient develops sepsis} \\ 0 & \text{otherwise} \end{cases}$$ where $X_{1:t}$ is the patient data (feature matrix) from time point 1 to $t$ and $f$ is the model that outputs a score correlated to the probability of a sepsis onset. Peak score evaluation To evaluate the model’s performance throughout the entire patient encounter, the maximum model prediction score from ICU admission to the onset of sepsis or control onset was determined, resulting in a single score/label pair for each patient for performance score calculation. This method focuses on the peak confidence of the model for sepsis prediction during a patient’s stay. Here, the prediction score and true label are given by: $$y_{\text{pred}} = \max_{1 \le t < t_{\text{onset}}} f\left(X_{1:t}\right), \qquad y_{\text{true}} = \begin{cases} 1 & \text{if the patient develops sepsis} \\ 0 & \text{otherwise} \end{cases}$$ For horizon-dependent analysis, we used the peak score within the defined horizon instead of across all time points of each patient. Continuous evaluation To evaluate the model’s capability to predict the onset of sepsis at a randomly selected time point for a randomly selected patient, we implemented a continuous horizon-based prediction strategy. This included first establishing a specific time horizon (e.g., 6 hours) to assess the model’s ability to predict the sepsis onset at any given time point within that timeframe. For sepsis patients, a positive label was assigned to all time points within the horizon, while all other time points were labeled as negative. For control patients, all time points were labeled as negative. 
We included the prediction scores from each hourly interval up to the onset of sepsis in sepsis patients or the hypothetical control onset in control patients ($t_{\text{onset}}$), excluding the prediction at the actual onset (or control onset). The prediction score and true label are given by: $$y_{\text{pred},t} = f\left(X_{1:t}\right), \qquad y_{\text{true},t} = \begin{cases} 1 & \text{if the patient develops sepsis and } t_{\text{onset}} - h \le t < t_{\text{onset}} \\ 0 & \text{otherwise} \end{cases}$$ This procedure generated a sequence of hourly score/label pairs for each patient: $$\left\{\left(y_{\text{pred},t},\; y_{\text{true},t}\right)\right\}_{t=1}^{t_{\text{onset}}-1}$$ To accommodate variations in the length of these sequences among patients, we applied inverse frequency weighting to the samples when computing performance metrics. Ethics approval This study was approved by the local ethics committee (Ethikausschuss am Campus Virchow-Klinikum, Charité—Universitätsmedizin Berlin, Chairperson PD Dr. E. Kaschina, Application Number EA2/137/22, approval date: 12 Jul 2022, amendment date: 10 May 2023). Results Study cohort From 60,332 ICU stays in the initial dataset, after extraction, preprocessing and filtering the final dataset consisted of 40,132 ICU stays of 36,872 unique patients spanning 10 years (2012-2021). 4,134 stays were septic, resulting in a sepsis prevalence of 10.3% ( Figure 3 , Table 1 ). The original MIMIC-IV training dataset consisted of 67,056 ICU stays with a sepsis prevalence of 5.6% (3,730 ICU stays). View this table: View inline View popup Download powerpoint Table 1. Patient characteristics of BerlinICU and MIMIC-IV for the total dataset, the sepsis cohort and non-sepsis cohort. Download figure Open in new tab Fig. 3. Flow diagram of patient eligibility criteria. Influence of evaluation strategy on model performance estimates Applying the TCN models pre-trained on MIMIC-IV to the MIMIC-IV test set for internal validation under a scenario aligned with the training setup - continuous evaluation using a 6-hour prediction horizon without onset matching - resulted in an AUROC of 0.84 (95% CI: 0.83-0.85, Figure 4a ). 
When the same models were applied to BerlinICU for external validation under the same evaluation conditions, performance dropped to 0.67 (95% CI: 0.66-0.68, Figure 4b ). Download figure Open in new tab Fig. 4. Comparison of model performance of temporal convolutional network (TCN) models for different evaluation strategies. The models were applied to the test splits of the MIMIC-IV training dataset (a) and the entire German ICU dataset (b; BerlinICU). Model performance was evaluated using fixed horizon evaluation (6h horizon), peak score evaluation (entire patient stay) and continuous evaluation (6h horizon) without onset matching (dark bars) and with onset matching (light bars). The AUROC value inside the bar represents the mean AUROC across all bootstrap samples and models. AUROC: area under the receiver operating characteristic. Error bars: 95% confidence intervals. To assess the impact of different evaluation strategies, we performed fixed horizon evaluation under the same conditions (6-hour prediction horizon, no onset matching) and peak score evaluation, which does not employ a prediction horizon. On the MIMIC-IV test set, fixed horizon evaluation resulted in a slightly higher AUROC of 0.87 (95% CI: 0.85-0.88), whereas peak score evaluation showed significantly reduced performance of 0.76 (95% CI: 0.74-0.77). In external validation on BerlinICU, fixed horizon evaluation resulted in a significantly reduced AUROC (0.61 [95% CI: 0.60-0.62]) compared to continuous evaluation (ΔAUROC −0.06), while the peak score evaluation estimate (0.66 [95% CI: 0.65-0.67]) was similar to that of the continuous evaluation (ΔAUROC −0.01). Across all evaluation strategies, performance estimates were consistently lower on BerlinICU compared to the MIMIC-IV test set. 
Aligning the length of stay distribution of control patients to sepsis patients through onset matching led to a notable AUROC reduction in internal validation on the MIMIC-IV test set for continuous evaluation (ΔAUROC −0.06) and fixed horizon evaluation (ΔAUROC −0.14), while AUROC estimates for peak score evaluation increased (ΔAUROC 0.04). On BerlinICU, onset matching had minimal impact on performance estimates during continuous evaluation or fixed horizon evaluation. In contrast, for peak score evaluation, onset matching increased AUROC estimates (ΔAUROC 0.08). Across all evaluation strategies, TCN models consistently outperformed logistic regression baseline models ( Figure S1 ). The only exception was observed in fixed horizon evaluation on BerlinICU without onset matching, where logistic regression showed a superior performance (AUROC 0.64 [95% CI: 0.63–0.65]) compared to TCN (AUROC 0.61 [95% CI: 0.60–0.62]). Influence of the prediction horizon on model performance estimates To evaluate the ability of the models to provide timely predictions, we assessed their performance across a range of prediction horizons ( Figure 5 ). Performance estimates generally improved with shorter prediction horizons across all evaluation strategies. During internal validation on the MIMIC-IV test set, fixed horizon evaluation and continuous evaluation showed consistent improvement, with the highest AUROC achieved at a 1-hour horizon (fixed horizon: AUROC 0.92 [95% CI: 0.91-0.93], continuous: AUROC 0.87 [95% CI: 0.85-0.88]). Peak score plateaued around the 6-hour horizon (AUROC 0.92 [95% CI: 0.91-0.93]), with only marginal improvements for smaller horizons. In external validation on BerlinICU, performance also generally increased with shorter prediction windows, with few exceptions including a plateau around the 6-hour horizon in fixed horizon and peak score evaluation with onset matching. Download figure Open in new tab Fig. 5. 
Performance metrics across prediction horizons, comparing the performance of temporal convolutional network (TCN) models for an evaluation with onset matching (dashed lines) and without onset matching (solid lines). The models were applied to the test splits of the MIMIC-IV training dataset (blue) and the entire German ICU dataset (BerlinICU; green). Shown is the AUROC based on varying prediction horizons from one hour before onset up to 100 hours before onset. AUROC: area under the receiver operating characteristic curve. Error bars: 95% confidence intervals. Onset matching consistently decreased performance estimates during internal validation on the MIMIC-IV test set across all evaluation strategies, albeit to different extents, with continuous evaluation showing the smallest susceptibility to the matching approach. On BerlinICU, onset matching impacted strategies differently depending on the prediction horizon. Notably, continuous evaluation remained more robust to onset matching, maintaining more consistent AUROCs compared to the other strategies across different prediction horizons, particularly for very short and long prediction horizons. Comparing the TCN and logistic regression baseline models in internal validation on the MIMIC-IV test set, TCN models consistently outperformed logistic regression models across all evaluation strategies and prediction horizons. In contrast, during external validation on BerlinICU without onset matching, logistic regression models approached or even exceeded the performance of TCN models at certain prediction horizons ( Figure S2 ). Discussion In this study, we assessed the impact of commonly used evaluation strategies – fixed horizon, peak score and continuous evaluation – on the performance of machine learning models for sepsis onset prediction. Our results demonstrate that the choice of the evaluation strategy significantly influences performance estimates, even though the same model is used on the same dataset. 
We further addressed a gap in research involving the German healthcare system by aggregating an ICU dataset – called BerlinICU – from one of the largest university hospitals in Europe, Charité – Universitätsmedizin Berlin. Including more than 40,000 admissions from eight ICU wards, this dataset is one of the largest ICU datasets in Europe, exceeding the size of the currently available European datasets AUMCdb ( 24 ), HiRID-I ( 25 ) and SICdb ( 26 ). To the best of our knowledge, this is one of the first studies to explore how sepsis prediction models perform on large-scale German ICU data. Fixed horizon evaluation requires prospective knowledge of sepsis onset Fixed horizon evaluation uses data up to a pre-defined time point before sepsis onset – the prediction horizon – to make predictions. Performance decreased with longer prediction horizons due to increasing temporal distance from sepsis onset, but remained at an overall high level. Without onset matching, performance estimates strongly increased for all horizons on the MIMIC-IV test set and for most horizons for BerlinICU, with the exception of the 6-hour horizon. This performance increase when not using onset matching is most parsimoniously explained by the evaluation of control patients over their entire stay, with emphasis on the period just before discharge. During these pre-discharge hours, control patients are typically in a stable condition, making it easier for the model to correctly identify them as low-risk for sepsis. This effect, also noted by Futoma ( 19 ), shows the significant impact of onset alignment. Despite this significant influence, many studies do not comment on the implementation of case-control matching ( 13 ). While the fixed horizon approach achieved the highest AUROC in a study on different evaluation strategies ( 18 ), we observed more nuanced performance differences compared to the other evaluation approaches depending on the dataset, horizon and onset matching approach. 
Generally, interpretation of performance metrics computed through the fixed horizon approach is challenging, as it relies on prospective information – the time point of the sepsis onset – which is not accessible in a real-world clinical setting. Peak score evaluation is influenced by the length of stay distribution and requires onset matching to avoid bias The peak score evaluation assesses the model’s peak confidence in predicting sepsis throughout the entire ICU stay. This approach seems intuitively aligned with clinical practice, where a fixed threshold would be pre-determined, and an alarm is triggered once the prediction model’s output surpasses it — potentially just once per patient, with subsequent alarms suppressed ( 16 , 20 ). The focus is on whether an alarm is raised at any point during the patient’s stay, which is replicated by determining the maximum prediction score. However, the performance metrics determined under this approach are influenced by differences in length of stay distributions between cases and controls: longer time series for control patients increase the likelihood of higher maximum prediction scores for these patients simply due to random fluctuations, ultimately reducing the AUROC as the maximum scores for control patients approach those of sepsis patients on average ( Figure S4 ). Importantly, this bias conflicts with the core interpretation of the AUROC, which measures the probability that a randomly selected true positive will have a higher predicted score than a randomly selected true negative. Our results demonstrate this behavior, showing that performance estimates without onset matching – where control cases tend to have longer length of stays – were significantly reduced compared to analyses with onset matching ( Figure 4 ). To mitigate this effect, an onset matching approach that aligns the length of stay distributions between positive and negative cases is strictly necessary when applying this evaluation strategy. 
The discrepancy between the peak score evaluation across all patients ( Figure 4 ), where onset matching increases the AUROC, and horizon-dependent analysis ( Figure 5 ), where onset matching decreases the AUROC, can be explained by the exclusion of patients with insufficient stay lengths in the latter. This exclusion alters the length of stay distribution and thereby significantly impacts performance metrics. Not filtering patients with insufficient lengths of stay shows a distinct interaction effect between onset matching and prediction horizon ( Figure S3 ), further highlighting the critical influence of length of stay distribution on the performance estimates in the peak score evaluation approach. Continuous evaluation without onset matching reflects clinical practice Continuous prediction evaluation showed increased performance with shorter prediction windows, demonstrating the model’s effectiveness in utilizing short-term, relevant data to predict sepsis onset. As the prediction window narrows and focuses more on the actual onset of sepsis, the assigned sepsis labels align more closely with the sepsis-related signals in the patient data, enhancing prediction efficiency. Interestingly, although the models were trained to predict sepsis 6 hours in advance, this specific labeling window did not yield the best performance. Evaluating on BerlinICU with onset matching, model performance was significantly reduced compared to the MIMIC-IV test set (ΔAUROC −0.11 for 6 hours before onset; −0.08 for 1 hour). This reduced performance when transferring models between datasets was also evident in a previous study where models trained on MIMIC-IV were evaluated on other European datasets using a horizon-based evaluation with a six hour window ( 17 ). Even though the performance in that study declined for European datasets (HiRID-I: −0.07, AUMCdb: −0.09) as well as for another US dataset (eICU: −0.10), it was not as severe as observed in BerlinICU. 
Without onset matching, performance in the MIMIC-IV test set was higher across horizons, and was similar or better on BerlinICU for most horizons. Again, this higher performance without onset matching most likely reflects the inclusion of time points closer to discharge for control patients, who generally have a healthier profile, making the prediction task easier. While onset matching may provide a focused assessment in more challenging cases, not using it better mirrors clinical practice, where patients are monitored continuously from admission to discharge. In our view, continuous evaluation without onset matching better reflects real-world clinical application, as the AUROC represents the likelihood of a positive case being ranked higher than a negative one, including time points closer to discharge. Although this approach requires pre-determining a time window, which indicates how far in the future the onset may be predicted, we consider continuous evaluation the better choice for two reasons: First, defining a time window — ideally based on pathophysiological plausibility — ensures that only alarms occurring within a clinically meaningful period before sepsis onset are considered. This distinction is crucial, as some published models show high performance days to weeks before the actual event ( 27 – 29 ), likely detecting broader, correlated health-related features rather than sepsis specifically. This underlines the importance of thoroughly defining and testing what these models are predicting to ensure their practical utility in clinical settings. Second, varying the prediction window and reporting the performance across all these windows allows for a more nuanced understanding of the models’ ability to predict event onset in advance. Limitations Our study has several limitations: First, we did not have access to microbiological data, requiring an adjustment of the Sepsis-3 definition. 
This approach, however, was also chosen in other studies when using datasets from hospitals that did not provide microbiological data ( 16 , 17 ). Second, we used models trained for a 6-hour horizon and applied them across various prediction horizons to predict sepsis onset. While these models may not perform optimally outside their original specifications, the inability to know the exact duration until a potential sepsis onset at any given timepoint in clinical practice necessitates using a flexible approach. Thus, by testing these models beyond their intended parameters, we can evaluate their robustness and generalizability to different clinical scenarios and timelines. Comparison with prior work Our work combines the idea of generalizability across countries ( 16 , 17 ) and suitable metric calculation concepts ( 16 , 30 ). Our study builds on these previous concepts, refining and extending them as needed. While Moor and Rockenschaub examined the influence of various algorithms and training sets from different countries, our work focuses on different evaluation strategies. We applied sepsis onset matching ( 19 , 30 ) and peak score evaluation ( 16 ), combining approaches that were presented separately. As the peak score evaluation is only suitable for retrospective analysis, we also applied a continuous evaluation strategy ( 13 ). Moreover, there are only a few studies that have applied models to German data ( 29 , 31 ), none of which has a dedicated focus on comparing performances in different evaluation strategies. Several studies have applied sepsis prediction models prospectively to US data ( 14 , 15 , 32 ), but they often lack details on how the models were trained and how data was preprocessed. This lack of transparency is a common issue, as noted in two systematic reviews of sepsis prediction models ( 12 , 13 ). 
Accordingly, common checklists for reporting prediction models include items for external validation in different countries and time periods ( 33 , 34 ). Conclusions Our study highlights the critical need to choose an evaluation approach that is aligned with the intended interpretation of the resulting performance metric, as different approaches yield markedly different performance estimates despite using the same model on the same dataset. Importantly, for the fixed horizon and peak score evaluation approaches, a carefully devised onset matching strategy is crucial to avoid skewed results that may not reflect the true model performance. In our view, the continuous evaluation approach better reflects clinical reality. Despite requiring specifying the horizon time window, it offers insights into the model’s effectiveness in a clinical setting. Data Availability Aggregated data produced in the present study are available upon reasonable request to the authors. Individual patient-level data can not be shared due to data privacy regulations. Funding This work received funding from the Jürgen Manchot Stiftung. Conflicts of Interests Sebastian Boie is a salaried employee at Pfizer Pharma GmbH and a visiting researcher at the institute of Medical Informatics, Charité – Universitätsmedizin Berlin. Pfizer had no involvement in the conception, design, execution, or interpretation of the study, nor in the preparation or decision to submit the manuscript for publication. The authors declare no conflicts of interest regarding this study. Supplementary Appendix A. Features This section lists the clinical features used in the prediction models, categorized into static variables, vital signs, arterial blood gas analysis, electrolytes, blood count, and other laboratory parameters. View this table: View inline View popup Download powerpoint Supplementary Table S1. Clinical features used in the prediction models. B. 
Cohort Sizes This section provides an overview of the dataset sizes across different prediction horizons, including the number of total patients, sepsis cases, and prevalence rates for MIMIC-IV and BerlinICU cohorts. View this table: View inline View popup Download powerpoint Supplementary Table S2. Cohort sizes for the time-dependent evaluation. Shown is the number of patients in each cohort and dataset after filtering for patients whose length of stay matches or exceeds the prediction horizon. For BerlinICU, the total cohort size refers to the complete dataset, while for MIMIC-IV the mean, min and max across all test sets is described (TCN: 10 test sets, LogReg: 5 test sets). TCN: Temporal Convolutional Network. LogReg: Logistic Regression. C. Baseline Models This section presents the performance of logistic regression models across different evaluation strategies, including fixed horizon, peak score, and continuous evaluation, comparing results between MIMIC-IV and BerlinICU datasets. Download figure Open in new tab Supplementary Figure S1. Comparison of model performance of logistic regression models for different evaluation strategies. The models were applied to the test splits of the MIMIC-IV training dataset (a) and the entire German ICU dataset (b; BerlinICU). Model performance was evaluated using fixed horizon evaluation (6h horizon), peak score evaluation (entire patient stay) and continuous evaluation (6h horizon) without onset matching (dark bars) and with onset matching (light bars). The AUROC value inside the bar represents the mean AUROC across all bootstrap samples and models. AUROC: area under the receiver operating characteristic. Error bars: 95% confidence intervals. Download figure Open in new tab Supplementary Figure S2. Performance metrics across prediction horizons for logistic regression models, comparing the performance for an evaluation with onset matching (dashed lines) and without onset matching (solid lines). 
The models were applied to the test splits of the MIMIC-IV training dataset (blue) and the entire German ICU dataset (BerlinICU; green). Shown is the AUROC based on varying prediction horizons from one hour before onset up to 100 hours before onset. AUROC: area under the receiver operating characteristic. Error bars: 95% confidence intervals. Download figure Open in new tab Supplementary Figure S3. Performance metrics across prediction horizons, without filtering for length of stay (LOS), comparing the performance of temporal convolutional network (TCN, a-b) and logistic regression (LogReg, c-d) models for an evaluation with onset matching (dashed lines) and without onset matching (solid lines). The models were applied to the test splits of the MIMIC-IV training dataset (blue) and the entire German ICU dataset (BerlinICU; green). Shown is the AUROC based on varying prediction horizons from one hour before onset up to 24 hours before onset. Each prediction horizon consisted of the same patients. If the LOS of a patient was shorter than the prediction horizon, then evaluation strategies were applied to the data points that were available. AUROC: area under the receiver operating characteristic. Error bars: 95% confidence intervals. D. Peak Score Evaluation Bias This section illustrates the potential bias introduced by peak score evaluation using simulated random data, demonstrating how the number of samples per patient affects classifier performance, even when the underlying data distributions are identical. Download figure Open in new tab Supplementary Figure S4. Peak Score Evaluation on Random Data with Increasing Maximum Number of Samples per Negative Patient. The figure illustrates the behavior of a classifier on simulated data where there is no inherent difference between the negative and positive patients. Both groups consist of 10,000 patients each, with scores drawn from the same random Gaussian distribution. 
The only distinction between the groups is that each positive patient has up to 10 samples, while the negative patients have varying maximum numbers of samples, as indicated in the legend. The results demonstrate that when taking the maximum score for each patient, the classifier metric becomes increasingly skewed as the number of samples for each negative patient deviates more from the number of samples per positive patient. This disparity between the number of samples per class creates an artificial boost or dilution in performance, even though there is no true difference between the underlying data distributions for the two groups. The diagonal line (“Random classifier”) shows the baseline performance of a random classifier, and the comparison highlights the inflation of the metric due to unequal sampling. ACKNOWLEDGEMENTS We thank Fabian Schreiber and Falk Meyer-Eschenbach for their support in data extraction. The authors also acknowledge the Scientific Computing of the IT Division at the Charité - Universitätsmedizin Berlin for providing computational resources that have contributed to the research results reported in this paper. References 1. ↵ Singer M. , Deutschman C. S. , Seymour C. W. , Shankar-Hari M. , Annane D. , Bauer M. , Bellomo R. , Bernard G. R. , Chiche J.-D. , Coopersmith C. M. , Hotchkiss R. S. , Levy M. M. , Marshall J. C. , Martin G. S. , Opal S. M. , Rubenfeld G. D. , Poll T. , Vincent J.-L. , and Angus D. C. The Third International Consensus Definitions for Sepsis and Septic Shock (Sepsis-3) . JAMA , 315 ( 8 ): 801 – 810 , feb 23 2016 . ISSN 0098-7484 . doi: 10.1001/jama.2016.0287 . OpenUrl CrossRef PubMed 2. ↵ Rudd K. E. , Johnson S. C. , Agesa K. M. , Shackelford K. A. , Tsoi D. , Kievlan D. R. , Colombara D. V. , Ikuta K. S. , Kissoon N. , Finfer S. , Fleischmann-Struzek C. , Machado F. R. , Reinhart K. K. , Rowan K. , Seymour C. W. , Watson R. S. , West T. E. , Marinho F. , Hay S. I. , Lozano R. , Lopez A. D. , Angus D. C. , Murray C. 
J. L. , and Naghavi M. Global, regional, and national sepsis incidence and mortality, 1990–2017: analysis for the Global Burden of Disease Study . Lancet (London, England) , 395 ( 10219 ): 200 – 211 , jan 18 2020 . ISSN 0140-6736 . doi: 10.1016/S0140-6736(19)32989-7 . OpenUrl CrossRef PubMed 3. ↵ Fleischmann C. , Thomas-Rueddel D. O. , Hartmann M. , Hartog C. S. , Welte T. , Heublein S. , Dennler U. , and Reinhart K. Hospital Incidence and Mortality Rates of Sepsis . Deutsches Arzteblatt International , 113 ( 10 ): 159 – 166 , mar 11 2016 . ISSN 1866-0452 . doi: 10.3238/arztebl.2016.0159 . OpenUrl CrossRef PubMed 4. ↵ Torio C. M. and Moore B. J. National Inpatient Hospital Costs: The Most Expensive Conditions by Payer , 2013 . Agency for Healthcare Research and Quality (US), Rockville (MD) , 2006. Times cited: 1 PMID: 27359025 . OpenUrl PubMed 5. ↵ Winters B. D. , Eberlein M. , Leung J. , Needham D. M. , Pronovost P. J. , and Sevransky J. E. Long-term mortality and quality of life in sepsis: a systematic review . Critical Care Medicine , 38 ( 5 ): 1276 – 1283 , 5 2010 . ISSN 1530-0293 . doi: 10.1097/CCM.0b013e3181d8cc1d . OpenUrl CrossRef PubMed Web of Science 6. ↵ Ferrer R. , Martin-Loeches I. , Phillips G. , Osborn T. M. , Townsend S. , Dellinger R. P. , Artigas A. , Schorr C. , and Levy M. M. Empiric antibiotic treatment reduces mortality in severe sepsis and septic shock from the first hour: results from a guideline-based performance improvement program . Critical Care Medicine , 42 ( 8 ): 1749 – 1755 , 8 2014 . ISSN 1530-0293 . doi: 10.1097/CCM.0000000000000330 . OpenUrl CrossRef PubMed 7. Pruinelli L. , Westra B. L. , Yadav P. , Hoff A. , Steinbach M. , Kumar V. , Delaney C. W. , and Simon G. Delay Within the 3-Hour Surviving Sepsis Campaign Guideline on Mortality for Patients With Severe Sepsis and Septic Shock . Critical Care Medicine , 46 ( 4 ): 500 – 505 , 4 2018 . ISSN 1530-0293 . doi: 10.1097/CCM.0000000000002949 . OpenUrl CrossRef PubMed 8. 
↵ Seymour C. W. , Gesten F. , Prescott H. C. , Friedrich M. E. , Iwashyna T. J. , Phillips G. S. , Lemeshow S. , Osborn T. , Terry K. M. , and Levy M. M. Time to Treatment and Mortality during Mandated Emergency Care for Sepsis . The New England Journal of Medicine , 376 ( 23 ): 2235 – 2244 , jun 8 2017 . ISSN 1533-4406 . doi: 10.1056/NEJMoa1703058 . OpenUrl CrossRef PubMed 9. ↵ Evans L. , Rhodes A. , Alhazzani W. , Antonelli M. , Coopersmith C. M. , French C. , Machado F. R. , Mcintyre L. , Ostermann M. , Prescott H. C. , Schorr C. , Simpson S. , Joost Wiersinga W. , Alshamsi F. , Angus D. C. , Arabi Y. , Azevedo L. , Beale R. , Beilman G. , Belley-Cote E. , Burry L. , Cecconi M. , Centofanti J. , Yataco A. C. , De Waele J. , Dellinger R. P. , Doi K. , Du B. , Estenssoro E. , Ferrer R. , Gomersall C. , Hodgson C. , Møller M. H. , Iwashyna T. , Jacob S. , Kleinpell R. , Klompas M. , Koh Y. , Kumar A. , Kwizera A. , Lobo S. , Masur H. , McGloughlin S. , Mehta S. , Mehta Y. , Mer M. , Nunnally M. , Oczkowski S. , Osborn T. , Papathanassoglou E. , Perner A. , Puskarich M. , Roberts J. , Schweickert W. , Seckel M. , Sevransky J. , Sprung C. L. , Welte T. , Zimmerman J. , and Levy M. Executive Summary: Surviving Sepsis Campaign: International Guidelines for the Management of Sepsis and Septic Shock 2021 . Critical Care Medicine , 49 ( 11 ): 1974 , 11 2021 . ISSN 0090-3493 . doi: 10.1097/CCM.0000000000005357 . OpenUrl CrossRef PubMed 10. Evans L. , Rhodes A. , Alhazzani W. , Antonelli M. , Coopersmith C. M. , French C. , Machado F. R. , Mcintyre L. , Ostermann M. , Prescott H. C. , Schorr C. , Simpson S. , Wiersinga W. J. , Alshamsi F. , Angus D. C. , Arabi Y. , Azevedo L. , Beale R. , Beilman G. , Belley-Cote E. , Burry L. , Cecconi M. , Centofanti J. , Coz Yataco A. , De Waele J. , Dellinger R. P. , Doi K. , Du B. , Estenssoro E. , Ferrer R. , Gomersall C. , Hodgson C. , Møller M. H. , Iwashyna T. , Jacob S. , Kleinpell R. , Klompas M. , Koh Y. , Kumar A. , Kwizera A. 
, Lobo S. , Masur H. , McGloughlin S. , Mehta S. , Mehta Y. , Mer M. , Nunnally M. , Oczkowski S. , Osborn T. , Papathanassoglou E. , Perner A. , Puskarich M. , Roberts J. , Schweickert W. , Seckel M. , Sevransky J. , Sprung C. L. , Welte T. , Zimmerman J. , and Levy M. Surviving sepsis campaign: international guidelines for management of sepsis and septic shock 2021 . Intensive Care Medicine , 47 ( 11 ): 1181 – 1247 , 2021 . ISSN 0342-4642 . doi: 10.1007/s00134-021-06506-y . OpenUrl CrossRef PubMed 11. ↵ Levy M. M. , Evans L. E. , and Rhodes A. The Surviving Sepsis Campaign Bundle: 2018 update . Intensive Care Medicine , 44 ( 6 ): 925 – 928 , jun 1 2018 . ISSN 1432-1238 . doi: 10.1007/s00134-018-5085-0 . OpenUrl CrossRef PubMed 12. ↵ Fleuren L. M. , Klausch T. L. T. , Zwager C. L. , Schoonmade L. J. , Guo T. , Roggeveen L. F. , Swart E. L. , Girbes A. R. J. , Thoral P. , Ercole A. , Hoogendoorn M. , and Elbers P. W. G. Machine learning for the prediction of sepsis: a systematic review and meta-analysis of diagnostic test accuracy . Intensive Care Medicine , 46 ( 3 ): 383 – 400 , 3 2020 . ISSN 1432-1238 . doi: 10.1007/s00134-019-05872-y . OpenUrl CrossRef PubMed 13. ↵ Moor M. , Rieck B. , Horn M. , Jutzeler C. R. , and Borgwardt K. Early Prediction of Sepsis in the ICU Using Machine Learning: A Systematic Review . Frontiers in Medicine , 8 : 607952 , may 28 2021 . ISSN 2296-858X . doi: 10.3389/fmed.2021.607952 . OpenUrl CrossRef PubMed 14. ↵ Adams R. , Henry K. E. , Sridharan A. , Soleimani H. , Zhan A. , Rawat N. , Johnson L. , Hager D. N. , Cosgrove S. E. , Markowski A. , Klein E. Y. , Chen E. S. , Saheed M. O. , Henley M. , Miranda S. , Houston K. , Linton R. C. , Ahluwalia A. R. , Wu A. W. , and Saria S. Prospective, multi-site study of patient outcomes after implementation of the TREWS machine learning-based early warning system for sepsis . Nature Medicine , 28 ( 7 ): 1455 – 1460 , 7 2022 . ISSN 1546-170X . doi: 10.1038/s41591-022-01894-0 . 
OpenUrl CrossRef PubMed 15. ↵ Burdick H. , Pino E. , Gabel-Comeau D. , McCoy A. , Gu C. , Roberts J. , Le S. , Slote J. , Pellegrini E. , Green-Saxena A. , Hoffman J. , and Das R. Effect of a sepsis prediction algorithm on patient mortality, length of stay and readmission: a prospective multicentre clinical outcomes evaluation of real-world patient data from US hospitals . BMJ health & care informatics , 27 ( 1 ): e100109 , 4 2020 . ISSN 2632-1009 . doi: 10.1136/bmjhci-2019-100109 . OpenUrl Abstract / FREE Full Text 16. ↵ Moor M. , Bennett N. , Plečko D. , Horn M. , Rieck B. , Meinshausen N. , Bühlmann P. , and Borgwardt K. Predicting sepsis using deep learning across international sites: a retrospective development and validation study . eClinicalMedicine , 62 , aug 1 2023 . ISSN 2589-5370 . doi: 10.1016/j.eclinm.2023.102124 . OpenUrl CrossRef PubMed 17. ↵ Rockenschaub P. , Hilbert A. , Kossen T. , Elbers P. , Dincklage F. , Madai V. I. , and Frey D. The Impact of Multi-Institution Datasets on the Generalizability of Machine Learning Prediction Models in the ICU . Critical Care Medicine, page 10.1097/CCM.0000000000006359 . ISSN 0090-3493 . doi: 10.1097/CCM.0000000000006359 . OpenUrl CrossRef PubMed 18. ↵ Lauritsen S. M. , Thiesson B. , Jørgensen M. J. , Riis A. H. , Espelund U. S. , Weile J. B. , and Lange J. The Framing of machine learning risk prediction models illustrated by evaluation of sepsis in general wards . npj Digital Medicine , 4 ( 1 ): 1 – 12 , nov 15 2021 . ISSN 2398-6352 . doi: 10.1038/s41746-021-00529-x . OpenUrl CrossRef PubMed 19. ↵ Futoma J. , Hariharan S. , Heller K. , Sendak M. , Brajer N. , Clement M. , Bedoya A. , and O’Brien C. An Improved Multi-Output Gaussian Process RNN with Real-Time Validation for Early Sepsis Detection . In Proceedings of the 2nd Machine Learning for Healthcare Conference , pages 243 – 254 . PMLR , nov 6 2017 . Times cited: 4 ISSN: 2640-3498 . 20. ↵ Wong A. , Otles E. , Donnelly J. P. , Krumm A. , McCullough J. 
, DeTroyer-Cooley O. , Pestrue J. , Phillips M. , Konye J. , Penoza C. , Ghous M. , and Singh K. External Validation of a Widely Implemented Proprietary Sepsis Prediction Model in Hospitalized Patients . JAMA Internal Medicine , 181 ( 8 ): 1065 – 1070 , aug 1 2021 . ISSN 2168-6106 . doi: 10.1001/jamainternmed.2021.2626 . OpenUrl CrossRef PubMed 21. ↵ Bennett N. , Plečko D. , Ukor I.-F. , Meinshausen N. , and Bühlmann P. ricu: R’s interface to intensive care data . GigaScience , 12 : giad041 , dec 28 2022 . ISSN 2047-217X . doi: 10.1093/gigascience/giad041 . OpenUrl CrossRef 22. ↵ Bai S. , Kolter J. Z. , and Koltun V. An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling . apr 19 2018 . doi: 10.48550/arXiv.1803.01271 . OpenUrl CrossRef 23. ↵ Lichtner G. , Balzer F. , Haufe S. , Giesa N. , Schiefenhövel F. , Schmieding M. , Jurth C. , Kopp W. , Akalin A. , Schaller S. J. , Weber-Carstens S. , Spies C. , and Dincklage F. Predicting lethal courses in critically ill COVID-19 patients using a machine learning model trained on patients with non-COVID-19 viral pneumonia . Scientific Reports , 11 ( 1 ): 13205 , jun 24 2021 . ISSN 2045-2322 . doi: 10.1038/s41598-021-92475-7 . OpenUrl CrossRef PubMed 24. ↵ Thoral P. J. , Peppink J. M. , Driessen R. H. , Sijbrands E. J. G. , Kompanje E. J. O. , Kaplan L. , Bailey H. , Kesecioglu J. , Cecconi M. , Churpek M. , Clermont G. , Schaar M. , Ercole A. , Girbes A. R. J. , and Elbers P. W. G. Sharing ICU Patient Data Responsibly Under the Society of Critical Care Medicine/European Society of Intensive Care Medicine Joint Data Science Collaboration: The Amsterdam University Medical Centers Database (AmsterdamUMCdb) Example* . Critical Care Medicine , 49 ( 6 ): e563 , 6 2021 . ISSN 0090-3493 . doi: 10.1097/CCM.0000000000004916 . OpenUrl CrossRef 25. ↵ Hyland S. L. , Faltys M. , Hüser M. , Lyu X. , Gumbsch T. , Esteban C. , Bock C. , Horn M. , Moor M. , Rieck B. , Zimmermann M. , Bodenham D. 
, Borgwardt K. , Rätsch G. , and Merz T. M. Early prediction of circulatory failure in the intensive care unit using machine learning . Nature Medicine , 26 ( 3 ): 364 – 373 , 3 2020 . ISSN 1546-170X . doi: 10.1038/s41591-020-0789-4 . OpenUrl CrossRef PubMed 26. ↵ Rodemund N. , Wernly B. , Jung C. , Cozowicz C. , and Koköfer A. Harnessing Big Data in Critical Care: Exploring a new European Dataset . Scientific Data , 11 ( 1 ): 320 , mar 28 2024 . ISSN 2052-4463 . doi: 10.1038/s41597-024-03164-9 . OpenUrl CrossRef PubMed 27. ↵ Barton C. , Chettipally U. , Zhou Y. , Jiang Z. , Lynn-Palevsky A. , Le S. , Calvert J. , and Das R. Evaluation of a machine learning algorithm for up to 48-hour advance prediction of sepsis using six vital signs . Computers in Biology and Medicine , 109 : 79 – 84 , 6 2019 . ISSN 1879-0534 . doi: 10.1016/j.compbiomed.2019.04.027 . OpenUrl CrossRef PubMed 28. Henry K. E. , Hager D. N. , Pronovost P. J. , and Saria S. A targeted real-time early warning score (TREWScore) for septic shock . Science Translational Medicine , 7 ( 299 ): 299ra122 , aug 5 2015 . ISSN 1946-6242 . doi: 10.1126/scitranslmed.aab3719 . OpenUrl Abstract / FREE Full Text 29. ↵ Steinbach D. , Ahrens P. C. , Schmidt M. , Federbusch M. , Heuft L. , Lübbert C. , Nauck M. , Gründling M. , Isermann B. , Gibb S. , and Kaiser T. Applying Machine Learning to Blood Count Data Predicts Sepsis with ICU Admission . Clinical Chemistry , 70 ( 3 ): 506 – 515 , mar 2 2024 . ISSN 1530-8561 . doi: 10.1093/clinchem/hvae001 . OpenUrl CrossRef PubMed 30. ↵ Moor M. , Horn M. , Rieck B. , Roqueiro D. , and Borgwardt K. Early recognition of sepsis with Gaussian process temporal convolutional networks and dynamic time warping . In Machine Learning for Healthcare Conference , pages 2 – 26 . PMLR , 2019 . Times cited: 2. 31. ↵ Schamoni S. , Lindner H. A. , Schneider-Lindner V. , Thiel M. , and Riezler S. Leveraging implicit expert knowledge for non-circular machine learning in sepsis prediction . 
Artificial Intelligence in Medicine , 100 : 101725 , sep 1 2019 . ISSN 0933-3657 . doi: 10.1016/j.artmed.2019.101725 . OpenUrl CrossRef PubMed 32. ↵ Henry K. E. , Adams R. , Parent C. , Soleimani H. , Sridharan A. , Johnson L. , Hager D. N. , Cosgrove S. E. , Markowski A. , Klein E. Y. , Chen E. S. , Saheed M. O. , Henley M. , Miranda S. , Houston K. , Linton R. C. , Ahluwalia A. R. , Wu A. W. , and Saria S. Factors driving provider adoption of the TREWS machine learning-based early warning system and its effects on sepsis treatment timing . Nature Medicine , 28 ( 7 ): 1447 – 1454 , 7 2022 . ISSN 1546-170X . doi: 10.1038/s41591-022-01895-z . OpenUrl CrossRef PubMed 33. ↵ Steyerberg E. W. , Moons K. G. M. , Windt D. A. , Hayden J. A. , Perel P. , Schroter S. , Riley R. D. , Hemingway H. , Altman D. G. , and Group f. t. P. Prognosis Research Strategy (PROGRESS) 3: Prognostic Model Research . PLOS Medicine , 10 ( 2 ): e1001381 , may 2 2013 . ISSN 1549-1676 . doi: 10.1371/journal.pmed.1001381 . OpenUrl CrossRef PubMed 34. ↵ Elm E. , Altman D. G. , Egger M. , Pocock S. J. , Gøtzsche P. C. , and Vandenbroucke J. P. The Strengthening the Reporting of Observational Studies in Epidemiology (STROBE) statement: guidelines for reporting observational studies . The Lancet , 370 ( 9596 ): 1453 – 1457 , oct 20 2007 . ISSN 0140-6736 . doi: 10.1016/S0140-6736(07)61602-X . OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted February 21, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following The impact of evaluation strategy on sepsis prediction model performance metrics in intensive care data Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share The impact of evaluation strategy on sepsis prediction model performance metrics in intensive care data Dang-Khoa Do , Patrick Rockenschaub , Sebastian Boie , Oliver Kumpf , Hans-Dieter Volk , Felix Balzer , Falk von Dincklage , Gregor Lichtner medRxiv 2025.02.20.25322509; doi: https://doi.org/10.1101/2025.02.20.25322509 Share This Article: Copy Citation Tools The impact of evaluation strategy on sepsis prediction model performance metrics in intensive care data Dang-Khoa Do , Patrick Rockenschaub , Sebastian Boie , Oliver Kumpf , Hans-Dieter Volk , Felix Balzer , Falk von Dincklage , Gregor Lichtner medRxiv 2025.02.20.25322509; doi: https://doi.org/10.1101/2025.02.20.25322509 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Intensive Care and Critical Care Medicine Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4436) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6600) Geriatric Medicine (668) Health Economics (997) Health Informatics (4538) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (542) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and 
Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3333) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9232) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00eef7a880ff047',t:'MTc3OTY1MzMzOQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00