Full text
50,355 characters
· extracted from
preprint-html
· click to expand
Dynamic Mortality Risk Prediction in Myelodysplastic Syndromes Using Longitudinal Clinical Data | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Dynamic Mortality Risk Prediction in Myelodysplastic Syndromes Using Longitudinal Clinical Data View ORCID Profile Jonathan Bobak , View ORCID Profile Philipp Spohr , Sarah Richter , Alexander Streuer , Felicitas Schulz , Corinna Strupp , Catharina Gerhards , Nanni Schmitt , Thomas Luft , Sascha Dietrich , View ORCID Profile Ulrich Germing , View ORCID Profile Gunnar W. 
Klau doi: https://doi.org/10.1101/2025.07.21.25331775 Jonathan Bobak 1 Department of Hematology, Oncology and Clinical Immunology, Medical Faculty and University Hospital Düsseldorf, Heinrich Heine University Düsseldorf , Germany 2 Chair Algorithmic Bioinformatics, Heinrich Heine University Düsseldorf , Düsseldorf, Germany 3 Center for Digital Medicine, Heinrich Heine University Düsseldorf , Germany M.Sc. Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jonathan Bobak For correspondence: jonathan.bobak{at}med.uni-duesseldorf.de Philipp Spohr 2 Chair Algorithmic Bioinformatics, Heinrich Heine University Düsseldorf , Düsseldorf, Germany 3 Center for Digital Medicine, Heinrich Heine University Düsseldorf , Germany M.Sc. Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Philipp Spohr Sarah Richter 4 Department of Medicine, Hematology, Oncology and Rheumatology; University Hospital Heidelberg , Germany M.Sc. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alexander Streuer 5 Department of Hematology and Oncology, Medical Faculty Mannheim, Heidelberg University , Mannheim, 68167, Germany Dr. med. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Felicitas Schulz 1 Department of Hematology, Oncology and Clinical Immunology, Medical Faculty and University Hospital Düsseldorf, Heinrich Heine University Düsseldorf , Germany Dr. med. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Corinna Strupp 1 Department of Hematology, Oncology and Clinical Immunology, Medical Faculty and University Hospital Düsseldorf, Heinrich Heine University Düsseldorf , Germany PD Dr. med. 
Find this author on Google Scholar Find this author on PubMed Search for this author on this site Catharina Gerhards 6 Institute for Clinical Chemistry, Medical Faculty Mannheim of the University of Heidelberg , Mannheim, Germany Dr. med. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nanni Schmitt 5 Department of Hematology and Oncology, Medical Faculty Mannheim, Heidelberg University , Mannheim, 68167, Germany Dr. sc. hum. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Thomas Luft 4 Department of Medicine, Hematology, Oncology and Rheumatology; University Hospital Heidelberg , Germany Dr. med. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sascha Dietrich 1 Department of Hematology, Oncology and Clinical Immunology, Medical Faculty and University Hospital Düsseldorf, Heinrich Heine University Düsseldorf , Germany Dr. med. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ulrich Germing 1 Department of Hematology, Oncology and Clinical Immunology, Medical Faculty and University Hospital Düsseldorf, Heinrich Heine University Düsseldorf , Germany Dr. med. Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ulrich Germing Gunnar W. Klau 2 Chair Algorithmic Bioinformatics, Heinrich Heine University Düsseldorf , Düsseldorf, Germany 3 Center for Digital Medicine, Heinrich Heine University Düsseldorf , Germany Dr. rer. nat. Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Gunnar W. 
Klau Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Purpose Patients with myelodysplastic syndromes (MDS) exhibit diverse disease trajectories necessitating different clinical approaches ranging from watch-and-wait strategies to hematopoietic stem cell transplantation. Existing risk scores like the IPSS-R or EASIX provide static risk stratification at diagnosis but do not capture evolving disease dynamics. We address this problem by introducing a dynamic, data-driven approach to predict short-term mortality risks repeatedly, across the patient’s disease course. Materials and methods We develop a machine learning model based on gradient-boosted decision trees to estimate one-year mortality risks from both longitudinal parameters from blood values and diagnosis-based features. We train the model on a dataset of patients from the MDS Registry Düsseldorf (n=1024) and validate on patients from University Hospitals Heidelberg (n=286) and Mannheim (n=31). Results Validations on independent cohorts achieved AUROC scores of around 0.8 and better predictive performance for one-year survival compared to a diagnosis-only baseline model. The model accurately predicted mortality risks as early as within the first 90 days of diagnosis. Feature importance analysis revealed clinically plausible feature-label relations, supporting interpretability. Conclusion This dynamic risk model enables continuous, individualized assessment of one-year mortality risk in MDS patients, offering a supplement to static scores used at diagnosis. Our results highlight the utility and importance of including longitudinal parameters in risk assessment analysis. Introduction Myelodysplastic syndromes (MDS) are a class of malignant stem cell diseases resulting in reduced and defective blood cells. The main affected age group are elderly patients, with survival times ranging from a few months to multiple years [ 1 ]. 
Quantitative mortality risk assessment for this diverse set of clinical trajectories is paramount in clinical decision-making. The main approach for MDS in recent years has been to focus on diagnosis-based modelling. Scores like the International Prognostic Scoring System are broadly applied in the form of the revised and molecular IPSS-R/IPSS-M [ 2 ], [ 3 ] as well as other scores calculated on diagnosis [ 4 ], [ 5 ] relying on the same methodology. A major advantage is the simplicity of calculating them. For the IPSS-R at diagnosis just five parameters are needed to classify a patient into one of five categories ranging from very low to very high risk. But this simplicity and focus on a single point in time leads to some drawbacks. Losses in predictive power over time due to similar hazards in all risk categories were observed [ 6 ]. Additionally, the categorization effectively limits predictions to five hazard profiles. For an individual patient, the question if they do follow their corresponding risk profile at any given time cannot be answered. For longitudinal predictions, clinicians must rely on their experience since re-evaluation of the diagnosis-based scores is not always possible and may require more intensive diagnostics rarely performed on follow-up. The WHO-based prognostic scoring system (WPSS) [ 7 ] investigated a time-adjusted score with good predictive performance for overall survival and AML progression but is still limited to 5 risk categories. Longitudinal data also comes with lots of challenges like unevenly sampled or missing data and more complex patterns. To approach these challenges prediction tools based on machine learning algorithms are a major focus to recognize patterns and improve predictions. Orgueira et al. applied random survival forests to improve model performance compared to the IPSS-R [ 8 ]. 
This study did not only focus on individual predictions instead of general risk classifications but also showed the advantages of complex, non-linear models to learn new patterns. Tree boosting was also used to predict MDS diagnosis risks one year prior to clinical diagnosis with good classification performance [ 9 ]. Apart from focusing on a single point in time as a basis for model prediction, dynamic predictions have been applied to analyse 100-day mortality risks after hematopoietic stem cell transplantation. Risks are predicted each day starting on the day of transplantation up until day 30 showing that with increasing longitudinal data predictions improved [ 10 ]. Here we present an approach with a moving one-year prediction horizon in relation to the time of prediction. This conceptual approach for continuous and event independent predictions and its caveats were investigated by Sherman et al. [ 11 ]. Our model leverages both diagnosis-based and longitudinal features as input. We trained and validated the model on retrospective data from the MDS Registry Düsseldorf and retrospective validation datasets from the University Hospitals Heidelberg and Mannheim showing promising prediction improvements by leveraging longitudinal data in addition to diagnosis-based parameters. Our model may serve as a valuable supplement to existing risk scores, especially for longer follow-ups and dynamic disease trajectories. A conceptual overview of our approach can be seen in Figure 1 . Download figure Open in new tab Figure 1: Overview of our study pipeline. We use the Düsseldorf patient dataset for training and initial validation of our method supplemented by two validation datasets from the University Hospitals Heidelberg and Mannheim. For each patient, we gather both diagnosis-based and longitudinal features. For the retrospective longitudinal data, we sub-sample each patient at different time points and extract a standardized set of features. 
Combined with diagnosis-based features these serve as input for a gradient-boosting model which predicts a probability for one-year mortality for each sub-sample. A higher predicted probability is interpreted as a higher risk of dying within 365 days after the time of prediction. Materials and Methods Patient inclusion criteria Our inclusion criteria for patients, related to technically necessary minimal data availability and bias reduction, are: (1) surviving at least six months; (2) receiving no hematopoietic stem cell transplantation (HSCT) at any time, as this procedure alters the natural course of the disease; (3) having at least three values for each longitudinal variable; and (4) having at least one year of follow-up from each prediction point in case of right-censored patients. By setting a minimal survival time for inclusion we aim to alleviate the leading bias for very high-risk patients being diagnosed very late in disease progression. This does not fully eliminate the bias but allows us to focus on more subtle, long-term disease patterns. Conceptual framework We follow the framework described by Sherman et al. [ 11 ], defining a fixed reference point $p_0$ as the time of diagnosis. Each prediction $p_i$ at time $i$ incorporates all available data between $p_0$ and $p_i$, estimating the probability of death within 365 days from $i$. This setup ensures that predictions are temporally localized, incorporate full clinical history up to time $i$, and are independent of prior predictions and the outcome itself. While prediction windows may overlap, each instance is treated independently, which is crucial for downstream performance evaluation. Successive predictions advance the prediction horizon. Data pre-processing and feature extraction Patient data has two modalities. Diagnosis-based features are only observed at $p_0$ while longitudinal time-series features are measured repeatedly. 
The longitudinal information is irregular and sparse as patients were monitored in a real-life clinical environment. Each patient’s complete retrospective longitudinal data is subsampled once in each quarter that contains observations. Quarterly sampling was chosen such that densely observed hospitalization periods are not overrepresented, but a representative set of intermediate prediction points is obtained. The point $p_i$ is the last observation within a quarter. Due to decreasing numbers of data points, sampling was stopped after 32 quarters. To prevent label leakage, we exclude the final 60 days before death from sampling, as this period may overshadow more subtle early signals of deterioration during training. From the irregular and sparse longitudinal data, we extract a standardized set of characteristic features for each subsample (see Supplementary Table B). These derived time-dependent features are then combined with the fixed diagnosis-based features to construct the final input vector for each prediction instance. Samples are labelled positive (1) if death occurs within 365 days after $p_i$ or negative (0) otherwise. By design, a patient may have samples with different labels which are obtained independently of one another as they represent different prediction times but may share similarities in features. Model Architecture Our longitudinal survival prediction model using XGBoost [ 12 ] incorporates both diagnosis-based and longitudinal features. The input features used for model training are summarized in Table 1 . The longitudinal features were mainly chosen due to their frequent observation in the clinical setting at both ordinary follow-ups and adverse events. Similarly, bone-marrow blasts and cytogenetics were only included at diagnosis as follow-ups to these variables are biased towards clinical deterioration and disease progression. The IPSS-R [ 2 ] was specifically excluded from the feature set since most of its constituent variables are already model inputs. 
We also do not consider therapies as inputs due to uncertainty around dates and a huge variety of treatment schemes. The model has to learn related patterns implicitly and may cope better with new combinations of drugs or new drugs since the input features do not depend on them. Mutational data could not be incorporated due to sparse and incomplete data. View this table: View inline View popup Download powerpoint Table 1: Input features for model training. Diagnosis-based parameters have singular values while the longitudinal features correspond to series of observations. EASIX is just used for the baseline model. The survival time is input for both models and is given in fine-grained days as well as 90-day quarters. 1 Both Hemoglobin and Leukocytes are present in the baseline and longitudinal model. The baseline model uses the measurements at diagnosis, whereas the longitudinal model uses a series of measurements. 2 These features are optional, i.e. the models can handle missing values for them; however, at least one feature is required. As a comparative baseline, we also train a model only on diagnosis-based features. Contrary to the longitudinal model, we included the EASIX (Endothelial Activation and Stress Index) score [ 13 ] as a replacement for raw thrombocyte counts. The XGBoost implementation handles missing data natively by adding additional decision branches when a value is missing, which allows us to retain patients with missing diagnosis-based features. Longitudinal features are always present. Hyperparameters are selected via grid search with cross-validation on the training set. To reduce overfitting given the modest dataset size, we apply regularization strategies and minimize false negatives by tuning model sensitivity with sample weights during training (see Supplementary Table C). 
Evaluation Methodology Classification performance is evaluated using three metrics: area under the receiver operating characteristic curve (AUROC [ 14 ]), area under the precision-recall curve (AUPRC [ 15 ]), and Brier Score [ 16 ]. The AUPRC is particularly relevant due to the observed class imbalance, which the AUROC metric has been shown to omit [ 15 ], [ 17 ]. The Brier Score quantifies absolute error in relation to the true probabilities. For a robust initial model performance assessment, we conducted repeated cross-validation on the Düsseldorf training cohort. Metrics were averaged over all cross-validation folds to account for variability due to cohort composition and to assess general model capability. Changes in performance for predictions further from diagnosis were evaluated by calculating metrics within each quarterly bin. Model performance was compared to the established IPSS-R [ 2 ] using the same cross-validation strategy. For a fair comparison we trained and evaluated on a sub-cohort of patients for which the IPSS-R was present. Using the IPSS-R category and conditional Kaplan-Meier curves [ 18 ], [ 19 ] of the training sets we obtained longitudinal predictions for each sample. A comparison with the more recent IPSS-M [ 3 ] was not possible since too little genomic information was available. To evaluate generalizability across institutions and real-world applicability, we trained our model on the Düsseldorf dataset and evaluated it on two independent validation cohorts from University Hospitals Heidelberg and Mannheim. Finally, we performed feature importance analysis on this generalizable model using the average reduction in impurity achieved for each feature over all trees and SHAP values [ 20 ], a game-theoretic approach to explain relations between input features and model outputs. 
Datasets The training cohort consists of 1024 retrospective MDS patients treated at the University Hospital Düsseldorf, all part of the Düsseldorf MDS Registry, with a total of 6146 samples and a 2.66:1 ratio between negative to positive labels. We validated on two retrospective cohorts from the University Hospital Heidelberg (patients=286, samples=1708, label balance ≈4:1) and the University Hospital Mannheim (patients=31, samples=237, label balance ≈10:1). For the validation sets, we did not filter the final 60 days of longitudinal data to simulate real predictions without event knowledge. Sample numbers diminish over time with similar decrease across datasets and labels (see Supplementary Figure A). The Heidelberg cohort shows a lower median follow-up and higher mortality than the other datasets indicating more high-risk patients with earlier events but similar average sample numbers per patient. Detailed demographics and characteristics can be found in Supplementary Table A. Software Availability The pipeline used to train and evaluate the model is written in python using snakemake [ 21 ] as a workflow library for easy reproduction. Additionally, we used tsflex [ 22 ] and tsfresh [ 23 ] for feature extraction, XGBoost [ 12 ], SHAP [ 20 ] and scikit-learn [ 24 ] for model training and evaluation, Pandas [ 25 ], [ 26 ] and NumPy [ 27 ] to handle and transform data, and matplotlib [ 28 ] for plotting. All code is available here: https://github.com/dietrichlab-cs/dynamic_one-year_mortality_mds (archive: DOI: 10.5281/zenodo.16102058 ). A small webtool, to test both models, is available ( https://dietrichlab.de/PythonApps/dynamic_mds_paper/ ). Results & Discussion Adding longitudinal data improves predictions We first trained the baseline and longitudinal model on the Düsseldorf cohort and evaluated our metrics on the external validation cohorts from Heidelberg and Mannheim. Table 2 shows results for all metrics and datasets comparing models. 
On the Heidelberg dataset, the longitudinal model improves the AUROC by approximately 0.12 and the AUPRC by 0.23. Both models perform substantially better than random guessing, which would yield an AUROC of 0.5 and an expected AUPRC of around 0.2. Furthermore, the longitudinal model exhibits a lower Brier score than the baseline, indicating improved overall calibration and probabilistic accuracy. View this table: View inline View popup Download powerpoint Table 2: AUROC, AUPRC and Brier Score for all three datasets. For the Düsseldorf cohort averages from cross validation are given. For the Heidelberg and Mannheim datasets performance is measured on models trained on all patients from Düsseldorf. Results are rounded to four decimal places. For the Mannheim cohort similar improvements for the longitudinal model can be seen for the AUROC and AUPRC scores with improvements of 0.22 and 0.31. Interestingly, Brier scores for this dataset do not show the same difference but are close across models yet in the same range as for the Heidelberg and Düsseldorf datasets. The baseline model performs as well as the IPSS-R with the advantage of being able to cope with missing values, as shown in Supplementary Figure C. The longitudinal model clearly outperforms both other models. All subsequent comparisons are done between the baseline and longitudinal model as they allow for larger patient cohorts. The intuitive assumption that adding information from longitudinal data improves model performance compared to an approach solely reliant on data from the time of diagnosis could be confirmed. We can conclude that our longitudinal model trained on patients from Düsseldorf does indeed learn generalizable patterns and is applicable to other, real-world cohorts. Longitudinal model can react to dynamic changes We do see differences in the two models' approaches to predicting risks. The baseline model is restricted to an initial state of a patient and their survival time. 
An initial prediction based on the diagnosis features is made which is then adjusted according to survival time. Thus, a major factor is the correlation between survival time and mortality. This approach is very similar to classical survival analysis in the form of Cox regression or Kaplan-Meier curves [ 19 ], [ 18 ]. Predicted risks of different subsamples in one patient generally monotonically decrease over time. Figure 2 shows some example patients and their predictions for both models as well as the ground truth labels. Patients A and B start with initial risks gradually decreasing. For patient A this is correct, as we observe no positive sample. Patient B, however, dies shortly after the 21st quarter, yet the baseline model predicts a low risk. For patient D the predictions indicate low risk from the start although the patient only has positive samples. The longitudinal model does predict dynamically as it does receive new and changing longitudinal data. For patient B we see an increasing risk over time with the highest probabilities on the last two predictions 75 and 3 days prior to the patient dying. The same observation can be made for patients C and D including a higher initial risk for patient D compared to the baseline model. Download figure Open in new tab Figure 2: Example predictions for four different patients. The numbers between the coloured bars correspond to the true label for each prediction, the top bar displays the probabilities given by the longitudinal model, the bottom bar those given by the baseline model. Green is associated with likely survival while red indicates a high risk of dying within 365 days after the time of prediction. Dynamic predictions due to added information from longitudinal features are advantageous while the baseline model is bound to survival time alone and is subject to population dynamics resulting in probabilities being independent of a patient’s actual disease trajectory similar to classical survival curves. 
The longitudinal model learns generalizable patterns For the two external validation sets we explored how the longitudinal model predicts samples within the last 60 days before the occurrence of a mortality event. These samples would not have been included in training. For both datasets and all samples falling under this condition, the longitudinal model predicted either higher or equal probabilities for mortality compared to the prior prediction. Examples are patients B and C in Figure 2 . Both patients die within the last quarter of predictions thus the last two predictions are of interest. They have the highest predicted risk, clearly indicating imminent mortality. We can see that the patterns of longitudinal features right before the mortality event are either similar, more pronounced or continuations of patterns from earlier prediction points. We investigated other endpoints in MDS, namely AML progression and transfusion burden. For AML progression, the model did report higher mortality risks around the time of recorded progression for most patients in both validation cohorts. This time point is the clinical validation of progression, making it hard to assess how well the model serves as an early warning. A similar uncertainty arises when looking at transfusion burden. Patient A in Figure 2 received transfusions over the whole observation period. As a side-effect a study inclusion, investigating iron overload, was checked. This may explain why the longitudinal model predicts a constant risk for this patient as 8 years of close clinical monitoring and regular transfusions do not hint at a low-risk trajectory. Although the model seems to associate higher risk with specific events it is not conclusive if the model can act as an early warning or if it just reports on side effects of more aggressive therapies/disease progression. 
Trade-off between false positives versus false negatives As exemplified by patient A in Figure 2 , the longitudinal model is prone to false positive predictions. One reason for those may be medical aspects relating to therapy and disease progression. Another influencing factor was our deliberate choice to introduce sample weights during training increasing the penalty for false negative predictions, pushing the model towards a high recall for positive samples, as evidenced by the achieved AUROC scores. Over both validation datasets we qualitatively investigated false negatives. Many of these could be attributed to either short-term mortality or events unrelated to MDS. Some examples include septic shock after hip surgery or liver failure due to liver cancer. Although MDS is a contributing comorbidity, if the resulting cause of death is not directly visible within the longitudinal data, our model cannot predict these events. This may occur if the patient was treated elsewhere, the blood values were not available for evaluation, or the cause of death does not affect blood values as evident in the training set. Model performance does not depend on sample length To investigate predictive power over time, we perform cross-validation on the Düsseldorf dataset, tracking evaluation metrics for each split and sample length and averaging them. All metrics show a similar trend (see Figure 3 ). The difference between the longitudinal and baseline model starts out very small and then increases over the first quarters up to roughly the 12-13th quarter with the longitudinal model outperforming the baseline. Overall, the longitudinal model stabilizes around an AUROC of 0.8 and an AUPRC of 0.6 with improving Brier scores but is getting more unpredictable for longer sampling periods. The baseline model shows a decreasing performance trend for both AUROC and AUPRC with a constant Brier score apart from the last 4-5 quarters which do show an overall increase in model performance. 
Download figure Open in new tab Figure 3: Average evaluation metric values when performing cross-validation on the Düsseldorf dataset. For AUROC and AUPRC higher, while for the Brier Score, lower values are better. See Supplementary Figure B for information on variance across cross-validation runs. Increasing variance and reduced sample size are reasons for both models converging to similar performance. See Supplementary Figure B for more details on result variance. As both models start out on similar data even without lots of longitudinal data, the longitudinal model performs a lot better. We cannot completely rule out dataset specific variances and biases as reasons for some of the performance trends but the fact that the differences in model performance are persistent and we do observe independent increases and decreases between models across different cross-validation folds leads to the conclusion that the longitudinal model learns generalizable trends. One caveat for this analysis is the high variance across splits (see Supplementary Figure B) due to diminishing amounts of samples and changes in label balance. However, this variance is consistent across both models supporting observations on average performance. The early advantage of the longitudinal model indicates that disease dynamics hold predictive value early on, stressing the importance of continued quantitative monitoring of risks. Meaningful feature – label relations Feature importance evaluations show meaningful feature-label relations validated by medical domain experts for the 20 most important features as seen in Figure 4 . As an example, lower hemoglobin averages indicate a higher associated probability of mortality. According to the MDS treatment strategy, patients with lower hemoglobin values will need transfusions [ 29 ]. Additionally, hemoglobin has been shown to be a robust marker for risk assessment [ 2 ], [ 3 ]. 
We can also observe that 7 out of the 20 most important features relate to quantile distributions across all longitudinal features. This may be indicative of the model learning to detect outliers and assess mortality risk based on this. Adding to this hypothesis is the inclusion of variation coefficients and averages in this list. Patients who are in regular need of transfusion will not only have more data to base the prediction on but also show more variance. Since complete data on exact treatments was not broadly available, we can only speculate that the model does pick up on treatment-related patterns like transfusion burden. There has been evidence that increased transfusion need is correlated with worse outcomes [5], [30]. One more feature type we can observe in higher abundance is the slope of the last three data points of various longitudinal features. Even though this does not capture outliers, it is a feature type capturing a more localized view of a patient’s recent disease history, including short-term changes. According to the SHAP analysis, negative slopes are associated with higher risk. Download figure Open in new tab Figure 4: SHAP values for the 20 most important features according to the XGBoost “gain” metric. Gain describes the average reduction in loss when a feature was chosen in a split across all trees and training samples. For each feature and sample, the SHAP value gives an intuition of whether the feature biases the prediction towards the positive or negative label. Over all samples, this results in the shown density, distinguishing higher and lower feature values by colour. Conclusion We present a new approach to quantify dynamic one-year mortality risk in patients with MDS using gradient-boosted decision tree ensembles. Based on two validation cohorts and cross-validation on the training set, we can show the advantages of using longitudinal features, which outperform a purely diagnosis-based model with improvements across all evaluated scores.
Longitudinal data adds important information to the prediction from early on. Additionally, we can show that the model not only learns and performs on a specific dataset but also generalizes to other, independent datasets. It achieves this by learning meaningful label–feature relations correlating to known medical factors and disease markers. Even beyond the learned label, the model picks up on adverse events like AML progression and transfusion burden. Even with three university clinics participating, the number of samples limits the interpretability of results, and our results should be seen as a proof of concept. For a fully validated and clinically usable model, more data from more centres in more countries would be needed. The model is applicable in different environments, as shown by the federated validation on the external datasets. We hope to further validate and improve the approach by looking at other clinical endpoints of interest like disease progression or therapy onset. Due to the dynamic and easily adjustable pipeline, other features, both at diagnosis and longitudinal, can be tested for predictive power. Inclusion of both machine learning and longitudinal features into the clinical routine to support decision-making and analyse large amounts of personalized data is crucial to improve patient care and survival. Data Availability Data and models used in the present work are not available online due to privacy concerns. An interactive demo of the models and all code are available online. https://dietrichlab.de/PythonApps/dynamic_mds_paper/ https://github.com/dietrichlab-cs/dynamic_one-year_mortality_mds https://doi.org/10.5281/zenodo.16102058 SUPPLEMENTARY MATERIAL Dataset characteristics View this table: View inline View popup Download powerpoint Supplementary Table A: Dataset characteristics for the different data sources. For Heidelberg and Mannheim, no cut-off of the last 60 days is performed.
The main objective of the validation datasets is to show performance on real-life data without lots of curation and cherry-picking. Still, we see similar median demographics for all datasets, with differing label ratios, especially for the Mannheim cohort. The Heidelberg cohort has an overall shorter average follow-up and higher mortality, indicating more high-risk patients. Longitudinal feature extraction View this table: View inline View popup Download powerpoint Supplementary Table B: Features extracted from longitudinal data. Each feature is extracted for each sample and longitudinal parameter for the series of values from diagnosis to the point of prediction. Hyperparameters View this table: View inline View popup Download powerpoint Supplementary Table C: Hyperparameters for both models. Note that since we use XGBoost, features_per_split is converted to a percentage (features_per_split/total_number_of_features). Samples per quarter Download figure Open in new tab Supplementary Figure A: Sampling lengths over time given for each quarter. Negative samples have label 0 while positive ones have label 1. Download figure Open in new tab Supplementary Figure B: AUROC, AUPRC and Brier Score with variance bars on the cross-validation run on the Düsseldorf dataset. Bars represent the interquartile range (IQR) with the whiskers extending to 1.5×IQR. Outliers are denoted by circles. Download figure Open in new tab Supplementary Figure C: Comparison of average evaluation metrics over time between IPSS-R, using conditional Kaplan–Meier estimators, and the baseline and longitudinal models. The dataset is based on all samples from the Düsseldorf dataset for patients with an available IPSS-R. This limits the dataset to n=689 patients. Contributions JB developed, evaluated and validated the software. UG, GK, PS and JB conceptualized the approach. PS, UG and GK supervised development and evaluation. SR and TL provided data for the Heidelberg validation cohort and supervised validation.
SR ran the software in Heidelberg. AS, CG and NS provided data for the Mannheim validation cohort. AS and NS supervised validation. UG and CS provided data for the Düsseldorf cohort based on the MDS Registry Düsseldorf. AS, NS, TL, UG, CS, FS and SD provided detailed medical feedback. GK, PS and SR provided feedback on statistical analysis and model training. All authors have contributed to the internal review process, read and approved the manuscript. Acknowledgments We would like to thank all members of the Algorithmic Bioinformatics and Hematology Research Lab groups at the Heinrich Heine University Düsseldorf for their feedback and discussions around the project. We also want to thank all clinical personnel and patients providing and processing data. Footnotes ↵ * shared last author References [1]. ↵ K. Nachtkamp , G. Kobbe , N. Gattermann , and U. Germing , “ Myelodysplastische Syndrome ,” Deutsches Ärzteblatt. Accessed: Jun. 23, 2025 . [Online]. Available: https://www.aerzteblatt.de/archiv/myelodysplastische-syndrome-6c18b1d7-6a50-4811-b337-c0c8d04d178e [2]. ↵ P. L. Greenberg et al. , “ Revised International Prognostic Scoring System for Myelodysplastic Syndromes ,” Blood , vol. 120 , no. 12 , pp. 2454 – 2465 , Sep. 2012 , doi: 10.1182/blood-2012-03-420489 . OpenUrl Abstract / FREE Full Text [3]. ↵ E. Bernard et al. , “ Molecular International Prognostic Scoring System for Myelodysplastic Syndromes ,” NEJM Evid ., vol. 1 , no. 7 , p. EVIDoa2200008 , Jun . 2022 , doi: 10.1056/EVIDoa2200008 . OpenUrl CrossRef PubMed [4]. ↵ E. Such et al. , “ Development and validation of a prognostic scoring system for patients with chronic myelomonocytic leukemia ,” Blood , vol. 121 , no. 15 , pp. 3005 – 3015 , Apr. 2013 , doi: 10.1182/blood-2012-08-452938 . OpenUrl Abstract / FREE Full Text [5]. ↵ L. Malcovati et al. , “ A WHO Classification-Based Prognostic Scoring System (WPSS) for Predicting Survival in Myelodysplastic Syndromes .,” Blood , vol. 106 , no. 11 , p. 
788 , Nov. 2005 , doi: 10.1182/blood.V106.11.788.788 . OpenUrl CrossRef [6]. ↵ M. Pfeilstöcker et al. , “ Time-dependent changes in mortality and transformation risk in MDS ,” Blood , vol. 128 , no. 7 , pp. 902 – 910 , Aug. 2016 , doi: 10.1182/blood-2016-02-700054 . OpenUrl Abstract / FREE Full Text [7]. ↵ L. Malcovati et al. , “ Time-Dependent Prognostic Scoring System for Predicting Survival and Leukemic Evolution in Myelodysplastic Syndromes ,” J. Clin. Oncol ., vol. 25 , no. 23 , pp. 3503 – 3510 , Aug. 2007 , doi: 10.1200/JCO.2006.08.5696 . OpenUrl Abstract / FREE Full Text [8]. ↵ A. Mosquera Orgueira et al. , “ Machine Learning Improves Risk Stratification in Myelodysplastic Neoplasms: An Analysis of the Spanish Group of Myelodysplastic Syndromes ,” HemaSphere , vol. 7 , no. 10 , p. e961 , Oct. 2023 , doi: 10.1097/HS9.0000000000000961 . OpenUrl CrossRef [9]. ↵ A. Radhachandran et al. , “ A machine learning approach to predicting risk of myelodysplastic syndrome ,” Leuk. Res ., vol. 109 , p. 106639 , Oct. 2021 , doi: 10.1016/j.leukres.2021.106639 . OpenUrl CrossRef PubMed [10]. ↵ P. Spohr et al. , “ Dynamic Prediction of Mortality Risk Following Allogeneic Hematopoietic Stem Cell Transplantation ,” Feb . 18 , 2025 , medRxiv . doi: 10.1101/2025.02.17.25322391 . OpenUrl Abstract / FREE Full Text [11]. ↵ E. Sherman , H. Gurm , U. Balis , S. Owens , and J. Wiens , “ Leveraging Clinical Time-Series Data for Prediction: A Cautionary Tale ,” AMIA. Annu. Symp. Proc ., vol. 2017 , pp. 1571 – 1580 , Apr. 2018 . OpenUrl PubMed [12]. ↵ T. Chen and C. Guestrin , “ XGBoost: A Scalable Tree Boosting System ,” in Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , Aug . 2016 , pp. 785 – 794 . doi: 10.1145/2939672.2939785 . OpenUrl CrossRef [13]. ↵ T. Luft et al. , “ EASIX for prediction of survival in lower-risk myelodysplastic syndromes ,” Blood Cancer J ., vol. 9 , no. 11 , p. 85 , Nov. 2019 , doi: 10.1038/s41408-019-0247-z . 
OpenUrl CrossRef PubMed [14]. ↵ T. Fawcett , “ An introduction to ROC analysis ,” Pattern Recognit. Lett ., vol. 27 , no. 8 , pp. 861 – 874 , Jun. 2006 , doi: 10.1016/j.patrec.2005.10.010 . OpenUrl CrossRef Web of Science [15]. ↵ J. Davis and M. Goadrich , “ The relationship between Precision-Recall and ROC curves ,” in Proceedings of the 23rd international conference on Machine learning , in ICML ’06. New York, NY, USA : Association for Computing Machinery , Jun . 2006 , pp. 233 – 240 . doi: 10.1145/1143844.1143874 . OpenUrl CrossRef [16]. ↵ G. W. Brier , “ VERIFICATION OF FORECASTS EXPRESSED IN TERMS OF PROBABILITY ,” Mon. Weather Rev ., vol. 78 , no. 1 , pp. 1 – 3 , Jan. 1950 , doi: 10.1175/1520-0493(1950)078<0001:VOFEIT>2.0.CO;2 . OpenUrl CrossRef [17]. ↵ T. Saito and M. Rehmsmeier , “ The Precision-Recall Plot Is More Informative than the ROC Plot When Evaluating Binary Classifiers on Imbalanced Datasets ,” PLOS ONE , vol. 10 , no. 3 , p. e0118432 , Mar . 2015 , doi: 10.1371/journal.pone.0118432 . OpenUrl CrossRef PubMed [18]. ↵ B. A. Zamboni et al. , “ Conditional Survival and the Choice of Conditioning Set for Patients With Colon Cancer: An Analysis of NSABP Trials C-03 Through C-07 ,” J. Clin. Oncol ., vol. 28 , no. 15 , pp. 2544 – 2548 , May 2010 , doi: 10.1200/JCO.2009.23.0573 . OpenUrl Abstract / FREE Full Text [19]. ↵ E. L. Kaplan and P. Meier , “ Nonparametric Estimation from Incomplete Observations ,” J. Am. Stat. Assoc ., vol. 53 , no. 282 , pp. 457 – 481 , Jun. 1958 , doi: 10.1080/01621459.1958.10501452 . OpenUrl CrossRef [20]. ↵ S. M. Lundberg , G. G. Erion , and S.-I. Lee , “ Consistent Individualized Feature Attribution for Tree Ensembles ,” Mar . 07, 2019 , arXiv : arXiv: 1802.03888 . doi: 10.48550/arXiv.1802.03888 . OpenUrl CrossRef [21]. ↵ F. Mölder et al. , “ Sustainable data analysis with Snakemake ,” Apr . 19, 2021 , F1000Research : 10 : 33 . doi: 10.12688/f1000research.29032.2 . OpenUrl CrossRef PubMed [22]. ↵ J. Van Der Donckt , J. Van Der Donckt , E.
Deprost , and S. Van Hoecke , “ tsflex: Flexible time series processing & feature extraction ,” SoftwareX , vol. 17 , p. 100971 , Jan. 2022 , doi: 10.1016/j.softx.2021.100971 . OpenUrl CrossRef [23]. ↵ M. Christ , N. Braun , J. Neuffer , and A. W. Kempa-Liehr , “ Time Series FeatuRe Extraction on basis of Scalable Hypothesis tests (tsfresh – A Python package) ,” Neurocomputing , vol. 307 , pp. 72 – 77 , Sep. 2018 , doi: 10.1016/j.neucom.2018.03.067 . OpenUrl CrossRef [24]. ↵ F. Pedregosa et al. , “ Scikit-learn: Machine Learning in Python ,” J. Mach. Learn. Res ., vol. 12 , no. 85 , pp. 2825 – 2830 , 2011 . OpenUrl CrossRef PubMed [25]. ↵ The pandas development team , pandas-dev/pandas: Pandas . ( Jun . 05, 2025 ). Zenodo . doi: 10.5281/ZENODO.3509134 . OpenUrl CrossRef [26]. ↵ W. McKinney , “Data Structures for Statistical Computing in Python,” presented at the Python in Science Conference, Austin , Texas , 2010 , pp. 56 – 61 . doi: 10.25080/Majora-92bf1922-00a . OpenUrl CrossRef [27]. ↵ C. R. Harris et al. , “ Array programming with NumPy ,” Nature , vol. 585 , no. 7825 , pp. 357 – 362 , Sep. 2020 , doi: 10.1038/s41586-020-2649-2 . OpenUrl CrossRef PubMed [28]. ↵ J. D. Hunter , “ Matplotlib: A 2D Graphics Environment ,” Comput. Sci. Eng ., vol. 9 , no. 3 , pp. 90 – 95 , 2007 , doi: 10.1109/MCSE.2007.55 . OpenUrl CrossRef PubMed [29]. ↵ P. Fenaux , D. Haase , V. Santini , G. F. Sanz , U. Platzbecker , and U. Mey , “ Myelodysplastic syndromes: ESMO Clinical Practice Guidelines for diagnosis, treatment and follow-up ,” Ann. Oncol ., vol. 32 , no. 2 , pp. 142 – 156 , Feb. 2021 , doi: 10.1016/j.annonc.2020.11.002 . OpenUrl CrossRef PubMed [30]. ↵ L. de Swart et al. , “ Impact of red blood cell transfusion dose density on progression-free survival in patients with lower-risk myelodysplastic syndromes ,” Haematologica , vol. 105 , no. 3 , Art. no. 3 , Mar . 2020 , doi: 10.3324/haematol.2018.212217 . OpenUrl Abstract / FREE Full Text View the discussion thread. 
Back to top Previous Next Posted July 21, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Dynamic Mortality Risk Prediction in Myelodysplastic Syndromes Using Longitudinal Clinical Data Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Dynamic Mortality Risk Prediction in Myelodysplastic Syndromes Using Longitudinal Clinical Data Jonathan Bobak , Philipp Spohr , Sarah Richter , Alexander Streuer , Felicitas Schulz , Corinna Strupp , Catharina Gerhards , Nanni Schmitt , Thomas Luft , Sascha Dietrich , Ulrich Germing , Gunnar W. Klau medRxiv 2025.07.21.25331775; doi: https://doi.org/10.1101/2025.07.21.25331775 Share This Article: Copy Citation Tools Dynamic Mortality Risk Prediction in Myelodysplastic Syndromes Using Longitudinal Clinical Data Jonathan Bobak , Philipp Spohr , Sarah Richter , Alexander Streuer , Felicitas Schulz , Corinna Strupp , Catharina Gerhards , Nanni Schmitt , Thomas Luft , Sascha Dietrich , Ulrich Germing , Gunnar W. 
Klau medRxiv 2025.07.21.25331775; doi: https://doi.org/10.1101/2025.07.21.25331775 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6600) Geriatric Medicine (668) Health Economics (997) Health Informatics (4538) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (541) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3333) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9232) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00d8ba60c964eb6',t:'MTc3OTYzODc2NA=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.