Machine Learning for Predicting and Maximizing the Response of Breast Cancer Patients to Neoadjuvant Therapy

doi:10.1101/2025.10.11.25337587

Machine Learning for Predicting and Maximizing the Response of Breast Cancer Patients to Neoadjuvant Therapy

2025 · doi:10.1101/2025.10.11.25337587

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 39,552 characters · extracted from preprint-html · click to expand

Machine Learning for Predicting and Maximizing the Response of Breast Cancer Patients to Neoadjuvant Therapy | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Machine Learning for Predicting and Maximizing the Response of Breast Cancer Patients to Neoadjuvant Therapy Janie Y. Cai , View ORCID Profile Zhibin Chen doi: https://doi.org/10.1101/2025.10.11.25337587 Janie Y. Cai 1 Miami Palmetto Senior High School , Miami, FL Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zhibin Chen 2 Department of Microbiology and Immunology, Sylvester Comprehensive Cancer Center, University of Miami Miller School of Medicine , Miami, FL Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zhibin Chen For correspondence: zchen{at}med.miami.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Purpose Neoadjuvant therapy (NAT) is an established treatment for certain high-risk, locally advanced, or unresectable breast cancers, often facilitating breast-conserving surgery. Recent studies show that achieving pathologic complete response (pCR) after NAT correlates with higher event-free survival rates. Thus, accurate prediction of pCR is essential for personalizing breast cancer (BC) treatment to minimize side effects and improve effectiveness. Methods We used a machine learning model named XGBoost to predict pCR in BC patients. The classifier was trained on the expression values of 4,000 genes to predict pCR in ten arms of the I-SPY2 clinical trial. Based on these predictions, we developed a strategy to maximize pCR likelihood and identified influential genes using the importance scores from the model. Results XGBoost models for three arms, Pembrolizumab, ABT 888 plus carboplatin, and T-DM1 plus pertuzumab, achieved the highest prediction accuracies with areas under the receiver operation characteristic curve (AUCs) of 0.814, 0.792, and 0.788, respectively. If treatment assignments followed the XGBoost predictions, pCR rates for nine out of ten I-SPY2 arms could increase significantly, by 9.9% to 29.1% compared to trial results. Key genes associated with pCR were identified for each arm. The expression levels of some genes, including lower expression of ABDH1, AMZ1, BAIAP3 and SYTL4 and higher expression of DENND1C, HMGB3, HMMR, PLEKHF1 and RASEF, were associated with pCR in multiple arms. Conclusion The machine learning models developed in this study provide accurate pCR predictions, improve pCR rates, and may find clinical applications to enhance the treatment for BC patients. Introduction Traditionally, systemic therapy for breast cancer (BC) patients has been administrated following surgery. However, neoadjuvant therapy (NAT), which involves providing systemic treatment prior to surgery, has been found as equally effective for overall survival and long-term recurrence rates when compared to adjuvant therapy ( 1 , 2 ). NAT can reduce the size of the primary tumor, which leads to surgical resection becoming more manageable, converting inoperable tumors to resectable ones, and aiding in breast-conserving surgery (BCS) ( 3 ). Additionally, it offers vital insights into the tumor’s response to treatment, which can guide post-operative decision-making and long-term prognostic assessments ( 3 , 4 ). NAT has been an established therapeutic approach for certain high-risk, locally advanced or unresectable breast cancers and is often employed to enhance BCS feasibility ( 5 , 6 ). NAT also offers a potentially more effective trial design for assessing the efficacy of novel therapies for BC by using pathological complete response (pCR) as an endpoint, in contrast to the conventional drug development process reliant on large phase III trials ( 7 ). Randomized neoadjuvant studies indicate that pCR, which is characterized by the absence of disease in both breast tissue and lymph nodes, may be a predictive marker for long-term outcomes in patients with early-stage BC ( 8 ). Following this evidence, the US Food and Drug Administration provided guidance endorsing pCR as a primary endpoint to facilitate accelerated drug approval ( 9 ). More recent studies have also demonstrated that achieving pCR is associated with significantly better event-free survival (EFS) and overall survival (OS) ( 6 , 10 ). The I-SPY2 neoadjuvant platform trial is a clinical study aimed at swiftly identifying new drugs and combinations for treating high-risk and early-stage breast cancer that demonstrate greater efficacy than standard chemotherapy ( 11 ). Patients receive the experimental treatment prior to surgery to allow for rapid assessment of tumor response. The trial utilizes the analysis of biomarkers found in a patient’s tumor to determine which treatment is likely to be most effective for them. I-SPY2 provides a comprehensive data set that includes pretreatment gene expression profiles, tumor epithelium-specific protein and phosphoprotein data, as well as clinical response information from BC patients ( 12 ). From data involving 987 BC patients across 10 trial arms, five distinct subtypes of breast cancer have been established, forming the RPS-5 response-predictive subtyping schema ( 12 ). It was found that if patients were assigned to their optimal treatment based on the RPS-5 classifications, the overall pCR rate would rise to 58% across the nine experimental arms, compared to the original pCR rate of 35% in the same arms and just 19% in the standard-of-care control arm with paclitaxel. In this study, we employed a machine learning (ML) approach to predict the likelihood of pCR in BC patients undergoing neoadjuvant therapy, utilizing gene expression and clinical data from the I-SPY2 trial ( 12 ). Although several studies have used machine learning methods to predict pCR in BC patients following NAT ( 13 - 18 ), gene expression data from the I-SPY2 trial have not been explored in these studies. In our study, we demonstrated that the pCR rate can be significantly enhanced across nine out of ten I-SPY2 arms by leveraging the predictions generated by the ML model. Utilizing these pCR predictions, we then devised a strategy to optimize the number of patients achieving pCR. We also identified specific genes associated with pCR pertinent to each drug combination tested in the experimental arms. Methods Data set Microarray gene expression data of 987 BC patients from 10 arms of I-SPY2 were downloaded from the Gene Expression Omnibus (GEO) with the accession number GSE194040. The clinical data of these patients in Table S2 of ( 12 ) are downloaded from the journal website that hosts the paper. The total number of patients and the number of patients with pCR in each arm and in the four categories, defined by the hormone receptor (HR) and the human epidermal growth factor receptor 2 (HER2), are extracted from the clinical data and summarized in Table 1 . The gene expression data set contains the expression values of 19,134 genes. View this table: View inline View popup Download powerpoint Table 1. Number of patients in each arm of I-SPY2 ( 12 ) After removing genes with missing values, 17,997 genes were left. We then calculated the variance of each gene across 987 samples. The largest variance is 13.72. We ranked the genes according to their variance, and extracted expression values of top 4,000 genes that would be used to train machine learning models for predicting pCR. Of note, the smalles variance of these 4,000 gene is 0.53. The variance of other genes is relatively small, and thus, they may not have much predictive power for pCR and were discarded. The 4,000 genes include HER2 (also known as ERBB2), estrogen receptor 1 and 2 (ESR1 and ESR2), and the progesterone receptor (PGR). Machine Learning Method The statues of HR and HER2 in the clinical data were added to expression values of 4,000 genes as features, which along with the class label of each patient indicating whether the patient achieved pCR were used to train a classifier named XGBoost ( 19 ) for each arm of I-SPY2. Specifically, the gene expression and pCR data from an arm was randomly split into a training set with 80% samples and a test set with 20% samples. The training set was used to train an XGBoost classifier. Five-fold cross validation was carried out to determine the optimal values of several hyper-parameters including the number of trees, the maximum depth of a tree, the learning rate, colsample_bynode, and the l 2 -regularization weight. The value of scale_pos_weight was set to be the ratio of the count of negative instances to the count of positive instances to mitigate the possible problem of data imbalance between the two classes. Finally, an XGBoost classifier was trained on the whole training data set with the optimal values of hyper-parameters. The test data were then applied to the trained model to evaluate the performance of the model. The receiver operating characteristic (ROC) curve, the area under the ROC curve (AUC), the precision-recall (PR) curve, and the area under the PR curve (AUPRC) were calculated as the performance metrics. The class label was predicted based on the class probability. The optimal cutoff probability for predicting the class label was determined using Youden’s J statistics ( 20 ). Specifically, after a model was trained in CV, the validation set was used to obtain a ROC curve, and the optimal cutoff probability was calculated based on the ROC using the Youden’s J statistic. Since five-fold CV was used, the final optimal cutoff probability was the average of the five cutoff values obtained with five validation folds. After class labels were predicted, accuracy, sensitivity, specificity, precision, and recall were calculated for performance evaluation. The procedure of randomly splitting data, training and testing a classifier was repeated 20 times, and the average performance metrics and their standard deviations were calculated. Of note, the test data were never used in model training. Identification and Analysis of Genes Associated with pCR Each trained XGBoost classifier outputs an importance score for each feature. We ranked genes according to their importance scores and each was assigned a rank from one to 4,000. As described earlier, the process of random data splitting and model training was repeated 20 times. Therefore, each gene had 20 ranks. The outlier ranks of each gene were identified and removed. More specifically, the interquartile range (IQR) of the ranks of each gene was calculated, and any rank that is greater than 1.5*IQR was determined as outliers and discarded. The average rank of each gene was calculated, and finally all genes were re-ranked according to their average ranks. Top genes were deemed to have significant influence on pCR. We took the top 100 genes for each arm of ISPY-2 and performed gene ontology (GO) analysis. The set of 100 genes was input to GOrilla, an online tool for GO analysis ( 21 ), as the target set, and similarly the set of all other human protein-coding genes was input to GOrilla as the background set. Gorilla outputs a directed acyclic graph (DAG) of GO terms related to the 100 genes. Any GO term at any leaf node of the DAG with a p-value less than 10 -3 was identified as a term significantly enriched in the gene set. Maximize pCR Since we can predict a patient’s pCR to each drug for which an XGBoost model has been trained, each patient can have personalized treatment by choosing the most suitable drug based on pCR predictions, which may maximize the likelihood of pCR. To test this idea, we set aside 324 patients in the two arms Ctr and Neratinib as test samples, then applied the eight XGBoost models trained for the remaining eight arms to these test samples. Thus, each patient in the test samples had eight probabilities of pCR to the drugs in the eight arms. If the probability for a drug is greater than a threshold, then the drug is a candidate treatment for the patient. The probability threshold was determined as follows. As described earlier, after an XGBoost model was trained, we used the test data to compute a PR curve by calculating precisions and recalls for a set of probability thresholds. In this process, we also obtained a curve of precision versus probability threshold, which is named as the PPT curve. Note that precision is the ratio between the true positive and the predicted positive, which is identical to the pCR rate for the drug. Given a target pCR rate, say 0.7, we can determine the probability threshold for each drug based on its PPT curve. After candidate drugs are determined for a patient, a treatment decision can be made by selecting the drug with the highest probability of pCR or the least potential side effect for the patient. Results ML Models Improve the pCR Rate We trained an XGBoost classifier for each of 10 arms of the I-SPY2 trial to predict pCR. Figure 1 illustrates the ROC curves for these classifiers. The XGBoost models for Pembro, VC, and TDM1/P achieved the highest AUCs, with values of 0.81, 0.79, and 0.79, respectively. Conversely, the model for MK2208 attained the lowest AUC at 0.58, while the AUCs of classifiers for other six arms are between 0.62 and 0.71. Table 2 summarizes the AUC, prediction accuracy, sensitivity, specificity, and F1-score of XGBoost classifiers across all 10 arms of the I-SPY2 trial. Once again, models for Pembro, VC, and TDM1/P achieved the best performance in terms of the F1-score and sensitivity. The model for Ctr demonstrated the lowest F1-score, mainly due to its limited sensitivity. Although it achieved an accuracy of 0.702, which is relatively high, the significant imbalance in sample sizes between the two classes—as shown in Table 1 —suggests that accuracy may not adequately reflect the model’s performance compared to the F1-score. PR curves for the ten XGBoost models are presented in S1 Fig. The models for TDM1/P, Pembro, Pertuzumab, and VC achieved highest AUPRCs of 0.69, 0.64, 0.62, and 0.61, respectively. View this table: View inline View popup Download powerpoint Table 2. Performance of XGBoost classifiers in predicting pCR for 10 I-SPY2 arms Download figure Open in new tab Figure 1. ROC curves of XGBoost classiﬁers for predicting the pCR in ten arms of I-SPY2. Table 3 compares the pCR rate of each I-SPY2 arm, calculated from the data in Table 1 , with the precision of each XGBoost model, which is equivalent to the pCR rate of the arm if patients were predicted by the XGBoost gene expression model to have pCR to a treatment regimen and then treated with that regimen. Using the prediction from the XGBoost models, all arms, except for Pertuzumab, exhibit significantly higher pCR rates than those observed in the I-SPY2 clinical trial, with p-values less than 0.05 (one-sided Wilcoxon signed-rank test). Notably, based on the predictions from the XGBoost models, TDM1/P, Pembrolizumab (Pembro), and Vincristine (VC) demonstrated pCR rates of 0.784, 0.740, and 0.642, respectively, all significantly surpassing the clinical pCR rates of I-SPY2, which were 0.577, 0.449, and 0.380, respectively. The improvements in pCR rates for the nine arms, excluding Pertuzumab, as predicted by the XGBoost models, range from 0.099 to 0.291. View this table: View inline View popup Download powerpoint Table 3. Comparison of pCR rates predicted by XGBoost models and those of I-SPY2 ML Predictions Maximize pCR We applied XGBoost models for the eight arms,excluding Ctr and Neratinib, to the data of the two arms (n=324), Ctr and Neratinib, to predict pCR of each patient to the drugs in the eight arms. Figure 2 presents histograms of the pCR probabilities for each arm. It is important to note that the XGBoost models for Ganitumab, Ganetespib, Vincristine (VC), and Pembrolizumab (Pembro) were trained using data from HER2-patients, and thus were applied to the 228 HER2-samples within the 324 total samples. Similarly, the models for TDM1/P and Pertuzumab, which were trained on HER2+ patient data, were applied to 96 HER2+ samples out of the 324. As illustrated in Figure 2 , a substantial number of patients exhibit high pCR probabilities for Pembro, TDM1/P, Pertuzumab, and VC, while the remaining arms show lower probabilities. This trend aligns with the findings in Table 3 , where the pCR rates for the first four arms are significantly higher than those for the latter four. Download figure Open in new tab Figure 2. Histograms of the pCR probabilities predicted by XGBoost models for 8 I-SPY2 arms. To predict whether a patient achieves pCR based on the probabilities shown in Figure 2 , we first need to establish a probability threshold. A higher probability threshold will result in fewer patients being predicted to achieve pCR, but those predicted will have a higher pCR rate. To identify the appropriate threshold, we utilized the PPT curve from an XGBoost model specifically trained and tested for one of the I-SPY2 arms. Figure 3 illustrates the PPT curves for the eight I-SPY2 arms. For models with strong predictive power, such as those for Pembro and TDM1/P, a relatively low probability threshold can yield a high pCR rate. Conversely, for models with weaker predictive power, such as those for Neratinib, MK2206, Ganitumab, and Ganetespib, a much higher threshold is required to achieve a satisfactory pCR rate. Download figure Open in new tab Figure 3. The pCR rate versus the probability threshold of the XGBoost classiﬁers for the eight I-SPY2 Therefore, based on a target pCR rate, say 0.7, we determined the probability threshold for each arm using its respective PPT curve. We then applied these thresholds to the predicted pCR probabilities from Figure 2 to assess whether a patient would be likely to achieve pCR. Table 4 presents the number of patients predicted to achieve pCR for each arm at several target pCR rates. Taking the union across all arms, we also calculated the total number of patients anticipated to achieve pCR in at least one arm. At a target pCR rate of 0.8, 125 out of 324 patients were predicted to achieve pCR, with an actual count of 100 patients who would attain pCR if treated. Similarly, at a target pCR rate of 0.5, 268 patients were predicted to achieve pCR, of which 134 would realistically be expected to do so. Among the four target pCR rates listed in Table 4 , a rate of 0.6 seems to be optimal, as it results in the highest expected number of patients (147 of 245 patients) to achieve pCR. View this table: View inline View popup Download powerpoint Table 4. The number of patients predicted to achieve pCR at different target pCR rates Genes Associated with pCR We identified genes that are associated with pCR for each of ten I-SPY2 arms based on the importance score of each feature output from the XGBoost model. The top 100 genes for each arm are listed in the first spread sheet of Supplementary file 1. Since many genes appear in multiple arms, there are 220 unique genes across ten arms, among which 48 genes appear in all ten arms as listed in the second spread sheet of Supplementary file 2. This is not surprising, because nine of the ten arms except TDM1/P include paclitaxel. Figure 4 depicts the expression values of top ten genes in each arm. While nine of the ten genes in the Ctr arm are differentially expressed in pCR and non-pCR groups, only one gene in the MK2206 arm is differentially expressed. The expression levels of some genes were associated with pCR in multiple arms, including lower expression of ABDH1 (ctr, amg386, pembro, pertuzumab), AMZ1 (AMG386, VC, pembro), BAIAP3 (ctr, amg386, Ganetespib, pembro, TDM1/P) and SYTL4 (Ctr, AMG386, Neratinib, VC, Pembro) and higher expression of DENND1C (AMG386, neratinib, pembro), HMGB3 (Ganitumab, VC, TDM1/P, Pertuzumab), HMMR (Ganitumab, TDM1/P, Pertuzumab), PLEKHF1(Ctr, AMG386, Ganitumab,VC, Pembro) and RASEF (AMG386, Ganitumab, Ganetespib, VC, Pembro) (Supplemental file 1). Download figure Open in new tab Figure 4. Expression values of top 10 genes associated with pCR in each I-SPY2 arm. The p-values for the comparison between the response and non-response groups that are less than 0.05 are displayed at the bottom of each figure, while those greater than 0.05 are omitted. The Wilcoxon rank-sum test was employed to calculate these p-values. Of note, since the XGBoost model can capture the nonlinear effect of an input feature on the output target, it can identify genes influencing pCR that are not differentially expressed. GO enrichment analysis identified several GO terms enriched in the top 100 genes. These GO terms are presented in the third spread sheet of Supplementary file 2. The following GO terms appear in multiple arms: endothelial cell morphogenesis (GO:0001886), positive regulation of vascular smooth muscle cell proliferation (GO:1904707), regulation of neuroinflammatory response (GO:0150077), exocytosis (GO:0006887), and nucleosome (GO:0000786). The top five genes in the Pembro arm and the TDM1/P arm are all related to tumorigenesis based on the evidences in the literature (Supplemental file 2). Discussion Human BC is generally categorized into four subtypes based on HR and HER2 status: HR+/HER2, HR-/HER2-, HR+/HER2+, HR-/HER2+ ( 22 , 23 ), where the HR status is determined by the expression of ER, PR, or both. The HR/HER2 subtyping effectively predicts responses to endocrine and HER2-targeted agents ( 24 ). However, with the recent emergence of new targeted therapies for BC, which may function through mechanisms beyond those captured by HR and HER2 expression, a novel RPS-5 subtyping has been proposed ( 12 ), which provides better prediction of pCR and may facilitate more effective agent selection for BC treatment. The RPS-5 subtyping relies on five biomarkers: HER2, Immune, DRD, BP-HER2_or_Basal, and BP-Luminal. The Immune and DRD statuses are derived from scores of multiple gene signatures, respectively, while BP-HER2_or_Basal and BP-Luminal classifications are based on BluePrint molecular subtypes ( 12 ). Instead of relying on biomarkers that condense the expression data of multiple genes into a single score, we utilized the expression values of 4,000 genes with the highest variance to train an ML model for predicting pCR. This approach allows us to leverage the comprehensive information within the transcriptome, potentially enhancing prediction accuracy. Our ML models significantly improved the pCR rates for nine out of the ten I-SPY2 arms, except for Pertuzumab. They achieved the highest pCR rates for the Pembro and TDM1/P arms at 0.74 and 0.78, with sensitivities of 0.73 and 0.71, respectively. We also explored two additional ML models, support vector machine and logistic regression with elastic-net regularization, but their predictive performance was inferior to XGBoost. Consequently, their results are not included. We have trained ten XGBoost models to predict pCR for ten drug combinations. If gene expression and pCR data for additional treatments become available, we can similarly train XGBoost models for those therapies. The challenge is effectively utilizing these predictions to select the best drug for individual patients, as the models show varying predictive power shown in Tables 2 and 3 . A fixed probability threshold of 0.5 for pCR classification may not be optimal, especially if multiple drugs predict pCR. To address this, we developed a drug selection strategy based on the PPT curves of the XGBoost classifiers. We identify a probability threshold from the PPT curve that aligns with the desired pCR rate. For patients predicted to achieve pCR with multiple drugs, we can choose the drug with fewer side effects or the highest sensitivity at the threshold. This strategy ensures a target pCR while selecting the most appropriate drug for each patient. While predicting pCR is crucial for determining treatment strategies, identifying genes linked to pCR, is equally important to finding genes as potential targets for intervention for drug development. Our XGBoost models leverage importance scores to assess the influence of individual genes on pCR, pinpointing key genes that significantly impact this outcome. Analysis of the top five genes identified for Prembo and TDM1/P reveals that all involved in tumor development and progression, underscoring the relevance of the genes identified through our XGBoost models. A major limitation of this study is the small sample size. While our XGBoost models demonstrated significantly higher precision, which is equivalent to the pCR rate, compared to the I-SPY2 trial across nine arms excluding Pertuzumab, the standard error in predicted pCR rates remains high due to the limited sample. A larger sample could reduce the standard error, enhancing model robustness and potentially improving predictive power as indicated by metrics like AUC and F1 score. Despite this, the study underscores the potential of ML models trained on gene expression data to provide accurate pCR predictions for BC patients. Data Availability All data produced in the present work are contained in the manuscript. https://github.com/CaixdLab/BRCA_NATpCR Supplemental information Supplemental file1 (xlsx) Supplemental file 2 (word) Author Contributions ZC conceived the study. Both authors contributed to methodology and data analysis. JC wrote the computer programs for implementing the machine learning methods and data visualization. Both authors wrote, reviewed, and approved the manuscript. Funding The authors declare that no funds, grants, or other support were received during the preparation of this manuscript. Competing Interests The authors have no relevant financial or non-financial interests to disclose. Data availability This study used the gene expression and clinical data from the I-SPY2 clinical trial ( 12 ). The microarray gene expression data can be accessed in GEO under the accession number GSE194040. The clinical data are provided in Table S2 of ( 12 ) available on the journal’s website. The computer programs developed for this study are available on GitHub ( https://github.com/CaixdLab/BRCA_NATpCR ), allowing for the replication of all results presented in this paper. Acknowledgement We thank Dr. Xiaodong Cai at the College of Engineering of University of Miami for assistance on methodology and data analysis. Reference 1. ↵ Rastogi P , Anderson SJ , Bear HD , Geyer CE , Kahlenberg MS , Robidoux A , et al. Preoperative chemotherapy: updates of national surgical adjuvant breast and bowel project protocols B-18 and B-27 . Journal of Clinical Oncology . 2008 ; 26 ( 5 ): 778 – 85 . OpenUrl Abstract / FREE Full Text 2. ↵ Chen Y , Shi X-E , Tian J-H , Yang X-J , Wang Y-F , Yang K-H. Survival benefit of neoadjuvant chemotherapy for resectable breast cancer: A meta-analysis . Medicine . 2018 ; 97 ( 20 ): e10634 . OpenUrl PubMed 3. ↵ Potter DA , Herrera-Ponzanelli CA , Hinojosa D , Castillo R , Hernandez-Cruz I , Arrieta VA , et al. Recent advances in neoadjuvant therapy for breast cancer . Faculty reviews . 2021 ; 10 : 2 . OpenUrl PubMed 4. ↵ Graham PJ , Brar MS , Foster T , McCall M , Bouchard-Fortier A , Temple W , et al. Neoadjuvant chemotherapy for breast cancer, is practice changing? A population-based review of current surgical trends . Annals of surgical oncology . 2015 ; 22 ( 10 ): 3376 – 82 . OpenUrl PubMed 5. ↵ Mougalian SS , Soulos PR , Killelea BK , Lannin DR , Abu‐Khalaf MM , DiGiovanna MP , et al. Use of neoadjuvant chemotherapy for patients with stage I to III breast cancer in the United States . Cancer . 2015 ; 121 ( 15 ): 2544 – 52 . OpenUrl CrossRef PubMed 6. ↵ Spring LM , Fell G , Arfe A , Sharma C , Greenup R , Reynolds KL , et al. Pathologic complete response after neoadjuvant chemotherapy and impact on breast cancer recurrence and survival: a comprehensive meta-analysis . Clinical cancer research . 2020 ; 26 ( 12 ): 2838 – 48 . OpenUrl Abstract / FREE Full Text 7. ↵ Bardia A , Baselga J. Neoadjuvant therapy as a platform for drug development and approval in breast cancer . Clinical Cancer Research . 2013 ; 19 ( 23 ): 6360 – 70 . OpenUrl Abstract / FREE Full Text 8. ↵ Cortazar P , Zhang L , Untch M , Mehta K , Costantino JP , Wolmark N , et al. Pathological complete response and long-term clinical benefit in breast cancer: the CTNeoBC pooled analysis . The Lancet . 2014 ; 384 ( 9938 ): 164 – 72 . OpenUrl 9. ↵ Food U , Administration D. Guidance for industry: pathological complete response in neoadjuvant treatment of high-risk early-stage breast cancer: use as an endpoint to support accelerated approval . Silver Spring, MD : US Department of Health and Human Services . 2014 . 10. ↵ Yee D , DeMichele AM , Yau C , Isaacs C , Symmans WF , Albain KS , et al. Association of event-free and distant recurrence–free survival with individual-level pathologic complete response in neoadjuvant treatment of stages 2 and 3 breast cancer: three-year follow-up analysis for the I-SPY2 adaptively randomized clinical trial . JAMA oncology . 2020 ; 6 ( 9 ): 1355 – 62 . OpenUrl PubMed 11. ↵ Wang H , Yee D. I-SPY 2: a neoadjuvant adaptive clinical trial designed to improve outcomes in high-risk breast cancer . Current breast cancer reports . 2019 ; 11 ( 4 ): 303 – 10 . OpenUrl PubMed 12. ↵ Wolf DM , Yau C , Wulfkuhle J , Brown-Swigart L , Gallagher RI , Lee PRE , et al. Redefining breast cancer subtypes to guide treatment prioritization and maximize response: Predictive biomarkers across 10 cancer therapies . Cancer cell . 2022 ; 40 ( 6 ): 609 - 23 . e6. OpenUrl CrossRef PubMed 13. ↵ Lai C-C , Chen C-Y , Chang T-H. Predicting Pathological Complete Response Following Neoadjuvant Therapy in Patients With Breast Cancer: Development of Machine Learning–Based Prediction Models in a Retrospective Study . JMIR cancer . 2025 ; 11 : e64685 . OpenUrl 14. Jing B , Wang K , Schmitz E , Tang S , Li Y , Zhang Y , et al. Prediction of pathological complete response to chemotherapy for breast cancer using deep neural network with uncertainty quantification . Medical physics . 2024 ; 51 ( 12 ): 9385 – 93 . OpenUrl PubMed 15. Jensen M-B , Pedersen C , Misiakou M-A , Talman M-L , Gibson L , Tange U , et al. Multigene profiles to guide the use of neoadjuvant chemotherapy for breast cancer: A Copenhagen Breast Cancer Genomics Study . NPJ Breast Cancer . 2023 ; 9 ( 1 ): 47 . OpenUrl PubMed 16. Zhou Z , Adrada BE , Candelaria RP , Elshafeey NA , Boge M , Mohamed RM , et al. Prediction of pathologic complete response to neoadjuvant systemic therapy in triple negative breast cancer using deep learning on multiparametric MRI . Scientific reports . 2023 ; 13 ( 1 ): 1171 . OpenUrl PubMed 17. Qian B , Yang J , Zhou J , Hu L , Zhang S , Ren M , et al. Individualized model for predicting pathological complete response to neoadjuvant chemotherapy in patients with breast cancer: a multicenter study . Frontiers in Endocrinology . 2022 ; 13 : 955250 . OpenUrl PubMed 18. ↵ Basmadjian RB , Kong S , Boyne DJ , Jarada TN , Xu Y , Cheung WY , et al. Developing a prediction model for pathologic complete response following neoadjuvant chemotherapy in breast cancer: a comparison of model building approaches . JCO Clinical Cancer Informatics . 2022 ; 6 : e2100055 . OpenUrl 19. ↵ Chen T , Guestrin C , editors. Xgboost: A scalable tree boosting system . Proceedings of the 22nd acm sigkdd international conference on knowledge discovery and data mining ; 2016 . 20. ↵ Hassanzad M , Hajian-Tilaki K. Methods of determining optimal cut-point of diagnostic biomarkers with application of clinical data in ROC analysis: an update review . BMC medical research methodology . 2024 ; 24 ( 1 ): 84 . OpenUrl PubMed 21. ↵ Eden E , Navon R , Steinfeld I , Lipson D , Yakhini Z. GOrilla: a tool for discovery and visualization of enriched GO terms in ranked gene lists . BMC bioinformatics . 2009 ; 10 ( 1 ): 48 . OpenUrl CrossRef PubMed 22. ↵ Howlader N , Altekruse SF , Li CI , Chen VW , Clarke CA , Ries LA , et al. US incidence of breast cancer subtypes defined by joint hormone receptor and HER2 status . Journal of the National Cancer Institute . 2014 ; 106 ( 5 ): dju055 . OpenUrl CrossRef PubMed 23. ↵ Harbeck N , Penault-Llorca F , Cortes J , Gnant M , Houssami N , Poortmans P , et al. Breast cancer Nat Rev Dis Primers 5: 66. ed ; 2019 . 24. ↵ Waks AG , Winer EP . Breast cancer treatment: a review . Jama . 2019 ; 321 ( 3 ): 288 – 300 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 14, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Machine Learning for Predicting and Maximizing the Response of Breast Cancer Patients to Neoadjuvant Therapy Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Machine Learning for Predicting and Maximizing the Response of Breast Cancer Patients to Neoadjuvant Therapy Janie Y. Cai , Zhibin Chen medRxiv 2025.10.11.25337587; doi: https://doi.org/10.1101/2025.10.11.25337587 Share This Article: Copy Citation Tools Machine Learning for Predicting and Maximizing the Response of Breast Cancer Patients to Neoadjuvant Therapy Janie Y. Cai , Zhibin Chen medRxiv 2025.10.11.25337587; doi: https://doi.org/10.1101/2025.10.11.25337587 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Oncology Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4412) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15205) Forensic Medicine (30) Gastroenterology (1120) Genetic and Genomic Medicine (6575) Geriatric Medicine (666) Health Economics (994) Health Informatics (4511) Health Policy (1365) Health Systems and Quality Improvement (1608) Hematology (537) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15904) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6573) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1139) Occupational and Environmental Health (954) Oncology (3319) Ophthalmology (968) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (662) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5423) Public and Global Health (9205) Radiology and Imaging (2191) Rehabilitation Medicine and Physical Therapy (1367) Respiratory Medicine (1191) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9fed59af598a58d3',t:'MTc3OTMwMTE3Mw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00