MyGESig: a population-specific gene signature improves survival prediction in Malaysian breast cancer patients

doi:10.1101/2025.08.28.25334111

MyGESig: a population-specific gene signature improves survival prediction in Malaysian breast cancer patients

2025 · doi:10.1101/2025.08.28.25334111

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 83,124 characters · extracted from preprint-html · click to expand

MyGESig: a population-specific gene signature improves survival prediction in Malaysian breast cancer patients | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search MyGESig: a population-specific gene signature improves survival prediction in Malaysian breast cancer patients View ORCID Profile Mohamad Hazwan Khairi , Zhi Lei Wong , Boon Hong Ang , James Phipps-Tan , Putri Nur Fatin , Rajadurai Pathmanathan , See Mee Hoong , Nur Aishah Mohd Taib , Cheng-Har Yip , Weang Kee Ho , Mei Chee Tai , Soo-Hwang Teo , Sok Ching Cheong , Pan Jia-Wern doi: https://doi.org/10.1101/2025.08.28.25334111 Mohamad Hazwan Khairi 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mohamad Hazwan Khairi Zhi Lei Wong 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Boon Hong Ang 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site James Phipps-Tan 2 Friedrich Miescher Laboratory of the Max Planck Society , Tübingen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Putri Nur Fatin 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia 3 Faculty of Science, Institute of Biological Sciences , Universiti Malaya, Kuala Lumpur, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rajadurai Pathmanathan 4 Subang Jaya Medical Centre , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site See Mee Hoong 5 Department of Surgery, Faculty of Medicine, University Malaya , 50603, Kuala Lumpur, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nur Aishah Mohd Taib 5 Department of Surgery, Faculty of Medicine, University Malaya , 50603, Kuala Lumpur, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Cheng-Har Yip 6 Hospital Picaso , No. 110, Jalan Professor Khoo Kay Kim, Seksyen 19, 46300 Petaling Jaya, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Weang Kee Ho 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia 7 School of Mathematical Sciences, Faculty of Science and Engineering, University of Nottingham Malaysia , Semenyih, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mei Chee Tai 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Soo-Hwang Teo 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia 5 Department of Surgery, Faculty of Medicine, University Malaya , 50603, Kuala Lumpur, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sok Ching Cheong 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Pan Jia-Wern 1 Cancer Research Malaysia , No. 1, Jalan SS12/1A, 47500, Subang Jaya, Malaysia Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: jiawern.pan{at}cancerresearch.my Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Accurate prognostic models are essential for guiding treatment decisions and improving patient outcomes in breast cancer. To achieve this, population-specific models are needed to account for genetic, clinical, and pathological differences across populations. In this study, the widely used and freely available PREDICT v3.0 breast cancer prognostic model was first validated in the multiethnic Malaysian Breast Cancer (MyBrCa) cohort to assess its performance. Given its only moderate performance in this population, a machine learning workflow was developed to integrate gene expression and clinical information for classifying patients by their 10-year prognosis. A 77-gene signature, termed MyGESig, was derived from the transcriptomes of 258 MyBrCa patients. Using this signature in combination with clinical variables, an ensemble-based model achieved a median area under the receiver-operator characteristic curve (AUROC) of 0.92 in the hold-out testing set and 0.90 in the independent MyBrCa dataset. While the model exhibited poor generalizability in external cohorts, its discriminative performance improved when trained and tested within the same population (median AUROC: 0.71 in METABRIC; 0.84 in SCAN-B), validating the prognostic value of the gene set. Together, these findings demonstrate the value of incorporating population-specific gene expression datasets into prognosis prediction and highlight the need to develop and validate models tailored to diverse populations in breast cancer. 2 Introduction Prognostic tools are essential in the management of breast cancer as they provide personalized risk assessments to guide treatment decisions, optimize adjuvant therapy selection, and improve survival outcomes while avoiding overtreatment in low-risk patients ( Cardoso et al., 2019 ). One such tool is PREDICT: a widely used, freely available web-based prognostic tool that estimates survival outcomes and treatment benefits (e.g., chemotherapy, endocrine therapy, anti-HER2 therapy) in breast cancer patients by integrating clinicopathological factors such as age, tumor size, grade, and receptor status ( Wishart et al., 2010 ). PREDICT version 3.0 has been released, with the model being refitted on an updated version of its original cohort, now including a larger number of patients, an improved prognosis data for early breast cancer, and a higher proportion of ER-negative cases ( Grootes et al., 2024 ). PREDICT v3.0 has significantly influenced clinical decision-making, particularly in adjuvant therapy planning, and is highly accurate in women of European descent in North America and Europe ( Gray et al., 2018 ; Stabellini et al., 2023 ). Other prognostic tools include genomic signatures such as Oncotype DX, MammaPrint, and Prosigna, which are molecular assays that analyse tumor gene expression patterns to predict recurrence risk and chemotherapy benefit in breast cancer, enabling personalized treatment decisions that reduce overtreatment in low-risk patients while identifying high-risk cases needing aggressive therapy ( Paik et al., 2004 ; Vijver et al., 2002 ; Wallden et al., 2015 ). In women of Asian descent, the accuracy of the PREDICT tool and genomic signatures is lower than that in women of European descent, presumably because of population-specific differences in tumor biology and treatment response. Asian breast cancer patients, for instance, are younger at diagnosis and have a higher proportion of premenopausal, oestrogen receptor (ER) negative and human epidermal growth factor receptor 2 (HER2) receptor-positive cases compared to their Western counterparts ( Yap et al., 2019 ). Furthermore, previous findings revealed that the molecular landscape of Asian breast cancer patients is significantly different from that of Caucasian women, characterized by elevated expression of immune-related gene signatures and higher prevalence of TNBC and HER2-enriched molecular subtype ( Lee et al., 2024 ; Pan et al., 2020 ). Indeed, the earlier versions of PREDICT have been evaluated in Malaysia, India, and Japan, with the reported AUROC for 5-year estimation of overall survival being 0.78, 0.64, 0.71, and compared to 0.78 in women in the USA and 0.80 in the Netherlands ( Nair et al., 2023 ; Stabellini et al., 2023 ; van Maaren et al., 2017 ; Wong et al., 2015 ; Zaguirre et al., 2021 ). However, despite good discrimination overall, poor calibration was observed in certain subgroups across different cohorts, including younger patients (< 40 years old), older patients (≥ 65 years old), and those receiving specific treatment regimens, which likely reflect the differences in patient characteristics compared to the original cohort. The Western-centric development of these models further compounds this issue and contributes to their poorer performance in Asian cohorts. Similarly, when the 21-gene recurrence score (Oncotype DX) predicted recurrence risk in HR+/HER2- patients in Japan and South Korea, a higher proportion of patients were classified as intermediate risk compared to cohorts in North America and Europe ( Kim et al., 2018 ; Toi et al., 2010 ). Given these population-specific differences, several multi-gene assays have been developed specifically for women of East Asian descent. For instance, the GenesWell BCT assay, developed from Korean patient data, was able to classify more patients under 50 years old as low-risk than the Oncotype DX test ( Kwon et al., 2019 ). In Taiwanese women, a 34-gene panel was developed to classify patients into low- and high-risk groups for local/regional recurrence (LRR) after mastectomy ( Cheng et al., 2016 ). Earlier validations of PREDICT in Malaysian populations were conducted on smaller cohorts and relied on an older version of the model (v2.2). In this study, the performance of PREDICT v3.0 was assessed by validating it in the MyBrCa cohort, a large cohort of Malaysian breast cancer patients, including women of Chinese, Malay, and Indian ethnicities. To address the need for a population-specific prognostic model, a machine learning framework was developed that incorporates gene expression and clinical data to classify patients by 10-year prognosis. Through this framework, a 77-gene signature, termed MyGESig, was derived from the transcriptome of 258 MyBrCa patients. The predictive ability of this signature was then evaluated by comparing its discriminative performance across different datasets and assessing its prognostic value. 3 Method 3.1 Study cohort The clinical and gene expression data used in this study were derived from the MyBrCa, a hospital-based case-control study that was started in 2002 in two hospitals, which now includes a total of 6086 cases diagnosed sequentially from October 2002 to December 2016 ( Tan et al., 2018 ). For the validation of PREDICT v3.0, a cohort was defined consisting of patients with complete clinical information required by the PREDICT v3.0 model, specifically tumor grade, age at diagnosis, tumor size, number of positive lymph nodes, and ER status ( Figure 1 ). Download figure Open in new tab Figure 1 Flowchart summarizing cohort selection for PREDICT validation (n = 3875) and machine learning model development (n = 258) HER2 and PR status were not required for inclusion, as the PREDICT model can accommodate unknown values for these markers. Patients with incomplete survival data, including vital status and survival time, were excluded. Survival time is calculated as the difference between the date of diagnosis and the date of last follow-up. Patients with stage 4 disease were also excluded from this cohort. A separate cohort was defined for the development of a machine learning-based prognostic model. This cohort was a subset of the former, selected based on the availability of RNA-seq data. Unlike the PREDICT model, the machine learning model required complete HER2 and PR status, and therefore, patients missing either of these variables were excluded. Since the prognosis prediction task were addressed as a two-class classification task, patients were classified into good or poor prognosis groups using a 10-year survival threshold, as adapted from previous studies ( Bertucci et al., 2002 ; Chen et al., 2012 ; Zhu et al., 2009 ). Specifically, those who survived more than 10 years after diagnosis were considered to have a good prognosis, whereas those who died within 10 years were classified as having a poor prognosis. Accordingly, patients who are still alive but had less than 10 years of follow-up were excluded to ensure accurate classification of prognosis. 3.2 Validation of PREDICT v3.0 in the MyBrCa cohort Survival estimates at 5, 10, and 15 years were computed using the PREDICT v3.0 tool [GitHub ( https://github.com/pengpclab/PREDICTv3 )]. This version of PREDICT v3.0 requires input variables including age at diagnosis, tumor size, number of positive lymph nodes, ER status, tumor grade, mode of detection, and Ki67 status. HER2 status, PR status, smoking status, and treatment-related variables such as chemotherapy, hormone therapy, trastuzumab, and bisphosphonates are optional inputs and can be entered as unknown (coded as 9) when missing. Accounting for any treatments received, the predicted survival was obtained by summing the survival probabilities (post-surgery) and the calculated survival benefit from the treatments, as output by the PREDICT v3.0 model. The performance of PREDICT v3.0 was evaluated based on goodness-of-fit, calibration and discrimination. Goodness-of-fit was assessed graphically by plotting overall survival (calculated via Kaplan-Meier analysis) against the median predicted survival, with predictions grouped into quintiles. Model calibration was assessed by comparing the observed 5-, 10-, and 15-year overall survival (calculated via Kaplan-Meier analysis) versus the median probability of survival estimated by PREDICT v3.0. Model discrimination was assessed by calculating the AUROC curve using the pROC (v1.18.5) package ( Robin et al., 2011 ). Subgroup analysis was performed according to age, ethnicity, stage, tumor grade, clinical subtype, receptor status, and systemic therapy received. 3.3 Gene expression dataset Of the 6086 cases diagnosed sequentially from October 2002 to December 2016, 710 cases had fresh frozen tumor samples surplus to diagnostic requirements collected at surgery ( Pan et al., 2021 ; Pan et al., 2020 ). Only 258 samples were included in this study, based on the availability of complete clinical information and eligibility under the prognosis classification criteria. mRNA extracted from breast tissue samples was subjected to paired-end 75 base pair RNA-Seq on the HiSeq4000 platform (Illumina, San Diego, USA). Details of sample collection, sequencing, and post-sequencing data processing were previously described in Pan et al. (2020) . Clean count data were obtained by filtering out genes with low expression levels (in counts per million, CPM) using the filterByExpr() function from the edgeR R package (v4.0.16) ( Chen et al., 2025 ). A gene was retained if its CPM exceeded a library-size-adjusted threshold in enough samples from at least one group. After filtering, 18,112 genes were retained and were then normalized using the trimmed mean of M-values (TMM) method from the edgeR (v4.0.16) R package. The normalized count data were subjected to voom transformation using the Limma (v3.58.1) R package ( Ritchie et al., 2015 ) to prepare for downstream linear modelling analysis. 3.4 Identification of multigene prognostic signature To reduce the complexity of the expression data and extract the most informative genes, principal component analysis (PCA) was applied to all samples. Principal components (PCs) that together accounted for 80% of the total variance were retained, and the top 30 genes with the highest loading scores from each of these PCs were extracted. This approach assumes that genes whose expression correlates with patient outcome should not be equal in expression across all patients in the dataset. If a gene has nearly the same expression level (or follows the same eigenvector of expression) across all patients, it is unlikely to carry meaningful information about differences in prognosis. By using PCA, we sought to eliminate such genes and retain only genes that contribute the most to variation in expression across the dataset. To select relevant genes that are associated with prognosis, LASSO regression from the glmnet (v4.1-8) R package ( Tay et al., 2023 ) was implemented. LASSO regression applies a penalty term proportional to the absolute value of the coefficients, shrinking their magnitude and thereby reducing the number of features in a model. This improves model performance by shrinking the coefficients of irrelevant features towards zero. To determine the optimal penalization strength, the Akaike Information Criterion (AIC) and the Bayesian Information Criterion (BIC) were used. AIC is a model selection criterion that evaluates the goodness of fit of different models and selects the best one by balancing model fit and complexity. The best-fit model, according to AIC, should explain the data well using the fewest number of independent variables. Similarly, BIC also considers model fit and complexity, but implements a stronger penalty for model complexity, thus favouring simpler models. 3.5 Gene set enrichment analysis Gene ontology (GO) term annotation was performed using the clusterProfiler (v4.10.1) R package ( Wu et al., 2021 ) to identify the biological processes (BP) and molecular functions (MF) terms associated with the gene signature. The overrepresentation of specific GO categories was assessed through gene set enrichment analysis using the gseGO() function. Additionally, pathway-level enrichment was also investigated using the GSEA() function and the hallmark gene set (H) ( Liberzon et al., 2015 ) from the Molecular Signatures Database (MSigDB) ( Subramanian et al., 2005 ). 3.6 Prediction of disease prognosis using machine learning The prognostic model was developed in R (v4.3.3) using the machine learning framework available in the caret (v7.0-1) package ( Kuhn, 2008 ). In brief, the framework consists of an outer loop for dataset splitting and an inner loop that combines pipelines based on Random Forest (RF) and Support Vector Machine (SVM) algorithms to predict the probability of a patient having a good prognosis ( Supplementary Figure 1 ). Three SVM algorithms were used in this framework: linear, radial, and polynomial kernels. The predictions from them were averaged to get a consensus. The outer loop performs five iterations of different random seeds, where each iteration applies a one-fold random stratified split of the data in a 7:3 ratio, resulting in five distinct training and testing datasets. Within the inner loop, each of the distinct training datasets undergoes z-score normalization and collinearity removal before model training. Default tuning parameters for each model were adopted during hyperparameter tuning, with a five-fold cross-validation (CV) employed to search for parameters that maximize the area under the receiver operating characteristic (AUROC) plot. The overall model is composed of five sets of five RF and SVM sub-models, resulting in a total of 25 sub-models for each algorithm. Within each set, the classification probabilities from all algorithms were averaged to achieve a robust prediction. The performance of the model was assessed by calculating the AUROC, and the model with the median AUROC across all hold-out testing datasets was selected as the final model. Other performance metrics, such as the F1 score, precision, recall, and Cohen’s kappa, were calculated using the classification threshold that maximized the Youden’s Index, as determined in the pROC (v1.18.5) package. Youden’s Index was chosen as it balances specificity and sensitivity, making it suitable for this prognostic model where accurate distinction between good and poor prognosis is equally important. 3.7 Validation on independent cohorts In brief, the performance of the model was assessed in hold-out testing datasets and validation datasets from three independent cohorts: a subset of the MyBrCa cohort that was excluded from the initial stratified training-testing split, and two external cohorts (the METABRIC and the SCAN-B cohorts). The independent MyBrCa dataset was established by applying the same 7:3 stratified split used to generate training and testing datasets: the first 30% was retained as an independent validation dataset, while the remaining 70% was used to generate the training and testing datasets ( Supplementary Figure 1 ). The clinical and gene expression data for both external cohorts were obtained from the cBioPortal ( Cerami et al., 2012 ), and the same criteria were applied to exclude patients and classify them based on their 10-year survival. The gene expression datasets of both cohorts were cleaned and normalized as described in their respective publications ( Curtis et al., 2012 ), and the SCAN-B RNA-Seq dataset was voom-transformed to facilitate linear modelling. 3.8 Prognostic evaluation using Cox proportional hazard model and survival analysis Univariable Cox proportional hazard model (using the survival (v3.8-3) package ( Borgan, 2001 )) was used to assess the prognostic value of the machine learning model. This evaluation was performed on the independent MyBrCa dataset to prevent information leakage from the model training step. Predicted prognosis (classified as good or poor) from the machine learning model with median AUROC was used as an independent predictor in the Cox model. For comparison, the prognostic value of PREDICT v3.0 and established clinical factors, including stage, grade, hormone receptor positivity, and HER2 positivity, were also assessed using Cox model. The PREDICT v3.0 overall survival estimates were dichotomized using the classification threshold that maximized the Youden’s index, as determined by the pROC (v1.18.5) package. To assess whether the gene signature provides prognostic information independent of the clinical information, predicted prognosis from a model trained solely on gene expression data was included as a covariate in a multivariable model alongside the clinical factors that were found to be significant in the univariable analyses. The hazard ratios (HR) associated with the univariable models were plotted in a single forest plot using the ggplot2 (v3.5.1) package. Additionally, Kaplan–Meier survival curves were also constructed to compare the survival differences between prognosis groups predicted by the model and PREDICT v3.0. 4 Results 4.1 Cohort description MyBrCa cohort was previously described in Tan et. al. (2018). For validation of PREDICT v 3.0, 3875 patients (out of 6086 patients) with complete clinical information required by the model were included. The median age at diagnosis of this subset of women was 51 years old (inter-quartile range, IQR: 45-60 years) ( Table 1 ). The patients were followed for a median of 7.3 years, during which 709 deaths occurred, accounting for 18.3% of the cohort over the entire follow-up period. This cohort included a high proportion of ER-positive (69.9%) and PR-positive (61.7%) cases, and most of the patients were diagnosed with early-stage disease (Stage 0-2) (81.3%). View this table: View inline View popup Table 1. Baseline clinicopathological characteristics of the MyBrCa, METABRIC, and SCAN-B cohorts used in this study. Reported variables include patient demographics, tumor biology (ER, PR, HER2 status), tumor stage and grade, tumor size, nodal involvement, and treatment details. MyBrCa = Malaysian Breast Cancer; METABRIC = Molecular Taxonomy of Breast Cancer International Consortium; SCAN-B = Sweden Cancerome Analysis Network–Breast; ER = estrogen receptor; PR = progesterone receptor; HER2 = human epidermal growth factor receptor 2. The largest proportion of patients in this cohort was classified as having grade 2 tumors (moderately differentiated; 52.1%), followed by grade 3 (poorly differentiated; 37.9%) and grade 1 (well-differentiated; 9.9%). Although treatment data are optional inputs for the PREDICT model, they were incomplete for most of the patients. Only a limited proportion of patients (10.2% to 11.0%) had available treatment information for adjuvant chemotherapy, adjuvant hormone therapy, or radiation therapy. From this dataset, a subset of 258 patients was selected for prognostic model development ( Table 1 ). The selection was based on the availability of RNA-seq data and complete HER2 and PR receptor status. Only patients who could be classified into good or poor prognosis groups were included. Applying the same exclusion and classification criteria, 2072 patients from the SCAN-B cohort and 758 patients from the METABRIC cohort were selected as external validation cohorts. Notably, most of the patients in the external cohorts were excluded because they were still alive but had not yet reached 10 years of follow-up; therefore, could not be reliably assigned a prognosis group. The METABRIC cohort has the highest median follow-up (11 years) and recorded the highest proportion of deaths (64.6%). The cohorts also differed in age at diagnosis, with MyBrCa being younger at diagnosis (median: 53 years) and SCAN-B being the oldest (median: 70 years). This was reflected in the proportion of patients over 60 at diagnosis, which was the highest in SCAN-B (71.0%) compared to 54.9% in METABRIC and 30.6% in MyBrCa. ER and PR positivity were the highest in SCAN-B, suggesting a larger proportion of hormone receptor-positive tumors, which are generally less aggressive. Consistent with this, a higher proportion of patients in SCAN-B were diagnosed with early stage of disease (55.3% at stage 1 and 40.6% at stage 2) in comparison to METABRIC (33.8% and 57.8%) and MyBrCa (13.6% and 46.5%). Taken together, these differences highlight that the MyBrCa dataset used for model development represents a younger cohort with higher proportion of patients with advanced disease, SCAN-B reflects an older population with higher proportion of hormone receptor–positive and early-stage tumor, while METABRIC being in between with intermediate age at diagnosis and the highest proportion of deaths due to its longer follow-up. Despite these differences, relatively balanced datasets were obtained when these datasets were stratified using a 10-year survival cut-off. This suggests the suitability of the 10-year threshold for model development and evaluation. 4.2 Validation of PREDICT v3.0 in the MyBrCa cohort The 5-,10-, and 15-year overall survival estimates by the Kaplan-Meier (KM) analysis was 89.6%, 79.2%, and 70.8% with a corresponding 95% CI of 88.6%-90.6%, 77.6%-80.8%, and 68.5%-73.3%, respectively ( Table 2 ). In comparison, PREDICT v3.0 estimated higher median survival probabilities of 94.2%, 85.7%, and 77.6% with IQRs of 86.4%-97.2%, 75.4%-92.6%, and 65.0%-86.9%, respectively. The corresponding AUROCs were consistent across the 5-, 10-, and 15- year predictions at approximately 0.69 (95% CI: 0.67–0.72) ( Figure 2A ). Download figure Open in new tab Figure 2 Evaluation of PREDICT’s performance in the MyBrCa cohort. A Receiver operating characteristic (ROC) curve showing the discriminative performance of PREDICT v3.0 for estimating survival at three timepoints (5,10, and 15 years) for 3875 MyBrCa patients. Model performance is quantified by the AUROC. B Calibration plot comparing mean predicted versus observed survival in the full MyBrCa cohort. Predictions were binned into quintiles for visualization. The diagonal line represents perfect calibration performance. For clarity, both axes are cropped to begin at 0.25. C Calibration plots visualizing PREDICT’s calibration in selected subgroups of patients in the MyBrCa cohort. Axes are cropped to begin at 0.2. View this table: View inline View popup Table 2. Comparison of observed 5-, 10-, and 15-year overall survival (OS) with survival probabilities estimated by PREDICT v3.0. Observed OS based on Kaplan–Meier estimates were 89.6 (95% CI: 88.6-90.6), 79.2 (95% CI: 77.6-80.8), and 70.8 (95% CI: 68.5-73.3) at 5, 10, and 15 years, respectively. In contrast, the median PREDICT-estimated survival probabilities for the same time points were 94.2 (IQR: 86.4-97.2), 85.7 (IQR: 75.4-92.6), and 77.6 (IQR: 65-86.9). Results were further stratified by age group, ethnicity, tumor stage, grade, clinical subtype, receptor status, and systemic therapy. Values for 15-year OS in the systemic therapy category were reported as NA due to the absence of patients with 15-year follow-up. Δ% (percent difference) represents the relative difference between KM-derived overall and PREDICT-estimated survival, calculated as [(Observed − Predicted) / Observed] × 100. IQR = interquartile range; CI = confidence interval. When comparing KM-derived overall survival with PREDICT-estimated median survival, PREDICT v3.0 showed good calibration overall, with percent differences not exceeding 10% of the KM-derived values ( Table 2 ). However, across demographics subgroups, poor calibration was observed for specific age subgroups (<36 years, 36–40 years, and 41–50 years) at long-term predictions (10- and 15-year) and across all timepoints for patients of Malay ethnicity. Across clinical subgroups, PREDICT v3.0 demonstrated poor calibration at long-term predictions in hormone receptor–positive (HR+) subtypes, including ER- and PR-positive disease, as well as in HER2-negative disease. Notably, the calibration was poor all timepoints for patients with late- stage cancer (Stage 3). For patients with known treatment information, PREDICT v3.0 predictions were within 10% of KM-derived survival at 5 years, but showed poor calibration at 10 years in patients receiving no systemic therapy, chemotherapy plus hormone therapy, or hormone therapy alone. Notably, predictions were accurate at both time points in three distinct subgroups: patients receiving hormone therapy, patients receiving trastuzumab, and patients receiving radiation therapy. Goodness-of-fit assessment revealed that PREDICT v3.0 overestimated survival in the lowest-risk quintile and underestimated survival in the highest-risk quintile, with deviations more pronounced for long-term 10- and 15-year predictions ( Figure 2B ). Across demographic and clinical subgroups, poor calibration in higher-risk quintiles was particularly evident among patients with late-stage cancer (Stage 3), patients diagnosed under 36 years, patients diagnosed over 60 years, patients of Malay ethnicity (noting the small sample size in the ‘Other’ ethnic group), and in patients with HER2-positive disease ( Figure 2C , Supplementary Figure 2 ). The predictions were closer to observed survival in most quintiles for with triple-negative breast cancer (TNBC) and in patients with ER- and PR-positive disease. For patients with known treatment information, overall survival was generally high, and PREDICT v3.0 estimates reflected this trend, with median survival values closely matching those observed. However, it still underestimated the 10-year survival in patients who received trastuzumab. Taken together, these findings suggest that while PREDICT v3.0 demonstrated moderate discriminative performance, its estimates were poorly calibrated for long-term predictions and specific patient subgroups, particularly those at higher risk, highlighting the potential for misinformed clinical decisions in vulnerable populations. 4.3 Selection of prognostic gene signature in the MyBrCa cohort The selected 258 MyBrCa patients’ global transcriptome dataset comprised 18,112 cleaned expressed genes. The principal component analysis revealed no distinct clustering by clustering, indicating a high degree of overlap in the transcriptomic profiles of the MyBrCa patients in the two prognosis groups ( Supplementary Figure 3A ). After retaining the top 30 genes with the highest loading coefficient from each of the top principal components that cumulatively account for 80% of the total variance (94 PCs), a reduced list of 995 genes was obtained. Since LASSO was applied in the subsequent step, genes were retained from the PCs to allow regularization to operate on a broader set of candidate genes, including those associated with lower variance but potentially prognostic. The cut-off of 30 was selected based on an ’elbow point’ observed during comparisons with other threshold values ( Supplementary Figure 3B ); beyond this threshold, including additional genes yielded minimal gains in the variance captured by each principal component. The final gene list was fitted in a LASSO regression model for multicollinearity reduction ( Supplementary Figure 3C ). The optimal value of the hyperparameter lambda, which dictates the strength of penalization applied to the coefficients in the model, was identified as 0.026 using the AIC. In comparison, the BIC, which enforces a stricter penalization than AIC, selected the optimal value of 0.11 ( Figure 3A ). The final model, consisting of 77 genes with non-zero coefficients, was obtained using the AIC-derived lambda ( Figure 3B ). From this point onwards, this gene signature of 77 genes is referred to as MyGESig. Download figure Open in new tab Figure 3 AIC/BIC-guided lambda selection and retained genes in LASSO. A Plot of AIC and BIC scores across a range of log-transformed lambda values. The optimal lambda was selected based on the minimum values of both criteria. B Bar plot summarizing the non-zero gene coefficients retained in the final LASSO model. GO term classification revealed that these genes were associated with regulation and signalling (GO:0050794, regulation of cellular process; GO:0023052, signalling; GO:0007165, signal transduction), development and differentiation (GO:0007275, multicellular organism development, GO:0030154, cell differentiation), biosynthesis (GO:0044249, cellular biosynthetic process; GO:1901576, organic substance biosynthetic process), metabolism (GO:0043170, macromolecule metabolic process), and cellular organization (GO:0016043, cellular component organization) ( Supplementary Figure 4 ). On the other hand, classification of molecular function categories revealed the association of these genes with functions related to binding (GO:0043169, cation binding; GO:0043168, anion binding; GO:0003676, nucleic acid binding; GO:0005102, signalling receptor binding), transcriptional regulation (GO:0000981, DNA-binding transcription factor activity, RNA polymerase II-specific), receptor activation (GO:0030546, signalling receptor activator activity), and transporter activities (GO:0015318, inorganic molecular entity transmembrane transporter activity; GO:0022803, passive transmembrane transporter activity; GO:1901702, salt transmembrane transporter activity; GO:0015075, monoatomic ion transmembrane transporter activity) ( Supplementary Figure 4 ). Subsequent gene set enrichment analysis using the hallmark gene sets (H) from the MSigDB returned no statistically significant enrichment after multiple testing correction (adjusted p-value < 0.05). Subsequent gene set enrichment analysis using the hallmark gene sets (H) from the MSigDB returned no statistically significant enrichment after multiple testing correction (adjusted p-value < 0.05) (Supplementary Table 1). This indicates that this gene signature is not associated with the hallmark biological pathways represented in the MSigDB. 4.4 Classification of prognosis using the MyGESig gene signature A prognostic model was developed to use MyGESig gene signature and clinical information (age at diagnosis, hormone receptor (ER/PR) status, HER2 status, tumor grade, tumor size, and number of locoregional lymph nodes to which cancer has spread) to classify the MyBrCa patients based on their 10-year prognosis ( Supplementary Figure 1 ). The clinical information was selected based on the key inputs used by the PREDICT v3.0 tool, excluding treatment information and smoking status due to limited data availability. The implementation of this model is based on a classifier described in Sammut et. al. (2020). This adaptation involves a 5- seed stratified split, allocating 70% of the patients for model training and 30% for testing. 5-fold cross-validation was utilized to evaluate the model’s performance to ensure robustness and minimal variability across different training subsets. This classifier trained using both clinical variables and the MyGESig signature is hereafter referred to as the MyBrCa70-Combined model (i.e., trained on 70% of the MyBrCa dataset with both clinical and gene features). For clarity, MyBrCa70 is used as the base model name, with suffixes (e.g., -Clinical, -MyGESig, -Combined) denoting the specific feature set used for training. Utilizing both gene expression and clinical data as features, the MyBrCa70-Combined model achieved a median AUROC of 0.99 (95% CI: 0.98–0.99) and 0.92 (95% CI: 0.87–0.92) across five training and five testing datasets, respectively ( Figure 4A , Supplementary Table 2). Using the probability threshold that maximizes the Youden Index, the model achieved the accuracy, specificity, and sensitivity of 0.88, 0.95, and 0.81, in the testing dataset, respectively ( Table 3 ). Furthermore, the individual algorithms in this model exhibited strong discriminative performance with the AUROC of 0.83 (95% CI: 0.74– 0.83) (random forest) and 0.92 (95% CI: 0.87–0.92) (support vector machine) ( Figure 4B , Supplementary Table 2). Download figure Open in new tab Figure 4 Evaluation of model discriminative performance across different datasets, algorithms and feature sets. A ROC curves showing model performance in classifying MyBrCa patients according to their 10-year prognosis in the 70:30 split for training (left) and testing (right) datasets. The model was trained using the LASSO-derived 77-gene signature, termed MyGESig, combined with clinical information (referred to as MyBrCa70-Combined model). B ROC curves comparing model performance when using different machine learning algorithms (left) and when trained on different combinations of features (right). View this table: View inline View popup Download powerpoint Table 3. Comparison of machine learning model performance trained on 70% of the MyBrCa dataset using different combination of features. Performance metrics were calculated using a probability threshold that maximizes the Youden Index. The performance of the MyGESig gene signature was compared with established Western-based (Mammaprint, Oncotype DX, and PAM50) and the Eastern-based signatures (GenesWell BCT, signature from Cheng et. al. (2016)). AUROC = area under the receiver-operator characteristic curve. The contribution and complementarity of clinical data and MyGESig signature in the MyBrCa70-Combined model was assessed by retraining it with either feature set alone. When trained solely on clinical data (hence referred to as the MyBrCa70-Clinical model), the resulting model achieved a median AUROC of 0.67 (95% CI: 0.55–0.69) ( Figure 4B , Table 3 ), which was marginally lower than PREDICT v3.0’s 10-year survival prediction (AUROC = 0.69). Notably, the clinical variables used for training were the same as those required by the PREDICT model, and both models were validated in datasets that differed markedly in sample size. On the other hand, when trained solely on the MyGESig signature (MyBrCa70-MyGESig model), it demonstrated a stronger discriminative performance, achieving a median AUROC of 0.92 (95% CI: 0.86–0.92). This indicates that transcriptome-derived molecular signals captured by this signature are better at distinguishing between good and poor prognosis groups in the MyBrCa cohort, compared to the clinical variables. When established gene signatures (Mammaprint, Oncotype DX, and PAM50) were used to predict prognosis in the MyBrCa cohort, the discriminative performance dropped significantly, with the median AUROCs of 0.64, 0.60, and 0.61, respectively (95% CIs: 0.52–0.64, 0.47–0.60, and 0.49–0.61) ( Table 3 , Supplementary Figure 5A ). The Eastern-based gene signatures were also tested [GenesWell BCT and the 34- gene signature from Cheng et al. (2016) ], considering that most established signatures are Western-centric, to further evaluate performance in the MyBrCa cohort, achieving median AUROCs of 0.54 (95% CIs: 0.41-0.54) and 0.59 (95% CIs: 0.46-0.59). The poor performance of these gene signatures underlines the limitation of these signatures to capture strong prognosis- related variance in the transcriptome of the MyBrCa cohort. As a means of validating the feature selection approach, the shrinkage of less informative genes was assessed by iterative LASSO modelling was performed across a fine-grained sequence of decreasing lambda values, starting from the AIC-derived value (lambda = 0.026). During each iteration, genes with non-zero coefficients were identified and contrasted against the genes identified in the subsequent iteration, retaining only genes uniquely kept at that specific lambda. For a fair comparison to the 77-gene MyGESig, only gene sets with at least 70 non-overlapping genes were considered. When models were trained using the respective gene sets, performance improved with increasing lambda, indicating that stronger penalization isolates genes that are more informative and can discriminate prognosis better, as reflected in improved AUROC metrics ( Figure 5 ). Download figure Open in new tab Figure 5 Discriminative performance of uniquely retained genes across varying regularization strengths. Box-and-whisker plots summarizing the AUROC scores of machine learning models trained using uniquely retained gene sets derived from LASSO regression at different levels of penalization (represented by lambda). Gene sets retained at higher lambda values represent more strongly penalized, and potentially more informative, subsets. The gene set at lambda = 2.57e-2 corresponds to the MyGESig signature. Considering the MyBrCa70 models, namely the MyGESig and Combined variants, were trained using the 77 genes identified from the entire machine learning subset (n = 258), there could be information leakage, potentially leading to overfitting. To rule this out, the models were retrained on a reduced subset of training data and evaluated in the independent MyBrCa dataset by applying the stratified split twice and retaining the first 30% split as an independent dataset ( Supplementary Figure 1 ). When the resulting dataset of 77 patients was used for validation, the newly derived models (hereafter referred to as the IndMyBrCa30 models) achieved median AUROCs of 0.74, 0.84, and 0.90 when using clinical data, gene signatures, or both, respectively (95% CIs: 0.63–0.74, 0.76–0.84, and 0.83–0.90) ( Figure 6A , Supplementary Table 2). Download figure Open in new tab Figure 6 Discriminative performance of MyGESig gene signature across independent datasets. A ROC curves summarizing model performance in external validation datasets, including METABRIC, SCAN-B, and an independent MyBrCa cohort. The model was trained solely on gene expression from the MyBrCa dataset. B ROC curves comparing the performance of different SCANB70 models when evaluated on both its hold-out testing set (30% of the SCAN-B dataset) and the MyBrCa dataset. Models were trained using gene signatures, clinical information, or a combination of both. To further assess the generalizability and reliability of the MyBrCa70 models, the SCAN-B and METABRIC cohorts were used as external validation cohorts. Models used in each validation were different as the MyBrCa70 models had to be retrained in the MyBrCa cohort, since the validation datasets varied in how certain clinical data were recorded, and only a subset of genes from the identified gene signature were present in these datasets. Specifically, the number of positive lymph nodes was recorded as a range in SCAN-B and was converted to a numeric value by taking the lower bound of the reported range. The microarray gene expression data from METABRIC only included 57 genes from the 77-gene signature, whereas the RNA-Seq dataset from SCAN-B included 59 genes. After retraining, the corresponding models performed poorly across all feature sets, with the median AUROC of 0.65, 0.52, and 0.66 (95% CIs: 0.60–0.64, 0.48–0.52, and 0.62–0.66) in METABRIC and 0.67, 0.53, and 0.62 (95% CIs: 0.64–0.67, 0.50–0.53, and 0.61–0.63) in SCAN-B, for when using clinical data, gene signatures, or both, respectively ( Figure 6A , Supplementary Table 3). Given the possible dataset-specific biases between the training and validation cohorts that can prevent machine learning model from capturing robust features that generalize beyond the training cohort, the respective external cohorts were used for model training (under the same 70:30 split; hereafter referred to as the SCANB70 and METABRIC70 models). The model’s discriminative performance improved for SCANB70, attaining the median AUROC of 0.85, 0.76, and 0.82 (95% CIs: 0.81–0.84, 0.72–0.76, and 0.78–0.82) when trained using clinical data, gene signatures, or both, respectively ( Figure 6B , Supplementary Table 4). While a good discriminative performance for the clinical data suggests their contribution to the high AUROC in the combined model (SCANB70-Combined), the AUROC of 0.76 achieved by the SCANB70-MyGESig model in the hold-out SCAN-B testing dataset suggests that MyGESig gene set can moderately discriminate between prognosis groups when trained with cohort-specific data. In contrast, the same SCANB70 models performed poorly in the MyBrCa cohort, with the AUROC of 0.61, 0.60, and 0.54 (95% CIs: 0.54–0.61, 0.53–0.60, and 0.47–0.54), highlighting the poor generalizability of models trained with individual cohort data across cohorts. While moderate AUROC was still achieved in the METABRIC70-Combined model (0.71, 95% CI: 0.65-0.71), the METABRIC70-MyGESig demonstrated a poor discriminative performance when trained and tested in METABRIC (0.63, 95% CI: 0.58-0.63) ( Supplementary Figure 5A ). The weaker performance in this cohort is likely attributed to the difference in the platform used to measure gene expression (microarray vs RNA-seq). Taken together, these findings suggest that while the MyBrCa-trained model shows limited generalizability, MyGESig retains moderate discriminative ability, as demonstrated in the large SCAN-B dataset. 4.5 Assessment of the prognostic value of the MyGESig signature Kaplan-Meier survival analysis comparing the IndMyBrCa30 models with PREDICT v3.0 revealed that both predictors were strongly associated with overall survival (p<0.0001), with the machine learning model demonstrating better separation of prognostic groups ( Figure 7A ). The hazard ratios (HRs) observed from Cox proportional hazard analysis for the IndMyBrCa30 models, PREDICT v3.0, as well as established clinical factors known for influencing overall survival, were summarized in a forest plot in Figure 7B . The results revealed that classification from both the IndMyBrCa30 models and PREDICT v3.0 was significantly associated with increased mortality, with the IndMyBrCa30-Combined model showing the strongest association (HR = 27.43, p-value < 0.01). Grade and HER2 positivity were not significantly associated with patient outcomes, whereas stage and HR positivity were significant. To demonstrate that the MyGESig signature has a prognostic value independent of clinical status, the IndMyBrCa30-MyGESig model was fit in a multivariable Cox model, controlling for stage and HR positivity. MyGESig remained significantly associated with patient outcomes (HR = 23.06, p-value < 0.001) ( Table 4 ). Download figure Open in new tab Figure 7 Evaluation of prognostic value of MyGESig compared to PREDICT v3.0 for the MyBrCa cohort. A Kaplan-Meier survival curves stratified by predicted prognosis; left, predictions generated by the prognostic model; right, predictions generated by PREDICT v3.0. B Forest plot summarizing hazard ratios (HRs) and 95% confidence intervals (CIs) derived from univariate Cox proportional hazards models for PREDICT, the IndMyBrCa30 models, and established clinical factors. Models trained on MyGESig, clinical information, or both were evaluated in the Cox framework. View this table: View inline View popup Download powerpoint Table 4. Multivariable Cox proportional hazards model results. Hazard ratios (HRs) with 95% confidence intervals (CIs) and p-values are shown for tumor stage, hormone receptor (HR) status, and classes assigned by the IndMyBrCa30-MyGESig model. 5 Discussion In this study, the latest version of PREDICT v3.0 was validated in the MyBrCa cohort, and a prognostic gene signature was developed using transcriptomic profiles from the multi-ethnic Southeast Asian population. This gene signature, termed MyGESig, outperformed other established Western-derived gene signatures when used as input features in a machine learning-based model. Despite its poor cross-cohort performance, MyGESig demonstrated moderate-to-good discriminative performance when the models were trained and tested in the same dataset. Additionally, Cox proportional hazard analysis further confirmed the prognostic utility of this gene signature, with MyGESig remaining significantly associated with overall survival, independent of other established prognostic factors. Hence, given the moderate performance of PREDICT v3.0 in the MyBrCa cohort, this study highlights the prognostic value of a population-specific gene signature for improving prognostication in Malaysian breast cancer patients. Compared to previous studies validating PREDICT v3.0 in Malaysian cohorts ( Nik Ab Kadir et al., 2023 ; Wong et al., 2015 ), this study validated the most recently released version of PREDICT v3.0 using the largest sample size reported to date. Overall, PREDICT v3.0 slightly overestimated survival probabilities, with discrepancy from observed values increased in patients with lower overall survival and at longer time points (10- and 15-year predictions). The largest discrepancies in the observed versus estimated survival were found among patients with late-stage disease (Stage 3) and younger breast cancer patients—findings that were also corroborated by one of the earlier Malaysian-based studies ( Wong et al., 2015 ). The PREDICT v3.0 tool has been previously validated in other populations, with varying calibration and discrimination performance ( Magário et al., 2022 ; Nair et al., 2023 ; Stabellini et al., 2023 ; Wang et al., 2023 ; Zaguirre et al., 2021 ). These discrepancies have been attributed to differences in patient characteristics and clinical practices between the validation populations and the original population in which PREDICT v3.0 was developed. The poor performance during cross-cohort validation of the models trained on MyBrCa does not necessarily indicate poor generalizability of the MyGESig gene set, as the gene set demonstrated moderate discriminative performance when trained and tested within the large SCAN-B dataset. Instead, it is the trained prognostic models (e.g., MyBrCa70-Combined and MyBrCa70-MyGESig), which incorporates the probabilistic RF and SVM algorithms, that may be susceptible to transferability issues. One potential reason for this lack of generalizability is the difference in patient demographics and characteristics between MyBrCa and the external datasets, leading to variation in both clinical features and gene expression that prevents the model from capturing a shared intrinsic pattern across both datasets ( Lasko et al., 2024 ). A few solutions have been proposed to address this problem, including developing the model based on pooled cohort data. However, this will compromise the model’s performance in MyBrCa as a trade-off for generalizability. A more sophisticated approach is to separate the model into two components: a process model that identifies cohort-specific patterns to infer latent variables, and a disease model that uses these variables to make predictions ( Lasko et al., 2024 ). While promising, this approach is beyond the scope of the current study and could be explored in future work. As with the findings of this study, it can be argued that good performance in external validation may not indicate universal applicability ( Van Calster et al., 2023 ). Thus, this prognostic model may still have utility in the Malaysian population, especially considering the moderate performance of PREDICT. Hence, future work should focus on validating this model in the Malaysian population, particularly from other geographical regions. This study has several limitations. First, the MyBrCa cohort is collected from participating private hospitals located in the urban region of Malaysia and includes a relatively high proportion of women of Chinese ethnicity compared to other ethnicities. Hence, this cohort may not fully represent the overall Malaysian population, particularly in the rural areas. Second, the assessment of PREDICT’s calibration in MyBrCa is limited by the high proportion of patients in the dataset with high overall survival; therefore, calibration cannot be reliably assessed at lower quintiles, particularly among patients with below 50% overall survival. Additionally, KM’s estimation of observed survival may be affected by the high proportion of censoring in this cohort. Third, the gene signatures used for comparison —MammaPrint, Oncotype DX, and PAM50 as well as Eastern-based signatures such as GenesWell BCT and the signature from Cheng et al. (2016 )—do not fully represent the actual commercial molecular assays (MammaPrint, Oncotype DX, Prosigna and GenesWell BCT). These signatures likely perform better when used within their prognostic models, which require data generated from their respective assays. Fourth, the independent MyBrCa dataset used for machine learning validation and Cox analysis consists of 77 patients who were excluded from the 70:30 split used for training and testing datasets. Hence, a larger sample size is needed to produce more robust estimates of AUROC and HR. Fifth, the machine learning models developed in this study may be subject to probability miscalibration, which could affect the accuracy of the predicted risk estimates and limit their clinical interpretability. As a potential future direction, post hoc calibration techniques such as Platt scaling or isotonic regression could be applied to improve the reliability of these probability estimates. Taken together, the study presents a prognostic model that achieved 0.92 AUROC in predicting 10-year survival status in Malaysian breast cancer patients. The MyGESig signature has demonstrated both discriminative and prognostic value while complementing clinical information well in a machine learning setup. However, further development and validation within the Malaysian population are necessary to determine whether this model can perform reliably in real-world clinical settings. Data Availability The whole exome sequencing and RNA-seq data included in this study are available in the European Genome-phenome Archive under Accession nos. EGAS00001006518 and EGAS00001004518. Access to controlled patient data will require the approval of the Data Access Committee. Further information is available from the corresponding author upon request. 6 Author Contributions M.H.F.K. and J.-W.P. co-led the data analysis. M.H.F.K. wrote the manuscript. J.-W.P., S.C.C., S.-H.T., B.H.A., M.C.T., and W.K.H. drafted and reviewed the manuscript. J.-W.P., S.C.C., S.-H.T., B.H.A., M.C.T., and W.K.H. contributed to data analysis and interpretation. N.F.P. and J.P.-T. contributed to data analysis and drafted the manuscript. P.H., S.M.H., N.A.M.T., and C.-H.Y. contributed to sample collection and processing and data collection. Z.L.W. processed and cleaned the clinical data. The study was directed and co-supervised by S.C.C and J.-W.P. All authors read and approved the final version of the paper. 8 Conflict of interest The authors declare no conflict of interest. 9 Ethics declaration Patient recruitment and sample collection was reviewed and approved by the Independent Ethics Committee, Ramsay Sime Darby Health Care (Reference nos: 201109.4 and 201208.1), as well as the Medical Ethics Committee of the University Malaya Medical Centre (Reference no: 842.9). Written informed consent to participation in research was given by each individual patient. 10 Data availability The whole exome sequencing and RNA-seq data included in this study are available in the European Genome-phenome Archive under Accession nos. EGAS00001006518 and EGAS00001004518. Access to controlled patient data will require the approval of the Data Access Committee. Further information is available from the corresponding author upon request. Supplementary Figures Download figure Open in new tab Supplementary Figure 1 Diagram summarizing the development of the prognostic model. For validation on the independent MyBrCa cohort, 30% of the cohort was held out as an independent dataset, while the remaining 70% was used for model training and testing, following the same 7:3 stratified split. Note that only the model used for this independent validation was trained on the reduced second-split dataset; all other models were trained using the original single-split dataset. Download figure Open in new tab Supplementary Figure 2 Evaluation of PREDICT’s performance across different subgroups of patients in the MyBrCa cohort. Calibration plots comparing the predicted versus observed survival in various subgroups of patients in the MyBrCa cohort. Predictions were binned into quintiles for visualization. The diagonal line represents perfect calibration performance. Only subgroups of known clinical variables were selected for visualization. Calibration at the 15-year timepoint could not be computed for the treatment subgroups, due to the absence of patients with 15-year follow-up. Download figure Open in new tab Supplementary Figure 3 Dimensionality reduction in the transcriptome dataset for subsequent feature selection. A Principal component analysis (PCA) plot of the transcriptomic dataset for 258 MyBrCa patients, coloured by prognosis groups. No clear clustering by prognosis was observed. The top 30 genes with the highest absolute loadings in each principal component were retained for further analysis. B Distribution of absolute loading values across all principal components at different cut-off points, with genes ranked in descending order of absolute loading. C Plot of LASSO regression coefficients against log-transformed lambda values, illustrating the shrinkage of coefficients to zero as lambda increases. The final lambda value selected was 0.026. Download figure Open in new tab Supplementary Figure 4 Gene Ontology (GO) annotations for genes in the MyGESig signature. Bar plots showing the top 30 most frequently assigned GO terms in the MyGESig gene signature. No term was significantly enriched in the gene set. Download figure Open in new tab Supplementary Figure 5 Comparison of model performance across different gene signatures and training datasets. A ROC curves comparing the performance of models when established Western-based(Mammaprint, Oncotype DX, and PAM50) (left) and Eastern-based gene signatures (GenesWell BCT and signature from Cheng et. al. (2016)) were used as features. B ROC curves comparing the performance of different METABRIC70 models when evaluated on both its hold-out testing set (30% of the METABRIC dataset) and the MyBrCa dataset. Models were trained using gene signatures, clinical information, or a combination of both. 7 Acknowledgements Cancer Research Malaysia receives charitable funding from the Scientex Foundation, Estée Lauder Companies, Vistage Malaysia, Yayasan PETRONAS, and Yayasan Sime Darby which contributed to the funding of this study. Funding was also provided by a research grant from the Newton-Ungku Omar Fund (MRC Ref: MR/P012442/1) to SFC and SHT. OMR, CC, and SFC also receive funding from Cancer Research UK. All genomics work was undertaken by the Genomics Core Facility CRUK Cambridge Institute. References ↵ Bertucci , F. , Nasser , V. , Granjeaud , S. , Eisinger , F. , Adelaïde , J. , Tagett , R. , Loriod , B. , Giaconia , A. , Benziane , A. , Devilard , E. , Jacquemier , J. , Viens , P. , Nguyen , C. , Birnbaum , D. , & Houlgatte , R . ( 2002 ). Gene expression profiles of poor-prognosis primary breast cancer correlate with survival . Human Molecular Genetics , 11 ( 8 ), 863 – 872 . doi: 10.1093/hmg/11.8.863 OpenUrl CrossRef PubMed Web of Science ↵ Borgan , Ø . ( 2001 ). Modeling Survival Data: Extending the Cox Model . Terry M. Therneau and Patricia M. Grambsch, Springer-Verlag , New York , 2000. No. of pages: xiii + 350. Price: $69.95. ISBN 0-387-98784-3. Statistics in Medicine , 20 ( 13 ), 2053 - 2054 . doi: 10.1002/sim.956 OpenUrl CrossRef ↵ Cardoso , F. , Kyriakides , S. , Ohno , S. , Penault-Llorca , F. , Poortmans , P. , Rubio , I. T. , Zackrisson , S. , & Senkus , E . ( 2019 ). Early breast cancer: ESMO Clinical Practice Guidelines for diagnosis, treatment and follow-up † . Annals of Oncology , 30 ( 8 ), 1194 – 1220 . doi: 10.1093/annonc/mdz173 OpenUrl CrossRef PubMed ↵ Cerami , E. , Gao , J. , Dogrusoz , U. , Gross , B. E. , Sumer , S. O. , Aksoy , B. A. , Jacobsen , A. , Byrne , C. J. , Heuer , M. L. , Larsson , E. , Antipin , Y. , Reva , B. , Goldberg , A. P. , Sander , C. , & Schultz , N . ( 2012 ). The cBio Cancer Genomics Portal: An Open Platform for Exploring Multidimensional Cancer Genomics Data . Cancer Discovery , 2 ( 5 ), 401 – 404 . doi: 10.1158/2159-8290.Cd-12-0095 OpenUrl Abstract / FREE Full Text ↵ Chen , H.-C. , Kodell , R. L. , Cheng , K. F. , & Chen , J. J . ( 2012 ). Assessment of performance of survival prediction models for cancer prognosis . BMC Medical Research Methodology , 12 ( 1 ), 102 . doi: 10.1186/1471-2288-12-102 OpenUrl CrossRef PubMed ↵ Chen , Y. , Chen , L. , Lun , Aaron T. L. , Baldoni , Pedro L. , & Smyth , Gordon K . ( 2025 ). edgeR v4: powerful differential analysis of sequencing data with expanded functionality and improved support for small counts and larger datasets . Nucleic Acids Research , 53 ( 2 ). doi: 10.1093/nar/gkaf018 OpenUrl CrossRef ↵ Cheng , S. H. , Horng , C.-F. , Huang , T.-T. , Huang , E. S. , Tsou , M.-H. , Shi , L.-S. , Yu , B.-L. , Chen , C.-M. , & Huang , A. T . ( 2016 ). An Eighteen-Gene Classifier Predicts Locoregional Recurrence in Post-Mastectomy Breast Cancer Patients . eBioMedicine , 5 , 74 – 81 . doi: 10.1016/j.ebiom.2016.02.022 OpenUrl CrossRef PubMed ↵ Curtis , C. , Shah , S. P. , Chin , S.-F. , Turashvili , G. , Rueda , O. M. , Dunning , M. J. , Speed , D. , Lynch , A. G. , Samarajiwa , S. , Yuan , Y. , Gräf , S. , Ha , G. , Haffari , G. , Bashashati , A. , Russell , R. , McKinney , S. , Caldas , C. , Aparicio , S. , Curtis† , C. , . . . Data analysis, s. ( 2012 ). The genomic and transcriptomic architecture of 2,000 breast tumours reveals novel subgroups . Nature , 486 ( 7403 ), 346 – 352 . doi: 10.1038/nature10983 OpenUrl CrossRef PubMed Web of Science ↵ Gray , E. , Marti , J. , Brewster , D. H. , Wyatt , J. C. , Hall , P. S. , & the, S. A. G . ( 2018 ). Independent validation of the PREDICT breast cancer prognosis prediction tool in 45,789 patients using Scottish Cancer Registry data . British Journal of Cancer , 119 ( 7 ), 808 – 814 . doi: 10.1038/s41416-018-0256-x OpenUrl CrossRef PubMed ↵ Grootes , I. , Wishart , G. C. , & Pharoah , P. D. P . ( 2024 ). An updated PREDICT breast cancer prognostic model including the benefits and harms of radiotherapy . npj Breast Cancer , 10 ( 1 ), 6 . doi: 10.1038/s41523-024-00612-y OpenUrl CrossRef PubMed ↵ Kim , J.-M. , Ryu , J. M. , Kim , I. , Choi , H. J. , Nam , S. J. , Kim , S. W. , Yu , J. , Lee , S. K. , & Lee , J. E . ( 2018 ). Verification of a Western Nomogram for Predicting Oncotype DX™ Recurrence Scores in Korean Patients with Breast Cancer . J Breast Cancer , 21 ( 2 ), 222 – 226 . doi: 10.4048/jbc.2018.21.2.222 OpenUrl CrossRef PubMed ↵ Kuhn , M . ( 2008 ). Building Predictive Models in R Using the caret Package . Journal of Statistical Software , 28 ( 5 ), 1 – 26 . doi: 10.18637/jss.v028.i05 OpenUrl CrossRef PubMed ↵ Kwon , M. J. , Lee , J. E. , Jeong , J. , Woo , S. U. , Han , J. , Kang , B.-i. , Kim , J.-E. , Moon , Y. , Lee , S. B. , Lee , S. , Choi , Y.-L. , Kwon , Y. , Song , K. , Gong , G. , & Shin , Y. K. ( 2019 ). Comparison of GenesWell BCT Score With Oncotype DX Recurrence Score for Risk Classification in Asian Women With Hormone Receptor-Positive, HER2-Negative Early Breast Cancer [Original Research] . Frontiers in Oncology , Volume 9 - 2019. doi: 10.3389/fonc.2019.00667 OpenUrl CrossRef ↵ Lasko , T. A. , Strobl , E. V. , & Stead , W. W . ( 2024 ). Why do probabilistic clinical models fail to transport between sites . npj Digital Medicine , 7 ( 1 ), 53 . doi: 10.1038/s41746-024-01037-4 OpenUrl CrossRef PubMed ↵ Lee , J. Y. , Lee , J. W. , Chung , M. S. , Choi , J. G. , Sim , S. H. , Kim , H. J. , Kim , J. E. , Lee , K. E. , Park , Y. H. , Kang , M. J. , Ahn , M. S. , Chae , Y. S. , Park , J. H. , Kim , J. H. , Kim , G. M. , Byun , J. H. , Park , K. U. , Kim , J. W. , Jung , S. P. , . . . Park , K. H. ( 2024 ). Age- and ethnic-driven molecular and clinical disparity of East Asian breast cancers . BMC Medicine , 22 ( 1 ), 422 . doi: 10.1186/s12916-024-03638-y OpenUrl CrossRef PubMed ↵ Liberzon , A. , Birger , C. , Thorvaldsdóttir , H. , Ghandi , M. , Mesirov , Jill P. , & Tamayo , P . ( 2015 ). The Molecular Signatures Database Hallmark Gene Set Collection . Cell Systems , 1 ( 6 ), 417 – 425 . doi: 10.1016/j.cels.2015.12.004 OpenUrl CrossRef PubMed ↵ Magário , M. B. , Santos , R. R. D. , Teixeira , L. A. , Tiezzi , D. G. , Pimentel , F. F. , Carrara , H. H. A. , Andrade , J. M. , & Reis , F . ( 2022 ). Validation of the online PREDICT tool in a cohort of early breast cancer in Brazil . Braz J Med Biol Res , 55 , e12109 . doi: 10.1590/1414-431X2022e12109 OpenUrl CrossRef PubMed ↵ Nair , N. S. , Kothari , B. , Gupta , S. , Kanann , S. , Vanmali , V. , Hawaldar , R. , Tondare , A. , Siddique , S. , Parmar , V. , Joshi , S. , & Badwe , R . ( 2023 ). Validation of PREDICT Version 2.2 in a Retrospective Cohort of Indian Women With Operable Breast Cancer . JCO Global Oncology ( 9 ), e2300114. doi: 10.1200/go.23.00114 OpenUrl CrossRef ↵ Nik Ab Kadir , M. N. , Mohd Hairon , S. , Ab Hadi , I. S. , Yusof , S. N. , Muhamat , S. M. , & Yaacob , N. M. ( 2023 ). A Comparison between the Online Prognostic Tool PREDICT and myBeST for Women with Breast Cancer in Malaysia . Cancers , 15 ( 7 ), 2064 . https://www.mdpi.com/2072-6694/15/7/2064 OpenUrl PubMed ↵ Paik , S. , Shak , S. , Tang , G. , Kim , C. , Baker , J. , Cronin , M. , Baehner , F. L. , Walker , M. G. , Watson , D. , Park , T. , Hiller , W. , Fisher , E. R. , Wickerham , D. L. , Bryant , J. , & Wolmark , N . ( 2004 ). A Multigene Assay to Predict Recurrence of Tamoxifen-Treated, Node-Negative Breast Cancer . New England Journal of Medicine , 351 ( 27 ), 2817 – 2826 . doi: 10.1056/NEJMoa041588 OpenUrl CrossRef PubMed Web of Science ↵ Pan , J.-W. , Zabidi , M. M. A. , Chong , B.-K. , Meng , M.-Y. , Ng , P.-S. , Hasan , S. N. , Sandey , B. , Bahnu , S. , Rajadurai , P. , Yip , C.-H. , Rueda , O. M. , Caldas , C. , Chin , S.-F. , & Teo , S.-H . ( 2021 ). Germline APOBEC3B deletion increases somatic hypermutation in Asian breast cancer that is associated with Her2 subtype, PIK3CA mutations and immune activation . International Journal of Cancer , 148 ( 10 ), 2489 – 2501 . doi: 10.1002/ijc.33463 OpenUrl CrossRef ↵ Pan , J.-W. , Zabidi , M. M. A. , Ng , P.-S. , Meng , M.-Y. , Hasan , S. N. , Sandey , B. , Sammut , S.-J. , Yip , C.-H. , Rajadurai , P. , Rueda , O. M. , Caldas , C. , Chin , S.-F. , & Teo , S.-H . ( 2020 ). The molecular landscape of Asian breast cancers reveals clinically relevant population-specific differences . Nature Communications , 11 ( 1 ), 6433 . doi: 10.1038/s41467-020-20173-5 OpenUrl CrossRef PubMed ↵ Ritchie , M. E. , Phipson , B. , Wu , D. , Hu , Y. , Law , C. W. , Shi , W. , & Smyth , G. K . ( 2015 ). limma powers differential expression analyses for RNA-sequencing and microarray studies . Nucleic Acids Research , 43 ( 7 ), e47 – e47 . doi: 10.1093/nar/gkv007 OpenUrl CrossRef PubMed ↵ Robin , X. , Turck , N. , Hainard , A. , Tiberti , N. , Lisacek , F. , Sanchez , J.-C. , & Müller , M . ( 2011 ). pROC: an open-source package for R and S+ to analyze and compare ROC curves . BMC Bioinformatics , 12 ( 1 ), 77 . doi: 10.1186/1471-2105-12-77 OpenUrl CrossRef PubMed Sammut , S.-J. , Crispin-Ortuzar , M. , Chin , S.-F. , Provenzano , E. , Bardwell , H. A. , Ma , W. , Cope , W. , Dariush , A. , Dawson , S.-J. , Abraham , J. E. , Dunn , J. , Hiller , L. , Thomas , J. , Cameron , D. A. , Bartlett , J. M. S. , Hayward , L. , Pharoah , P. D. , Markowetz , F. , Rueda , O. M. , . . . Caldas , C. ( 2022 ). Multi-omic machine learning predictor of breast cancer therapy response . Nature , 601 ( 7894 ), 623 – 629 . doi: 10.1038/s41586-021-04278-5 OpenUrl CrossRef PubMed ↵ Stabellini , N. , Cao , L. , Towe , C. W. , Miller , M. E. , Sousa-Santos , A. H. , Amin , A. L. , & Montero , A. J . ( 2023 ). Validation of the PREDICT Prognostication Tool in US Patients With Breast Cancer . Journal of the National Comprehensive Cancer Network , 21 ( 10 ), 1011 – 1019 .e1016. doi: 10.6004/jnccn.2023.7048 OpenUrl CrossRef PubMed ↵ Subramanian , A. , Tamayo , P. , Mootha , V. K. , Mukherjee , S. , Ebert , B. L. , Gillette , M. A. , Paulovich , A. , Pomeroy , S. L. , Golub , T. R. , Lander , E. S. , & Mesirov , J. P . ( 2005 ). Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles . Proceedings of the National Academy of Sciences , 102 ( 43 ), 15545 – 15550 . doi: 10.1073/pnas.0506580102 OpenUrl Abstract / FREE Full Text ↵ Tan , M.-M. , Ho , W.-K. , Yoon , S.-Y. , Mariapun , S. , Hasan , S. N. , Lee , D. S.-C. , Hassan , T. , Lee , S.-Y. , Phuah , S.-Y. , Sivanandan , K. , Ng , P. P.-S. , Rajaram , N. , Jaganathan , M. , Jamaris , S. , Islam , T. , Rahmat , K. , Fadzli , F. , Vijayananthan , A. , Rajadurai , P. , . . . Teo , S.-H. ( 2018 ). A case-control study of breast cancer risk factors in 7,663 women in Malaysia . PLOS ONE , 13 ( 9 ), e0203469 . doi: 10.1371/journal.pone.0203469 OpenUrl CrossRef PubMed ↵ Tay , J. K. , Narasimhan , B. , & Hastie , T . ( 2023 ). Elastic Net Regularization Paths for All Generalized Linear Models . Journal of Statistical Software , 106 ( 1 ), 1 – 31 . doi: 10.18637/jss.v106.i01 OpenUrl CrossRef PubMed ↵ Toi , M. , Iwata , H. , Yamanaka , T. , Masuda , N. , Ohno , S. , Nakamura , S. , Nakayama , T. , Kashiwaba , M. , Kamigaki , S. , & Kuroi , K . ( 2010 ). Clinical significance of the 21-gene signature (Oncotype DX) in hormone receptor-positive early stage primary breast cancer in the Japanese population . Cancer , 116 ( 13 ), 3112 – 3118 . doi: 10.1002/cncr.25206 OpenUrl CrossRef PubMed ↵ Van Calster , B. , Steyerberg , E. W. , Wynants , L. , & van Smeden , M. ( 2023 ). There is no such thing as a validated prediction model . BMC Medicine , 21 ( 1 ), 70 . doi: 10.1186/s12916-023-02779-w OpenUrl CrossRef PubMed ↵ van Maaren , M. C. , van Steenbeek , C. D. , Pharoah , P. D. P. , Witteveen , A. , Sonke , G. S. , Strobbe , L. J. A. , Poortmans , P. M. P. , & Siesling , S. ( 2017 ). Validation of the online prediction tool PREDICT v. 2.0 in the Dutch breast cancer population . European Journal of Cancer , 86 , 364 – 372 . doi: 10.1016/j.ejca.2017.09.031 OpenUrl CrossRef PubMed ↵ Vijver , M. J. v. d. , He , Y. D. , Veer , L. J. v. t. , Dai , H. , Hart , A. A. M. , Voskuil , D. W. , Schreiber , G. J. , Peterse , J. L. , Roberts , C. , Marton , M. J. , Parrish , M. , Atsma , D. , Witteveen , A. , Glas , A. , Delahaye , L. , Velde , T. v. d. , Bartelink , H. , Rodenhuis , S. , Rutgers , E. T. , . . . Bernards , R. ( 2002 ). A Gene-Expression Signature as a Predictor of Survival in Breast Cancer . New England Journal of Medicine , 347 ( 25 ), 1999 – 2009 . doi: 10.1056/NEJMoa021967 OpenUrl CrossRef PubMed Web of Science ↵ Wallden , B. , Storhoff , J. , Nielsen , T. , Dowidar , N. , Schaper , C. , Ferree , S. , Liu , S. , Leung , S. , Geiss , G. , Snider , J. , Vickery , T. , Davies , S. R. , Mardis , E. R. , Gnant , M. , Sestak , I. , Ellis , M. J. , Perou , C. M. , Bernard , P. S. , & Parker , J. S . ( 2015 ). Development and verification of the PAM50-based Prosigna breast cancer gene signature assay . BMC Medical Genomics , 8 ( 1 ), 54 . doi: 10.1186/s12920-015-0129-6 OpenUrl CrossRef PubMed ↵ Wang , Y. , Broeks , A. , Giardiello , D. , Hauptmann , M. , Jóźwiak , K. , Koop , E. A. , Opdam , M. , Siesling , S. , Sonke , G. S. , Stathonikos , N. , ter Hoeve , N. D. , van der Wall , E. , van Deurzen , C. H. M. , van Diest , P. J. , Voogd , A. C. , Vreuls , W. , Linn , S. C. , Dackus , G. M. H. E. , & Schmidt , M. K. ( 2023 ). External validation and clinical utility assessment of PREDICT breast cancer prognostic model in young, systemic treatment-naïve women with node-negative breast cancer . European Journal of Cancer , 195 . doi: 10.1016/j.ejca.2023.113401 OpenUrl CrossRef ↵ Wishart , G. C. , Azzato , E. M. , Greenberg , D. C. , Rashbass , J. , Kearins , O. , Lawrence , G. , Caldas , C. , & Pharoah , P. D. P . ( 2010 ). PREDICT: a new UK prognostic model that predicts survival following surgery for invasive breast cancer . Breast Cancer Research , 12 ( 1 ), R1 . doi: 10.1186/bcr2464 OpenUrl CrossRef PubMed ↵ Wong , H.-S. , Subramaniam , S. , Alias , Z. , Taib , N. A. , Ho , G.-F. , Ng , C.-H. , Yip , C.-H. , Verkooijen , H. M. , Hartman , M. , & Bhoo-Pathy , N . ( 2015 ). The Predictive Accuracy of PREDICT: A Personalized Decision-Making Tool for Southeast Asian Women With Breast Cancer . Medicine , 94 ( 8 ), e593 . doi: 10.1097/md.0000000000000593 OpenUrl CrossRef PubMed ↵ Wu , T. , Hu , E. , Xu , S. , Chen , M. , Guo , P. , Dai , Z. , Feng , T. , Zhou , L. , Tang , W. , Zhan , L. , Fu , X. , Liu , S. , Bo , X. , & Yu , G . ( 2021 ). clusterProfiler 4.0: A universal enrichment tool for interpreting omics data . The Innovation , 2 ( 3 ). doi: 10.1016/j.xinn.2021.100141 OpenUrl CrossRef PubMed ↵ Yap , Y.-S. , Lu , Y.-S. , Tamura , K. , Lee , J. E. , Ko , E. Y. , Park , Y. H. , Cao , A.-Y. , Lin , C.-H. , Toi , M. , Wu , J. , & Lee , S.-C . ( 2019 ). Insights Into Breast Cancer in the East vs the West: A Review . JAMA Oncology , 5 ( 10 ), 1489 – 1496 . doi: 10.1001/jamaoncol.2019.0620 OpenUrl CrossRef PubMed ↵ Zaguirre , K. , Kai , M. , Kubo , M. , Yamada , M. , Kurata , K. , Kawaji , H. , Kaneshiro , K. , Harada , Y. , Hayashi , S. , Shimazaki , A. , Morisaki , T. , Mori , H. , Oda , Y. , Chen , S. , Moriyama , T. , Shimizu , S. , & Nakamura , M . ( 2021 ). Validity of the prognostication tool PREDICT version 2.2 in Japanese breast cancer patients . Cancer Medicine , 10 ( 5 ), 1605 – 1613 . doi: 10.1002/cam4.3713 OpenUrl CrossRef PubMed ↵ Zhu , Z.-H. , Sun , B.-Y. , Ma , Y. , Shao , J.-Y. , Long , H. , Zhang , X. , Fu , J.-H. , Zhang , L.-J. , Su , X.-D. , Wu , Q.-L. , Ling , P. , Chen , M. , Xie , Z.-M. , Hu , Y. , & Rong , T.-H . ( 2009 ). Three Immunomarker Support Vector Machines–Based Prognostic Classifiers for Stage IB Non–Small-Cell Lung Cancer . Journal of Clinical Oncology , 27 ( 7 ), 1091 – 1099 . doi: 10.1200/jco.2008.16.6991 OpenUrl Abstract / FREE Full Text View the discussion thread. Back to top Previous Next Posted September 02, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following MyGESig: a population-specific gene signature improves survival prediction in Malaysian breast cancer patients Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share MyGESig: a population-specific gene signature improves survival prediction in Malaysian breast cancer patients Mohamad Hazwan Khairi , Zhi Lei Wong , Boon Hong Ang , James Phipps-Tan , Putri Nur Fatin , Rajadurai Pathmanathan , See Mee Hoong , Nur Aishah Mohd Taib , Cheng-Har Yip , Weang Kee Ho , Mei Chee Tai , Soo-Hwang Teo , Sok Ching Cheong , Pan Jia-Wern medRxiv 2025.08.28.25334111; doi: https://doi.org/10.1101/2025.08.28.25334111 Share This Article: Copy Citation Tools MyGESig: a population-specific gene signature improves survival prediction in Malaysian breast cancer patients Mohamad Hazwan Khairi , Zhi Lei Wong , Boon Hong Ang , James Phipps-Tan , Putri Nur Fatin , Rajadurai Pathmanathan , See Mee Hoong , Nur Aishah Mohd Taib , Cheng-Har Yip , Weang Kee Ho , Mei Chee Tai , Soo-Hwang Teo , Sok Ching Cheong , Pan Jia-Wern medRxiv 2025.08.28.25334111; doi: https://doi.org/10.1101/2025.08.28.25334111 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (570) Allergy and Immunology (863) Anesthesia (301) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1511) Epidemiology (15231) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6610) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15924) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6608) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3338) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary Care Research (712) Psychiatry and Clinical Psychology (5450) Public and Global Health (9240) Radiology and Imaging (2203) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a0208840d9630db4',t:'MTc3OTgzNzg1NA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00