Development and Validation of OncoOrigin: An Integrative AI Tool for Primary Cancer Site Prediction

doi:10.1101/2024.11.29.24318189

Development and Validation of OncoOrigin: An Integrative AI Tool for Primary Cancer Site Prediction

2024 · doi:10.1101/2024.11.29.24318189

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 41,035 characters · extracted from preprint-html · click to expand

Development and Validation of OncoOrigin: An Integrative AI Tool for Primary Cancer Site Prediction | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Development and Validation of OncoOrigin: An Integrative AI Tool for Primary Cancer Site Prediction Petar Brlek , Luka Bulić , Nidhi Shah , Parth Shah , Dragan Primorac doi: https://doi.org/10.1101/2024.11.29.24318189 Petar Brlek 1 St. Catherine Specialty Hospital , Zagreb, Croatia 2 School of Medicine, Josip Juraj Strossmayer University of Osijek , Osijek, Croatia MD, PhD candidate Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: petar.brlek{at}svkatarina.hr Luka Bulić 1 St. Catherine Specialty Hospital , Zagreb, Croatia MS-6 Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nidhi Shah 3 Dartmouth Health , Lebanon, NH, USA MD, FACMG Find this author on Google Scholar Find this author on PubMed Search for this author on this site Parth Shah 3 Dartmouth Health , Lebanon, NH, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dragan Primorac 1 St. Catherine Specialty Hospital , Zagreb, Croatia 2 School of Medicine, Josip Juraj Strossmayer University of Osijek , Osijek, Croatia 4 Eberly College of Science, The Pennsylvania State University, State College , PA, USA 5 School of Medicine, University of Split , Split, Croatia 6 The Henry C. Lee College of Criminal Justice and Forensic Sciences, University of New Haven , New Haven, CT, USA 7 Regiomed Kliniken , Coburg, Germany 8 School of Medicine, University of Rijeka , Rijeka, Croatia 9 Faculty of Dental Medicine and Health, Josip Juraj Strossmayer University of Osijek , Osijek, Croatia 10 School of Medicine, University of Mostar , Mostar, Bosnia and Herzegovina 11 National Forensic Sciences University , Gandhinagar, India MD, PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF ABSTRACT Importance Cancers of unknown primary origin (CUPs) represent a significant diagnostic and therapeutic challenge in the field of oncology. With the limitations of current diagnostic tools in these cases, novel approaches must be brought forward to improve treatment outcomes for these patients. Objective The objective of this study was to develop a machine-learning-based software for primary cancer site identification (OncoOrigin), based on genetic data acquired from tumor DNA sequencing. Design By design, this was an in silico diagnostic study. Setting This study was conducted using data from the cBioPortal database (accessed on 21 September 2024) and several data processing and machine-learning Python libraries. Participants This study involved over 20,000 tumor samples with information on patient age, sex, and the presence of genetic variants in over 600 genes. Main Outcomes and Measures The main outcome of interest in this study was machine-learning-based discrimination between cancer type classes, based on the provided data. Model quality was assessed by train set cross-validation and evaluation on a segregated test set. Finally, the optimal model was incorporated with a graphical user interface into the OncoOrigin software. Feature importances for class discrimination were also determined on the optimal model. Results Out of the four tested machine-learning estimators, the XGBoostClassifier-based model proved superior on test set evaluation, with a top-2 accuracy of 0.91 and ROC-AUC of 0.97. Class sensitivity values for prostate cancer, breast cancer, melanoma, and colorectal cancer were over 0.85, while all class specificity values were equal to or higher than 0.95. The top 3 significant features were patient sex, and genetic alterations in the APC and KRAS genes. Conclusions and Relevance In this study, we have successfully developed a machine-learning-based software for primary cancer site identification with high-quality evaluation metrics. Through simple clinical implementation, such a tool has the potential to significantly improve the diagnostics and treatment outcomes for patients suffering from CUP. Question Can a machine-learning-based tool (OncoOrigin) accurately predict the primary cancer site from tumor DNA sequencing data in patients with cancers of unknown primary origin (CUP)? Findings In this in silico study, the XGBoostClassifier model demonstrated high predictive performance, achieving a top-2 accuracy of 0.91 and ROC-AUC of 0.97 for the classification of ten cancer types using genetic data from over 20,000 tumor samples. Meaning The OncoOrigin software shows potential for improving the diagnosis and treatment of CUP by providing accurate primary cancer site identification based on genetic data. INTRODUCTION Cancers of unknown primary origin (CUPs) make up 5% of all malignant conditions worldwide. They are characterized by their aggressiveness, fast progression, and poor prognosis with alarmingly high mortality rates. Up to 84% of patients have fatal outcomes within the first year after diagnosis and represent some of the most diagnostically and therapeutically challenging cases in the field of oncology. A lack of information regarding the primary origin limits treatment possibilities as oncological protocols are largely based on the primary tumor site. Traditional diagnostic methods, such as immunohistochemistry using specific markers, have limited diagnostic power, especially in poorly differentiated tumors, and are often not sufficient for primary site verification [ 1 , 2 ]. Tumor molecular profiling with novel next-generation sequencing (NGS) methods, such as whole genome sequencing (WGS), has opened up the possibility of detecting all potential germline and somatic genetic alterations in tumor tissue. This enables more precise molecular classification of tumors and their personalized treatment [ 3 ]. Scientific efforts have been made to use genetic profiling for precise molecular classification of tumors. One such example is using single-cell transcriptomics and spatial proteomics with the purpose of improving treatment outcomes [ 4 ]. In recent years, machine learning has become a robust tool in biomedical research, due to its ability to efficiently process large quantities of data [ 5 ]. Certain studies have endeavored to combine genetic testing and machine learning with the purpose of primary site identification [ 6 ]. While promising results have been demonstrated, further research and validation are required prior to the implementation of such technology into clinical practice. The aim of our study was to train a primary cancer site predictor by combining genetic data from over 20,000 tumor samples and different machine-learning algorithms. Following evaluation, the optimal model would be fitted into a simple software application which would allow for easy practical implementation. Such a tool could offer a new and robust diagnostic option for CUP cases and potentially significantly improve treatment outcomes for these patients, and in the future, enable the detection of the primary tumor site based on the analysis of liquid biopsies without the need for invasive and harmful medical procedures. METHODOLOGY Study design By design, this was an in silico study that involved the curation, processing, and utilization of publicly available data to develop a new machine-learning model ( Figure 1 ). All of the data and code used in this study are publicly available (stated in the Data sharing statement) and the study design was approved by an institutional ethics committee (stated in the Ethical statement). Download figure Open in new tab Figure 1. Data curation and model development flow chart (created with Biorender.com ). Data curation The data were extracted from cBioPortal ( https://www.cbioportal.org/ , accessed on 21 September 2024), a comprehensive database for cancer genomics [ 7 , 8 , 9 ]. cBioPortal contains clinical and genomic data from the AACR Project GENIE® (Genomic Evidence Neoplasia Information Exchange). AACR Project GENIE ( https://genie.cbioportal.org/ , accessed on 21 September 2024) is a publicly accessible registry of real-world genomic data created through collaboration between 19 leading international cancer centers [ 10 ]. With approximately 287,000 sequenced samples from over 191,000 patients, it is one of the largest publicly available cancer genomic datasets, offering a vast resource for understanding the molecular underpinnings of various cancers. From this extensive dataset, 21,016 samples were selected through a rigorous filtering process for use in machine learning model development. The filtering process involved key criteria to refine the dataset. Only one sample per patient was included to avoid multiple sequencing events for the same metastatic tumor. The selection included 62.3% female (13,085 samples) and 37.7% male (7,931 samples) patients. The patient’s age at the time of sequencing was also utilized from the dataset. The virtual study created included 19 different subtypes of metastatic tumors. Data for 665 genes were extracted for each sample, selected according to previously published validated panels as part of the implementation of whole-exome sequencing for tumor tissue analysis [ 11 ]. For the processing and utilization of data on single nucleotide variants (SNVs), copy number variants (CNVs), and structural variants to develop machine learning models, a binary system was employed. The presence of an SNV, CNV, or structural gene alteration was coded as 1, while the absence of these changes was coded as 0. Data preprocessing Initial preprocessing included elimination of samples with unknown patient age (classified as “Unknown”, “89”) and samples with Not-a-Number (NaN) values. Furthermore, elimination of gene features with no detected variants across all samples was done. Additionally, the “Hepatobiliary Cancer” and “Pancreatic Cancer” classes were merged into the “Hepatobiliary/Pancreatic Cancer” class and the “Small Cell Lung Cancer” and “Non-Small Cell Lung Cancer” classes were merged into the “Lung Cancer” class. The output vector consisted of ten classes, “Colorectal Cancer”, “Ovarian Cancer”, “Melanoma”, “Breast Cancer”, “Lung Cancer”, “Thyroid Cancer”, “Prostate Cancer”, “Hepatobiliary/Pancreatic Cancer”, “Bladder Cancer”, and “Endometrial Cancer”. Model development Four robust ensemble/gradient boosting machine-learning algorithms, based on decision trees, were used for the prediction of cancer type – RandomForestClassifier (RFC), XGBoostClassifier (XGBC), CatBoostClassifier (CBC), and ExtraTreesClassifier (ETC). The preprocessed database was initially split into train and test sets in a 90:10 size ratio, using stratification with respect to the predicted class. Feature selection among the gene features was done on the train set, using the Boruta algorithm with an optimized XGBC estimator. Secondly, hyperparameter optimization based on the train set was done for each of the four algorithms, using the Optuna library. Finally, the models were initialized with optimal hyperparameters and fitted with the filtered train set. After the evaluation stage, the best estimator was chosen and fitted with the entire preprocessed database. This model was used for the development of the OncoOrigin software and feature importance analysis. Model evaluation After fitting, each of the estimators was evaluated based on several metrics. All metrics were determined using the one-versus-rest setting for multiclass classification. The first step of evaluation was stratified 10-fold cross-validation on the train set, after feature selection and hyperparameter optimization. In this step, mean accuracy scores, mean weighted F1 scores, and mean weighted receiver operating characteristic area under curve (ROC-AUC) scores were determined from each of the ten folds. Additionally, 95% confidence intervals were determined for these metrics. The second step involved performance evaluation on the segregated test set. For this analysis, accuracy scores, top-2 accuracy scores, weighted F1 scores, weighted ROC-AUC scores, weighted precision-recall area under curve (PR-AUC) scores, and class-specific sensitivity and specificity values were determined. Top-2 accuracy represents accuracy but the prediction is considered correct if the true value is one of the two most probable predicted classes. Additionally, confusion matrices and class-specific PR and ROC curves were determined for each estimator based on test set evaluation. User interface development The final fitted XGBC model was exported and packaged into a software with a graphical user interface (GUI) - OncoOrigin (Supplemental figure S1). In the software, the user input consists of the patient’s age at time of sequencing, the patient’s sex, and genes in which genetic variants were discovered after tumor sequencing. The interface outputs the predicted class, as well as prediction confidence and probabilities for all other classes relative to the predicted class. The prediction confidence is determined by the relative probability p 2 of the second most probable class ( p 2 ≥ 0.8 for very low, 0.8 > p 2 ≥ 0.6 for low, 0.6 > p 2 ≥ 0.4 for moderate, 0.4 > p 2 ≥ 0.2 for high, and p 2 < 0.2 for very high). Programming implementation All steps involving machine learning model training and evaluation, as well as GUI development, were done in the Python/Jupyter Notebook programming language (v3.12.3) [ 12 ]. Several Python libraries were used, including Pandas (v2.2.2), NumPy (v1.26.0), MatPlotLib (v3.8.0), Seaborn (v0.13.2), SciPy (v1.14.0), Scikit-Learn (v1.5.1), XGBoost (v2.1.0), CatBoost (v1.2.5), Boruta (v0.3), and Optuna (v3.6.1) [ 13 – 22 ]. Development of the OncoOrigin GUI was done using the Tkinter (v8.6.13) library [ 23 ]. Random state variables throughout the code were defined by a chosen value. Still, due to inherent non-determinism in some of the stated libraries, different runs of the code still did not produce the exact same results. However, noticed deviations in evaluation metric values due to non-determinism were negligible. RESULTS Data preprocessing After data extraction from the cBioPortal database had been completed, data on sex, age at time of sequencing, and genetic variant presence for 665 genes across 21,016 patient tumor samples were selected (Supplemental table S2). Elimination of uniformly distributed genes produced the database for the modeling pipeline, containing the features of sex, age, and genetic variant presence for 566 genes and 20710 samples. After feature selection, 345 genetic features remained in the database, along with the sex and age features (Supplemental table S3). Class frequencies in the output vector were 0.14, 0.09, 0.08, 0.22, 0.22, 0.02, 0.07, 0.11, 0.02, and 0.02 for colorectal cancer, ovarian cancer, melanoma, breast cancer, lung cancer, thyroid cancer, prostate cancer, hepatobiliary/pancreatic cancer, bladder cancer, and endometrial cancer respectively. Model evaluation After feature selection, hyperparameter optimization, and model fitting were complete, evaluation metrics were determined for each of the four estimators. This included metrics received from train set cross-validation and evaluation based on the test set ( Table 1 ). The results demonstrated the XGBC model as the superior model in both train set cross-validation and test set evaluation. The CBC model showed only slightly poorer performance metrics, while the ETC and RFC showed significantly poorer performance metrics. Interestingly, the ETC model, while poorer in overall quality, showed the highest sensitivity values for ovarian cancer, thyroid cancer, bladder cancer, and endometrial lower cancer. View this table: View inline View popup Download powerpoint Table 1. Evaluation metrics for each estimator Class-specific evaluation and feature analysis Confusion matrices were determined for each of the models to better evaluate how each class impacted the overall predictive power ( Figure 2 ). The RFC confusion matrix showed that the model predicted almost all samples as either breast cancer or lung cancer. The ETC confusion matrix showed a somewhat better result, with a higher ability to detect less represented classes, but a poor ability to detect the lung cancer class. Finally, the XGBC and CBC confusion matrices had a very similar layout. Hotspots were noticed in both fields pertaining to breast cancer and ovarian cancer, indicating a difficulty in differentiating between the two. ROC curves were in line with the model metrics ( Figure 3 ). In terms of class, the bladder cancer and lung cancer ROC curves visibly stood out with lower AUCs in all models, while the prostate cancer ROC curves were consistently superior across all models. A similar trend was noted in the precision-recall curves (Supplemental figure S4). The feature importance analysis revealed the top 20 features for primary site prediction ( Figure 4 ). The greatest importance score was calculated for the patient sex feature, followed by APC , and KRAS . Download figure Open in new tab Figure 2. Confusion matrices of the RFC model (a), XGBC model (b), CBC model (c), and ETC model (d). Download figure Open in new tab Figure 3. Class-specific ROC analysis curves for the RFC model (a), XGBC model (b), CBC model (c), and ETC model (d). Download figure Open in new tab Figure 4. Feature importance analysis of the top 20 features based on the optimal XGBC model. DISCUSSION Software quality evaluation and comparison After evaluation and interpretation of the produced results, the XGBoostClassifier was identified as the optimal estimator for OncoOrigin, with an accuracy of 0.81, top-2 accuracy of 0.91, F1-score of 0.81, PR-AUC score of 0.88, and ROC-AUC score of 0.97. As the software provides the user with not only the prediction but also insight into the likelihood of the other classes, clinicians will always be informed whether a second class is highly likely. As this can be clinically actionable information, we deemed the top-2 accuracy metric relevant for our model. When observing specific classes, sensitivity values ≥0.79 were found for breast cancer, lung cancer, colorectal cancer, prostate cancer, melanoma, and hepatobiliary/pancreatic cancer, while the other classes had lower sensitivity values. This trend generally corresponds to the quantitative representation of each class in the total population sample, determined by frequency analysis. Where class specificity values are concerned, all values were ≥0.95. The performance metrics of our model can be compared to other relevant literature for this context. In the paper published by Moon I et al., the authors also proposed a machine-learning model for primary cancer site detection [ 6 ]. While the source of data, number of samples used for training and testing, and feature definition and representation were different, the authors reported a weighted F1-score of 0.784 for classification based on 22 cancer types and an F1-score of 0.806 for classification based on 13 cancer types. Our OncoOrigin model measures up to these metrics with a weighted F1-score of 0.81 for classification based on 10 cancer types. Furthermore, while the authors used different feature selection and hyperparameter optimization methods, they too used the XGBoostClassifier algorithm as the classification estimator. In an article by Bicakci N, the effectiveness of the F-18 FDG PET-CT scan in CUP cases was evaluated [ 24 ]. The primary tumor was correctly determined in 41% of patients. The rest of the patients either did not have matching PET-CT or histopathology findings, or failed to have their primary tumor differentiated by both methods. Another study, published by Lu MY et al., constructed a deep-learning model to identify primary cancer sites based on histopathological image processing [ 25 ]. In their article, the authors reported their model achieved an accuracy of 0.834 when evaluated on the segregated test set. Once again, this is comparable with the 0.81 accuracy which OncoOrigin achieved on our segregated test set. When comparing our software with the literature, the main strength of OncoOrigin is its integration with a graphical user interface. Normally, interactions with developed machine-learning models require a background in AI programming and data science. However, for clinicians and other medical experts, this might be an impractical skillset to acquire and maintain [ 26 ]. For this reason, a simple user interface which communicates data between the model and the user greatly facilitates its potential for implementation in clinical practice. Software limitations While quality results have been achieved in the evaluation of OncoOrigin, the model does have several limitations. Firstly, the model has a total of 347 input features, the values of which should be known upon using the software to receive a prediction with an accuracy level corresponding to the one presented in this article. This entails the sequencing of 345 genes, which may be technically difficult, depending on the available technology for any given user. Secondly, due to the format of the extracted data, the model has been trained with age values ranging from 18 to 89. For this reason, the age input in the software has been limited to this range. Thirdly, while the model did use data combined from multiple studies, this data was still extracted from a single source which leaves potential for a certain level of data bias and risk of overfitting. Another limitation regarding the data itself is underrepresentation of certain cancer types, which likely negatively impacted the ability of the model to identify them. Finally, the model is limited to predicting primary sites among the ten cancer type classes included in the study. Practical applications of OncoOrigin in precision oncology The possibility of tumor sequencing and primary cancer site prediction in CUP cases holds the potential for significant treatment benefits in the context of targeted oncological therapy. Precision medicine has revolutionized oncological therapy by enabling the implementation of targeted treatments based on specific molecular biomarkers of tumors [ 27 ]. Modern molecular biology techniques, such as whole genome sequencing (WGS) and whole exome sequencing (WES) of tumor tissue, facilitate the determination of the molecular profile of tumors and the application of therapies that act on specific molecular targets within the tumor tissue [ 3 ]. A key tool in this approach is tumor molecular profiling, which involves the genetic analysis of tumor tissue embedded in paraffin and/or circulating tumor DNA (ctDNA) obtained from liquid biopsy samples. The analysis of ctDNA from a patient’s blood represents a significant advancement in oncology, especially in cases where traditional tissue biopsies are not feasible [ 28 ]. These advancements in oncological diagnostics form the basis for the introduction of molecular classification of neoplasms aimed at targeted treatment of secondary tumors of unknown primary origin. The further development of the OncoOrigin platform, in addition to the precise detection of the primary tumor site, opens up extensive possibilities for the molecular classification of secondary neoplasms based on results obtained through NGS methods. This advancement will pave the way for the diagnosis and precise treatment of CUP. Our planned further research will validate the in vivo data obtained from the genetic analysis of tumor tissue embedded in paraffin and ctDNA derived from liquid biopsy samples. Should the quality of OncoOrigin the model hold its standard when tested on data obtained from liquid biopsy, we open the possibility for early detection and treatment of metastatic cancers without relying on standard invasive methods such as surgery and PET-CT scans, which expose patients to additional ionizing radiation that can damage healthy cells [ 29 ]. Furthermore, in the case that more than one primary cancer site is likely, our software offers this information to the clinician. As multiple primary cancers can be managed with a combined regimen, this can in certain cases be actionable information [ 30 ]. Conclusion and future directions The results presented in this study present OncoOrigin as a robust tool for primary cancer site identification, with great potential application in CUP cases. While the software has certain limitations, it achieved excellent scores on the most frequent cancer types with significant metastatic potential. Furthermore, the software does not only offer the prediction of the model itself but information on the probability of each tumor type included. This provides clinicians with a greater amount of relevant and actionable information than models that offer the predicted class alone. As all the training and evaluation in this study were done in silico , the next phase of OncoOrigin validation will be multi-centric evaluation on patients from our population diagnosed with metastatic cancer with a known primary site. Finally, provided we achieve an equal standard of results, the third phase will involve prediction of the primary cancer site in CUP cases and an analysis of outcome benefit. Our end goal with OncoOrigin is its clinical implementation in precision oncology and a significant change in the clinical paradigm of CUP. Data Availability The data used in this study is publicly available as part of a virtual study in the cBioPortal system, available at the following link: https://genie.cbioportal.org/study?id=670d2529854f636a3863065d. The source code files used for data management, model training and evaluation, and user interface development are publicly available as part of an active GitHub repository, available at the following link: https://github.com/lbulic1003/OncoOrigin (Source code/v1.0.0). https://genie.cbioportal.org/study?id=670d2529854f636a3863065d https://github.com/lbulic1003/OncoOrigin Conflict of interest statement The authors declare they have no conflicts of interest. Data sharing statements The data used in this study is publicly available as part of a virtual study in the cBioPortal system, available at the following link: https://genie.cbioportal.org/study?id=670d2529854f636a3863065d . The source code files used for data management, model training and evaluation, and user interface development are publicly available as part of an active GitHub repository, available at the following link: https://github.com/lbulic1003/OncoOrigin (Source code/v1.0.0). Ethical statement This study was approved by the St. Catherine Specialty Hospital ethics committee (Sep 2024, 24/8-I) and was conducted in accordance with the Helsinki declaration. Funding No funding was received for conducting this study. Reporting guidelines This manuscript was written in accordance with the TRIPOD+AI checklist. Study registration and protocol No registration was done for this study, nor was a protocol prepared. SUPPLEMENT Supplemental figure S1. OncoOrigin graphical user interface with initial window (a), inputted patient age, sex, and genes with detected variants (b), bottom of the window with predict function (c), and prediction report with predicted site, confidence, and relative probabilities for other sites (d). Supplemental table S2. List of all genes included in data extraction from cBioPortal. Supplemental table S3. List of all genes included in modeling after preprocessing and feature selection. Supplemental figure S4. Class-specific precision-recall analysis curves for the RFC model (a), XGBC model (b), CBC model (c), and ETC model (d). Acknowledgements We thank the International Society for Applied Biological Sciences (ISABS) for their continued support. REFERENCES 1. ↵ Zolotykh MA , Mingazova LA , Filina YV , Blatt NL , Nesterova AI , Sabirov AG , et al. Cancer of unknown primary and the «seed and soil» hypothesis . Crit Rev Oncol Hematol . 2024 ; 196 : 104297 . OpenUrl PubMed 2. ↵ Cancer of unknown primary statistics . Cancer Research UK . 2015 [updated 03.07.2024., cited 09.09.2024.] Available at: https://www.cancerresearchuk.org/health-professional/cancer-statistics/statistics-by-cancer-type/cancer-of-unknown-primary 3. ↵ Brlek P , Bulić L , Bračić M , Projić P , Škaro V , Shah N , et al. Implementing Whole Genome Sequencing (WGS) in Clinical Practice: Advantages, Challenges, and Future Perspectives . Cells . 2024 ; 13 ( 6 ): 504 . OpenUrl 4. ↵ Shiao SL , Gouin KH 3rd . , Ing N , Ho A , Basho R , Shah A , Mebane RH , Zitser D , Martinez A , Mevises NY , Ben-Cheikh B , Henson R , Mita M , McAndrew P , Karlan S , Giuliano A , Chung A , Amersi F , Dang C , Richardson H , Shon W , Dadmanesh F , Burnison M , Mirhadi A , Zumsteg ZS , Choi R , Davis M , Lee J , Rollins D , Martin C , Khameneh NH , McArthur H , Knott SRV. Single-cell and spatial profiling identify three response trajectories to pembrolizumab and radiation therapy in triple negative breast cancer . Cancer Cell . 2024 ; 42 ( 1 ): 70 – 84.e8 . OpenUrl CrossRef PubMed 5. ↵ Ting Sim JZ , Fong QW , Huang W , Tan CH . Machine learning in medicine: what clinicians should know . Singapore Med J . 2023 ; 64 ( 2 ): 91 – 97 . OpenUrl PubMed 6. ↵ Moon I , LoPiccolo J , Baca SC , et al. Machine learning for genetics-based classification and treatment response prediction in cancer of unknown primary [published correction appears in Nat Med. 2024 Feb;30(2):607.] . Nat Med . 2023 ; 29 ( 8 ): 2057 – 2067 . OpenUrl PubMed 7. ↵ Cerami , et al. The cBio Cancer Genomics Portal: An Open Platform for Exploring Multidimensional Cancer Genomics Data . Cancer Discovery . 2012 ; 2 : 401 . OpenUrl Abstract / FREE Full Text 8. ↵ Gao , et al. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal . Sci Signal . 2013 ; 6 :p l1 . OpenUrl 9. ↵ Brlek P , Kafka A , Bukovac A , Pećina-Šlaus N . Integrative cBioPortal Analysis Revealed Molecular Mechanisms That Regulate EGFR-PI3K-AKT-mTOR Pathway in Diffuse Gliomas of the Brain . Cancers (Basel ). 2021 ; 13 ( 13 ): 3247 . OpenUrl PubMed 10. ↵ de Bruijn , et al. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal . Cancer Res . 2023 . 11. ↵ Shah PS , Hughes EG , Sukhadia SS , et al. Validation and Implementation of a Somatic-Only Tumor Exome for Routine Clinical Application . J Mol Diagn . 2024 ; 26 ( 9 ): 815 – 824 . OpenUrl PubMed 12. ↵ Van Rossum G , Drake FL . Python 3 Reference Manual. Scotts Valley , CA : CreateSpace ; 2009 . 13. ↵ McKinney W , et al. Data structures for statistical computing in python . In: Proceedings of the 9th Python in Science Conference . 2010 . p. 51 – 6 . 14. Harris CR , Millman KJ , van der Walt SJ , Gommers R , Virtanen P , Cournapeau D , et al. Array programming with NumPy . Nature . 2020 ; 585 : 357 – 362 . OpenUrl CrossRef PubMed 15. Virtanen P , Gommers R , Oliphant TE , Haberland M , Reddy T , Cournapeau D , et al. SciPy 1.0: Fundamental Algorithms for Scientific Computing in Python . Nature Methods . 2020 ; 17 : 261 – 72 . OpenUrl CrossRef PubMed 16. Pedregosa F , Varoquaux G , Gramfort A , Michel V , Thirion B . Scikit-learn: Machine Learning in Python . J Mach Learn Res . 2011 ; 12 : 2825 – 2830 . OpenUrl CrossRef PubMed 17. Hunter JD . Matplotlib: A 2D Graphics Environment . Comp Sci Eng . 2007 ; 9 ( 3 ): 90 – 95 . OpenUrl 18. Waskom ML. seaborn: statistical data visualization . J Open Source Softw . 2021 ; 6 ( 60 ): 3021 . OpenUrl CrossRef 19. Chen T , Guestrin C. XGBoost: A Scalable Tree Boosting System . In: Proceedings of the 22 nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining . 2016 ; New York, NY, SAD: ACM ; p. 785 – 794 . 20. Kursa MB , Rudnicki WR . Feature Selection with the Boruta Package . J Stat Soft . 2010 ; 36 ( 11 ): 1 – 13 . OpenUrl 21. Dorogush AV , Ershov V , Gulin A. CatBoost: gradient boosting with categorical features support . arXiv [Preprint] 2018 . [cited 10.09.2024.]: 1810.11363v1 . 22. ↵ Akiba T , Sano S , Yanase T , Ohta T , Koyama M. Optuna: A Next-generation Hyperparameter Optimization Framework . arXiv [Preprint] 2019 . [cited 10.09.2024.]: 1907.10902v1 . 23. ↵ Lundh F. An introduction to tkinter . 1999 . Available at: www.pythonware.com/library/tkinter/introduction/index.htm . 24. ↵ Bicakci N . Diagnostic and prognostic value of F-18 FDG PET/CT in patients with carcinoma of unknown primary . North Clin Istanb . 2022 ; 9 ( 4 ): 337 – 346 . OpenUrl PubMed 25. ↵ Lu MY , Chen TY , Williamson DFK , et al. AI-based pathology predicts origins for cancers of unknown primary . Nature . 2021 ; 594 ( 7861 ): 106 – 110 . OpenUrl PubMed 26. ↵ Bastian G , Baker GH , Limon A . Bridging the divide between data scientists and clinicians . Intelligence-Based Medicine . 2022 ; 6 : 100066 . OpenUrl 27. ↵ Akhoundova D , Rubin MA . Clinical application of advanced multi-omics tumor profiling: Shaping precision oncology of the future . Cancer Cell . 2022 ; 40 ( 9 ): 920 – 938 . OpenUrl CrossRef PubMed 28. ↵ Li W , Liu JB , Hou LK , et al. Liquid biopsy in lung cancer: significance in diagnostics, prediction, and treatment monitoring . Mol Cancer . 2022 ; 21 ( 1 ): 25 . OpenUrl CrossRef PubMed 29. ↵ Nikanjam M , Kato S , Kurzrock R . Liquid biopsy: current technology and clinical applications . J Hematol Oncol . 2022 ; 15 ( 1 ): 131 . OpenUrl CrossRef PubMed 30. ↵ Chen X , Fang Y , Yan W-P . Considerations for multidisciplinary management of synchronous primary breast cancer and primary lung cancer – Analysis of thirty-one patients . Thorac Cancer . 2024 ; 15 ( 14 ): 1132 – 1137 . OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted December 01, 2024. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Development and Validation of OncoOrigin: An Integrative AI Tool for Primary Cancer Site Prediction Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Development and Validation of OncoOrigin: An Integrative AI Tool for Primary Cancer Site Prediction Petar Brlek , Luka Bulić , Nidhi Shah , Parth Shah , Dragan Primorac medRxiv 2024.11.29.24318189; doi: https://doi.org/10.1101/2024.11.29.24318189 Share This Article: Copy Citation Tools Development and Validation of OncoOrigin: An Integrative AI Tool for Primary Cancer Site Prediction Petar Brlek , Luka Bulić , Nidhi Shah , Parth Shah , Dragan Primorac medRxiv 2024.11.29.24318189; doi: https://doi.org/10.1101/2024.11.29.24318189 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (574) Allergy and Immunology (865) Anesthesia (304) Cardiovascular Medicine (4460) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (611) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15251) Forensic Medicine (31) Gastroenterology (1132) Genetic and Genomic Medicine (6621) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4564) Health Policy (1372) Health Systems and Quality Improvement (1617) Hematology (544) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15938) Intensive Care and Critical Care Medicine (1107) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6642) Nursing (346) Nutrition (1001) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3350) Ophthalmology (981) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1698) Pharmacology and Therapeutics (694) Primary Care Research (714) Psychiatry and Clinical Psychology (5464) Public and Global Health (9259) Radiology and Imaging (2212) Rehabilitation Medicine and Physical Therapy (1372) Respiratory Medicine (1198) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (533) Surgery (715) Toxicology (100) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a038ddd1cd7c8650',t:'MTc4MDA5MzAxNw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00