Full text
23,417 characters
· extracted from
preprint-html
· click to expand
FUSED: Cross-Domain Integration of Foundation Models for Cancer Drug Response Prediction | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results FUSED: Cross-Domain Integration of Foundation Models for Cancer Drug Response Prediction Till Rössner , Jonas Balke , Ming Tang doi: https://doi.org/10.1101/2025.09.30.679434 Till Rössner 1 L3S Research Center , Germany 2 Leibniz University Hannover , Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jonas Balke 1 L3S Research Center , Germany 2 Leibniz University Hannover , Germany Find this author on Google Scholar 
Find this author on PubMed Search for this author on this site Ming Tang 1 L3S Research Center , Germany 2 Leibniz University Hannover , Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: tang{at}l3s.de Abstract Full Text Info/History Metrics Preview PDF Abstract AI-driven methods for predicting drug responses hold promise for advancing personalized cancer therapy, but cancer heterogeneity and the high cost of data generation pose substantial challenges. Here we explore the transfer learning capability and introduce FUSED (Fusion of Foundation Model Embeddings for Drug Response Prediction), a novel architecture for cross-domain foundation model (FM) integration. By systematically benchmarking FMs across two domains – molecular FMs for drugs and single-cell FMs for cell lines – we demonstrate that integrating single-cell FMs substantially reduces the number of input features required for cell line representation. Among FMs, MolFormer significantly outperforms ChemBERTa, and scGPT surpasses scFoundation in predictive accuracy and training stability. Moreover, integrating single-cell FMs improves performance in both drug-known and leave-one-drug-out scenarios. These findings highlight the potential of cross-domain FM integration for more efficient and robust drug response prediction. 1 Introduction By leveraging large amounts of pre-training data, Foundation Models (FMs) can represent input in a context beyond the available dataset, making them effective feature extractors for various downstream tasks. In the biomedical domain, pre-trained FMs for medical images [ 1 , 2 ], drug molecules [ 3 , 4 ], single cells [ 5 , 6 ] and many other modalities are increasingly applied and systematically explored. Being able to predict drug response is crucial for the discovery and design of cancer treatments.
The inherently multi-modal nature of the task requires the integration of properties from both drug molecules and cancer cells. Previously published work, such as DeepCDR, GraphDRP, and XGDP, mainly explored various CNN- and GNN-based approaches for extracting and integrating embeddings [ 7 , 8 , 9 ]. In this study, we investigate the potential of FMs for Cancer Drug Response Prediction (CDRP) and present, to our knowledge, the first attempt to integrate FMs across distinct domains. We introduce FUSED (Fusion of Foundation Model Embeddings for Drug Response Prediction), and systematically benchmark four representative FMs – scGPT and scFoundation for transcriptomic profiling [ 5 , 6 ], and ChemBERTa and MolFormer [ 4 , 3 ] for molecular structure representation. Our work provides a principled path for leveraging cross-domain FMs in robust drug-response prediction. 2 Related Work Single-Cell Foundation Models scFMs are pre-trained on large corpora of single-cell gene expression data and have demonstrated the capability to represent “virtual cells” and enable diverse downstream tasks. Among the recent state-of-the-art scFMs, scGPT [ 5 ] is a generative pre-trained transformer model built on ~33 million cells, and can effectively distill critical biological insights and achieve superb performance across multiple applications. scFoundation [ 6 ], another transformer-based scFM, adopts an asymmetric architecture with a pretraining objective designed to capture complex gene relations across a wide range of cell types and states. Molecular Foundation Models Recent progress in molecular representation learning of chemical drugs has produced FMs that leverage different input modalities [ 10 ], including sequence-based models such as ChemBERTa and MolFormer [ 11 , 3 ], 2D graph-based models such as MolE and KANO [ 12 , 13 ], and 3D structure-based representations such as Uni-Mol and SubGDiff [ 14 ].
In this work, we focus on sequence-based models where drug molecules are represented as SMILES strings. ChemBERTa [ 11 ] is pre-trained on 77 million SMILES using masked language modeling over chemical tokens. MolFormer [ 3 ] extends this with rotary positional encoding and linear attention. Drug Response Prediction A classical measure of cancer drug response is to estimate the drug efficacy by predicting the half-maximal inhibitory concentration (IC 50 ) for drug–cell line pairs [ 15 ]. Most prior work employs hybrid neural network architectures where drug and cell features are encoded separately before their embeddings are integrated. While GraphDRP [ 8 ] systematically evaluates different graph neural network (GNN) architectures to encode drug structures, DeepCDR [ 7 ] focuses on integrating multi-omics information for cell line representation. More recently, XGDP [ 9 ] replaces simple embedding concatenation with a multi-head attention mechanism, yielding more expressive feature interactions than earlier CNN-based fusion. 3 Approach We formulate cancer drug response (CDR) prediction as a regression task, where the model learns a mapping $f : (D, C) \mapsto \mathrm{IC}_{50}$, with $D$ representing drug features and $C$ denoting cell features. Drug features are derived from SMILES representations, while $C$ is decomposed into three distinct modalities: transcriptomic, methylation, and mutation profiles. The core idea is to generate embeddings for each of these four input types and integrate them to predict IC 50 values. Our study proceeds in two steps: (1) baseline extension – incorporating FMs into the existing DeepCDR framework to generate drug and cell line embeddings; (2) novel integration architecture – developing FUSED (Fusion of Foundation Model Embeddings for Drug Response Prediction), which leverages multi-head attention to integrate molecular and single-cell embeddings generated from domain-specific FMs ( Fig. 1 ).
Download figure Open in new tab Figure 1: FUSED approach with FMs extracting drug features and transcriptomic features. a) We create four embeddings, one for each feature type. While transcriptomics features and drug features are transformed by an FM, embeddings for mutation and methylation features are trained from scratch. Each embedding is contextualized with respect to the other three embeddings using multi-head attention. All representations are projected to equal dimensions with a linear layer, then concatenated and passed to an MLP to estimate IC 50 . b) Multi-head attention mechanism: we create a Q-vector from the original embedding, and K- and V-vectors from all embeddings in order to compute attention scores with respect to the given Q-vector. Each multi-head attention block learns its own parameters to create Q-, K- and V-vectors. Datasets We use a combination of two common datasets: Cancer Cell Line Encyclopedia (CCLE) [ 16 ] and Genomics of Drug Sensitivity in Cancer (GDSC) [ 17 ]. CCLE stores multi-omics data (transcriptomics, methylation, mutation) for cancer cell lines. GDSC stores drug responses in the form of IC 50 values for cell line–drug pairs, matching cell lines from the CCLE database. In total, we have 561 cell lines paired with 223 drugs, leading to 107,446 drug–cell line pairs after filtering out missing combinations. Architecture Development Step 1: Baseline Extension Our baseline DeepCDR framework contains a graph convolutional network (GCN) and three subnetworks for drug structure and cancer cell profiles (genomic mutation, transcriptomics and methylation), respectively. In our extension, we replace either the GCN-based drug encoder with the molecular FM, or the transcriptomic subnetwork with an scFM. For the single-cell FM, we prepend it to the transcriptomic-specific subnetwork, while the molecular FM replaces the GCN architecture. Step 2: FUSED We develop a novel architecture, FUSED, designed to enhance the integration of multiple FMs.
Rather than relying solely on concatenation, FUSED employs multi-head attention to contextualize and align embeddings from different modalities before regression ( Fig. 1 ). To optimize hyperparameters (number of heads, number of MLP layers, learning rate, and embedding dimension), we performed a grid search in preliminary experiments. Experimental Configurations We used ChemBERTa or MolFormer as FMs to extract drug embeddings, and scGPT or scFoundation to extract transcriptomic embeddings. In all settings, FM parameters were kept frozen to assess zero-shot capabilities. All experiments were conducted on an NVIDIA GTX 1080Ti GPU. The training time for the combined multi-omics and GCN model was ~150 minutes (100 epochs). When the GCN component was removed and frozen molecular FMs were used instead, the training time was reduced to ~70 minutes. FUSED required ~10 minutes more time than CNNs. As primary evaluation metrics, we employed the Pearson Correlation Coefficient (PCC, the higher the better) and the Root Mean Squared Error (RMSE, the smaller the better). We explored two ways of splitting training/test data. In the drug-known split, all drugs and cell lines were present in the training set, and the results were averaged across three runs with different seeds to avoid bias in the test set. In the leave-one-drug-out split, all samples associated with a given drug were placed exclusively in the test set, ensuring that the model had no prior exposure to that drug during training. This process was repeated for each of the 223 drugs, and the results were averaged across all iterations. 4 Results and Discussion Drug-Known Split As shown in Table 1 , when all drugs and cell lines appear in the training set, we observe the following trends from both PCC and RMSE: View this table: View inline View popup Download powerpoint Table 1: Performance comparison in the drug-known split. The top 3 models in each column are highlighted. Multi-omics adds modest gains .
Incorporating multi-omics features yields improvements, particularly for models that are weaker with transcriptomics alone. This effect narrows the performance gap between good and poor models; therefore, most of our following comparisons will focus on the transcriptomics-only results. scFM helps in the transcriptomics-only setting . Both scGPT and scFoundation improve over the baseline, with scGPT consistently outperforming scFoundation in all experiments. Molecular FMs differ . ChemBERTa underperforms the GCN baseline, whereas MolFormer provides a slight improvement. FUSED is the best . Our FUSED approach achieves the best performance in both the transcriptomics-only and multi-omics settings. These findings underscore the value of FMs when comprehensive datasets (e.g., multi-omics) are unavailable. By leveraging knowledge acquired during pre-training, an FM-based approach can compensate for missing modalities. At the same time, FM choice matters: not all models confer benefits (e.g., ChemBERTa), and scGPT is more effective than scFoundation in our setting. Importantly, our FUSED approach demonstrates that integrating molecular and transcriptomic representations through an appropriate fusion mechanism can yield further performance gains, highlighting the promise of cross-domain FM integration for cancer drug response prediction. Leave-One-Drug-Out In the challenging drug-blind setting ( Table 2 ), the baseline without an scFM performs extremely poorly when only transcriptomic features are used: PCC = 0.076 with GCN and 0.107 with MolFormer as the drug encoder. Interestingly, this weakness can be mitigated by adding other omics or using an scFM, which raises PCC into the 0.41–0.45 range and reduces RMSE. Our FUSED approach does not yield additional gains in this setting, indicating the possibility of overfitting to the training drugs.
View this table: View inline View popup Download powerpoint Table 2: Leave-one-drug-out performance comparison of the different approaches. Unlike in the drug-known setting, PCC and RMSE are not always aligned, and standard deviations are relatively large because the summary statistics are averaged over 223 held-out drugs. This variance indicates that performance depends strongly on which drug is withheld, suggesting that drug-specific properties substantially affect model generalization. Overall, the weak zero-shot generalization observed in the drug-blind setting highlights a key limitation of current AI models for drug response prediction and motivates future work on stronger drug encoders, regularization, and augmentation strategies. Training Stability We examine the validation PCC over training epochs in the drug-known setting ( Figure 2 ). In the transcriptomics-only condition ( Figure 2a ), scFoundation exhibits pronounced instability—large oscillations and occasional collapses—across runs, whereas scGPT converges quickly and monotonically and the non-FM baseline improves only marginally. Adding further omics modalities ( Figure 2b ) markedly stabilizes training: the oscillations diminish and scFoundation’s final performance approaches that of scGPT. These results indicate that scGPT-derived embeddings are robust in low-feature transcriptomic settings, while multi-omics integration mitigates scFoundation’s instability and enables competitive accuracy. Download figure Open in new tab Figure 2: Representative learning curves of validation PCC in the drug-known setting with MolFormer as the drug encoder. a) transcriptomics only; b) multi-omics. 5 Conclusion We introduce FUSED, a compact fusion architecture that aligns molecular and single-cell foundation-model (FM) embeddings, and provide the first systematic cross-domain FM benchmark for cancer drug-response prediction.
Across settings, scGPT and MolFormer emerge as the strongest single-modality encoders, and FUSED attains the best accuracy in drug-known splits while reducing reliance on extensive multi-omics features. Limitations include weak generalization to unseen drugs, instability for some scFMs in transcriptomics-only training, and evaluation restricted to frozen encoders on CCLE/GDSC cell-line data. Future work could focus on stronger drug encoders, fusion schemes explicitly regularized for zero-shot drugs, and validation beyond cell lines (e.g. patient-derived models). Footnotes till.roessner{at}stud.uni-hannover.de , jonas.balke{at}stud.uni-hannover.de , tang{at}l3s.de References [1]. ↵ Richard J. Chen , Tong Ding , Ming Y. Lu , et al. Towards a general-purpose foundation model for computational pathology . Nature Medicine , 30 : 850 – 862 , 2024 . OpenUrl CrossRef PubMed [2]. ↵ Ming Y. Lu , Bowen Chen , Drew F. K. Williamson , et al. A visual-language foundation model for computational pathology . Nature Medicine , 30 : 863 – 874 , 2024 . OpenUrl CrossRef PubMed [3]. ↵ Jerret Ross , Brian Belgodere , Vijil Chenthamarakshan , et al. Large-scale chemical language representations capture molecular structure and properties . Nature Machine Intelligence , 4 : 1256 – 1264 , 2022 . OpenUrl [4]. ↵ Walid Ahmad , Elana Simon , Seyone Chithrananda , Gabriel Grand , and Bharath Ramsundar . Chemberta-2: Towards chemical foundation models , 2022 . [5]. ↵ Haotian Cui , Chloe Wang , Hassaan Maan , et al. scgpt: toward building a foundation model for single-cell multi-omics using generative ai . Nature Methods , 21 : 1470 – 1480 , 2024 . OpenUrl PubMed [6]. ↵ Minsheng Hao , Jing Gong , Xin Zeng , et al. Large-scale foundation model on single-cell transcriptomics . Nature Methods , 21 : 1481 – 1491 , 2024 . OpenUrl PubMed [7]. ↵ Qiao Liu , Zhiqiang Hu , Rui Jiang , and Mu Zhou . Deepcdr: a hybrid graph convolutional network for predicting cancer drug response . 
Bioinformatics , 36 ( Supplement_2 ): i911 – i918 , 2020 . OpenUrl CrossRef PubMed [8]. ↵ Tuan Nguyen , Giang T. T. Nguyen , Thin Nguyen , and Duc-Hau Le . Graph convolutional networks for drug response prediction . IEEE/ACM Transactions on Computational Biology and Bioinformatics , 19 ( 1 ): 146 – 154 , 2022 . OpenUrl [9]. ↵ Conghao Wang , Gaurav A. Kumar , and Jagath C. Rajapakse . Drug discovery and mechanism prediction with explainable graph neural networks . Scientific Reports , 15 : 179 , 2025 . [10]. ↵ Jianbo Qiao , Junru Jin , Ding Wang , et al. A self-conformation-aware pre-training framework for molecular property prediction with substructure interpretability . Nature Communications , 16 : 4382 , 2025 . [11]. ↵ Seyone Chithrananda , Gabriel Grand , and Bharath Ramsundar . Chemberta: Large-scale self-supervised pretraining for molecular property prediction , 2020 . [12]. ↵ Oscar Méndez-Lucio , Christos A. Nicolaou , and Berton Earnshaw . Mole: a foundation model for molecular graphs using disentangled attention . Nature Communications , 15 : 9431 , 2024 . [13]. ↵ Yin Fang , Qiang Zhang , Ningyu Zhang , et al. Knowledge graph-enhanced molecular contrastive learning with functional prompt . Nature Machine Intelligence , 5 : 542 – 553 , 2023 . OpenUrl [14]. ↵ Jiying Zhang , Zijing Liu , Yu Wang , and Yu Li . Subgdiff: A subgraph diffusion model to improve molecular representation learning , 2024 . [15]. ↵ Senem Aykul and Erik Martinez-Hackert . Determination of half-maximal inhibitory concentration using biosensor-based protein interaction analysis . Analytical Biochemistry , 508 : 97 – 103 , 2016 . OpenUrl CrossRef PubMed [16]. ↵ Jordi Barretina , Giordano Caponigro , Nicolas Stransky , et al. The cancer cell line encyclopedia enables predictive modelling of anticancer drug sensitivity . Nature , 483 : 603 – 607 , 2012 . OpenUrl CrossRef PubMed Web of Science [17]. ↵ Francesco Iorio , Theo A. Knijnenburg , Daniel J. Vis , et al. 
A landscape of pharmacogenomic interactions in cancer . Cell , 166 ( 3 ): 740 – 754 , 2016 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 01, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following FUSED: Cross-Domain Integration of Foundation Models for Cancer Drug Response Prediction Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share FUSED: Cross-Domain Integration of Foundation Models for Cancer Drug Response Prediction Till Rössner , Jonas Balke , Ming Tang bioRxiv 2025.09.30.679434; doi: https://doi.org/10.1101/2025.09.30.679434 Share This Article: Copy Citation Tools FUSED: Cross-Domain Integration of Foundation Models for Cancer Drug Response Prediction Till Rössner , Jonas Balke , Ming Tang bioRxiv 2025.09.30.679434; doi: https://doi.org/10.1101/2025.09.30.679434 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17697) Bioengineering (13894) Bioinformatics (41951) Biophysics (21455) Cancer Biology (18593) Cell Biology (25509) Clinical Trials (138) Developmental Biology (13380) Ecology (19903) Epidemiology (2067) Evolutionary Biology (24322) Genetics (15611) Genomics (22509) Immunology (17737) Microbiology (40398) Molecular Biology (17183) Neuroscience 
(88619) Paleontology (667) Pathology (2833) Pharmacology and Toxicology (4825) Physiology (7644) Plant Biology (15158) Scientific Communication and Education (2046) Synthetic Biology (4296) Systems Biology (9825) Zoology (2271)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.