A structure-informed evolutionary model for predicting viral immune escape and evolution

doi:10.1101/2025.07.31.667864

A structure-informed evolutionary model for predicting viral immune escape and evolution

2025 · doi:10.1101/2025.07.31.667864

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 92,943 characters · extracted from preprint-html · click to expand

A structure-informed evolutionary model for predicting viral immune escape and evolution | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results A structure-informed evolutionary model for predicting viral immune escape and evolution View ORCID Profile Chonghao Wang , View ORCID Profile Lu Zhang doi: https://doi.org/10.1101/2025.07.31.667864 Chonghao Wang 1 Department of Computer Science, Hong Kong Baptist University , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Chonghao Wang Lu Zhang 1 Department of Computer Science, Hong 
Kong Baptist University , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lu Zhang For correspondence: ericluzhang{at}hkbu.edu.hk Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Persistent emergence of viral variants capable of evading host immunity constitutes a significant threat to public health. This antigenic evolution frequently outpaces the development of vaccines and therapeutics, highlighting the necessity of predictive models surveilling the immune escape potential of emerging variants. However, existing models suffer from two key limitations: they inadequately incorporate protein structural information and neglect the importance of distinguishing large-impact mutations from neutral ones given multiple mutations. To address these gaps, we presented KEScape, a deep learning model designed to predict viral immune escape and evolution. KEScape integrates evolutionary context with protein structural information, introduces a novel top-K L 2 -differential pooling mechanism to prioritize mutations with large functional effects, and incorporates a supervised L 2 margin loss to facilitate the L 2 -distance-based ranking of high-impact mutations. We demonstrated that KEScape significantly outperformed state-of-the-art models on the most comprehensive benchmark to date, comprising eleven deep mutational scanning experiments spanning diverse viruses. Furthermore, KEScape exhibited outstanding performance in the practical applications of identifying immune escape hotspots and variants in emerging lineages and real-time surveillance of lineages associated with WHO-designated variants of SARS-CoV-2. These results show that KEScape is an effective model to predict viral immune escape and evolution. 
Its capacity for early warning can directly inform public health interventions and guide the development of countermeasures, thereby mitigating the threat of future viral pandemics. Introduction The co-evolutionary history of humans and viruses is marked by recurrent pandemics that have inflicted catastrophic mortality and shaped societal development [ 1 ]. For instance, the 1918 influenza pandemic, caused by the H1N1 influenza A virus, resulted in an estimated 50 million deaths worldwide [ 2 ]. More recently, severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) emerged as a global health crisis, precipitating a pandemic that has claimed millions of lives [ 3 , 4 ]. These historical precedents establish a clear and urgent mandate: to develop strategies that can mitigate the impact of future viral threats. To this end, the World Health Organization (WHO) introduced the concept of “Disease X” – a framework that conceptualizes the challenge of preparing for a pandemic caused by a yet-to-be-identified pathogen. This proactive stance emphasizes global surveillance, which focuses on threats with high pandemic potential, such as highly pathogenic avian influenza (H5N1) and other zoonotic agents that have demonstrated the capacity to cross the species barrier [ 5 – 7 ]. Given the broad consensus that future pandemics are inevitable, pandemic preparedness must shift from a primarily reactive paradigm to a proactive one. A robust preparedness framework should not only integrate systematic pathogen genome sequencing and rapid vaccine development, but also include a critical third pillar: the timely identification of emerging viral variants with immune escape potential through genomic surveillance. The capacity to recognize such variants at an early stage is essential for prioritizing experimental characterization, guiding vaccine and therapeutic updates, and enabling immediate public health interventions [ 8 – 13 ]. 
The persistence of viral threats is fundamentally shaped by the complex coevolutionary dynamics between viruses and their hosts. During the initial emergence of a novel virus in an immunologically naive population, viral spread is primarily driven by factors such as intrinsic transmissibility, host susceptibility, and ecological or epidemiological conditions. Once substantial population-level immunity has been established, for example through a large wave of infection, widespread vaccination, or both, the evolutionary landscape shifts. This newly established population-level immunity imposes intense selective pressure on the viral population. In turn, viruses continuously evolve, accumulating mutations that facilitate evasion of host immune surveillance – a phenomenon known as viral immune escape [ 14 – 20 ]. While advancements in immunology have yielded highly effective vaccines that bolster host immunity and save countless lives [ 21 – 24 ], the adaptive antigenic evolution of viruses poses an ongoing challenge. Specifically, viral mutations, particularly in key antigenic sites, can reduce or abrogate the binding affinity of vaccine-induced antibodies, thereby diminishing vaccine efficacy. This necessitates periodic reformulation of vaccines to match circulating viral lineages. It poses a critical challenge to accurately predict which viral variants will escape existing population immunity and achieve future prevalence. Current practices, such as the WHO’s influenza vaccine composition meetings, rely on expert consultation. However, mismatches between vaccine lineages and dominant circulating lineages have occurred, leading to suboptimal vaccine effectiveness in several seasons [ 25 – 27 ]. To better understand the landscape of potential escape mutations, experimental techniques like deep mutational scanning (DMS) have been developed [ 28 ]. 
DMS 2 allows for the high-throughput functional characterization of numerous viral mutants, assessing their impact on properties such as antibody binding or viral fitness. However, DMS experiments are resource-intensive and inherently limited in scope relative to the vast combinatorial space of possible mutations. For example, for a protein of length n , there are 19 × n possible single amino acid substitutions, and exponentially more multiple mutations. Computational approaches therefore provide a promising means of complementing experimental characterization. For example, DMS experiments performed during an initial wave of infection or following early vaccination campaigns can quantify the effects of mutations relative to the wild-type virus. These data can then be used to train computational models capable of predicting the immune escape potential of subsequently emerging variants. Several computational models have been developed to predict viral immune escape. For instance, Hie et al. [ 29 ] introduced CSCS, a model to predict viral escape by analyzing the differences between the wild-type and mutant sequences in the embedding spaces generated by a protein language model. The core premise of CSCS is that viable escape variants must maintain essential fitness while exhibiting sufficient divergence from the wild-type virus to evade immune surveillance. MLAEP [ 30 ] leverages a genetic algorithm to identify potential immune escape variants of SARS-CoV-2 by simulating natural selection. The genetic search is informed by a fitness landscape generated by integrating sequence features with structural information. The sequence feature is derived from the receptor-binding domain (RBD), which is encoded using ESM-2, a protein language model (PLM). The structural information, pertaining to ACE2 and antibody complexes, is processed by a structure-aware transformer. 
Another prominent model, EVEscape [ 31 ], operates on the assumption that an escape variant should preserve its fitness, while simultaneously disrupting the antibody binding interface on the viral protein surface. To quantify escape potential, EVEscape integrates three distinct metrics: fitness, accessibility, and dissimilarity. E2VD [ 32 ] utilizes a pre-trained PLM to extract residue-level embeddings, and employs a convolutional neural network and an attention block to capture the local and global dependencies of residues. E2VD proposes a multi-task framework to address the challenge of imbalanced data between beneficial and deleterious mutations, which consists of three tasks: receptor-binding, expression, and immune escape. While these models have advanced the field of viral immune escape prediction, they either omit protein structural information or derive a limited set of static features from protein structures. However, structural changes induced by mutations are demonstrated to be a primary mechanism of viral immune escape [ 33 , 34 ]. In addition, most models for predicting mutational effects overlook the importance of distinguishing the signals of large-effect mutations from the noise introduced by neutral mutations in variants carrying multiple mutations. Given that viral fitness and immune escape are driven by a small number of mutations with large functional effects [ 19 ], while the majority are phenotypically neutral or attenuated [ 35 – 37 ], the noise from abundant neutral mutations can obscure the signals from determinative ones. In response to these gaps, we propose KEScape, a model to predict viral immune escape and evolution. The primary application of KEScape is to predict the immune escape potential of emerging variants when population immunity constitutes a major selective pressure and DMS datasets from the same protein family as the target viral protein are available. 
To make predictions, KEScape first integrates evolutionary and structural information into unified embeddings, and then models residue-residue dependencies to generate context-aware unified embeddings. Subsequently, the embedding pairs corresponding to the K mutations with the largest functional impacts, as quantified by their L 2 distances, are selectively pooled and used for prediction ( Fig. 1A ). KEScape introduces three innovative features: (i) a fusion mechanism that integrates evolutionary embeddings and structural embeddings into unified embeddings; (ii) a top-K L 2 -differential pooling mechanism to prioritize mutations with significant functional impacts; (iii) a supervised L 2 margin loss that encourages the model to enlarge L 2 distances between immune escape variants and their corresponding wild-type references in the embedding space. Download figure Open in new tab Fig. 1: Overview of KEScape. ( A ) An illustrative diagram of KEScape. The model processes protein sequences of the wild-type and mutant viruses as inputs, generating context-aware unified embeddings by integrating evolutionary and structural embeddings to capture residue-residue dependencies. For each sequence, KEScape selects and averages the pairs of context-aware unified embeddings ( K = 5) corresponding to K mutations with the largest L 2 distances. Subsequently, KEScape estimates the likelihood of the mutant virus evading host immune surveillance. ( B ) KEScape architecture. 
KEScape (i) derives evolutionary embeddings and structural embeddings from ESMFold for an input protein sequence; (ii) fuses these embeddings into unified embeddings via a cross-attention module with residual connections; (iii) captures residue-residue dependencies through a self-attention module; (iv) employs top-K L 2 -differential pooling to focus on mutations with the largest functional impacts and predicts an immune escape score for the mutant sequence; (v) calculates the supervised L 2 margin loss and weighted cross-entropy loss to evaluate the predictive performance during the training stage. ( C ) Top-K L 2 -differential pooling mechanism. Given N mutations, the pooling mechanism calculates L 2 distances for N pairs of wild-type and mutant context-aware unified embeddings, selects K pairs with largest L 2 distances, and averages the corresponding K wild-type and mutant embeddings, respectively. To address the limited scope of prior benchmarks, which have typically been confined to five or fewer viral species, we evaluated KEScape using a comprehensive suite of eleven DMS datasets. To our knowledge, this benchmark represents the most extensive collection assembled to date, spanning a diverse range of viruses and effectively doubling the scale of prior evaluations. In this benchmark, KEScape outperformed all competing models, achieving a mean area under the precision-recall curve (AUPRC) of 0.534 compared to 0.365 from the second best-performing model. Extensive studies have demonstrated that viruses frequently acquire mutations within specific regions and sites [ 15 , 38 , 39 ] to evade host immune surveillance. Consequently, identifying such mutation hotspots in the emerging lineages provides a valuable probe to assess their immune escape potential during pandemics. 
Therefore, we examined the performance of KEScape in identifying immune escape hotspots on the SARS-CoV-2 BA.2 spike protein and the results were consistent with previous independent studies [ 20 , 40 – 42 ]. In addition, we demonstrated that, when trained on SARS-CoV-2 Wuhan-Hu-1 and other viral proteins, KEScape can identify immune escape variants of SARS-CoV-2 XBB.1.5 lineages. However, we note that KEScape is not recommended for application to viral proteins whose corresponding protein families are not represented in the training set, as predictions in such settings may be unreliable. During epidemics and pandemics, novel viral variants frequently emerge across distinct geographic regions. To prepare for subsequent waves of re-infection, accurately predicting the immune escape potential of emerging variants at a large scale is essential. We evaluated KEScape using temporally partitioned sequences obtained from the GISAID database [ 31 , 43 ]. The assessment demonstrated that KEScape could identify lineages associated with WHO-designated variants of SARS-CoV-2 (hereafter referred to as WDV lineages) by assigning higher immune escape scores to them compared to contemporaneous non-WDV lineages. To elucidate the internal mechanisms of KEScape, we analyzed its latent embeddings and attention weights, confirming KEScape’s interpretability. Subsequently, we demonstrated the contribution of innovative components through ablation studies. Results Overview of KEScape KEScape comprises four main steps to predict viral immune escape and evolution. First, it employs ESMFold [ 44 ] to generate evolutionary and structural embeddings for both the wild-type and mutant viral protein sequences ( Fig. 1B step 1). In step 2, KEScape leverages a multi-head cross-attention module to systematically integrate structural and evolutionary embeddings. 
Within this framework, evolutionary embeddings serve as queries, while structural embeddings are keys and values, selectively amplified and combined according to their relevance in modeling immune escape for each query (Methods). After residual connections, KEScape generates unified embeddings by incorporating these reweighted structural embeddings into evolutionary embeddings ( Fig. 1B step 2). The fusion is applied to both the wild-type and mutant sequences. In step 3 ( Fig. 1B step 3), KEScape generates context-aware unified embeddings by capturing residue-residue dependencies from unified embeddings based on a self-attention module, enabling the modeling of intra-sequence interactions. In step 4, the context-aware unified embeddings of the wild-type and mutant sequences are put into the top-K L 2 -differential pooling module ( Fig. 1C and supplementary notes). Given N mutations, this module first isolates N pairs of context-aware unified embeddings corresponding to the mutated residues in both sequences. Pairwise L 2 distances are computed between the N wild-type/mutant embedding pairs, after which the K pairs exhibiting maximal divergence ( K empirically determined as min(5, N )) are retained. The K retained embedding pairs are averaged, generating two pooled embeddings encoding the mutational effects from these K mutations for the wild-type and mutant sequences, respectively. These two pooled embeddings and their differences are concatenated and processed through a multilayer perceptron to predict an immune escape score for the mutant sequence ( Fig. 1B step 4). The loss function of KEScape comprises two components: (i) a supervised L 2 margin loss, which increases the L 2 distance between the wild-type and mutant embeddings of escape variants while reducing the corresponding distance for non-escape variants; (ii) a weighted cross-entropy loss, which evaluates the classification performance. 
KEScape demonstrates state-of-the-art concordance with DMS experiments We collected eleven DMS datasets spanning diverse viral proteins [ 31 , 45 – 52 ] (Supplementary Table 1) and benchmarked the performance of KEScape against five computational models: CSCS, EVEscape, fine-tuned CSCS, fine-tuned EVEscape, and MLAEP. Fine-tuned versions of CSCS and EVEscape were included to control for the performance gains attributable to the training datasets, as both CSCS and EVEscape are originally unsupervised models. Because MLAEP is specifically designed for SARS-CoV-2, its performance was evaluated only on SARS-CoV-2 DMS datasets. In addition, the benchmarking against E2VD was conducted using its own designated dataset, as the antibody sequences it required were unavailable in our collected datasets. Comparative analysis across eleven DMS datasets ( Fig. 2A, C ) revealed the superior performance of KEScape, which achieved significantly higher area under the receiver operating characteristic curve (AUROC) than other models. Specifically, KEScape attained a mean AUROC of 0.901, compared with 0.851 for fine-tuned CSCS, 0.751 for fine-tuned EVEscape, 0.682 for EVEscape, and 0.587 for CSCS. As all datasets were imbalanced, we also evaluated model performance using AUPRC. KEScape likewise significantly outperformed the competing models in terms of AUPRC, achieving a mean AUPRC of 0.534, whereas fine-tuned CSCS, fine-tuned EVEscape, EVEscape, and CSCS achieved mean AUPRC values of 0.365, 0.168, 0.125, and 0.064, respectively ( Fig. 2A, D ). Download figure Open in new tab Fig. 2: Comparative performance evaluation of KEScape across diverse DMS datasets. ( A ) Aggregate comparison of AUROC and AUPRC values between KEScape and state-of-the-art computational models for viral immune escape prediction across eleven DMS datasets. The suffix ‘(F)’ designates the fine-tuned version of a respective model (e.g., CSCS (F)). 
Symbols above the competing models indicate the statistical significance of the extent to which KEScape outperforms each corresponding model: n . s . denotes p > 0.05; ∗ denotes 0.01 < p ≤ 0.05; and ∗∗ denotes p ≤ 0.01. The p -values were obtained using paired t -tests. ( B ) Extended benchmark including E2VD, assessed on its SARS-CoV-2 DMS dataset containing antibody sequences. E2VD requires both viral protein sequences and antibody sequences as inputs, whereas other models operate solely on viral protein inputs. ( C ) AUROC comparison of KEScape and competing state-of-the-art models across the eleven DMS datasets. MLAEP was evaluated only on the SARS-CoV-2 spike and SARS-CoV-2 XBB.1.5 spike datasets because it was specifically designed for SARS-CoV-2. ( D ) AUPRC comparison of KEScape and competing state-of-the-art models across the eleven DMS datasets. Notably, we found that all evaluated models exhibited variable performance across different datasets ( Fig. 2A, C, D ). KEScape achieved maximal AUPRC (0.864) for SARS-CoV-2 spike, while showing reduced AUPRC (0.223) for Zika envelope. Similar trends were observed for other models. For example, fine-tuned CSCS and fine-tuned EVEscape both reached their highest AUPRC on SARS-CoV-2 spike (0.771 and 0.572, respectively) but underperformed on H3N2 hemagglutinin and Lassa glycoprotein. This heterogeneity indicates that different viral proteins exhibit distinct propensities for specific immune escape mechanisms, which are not fully captured by current computational models. For instance, glycan shielding was reported to be critical for HIV immune escape [ 31 ]. Furthermore, we observed consistent performance patterns across models for certain datasets, with uniformly strong or weak predictive accuracy observed in specific viral protein contexts. 
For example, KEScape, fine-tuned CSCS, fine-tuned EVEscape and EVEscape all excelled in the SARS-CoV-2 spike dataset, while exhibiting comparatively reduced performance in the rabies glycoprotein dataset. In summary, these results suggest that the difficulty of predicting immune escape varies substantially across different viral proteins. Beyond potential data noise, the variation likely arises from differences in immune escape mechanisms, with certain viral proteins employing more complex mechanisms. To benchmark KEScape against E2VD, we used a SARS-CoV-2 DMS dataset containing antibody sequences provided by the E2VD study [ 32 ]. As illustrated in Fig. 2B , KEScape outperformed all competing models, including E2VD. We note that while E2VD explicitly requires antibody sequence information as input, KEScape and other models operate without this input. Our results reveal that KEScape can achieve superior performance in the absence of antibody-specific information, implying that its structure-informed evolutionary modeling - operating solely through viral protein analysis - is sufficient to capture critical determinants of viral immune escape. KEScape identifies immune escape hotspots and variants in emerging lineages A practical assessment of model performance is whether it can identify immune escape hotspots and variants in emerging lineages that were not represented in the training set. To evaluate hotspot identification, we leveraged the well-characterized evolutionary trajectory of SARS-CoV-2 and selected the BA.2 lineage, which was not in the training set, as a test case for assessing KEScape’s ability to identify immune escape hotspots. 
Mapping the max per-residue KEScape scores (representing the highest KEScape score among all single amino acid substitutions in each residue, see Methods) onto the SARS-CoV-2 BA.2 spike protein structure (Protein Data Bank (PDB): 7XIX) revealed that residues with high scores predominantly clustered in RBD and the N-terminal domain (NTD), while most residues outside these two domains exhibited mild predicted impact on immune escape ( Fig. 3A, B , Supplementary Fig. 1). These findings align with the established evidence that RBD and NTD mutations are primary drivers of immune escape in Omicron variants [ 20 , 53 , 54 ]. Notably, many residues with high KEScape scores correspond to experimentally validated immune escape hotspots. For instance, residue 483 (aligned with residue 486 in SARS-CoV-2 Wuhan-Hu-1) is a well-documented site critical for immune escape [ 40 – 42 ]. This residue constitutes a primary target for several neutralizing antibodies [ 55 , 56 ], and the F486V mutation present in BA.4/5 facilitates escape from multiple antibodies [ 20 , 40 ]. Similarly, mutations in residue 343 (346 in Wuhan-Hu-1), present in several Omicron sublineages, confer significant antigenic variation [ 53 ]. Furthermore, other high-scoring residues—including 142 (145), 441 (444), 449 (452), 453 (456), and 457 (460)—reside within functionally critical regions and can be associated with viral immune escape [ 20 , 57 – 61 ]. Download figure Open in new tab Fig. 3: KEScape identifies immune escape hotspots and variants in spike proteins from SARS-CoV-2 lineages not represented in the training set. ( A ) Max per-residue KEScape scores mapped onto one chain of the BA.2 spike (pdb: 7XIX). Higher scores, indicated by warmer colors, signify greater potential for immune escape. A subset of residues with high scores is annotated, with their corresponding Wuhan-Hu-1 coordinates indicated in parentheses. ( B ) Max per-residue KEScape scores for the BA.2 spike. 
( C ) Max per-residue KEScape scores for the RBD of the BA.2 spike. Residues with higher scores are highlighted, with the Wuhan-Hu-1 reference positions provided in parentheses. ( D ) A comprehensive mutation landscape of the BA.2 RBD. The heatmap displays the KEScape score for every possible single amino acid substitution at each residue position (x-axis). The wild-type amino acid at each position is omitted for clarity, while all alternative amino acids are shown with their respective KEScape scores. ( E ) Performance of KEScape, EVEscape, and CSCS on the SARS-CoV-2 XBB.1.5 spike DMS dataset. None of these models was trained on this lineage. Visualization of max per-residue KEScape scores for the BA.2 spike RBD ( Fig. 3C ) demonstrated that the highest-scoring residues were largely concentrated in the receptor-binding motif (RBM). RBM, a critical subdomain of the spike protein, mediates direct interaction with the host cell receptor ACE2 [ 62 , 63 ] and represents a major target for neutralizing antibodies [ 64 , 65 ]. This observation aligns with experimental evidence highlighting RBM mutations—such as N460K and F486V—as significant contributors to immune evasion [ 20 , 42 , 66 ]. Taken together, these results demonstrate that KEScape effectively identifies key regions governing immune escape and accurately pinpoints established immune escape hotspots. Analysis of the mean per-residue KEScape scores for the BA.2 RBD (Supplementary Fig. 2) revealed a distribution congruent with the max scores, exhibiting similar patterns of regions with large and small scores. However, several residues exhibited high maximum scores coupled with low mean scores. For example, residue 394 (397) displayed a significant discrepancy between its max and mean scores. Further investigation indicated that this residue possessed approximately three mutant amino acids with high immune escape scores, while the remaining substitutions had low scores ( Fig. 3D ). 
This observation suggests that while some residues exhibit broad mutational landscapes conducive to immune escape with minimal fitness cost, others are subject to significant constraint, tolerating only a limited subset of mutations, aside from some neutral mutations, without incurring substantial fitness penalties. In addition to identifying immune escape hotspots, we evaluated the ability of KEScape to detect immune escape variants. Specifically, KEScape was trained on DMS datasets of SARS-CoV-2 spike and other proteins, and its performance was then evaluated on the DMS dataset of the SARS-CoV-2 XBB.1.5 spike protein, a closely related homolog to the training set. We benchmarked KEScape against EVEscape and CSCS and found that KEScape achieved superior performance ( Fig. 3E ), demonstrating its effectiveness in detecting immune escape variants. However, we observed that KEScape’s predictive power is contingent upon the presence of homologous protein families within the training set. While KEScape generalized across distantly related homologs of different influenza A hemagglutinin subtypes, its performance deteriorated significantly when applied to the rabies glycoprotein (Supplementary Fig. 3). As a target lacking homologous representation in the training set, this evaluation yielded an AUROC only slightly above 0.5. These results suggest that KEScape should be applied to viral proteins for which proteins from the same family and viral species are represented in the training set. One practical use case is to train the model on proteins from the original lineage during a pandemic and then predict the immune escape potential of subsequently emerging lineages. 
KEScape as a real-time surveillance tool for emerging lineages A major challenge in predicting the immune escape potential of emerging SARS-CoV-2 lineages is the selection of an appropriate wild-type reference lineage, particularly in the context of heterogeneous population immunity shaped by diverse vaccination histories and prior infections with distinct viral lineages. To address this challenge, we comprehensively evaluated the ability of KEScape to surveil emerging SARS-CoV-2 lineages across seven 90-day intervals encompassing WDV lineages. We used both the ancestral Wuhan-Hu-1 isolate and the earliest WDV lineage from the preceding interval as wild-type references (Methods). All SARS-CoV-2 spike protein sequences were obtained from GISAID [ 31 ]. We benchmarked the performance of KEScape against MLAEP. Regarding the other baseline models, neither CSCS nor E2VD can process mutations involving insertions or deletions (indels). Because the dataset for this experiment contains indels, simply ignoring them is methodologically untenable; omitting indels would erroneously collapse distinct variants, including WDV lineages, into identical sequences. Similarly, EVEscape does not directly support indels. Although its underlying EVE model can theoretically be substituted with TranceptEVE [ 67 ] to accommodate such mutations, the scripts for calculating immune escape scores for indels have not been made publicly available. In addition, the Trancept model in TranceptEVE was pre-trained on UniRef100, which could contain all SARS-CoV-2 spike variant sequences instead of just one representative sequence in UniRef50. Including it would therefore introduce an unfair comparison. As shown in Fig. 4A , KEScape evaluated the immune escape potential of each viral variant relative to Wuhan-Hu-1 by comparing the cumulative distribution function of non-WDV lineages with the maximum KEScape scores of contemporaneous WDV lineages. 
Across the seven intervals, KEScape identified ten WDV lineages with scores exceeding those of 90% of non-WDV lineages, whereas four WDV lineages received scores below this threshold ( Fig. 4E ). By comparison, MLAEP assigned five WDV lineages scores substantially below the 90% threshold ( Fig. 4B, F ), indicating significantly weaker performance than KEScape. In addition, we noticed that, beginning in the fourth interval (September 2021 onward), while most WDV lineages received high scores, a large proportion of non-WDV lineages attained comparably high scores for both KEScape and MLAEP. We posited that this phenomenon arose because non-WDV lineages progressively accumulated escape-conferring mutations acquired from prior dominant lineages. Given the fixed wild-type reference (Wuhan-Hu-1), these mutations rendered non-WDV lineages antigenically distinct from the reference. Supporting this hypothesis, Supplementary Fig. 4A demonstrated a marked increase in the proportion of verified immune escape mutations [ 68 ] within non-WDV lineages starting in the fourth interval, correlating with the observed rise in high-scoring non-WDV lineages ( Fig. 4A-B ). Analysis of the immune escape mutations in WDV lineages (Supplementary Fig. 4B) suggested that non-WDV lineages indeed incorporated immune escape mutations from preceding WDV lineages. This validates our hypothesis and explains the convergence in KEScape and MLAEP scores between WDV and non-WDV lineages. Download figure Open in new tab Fig. 4: Performance of KEScape and MLAEP in prioritizing WDV lineages of SARS-CoV-2 in the lineage surveillance application. ( A-B ) Lineage surveillance evaluation using the ancestral Wuhan-Hu-1 as the wild-type reference for ( A ) KEScape and ( B ) MLAEP. The scores of non-WDV lineages are presented by cumulative distribution functions. The scores of WDV-lineages from each interval are indicated by vertical lines. 
( C-D ) Lineage surveillance evaluation using dynamic references for ( C ) KEScape and ( D ) MLAEP. Data presentations of C and D follow identical conventions to A and B , respectively. ( E–H ) Predicted cumulative probabilities for WDV lineages relative to a 0.9 baseline. Predictions are shown for KEScape ( E, G ) and MLAEP ( F, H ). Wild-type references are based on the Wuhan-Hu-1 ( E–F ) or dynamic references ( G–H ). The evolutionary trajectory of SARS-CoV-2 is predominantly characterized by continuous antigenic drift, enabling the emergence of novel lineages capable of evading host immunity established against previously prevalent lineages. To capture these dynamics, we performed an additional experiment assessing the performance using the earliest WDV lineage from the preceding interval as the wild-type reference. As shown in Fig. 4C and G , KEScape assigned scores to all but one WDV lineage that exceeded those of 90% of contemporaneous non-WDV lineages. This strong discriminatory performance is notable given the multitude of factors that influence lineage prevalence, including transmission bottlenecks [ 69 , 70 ]. In contrast, MLAEP assigned three WDV lineages scores below the 90% threshold, indicating inferior performance. We note that both KEScape and MLAEP leverage ESM2, which was pre-trained on the September 2021 release of UniRef50, a database where highly homologous viral sequences are collapsed into single representatives. In addition, ESMFold was trained on PDB chains available before May 2020, and our fine-tuned ESMFold was further trained on dozens of viral protein structures containing only two instances of the SARS-CoV-2 spike protein. Consequently, the risk of data leakage in our evaluation is effectively mitigated. Taken together, these results demonstrate that KEScape exhibits robust predictive capacity for the early identification of emerging viral lineages with immune escape potential. 
KEScape interpretability analysis and ablation study To elucidate the internal mechanisms underlying KEScape’s capacity to model viral immune escape and evolution, we analyzed two components in the model: (i) latent embeddings preceding the final linear layer of a multilayer perceptron in the immune escape prediction step; (ii) attention weights from self-attention modules in the residue-residue dependencies identification step (Methods). Samples in the test sets of DMS datasets were used for this analysis. As demonstrated in Fig. 5A-B , SARS-CoV-2 XBB.1.5 spike and H1N1 hemagglutinin variants with immune escape capacity formed clusters that were distinct from the majority of their non-escape counterparts in the two-dimensional t-SNE projection [ 71 ], providing a robust basis for linear classification. A similar separation pattern was observed in the latent embeddings of other viral proteins (Supplementary Figs. 5-13), indicating the ability of KEScape to discriminate between immune escape and non-escape variants in the embedding space. We further quantified the local enrichment of escape variants using KNN-based enrichment scores, denoted as pos-enrichment k (Methods). These values are shown in the lower-right corners of Fig. 5A and B . In both cases, pos-enrichment k exceeded 1, suggesting that the model tends to embed escape variants in proximity to other escape variants. Download figure Open in new tab Fig. 5: Visualization of latent embeddings and attention weights in KEScape. ( A-B ) t-SNE visualization of latent embeddings preceding the final linear layer of a multilayer perceptron in the immune escape prediction step for ( A ) SARS-CoV-2 XBB.1.5 spike and ( B ) H1N1 hemagglutinin. The pos-enrichment k =5 value is shown in the lower-right corner of each subplot. ( C-D ) Heatmaps of self-attention weights for ( C ) SARS-CoV-2 XBB.1.5 spike and ( D ) H1N1 hemagglutinin. 
Color intensity is proportional to attention weight magnitude, with darker hues indicating higher values. Analysis of attention weights for SARS-CoV-2 spike ( Fig. 5C ) revealed pronounced attention focusing on specific tokens, with mild variation across input queries. The tokens with high attention weights localized to antibody-targeted regions including RBD, NTD and fusion peptide proximal region [ 19 , 72 ]. A parallel analysis of H1N1 hemagglutinin similarly demonstrated focused attention on tokens within major neutralizing antibody-binding regions, particularly the HA1 subunit ( Fig. 5D ). The HA1 subunit, containing the receptor-binding sites and comprising the globular head, represents the primary target for neutralizing antibodies, while the HA2 subunit is occasionally targeted by neutralizing antibodies [ 15 ]. Collectively, these findings indicate that KEScape prioritizes functionally critical regions governing immune escape. To evaluate the contribution of supervised L 2 margin loss and structural embeddings, we implemented two ablated variants in which these components were removed individually. Because the supervised L 2 margin loss is designed to facilitate the ranking of high-impact mutations in the context of multiple mutations, it is not expected to have a substantial effect in the DMS benchmarks, which consist only of single-mutation samples. As demonstrated in Fig. 6A-C , incorporating the supervised L 2 margin loss indeed did not compromise performance on the DMS benchmarks and instead provided a marginal improvement. For the ablated variant excluding structural embeddings, KEScape outperformed this variant across the eleven DMS datasets ( Fig. 6A ), as evidenced by a significant increase in AUPRC. Comparative analysis of the latent embeddings for H1N1 hemagglutinin ( Fig. 6B ) showed that KEScape produced a more compact distribution of immune escape variants than the ablated variant without structural embeddings. 
This observation was further supported by the pos-enrichment k =5 values shown in the figure. We further summarized the average pos-enrichment k across the eleven DMS datasets for k = 2, 5, 10 ( Fig. 6C ). KEScape achieved the highest average pos-enrichment k for all three values of k . In addition, we investigated the contribution of these two components in the emerging lineage surveillance application. As shown in Fig. 6D , the ablated variant without supervised L 2 margin loss assigned five out of fourteen WDV lineages scores below those of 90% of non-WDV lineages, whereas the variant without structural embeddings assigned three WDV lineages below this threshold. By comparison, KEScape assigned only one WDV lineage below the 90% threshold. Collectively, these results suggest that incorporating supervised L 2 margin loss and structural embeddings improves the ability of KEScape to surveil emerging lineages and identify viral variants with immune escape potential. We further assessed the performance enhancement conferred by top-K L 2 -differential pooling. As the pooling was designed for multiple mutations while the eleven DMS datasets all consisted of single mutations, we evaluated the pooling in the emerging lineage surveillance application, utilizing the earliest WDV lineage from the preceding interval as the wild-type reference. As illustrated in Fig. 6E , top-K L 2 -differential pooling enabled accurate identification of predominant WDV lineages through elevated scoring relative to contemporaneous non-WDV lineages, significantly outperforming both max and mean pooling approaches. This can be attributed to the fact that max pooling’s emphasis on the mutation with maximal embedding distance predisposes it to neglect combinatorial mutation effects and makes it more sensitive to outliers. Download figure Open in new tab Fig. 6: Ablation analysis of KEScape components. 
( A ) Performance comparison between KEScape and its ablated variants (lacking structural embeddings or the supervised L 2 margin loss) across the eleven DMS datasets. The p -values were obtained using paired t -tests. ( B ) t-SNE visualization of latent embeddings preceding the final linear layer of a multilayer perceptron in the immune escape prediction step for KEScape and its ablated counterparts. ( C ) Average pos-enrichment k ( k = 2, 5, 10) of KEScape and its ablated counterparts. ( D-E ) Predicted cumulative probabilities relative to a 0.9 baseline for ( D ) KEScape and its ablated counterparts and ( E ) KEScape using top-K L 2 -differential pooling versus max pooling and mean pooling. The wild-type references are dynamic. Conversely, the averaging mechanism of mean pooling can be significantly influenced by neutral mutations and thus dampen the effects of beneficial mutations due to the dominance of neutral mutations, which systematically makes mean pooling assign reduced scores for all viral variants. Discussion The evolutionary arms race between viral pathogens and host immunity necessitates a paradigm shift from reactive public health measures to proactive surveillance. The efficacy of vaccines and therapeutics is perpetually challenged by antigenic drift, which allows novel variants to escape pre-existing immunity and cause successive waves of infection. To shorten the timeline for countermeasures, computational models that can accurately predict the immune escape potential of emerging variants are indispensable. These in silico approaches circumvent logistical and temporal bottlenecks of wet-lab experiments, enabling high-throughput risk assessment of viral sequences once they are detected by global surveillance systems. Here, we present KEScape, a deep learning model that advances the state-of-the-art performance in immune escape prediction. 
We attribute its success to a novel architecture that synergistically integrates evolutionary context with protein structure and captures residue-residue dependencies in a sequence. Furthermore, KEScape incorporates two novel components to deal with multiple mutations: a top-K L 2 -differential pooling mechanism designed to account for effects from critical mutations and a supervised L 2 margin loss to facilitate L 2 -distance-based mutation ranking. KEScape demonstrated high effectiveness by setting a new performance benchmark on DMS datasets that significantly outperformed competing models. Beyond the benchmark performance, KEScape showed a strong capacity to identify immune escape hotspots and variants and to surveil SARS-CoV-2 WDV lineages. Furthermore, KEScape exhibited a high degree of interpretability. It focused on functionally critical regions and accurately distinguished between escape and non-escape viral variants within the embedding space. Despite these advances, wet-lab DMS experiments remain indispensable, particularly for novel viral species. As investigated in the Results, KEScape can achieve strong performance on emerging lineages that are not represented in the training set. However, this capability depends on the presence of at least a subset of training proteins from the same protein family as the target viral protein. In the absence of such a condition, KEScape predictions may be unreliable. Therefore, when a novel virus emerges with pandemic potential, wet-lab DMS experiments against the wild-type virus are essential during the initial wave of large-scale infection or vaccination. Once population-level adaptive immunity has been established and the virus is subjected to strong immune-mediated selective pressure, computational models such as KEScape can be applied to surveil newly emerging variants using the DMS dataset. 
An ideal predictive framework would identify high-risk variants at their genesis and accurately forecast their subsequent evolutionary trajectories. However, the inherent stochasticity of viral evolution, influenced by factors such as transmission bottlenecks, renders such deterministic prediction currently impractical. A more practical application, for which KEScape is well-suited, is to generate a prioritized pool of candidate variants with high predicted immune escape potential based on real-time genomic surveillance. This allows for the immediate computational assessment of newly detected variants, identifying high-risk candidates for rapid experimental validation and informing public health responses. While the prediction of viral immune escape remains a formidable challenge due to biological complexity and stochastic effects, the progress demonstrated here and elsewhere in the field represents a significant step towards developing the tools necessary to combat future pandemics. Methods Deep mutational scanning datasets We collected eleven DMS datasets of various viral proteins (Supplementary Table 1). Since the three DMS datasets obtained from EVEscape were already processed, we performed data processing on the remaining eight DMS datasets. Specifically, we removed any mutations that (i) had no recorded immune escape scores, (ii) included ambiguous amino acids (i.e., X, B, Z, J), or (iii) resulted in a premature stop codon. If negative immune escape scores were present in a dataset, we applied an exponential transformation to the scores. In particular, we used base-2 exponential transformations for the SARS-CoV-2 XBB.1.5 spike, H5N1 hemagglutinin, Nipah receptor-binding protein, Lassa glycoprotein, H3N2 hemagglutinin, and rabies glycoprotein datasets, whereas a base-10 exponential transformation was applied to the H3N2 neuraminidase dataset. 
For datasets containing multiple immune escape scores per mutation from different neutralizing antibodies or sera (including Lassa glycoprotein, H3N2 hemagglutinin and rabies glycoprotein), we used the maximum immune escape score for each mutation. To mitigate noise in immune escape scores, we adopted the binarization strategy from EVEscape. For each dataset, we fitted a gamma distribution using the method of moments and set a threshold corresponding to a 5% false discovery rate. Mutations with immune escape scores greater than or equal to this threshold were labeled as conferring immune escape (label 1), whereas those below the threshold were labeled as non-escape (label 0). We excluded any mutations in the test set that could not be predicted by EVEscape due to insufficient coverage in its multiple sequence alignment. This ensured consistent test sets across all computational models. For the SARS-CoV-2 DMS dataset obtained from E2VD, we constructed a transformed dataset that used a single immune escape score for each mutation by selecting the maximum value observed across all tested neutralizing antibodies. This transformed dataset was then randomly divided into training (70%), validation (15%) and test (15%) sets, with the constraint that no mutation appeared in more than one subset. The original, non-transformed dataset was subsequently partitioned into training, validation, and test sets by mapping each sample according to the split assignment of its corresponding mutation in the transformed dataset. The test set was filtered to be consistent with the transformed one by selecting only the antibody with the largest immune escape score for each mutation. For binarization, we set the threshold to 0.4, consistent with E2VD. We trained E2VD using its default settings on the training set and evaluated its performance on the test set. KEScape was trained on the transformed training set, and all models except E2VD were evaluated on the transformed test set. 
EVEscape and CSCS fine-tuning We included fine-tuned versions of EVEscape and CSCS in the benchmark. For EVEscape, we used four features as input to a multilayer perceptron: the embedding from the final hidden layer of the EVE encoder before the mean and log-variance heads for the mutant sequence ( H e ), the EVE score ( S EVE ), the weighted contact number ( S WCN ), and the dissimilarity score ( S D ). Specifically, the fine-tuned EVEscape computes the immune escape score of a mutant sequence as follows: $$\hat{y}_{\mathrm{EVEscape}} = \sigma_1\left(f_3\left(\mathrm{ReLU}\left(f_2\left(\mathrm{ReLU}\left(f_1\left([H_e, S_{\mathrm{EVE}}, S_{\mathrm{WCN}}, S_D]\right)\right)\right)\right)\right)\right)$$ Here, f 1 , f 2 , and f 3 denote linear transformations; ReLU denotes the rectified linear unit activation function; [ H e , S EVE , S WCN , S D ] denotes the concatenation of the four features; and σ 1 denotes the sigmoid function. The model was fine-tuned using a weighted cross-entropy loss function with a learning rate of 0.001. For CSCS, we initialized the model from the protein-specific pre-trained checkpoint and appended an immune escape prediction head. This head takes as input the final hidden state of the LSTM encoding the left sequence context ( H l ) and the final hidden state of the LSTM encoding the right sequence context ( H r ). These two context embeddings are concatenated and passed through a multilayer perceptron, analogous to the fine-tuning head used for EVEscape, to produce immune escape scores: $$\hat{\mathbf{y}}_{\mathrm{CSCS}} = \sigma_2\left(f_6\left(\mathrm{ReLU}\left(f_5\left(\mathrm{ReLU}\left(f_4\left([H_l, H_r]\right)\right)\right)\right)\right)\right)$$ In this expression, f 4 , f 5 , and f 6 denote linear transformations; ReLU denotes the rectified linear unit activation function; [ H l , H r ] denotes the concatenation of H l and H r ; and σ 2 denotes the element-wise sigmoid function. The prediction head outputs a vector of scores over amino acid identities conditioned on the sequence context surrounding the target residue. During training, the score corresponding to the mutant amino acid at the target position is selected and optimized against the ground-truth label using cross-entropy loss with a learning rate of 0.001. 
As a result, only substitutions observed in the training set contribute directly to the loss, whereas substitutions absent from the training set do not receive direct supervision. Immune escape hotspot experiment Given twenty different amino acids, with one wild-type and nineteen alternative amino acids for a residue i , the max per-residue KEScape score m ( i ) is defined to be $$m(i) = \max_{s \in \mathcal{S}_i} s,$$ where $\mathcal{S}_i$ is the set of all immune escape scores of the mutant sequences with alternative amino acids at residue i . Similarly, the mean per-residue KEScape score z ( i ) is defined to be $$z(i) = \frac{1}{|\mathcal{S}_i|} \sum_{s \in \mathcal{S}_i} s.$$ The max per-residue KEScape scores were calculated and mapped onto a viral variant of the SARS-CoV-2 BA.2 lineage (pdb: 7XIX). SARS-CoV-2 lineage surveillance dataset We used a dataset from EVEscape, comprising viral sequences from GISAID sampled across diverse geographical regions over time. Based on the WHO and PANGO nomenclature [ 73 ], we classified lineages associated with WHO-designated variants (referred to as WDV lineages), as listed in Supplementary Table 2. All remaining lineages were classified as non-WDV lineages. The dataset aggregated viral isolates with the same spike protein sequences into a group. For each group, the earliest collection date among all constituent isolates was assigned as the representative date for that group. In addition, a definitive PANGO lineage was assigned to each group: if any isolate within a group belonged to WDV lineages, the entire group was classified under that lineage. Groups were subsequently filtered to exclude any that contained ambiguous amino acids, premature stop codons, or were represented by ten or fewer viral isolates (count ≤ 10). To reflect the dynamics of viral lineage turnover, we divided the timeline into non-overlapping 90-day intervals. WDV lineages present in previous intervals were excluded from subsequent intervals, and any intervals without newly emerged WDV lineages were also removed. We retrieved immune escape mutations from Alam et al. 
[ 68 ], calculated the proportion of these mutations in non-WDV lineages over different intervals, and investigated their presence in WDV lineages (Supplementary Figure 4). We conducted two experiments with different wild-type references: one using the ancestral Wuhan-Hu-1 (a constant reference) and one using the earliest WDV lineage from the preceding interval (a dynamic reference). The immune escape potential of each unique viral spike sequence was predicted by KEScape relative to both the constant and dynamic references. Since a WDV lineage can encompass multiple variants with different sets of mutations within a time interval, a representative score was required. Therefore, to quantify the immune escape potential of a WDV lineage, the maximum KEScape score among all its constituent variants within that interval was selected. KEScape architecture KEScape is designed to quantify the immune escape potential of a given viral variant. The model takes a mutant sequence, S m , and its corresponding wild-type reference sequence, S w , as input. It computes a score, denoted as $\hat{y}$, which is constrained to the interval (0, 1). A higher score signifies an increased probability that the viral variant will evade host immune surveillance. This predictive process can be mathematically formulated by representing the KEScape model as a function f KEScape , such that: $$\hat{y} = f_{\mathrm{KEScape}}(S_m, S_w)$$ The inference process of KEScape comprises four primary steps. KEScape begins with the generation of protein embeddings from a fine-tuned ESMFold (650M parameter version, see supplementary notes), denoted as f ESMFold . For a given wild-type and mutant protein sequence pair, the model processes each sequence separately. Input sequences exceeding a length of 1022 amino acids should be truncated to meet the model’s architectural constraints. ESMFold generates two distinct sets of embeddings for each residue in a sequence: evolutionary embeddings and structural embeddings. 
Specifically, for a wild-type sequence S w of length L w , this operation yields evolutionary embeddings $E_w^{\mathrm{evo}}$ with dimensions ( L w +2, 1280), and structural embeddings $E_w^{\mathrm{str}}$ with dimensions ( L w , 1024). An analogous procedure is applied to the mutant sequence S m to produce its respective embeddings, $E_m^{\mathrm{evo}}$ and $E_m^{\mathrm{str}}$. This process can be formally expressed as: $$\left(E_w^{\mathrm{evo}}, E_w^{\mathrm{str}}\right) = f_{\mathrm{ESMFold}}(S_w), \qquad \left(E_m^{\mathrm{evo}}, E_m^{\mathrm{str}}\right) = f_{\mathrm{ESMFold}}(S_m)$$ Here, $E_w^{\mathrm{evo}}$ represents the set of evolutionary embeddings, including embeddings for each residue, augmented by bos and eos tokens. In contrast, $E_w^{\mathrm{str}}$ is the set of structural embeddings, corresponding only to the residues of the sequence. The embeddings for the mutant sequence, $E_m^{\mathrm{evo}}$ and $E_m^{\mathrm{str}}$, are generated analogously. In the second step, KEScape integrates evolutionary embeddings and structural embeddings into unified embeddings for each sequence. For clarity, the following procedure is described for the wild-type sequence. The dimensionality of the evolutionary embeddings is first reduced from 1280 to 1024 via a linear projection layer f linear 1 to match that of the structural embeddings: $$\tilde{E}_w^{\mathrm{evo}} = f_{\mathrm{linear}\,1}\left(E_w^{\mathrm{evo}}\right),$$ where the resulting $\tilde{E}_w^{\mathrm{evo}}$ has dimensions of ( L w + 2, 1024). These projected evolutionary embeddings are transformed to queries, and the structural embeddings, $E_w^{\mathrm{str}}$, are transformed to keys and values as follows: $$Q_w = \tilde{E}_w^{\mathrm{evo}} W_{q,w} + b_{q,w}, \qquad K_w = E_w^{\mathrm{str}} W_{k,w} + b_{k,w}, \qquad V_w = E_w^{\mathrm{str}} W_{v,w} + b_{v,w},$$ where W q,w , W k,w , W v,w are the weights and b q,w , b k,w , b v,w are the biases. Q w , K w , and V w are subsequently divided into 8 heads as $Q_w^{t}$, $K_w^{t}$, and $V_w^{t}$ for a given head t . Subsequently, $Z_w^{t}$, the weighted sum of the values, is calculated as follows: $$Z_w^{t} = \mathrm{softmax}\left(\frac{Q_w^{t} \left(K_w^{t}\right)^{\top}}{\sqrt{d}}\right) V_w^{t}$$ In this formulation, d is a scaling factor, and $\left(K_w^{t}\right)^{\top}$ is the transpose of $K_w^{t}$. The output matrices from all eight heads are concatenated and passed through a linear projection ( f linear 2 ) to produce a representation Z w : $$Z_w = f_{\mathrm{linear}\,2}\left(\mathrm{Concat}\left(Z_w^{1}, \ldots, Z_w^{8}\right)\right)$$ The resulting Z w has the dimensions of ( L w + 2, 1024). Finally, this representation is processed through a multilayer perceptron with residual connections to yield unified embeddings for the wild-type sequence. The unified embeddings for the mutant sequence can be derived analogously. 
Following the integration of structural embeddings and evolutionary embeddings, KEScape employs a self-attention module to further refine the unified embeddings by modeling residue-residue dependencies. For each sequence, the unified embeddings are projected into query Q , key K and value V . The self-attention module then computes a contextually updated representation, Z ′ , by calculating a weighted sum of the values, where the weights are determined by the similarity between queries and keys: $$Z' = \mathrm{softmax}\left(\frac{Q K^{\top}}{\sqrt{d'}}\right) V$$ Here, d ′ is a scaling factor. This output is subsequently processed by a multilayer perceptron with residual connections, yielding the context-aware unified embeddings for each sequence. In the fourth step, KEScape employs a top-K L 2 -differential pooling mechanism to identify and select mutations with large L 2 distances. For a mutant sequence with N mutations, the L 2 distance is calculated between the wild-type and mutant context-aware unified embeddings at each of the N mutant positions. Suppose the corresponding N embeddings for the wild-type and mutant sequences are $\{h_i^{w}\}_{i=1}^{N}$ and $\{h_i^{m}\}_{i=1}^{N}$, respectively; the L 2 distance at a given mutant position i is then calculated as $$\delta_i = \left\lVert h_i^{w} - h_i^{m} \right\rVert_2 = \sqrt{\sum_{j} \left(h_{i,j}^{w} - h_{i,j}^{m}\right)^2}$$ Here, $h_{i,j}^{w}$ and $h_{i,j}^{m}$ are the j -th elements of the vectors $h_i^{w}$ and $h_i^{m}$, respectively. The top- K mutant positions are selected by ranking the L 2 distances in descending order. Let $\mathcal{I}_K$ denote the index set of the K mutant positions with the largest L 2 distances, where $|\mathcal{I}_K| = K$. 
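The corresponding wild-type and mutant context-aware unified embeddings are then separately averaged to obtain the pooled embeddings: $$\bar{h}^{w} = \frac{1}{|\mathcal{I}_K|} \sum_{i \in \mathcal{I}_K} h_i^{w}, \qquad \bar{h}^{m} = \frac{1}{|\mathcal{I}_K|} \sum_{i \in \mathcal{I}_K} h_i^{m},$$ where $h_i^{w}$ and $h_i^{m}$ are the wild-type and mutant context-aware unified embeddings at mutant position i and $\mathcal{I}_K$ is the index set of the selected top- K mutant positions. KEScape then constructs a representation by concatenating the wild-type pooled embedding, the mutant pooled embedding, and their difference: $$r = \left[\bar{h}^{w}, \bar{h}^{m}, \bar{h}^{m} - \bar{h}^{w}\right]$$ This representation is passed through a multilayer perceptron consisting of two linear layers with a GELU activation between them, followed by a softmax layer: $$o = W_{\mathrm{clf},2}\, \mathrm{GELU}\left(W_{\mathrm{clf},1}\, r + b_{\mathrm{clf},1}\right) + b_{\mathrm{clf},2}, \qquad p = \mathrm{softmax}(o)$$ Here, W clf ,1 and b clf ,1 are the weight matrix and bias vector of the first linear layer, W clf ,2 and b clf ,2 are those of the second linear layer, and o denotes the output logits. The final vector $p$ represents the predicted class probabilities, with the immune escape probability given by the component of $p$ corresponding to the positive (escape) class. Supervised L 2 margin loss Supervised L 2 margin loss is designed to explicitly force the model to increase the L 2 distances between the wild-type and mutant embeddings for immune escape variants while decreasing such distances for non-escape variants in the embedding space. For each sample, the average L 2 distance across the selected K mutations is defined as: $$\bar{\delta} = \frac{1}{|\mathcal{I}_K|} \sum_{i \in \mathcal{I}_K} \delta_i,$$ where $\delta_i$ is the L 2 distance between the wild-type and mutant embeddings at mutant position i . During training, each sample is associated with a ground-truth label y ∈ {0, 1}, where y = 1 denotes an immune escape variant (positive class) and y = 0 denotes a non-escape variant (negative class). The supervised L 2 margin loss applies distinct penalties based on the class label. For positive samples, the model is penalized if $\bar{\delta}$ is smaller than a predefined positive margin m pos . Conversely, for negative samples, the model is penalized if $\bar{\delta}$ exceeds a predefined negative margin m neg . To prevent extreme penalties from dominating the gradient, the penalties are capped by a ceiling parameter C . The individual sample penalties $\ell_n^{\mathrm{pos}}$ and $\ell_n^{\mathrm{neg}}$ for a given sample n are formulated as $$\ell_n^{\mathrm{pos}} = \min\left(\max\left(0,\; m_{\mathrm{pos}} - \bar{\delta}_n\right),\; C\right), \qquad \ell_n^{\mathrm{neg}} = \min\left(\max\left(0,\; \bar{\delta}_n - m_{\mathrm{neg}}\right),\; C\right)$$ for positive and negative classes, respectively, where $\bar{\delta}_n$ denotes the mean L 2 distance for sample n . 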
For a given training batch, let 𝒫 denote the set of indices for positive samples ( y = 1) and 𝒩 denote the set of indices for negative samples ( y = 0), the supervised L 2 margin loss is then defined as: $$\mathcal{L}_{\mathrm{margin}} = \frac{w_{\mathrm{pos}}}{|\mathcal{P}|} \sum_{n \in \mathcal{P}} \ell_n^{\mathrm{pos}} + \frac{w_{\mathrm{neg}}}{|\mathcal{N}|} \sum_{n \in \mathcal{N}} \ell_n^{\mathrm{neg}},$$ where $\ell_n^{\mathrm{pos}}$ and $\ell_n^{\mathrm{neg}}$ are the capped per-sample penalties for positive and negative samples, w pos and w neg are the class weights for positive and negative samples, respectively, and |𝒫| and |𝒩| are the number of samples in each class within the batch. KEScape training KEScape was trained using a combined loss function consisting of weighted binary cross-entropy loss and supervised L 2 margin loss. The optimizer was selected as AdamW [ 74 ] with β 1 = 0.9, β 2 = 0.98, and a weight decay of 0.01. The gradient accumulation was employed to achieve an effective batch size of 128. A learning rate scheduler with a warm-up phase was implemented. The learning rate increased to a peak value over the first min(int(0.05 × num_training_steps), 500) steps, and then decayed to a final value of 10% of its peak value in the subsequent training period. Each DMS dataset was partitioned into training (70%), validation (15%), and test (15%) sets. In the DMS benchmark experiments, KEScape was trained separately on the training set of each dataset and evaluated on the corresponding test set using the checkpoint that achieved the lowest cross-entropy loss on the validation set. The peak learning rate was set to 0.001. In zero-shot experiments, including the identification of immune escape hotspots and variants from emerging lineages as well as lineage surveillance, KEScape was trained on those DMS datasets that excluded the target lineages, with a peak learning rate of 0.0005. The only exception was the identification of immune escape hotspots in the BA.2 spike protein, for which KEScape was trained on all DMS datasets except the XBB.1.5 spike dataset (there is no BA.2 spike DMS dataset in the benchmark). 
For the comparative benchmark against E2VD, training was restricted to the SARS-CoV-2 DMS dataset provided by E2VD, with training, validation, and test splits of 70%, 15%, and 15%, respectively. Model interpretation and ablation study To elucidate the decision-making process of KEScape, we conducted model interpretation experiments. First, we analyzed the latent embeddings preceding the final linear layer of a multilayer perceptron in the immune escape prediction step. These high-dimensional embeddings were projected into a two-dimensional space using t-SNE to visualize the distribution of immune escape and non-escape viral variants. In addition, we used a KNN-based positive enrichment score to quantify the degree of enrichment for embeddings of escape variants in the high-dimensional embedding space. The score is defined as follows: $$\text{pos-enrichment}_k = \frac{\dfrac{1}{N_{\mathrm{pos}}} \displaystyle\sum_{i:\, y_i = 1} \dfrac{1}{k} \sum_{j \in \mathcal{N}_k(x_i)} \mathbb{1}(y_j = 1)}{N_{\mathrm{pos}} / N},$$ where N is the total number of samples, $N_{\mathrm{pos}}$ is the number of positive samples, 𝒩 k ( x i ) denotes the set of k -nearest neighbors of sample x i , y i ∈ {0, 1} is the class label of sample i , and 1 ( y j = 1) is the indicator function equal to 1 if neighbor j is positive and 0 otherwise. Second, we examined pre-softmax attention weights from the self-attention modules, which are responsible for identifying residue-residue dependencies. These weights were visualized as heatmaps to identify the specific residue-residue interactions that the model weighted most heavily in its computations. To evaluate the contribution of integrating the supervised L 2 margin loss and structural embeddings, an ablation study was conducted wherein these two components were removed individually. Specifically, for the ablated variant excluding the supervised L 2 margin loss, this loss was removed from the loss function while the others remained the same. For the ablated variant excluding the structural embeddings, the corresponding structural embeddings input to the cross-attention module were replaced with a duplicate of the evolutionary embeddings. 
Accordingly, the linear layer in the evolution-structure fusion block was applied to both evolutionary embeddings to ensure dimensional compatibility. In a separate ablation study targeting the top-K $L_2$-differential pooling mechanism, we replaced the mechanism with max pooling and mean pooling, respectively. Max pooling was defined as selecting a single mutation with the largest $L_2$ distance between its wild-type and mutant context-aware unified embeddings. In contrast, mean pooling involved averaging the context-aware unified embeddings across all mutant positions for the wild-type and mutant sequences, respectively. Data availability The sources of the eleven DMS datasets are provided in Supplementary Table 1 and these datasets are available at https://github.com/ericcombiolab/KEScape. The DMS dataset from E2VD can be downloaded from https://github.com/ZhiweiNiepku/E2VD. The SARS-CoV-2 BA.2 spike protein structure (PDB: 7XIX) is available at https://www.rcsb.org/structure/7XIX. The SARS-CoV-2 lineage surveillance dataset is available at https://marks.hms.harvard.edu/evescape/strain_scores_20230318.zip. Code availability The source code of KEScape is available at https://github.com/ericcombiolab/KEScape. Author contributions C.W. and L.Z. conceived the study. C.W. developed the methods and conducted the experiments. L.Z. contributed to technical discussions. C.W. wrote the manuscript. L.Z. revised the manuscript. Competing interests The authors declare no competing interests. Supplementary information A supplementary file is attached. Acknowledgements The project is partially supported by the Young Collaborative Research grant (No. C2004-23Y) and HMRF grant (No. 11221026). Fig. 1A was created with BioRender.com. Funder Information Declared the Young Collaborative Research grant, No. C2004-23Y; HMRF grant, No. 11221026 Footnotes We have refined the model's architecture, integrated a supervised $L_2$ margin loss, and officially renamed the model as KEScape. 
Furthermore, we have expanded our comparative analysis by incorporating fine-tuned versions of unsupervised baseline models into the deep mutational scanning (DMS) benchmark, alongside the addition of MLAEP for the lineage surveillance task. All relevant experimental results have been updated accordingly to reflect these enhancements. References [1]. ↵ Broecker , F. , Moelling , K. : What viruses tell us about evolution and immunity: beyond Darwin? Ann N Y Acad Sci 1447 ( 1 ), 53 – 68 ( 2019 ) OpenUrl PubMed [2]. ↵ Wissler , A. , DeWitte , S.N. : Frailty and survival in the 1918 influenza pandemic . Proc. Natl. Acad. Sci. U. S. A . 120 ( 42 ), 2304545120 ( 2023 ) OpenUrl [3]. ↵ Jha , P. , Brown , P.E. , Ansumana , R. : Counting the global COVID-19 dead . Lancet 399 ( 10339 ), 1937 – 1938 ( 2022 ) OpenUrl PubMed [4]. ↵ Msemburi , W. , Karlinsky , A. , Knutson , V. , Aleshin-Guendel , S. , Chatterji , S. , Wakefield , J. : The WHO estimates of excess mortality associated with the COVID-19 pandemic . Nature 613 ( 7942 ), 130 – 137 ( 2023 ) OpenUrl PubMed [5]. ↵ Ukoaka , B.M. , Okesanya , O.J. , Daniel , F.M. , Ahmed , M.M. , Udam , N.G. , Wagwula , P.M. , Adigun , O.A. , Udoh , R.A. , Peter , I.G. , Lawal , H. : Updated WHO list of emerging pathogens for a potential future pandemic: Implications for public health and global preparedness . Infez. Med . 32 ( 4 ), 463 – 477 ( 2024 ) OpenUrl PubMed [6]. Eisfeld , A.J. , Biswas , A. , Guan , L. , Gu , C. , Maemura , T. , Trifkovic , S. , Wang , T. , Babujee , L. , Dahn , R. , Halfmann , P.J. , Barnhardt , T. , Neumann , G. , Suzuki , Y. , Thompson , A. , Swinford , A.K. , Dimitrov , K.M. , Poulsen , K. , Kawaoka , Y. : Pathogenicity and transmissibility of bovine H5N1 influenza virus . Nature 633 ( 8029 ), 426 – 432 ( 2024 ) OpenUrl CrossRef PubMed [7]. ↵ Peacock , T.P. , Moncla , L. , Dudas , G. , VanInsberghe , D. , Sukhova , K. , Lloyd-Smith , J.O. , Worobey , M. , Lowen , A.C. , Nelson , M.I. 
: The global H5N1 influenza panzootic in mammals . Nature 637 ( 8045 ), 304 – 313 ( 2025 ) OpenUrl CrossRef PubMed [8]. ↵ Simpson , S. , Kaufmann , M.C. , Glozman , V. , Chakrabarti , A. : Disease x: accelerating the development of medical countermeasures for the next pandemic . Lancet Infect. Dis . 20 ( 5 ), 108 – 115 ( 2020 ) OpenUrl [9]. Van Kerkhove , M.D. , Ryan , M.J. , Ghebreyesus , T.A. : Preparing for “disease x” . Science 374 ( 6566 ), 377 ( 2021 ) OpenUrl PubMed [10]. Becerra , X. , Jha , A. : Project NextGen - defeating SARS-CoV-2 and preparing for the next pandemic . N. Engl. J. Med . 389 ( 9 ), 773 – 775 ( 2023 ) OpenUrl CrossRef PubMed [11]. Viral respiratory infections in a rapidly changing climate: the need to prepare for the next pandemic . eBioMedicine 93 , 104593 ( 2023 ) OpenUrl PubMed [12]. Holmes , E.C. , Krammer , F. , Goodrum , F.D. : Virology-The next fifty years . Cell 187 ( 19 ), 5128 – 5145 ( 2024 ) OpenUrl CrossRef PubMed [13]. ↵ Hamelin , D.J. , Scicluna , M. , Saadie , I. , Mostefai , F. , Grenier , J.C. , Baron , C. , Caron , E. , Hussin , J.G. : Predicting pathogen evolution and immune evasion in the age of artificial intelligence . Comput. Struct. Biotechnol. J . 27 , 1370 – 1382 ( 2025 ) OpenUrl CrossRef PubMed [14]. ↵ Chung , A.W. , Isitman , G. , Navis , M. , Kramski , M. , Center , R.J. , Kent , S.J. , Stratov , I. : Immune escape from HIV-specific antibody-dependent cellular cytotoxicity (ADCC) pressure . Proc. Natl. Acad. Sci. U. S. A . 108 ( 18 ), 7505 – 7510 ( 2011 ) OpenUrl Abstract / FREE Full Text [15]. ↵ Petrova , V.N. , Russell , C.A. : The evolution of seasonal influenza viruses . Nat. Rev. Microbiol . 16 ( 1 ), 47 – 60 ( 2018 ) OpenUrl CrossRef PubMed [16]. Barnes , C.O. , Jette , C.A. , Abernathy , M.E. , Dam , K.-M.A. , Esswein , S.R. , Gristick , H.B. , Malyutin , A.G. , Sharaf , N.G. , Huey-Tubman , K.E. , Lee , Y.E. , Robbiani , D.F. , Nussenzweig , M.C. , West, A.P. Jr , Bjorkman , P.J. 
: SARS-CoV-2 neutralizing antibody structures inform therapeutic strategies . Nature 588 ( 7839 ), 682 – 687 ( 2020 ) OpenUrl CrossRef PubMed [17]. Garcia-Beltran , W.F. , Lam , E.C. , St Denis , K. , Nitido , A.D. , Garcia , Z.H. , Hauser , B.M. , Feldman , J. , Pavlovic , M.N. , Gregory , D.J. , Poznansky , M.C. , Sigal , A. , Schmidt , A.G. , Iafrate , A.J. , Naranbhai , V. , Balazs , A.B. : Multiple SARS-CoV-2 variants escape neutralization by vaccine-induced humoral immunity . Cell 184 ( 9 ), 2523 ( 2021 ) OpenUrl CrossRef PubMed [18]. Andreano , E. , Piccini , G. , Licastro , D. , Casalino , L. , Johnson , N.V. , Paciello , I. , Dal Monego , S. , Pantano , E. , Manganaro , N. , Manenti , A. , Manna , R. , Casa , E. , Hyseni , I. , Benincasa , L. , Montomoli , E. , Amaro , R.E. , McLellan , J.S. , Rappuoli , R. : SARS-CoV-2 escape from a highly neutralizing COVID-19 convalescent plasma . Proc. Natl. Acad. Sci. U. S. A . 118 ( 36 ), 2103154118 ( 2021 ) OpenUrl [19]. ↵ Harvey , W.T. , Carabelli , A.M. , Jackson , B. , Gupta , R.K. , Thomson , E.C. , Harrison , E.M. , Ludden , C. , Reeve , R. , Rambaut , A. , COVID-19 Genomics UK (COG-UK) Consortium , Peacock , S.J. , Robertson , D.L. : SARS-CoV-2 variants, spike mutations and immune escape . Nat. Rev. Microbiol . 19 ( 7 ), 409 – 424 ( 2021 ) OpenUrl CrossRef PubMed [20]. ↵ Carabelli , A.M. , Peacock , T.P. , Thorne , L.G. , Harvey , W.T. , Hughes , J. , COVID-19 Genomics UK Consortium , Peacock , S.J. , Barclay , W.S. , Silva , T.I. , Towers , G.J. , Robertson , D.L. : SARS-CoV-2 variant biology: immune escape, transmission and fitness . Nat. Rev. Microbiol . 21 ( 3 ), 162 – 177 ( 2023 ) OpenUrl PubMed [21]. ↵ Krammer , F. : The human antibody response to influenza a virus infection and vaccination . Nat. Rev. Immunol . 19 ( 6 ), 383 – 397 ( 2019 ) OpenUrl CrossRef PubMed [22]. Wei , C.-J. , Crank , M.C. , Shiver , J. , Graham , B.S. , Mascola , J.R. , Nabel , G.J. 
: Next-generation influenza vaccines: opportunities and challenges . Nat. Rev. Drug Discov . 19 ( 4 ), 239 – 252 ( 2020 ) OpenUrl CrossRef PubMed [23]. Chen , J.-R. , Liu , Y.-M. , Tseng , Y.-C. , Ma , C. : Better influenza vaccines: an industry perspective . J. Biomed. Sci . 27 ( 1 ), 33 ( 2020 ) OpenUrl CrossRef PubMed [24]. ↵ Watson , O.J. , Barnsley , G. , Toor , J. , Hogan , A.B. , Winskill , P. , Ghani , A.C. : Global impact of the first year of COVID-19 vaccination: a mathematical modelling study . Lancet Infect. Dis . 22 ( 9 ), 1293 – 1302 ( 2022 ) OpenUrl CrossRef PubMed [25]. ↵ Xie , H. , Wan , X.-F. , Ye , Z. , Plant , E.P. , Zhao , Y. , Xu , Y. , Li , X. , Finch , C. , Zhao , N. , Kawano , T. , Zoueva , O. , Chiang , M.-J. , Jing , X. , Lin , Z. , Zhang , A. , Zhu , Y. : H3N2 mismatch of 2014-15 northern hemisphere influenza vaccines and head-to-head comparison between human and ferret antisera derived antigenic maps . Sci. Rep . 5 ( 1 ), 15279 ( 2015 ) OpenUrl CrossRef PubMed [26]. Gouma , S. , Weirick , M. , Hensley , S.E. : Antigenic assessment of the H3N2 component of the 2019-2020 northern hemisphere influenza vaccine . Nat. Commun . 11 ( 1 ), 2445 ( 2020 ) OpenUrl CrossRef PubMed [27]. ↵ Tenforde , M.W. , Kondor , R.J.G. , Chung , J.R. , Zimmerman , R.K. , Nowalk , M.P. , Jackson , M.L. , Jackson , L.A. , Monto , A.S. , Martin , E.T. , Belongia , E.A. , McLean , H.Q. , Gaglani , M. , Rao , A. , Kim , S.S. , Stark , T.J. , Barnes , J.R. , Wentworth , D.E. , Patel , M.M. , Flannery , B. : Effect of antigenic drift on influenza vaccine effectiveness in the united states-2019-2020 . Clin. Infect. Dis . 73 ( 11 ), 4244 – 4250 ( 2021 ) OpenUrl [28]. ↵ Fowler , D.M. , Fields , S. : Deep mutational scanning: a new style of protein science . Nat. Methods 11 ( 8 ), 801 – 807 ( 2014 ) OpenUrl CrossRef PubMed Web of Science [29]. ↵ Hie , B. , Zhong , E.D. , Berger , B. , Bryson , B. : Learning the language of viral evolution and escape . 
Science 371 ( 6526 ), 284 – 288 ( 2021 ) OpenUrl Abstract / FREE Full Text [30]. ↵ Han , W. , Chen , N. , Xu , X. , Sahil , A. , Zhou , J. , Li , Z. , Zhong , H. , Gao , E. , Zhang , R. , Wang , Y. , Sun , S. , Cheung , P.P.-H. , Gao , X. : Predicting the antigenic evolution of SARS-COV-2 with deep learning . Nat. Commun . 14 ( 1 ), 3478 ( 2023 ) OpenUrl CrossRef PubMed [31]. ↵ Thadani , N.N. , Gurev , S. , Notin , P. , Youssef , N. , Rollins , N.J. , Ritter , D. , Sander , C. , Gal , Y. , Marks , D.S. : Learning from prepandemic data to forecast viral escape . Nature 622 ( 7984 ), 818 – 825 ( 2023 ) OpenUrl CrossRef PubMed [32]. ↵ Nie , Z. , Liu , X. , Chen , J. , Wang , Z. , Liu , Y. , Si , H. , Dong , T. , Xu , F. , Song , G. , Wang , Y. , Zhou , P. , Gao , W. , Tian , Y. : A unified evolution-driven deep learning framework for virus variation driver prediction . Nat. Mach. Intell . ( 2025 ) [33]. ↵ Valkenburg , S.A. , Gras , S. , Guillonneau , C. , Hatton , L.A. , Bird , N.A. , Twist , K.-A. , Halim , H. , Jackson , D.C. , Purcell , A.W. , Turner , S.J. , Doherty , P.C. , Rossjohn , J. , Kedzierska , K. : Preemptive priming readily overcomes structure-based mechanisms of virus escape . Proc. Natl. Acad. Sci. U. S. A . 110 ( 14 ), 5570 – 5575 ( 2013 ) OpenUrl Abstract / FREE Full Text [34]. ↵ Cai , Y. , Zhang , J. , Xiao , T. , Lavine , C.L. , Rawson , S. , Peng , H. , Zhu , H. , Anand , K. , Tong , P. , Gautam , A. , Lu , S. , Sterling , S.M. , Walsh, R.M. Jr , Rits-Volloch , S. , Lu , J. , Wesemann , D.R. , Yang , W. , Seaman , M.S. , Chen , B. : Structural basis for enhanced infectivity and immune evasion of SARS-CoV-2 variants . Science 373 ( 6555 ), 642 – 648 ( 2021 ) OpenUrl Abstract / FREE Full Text [35]. ↵ Frost , S.D.W. , Magalis , B.R. , Kosakovsky Pond , S.L. : Neutral theory and rapidly evolving viral pathogens . Mol. Biol. Evol . 35 ( 6 ), 1348 – 1354 ( 2018 ) OpenUrl CrossRef PubMed [36]. Lyons , D.M. , Lauring , A.S. 
: Mutation and epistasis in influenza virus evolution . Viruses 10 ( 8 ), 407 ( 2018 ) OpenUrl CrossRef PubMed [37]. ↵ Markov , P.V. , Ghafari , M. , Beer , M. , Lythgoe , K. , Simmonds , P. , Stilianakis , N.I. , Katzourakis , A. : The evolution of SARS-CoV-2 . Nat. Rev. Microbiol . 21 ( 6 ), 361 – 379 ( 2023 ) OpenUrl CrossRef PubMed [38]. ↵ Civetta , A. , Ostapchuk , D.C.M. , Nwali , B. : Genome hotspots for nucleotide substitutions and the evolution of influenza a (H1N1) human strains . Genome Biol. Evol . 8 ( 4 ), 986 – 993 ( 2016 ) OpenUrl CrossRef PubMed [39]. ↵ Jian , F. , Wang , J. , Yisimayi , A. , Song , W. , Xu , Y. , Chen , X. , Niu , X. , Yang , S. , Yu , Y. , Wang , P. , Sun , H. , Yu , L. , Wang , J. , Wang , Y. , An , R. , Wang , W. , Ma , M. , Xiao , T. , Gu , Q. , Shao , F. , Wang , Y. , Shen , Z. , Jin , R. , Cao , Y. : Evolving antibody response to SARS-CoV-2 antigenic shift from XBB to JN.1 . Nature 637 ( 8047 ), 921 – 929 ( 2025 ) OpenUrl CrossRef PubMed [40]. ↵ Wang , Q. , Guo , Y. , Iketani , S. , Nair , M.S. , Li , Z. , Mohri , H. , Wang , M. , Yu , J. , Bowen , A.D. , Chang , J.Y. , Shah , J.G. , Nguyen , N. , Chen , Z. , Meyers , K. , Yin , M.T. , Sobieszczyk , M.E. , Sheng , Z. , Huang , Y. , Liu , L. , Ho , D.D. : Antibody evasion by SARS-CoV-2 omicron subvariants BA.2.12.1, BA.4 and BA.5 . Nature 608 ( 7923 ), 603 – 608 ( 2022 ) OpenUrl CrossRef PubMed [41]. Quadir , N. , Singh , J. , Alam , A. , Malik , A.A. , Rahman , S.A. , Hira , S. , Ehtesham , N.Z. , Sundar , D. , Hasnain , S.E. : Evolution of SARS-CoV-2: BA.4/BA.5 variants continues to pose new challenges . Viruses 14 ( 12 ), 2610 ( 2022 ) OpenUrl PubMed [42]. ↵ Cox , M. , Peacock , T.P. , Harvey , W.T. , Hughes , J. , Wright , D.W. , COVID-19 Genomics UK (COG-UK) Consortium , Willett , B.J. , Thomson , E. , Gupta , R.K. , Peacock , S.J. , Robertson , D.L. , Carabelli , A.M. : SARS-CoV-2 variant evasion of monoclonal antibodies based on in vitro studies . Nat. Rev. Microbiol . 
21 ( 2 ), 112 – 124 ( 2023 ) OpenUrl CrossRef PubMed [43]. ↵ Khare , S. , Gurry , C. , Freitas , L. , Schultz , M.B. , Bach , G. , Diallo , A. , Akite , N. , Ho , J. , Lee , R.T. , Yeo , W. , Curation Team, G.C ., Maurer-Stroh , S. : GISAID’s role in pandemic response . China CDC Wkly . 3 ( 49 ), 1049 – 1051 ( 2021 ) OpenUrl CrossRef PubMed [44]. ↵ Lin , Z. , Akin , H. , Rao , R. , Hie , B. , Zhu , Z. , Lu , W. , Smetanin , N. , Verkuil , R. , Kabeli , O. , Shmueli , Y. , Dos Santos Costa , A. , Fazel-Zarandi , M. , Sercu , T. , Candido , S. , Rives , A. : Evolutionary-scale prediction of atomic-level protein structure with a language model . Science 379 ( 6637 ), 1123 – 1130 ( 2023 ) OpenUrl CrossRef PubMed [45]. ↵ Sourisseau , M. , Lawrence , D.J.P. , Schwarz , M.C. , Storrs , C.H. , Veit , E.C. , Bloom , J.D. , Evans , M.J. : Deep mutational scanning comprehensively maps how zika envelope protein mutations affect viral growth and antibody escape . Journal of Virology 93 ( 23 ), e01291-19 ( 2019 ) OpenUrl CrossRef [46]. Lei , R. , Hernandez Garcia , A. , Tan , T.J.C. , Teo , Q.W. , Wang , Y. , Zhang , X. , Luo , S. , Nair , S.K. , Peng , J. , Wu , N.C. : Mutational fitness landscape of human influenza H3N2 neuraminidase . Cell Rep . 42 ( 1 ), 111951 ( 2023 ) OpenUrl PubMed [47]. Dadonaite , B. , Ahn , J.J. , Ort , J.T. , Yu , J. , Furey , C. , Dosey , A. , Hannon , W.W. , Baker , A.L.V. , Webby , R.J. , King , N.P. , Liu , Y. , Hensley , S.E. , Peacock , T.P. , Moncla , L.H. , Bloom , J.D. : Deep mutational scanning of h5 hemagglutinin to inform influenza virus surveillance . bioRxiv ( 2024 ) [48]. Carr , C.R. , Crawford , K.H.D. , Murphy , M. , Galloway , J.G. , Haddox , H.K. , Matsen , F.A. 4th, Andersen , K.G. , King , N.P. , Bloom , J.D. : Deep mutational scanning reveals functional constraints and antibody-escape potential of lassa virus glycoprotein complex . Immunity 57 ( 9 ), 2061 – 2076.e11 ( 2024 ) OpenUrl CrossRef PubMed [49]. Dadonaite , B. 
, Brown , J. , McMahon , T.E. , Farrell , A.G. , Figgins , M.D. , Asarnow , D. , Stewart , C. , Lee , J. , Logue , J. , Bedford , T. , Murrell , B. , Chu , H.Y. , Veesler , D. , Bloom , J.D. : Spike deep mutational scanning helps predict success of SARS-CoV-2 clades . Nature 631 ( 8021 ), 617 – 626 ( 2024 ) OpenUrl CrossRef PubMed [50]. Welsh , F.C. , Eguia , R.T. , Lee , J.M. , Haddox , H.K. , Galloway , J. , Van Vinh Chau , N. , Loes , A.N. , Huddleston , J. , Yu , T.C. , Quynh Le , M. , Nhat , N.T.D. , Thi Le Thanh , N. , Greninger , A.L. , Chu , H.Y. , Englund , J.A. , Bedford , T. , Matsen , F.A. 4th, Boni , M.F. , Bloom , J.D. : Age-dependent heterogeneity in the antigenic effects of mutations to influenza hemagglutinin . Cell Host Microbe 32 ( 8 ), 1397 – 1411.e11 ( 2024 ) OpenUrl CrossRef PubMed [51]. Aditham , A.K. , Radford , C.E. , Carr , C.R. , Jasti , N. , King , N.P. , Bloom , J.D. : Deep mutational scanning of rabies glycoprotein defines mutational constraint and antibody-escape mutations ( 2024 ) [52]. ↵ Larsen , B.B. , McMahon , T. , Brown , J.T. , Wang , Z. , Radford , C.E. , Crowe, J.E. Jr , Veesler , D. , Bloom , J.D. : Functional and antigenic landscape of the nipah virus receptor-binding protein . Cell 188 ( 9 ), 2480 – 2494.e22 ( 2025 ) OpenUrl PubMed [53]. ↵ Cao , Y. , Jian , F. , Wang , J. , Yu , Y. , Song , W. , Yisimayi , A. , Wang , J. , An , R. , Chen , X. , Zhang , N. , Wang , Y. , Wang , P. , Zhao , L. , Sun , H. , Yu , L. , Yang , S. , Niu , X. , Xiao , T. , Gu , Q. , Shao , F. , Hao , X. , Xu , Y. , Jin , R. , Shen , Z. , Wang , Y. , Xie , X.S. : Imprinted SARS-CoV-2 humoral immunity induces convergent omicron RBD evolution . Nature 614 ( 7948 ), 521 – 529 ( 2023 ) OpenUrl PubMed [54]. ↵ Yajima , H. , Nomai , T. , Okumura , K. , Maenaka , K. , Genotype to Phenotype Japan (G2P-Japan) Consortium , Ito , J. , Hashiguchi , T. , Sato , K. : Molecular and structural insights into SARS-CoV-2 evolution: from BA.2 to XBB subvariants . 
MBio 15 ( 10 ), 0322023 ( 2024 ) OpenUrl [55]. ↵ Liu , Z. , VanBlargan , L.A. , Bloyet , L.-M. , Rothlauf , P.W. , Chen , R.E. , Stumpf , S. , Zhao , H. , Errico , J.M. , Theel , E.S. , Liebeskind , M.J. , Alford , B. , Buchser , W.J. , Ellebedy , A.H. , Fremont , D.H. , Diamond , M.S. , Whelan , S.P.J. : Identification of SARS-CoV-2 spike mutations that attenuate monoclonal and serum antibody neutralization . Cell Host Microbe 29 ( 3 ), 477 – 488.e4 ( 2021 ) OpenUrl CrossRef PubMed [56]. ↵ Cao , Y. , Wang , J. , Jian , F. , Xiao , T. , Song , W. , Yisimayi , A. , Huang , W. , Li , Q. , Wang , P. , An , R. , Wang , J. , Wang , Y. , Niu , X. , Yang , S. , Liang , H. , Sun , H. , Li , T. , Yu , Y. , Cui , Q. , Liu , S. , Yang , X. , Du , S. , Zhang , Z. , Hao , X. , Shao , F. , Jin , R. , Wang , X. , Xiao , J. , Wang , Y. , Xie , X.S. : Omicron escapes the majority of existing SARS-CoV-2 neutralizing antibodies . Nature 602 ( 7898 ), 657 – 663 ( 2022 ) OpenUrl CrossRef PubMed [57]. ↵ Dussupt , V. , Sankhala , R.S. , Mendez-Rivera , L. , Townsley , S.M. , Schmidt , F. , Wieczorek , L. , Lal , K.G. , Donofrio , G.C. , Tran , U. , Jackson , N.D. , Zaky , W.I. , Zemil , M. , Tritsch , S.R. , Chen , W.-H. , Martinez , E.J. , Ahmed , A. , Choe , M. , Chang , W.C. , Hajduczki , A. , Jian , N. , Peterson , C.E. , Rees , P.A. , Rutkowska , M. , Slike , B.M. , Selverian , C.N. , Swafford , I. , Teng , I.-T. , Thomas , P.V. , Zhou , T. , Smith , C.J. , Currier , J.R. , Kwong , P.D. , Rolland , M. , Davidson , E. , Doranz , B.J. , Mores , C.N. , Hatziioannou , T. , Reiley , W.W. , Bieniasz , P.D. , Paquin-Proulx , D. , Gromowski , G.D. , Polonis , V.R. , Michael , N.L. , Modjarrad , K. , Joyce , M.G. , Krebs , S.J. : Low-dose in vivo protection and neutralization across SARS-CoV-2 variants by monoclonal antibody combinations . Nat. Immunol . 22 ( 12 ), 1503 – 1514 ( 2021 ) OpenUrl CrossRef PubMed [58]. Greaney , A.J. , Starr , T.N. , Barnes , C.O. , Weisblum , Y. , Schmidt , F. 
, Caskey , M. , Gaebler , C. , Cho , A. , Agudelo , M. , Finkin , S. , Wang , Z. , Poston , D. , Muecksch , F. , Hatziioannou , T. , Bieniasz , P.D. , Robbiani , D.F. , Nussenzweig , M.C. , Bjorkman , P.J. , Bloom , J.D. : Mapping mutations to the SARS-CoV-2 RBD that escape binding by different classes of antibodies . Nat. Commun . 12 ( 1 ), 4196 ( 2021 ) OpenUrl CrossRef PubMed [59]. Scarpa , F. , Sanna , D. , Benvenuto , D. , Borsetti , A. , Azzena , I. , Casu , M. , Fiori , P.L. , Giovanetti , M. , Maruotti , A. , Ceccarelli , G. , Caruso , A. , Caccuri , F. , Cauda , R. , Cassone , A. , Pascarella , S. , Ciccozzi , M. : Genetic and structural data on the SARS-CoV-2 omicron BQ.1 variant reveal its low potential for epidemiological expansion . Int. J. Mol. Sci . 23 ( 23 ), 15264 ( 2022 ) OpenUrl CrossRef PubMed [60]. McCarthy , K.R. , Rennick , L.J. , Nambulli , S. , Robinson-McCarthy , L.R. , Bain , W.G. , Haidar , G. , Duprex , W.P. : Recurrent deletions in the SARS-CoV-2 spike glycoprotein drive antibody escape . Science 371 ( 6534 ), 1139 – 1142 ( 2021 ) OpenUrl Abstract / FREE Full Text [61]. ↵ Cao , Y. , Song , W. , Wang , L. , Liu , P. , Yue , C. , Jian , F. , Yu , Y. , Yisimayi , A. , Wang , P. , Wang , Y. , Zhu , Q. , Deng , J. , Fu , W. , Yu , L. , Zhang , N. , Wang , J. , Xiao , T. , An , R. , Wang , J. , Liu , L. , Yang , S. , Niu , X. , Gu , Q. , Shao , F. , Hao , X. , Meng , B. , Gupta , R.K. , Jin , R. , Wang , Y. , Xie , X.S. , Wang , X. : Characterization of the enhanced infectivity and antibody evasion of omicron BA.2.75 . Cell Host Microbe 30 ( 11 ), 1527 – 1539.e5 ( 2022 ) OpenUrl CrossRef PubMed [62]. ↵ Lan , J. , Ge , J. , Yu , J. , Shan , S. , Zhou , H. , Fan , S. , Zhang , Q. , Shi , X. , Wang , Q. , Zhang , L. , Wang , X. : Structure of the SARS-CoV-2 spike receptor-binding domain bound to the ACE2 receptor . Nature 581 ( 7807 ), 215 – 220 ( 2020 ) OpenUrl CrossRef PubMed [63]. ↵ Shang , J. , Ye , G. , Shi , K. , Wan , Y. , Luo , C. 
, Aihara , H. , Geng , Q. , Auerbach , A. , Li , F. : Structural basis of receptor recognition by SARS-CoV-2 . Nature 581 ( 7807 ), 221 – 224 ( 2020 ) OpenUrl CrossRef PubMed [64]. ↵ Chen , Y. , Zhao , X. , Zhou , H. , Zhu , H. , Jiang , S. , Wang , P. : Broadly neutralizing antibodies to SARS-CoV-2 and other human coronaviruses . Nat. Rev. Immunol . 23 ( 3 ), 189 – 199 ( 2023 ) OpenUrl CrossRef PubMed [65]. ↵ Shitaoka , K. , Higashiura , A. , Kawano , Y. , Yamamoto , A. , Mizoguchi , Y. , Hashiguchi , T. , Nishimichi , N. , Huang , S. , Ito , A. , Ohki , S. , Kanda , M. , Taniguchi , T. , Yoshizato , R. , Azuma , H. , Kitajima , Y. , Yokosaki , Y. , Okada , S. , Sakaguchi , T. , Yasuda , T. : Structural basis of spike RBM-specific human antibodies counteracting broad SARS-CoV-2 variants . Commun. Biol . 6 ( 1 ), 395 ( 2023 ) OpenUrl PubMed [66]. ↵ He , Q. , Wu , L. , Xu , Z. , Wang , X. , Xie , Y. , Chai , Y. , Zheng , A. , Zhou , J. , Qiao , S. , Huang , M. , Shang , G. , Zhao , X. , Feng , Y. , Qi , J. , Gao , G.F. , Wang , Q. : An updated atlas of antibody evasion by SARS-CoV-2 omicron sub-variants including BQ.1.1 and XBB . Cell Rep. Med . 4 ( 4 ), 100991 ( 2023 ) OpenUrl PubMed [67]. ↵ Notin , P. , Van Niekerk , L. , Kollasch , A. , Ritter , D. , Gal , Y. , Marks , D. : TranceptEVE: Combining family-specific and family-agnostic models of protein sequences for improved fitness prediction ( 2022 ) [68]. ↵ Alam , M.S. : Insight into SARS-CoV-2 omicron variant immune escape possibility and variant independent potential therapeutic opportunities . Heliyon 9 ( 2 ), 13285 ( 2023 ) OpenUrl [69]. ↵ Bergstrom , C.T. , McElhany , P. , Real , L.A. : Transmission bottlenecks as determinants of virulence in rapidly evolving pathogens . Proc. Natl. Acad. Sci. U. S. A . 96 ( 9 ), 5095 – 5100 ( 1999 ) OpenUrl Abstract / FREE Full Text [70]. ↵ Sinclair , P. , Zhao , L. , Beggs , C.B. , Illingworth , C.J.R. 
: The airborne transmission of viruses causes tight transmission bottlenecks . Nat. Commun . 15 ( 1 ), 3540 ( 2024 ) OpenUrl CrossRef PubMed [71]. ↵ Maaten , L. , Hinton , G. : Visualizing data using t-sne . Journal of Machine Learning Research 9 ( 86 ), 2579 – 2605 ( 2008 ) OpenUrl [72]. ↵ Dacon , C. , Tucker , C. , Peng , L. , Lee , C.-C.D. , Lin , T.-H. , Yuan , M. , Cong , Y. , Wang , L. , Purser , L. , Williams , J.K. , Pyo , C.-W. , Kosik , I. , Hu , Z. , Zhao , M. , Mohan , D. , Cooper , A.J.R. , Peterson , M. , Skinner , J. , Dixit , S. , Kollins , E. , Huzella , L. , Perry , D. , Byrum , R. , Lembirik , S. , Drawbaugh , D. , Eaton , B. , Zhang , Y. , Yang , E.S. , Chen , M. , Leung , K. , Weinberg , R.S. , Pegu , A. , Geraghty , D.E. , Davidson , E. , Douagi , I. , Moir , S. , Yewdell , J.W. , Schmaljohn , C. , Crompton , P.D. , Holbrook , M.R. , Nemazee , D. , Mascola , J.R. , Wilson , I.A. , Tan , J. : Broadly neutralizing antibodies target the coronavirus fusion peptide . Science 377 ( 6607 ), 728 – 735 ( 2022 ) OpenUrl CrossRef PubMed [73]. ↵ Rambaut , A. , Holmes , E.C. , O’Toole , Á. , Hill , V. , McCrone , J.T. , Ruis , C. , Plessis , L. , Pybus , O.G. : A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology . Nat. Microbiol . 5 ( 11 ), 1403 – 1407 ( 2020 ) OpenUrl PubMed [74]. ↵ Loshchilov , I. , Hutter , F. : Decoupled Weight Decay Regularization ( 2019 ). https://arxiv.org/abs/1711.05101 View the discussion thread. Back to top Previous Next Posted May 11, 2026. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following A structure-informed evolutionary model for predicting viral immune escape and evolution Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A structure-informed evolutionary model for predicting viral immune escape and evolution Chonghao Wang , Lu Zhang bioRxiv 2025.07.31.667864; doi: https://doi.org/10.1101/2025.07.31.667864 Share This Article: Copy Citation Tools A structure-informed evolutionary model for predicting viral immune escape and evolution Chonghao Wang , Lu Zhang bioRxiv 2025.07.31.667864; doi: https://doi.org/10.1101/2025.07.31.667864 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Microbiology Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41936) Biophysics (21452) Cancer Biology (18588) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15153) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00