Enhancing Clinical Classification of Protein Variants using ESM2 and UMAP

doi:10.1101/2025.07.26.666924

Enhancing Clinical Classification of Protein Variants using ESM2 and UMAP

2025 · doi:10.1101/2025.07.26.666924

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 22,372 characters · extracted from preprint-html · click to expand

Enhancing Clinical Classification of Protein Variants using ESM2 and UMAP | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Enhancing Clinical Classification of Protein Variants using ESM2 and UMAP Ugo Lomoio , Pierangelo Veltri , View ORCID Profile Pietro Hiram Guzzi doi: https://doi.org/10.1101/2025.07.26.666924 Ugo Lomoio 1 Department of Surgical and Medical Sciences, University of Catanzaro Find this author on Google Scholar Find this author on PubMed Search for this author on this site Pierangelo Veltri 2 DIMES, University of Calabria Find this author on Google Scholar Find this author on 
PubMed Search for this author on this site Pietro Hiram Guzzi 1 Department of Surgical and Medical Sciences, University of Catanzaro Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Pietro Hiram Guzzi For correspondence: hguzzi{at}unicz.it Abstract Full Text Info/History Metrics Preview PDF Abstract Protein sequences may vary due to mutations in their coding DNA sequence, leading to differences in structure and function. The same protein may exist in multiple variant forms, each potentially leading to distinct phenotypic consequences depending on how the alterations affect its structure, function, or expression. Missense variants are single nucleotide substitutions in the DNA sequence that result in the replacement of one amino acid with another in the corresponding protein, potentially altering its structure, stability, or function. The clinical interpretation of missense variants in protein-coding regions remains a fundamental challenge in genomic medicine. Recent advances in protein language models and manifold learning provide new opportunities for unsupervised extraction of biologically relevant information from protein sequences. In this work, we integrate representations derived from ESM2 (Evolutionary Scale Modeling 2, a transformer-based protein language model) with nonlinear dimensionality reduction via UMAP (Uniform Manifold Approximation and Projection) to improve the classification of variants of uncertain significance (VUS) in disease-associated proteins. Our results suggest that this approach improves the separability of benign and pathogenic variants, offering a scalable and interpretable strategy for variant prioritization in precision medicine. 1 Introduction The growing accessibility of genome sequencing technologies has ushered in a new era of precision medicine, enabling the identification of genetic variants at an unprecedented scale. 
However, while sequencing pipelines can detect thousands of variants per individual, interpreting their clinical relevance remains a formidable challenge. Among protein-coding alterations, missense variants — those resulting in amino acid substitutions — are particularly difficult to interpret due to the complex relationship between sequence variation, protein structure and function, and clinical phenotype [ 5 ]. As a result, a large proportion of such variants are classified as variants of uncertain significance (VUS), which poses significant limitations to genetic diagnosis, risk assessment, and personalized treatment [ 8 ]. Traditional computational tools for variant effect prediction, such as SIFT, PolyPhen-2, CADD, and REVEL, leverage features based on evolutionary conservation, sequence homology, protein structure, and functional annotations [ 1 ]. While these tools provide valuable insights and have been widely adopted in clinical practice, they are constrained by limitations in interpretability, gene-specific biases, and generalizability across variant classes. Moreover, their reliance on curated databases or explicit alignments restricts their scalability to rare or novel variants, which often lack sufficient homologous information. Recent advances in self-supervised learning, particularly in the domain of protein language modeling, have opened new frontiers in computational biology. Inspired by developments in natural language processing, protein language models (PLMs) are trained on vast corpora of unaligned amino acid sequences using transformer-based architectures to learn contextual embeddings that implicitly capture structural, functional, and evolutionary signals [ 6 ]. These models, such as ESM (Evolutionary Scale Modeling), ProtBERT, and TAPE, are capable of producing high-dimensional vector representations of protein sequences or individual residues without relying on explicit multiple sequence alignments. 
Among these, ESM2 represents the most recent and powerful iteration in the ESM family of models, trained on over 60 billion amino acids from UniRef and comprising up to 15 billion parameters [ 3 ]. ESM2 generates contextualized embeddings at the residue and sequence level that have demonstrated state-of-the-art performance across several downstream tasks, including structure prediction, contact map inference, and mutational effect prediction. These embeddings have the potential to encode nuanced information about protein folding, dynamics, and biochemical function, making them highly attractive for variant effect classification. However, despite the richness of PLM-derived representations, their high dimensionality (e.g., 1,280 dimensions for ESM2-T36) presents significant challenges in terms of visualization, interpretability, and downstream machine learning. Raw embeddings may contain noise, redundancy, or non-linearly separable patterns that complicate classification. To address these limitations, dimensionality reduction techniques are frequently employed to map high-dimensional vectors into lower-dimensional manifolds that preserve essential structural features. Uniform Manifold Approximation and Projection (UMAP) is a nonlinear dimensionality reduction algorithm that preserves both local and global relationships in data while producing embeddings that are well-suited for visualization and clustering [ 4 ]. UMAP has been used successfully in genomics and single-cell transcriptomics, where high-dimensional latent spaces are common. When applied to PLM-derived embeddings, UMAP enables better separation of latent biological classes and enhances the interpretability of variant spaces. Computational methods have become instrumental in predicting the functional effects of missense mutations, offering valuable insights into personalized medicine [ 9 ]. 
Databases such as ClinVar and DECIPHER, along with matchmakers like GeneMatcher, facilitate the sharing of variants and phenotypic descriptions, aiding in the determination of disease relevance [ 12 ]. Assessing genetic intolerance through Homologous Missense Constraint measures can further inform the clinical significance of missense variants associated with human diseases [ 2 ]. Evaluating regional intolerance using tools like MTR-Viewer can help identify pathogenic missense variants [ 7 ]. These methods highlight functionally significant regions within genes that traditional approaches may overlook [ 7 ]. Furthermore, databases incorporating protein contactmaps, sequence-to-structure mapping, and stability predictions enhance variant analysis, while evolutionary fitness effect scoring aids in predicting the deleteriousness of variants, potentially reclassifying variants of unknown significance [ 13 ] [ 11 ]. However, not all mutations have severe damaging impacts, and most produce subtle effects with unclear clinical significance, necessitating accurate measurement of changes in binding affinity induced by mutations [ 15 ]. Functional analyses of protein domains have revealed that specific protein functions are associated with varying sensitivities to mutations, suggesting that future pathogenicity predictors may benefit from considering functional annotation [ 14 ]. The identification of critical mutation sites and understanding the complex patterns of mutation effect of viral proteins are also crucial, especially for forecasting dominant variants [ 10 ]. 2 The Proposed Method 2.1 Dataset Collection We constructed a curated dataset of missense variants for multiple amyloidosis related proteins from the ClinVar database (accessed June 2025). Focusing on the following proteins: Alpha-1-antitrypsin (P02768) , a protease inhibitor that forms toxic polymers in liver cells when mutated, leading to liver disease and predisposing to secondary amyloidosis. 
Transthyretin (P02766) , a thyroid hormone carrier with over 140 known mutations causing hereditary cardiac and neurological amyloidosis. The wild-type protein can also cause age-related cardiac amyloidosis. Fibrinogen α -chain (P02671) , a blood clotting protein whose mutations cause renal-limited amyloidosis with rapid progression to kidney failure. Apolipoprotein A-I (P02647) , a High-Density Lipoprotein (HDL) component causing hereditary amyloidosis affecting liver, kidneys, heart, or other organs depending on mutation location. Lysozyme (P61626) , an antimicrobial enzyme causing rare hereditary amyloidosis with renal, gastrointestinal, and sicca symptoms. β 2 -microglobulin (P61769) , a component of the Major Histocompatibility Complex 1 (MHC-I) that accumulates in dialysis patients, causing joint and bone disease including carpal tunnel syndrome. Cystatin C (P01034) , a protease inhibitor causing hereditary cerebral amyloid angiopathy with recurrent brain haemorrhages in young patients. For each protein, we retained the canonical UniProt sequence and mapped variants to their corresponding amino acid positions. Since we aim to perform a binary classification, variants labelled as “likely benign” or “likely pathogenic” were initially treated and approximated as “benign” and “pathogenic” respectively. Then, with the help of UMAP embeddings, we visualized the position of those “likely” mutations and interpreted the prediction. 2.2 ESM2 Embedding We used the pretrained ESM2-T36 model [ 3 ], implemented in PyTorch and available through the FAIR/ESM GitHub repository, to compute residue-level embeddings. Each variant sequence was mutated in silico at the position of interest, and the corresponding 1,280-dimensional embedding vector was extracted for the mutated residue. No fine-tuning was performed. 2.3 Dimensionality Reduction via UMAP2 To reduce the dimensionality of the ESM2-derived vectors, we applied UMAP2 [ 4 ] using the umap-learn package. 
We selected parameters empirically: min_dist = 0.0 , metric = “euclidean” , and a variable n_neighbors parameter to handle cases of dataset imbalance (i.e., more benign than pathogenic mutations). The reduced embeddings were projected to 2D spaces for visualization and classification. Each variant was classified based on its distance from the wild-type: pathogenic if the distance is greater than a given threshold, benign otherwise. We confirmed that pathogenic and benign variants formed separable clusters in the reduced space, and variants of uncertain significance (VUS) typically populated inter-cluster regions. View this table: View inline View popup Download powerpoint Table 1: Detailed ROC-AUC scores of different classification models for each protein related with amyloidosis. View this table: View inline View popup Download powerpoint Table 2: Mean ROC-AUC performances of different variant classification models across all the amyloidosis-related proteins studied. 2.4 Classification Pipeline The final feature matrix consisted of concatenated vectors: UMAP-reduced ESM2 embeddings. We evaluated three classifier models: logistic regression (LR), support vector machines (SVMs) with RBF kernel, and random forests. For each protein, we trained models using 5-fold cross-validation stratified by class, and reported average precision, recall, F1-score, and area under the ROC curve (AUC). 3 Results We compared our hybrid pipeline against state-of-the-art variant effect predictors such as AlphaMissense and VESM++. Download figure Open in new tab Figure 1: 2-dimensional UMAP representation of the ESM2 variant embeddings for protein: P02768 (a) and P02766 (b). 4 Conclusion This study introduces a hybrid framework that combines ESM2 protein language model (PLM) embeddings with non-linear dimensionality reduction via UMAP to support the classification and interpretation of missense variants. 
By leveraging the rich contextual information captured by transformer-based models and projecting these high-dimensional embeddings into a tractable space, the proposed approach enables both accurate classification and intuitive visual exploration of variant effect landscapes. Comparative analysis against state-of-the-art predictors, including AlphaMissense and VESM++, demonstrates that the ESM2+UMAP pipeline achieves competitive or superior ROC-AUC scores across a diverse set of proteins, while also offering complete coverage for all evaluated cases. Footnotes ugo.lomoio{at}unicz.it pierangelo.veltri{at}unical.it hguzzi{at}unicz.it References [1]. ↵ Emidio Capriotti , Piero Fariselli , et al. The use of molecular features in the prediction of the pathogenicity of genetic variants . Current Opinion in Biotechnology , 24 ( 4 ): 646 – 653 , 2013 . OpenUrl PubMed [2]. ↵ Dhavendra Kumar and Perry Elliott . Cardiovascular Genetics and Genomics. Springer Nature , 01 2018 . doi: 10.1007/978-3-319-66114-8 . URL https://doi.org/10.1007/978-3-319-66114-8. OpenUrl CrossRef [3]. ↵ Zeming Lin , H Akin , Roshan Rao , et al. Esm2: Evolutionary scale modeling for learning protein structure and function . bioRxiv , 2023 . doi: 10.1101/2022.07.20.500902 . OpenUrl Abstract / FREE Full Text [4]. ↵ Leland McInnes , John Healy , and James Melville . Umap: Uniform manifold approximation and projection for dimension reduction . arXiv preprint arXiv: 1802.03426 , 2018 . [5]. ↵ Sue Richards , Nazneen Aziz , Stephen Bale , et al. Standards and guidelines for the interpretation of sequence variants . Genetics in Medicine , 17 ( 5 ): 405 – 424 , 2015 . OpenUrl CrossRef PubMed [6]. ↵ Alexander Rives , Joshua Meier , Tom Sercu , et al. Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences . Proceedings of the National Academy of Sciences , 118 ( 15 ), 2021 . [7]. ↵ Michael Silk , Slave Petrovski , and David B. Ascher . 
Mtr-viewer: identifying regions within genes under purifying selection . Nucleic Acids Research , 47 , 06 2019 . ISSN 0305-1048 , 1362-4954, 1362-4962. doi: 10.1093/nar/gkz457 . https://doi.org/10.1093/nar/gkz457. OpenUrl CrossRef [8]. ↵ Lea M Starita , Nadav Ahituv , Maitreya J Dunham , et al. Variant interpretation: functional assays to the rescue . American Journal of Human Genetics , 101 ( 3 ): 315 – 325 , 2017 . OpenUrl CrossRef PubMed [9]. ↵ Kuan Pern Tan , Tejashree Rajaram Kanitkar , Chee Keong Kwoh , and M. S. Madhusudhan . Packpred: Predicting the functional effect of missense mutations . Frontiers in Molecular Biosciences , 8 , 08 2021 . ISSN 2296-889X . doi: 10.3389/fmolb.2021.646288 . URL https://doi.org/10.3389/fmolb.2021.646288. OpenUrl CrossRef [10]. ↵ Xiaoqin Tan . Co-evolution integrated deep learning framework for variants generation and fitness prediction . bioRxiv (Cold Spring Harbor Laboratory) , 01 2023 . doi: 10.1101/2023.01.28.526023 . URL https://doi.org/10.1101/2023.01.28.526023. OpenUrl Abstract / FREE Full Text [11]. ↵ Susan E. Tsutakawa , Albino Bacolla , Panagiotis Katsonis , Amer Bralic , Samir M. Hamdan , Olivier Lichtarge , John A. Tainer , and Chi-Lin Tsai . Decoding cancer variants of unknown significance for helicase–nuclease–rpa complexes orchestrating dna repair during transcription and replication . Frontiers in Molecular Biosciences , 8 , 12 2021 . ISSN 2296-889X . doi: 10.3389/fmolb.2021.791792 . URL https://doi.org/10.3389/fmolb.2021.791792. OpenUrl CrossRef [12]. ↵ Robin van der Lee , Solenne Correard , and Wyeth W. Wasserman . Deregulated regulators: Disease-causing cis variants in transcription factor genes , 05 2020 . ISSN 0168-9525 , 1362-4555. URL https://doi.org/10.1016/j.tig.2020.04.006. [13]. ↵ Jaie Woodard , Chengxin Zhang , and Yang Zhang . Address: A database of disease-associated human variants incorporating protein structure and folding stabilities . 
Journal of Molecular Biology , 433 : 166840 – 166840 , 02 2021 . ISSN 0022-2836 , 1089-8638. doi: 10.1016/j.jmb.2021.166840 . URL https://doi.org/10.1016/j.jmb.2021.166840. OpenUrl CrossRef PubMed [14]. ↵ Jan Zaucha , Michael Heinzinger , S. I. Tarnovskaya , Burkhard Rost , and Dmitrij Frishman . Family-specific analysis of variant pathogenicity prediction tools . NAR Genomics and Bioinformatics , 2 , 02 2020 . ISSN 2631-9268 . doi: 10.1093/nargab/lqaa014 . URL https://doi.org/10.1093/nargab/lqaa014. OpenUrl CrossRef [15]. ↵ Ning Zhang , Yuting Chen , Haoyu Lu , Feiyang Zhao , Roberto Vera Alvarez , Alexander Goncearenco , Anna R. Panchenko , and Minghui Li . Mutabind2: Predicting the impacts of single and multiple mutations on protein-protein interactions . iScience , 23 : 100939 – 100939 , 02 2020 . ISSN 2589-0042 . doi: 10.1016/j.isci.2020.100939 . URL https://doi.org/10.1016/j.isci.2020.100939. OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted July 31, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Enhancing Clinical Classification of Protein Variants using ESM2 and UMAP Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Enhancing Clinical Classification of Protein Variants using ESM2 and UMAP Ugo Lomoio , Pierangelo Veltri , Pietro Hiram Guzzi bioRxiv 2025.07.26.666924; doi: https://doi.org/10.1101/2025.07.26.666924 Share This Article: Copy Citation Tools Enhancing Clinical Classification of Protein Variants using ESM2 and UMAP Ugo Lomoio , Pierangelo Veltri , Pietro Hiram Guzzi bioRxiv 2025.07.26.666924; doi: https://doi.org/10.1101/2025.07.26.666924 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41911) Biophysics (21436) Cancer Biology (18578) Cell Biology (25482) Clinical Trials (138) Developmental Biology (13371) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15599) Genomics (22483) Immunology (17728) Microbiology (40364) Molecular Biology (17163) Neuroscience (88537) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00