Which pLM to choose?

doi:10.1101/2025.10.30.685515

Which pLM to choose?

2025 · doi:10.1101/2025.10.30.685515

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

⚙ AI-generated deep summary by claude@2026-06, 2026-06-24 · read from full text ⓘ

The paper benchmarks 14 protein language models (pLMs) spanning a wide range of parameter sizes by evaluating how well their embeddings capture protein sequence, structure, and function similarity across a dataset of about 100 million protein pairs. The authors separate “inherent information” accessible via simple distances in the raw embedding space from “extractable information” revealed only after additional supervised training, and use function similarity assessed by an HFSP score with controls of sequence identity (PIDE) and structural similarity (TM-score). They find a size–performance paradox where mid-scale models perform comparably to larger ones for tested biological properties, that embedding information scales with model size in a way that enables lightweight downstream predictors, and that task-specific learning can reshape embeddings in a way that helps the task but reduces general extractability. The authors note that specialist model representations are not broadly generalizable and that larger models mainly help when fine-tuning is planned for specific tasks. The paper does not explicitly discuss endometriosis or adenomyosis; it was included in the corpus via a keyword match in the upstream search index.

Read from the paper's body, not the abstract. Not a substitute for reading the paper. No clinical advice. How this works

Full text 66,210 characters · extracted from preprint-html · click to expand

Which pLM to choose? | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Which pLM to choose? 
View ORCID Profile Tobias Senoner , View ORCID Profile Ivan Koludarov , View ORCID Profile Joshua Günther , View ORCID Profile Amarda Shehu , View ORCID Profile Burkhard Rost , View ORCID Profile Yana Bromberg doi: https://doi.org/10.1101/2025.10.30.685515 Tobias Senoner 1 School of Computation, Information, and Technology (CIT), Department of Informatics, Bioinformatics & Computational Biology, TUM (Technical University of Munich) , 85748 Garching/Munich, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tobias Senoner For correspondence: yana{at}bromberglab.org tobias.senoner{at}tum.de ivan.koludarov{at}tum.de Ivan Koludarov 1 School of Computation, Information, and Technology (CIT), Department of Informatics, Bioinformatics & Computational Biology, TUM (Technical University of Munich) , 85748 Garching/Munich, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ivan Koludarov For correspondence: yana{at}bromberglab.org tobias.senoner{at}tum.de ivan.koludarov{at}tum.de Joshua Günther 1 School of Computation, Information, and Technology (CIT), Department of Informatics, Bioinformatics & Computational Biology, TUM (Technical University of Munich) , 85748 Garching/Munich, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joshua Günther Amarda Shehu 2 Department of Computer Science, George Mason University , 4400 University Dr, 22030 VA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Amarda Shehu Burkhard Rost 1 School of Computation, Information, and Technology (CIT), Department of Informatics, Bioinformatics & Computational Biology, TUM (Technical University of Munich) , 85748 Garching/Munich, Germany 3 TUM School of Life Sciences Weihenstephan (WZW) , Alte Akademie 8, Freising, 
Germany 4 Institute for Advanced Study (TUM-IAS) , Lichtenbergstr. 2a, 85748 Garching/Munich, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Burkhard Rost Yana Bromberg 4 Institute for Advanced Study (TUM-IAS) , Lichtenbergstr. 2a, 85748 Garching/Munich, Germany 5 Department of Computer Science, Emory University , Atlanta, GA 30307, USA 6 Department of Biology, Emory University , Atlanta, GA 30322, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Yana Bromberg For correspondence: yana{at}bromberglab.org tobias.senoner{at}tum.de ivan.koludarov{at}tum.de Abstract Full Text Info/History Metrics Preview PDF A bstract Protein-language models (pLMs) provide a novel means for mapping the protein space. Which of these new maps best advances specific biological analyses, however, is not obvious. To elucidate the principles of model selection, we benchmarked fourteen pLMs, spanning several orders of magnitude in parameter count, across a hundred million protein pairs, to assess how well they capture sequence, structure, and function similarity. For each model, we distinguish inherent information , i.e. signal recoverable from raw-embedding distances, and extractable information , i.e. signal revealed through additional supervised training. Three key results emerge. First, pLM protein representation space is inherently different from the space of biological protein representations, i.e. sequences or structures. Here, a size-performance paradox is salient – mid-scale foundation models are as good as much larger ones in reflecting all tested biological properties. Second, pLM representations compress and store biological information in proportion to model size. That is, a lightweight feed-forward network can be trained on embedding pairs to predict said biological properties well – a capacity dividend. 
Finally, we observe that a task-specific learning radically reshapes the embedding space, gaining inherent understanding of the task, but garbling any further extractions. In other words, smaller pLMs can provide efficient and compute-light general insight. Larger models are advantageous only when fine-tuning is planned to accomplish a specific task. Furthermore, representations generated by “specialist” models are not immediately generalizable throughout protein biology. Thus, for pLMs, bigger isn’t always better. I ntroduction Different protein language models ( pLMs – term introduced in ProtTrans ( Elnaggar et al., 2022 )) can be conceptualized as distinct maps of the protein sequence space. Just as different cartographic projections emphasize different geographical features, different pLMs construct fundamentally different representations of the same biological landscape through their learned protein embeddings. With the increasing diversity of pLMs, ( Heinzinger & Rost, 2025 ; Wang et al., 2025 ; Weissenow et al., 2022 ) a critical challenge has emerged: which pLM to choose for a specific biological application? To answer this question, we distinguish between two types of information encoded within pLM embeddings: (1) inherent information , which is directly accessible through simple arithmetic computation on raw embedding vectors, e.g. to establish protein distances/similarities and (2) extractable information , which requires subsequent embedding-based supervised machine learning, e.g. to predict specific protein features. Understanding this distinction reveals different model capabilities and computational trade-offs. 
Current pLMs span diverse architectures, scales, and targets: from ESM2 ( Lin, Akin, Rao, Hie, et al., 2023 ) and ProtT5 ( Elnaggar et al., 2022 ) models of eight million (8M) to 15 billion (15B) free parameters to specialized models, such as CLEAN, targeting enzyme labeling ( Yu et al., 2023 ), and ProtTucker, targeting structural domain classification ( Heinzinger et al., 2022 ). Unlike in natural language processing (NLP), where larger models typically yield better results ( Hoffmann et al., 2022 ), the relationship between the model size, i.e. number of free/trainable parameters, and protein representation quality remains unclear ( Schmirler et al., 2024 ; Teufel et al., 2022 ). How does one pick a pLM, a map, that is most representative of the (protein) world? For geographical maps, this selection is not usually done in isolation, but rather with focus on their usefulness for a specific task. Note that task-relevant features can be added to any map, but this would, trivially, change the map, even as it becomes more suitable for that specific task. Evaluating this new map is then different from judging the quality of the original. Evaluating pLMs similarly requires use-cases, i.e. a biologically meaningful task that can be completed using embeddings directly and without further training, i.e. changing the representation/map. Here, we set out to evaluate pLM quality by relying on a long-standing computational biology problem of identifying protein function – a task plagued by multidimensionality and conflicting annotations ( Ashburner et al., 2000 ; Gillis & Pavlidis, 2013 ; Radivojac et al., 2013 ; Rembeza & Engqvist, 2021 ; Schnoes et al., 2009 ). In the absence of complete and precise functional labels, we fell back on the practice of “function transfer by homology”, where protein pairs deemed related can be assigned shared function. 
That is, we evaluated protein functional similarity using the principle that it’s easier to state “these proteins have the same function” ( Mahlich et al., 2018 ; Prabakaran & Bromberg, 2025a ) than to define function individually. Incidentally, this approach allows for direct evaluations of protein projections in the pLM space, i.e. functionally similar proteins should ideally be close on this map. To this end we systematically compared representations generated by 14 commonly used pLMs. In addition to protein functional similarity, measured by their HFSP score ( Mahlich et al., 2018 ), we assessed two controls: sequence similarity, reported as the percentage pairwise sequence identity (PIDE), and structural similarity, reported by their TM-score ( Y. Zhang & Skolnick, 2005 ). Specifically, we asked whether embeddings could capture sequence similarity well and, more importantly, whether this feature is the only one that embeddings would capture; the latter outcome would be the worst-case scenario for pLM use, where model representations would not contribute novelty to our understanding of proteins. Structure similarity was our second, more advanced, baseline, where pLM success would suggest that model representations are able to capture molecule biophysical features – a likely outcome, given earlier success of sequence-based structure prediction models. As sequence, structure, and function similarity are correlated but not identical, we expected to observe differences in model performance for each of these values. By analyzing both inherent information (raw embedding distances) and extractable information (supervised learning performance), we addressed two key questions: “Does pLM embedding similarity inherently capture biologically meaningful protein relationships?” and “Do larger pLMs necessarily create better protein representations?” Our findings reveal that all pLMs retain more extractable information than is inherently available. 
Moreover, inherent information is similar across larger and smaller models of the same family, with benefit plateauing early relative to model size. Access to extractable information, however, is significantly improved in larger models. Importantly, task-specific training dramatically reshapes the embedding spaces, changing the map to create specialized and significantly less generalizable representations. M ethods Data preparation To evaluate the performance and generalizability of protein language models (pLMs), we prepared two datasets. Our primary dataset, SwissProt-pre2024 , was constructed for model training, validation, and testing from the Swiss-Prot database release 2024_01 ( The UniProt Consortium et al., 2025 ). To create non-redundant data splits, we first partitioned all proteins using the cluster mode of MMseqs2 ( Steinegger & Söding, 2017 ). Clusters were defined by a minimum of 30% pairwise sequence identity (--min-seq-id 0.3) and 80% coverage of the shorter sequence (-c 0.8,--cov-mode 1), using a sensitivity setting of -s 7.5. The resulting clusters were then split in a 70/15/15 ratio for training, validation, and testing, respectively, stratified by cluster size to ensure a balanced distribution. This procedure yielded a training set of 375,209, a validation set of 86,546, and a test set of 80,514 sequence non-redundant proteins. To assess model generalizability on novel proteins, we constructed a second, more stringent test set named New2024 . This set comprises proteins that are not only recently reported but also sequence-dissimilar to our primary dataset, based on UniRef50 cluster membership. We identified all UniRef50 clusters present in the 2025_01 release but absent in the 2024_01 release. From this set of new clusters, we established a “truly novel” subset by applying two strict filters. 
First, we retained only those clusters where none of the members appeared in any UniRef50 cluster from the 2024_01 release, ensuring that they represent new sequences rather than re-clustered known ones. Second, we filtered out any remaining clusters containing UniParc entries to exclude any sequences that were previously available in public databases. We then compiled all sequences from the remaining clusters and filtered for those with high-confidence Protein Existence (PE) annotations (level=1, i.e. “evidence at protein level” or level=2, “evidence at transcript level”) to ensure that we retained only experimentally validated sequences. The final New2024 set was formed by intersecting this high-confidence candidate pool with the members of our “truly novel” clusters. This rigorous filtering process yielded a final test set of 1,237 proteins from 833 distinct, truly novel UniRef50 clusters. Protein similarity metrics To establish the ground truth for model evaluations, we computed three distinct similarity metrics for protein pairs within all our datasets: sequence identity, structural similarity, and functional similarity. Sequence Similarity: we performed an all-against-all search using MMseqs2 ( Steinegger & Söding, 2017 ). The search was configured as a high-sensitivity profile search with three iterations (-s 7.5, --num-iterations 3, --e-profile 1e-10). We generated full alignments (-a, --alignment-mode 3) for up to 1,000 target sequences (--max-seqs 1000) meeting an E-value cutoff of 0.001 (-e 0.001). From the resulting alignments, we extracted the fractional identity (fident), defined as the number of identical aligned residues divided by the total length of the alignment. We name this metric PIDE (percent pairwise sequence identity) for the rest of this manuscript. Structural Similarity was quantified using the alntmscore computed using Foldseek ( van Kempen et al., 2024 ). 
We employed the identical search parameters as for sequence similarity computation. For the SwissProt-pre2024 dataset, we used the pre-existing Foldcomp ( Kim et al., 2023 ) compressed structures from the AlphaFold DB (v4) ( Varadi et al., 2024 ). For proteins in the New2024 set, which lack corresponding AlphaFold entries, we first predicted their structures using ColabFold ( Mirdita et al., 2022 ). The structural alignment metric for all pairs is termed TM-score for the rest of the manuscript. Functional Similarity was assessed by computing the HFSP (homology-derived functional similarity of proteins) scores ( Mahlich et al., 2018 ) from the MMseqs2 alignments above. To ensure the quality of our ground-truth data, we filtered the resulting pairs based on established reliability thresholds for each metric ( Figure S1 ). (1) For sequence similarity, we retained pairs only if they had a PIDE>0.3 and an alignment coverage of ≥80% for both proteins. (2) For structural similarity, we excluded any protein structure with an average prediction reliability (pLDDT) score≤70. We also required an alignment TM-score≥0.4 and an alignment coverage≥80% for both structures. (3) Finally, pairs were only considered functionally similar if they had a positive HFSP score (>0). After filtering, the training set for further supervised learning contained 379,566 proteins, resulting in: (1) 47,777,153 pairs of proteins with similar sequences, (2) 111,875,191 pairs with similar structure, and (3) 47,777,153 pairs with similar function. The validation set contained 8,447,555, 15,699,682, and 8,447,555 pairs, while the test set contained 8,731,253, 15,481,580, and 8,731,253 pairs, respectively. To equalize comparisons, we used this test set for all inherent or extractable evaluations in this work. Note that this filtering step excluded the vast majority of protein pairs – the highly dissimilar ones, i.e. those with pairwise similarity too low to be detected by our methods. 
Thus, the analyses presented here primarily assess the capacity of embeddings to quantitatively capture measurable similarity, rather than to perform a binary discrimination between similar and dissimilar proteins ( Figure S1 ). Protein language models (pLMs) We analyzed 14 pLMs of different architectures, training approaches, and sizes. Foundation models included ProtT5 (originally named ProtT5-XL-UniRef50; with 1.5B parameters; 1024-dimensional embeddings) ( Elnaggar et al., 2022 ), ESM-1b (650M parameters; 1280-dimensional embeddings) ( Rives et al., 2021 ), ESM-2 models with 8M, 35M, 150M, 650M, and 3B parameters and 320, 480, 640, 1280, and 2560-dimensional embeddings respectively ( Lin, Akin, Rao, Hie, et al., 2023 ), Ankh with 450M and 1.15B parameters and 768 and 1536-dimensional embeddings ( Elnaggar et al., 2023 ), ESM-C models with 300M and 600M parameters and 960 and 1152-dimensional embeddings (Team ESM, 2024), and ESM-3 with 1.4B parameters and 1536-dimensional embeddings ( Hayes et al., 2025 ). Task-specific models included CLEAN (650M parameters; 128-dimensional embeddings), trained via contrastive learning for enzyme classification ( Yu et al., 2023 ) and ProtTucker (1.5M parameters; 128-dimensional embeddings), optimized for CATH domain prediction through contrastive learning ( Heinzinger et al., 2022 ). We define a model “family” as a set of model variants developed for a single publication, sharing identical architectures and training procedures and differing only in parameter count and embedding dimensionality. For each protein sequence, we generated fixed-length embeddings by averaging per-residue representations from each pLM’s final hidden layer, producing protein-level embeddings suitable for pairwise comparison analyses. Model training and evaluation framework We developed a comprehensive framework to quantify biological information encoded in pLM embeddings. 
Evaluations were conducted in two settings: (1) native embedding dimensions and (2) standardized 128-dimensional representations obtained via Principal Component Analysis (PCA) ( Pearson, 1901 ). For each setting, we employed two assessment approaches. First, as a non-trainable baseline, we computed Euclidean distances between embedding pairs. Second, we trained feed-forward networks (FFNs) with symmetric architecture: each n-dimensional embedding was processed through a hidden layer (n → 64), concatenated, and passed through successive layers (128 → 64 → 32 → 1) to generate predictions ( Figure S2 ). Models were trained independently to predict three biological properties: sequence identity (PIDE), structural similarity (TM-score), and functional similarity (HFSP score). In total, this yielded 180 models: 3 properties × 15 embedding sources (14 pLMs plus random control) × 4 approaches (native/PCA × Euclidean/FFN). Random control embeddings (1024 dimensions; generated from a standard normal distribution) established performance baselines. All models were implemented in PyTorch Lightning with consistent hyperparameters: learning rate 0.001, batch size 1024, maximum 100 epochs, and early stopping patience of 5 epochs. Performance was evaluated using Pearson $R^2$ correlation. Bootstrap analysis (1,000 samples) yielded 95% confidence intervals with standard errors <0.001, which are therefore not visible in the graphics. To enable cross-model comparisons, we applied max-min normalization to all pairwise embedding distances, scaling each pLM to the range [0,1] via the transformation $x_{\text{norm}} = (x - x_{\min})/(x_{\max} - x_{\min})$, where $x_{\min}$ and $x_{\max}$ represent the minimum and maximum observed distances for that pLM. This normalization was applied prior to visual comparison in Figure 2 and for quantitative assessment via Wasserstein distance. The Wasserstein distance (W) quantifies the minimum “cost” of transforming one probability distribution into another ( Villani, 2008 ). 
We computed it between normalized pairwise distance distributions to compare embedding space distributions across different pLMs. Results We analyzed embedding spaces of 14 protein language models (pLMs, Table S1 ) across three protein similarity scores (PIDE, TM-score, and HFSP) and two embedding similarity modalities (Euclidean distance and supervised training). We primarily focused on the capacity of embeddings to quantitatively capture similarity, rather than to perform a binary discrimination between similar and dissimilar proteins. We discovered fundamental differences in inherent and extractable information present in these embeddings. Size-performance paradox for inherent information We observed that representations of proteins, generated by pLMs, were inherently different from protein sequences or structures, i.e. molecule representations that currently serve as primary protein descriptors. Here, inherent information refers to the biological signal present in the raw protein embedding space, i.e. the distances between protein embedding pairs. To evaluate representations, we compared similarity of protein sequences to the similarity of their embeddings. Across all non-specialized pLMs, at best, the Pearson correlation $R^2$ between protein sequence identity (PIDE) and Euclidean distance of embeddings was 0.51 (for ProtT5 embeddings; Table S1 ). Structural similarity (TM-score) was also poorly represented by embedding distances ($R^2$=0.4 for Ankh-base). These observations strongly suggest that pLM-generated protein representations describe proteins in ways unlike sequence or structure alone. Importantly, in evaluating the similarity of inherent information to protein-pair biological relationships across model families, we found that mid-size foundation pLMs matched or even outperformed larger models ( Figure 1 and Table S1 ). 
For instance, within the ESM-2 family ( Lin, Akin, Rao, & Hie, 2023 ), the 8M-parameter variant reflected functional similarity (Pearson correlation $R^2$=0.28 with HFSP score) better than the 3B-parameter model ($R^2$=0.26). The effect was qualitatively similar for sequence identity (e.g. ESM-2 8M $R^2$=0.44 vs. ESM-2 3B $R^2$=0.45) and structural similarity (e.g. ESM-2 35M $R^2$=0.18 vs. ESM-2 3B $R^2$=0.19). In the ESM-C family (Team ESM, 2024), the 300M parameter model outperformed its 600M parameter sibling, achieving higher Pearson $R^2$ for sequence (PIDE= 0.49 vs. 0.47) and structure similarity (TM-score= 0.10 vs. 0.09), while matching performance for functional similarity (HFSP= 0.30 for both). We termed this counter-intuitive outcome the size-performance paradox : where adding parameters increases theoretical capacity but does not necessarily improve biologically meaningful organization of the inherent information, i.e. the raw embedding space. Download figure Open in new tab Figure 1. : Mid-scale non-specialist foundation pLMs produce the strongest “out-of-the-box” signal, while larger models are better with supervised tuning. Evaluated on SwissProt-pre2024 test pairs, for all models (x-axis, twelve non-specialist pLMs in order of their parameter count, colored by family), inherent embedding information (Euclidean distance of the two protein embeddings, marked by ×) is outperformed by additional supervised learning (prediction score from a feed-forward neural network, FNN , for the same embeddings, marked by •). Each panel reports the correlation (Pearson $R^2$; y-axis) between the inherent information and the extracted information for one biological property of the pair: (A) sequence identity (PIDE), (B) structural similarity (TM-score), and (C) functional similarity (HFSP score). 
Trendlines across pLM parameter sizes (dashed line for FNN, dash-dot line for Euclidean) show that inherent information (×) remains flat with model size, with slopes of 0.0045, 0.0075, and 0.00087 for panels A, B, and C respectively, while extractable information (•) increases, with slopes 0.094, 0.09, and 0.13 (all values in change per billion parameters). Note: ESM2-3B was excluded from the FNN trendline fit as an outlier. Points (inherent and extractable information) of the same model are joined by dotted lines. All standard errors are below 0.001. Download figure Open in new tab Figure 2. : Task-specific training distorts the protein-embedding space. Kernel-density ridges show the distribution of all-vs-all Euclidean distances between protein embeddings, after min–max normalization [0, 1] within each model. Rows are ordered by model family and within family by model size; colors match model names. Vertical lines indicate quartiles: dashed lines show medians, dotted lines show Q25 and Q75. Colors represent model families (shared by all family members): Ankh (yellow), ESM-1 (green), ESM-2 (orange), ESM-3 (purple), ESM-C (blue), and ProtT5 (pink). Foundation models ESM-2, Ankh, ProtT5, and ESM-C families display similar distance profiles within each family (e.g. ESM-2 variants share a common peak at ∼0.23). The task-specific CLEAN (green) and ProtTucker (pink) are shifted to the right and are markedly broader than their parent ESM-1b or ProtT5, respectively. Curiously, within the Ankh family ( Elnaggar et al., 2023 ), the 450M parameter base model brought structurally similar pairs closer together than the 1.15B parameter large model ($R^2$=0.43 vs. 0.34), but the smaller model was worse than the larger one for sequence and function comparisons. Indeed, in out-of-the-box comparisons across families, Ankh (base) outperformed all others in reflecting structure similarity, while ProtT5 was best among non-specialists at mapping sequence and function. 
Supervised learning reveals more information capacity in larger models Inference of protein properties from sequence alone is fairly limited – while some features can be inferred from amino acid composition, models for predicting structure or function most often require sequence alignments to describe the evolutionary trajectories and sequence constraints ( Jones et al., 2014 ; Rost & Sander, 1993 ). Protein embeddings, i.e. the end result of pLM training to capture sequence preferences in extant molecule collections, may inherently achieve similar ends. We thus further asked if pLM representations easily lend themselves to inference of protein characteristics. We assessed this extractable information captured by pLM embeddings by using these embeddings as input for supervised learning (shallow networks trained to predict one of: sequence identity, structure similarity, or shared function). Here we found that larger pLMs outperformed smaller ones, suggesting that the former capture a richer protein representation despite often weak inherent organization. For example, networks trained with ESM C-600M embeddings surpassed ESM C-300M for extraction of all biological characteristics (PIDE Pearson $R^2$=0.90 vs. $R^2$=0.88, TM-score $R^2$=0.63 vs. $R^2$=0.62, and HFSP $R^2$=0.82 vs. $R^2$=0.80). Similarly, ESM-2 650M bested its smaller counterpart ESM-2 8M (HFSP $R^2$=0.65 vs. $R^2$=0.60, TM-score $R^2$=0.61 vs. $R^2$=0.48, and HFSP $R^2$=0.81 vs. $R^2$=0.67). However, the largest model in our set, ESM-2 with 3B parameters performed poorly, compared to its family’s smaller versions (PIDE $R^2$=0.78, HFSP $R^2$=0.51, and TM-score $R^2$=0.59), suggesting a limit to performance improvement, perhaps given the current protein data availability. Again, Ankh behavior was somewhat of an outlier. Models trained on the 1.15B parameter embeddings were, as expected, better than the ones using the 450M parameter version for PIDE ($R^2$ = 0.79 vs. 0.53) and HFSP ($R^2$ = 0.76 vs. 
0.45), but not for the TM-score ($R^2$ = 0.54 vs. 0.58). Ankh behavior highlights the variability of large pLMs with even identical data sets and similar architectures. Overall, performance generally improved with foundation model parameter count, regardless of architecture. Furthermore, all FNN models (extractable information) were better than Euclidean distance (inherent information) in reflecting all evaluated biological properties (highest Euclidean $R^2$ vs. FNN $R^2$; PIDE 0.53 ≤ $R^2$ ≤ 0.9, TM-score 0.48 ≤ $R^2$ ≤ 0.74, and HFSP 0.45 ≤ $R^2$ ≤ 0.84). This observation highlights the necessity of further embedding processing when using protein representations instead of sequences. That is, embedding distances are not obviously constrained by evolutionary signals in the same way as protein sequences appear to be. Embedding dimensionality does not alter observed parameter-based trends We evaluated embedding dimensionality across all models (ranging from 128 in CLEAN to 2,560 in ESM2-3B) as a factor in deciding performance. We standardized all embeddings to 128 dimensions using PCA (Methods) and repeated all experiments described above. Previous work in NLP showed that PCA-based dimensionality reduction can even improve the performance of pretrained embeddings while reducing computational costs ( G. Zhang et al., 2024 ). Models trained on pLMs with more parameters still outperformed those trained on smaller variants ( Figure S3 and Table S1 ). Task-specific fine-tuning reshapes embedding geometry The three pLM families in this work exhibited consistent embedding space geometries ( Figure S4 ). That is, pairwise embedding distance distributions between all proteins of a pLM tended to be more similar between pLMs of the same family than compared to other pLMs ( Figure 2 ; Figure 3 ); higher Spearman correlations and lower Wasserstein distances within families vs. across families. For instance, ESM-2 variants (8M–3B parameters) were more similar than ESM-2 vs. 
Ankh, though smaller ESM-2 siblings had wider distributions (higher variance) than larger siblings ( Figure 2 ). Download figure Open in new tab Figure 3. : Family identity and task-specific fine-tuning leave distinct fingerprints on protein-embedding geometry. For every protein pair in the SwissProt-pre2024 dataset, we computed the Euclidean distance between all protein embeddings produced by each of the fourteen pLM. The upper-right triangle (blue) presents normalized Wasserstein distances (W) between embedding-distance distributions for each pLM pair. For each pLM, its distribution of pairwise, raw-embedding distances was min–max normalized to [0, 1] and W computed to compare distribution shapes independent of scale. The lower-left triangle (orange) reports the Spearman rank correlations (ρ) between the model embedding-distance distributions, with cell coloring proportional to ρ. All values in the plot are displayed ×10 -2 (e.g. 0.90 → 90). High ρ and low W values within model families – e.g. Ankh-base vs. Ankh-large (ρ=0.90; W=0.02) or ESM-C 300M vs. 600M (ρ=0.94; W=0.02) – highlight family-specific organization of embedding spaces. This similarity is lost for task-specific models; for example, CLEAN correlates poorly with its parent ESM1B (ρ=0.38; W=0.22). In contrast, the embeddings inferred by supervised, contrastive learning models CLEAN and ProtTucker, displayed a substantially altered embedding space relative to their parents. CLEAN’s distribution was right-shifted (higher distances between protein pairs), bimodal, and broader than its origin ESM-1b, reflecting its objective of minimizing intra-class distances while maximizing inter-class separation for enzyme classification ( Figure 2 ). This resulted in low similarity to ESM-1b ρ=0.37 and W=0.22 ( Figure 3 ). 
The embedding-pair distance distribution of ProtTucker – trained for CATH ( Sillitoe et al., 2021 ) domain classification – also extended to the right, but maintained greater similarity to its parent ProtT5 (ρ=0.76, W=0.09). These findings demonstrate that foundation models within families implicitly preserve some aspects of their geometry, while task-specific objectives fundamentally reshape embedding geometry. Discussion Through systematic analyses of protein language model (pLM) embedding spaces, we reveal fundamental trade-offs that challenge conventional assumptions about model scaling and highlight the effect of specialization on protein representations. These findings provide a framework for understanding when and why particular pLMs excel, guiding model selection for specific research needs. Although not considered in this work, we suggest that it may also be warranted to combine pLM embeddings on the basis of orthogonality (lack of correlation, Figure 3 ), thereby maximizing complementarity for further tuning. Size-performance paradox: diminishing returns for large pLMs The counterintuitive observation that smaller foundation pLMs matched the performance of those (many) fold larger when exclusively using inherent information departs sharply from the scaling behavior established in natural language processing (NLP). That is, we find that additional capacity yields no measurable gain in modelling protein relationships. For example, the ESM-C family, which has only 300M and 600M parameter variants, consistently outperformed earlier, larger models. In addition to the model’s architectural enhancements, this superior performance may stem from ESM-C’s expanded training data, which included UniRef ( Suzek et al., 2015 ), MGnify ( Richardson et al., 2023 ), and sequences from the Joint Genome Institute ( Nordberg et al., 2014 ) clustered at 70% PIDE, making it substantially larger and, likely, more diverse than the datasets used by other models. 
If so, training data diversity, rather than model size, may be the key to advancing pLM performance. The size-performance paradox reflects the fundamental difference between natural language and protein sequences. While natural language benefits from increasingly complex contextual relationships that larger models can capture, protein sequences may have more constrained and well-defined biological relationships that smaller models can encode directly. For current pLMs, and given the current relative paucity of data diversity (Koludarov, Senoner, unpublished data), the ∼300M parameter models may strike an optimal balance between representation usefulness and the inherent structure of protein sequence space. We note that while our analysis has been limited by resource constraints barring the inclusion of even larger pLMs — a limitation shared in many academic environments — all results suggested that our identified patterns would hold true for them as well. Furthermore, among larger models in our set, superior performance of ProtT5 (1.5B) implied that alignment between model design, benchmarking objectives, and better-selected data, might be as (or more) decisive in ensuring overall model efficacy as parameter count. Capacity dividend: larger pLMs capture more extractable information While smaller pLMs excelled in reflecting the inherent structure of the protein space, larger pLMs demonstrated superior capacity for extractable information, i.e. via subsequent supervised training on embeddings. This was particularly obvious for complex tasks like functional similarity prediction. For instance, the dramatic performance improvement of Ankh-1.15B over Ankh-450M with further training (R 2 =0.76 vs R 2 =0.45 for HFSP) indicated that larger pLMs encode more sophisticated representations, requiring additional effort to unlock. 
This pattern suggests a fundamental trade-off in protein language modeling: immediate accessibility (inherent) versus ultimate potential (extractable). Larger models appear to encode biological information in more complex patterns that are less immediately interpretable but can be leveraged effectively through supervised training. This pattern has been observed even at the 100B parameter scale, where improved perplexity does not guarantee better downstream performance without task-specific fine-tuning ( Chen et al., 2025 ). Yet the incremental nature of these improvements — even the largest models in this study struggled to exceed Pearson R 2 of 0.74 and 0.84 for structural and functional similarity, respectively — suggests that current architectures may be approaching fundamental limitations in protein representation rather than simply requiring more parameters. Geometry-warp by task-specific training The dramatic embedding space alterations observed for task-specific models, e.g. CLEAN ( Yu et al., 2023 ), optimized to classify enzymes, and ProtTucker ( Heinzinger et al., 2022 ), optimized to classify protein structure by CATH ( Sillitoe et al., 2021 ), revealed both the power and limitations of specialized training. Reduced correlation between CLEAN and its parent foundation pLM ESM-1b (ρ=0.38) illustrates how task-specific training fundamentally warps embedding spaces, optimizing for specific biological properties while losing sight of general protein relationships. CLEAN’s warping results in a broader distribution of pairwise distances ( Figure 2 ) and limits the model’s ability to adapt to new tasks through supervised learning. Here, for example, CLEAN performed worse than ESM-1b (Pearson R 2 PIDE 0.69 vs. 0.81, TM-score 0.52 vs. 0.67, HFSP 0.46 vs. 0.68) – a result due to performance measurements unconcerned with enzymes. 
Both specialist models, ProtTucker and CLEAN, expand their embedding space relative to their parent models, ProtT5 and ESM-1b, respectively. We suspect that ProtTucker’s expansion improves CATH domain discrimination while potentially compressing other biological signals into narrower ranges. CLEAN’s expansion magnifies enzyme-class separations but dilutes signals for structural classification. Note that CLEAN’s ability to capture function is clear in its relatively high inherent correlation with HFSP. However, its embeddings do not lend themselves to further tuning ( Table S1 ). The geometry of an embedding space governs how every downstream method – nearest-neighbor search, clustering, or a thin supervised head – interprets relatedness. Because supervised training usually adds only a lightweight head on top of frozen embeddings, a strongly warped base geometry leaves little latent information to reuse, explaining these models’ weak cross-task transfer despite their hundreds of millions of parameters. That is, while task-specific training can achieve superior performance for targeted applications, it sacrifices versatility that makes foundation models valuable for diverse biological questions. Practical pLM selection and future directions Our findings enable clear recommendations for pLM selection based on research requirements and computational constraints. For immediate biological insights requiring minimal computational overhead, i.e. using raw embeddings, mid-size foundation models are clearly sufficient. For highly specialized tasks with well-defined objectives, task-specific pLMs, regardless of their size, are needed; be aware that these models sacrifice general applicability for specialized performance. Note that, here, using LoRA fine-tuning to create smaller models specific for each problem ( Schmirler et al., 2024 ), instead of large-scale re-training, is likely most efficient. 
Indeed, when planning additional fine-tuning for specific applications, larger foundation models are warranted. Overall, the modest performance improvements due to parameter scaling ( Figure 1 ) suggest that current model limitations may stem from fundamental data and methodological constraints rather than from insufficient model capacity. That is, in addition to model size, the choice between pLM families should be driven by specific biological properties of interest, with different architectures exhibiting distinct strengths. Moreover, the underrepresentation of protein diversity in training datasets ( Avasthi & York, 2024 ; Karsch-Mizrachi et al., 2018 ; Mora et al., 2011 ) likely constrains all models regardless of size. Systematic biases in protein representations identified in recent work ( Ding & Steinhardt, 2024 ; Marquet et al., 2024 ) further compound these limitations, suggesting that addressing data quality and diversity may yield greater improvements than scaling model size. Future progress in protein language modeling thus requires fundamental shifts in approach. First, developing methods to better preserve and extract biological information from existing embedding spaces remains a significant challenge. A recent study exploring 13 different compression methods found that average pooling of residue representations to represent proteins remains superior ( Vieira et al., 2025 ); however, additional work in this direction is warranted given the difference between embedding-inherent vs. extractable information patterns identified herein. Second, acknowledging that not all embeddings of one pLM are created equal will lead to the development of more descriptive models. That is, if evaluation methods such as RNS ( Prabakaran & Bromberg, 2025b ) can identify embeddings of poorly captured proteins, should we train additional models specific to these poorly captured subsets or, rather, optimize generalized model training? 
Third, explicitly designing training approaches that optimize embedding spaces for multiple biological properties simultaneously, possibly through multi-task or carefully designed contrastive learning, may result in improved representations for a better understanding of the protein space. Finally, it is critical for any future progress to address the issues of data quality and diversity. Focusing on underrepresented protein families and taxonomic groups through strategic dataset expansion and improved data curation may address systematic bias in protein representations, leading to more robust pLMs – no petabytes of parameters needed. Conclusion Our systematic comparison of 14 protein language models (pLMs) uncovered a two-tier effect of scale. For zero-shot use, the sweet spot for pLM size appears today in mid-scale foundation models. Adding billions of extra parameters does not improve this inherent information but consumes substantially more energy. An additional capacity dividend in extractable information becomes clear through supervised training with a simple prediction head. Specialization is a double-edged sword: contrastive fine-tuning for enzymes (CLEAN) or CATH domains (ProtTucker) boosts performance on the target task, but it also warps the global geometry of the protein space, thereby reducing model versatility. Taken together, the results argue for a task-driven model-selection strategy: use smaller foundation pLMs when rapid, low-cost insight is required; reach for the billion-parameter variants only when you can afford to fine-tune and need maximal ceiling performance; and deploy task-specific models solely when the biological question precisely matches their training objective. In short, judicious choice of pLM can save both computation and carbon without sacrificing accuracy; that is, bigger is not automatically better. 
Supplementary Material View this table: View inline View popup Table S1: pLM performance in predicting pairwise PIDE, TM-score, and HFSP. Download figure Open in new tab Figure S1: Distribution and filtering of protein similarity metrics. Violin plots illustrating the distribution of key quality and similarity metrics across protein pairs prior to filtering. Each plot visualizes the data density and the applied filtering threshold used to generate the final, high-confidence datasets for model training and evaluation. Note that most proteins are dissimilar and thus do not reach a method-specific threshold, i.e. they cannot be included in the set visualized herein. (A) Sequence-based metrics. Distributions derived from the MMSeqs2 search. The panels show, from left to right: the minimum alignment coverage between sequence pairs, the fractional sequence identity (fident), and the calculated HFSP scores. (B) Structural confidence metric. Distribution of the average per-residue confidence scores (pLDDT) for all protein structures from the AlphaFold DB (v4). (C) Structural similarity metrics. Distributions derived from Foldseek search. The panels show the minimum structural alignment coverage and the resulting alignment TM-score (alntmscore). In each panel, the red dashed line indicates the specific filtering threshold applied. The red shaded area highlights the portion of the data that was excluded based on this threshold. The associated text annotation quantifies the absolute number and corresponding percentage of protein pairs (or individual structures in B) that were removed by each filter. Download figure Open in new tab Figure S2: Feed-forward network architecture for supervised learning on protein pair relationships. The network processes two protein embeddings (EMB1 and EMB2) of dimension n (pLM-dependent) through a symmetric architecture. 
Each embedding passes through a shared weight layer (n → 64; shown separately for visualization), followed by concatenation and successive fully connected layers (128 → 64 → 32 → 1). The output represents the predicted biological relationship between the protein pair (sequence identity, structural similarity, or functional similarity). Download figure Open in new tab Figure S3: Performance with PCA-compressed embeddings mirrors the trends in Figure 1 . Each panel reports the Pearson R 2 on SwissProt-pre2024 test pairs for (A) sequence identity (PIDE), (B) structural similarity (alignment TM-score), and (C) functional similarity (HFSP). The x-axis lists twelve non-specialist pLMs ordered by parameter count within each family; colors encode families (Ankh = yellow, ESM-1 = green, ESM-2 = orange, ESM-3 = purple, ESM-C = blue, ProtT5 = pink). For every model, a × marks performance from the Euclidean distance between PCA-projected embeddings (inherent information after compression), while a • shows an FNN trained on the same PCA features (extractable information after compression). Points within a family are linked by faint dotted lines. Error bars are not visible since standard errors are all below 0.001. As in Figure 1 trendlines across pLM parameter sizes (dashed line for FNN, dash-dot line for Euclidean) show that inherent signal (×) plateaus with model size, with slopes of -0.0051, -0.0017, and -0.004 for panels A, B, and C respectively, while extractable information (•) increases, with slopes 0.053, 0.092, and 0.095 (all values in change per billion parameters). Note: ESM2-3B was excluded from the FNN trendline fit. Download figure Open in new tab Figure S4: pLM distance distributions reveal family-specific embedding geometries through density fingerprints. Hexagonal grid plots comparing pairwise Euclidean distance distributions across all pLM pairs in the SwissProt-pre2024 dataset. 
For every model pair, we plot the joint distribution of distances using a 50 × 50 hexagonal grid, where color intensity represents the number of protein pairs falling in each matched distance bin. Each grid cell visualizes how distance values from one model align with those from another across all protein pairs. The diagonal displays model names for orientation. Tight, concentrated distributions along the diagonal indicate similar distance geometries between models, while dispersed patterns reveal divergent embedding spaces. Models within the same family (e.g., Ankh-base/large, ESM-C variants) show concentrated joint distributions, reflecting shared architectural principles. In contrast, CLEAN exhibits dispersed patterns when compared to foundation models, consistent with its unique contrastive training objective. Footnotes DATA AND CODE AVAILABILITY All code used in this project can be found on GitHub: https://github.com/tsenoner/plm_choice . All data used in this project can be found on Zenodo: 10.5281/zenodo.17469268 R eferences ↵ Ashburner , M. , Ball , C. A. , Blake , J. A. , Botstein , D. , Butler , H. , Cherry , J. M. , Davis , A. P. , Dolinski , K. , Dwight , S. S. , Eppig , J. T. , Harris , M. A. , Hill , D. P. , Issel-Tarver , L. , Kasarskis , A. , Lewis , S. , Matese , J. C. , Richardson , J. E. , Ringwald , M. , Rubin , G. M. , & Sherlock , G. ( 2000 ). Gene Ontology: Tool for the unification of biology . Nature Genetics , 25 ( 1 ), 25 – 29 . doi: 10.1038/75556 OpenUrl CrossRef PubMed Web of Science ↵ Avasthi , P. , & York , R. ( 2024 ). The known protein universe is phylogenetically biased (Version 1.0, p. 4900 words) [Text/html] . Arcadia Science . doi: 10.57844/ARCADIA-570F-5CFB OpenUrl CrossRef ↵ Chen , B. , Cheng , X. , Li , P. , Geng , Y. , Gong , J. , Li , S. , Bei , Z. , Tan , X. , Wang , B. , Zeng , X. , Liu , C. , Zeng , A. , Dong , Y. , Tang , J. , & Song , L. ( 2025 ). 
xTrimoPGLM: Unified 100-billion-parameter pretrained transformer for deciphering the language of proteins . Nature Methods , 22 ( 5 ), 1028 – 1039 . doi: 10.1038/s41592-025-02636-z OpenUrl CrossRef ↵ Ding , F. , & Steinhardt , J. ( 2024 ). Protein language models are biased by unequal sequence sampling across the tree of life . Bioinformatics . doi: 10.1101/2024.03.07.584001 OpenUrl CrossRef ↵ Elnaggar , A. , Essam , H. , Salah-Eldin , W. , Moustafa , W. , Elkerdawy , M. , Rochereau , C. , & Rost , B. ( 2023 ). Ankh ☥: Optimized Protein Language Model Unlocks General-Purpose Modelling . doi: 10.1101/2023.01.16.524265 OpenUrl Abstract / FREE Full Text ↵ Elnaggar , A. , Heinzinger , M. , Dallago , C. , Rehawi , G. , Wang , Y. , Jones , L. , Gibbs , T. , Feher , T. , Angerer , C. , Steinegger , M. , Bhowmik , D. , & Rost , B. ( 2022 ). ProtTrans: Toward Understanding the Language of Life Through Self-Supervised Learning . IEEE Transactions on Pattern Analysis and Machine Intelligence , 44 ( 10 ), 7112 – 7127 . doi: 10.1109/TPAMI.2021.3095381 OpenUrl CrossRef PubMed ↵ Gillis , J. , & Pavlidis , P. ( 2013 ). Assessing identity, redundancy and confounds in Gene Ontology annotations over time . Bioinformatics , 29 ( 4 ), 476 – 482 . doi: 10.1093/bioinformatics/bts727 OpenUrl CrossRef PubMed Web of Science ↵ Hayes , T. , Rao , R. , Akin , H. , Sofroniew , N. J. , Oktay , D. , Lin , Z. , Verkuil , R. , Tran , V. Q. , Deaton , J. , Wiggert , M. , & others. ( 2025 ). Simulating 500 million years of evolution with a language model . Science , 387 ( 6736 ), 850 – 858 . OpenUrl CrossRef PubMed ↵ Heinzinger , M. , Littmann , M. , Sillitoe , I. , Bordin , N. , Orengo , C. , & Rost , B. ( 2022 ). Contrastive learning on protein embeddings enlightens midnight zone . NAR Genomics and Bioinformatics , 4 ( 2 ), qac043. doi: 10.1093/nargab/lqac043 OpenUrl CrossRef ↵ Heinzinger , M. , & Rost , B. ( 2025 ). Teaching AI to speak protein . 
Current Opinion in Structural Biology , 91 , 102986 . doi: 10.1016/j.sbi.2025.102986 OpenUrl CrossRef PubMed Heinzinger , M. , Weissenow , K. , Sanchez , J. G. , Henkel , A. , Mirdita , M. , Steinegger , M. , & Rost , B. ( 2024 ). Bilingual language model for protein sequence and structure . NAR Genomics and Bioinformatics , 6 ( 4 ), qae150. doi: 10.1093/nargab/lqae150 OpenUrl CrossRef ↵ Hoffmann , J. , Borgeaud , S. , Mensch , A. , Buchatskaya , E. , Cai , T. , Rutherford , E. , Casas , D. de L. , Hendricks , L. A. , Welbl , J. , Clark , A. , Hennigan , T. , Noland , E. , Millican , K. , Driessche G. van den , Damoc , B. , Guy , A. , Osindero , S. , Simonyan , K. , Elsen , E. , … Sifre , L. ( 2022 ). Training Compute-Optimal Large Language Models (No . arxiv: 2203.15556 ). arXiv . doi: 10.48550/arXiv.2203.15556 OpenUrl CrossRef ↵ Jones , P. , Binns , D. , Chang , H.-Y. , Fraser , M. , Li , W. , McAnulla , C. , McWilliam , H. , Maslen , J. , Mitchell , A. , Nuka , G. , Pesseat , S. , Quinn , A. F. , Sangrador-Vegas , A. , Scheremetjew , M. , Yong , S.-Y. , Lopez , R. , & Hunter , S. ( 2014 ). InterProScan 5: Genome-scale protein function classification . Bioinformatics , 30 ( 9 ), 1236 – 1240 . doi: 10.1093/bioinformatics/btu031 OpenUrl CrossRef PubMed Web of Science ↵ Karsch-Mizrachi , I. , Takagi , T. , Cochrane , G. , & on behalf of the International Nucleotide Sequence Database Collaboration . ( 2018 ). The international nucleotide sequence database collaboration . Nucleic Acids Research , 46 ( D1 ), D48 – D51 . doi: 10.1093/nar/gkx1097 OpenUrl CrossRef PubMed ↵ Kim , H. , Mirdita , M. , & Steinegger , M. ( 2023 ). Foldcomp: A library and format for compressing and indexing large protein structure sets . Bioinformatics , 39 ( 4 ), btad153 . doi: 10.1093/bioinformatics/btad153 OpenUrl CrossRef ↵ Lin , Z. , Akin , H. , Rao , R. , & Hie , B. ( 2023 ). Evolutionary-scale prediction of atomic-level protein structure with a language model . 
Science , 379 ( 6637 ), 1123 – 1130 . doi: 10.1126/science.ade2574 OpenUrl CrossRef PubMed Lin , Z. , Akin , H. , Rao , R. , Hie , B. , Zhu , Z. , Lu , W. , Smetanin , N. , Verkuil , R. , Kabeli , O. , Shmueli , Y. , Dos Santos Costa , A. , Fazel-Zarandi , M. , Sercu , T. , Candido , S. , & Rives , A. ( 2023 ). Evolutionary-scale prediction of atomic-level protein structure with a language model . Science , 379 ( 6637 ), 1123 – 1130 . doi: 10.1126/science.ade2574 OpenUrl CrossRef PubMed ↵ Mahlich , Y. , Steinegger , M. , Rost , B. , & Bromberg , Y. ( 2018 ). HFSP: High speed homology-driven function annotation of proteins . Bioinformatics , 34 ( 13 ), i304 – i312 . doi: 10.1093/bioinformatics/bty262 OpenUrl CrossRef PubMed ↵ Marquet , C. , Schlensok , J. , Abakarova , M. , Rost , B. , & Laine , E. ( 2024 ). Expert-guided protein language models enable accurate and blazingly fast fitness prediction . Bioinformatics , 40 ( 11 ), btae621 . doi: 10.1093/bioinformatics/btae621 OpenUrl CrossRef PubMed ↵ Mirdita , M. , Schütze , K. , Moriwaki , Y. , Heo , L. , Ovchinnikov , S. , & Steinegger , M. ( 2022 ). ColabFold: Making protein folding accessible to all . Nature Methods , 19 ( 6 ), 679 – 682 . doi: 10.1038/s41592-022-01488-1 OpenUrl CrossRef PubMed ↵ Mora , C. , Tittensor , D. P. , Adl , S. , Simpson , A. G. B. , & Worm , B. ( 2011 ). How Many Species Are There on Earth and in the Ocean? PLoS Biology , 9 ( 8 ), e1001127 . doi: 10.1371/journal.pbio.1001127 OpenUrl CrossRef PubMed ↵ Nordberg , H. , Cantor , M. , Dusheyko , S. , Hua , S. , Poliakov , A. , Shabalov , I. , Smirnova , T. , Grigoriev , I. V. , & Dubchak , I. ( 2014 ). The genome portal of the Department of Energy Joint Genome Institute: 2014 updates . Nucleic Acids Research , 42 ( D1 ), D26 – D31 . doi: 10.1093/nar/gkt1069 OpenUrl CrossRef PubMed Web of Science ↵ Pearson , K. ( 1901 ). LIII. On lines and planes of closest fit to systems of points in space . 
The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science , 2 ( 11 ), 559 – 572 . doi: 10.1080/14786440109462720 OpenUrl CrossRef ↵ Prabakaran , R. , & Bromberg , Y. ( 2025a ). Functional profiling of the sequence stockpile: A protein pair-based assessment of in silico prediction tools . Bioinformatics , 41 ( 2 ), btaf035 . doi: 10.1093/bioinformatics/btaf035 OpenUrl CrossRef ↵ Prabakaran , R. , & Bromberg , Y. ( 2025b ). Quantifying uncertainty in Protein Representations Across Models and Task . bioRxiv , 2025 – 04 . ↵ Radivojac , P. , Clark , W. T. , Oron , T. R. , Schnoes , A. M. , Wittkop , T. , Sokolov , A. , Graim , K. , Funk , C. , Verspoor , K. , Ben-Hur , A. , Pandey , G. , Yunes , J. M. , Talwalkar , A. S. , Repo , S. , Souza , M. L. , Piovesan , D. , Casadio , R. , Wang , Z. , Cheng , J. , … Friedberg , I. ( 2013 ). A large-scale evaluation of computational protein function prediction . Nature Methods , 10 ( 3 ), 221 – 227 . doi: 10.1038/nmeth.2340 OpenUrl CrossRef PubMed Web of Science ↵ Rembeza , E. , & Engqvist , M. K. M. ( 2021 ). Experimental and computational investigation of enzyme functional annotations uncovers misannotation in the EC 1.1.3.15 enzyme class . PLOS Computational Biology , 17 ( 9 ), e1009446 . doi: 10.1371/journal.pcbi.1009446 OpenUrl CrossRef PubMed ↵ Richardson , L. , Allen , B. , Baldi , G. , Beracochea , M. , Bileschi , M. L. , Burdett , T. , Burgin , J. , Caballero-Pérez , J. , Cochrane , G. , Colwell , L. J. , Curtis , T. , Escobar-Zepeda , A. , Gurbich , T. A. , Kale , V. , Korobeynikov , A. , Raj , S. , Rogers , A. B. , Sakharova , E. , Sanchez , S. , … Finn , R. D. ( 2023 ). MGnify: The microbiome sequence data analysis resource in 2023 . Nucleic Acids Research , 51 ( D1 ), D753 – D759 . doi: 10.1093/nar/gkac1080 OpenUrl CrossRef PubMed ↵ Rives , A. , Meier , J. , Sercu , T. , Goyal , S. , Lin , Z. , Liu , J. , Guo , D. , Ott , M. , Zitnick , C. L. , Ma , J. , & Fergus , R. ( 2021 ). 
Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences . Proceedings of the National Academy of Sciences , 118 ( 15 ), e2016239118 . doi: 10.1073/pnas.2016239118 OpenUrl Abstract / FREE Full Text ↵ Rost , B. , & Sander , C. ( 1993 ). Prediction of Protein Secondary Structure at Better than 70% Accuracy . Journal of Molecular Biology , 232 ( 2 ), 584 – 599 . doi: 10.1006/jmbi.1993.1413 OpenUrl CrossRef PubMed Web of Science ↵ Schmirler , R. , Heinzinger , M. , & Rost , B. ( 2024 ). Fine-tuning protein language models boosts predictions across diverse tasks . Nature Communications , 15 ( 1 ), 7407 . doi: 10.1038/s41467-024-51844-2 OpenUrl CrossRef PubMed ↵ Schnoes , A. M. , Brown , S. D. , Dodevski , I. , & Babbitt , P. C. ( 2009 ). Annotation Error in Public Databases: Misannotation of Molecular Function in Enzyme Superfamilies . PLoS Computational Biology , 5 ( 12 ), e1000605 . doi: 10.1371/journal.pcbi.1000605 OpenUrl CrossRef PubMed ↵ Sillitoe , I. , Bordin , N. , Dawson , N. , Waman , V. P. , Ashford , P. , Scholes , H. M. , Pang , C. S. M. , Woodridge , L. , Rauer , C. , Sen , N. , Abbasian , M. , Le Cornu , S. , Lam , S. D. , Berka , K. , Varekova , I. H. , Svobodova , R. , Lees , J. , & Orengo , C. A. ( 2021 ). CATH: Increased structural coverage of functional space . Nucleic Acids Research , 49 ( D1 ), D266 – D273 . doi: 10.1093/nar/gkaa1079 OpenUrl CrossRef PubMed ↵ Steinegger , M. , & Söding , J. ( 2017 ). MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets . Nature Biotechnology , 35 ( 11 ), 1026 – 1028 . doi: 10.1038/nbt.3988 OpenUrl CrossRef PubMed ↵ Suzek , B. E. , Wang , Y. , Huang , H. , McGarvey , P. B. , Wu , C. H. , & the UniProt Consortium . ( 2015 ). UniRef clusters: A comprehensive and scalable alternative for improving sequence similarity searches . Bioinformatics , 31 ( 6 ), 926 – 932 . 
doi: 10.1093/bioinformatics/btu739 OpenUrl CrossRef PubMed Team ESM . ( 2024 ). ESM Cambrian: Revealing the mysteries of proteins with unsupervised le arning . EvolutionaryScale Website . ↵ Teufel , F. , Almagro Armenteros , J. J. , Johansen , A. R. , Gíslason , M. H. , Pihl , S. I. , Tsirigos , K. D. , Winther , O. , Brunak , S. , von Heijne , G. , & Nielsen , H. ( 2022 ). SignalP 6.0 predicts all five types of signal peptides using protein language models . Nature Biotechnology , 40 ( 7 ), 1023 – 1025 . OpenUrl CrossRef PubMed ↵ The UniProt Consortium , Bateman , A. , Martin , M.-J. , Orchard , S. , Magrane , M. , Adesina , A. , Ahmad , S. , Bowler-Barnett , E. H. , Bye-A-Jee , H. , Carpentier , D. , Denny , P. , Fan , J. , Garmiri , P. , Gonzales , L. J. D. C. , Hussein , A. , Ignatchenko , A. , Insana , G. , Ishtiaq , R. , Joshi , V. , … Zhang , J. ( 2025 ). UniProt: The Universal Protein Knowledgebase in 2025 . Nucleic Acids Research , 53 ( D1 ), D609 – D617 . doi: 10.1093/nar/gkae1010 OpenUrl CrossRef PubMed ↵ van Kempen , M. , Kim , S. S. , Tumescheit , C. , Mirdita , M. , Lee , J. , Gilchrist , C. L. M. , Söding , J. , & Steinegger , M. ( 2024 ). Fast and accurate protein structure search with Foldseek . Nature Biotechnology , 42 ( 2 ), 243 – 246 . doi: 10.1038/s41587-023-01773-0 OpenUrl CrossRef PubMed ↵ Varadi , M. , Bertoni , D. , Magana , P. , Paramval , U. , Pidruchna , I. , Radhakrishnan , M. , Tsenkov , M. , Nair , S. , Mirdita , M. , Yeo , J. , Kovalevskiy , O. , Tunyasuvunakool , K. , Laydon , A. , Žídek , A. , Tomlinson , H. , Hariharan , D. , Abrahamson , J. , Green , T. , Jumper , J. , … Velankar , S. ( 2024 ). AlphaFold Protein Structure Database in 2024: Providing structure coverage for over 214 million protein sequences . Nucleic Acids Research , 52 ( D1 ), D368 – D375 . doi: 10.1093/nar/gkad1011 OpenUrl CrossRef PubMed ↵ Vieira , L. C. , Handojo , M. L. , & Wilke , C. O. ( 2025 ). 
Medium-sized protein language models perform well at transfer learning on realistic datasets . Scientific Reports , 15 ( 1 ), 21400 . doi: 10.1038/s41598-025-05674-x OpenUrl CrossRef PubMed ↵ Villani , C. ( 2008 ). Optimal Transport: Old and New . Springer Berlin Heidelberg . https://books.google.de/books?id=NZXiNAEACAAJ ↵ Wang , L. , Li , X. , Zhang , H. , Wang , J. , Jiang , D. , Xue , Z. , & Wang , Y. ( 2025 ). A comprehensive review of protein language models . arXiv Preprint arxiv: 2502.06881 . ↵ Weissenow , K. , Heinzinger , M. , & Rost , B. ( 2022 ). Protein language-model embeddings for fast, accurate, and alignment-free protein structure prediction . Structure , 30 ( 8 ), 1169 - 1177.e4 . doi: 10.1016/j.str.2022.05.001 OpenUrl CrossRef ↵ Yu , T. , Cui , H. , Li , J. C. , Luo , Y. , Jiang , G. , & Zhao , H. ( 2023 ). Enzyme function prediction using contrastive learning . Science , 379 ( 6639 ), 1358 – 1363 . OpenUrl CrossRef PubMed ↵ Zhang , G. , Zhou , Y. , & Bollegala , D. ( 2024 ). Evaluating Unsupervised Dimensionality Reduction Methods for Pretrained Sentence Embeddings (No . arxiv: 2403.14001 ). arXiv . doi: 10.48550/arXiv.2403.14001 OpenUrl CrossRef ↵ Zhang , Y. , & Skolnick , J. ( 2005 ). TM-align: A protein structure alignment algorithm based on the TM-score . Nucleic Acids Research , 33 ( 7 ), 2302 – 2309 . doi: 10.1093/nar/gki524 OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted October 31, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Which pLM to choose? Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Which pLM to choose? Tobias Senoner , Ivan Koludarov , Joshua Günther , Amarda Shehu , Burkhard Rost , Yana Bromberg bioRxiv 2025.10.30.685515; doi: https://doi.org/10.1101/2025.10.30.685515 Share This Article: Copy Citation Tools Which pLM to choose? Tobias Senoner , Ivan Koludarov , Joshua Günther , Amarda Shehu , Burkhard Rost , Yana Bromberg bioRxiv 2025.10.30.685515; doi: https://doi.org/10.1101/2025.10.30.685515 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17633) Bioengineering (13856) Bioinformatics (41841) Biophysics (21399) Cancer Biology (18529) Cell Biology (25422) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24282) Genetics (15582) Genomics (22462) Immunology (17700) Microbiology (40295) Molecular Biology (17140) Neuroscience (88419) Paleontology (666) Pathology (2823) Pharmacology and Toxicology (4813) Physiology (7632) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00