Towards functional annotation with latent protein language model features

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 43,426 characters · extracted from preprint-html · click to expand
Towards functional annotation with latent protein language model features | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Towards functional annotation with latent protein language model features Jake Silberg , Elana Simon , James Zou doi: https://doi.org/10.1101/2025.10.02.680154 Jake Silberg 1 Stanford University Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: jsilberg{at}stanford.edu Elana Simon 1 Stanford University Find this author on Google Scholar Find this author on PubMed Search for this author on this site James Zou 1 Stanford University Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Protein Language Models (PLMs) create high-dimensional embeddings that can be transformed into interpretable sparse features using Sparse Autoencoders (SAEs), where each feature activates on specific protein elements or patterns. However, scalably identifying which features are cohesive and reliable enough for protein annotation remains challenging. We address this by developing a validation pipeline combining three complementary methods: (1) expanded database matching across 20+ annotation sources including hierarchical codes, (2) feature-guided local structural alignment to identify structurally consistent activation regions, and (3) LLM-based feature description generation. Our annotation pipeline demonstrates three key properties of SAE features that make them a useful source of functional annotation complementary to existing methods. First, they can represent more granular patterns than existing protein databases, enabling the identification of sub-domains. Second, they can detect missing annotations by finding proteins that display recognizable structural motifs but lack corresponding database labels. Here, we automatically identify at least 491 missing CATH topology annotations with our pipeline. Third, they can maintain structural consistency across unseen proteins. Of our 10,240 SAE features, we find 615 that are consistently structurally similar in unannotated metagenomic proteins, allowing us to structurally match at least 8,077 metagenomic proteins to characterized proteins. This provides a rapid annotation pipeline with constant time search regardless of database size, that automatically includes structural and function information about the feature that triggered the match. Code is available at https://github.com/jsilbergDS/towards_functional_annotations_plms 1 Introduction Proteins play essential roles in nearly all biological processes, yet our ability to annotate their functional elements lags behind recent growth in sequence data. With the advent of metagenomics, vast datasets of protein sequences have been identified, but many remain functionally uncharacterized ( Karin and Steinegger [2025 ]). Traditional computational methods annotate proteins based on similarity to well-characterized peers, but these approaches struggle when sequence similarity is low ( Karin and Steinegger [2025 ]). Recent advances in protein language models (PLMs) like ESM-2 ( Lin et al. [2023 ]) have demonstrated remarkable ability to capture evolutionary patterns across the protein universe. However, ESM embeddings cannot easily be translated into interpretable features that represent specific protein domains. Sparse autoencoders (SAEs) have emerged as powerful tools for mechanistic interpretability. When applied to PLMs, they can decompose dense embeddings into interpretable features that frequently correspond to functional protein elements without biological supervision ( Simon and Zou [2024 ]), ( Adams et al. [2025 ]). This suggests that SAE features could serve as a foundation for efficient functional annotation. However, not all SAE features are suitable for annotation. Some may be polysemantic, activating on multiple unrelated concepts, while others may represent patterns meaningful to the model but not interpretable by humans who rely on structural, sequential, and functional annotations. To harness SAE features for practical annotation, we need to systematically identify features that correspond to interpretable biological concepts. To identify cohesive features, we focus on those that activate most strongly on groups of proteins that are functionally or structurally related, as defined by meeting one of three validation criteria: A single annotation from an existing annotation database is predictive of feature activation The activating regions of the highest activating proteins are structurally similar A language model can generate a feature description that is predictive of activation, as validated in a held-out set of proteins In this work, we analyze latent features from an SAE trained on Layer 18 of ESM-2 650M, yielding 10,240 features (referred to throughout as f/feature-number, e.g., f/401). Our analysis proceeds in two main parts. First, we develop and evaluate our three-pronged validation approach, demonstrating how expanded database matching, feature-guided local structural alignment, and LLM-based pattern recognition complement each other to identify cohesive features. We show this approach doubles annotation coverage, identifying over 60% of features with strong correspondence to biological annotations ( F 1 > 0.8). With local structural alignment, we find an additional 2.4% of the features with worse correspondence to existing databases, but high structural similarity. Second, we demonstrate three practical applications that highlight the advantages of SAE features for protein annotation: (1) capturing granular subdomains within existing annotations that reveal discrete functional units, (2) detecting missing annotations by identifying proteins with recognizable structural motifs that lack corresponding database labels, and (3) enabling zero-shot generalization to novel metagenomic proteins, including those without matches to existing protein families, specifically, Pfam ( Bateman et al. [2004 ]). Utilizing PLM SAE features in this annotation pipeline offers natural advantages: First, because the top structures for each feature can be pre-computed, during a search, hits can be found in constant-time search regardless of database size. Second, because we know the feature(s) triggering the hit, the search can automatically include known structural or functional information about that feature (such as its links to existing databases or a natural language description). Finally, because the features are found in an unsupervised manner, the search can find structural hits based on features not in existing databases. 2 Related Work Traditional Sequence-Based Annotation Methods Functional annotation of uncharacterized proteins has historically relied on sequence conservation approaches. Hidden Markov Models and Position-Specific Scoring Matrices form the foundation of major databases including CATH ( Orengo et al. [1997 ]) (via Gene3D Buchan et al. [2002 ]) and Pfam ( Bateman et al. [2004 ]). These have been integrated into unified resources like UniProt Consortium [2015 ] and InterPro ( Hunter et al. [2009 ]), with search tools like InterProScan enabling annotation of arbitrary sequences ( Jones et al. [2014 ]). However, these methods struggle with divergent sequences, particularly from metagenomic sources, leading to specialized databases like Novel Metagenomic Pfams (NMPfamsDB) ( Baltoumas et al. [2024 ]) that specifically curate sequences lacking Pfam annotations. Structure-Based Approaches AlphaFold 2 ( Jumper et al. [2021 ]) catalyzed a shift toward 3D structure-based annotation methods. This has enabled new databases focused on structural similarity, such as The Encyclopedia of Domains ( Lau et al. [2024 ]) based on the Merizo tool ( Lau et al. [2023 ]). However, large-scale structural searching remains computationally challenging despite algorithmic advances like TMalign ( Zhang and Skolnick [2005 ]) and CEalign ( Shindyalov and Bourne [1998 ]). FoldSeek addresses this by converting structural search into sequence matching using a 3D-informed alphabet, achieving orders-of-magnitude speedup (van Kempen et al. [2022 ]). Merizo-search uses embeddings trained on CATH domains for rapid structure matching ( Kandathil et al. [2025 ]). While powerful, these approaches have limitations: FoldSeek doesn’t automatically provide domain-specific functional information about the region(s) of each protein that triggered the match, and Merizo-search is constrained to identify hits based on supervised training on CATH annotations. Interpretable Features from Protein Language Models Recent work has explored sparse autoencoders (SAEs) for extracting interpretable features from PLMs. InterPLM ( Simon and Zou [2024 ]) demonstrated correspondence between amino acid activations and UniProtKB annotations, while InterProt ( Adams et al. [2025 ]) associated protein-level activations with Pfam families. However, these approaches achieved limited coverage, leaving over 75% of features unexplained when using stringent matching criteria. Our work extends this foundation by: (1) incorporating the full InterPro database across 20+ annotation sources including hierarchical codes, (2) developing systematic structural validation through feature-guided local structural alignment, and (3) integrating LLM-based pattern recognition to identify features missed by existing codes. This comprehensive approach doubles annotation coverage while enabling practical applications for both missing annotation detection and novel protein characterization. 3 Using existing database annotations, local structural alignment, and LLMs to screen SAE features for cohesiveness 3.1 Combining protein annotation databases identifies structurally and functionally cohesive features We expand annotation coverage by incorporating the full InterPro database ( Hunter et al. [2009 ]), which includes annotations from UniProtKB, Pfam, CATH / Gene3D ( Orengo et al. [1997 ]), ( Buchan et al. [2002 ]), and 19 other sources. We evaluate associations at the protein level rather than amino acid level, allowing us to capture cases where feature activations occur near—but not exactly on—annotated elements, and enabling us to combine both protein-level annotations (like Pfam domains) and more granular annotations (like binding sites and motifs in UniProtKB). We also include hierarchical codes such as Pfam clans and CATH topologies to capture broader biological concepts. For each SAE feature, we sample up to 1100 proteins across 10 activation levels (up to 100 proteins each for activation 0, 0.1, etc up to 1.0) and test whether any single code can predict high versus low feature activation, calculating F1 scores between predicted and actual activations. This approach doubles our annotation coverage . As shown in Figure 2 , expanding the databases increases the percentage of features with F1 > 0.8 from approximately 25% to over 60%. The highest-performing annotations are distributed relatively evenly across UniProtKB, Pfam clans, Gene3D/CATH codes, and Pfam families, demonstrating that SAE features capture biological concepts at multiple levels of granularity. Features with moderate performance (F1 0.5-0.75) are predominantly associated with UniProtKB categories. Download figure Open in new tab Figure 1: Workflow for protein latent feature applications. Download figure Open in new tab Figure 2: Additional databases expand our ability to find cohesive features. Hierarchical codes like Pfam clans often correspond more closely to a feature than a Pfam family, indicating the SAE learns different levels of specificity. This metric serves dual purposes: quantifying annotation correspondence and screening for feature cohesiveness—features that correspond strongly to existing annotations likely represent understandable biological elements suitable for annotation purposes. 3.2 Feature-guided local structural alignment independently identifies structurally cohesive features To better understand what causes these features to fire, we turn to structural analysis. This serves two benefits. First, we can track structural subdomains that may not have their own annotation. Second, we can track structural elements that fire even when an annotation is missing. InterPLM ( Simon and Zou [2024 ]) demonstrated that when a feature associated with an existing annotation activates on a protein that lacks this corresponding annotation, what initially appears to be an erroneous feature activation can actually indicate missing or incorrect database labels. Through manual inspection, they identified three features that were correctly identifying functional patterns in proteins that should have been annotated but weren’t. However, investigating each feature individually for missing labels through manual verification would be infeasible at scale, motivating our automated pipeline for systematically identifying such annotation gaps. To scale up this process, we introduce a procedure for Feature-Guided Local Structural Alignment to find “structurally cohesive” features, meaning the regions that activate highly have a consistent 3D structure, even if it has not been annotated. In our procedure, we sample 20 AlphaFold-predicted structures from the top activating proteins of a given SAE feature (activation above 0.7), after de-duplicating for structures from gene orthologs. We then crop selected structures to the 100 amino acids surrounding the peak activating SAE amino acid. We run pairwise local alignment between all possible pairs, and score the alignments with backbone RMSD 100 , a modification of Root Mean Square Deviation that is more lenient for longer alignments ( Carugo and Pongor [2001 ]). We use this rather than RMSD because it finds longer complex conserved structures rather than, for example, a single alpha helix match. Because we are only aligning cropped structures, we can use a robust structural alignment algorithm, CEalign ( Shindyalov and Bourne [1998 ]) with minimal cost. Additionally, by restricting alignments to regions where the SAE feature activates, we evaluate structural similarity in functionally relevant areas. In Figure 3 we see that better structural alignment (lower RMSD 100 ) is correlated (pearson r=0.53) with higher annotation F1 scores. Specifically, 92% of features with RMSD 100 .8, while only 51% of features with an RMSD 100 > 5 have a code-based F1 > .8. Thus, for pairwise alignments with low RMSD 100 , we would expect they indeed share a local structural feature, even if one protein is missing an annotation, or is an uncharacterized novel protein. Since RMSD 100 scores below 4 consistently indicate clear structural similarity, we use this as our stringent threshold. Scores between 4 and 5 also typically represent genuine structural similarity but with occasional false positives, making this our more permissive threshold for broader coverage. Download figure Open in new tab Figure 3: RMSD vs. F1 for existing database annotations. Many features with high RMSD 100 do not correspond closely with existing annotations, while the vast majority of features with low RMSD 100 correspond with existing database annotations. 3.3 Large Language Models identify additional cohesive features missed by other methods While many features fire on a single database annotation code, other features appear to fire on shared traits across existing annotations. Thus, asking LLMs to reason over protein data is a natural step in developing better feature descriptions that can be used for annotation. We adapt the automated pipeline from InterPLM ( Simon and Zou [2024 ]), using Claude-3.5 Sonnet (new) to generate feature descriptions by providing it with protein metadata from our expanded sources along with examples of 40 proteins showing varying levels of maximum feature activation. The LLM analyzes these to identify what protein and amino acid characteristics cause the feature to activate at different levels, generating descriptions of the underlying biological patterns. As validation, we test whether these LLM-generated descriptions can predict feature activation levels on held-out proteins. We evaluate the ability of the generated descriptions and the annotation metadata to classify proteins as high or low activating for each feature, calculating F1 scores on a separate test set to ensure the descriptions capture generalizable patterns rather than overfitting to the training examples. We find that the LLM’s ability to describe a feature is highly correlated with the F1 score of the single best annotation code, as shown in Figure 4 . That is, though we want the model to reason about a combination of codes, a primary driver of performance is the primary existing database annotation. Download figure Open in new tab Figure 4: LLM-based F1 score vs. code-based F1 score on the same train/val split. Still, we find interesting cases where the LLM quantitatively improves performance. For example, for f/3174, the LLM identifies “this feature activates on transmembrane domains in multi-pass membrane proteins, particularly those involved in protein complex assembly and ion transport across membranes.” This description, which focuses on transmembrane domains across a broader range of proteins than would be covered by a single Pfam code, allows the description to outperform existing annotation codes. Similarly, for another feature (f/4896), the LLM writes, “This feature activates on precursor regions of secreted peptide hormones and neuropeptides that undergo proteolytic processing to produce bioactive signaling molecules.” This description correctly identifies that the feature fires on several different types of peptide hormones and neuropeptides, even though they do not share a GO code. Finally, the LLM can sometimes still interpret polysemantic features that appear to fire on two distinct protein elements. For example, f/3118 appears to fire at both the phosphohistidine of histidine kinases, and at chlorine channel sites for proton-coupled chloride transporters. While we have not been able to find a sequential or structural similarity between these two kinds of sites, the LLM correctly describes that the feature, “activates on functional domains involved in ion-mediated signaling, particularly histidine kinase domains and chloride channel domains that facilitate ion transport across membranes.” Ideally this feature would fire on a single monosemantic concept. However, this description can still reveal that the activating proteins are likely one of two types that function in a similar pathway. 4 Using PLM SAE features to enhance protein annotation Having identified cohesive features using our three approaches, we now show three benefits of utilizing these features. In particular, we utilize these features to identify subdomains within existing annotations that specifically appear in the literature. Second, we utilize these features to scalably identify missing CATH annotations. Finally, we utilize these features to rapidly identify structural matches for metagenomic proteins in NMPfamsDB. 4.1 SAE features capture granular subdomains interpretable through targeted literature search While our annotation matching successfully identifies coherent features, it also reveals an important limitation: we can identify the types of proteins where features activate, but not always what specific elements within those proteins cause the activation. This limitation, however, highlights a key advantage of SAE features-their ability to capture granularity beyond existing annotation schemes. Multiple SAE features often share the same top annotation code yet activate on different structural subregions, revealing subdomains that current databases treat as single units. Figure 5 illustrates both this limitation and opportunity. Eight different features (f/253, f/515, f/1505, f/1579, f/1712, f/1731, f/2768, and f/3288) all achieve high F1 scores for the same Pfam clan annotation (CL0062), yet each activates on distinct regions–transmembrane helices, cytoplasmic domains (f/253), and extracellular domains (f/515). While our annotation-based screening identifies these as coherent features, it cannot explain their specific roles, demonstrating both the power of SAE features to decompose protein families and the need for methods to interpret this granularity. Download figure Open in new tab Figure 5: Eight SAE features correspond to the same Pfam clan (APC, CL0062) but activate on distinct structural components within GAP1_YEAST, including different transmembrane helices, cytoplasmic domains, and extracellular loops. Left: Each row shows feature activation along the protein sequence with highly activated (> 0.8) residues highlighted in color, compared to the single Pfam domain (gray). Right: Feature activations on AlphaFold predicted structure (AFDB: P19145) showing each feature highlighting distinct structural components. To bridge this gap, we tested whether language models could retrieve literature discussing specific protein regions where features activate. We provided OpenAI’s o4-mini-high with the 10 highest-activating proteins for selected features, their gene descriptions from UniProtKB, and precise amino acid positions of peak activation, asking it to search for papers describing these regions. In a pilot of 6 features, this approach successfully identified papers that discussed those exact regions for 4 cases. For example, for f/515, the model retrieved literature on extracellular loops in APC proteins, identifying the effects of mutations in this precise region ( Raba et al. [2014 ])—functional detail entirely absent from the broader clan annotation. An example prompt for this pilot, and more detail on the retrieved papers is in Appendix B. This demonstrates that SAE features can reveal biologically meaningful subdomains within existing annotations, but realizing this potential requires methods to interpret their specific functional roles. While literature search shows promise for this task, it currently requires manual validation and is limited by API availability. 4.2 SAE features can identify missingness in existing databases We examine proteins that activate the same SAE feature but differ in their annotation status—some have the top database annotation while others do not. When we measure RMSD 100 between annotated and unannotated proteins, we often find they align just as well structurally as annotated proteins align with each other, as shown in Figure 6 . This suggests the unannotated proteins may have missing database labels. For example, we note f/401, whose best code-based description is a CATH Topology (3.40.50, with F1 of .84). However, several of the top activating proteins for f/401 are not tagged with any CATH code using Gene3D. Still, these structures have very low local RMSD to structures that are tagged with the CATH code using Gene3D, suggesting the CATH code is missing. Download figure Open in new tab Figure 6: Aligning structures with the top existing annotation to structures without the annotation reveals features like (a) f/401 and (b) f/581 with strong structural similarity despite different annotations. This can identify missing annotations in existing databases, like in (c) showing an alignment for two proteins that activate on f/401 where one protein (Q0P9A8) has and one protein (Q5WSK6) is missing the CATH 3.40.50 annotation, the topology code for the Rossmann fold. For clarity, only the area around the feature is shown for Q5WSK6. To analyze this at scale, we reviewed the 20 proteins per feature randomly sampled in Section 3.2, looking at features with an F1 of .8 or above for a CATH code or topology. Across 221 features, we find 1,055 of those proteins that are not tagged with a CATH code by Gene3D, but have local structural similarity to a protein also activating the same feature that does have a Gene3D annotation. As external validation, we compare the CATH tags from Gene3D, a sequence-based model, to TED ( Lau et al. [2024 ]), which uses a deep learning structure-based approach. We find 491 of these proteins indeed have a hit for that same CATH topology in TED (see Appendix Table 2 ). For proteins that do not have hits even in TED, it is possible that these features can fire on a subdomain within a code, so the entire structure is not similar enough to be tagged. While we can verify these seemingly missing annotations for CATH by using TED, there are SAE features that align with annotations from other databases like Pfam or UniProtKB. Thus, this combination of screening for existing annotations and local structural alignment can likely help identify potential missing annotations beyond just CATH. 4.3 Features can rapidly detect structural matches in unannotated metagenomic proteins A key advantage of feature-based annotation is that granular features can detect conserved domains even when full proteins show no sequence similarity to known families. This enables annotation of highly divergent metagenomic sequences that lack Pfam matches, which we test using NMPfamsDB ( Baltoumas et al. [2024 ]). We find that many of our features activate highly in proteins the SAE was not trained on. Specifically, for over 50% of our features, there is at least one NMPfamsDB protein that activates that feature with a value ≥ 0.7. Then, by applying our local structural alignment procedure, we can identify 615 features with strong median local structural alignments ( RMSD 100 < 5 for pairwise alignments between one Swiss-Prot protein and one metagenomic protein) and 181 features with median RMSD 100 < 4. This is strong evidence these features activate on the same element in both Swiss-Prot proteins and metagenomic proteins, as seen in Figure 7 . Download figure Open in new tab Figure 7: (a) RMSD 100 for comparisons within Swiss-Prot or between Swiss-Prot and metagenomic proteins; (b) local structural alignment for f/931 for a Swiss-Prot (Q9M9H7) and metagenomic protein F014369; (c) local structural alignment for f/723 for a Swiss-Prot (O06984) and metagenomic protein F001558. Using just these 615 features, we find matches between 12,526 metagenomic proteins in NMPfamsDB and Swiss-Prot proteins with an RMSD 100 ≤ 5 (14.9% of the 83,878 metagenomic proteins in NMPfamsDB analyzed). At an RMSD 100 threshold below 4, we find matches for 8,077 metagenomic proteins (9.6%). Many of the SAE features that align structurally in the metagenomic proteins also have high correspondence to CATH domains (378 of the 615 have a code-based F1 above 0.8 for a CATH code). Still, we also find features like f/723 and f/931 that correspond to a granular sub-domain within Pfam codes. As a final corroboration, for f/723 and f/931, we note that for several metagenomic proteins, if we run FoldSeek on the protein, several of the top Swiss-Prot hit also activate this feature. That demonstrates that these SAE features can be used to find the same structural element found by FoldSeek. However, using SAE features has a natural advantage over FoldSeek, in that we can provide the known structural or functional information about the feature(s) that triggered the alignment. 5 Conclusion In this work, we demonstrate the benefits of using latent features from protein language models for annotation. We find features that consistently activate on discernible subdomains, though these cannot yet be understood automatically. We also find features that can identify missing database annotations at scale, and find features that allow us to characterize metagenomic proteins. We can pre-compute the top proteins for each feature, tie each feature to existing databases, and generate LLM-descriptions for each feature. Thus, for a new protein, this workflow returns not only a structurally similar characterized protein in constant time, but also information about exactly the aligned region. We note several important limitations of the work: first, while LLMs can help in pulling relevant literature, identifying where within proteins these features fire remains a manual task due to hallucinations and the need for careful verification. Second, our local structural validation approach likely misses features that span distant regions or involve flexible structural motifs. Third, for now we rely on a single layer (Layer 18) of ESM-2 650M and focus our annotation validation on Swiss-Prot, which limits the space of proteins we can match. Future work should expand analysis across multiple layers within PLMs and systematically evaluate how the quality of the latent features influences our ability to find structurally consistent matches. Additionally, more rigorous benchmarking against existing annotation approaches like FoldSeek and Merizo-search can highlight the benefits of each method. We expect advances in LLM capabilities and more advanced protein representations may improve further on these tasks, and hope this framework can provide a useful proof-of-concept as this field develops further. A Flexible regions We note that many features still have a high code-based F1 even though they also have high RMSD 100 . Some of these are regions that are structurally flexible, for example f/73 below. We see three proteins, each with a homeodomain-like region highlight in orange and blue, and the highly activated region for f/73 in pink. Download figure Open in new tab Figure 1: f/73 captures a variety of structures that all link two structurally consistent domains. Here, three proteins have homeolike-domains are in orange and blue, while the highly active region for f/73 is in pink B Understanding granular features For six features, we used o4-mini-high to attempt to find citations that discussed the specific regions of interest. An example prompt is given below, followed by the best citation (where applicable) found by the model. Each citation returned by the model was reviewed manually, as the model could sometimes hallucinate specific quotes or mutations that were not found in the underlying papers. Where no relevant citation was retrieved, the feature was analyzed manually to determine its specific function. Sometimes very promising literature about the specific region exists, but was not returned by o4-mini-high, perhaps because of only asking about up 10 gene-species combinations per feature. For example, manually searching for additional papers revealed that f/253 corresponds to the cytoplasmic loop between transmembrane domain 2 and 3 in a topological model of amino acid permeases Cosgriff and Pittard [1997 ]. Prompt for f/515 What do we know about the following proteins in these amino acid regions mentioned for each? Cite papers that look specifically at or very near these regions ACTP_PECCP Cation/acetate symporter ActP (Acetate permease) (Acetate transporter ActP) in Pectobacterium carotovorum subsp. carotovorum (strain PC1) at 332 MNTH_AGRFC Divalent metal cation transporter MntH in Agrobacterium fabrum (strain C58 / ATCC 33970) (Agrobacterium tumefaciens (strain C58)) at 309 … KUP_CYTH3 Probable potassium transport system protein Kup in Cytophaga hutchinsonii (strain ATCC 33406 / DSM 1761 / CIP 103989 / NBRC 15051 / NCIMB 9469 / D465) at 271 KUP_STRA1 Probable potassium transport system protein Kup in Streptococcus agalactiae serotype Ia (strain ATCC 27591 / A909 / CDC SS700) at 277 PUTP_STAAN Sodium/proline symporter (Proline permease) in Staphylococcus aureus (strain N315) at 310 In total (across all proteins) provide me only with 2-4 of the best matching citations and what we know about the region from each paper View this table: View inline View popup Download powerpoint Table 1: o4-retrieved feature citations for specific sub-domains. C Missing CATH annotations Below we show the first 100 missing CATH annotations identified by our workflow, of the 491 that match with TED annotations. We considered a match if TED contained a code matching the top CATH code for a given feature that was within the same toplogy. Best code represents the top CATH code for that feature, while TED label is the exact TED label for that protein (may either be a CATH homologous superfamily or topology). View this table: View inline View popup Download powerpoint Table 2: Rows 1–50 View this table: View inline View popup Download powerpoint Table 3: Rows 51–100 Footnotes ↵ * Department of Biomedical Data Science. epsimon{at}stanford.edu jamesz{at}stanford.edu https://github.com/jsilbergDS/towards_functional_annotations_plms References ↵ Etowah Adams , Liam Bai , Minji Lee , Yiyang Yu , and Mohammed AlQuraishi . From mechanistic interpretability to mechanistic biology: Training, evaluating, and interpreting sparse autoencoders on protein language models . bioRxiv , pages 2025 – 02 , 2025 . ↵ Fotis A Baltoumas , Evangelos Karatzas , Sirui Liu , Sergey Ovchinnikov , Yorgos Sofianatos , I-Min Chen , Nikos C Kyrpides , and Georgios A Pavlopoulos . Nmpfamsdb: a database of novel protein families from microbial metagenomes and metatranscriptomes . Nucleic Acids Research , 52 ( D1 ): D502 – D512 , 2024 . OpenUrl CrossRef PubMed ↵ Alex Bateman , Lachlan Coin , Richard Durbin , Robert D Finn , Volker Hollich , Sam Griffiths-Jones , Ajay Khanna , Mhairi Marshall , Simon Moxon , Erik LL Sonnhammer , et al. The pfam protein families database . Nucleic acids research , 32 (suppl_ 1 ): D138 – D141 , 2004 . OpenUrl CrossRef PubMed Web of Science ↵ Daniel WA Buchan , Adrian J Shepherd , David Lee , Frances MG Pearl , Stuart CG Rison , Janet M Thornton , and Christine A Orengo . Gene3d: structural assignment for whole genes and genomes using the cath domain structure database . Genome research , 12 ( 3 ): 503 – 514 , 2002 . OpenUrl Abstract / FREE Full Text ↵ Oliviero Carugo and Sándor Pongor . A normalized root-mean-spuare distance for comparing protein three-dimensional structures . Protein science , 10 ( 7 ): 1470 – 1473 , 2001 . OpenUrl CrossRef PubMed Web of Science ↵ UniProt Consortium . Uniprot: a hub for protein information . Nucleic acids research , 43 ( D1 ): D204 – D212 , 2015 . OpenUrl CrossRef PubMed ↵ Angela J Cosgriff and AJ Pittard . A topological model for the general aromatic amino acid permease, arop, of escherichia coli . Journal of bacteriology , 179 ( 10 ): 3317 – 3323 , 1997 . OpenUrl Abstract / FREE Full Text ↵ Sarah Hunter , Rolf Apweiler , Teresa K Attwood , Amos Bairoch , Alex Bateman , David Binns , Peer Bork , Ujjwal Das , Louise Daugherty , Lauranne Duquenne , et al. Interpro: the integrative protein signature database . Nucleic acids research , 37 (suppl_ 1 ): D211 – D215 , 2009 . OpenUrl CrossRef PubMed Web of Science ↵ Philip Jones , David Binns , Hsin-Yu Chang , Matthew Fraser , Weizhong Li , Craig McAnulla , Hamish McWilliam , John Maslen , Alex Mitchell , Gift Nuka , et al. Interproscan 5: genome-scale protein function classification . Bioinformatics , 30 ( 9 ): 1236 – 1240 , 2014 . OpenUrl CrossRef PubMed Web of Science ↵ John Jumper , Richard Evans , Alexander Pritzel , Tim Green , Michael Figurnov , Olaf Ronneberger , Kathryn Tunyasuvunakool , Russ Bates , Augustin Žídek , Anna Potapenko , et al. Highly accurate protein structure prediction with alphafold . nature , 596 ( 7873 ): 583 – 589 , 2021 . OpenUrl CrossRef PubMed ↵ Shaun M Kandathil , Andy M Lau , Daniel WA Buchan , and David T Jones . Foldclass and merizo-search: Scalable structural similarity search for single-and multi-domain proteins using geometric learning . Bioinformatics , page btaf277, 2025 . ↵ Eli Levy Karin and Martin Steinegger . Cutting edge deep-learning based tools for metagenomic research . National Science Review , page nwaf056, 2025 . ↵ Andy M Lau , Shaun M Kandathil , and David T Jones . Merizo: a rapid and accurate protein domain segmentation method using invariant point attention . Nature Communications , 14 ( 1 ): 8445 , 2023 . OpenUrl PubMed ↵ Andy M Lau , Nicola Bordin , Shaun M Kandathil , Ian Sillitoe , Vaishali P Waman , Jude Wells , Christine A Orengo , and David T Jones . Exploring structural diversity across the protein universe with the encyclopedia of domains . Science , 386 ( 6721 ): eadq4946 , 2024 . OpenUrl CrossRef PubMed ↵ Zeming Lin , Halil Akin , Roshan Rao , Brian Hie , Zhongkai Zhu , Wenting Lu , Nikita Smetanin , Robert Verkuil , Ori Kabeli , Yaniv Shmueli , et al. Evolutionary-scale prediction of atomic-level protein structure with a language model . Science , 379 ( 6637 ): 1123 – 1130 , 2023 . OpenUrl CrossRef PubMed ↵ Christine A Orengo , Alex D Michie , Susan Jones , David T Jones , Mark B Swindells , and Janet M Thornton . Cath–a hierarchic classification of protein domain structures . Structure , 5 ( 8 ): 1093 – 1109 , 1997 . OpenUrl CrossRef PubMed ↵ Michael Raba , Sabrina Dunkel , Daniel Hilger , Kamila Lipiszko , Yevhen Polyhach , Gunnar Jeschke , Susanne Bracher , Johann P Klare , Matthias Quick , Heinrich Jung , et al. Extracellular loop 4 of the proline transporter putp controls the periplasmic entrance to ligand binding sites . Structure , 22 ( 5 ): 769 – 780 , 2014 . OpenUrl CrossRef PubMed ↵ Ilya N Shindyalov and Philip E Bourne . Protein structure alignment by incremental combinatorial extension (ce) of the optimal path . Protein engineering , 11 ( 9 ): 739 – 747 , 1998 . OpenUrl CrossRef PubMed Web of Science ↵ Elana Simon and James Zou . Interplm: Discovering interpretable features in protein language models via sparse autoencoders . bioRxiv , pages 2024 – 11 , 2024 . ↵ Michel van Kempen , Stephanie S Kim , Charlotte Tumescheit , Milot Mirdita , Cameron LM Gilchrist , Johannes Söding , and Martin Steinegger . Foldseek: fast and accurate protein structure search . Biorxiv , pages 2022 – 02 , 2022 . ↵ Yang Zhang and Jeffrey Skolnick . Tm-align: a protein structure alignment algorithm based on the tm-score . Nucleic acids research , 33 ( 7 ): 2302 – 2309 , 2005 . OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted October 04, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Towards functional annotation with latent protein language model features Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Towards functional annotation with latent protein language model features Jake Silberg , Elana Simon , James Zou bioRxiv 2025.10.02.680154; doi: https://doi.org/10.1101/2025.10.02.680154 Share This Article: Copy Citation Tools Towards functional annotation with latent protein language model features Jake Silberg , Elana Simon , James Zou bioRxiv 2025.10.02.680154; doi: https://doi.org/10.1101/2025.10.02.680154 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17697) Bioengineering (13894) Bioinformatics (41951) Biophysics (21456) Cancer Biology (18594) Cell Biology (25515) Clinical Trials (138) Developmental Biology (13380) Ecology (19903) Epidemiology (2067) Evolutionary Biology (24322) Genetics (15612) Genomics (22510) Immunology (17737) Microbiology (40400) Molecular Biology (17183) Neuroscience (88619) Paleontology (667) Pathology (2833) Pharmacology and Toxicology (4825) Physiology (7644) Plant Biology (15158) Scientific Communication and Education (2046) Synthetic Biology (4296) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00