Full text
41,116 characters
· extracted from
preprint-html
· click to expand
A Unified Protein Embedding Model with Local and Global Structural Sensitivity | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results A Unified Protein Embedding Model with Local and Global Structural Sensitivity Jerry Xu , Shaojun Pei , Gil Alterovitz doi: https://doi.org/10.1101/2025.10.27.684815 Jerry Xu 1 Massachusetts Institute of Technology ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site Shaojun Pei 2 Brigham and Women’s Hospital Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: 
spei1{at}bwh.harvard.edu Gil Alterovitz Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Preview PDF Abstract Structural comparison between proteins is key to many research tasks, including evolutionary analysis, peptidomimetics, and functional annotation. Traditional structure alignment tools based on three-dimensional protein structures, such as TM-Align, DALI, or ProBiS, are accurate, but they are computationally expensive and impractical at scale. Existing protein language models (PLMs), such as TM-Vec, improve computational efficiency but only capture global structural similarity, overlooking important motif-level structural details. In this paper, we propose a novel PLM consisting of a Siamese neural network, enabling efficient embedding-based structural comparison while also capturing both global and local structural similarity. Our model was trained on a dual loss function combining TM-score, a global similarity metric, and a variation of lDDT scores, a per-residue similarity metric. We tested against two datasets: a varied TM-score dataset from TM-Vec, and a high TM-score mutant dataset from VIPUR. Against these sets, our model achieved a TM-score MAE of 0.0741 and 0.0583, respectively, and a lDDT-score MAE of 0.0788 and 0.0038, respectively. Our model fulfills two key roles: first, it rapidly detects global structural differences. Second, it supports fine-grained structural assessments, improving sensitivity to subtle but functionally important structural changes. 1 Introduction Many research tasks depend on identifying structural homologs of proteins, including but not limited to evolutionary analysis, peptidomimetics, and functional annotation. However, identifying structural homologs at scale requires efficient comparison algorithms, which currently are still limited. 
Over the years, sequence alignment algorithms such as BLAST [ 1 ] and MMseqs2 [ 2 ] have become extremely optimized, allowing for rapid, large-scale processing. However, sequence alignment alone is ineffective for structural comparisons, since many structural homologs differ vastly in sequence, and likewise many sequential homologs differ vastly in structure [ 3 ] [ 4 ] [ 5 ]. Meanwhile, structural alignment algorithms are computationally expensive due to high algorithmic complexity. In particular, such algorithms often work with C α distance matrices to directly superimpose proteins, which has a time complexity of 𝒪 ( mn ) or worse for protein pairs of lengths m and n . Furthermore, structural alignment algorithms like TM-align [ 6 ] and DALI [ 7 ] focus on global alignments, even though motif-level or subdomain-level features may be crucial to protein function. Meanwhile, local alignment algorithms like ProBiS [ 8 ] only consider surfaces and binding sites, and hence may miss the relevance of internal structural motifs or global folds. In contrast, protein language models, or PLMs (usually transformer architectures) [ 9 ], can predict structural similarity for two proteins of arbitrary length in 𝒪 (1) time given pre-computed embeddings. Utilizing the biophysical prior that sequences alone can reconstruct structures [ 10 ], these models produce fixed-size embeddings directly from sequences that capture structural, chemical, or other features of proteins. Embeddings can then be compared via cosine similarity. Although efficient, existing PLMs for structural awareness focus on global features rather than local features. One notable example is TM-Vec [ 11 ], a recently developed PLM trained to predict TM-scores (template-modeling scores) [ 12 ], a global similarity metric for proteins. While highly accurate for TM-scores, TM-Vec is locally unaware. 
Our PLM, consisting of a transformer-based Siamese neural network [ 13 ], addresses both the inefficiency of superposition-based algorithms and local insensitivity of prior methods. We continue to generate sequence-based embeddings, resulting in efficient comparison times. Furthermore, our model utilizes a loss function that combines TM-score and a custom variation of lDDT scores (Local Distance Difference Test scores) [ 14 ], which are per-residue local structural similarity scores, ensuring the model captures both global and local structural features. The network predicts lDDT scores by producing per-residue embeddings, and predicts TM-scores by pooling those per-residue embeddings into a global embedding. Prior to training, ground truth TM-scores and sequence alignments are computed using TM-align, and ground truth lDDT scores are computed by comparing the local atomic environments of C α atoms in aligned residues. During training, predicted TM-scores are calculated as the cosine similarity between the global embeddings of the two proteins. Using the sequence alignment generated by TM-align, we predict the lDDT scores for each alignment pair as the cosine similarity of the embeddings for the residues in that pair. Our paper’s main contribution is the generation of protein language embeddings which are both locally and globally structure-aware. These embeddings can then be used for efficient structural comparison in many downstream research tasks, particularly in mutation analysis. Global embeddings can be utilized to determine the degree to which the overall fold of the mutant differs from the wild-type protein, quickly identifying deleterious mutants (i.e. low TM-score mutants). Per-residue embeddings can further analyze these mutants, identifying affected subdomains and hence the impacted functions of the protein. 2 Methods and Materials Our Siamese network-based architecture consists of three main steps. Data Preprocessing : The purpose of this step is twofold. 
Firstly, it generates the ProtTrans embeddings for each protein that will be the inputs of the neural network. Secondly, it precomputes the true TM-scores and lDDT scores, as well as the sequence alignments used to predict lDDT scores. All ground truth information is cached. Siamese Neural Network : For every pair of proteins whose similarity will be predicted, the (padded) ProtTrans embeddings of both proteins will be independently processed by a neural network with shared weights. The transformer module will capture sequential dependencies, as well as local and global structural patterns, producing learned per-residue embeddings that contain local structural information for use in lDDT score prediction. After being processed by the transformer module, the per-residue embeddings will be pooled via an attention mechanism to produce a single global embedding for each protein, to be used in TM-score prediction. Contrastive Loss : The per-residue lDDT score loss and TM-score loss are combined for backpropagation. The degree to which each type of loss contributes to model learning depends on the true TM-score of that training pair. 2.1 Data Preprocessing 2.1.1 Datasets This study utilized datasets from UniProt and SWISS-MODEL, particularly working with protein sequences and PDBs. For training, our model used a dataset from SWISS-MODEL consisting of 250,000 distinct proteins with length at most 300 amino acids, organized into 300,000 protein pairs. This dataset mimics the training dataset of TM-Vec [ 11 ], which can be found in full at https://zenodo.org/records/8038377 in the file swiss under 300 141M.csv. PDBs were obtained from SWISS-MODEL’s REST API at https://swissmodel.expasy.org/repository/uniprot/{UniProt_ID}.pdb?provider=swissmodel. For testing, two benchmark datasets were used: TM-Vec dataset : For this dataset, we reserved 886 protein pairs from the SWISS-MODEL dataset that were not used in training. 
VIPUR dataset : This dataset consists of 350 curated human wild-type/mutant pairs, some benign and some deleterious, from the dataset used in VIPUR, a model trained to predict deleteriousness of protein variants [ 15 ]. These datasets were designed to test the robustness of the model against both global structural similarity and local structural similarity. For each protein in the TM-Vec dataset and the wild-type proteins in the VIPUR dataset, PDBs were downloaded based on the AlphaFold-generated model of the proteins using the AlphaFold PDB API at https://alphafold.ebi.ac.uk/files/AF-{UniProt_ID}-F1-model_v4.pdb. (This might represent a slight discrepancy from how TM-Vec evaluated their results, as they may not have used AlphaFold PDBs.) In order to induce the desired structural changes for each mutation in the VIPUR dataset, we used the MODELLER library with a sphere size of 100Å to remodel the AlphaFold PDB of the wild-type protein. Figure 1 provides a visual overview of the changes induced by MODELLER for one protein, the human tumor suppressor ARF. Download figure Open in new tab Figure 1: MODELLER’s remodeling for the human tumor suppressor ARF (UniProt ID Q8N726). (a) The wild-type and mutant PDB superimposed, with the mutation site highlighted in red. (b) The distance matrix between residues in the wild-type protein. Certain structural features can be identified: the purple line parallel to the main antidiagonal around residue 30 represents the beta hairpin at the start of the protein, and the thickened section of purple around residues 40-60 along the main diagonal represents the alpha helix after the hairpin. (c) The difference matrix of the distance between two residues in the wild-type PDB and remodeled PDB. Darker red colors close to the main diagonal are indicative of local structural changes, while darker red colors farther from the main diagonal are indicative of global structural changes. 
2.1.2 Sequence Embeddings We utilize the pre-trained ProtT5-XL-UniRef50 model [ 16 ] to generate per-residue embeddings of length 1024 for each amino acid in the protein sequence. Since the proteins in the datasets have variable lengths, each protein’s per-residue embeddings are padded before training such that there are 300 per-residue embeddings (the maximum sequence length), simultaneously producing a binary padding mask that indicates which embeddings are padded. Hence, the per-residue embeddings for the whole protein satisfy $X \in \mathbb{R}^{300 \times 1024}$. 2.1.3 TM-scores TM-score, or template modeling score, is a commonly-used metric of global structural similarity in proteins that is normalized between 0 and 1, with higher scores indicating higher global similarity [ 12 ]. TM-score depends on a sequence alignment between the reference and model proteins, and is calculated as $$\text{TM-score} = \max_{\mathcal{S} \in \mathcal{P}} \left[ \frac{1}{L_{\text{target}}} \sum_{i=1}^{L_{\text{aligned}}} \frac{1}{1 + \left( d_{\mathcal{S}}(i) / d_0(L_{\text{target}}) \right)^2} \right]$$ where $\mathcal{P}$ is the set of superpositions of the template and target structures, $L_{\text{target}}$ is the number of residues of the target structure, $L_{\text{aligned}}$ is the number of residues aligned between the template and target sequences, $d_{\mathcal{S}}(i)$ is the distance between the $i$th pair of aligned residues in the template and target sequences in $\mathcal{S}$, and $$d_0(L_{\text{target}}) = 1.24 \sqrt[3]{L_{\text{target}} - 15} - 1.8.$$ Here, each residue is represented as the point corresponding to the position of its C α . 2.1.4 TM-align TM-align is a superposition-based algorithm for global structural alignment [ 6 ]. We use the tm_align function in the tmtools python library to extract two pieces of ground truth information: first, we obtain the true TM-score ( results.tm_norm_chain1 ) between the two proteins (we normalize the TM-score against chain1, the reference protein). 
Second, we obtain the residue alignment information ( seqxA, seqM, seqyA ) we can use to calculate predicted lDDT scores, which is given in the below format: seqxA: GKTIQVIPHVTNEIKDFISIGED—EVDFMLCEIG seqM: ::::::::::: :::.::: ::: :::::::::: seqyA: GATVQVIPHVT-ALKEKIKRAATTTDSDVIITEVG “:” denotes highly aligned residues, “.” denotes poorly aligned residues, and “” (space) denotes unaligned residues. “-” represents gaps in one of the sequences. 2.1.5 lDDT Scores lDDT scores, which stands for l ocal D istance D ifference T est scores, were initially used to determine how well a predicted (target) structure of a protein matched the experimentally-determined (template) structure of that protein [ 14 ]. Per-residue lDDT scores are also normalized between 0 and 1 and assigned to each amino acid in the target protein, representing how well the local atomic environment of that residue matches the local atomic environment of the same residue in the template protein (with higher scores meaning more aligned). The original algorithm for lDDT scores is described in Algorithm 1 . Algorithm 1 Per-Residue lDDT Scores Download figure Open in new tab We cannot directly use this implementation of lDDT scores, since it depends on the two proteins having identical sequences and hence an atomic bijection. In our case, there is no longer a clear one-to-one sequential alignment between proteins, and aligned residues are also unlikely to have an equal number of atoms. Therefore, we made the following adjustments: First, we align residues using TM-align’s alignment information, and generate a list of aligned residue pairs: all unaligned residues are discarded from the rest of the calculation. To resolve the lack of an atomic bijection, we only use the C α atoms of each residue in an alignment pair. This means that only inter- C α distances are compared. We calculated the per-residue lDDT scores using protein 1 as the template and protein 2 as the target. 
Furthermore, the array of lDDT scores was padded to length 300: each aligned residue will have its respective lDDT score inputted in the appropriate position in the array, while all non-aligned or padded residues were assigned an lDDT score of 0. We maintained the neighborhood threshold of 15Å and the difference thresholds of { 0.5Å, 1Å, 2Å, 4Å } . However, since there are far fewer C α atoms within this neighborhood than total atoms, we added a distance-decaying kernel to ensure that closer distances are upweighted in their contribution to lDDT score. After experimenting with different kernel types, we found that an inverse cubic kernel provided the most stable lDDT scores while still highlighting important local structural changes. The revised algorithm is given in Algorithm 2 . Algorithm 2 Custom Per-Residue lDDT Scores Download figure Open in new tab 2.2 Siamese Neural Network Siamese neural networks are used for tasks involving pairwise comparison and similarity analysis, and hence take in two inputs. The typical architecture of a Siamese neural network consists of a feature extractor and a comparison head, as illustrated in Figure 2 . The feature extractor consists of two branches f 1 and f 2 , which have the same network structure and weights ( W ) [ 13 ]. The comparison head outputs a comparison score C score based on the features f 1 ( x 1 ) and f 2 ( x 2 ) extracted from the inputs x 1 and x 2 , respectively. Download figure Open in new tab Figure 2: The typical architecture of a Siamese neural network consists of a feature extractor (i.e. the two neural networks, with shared weights) and a comparison head. There are generally two comparison mechanisms: similarity comparison, which calculates a form of distance between the feature vectors, and ranking comparison, which orders elements based on some metric. 
Our feature extractor consists of the pretrained ProtTrans model, an input projection, two transformer layers, pooling, and an output projection (shown below). Our Siamese neural network utilizes similarity comparison. Download figure Open in new tab 2.2.1 Inputs The two neural networks take in the padded embeddings from ProtTrans of the form $X \in \mathbb{R}^{300 \times 1024}$, as well as the padding mask of shape $M \in \mathbb{R}^{300}$. Before passing through the transformer, the inputs must pass through an input projection consisting of a linear layer, layer normalization [ 17 ], and Bernoulli dropout [ 18 ]. 2.2.2 Transformer Module Each neural network is composed of two transformer encoder layers and an attention pooling mechanism. The transformer layers consist of four attention heads and two feedforward layers each. Each multi-head attention component runs scaled dot-product attention several times in parallel, as described in Algorithm 3 . The feedforward layers consist of two linear layers using ReLU as an activation function, layer normalization, and Bernoulli dropout. Algorithm 3 Multi-Head Attention Download figure Open in new tab 2.2.3 Pooling We implement softmax pooling to produce a single, structurally-aware embedding representing the protein, which is used in TM-score prediction. An attention layer first assigns weights to the per-residue embeddings using softmax normalization. Then, the global embedding is created by computing a weighted average of the per-residue embeddings. This is described in Algorithm 4 . Algorithm 4 Attention Pooling Download figure Open in new tab 2.2.4 Output The outputs of each neural network are passed through an output projection consisting of a linear layer using ReLU as an activation function. The final per-residue embeddings are of the form $Y \in \mathbb{R}^{300 \times 512}$, and the final global embedding is of the form $Z \in \mathbb{R}^{512}$. 
2.3 Contrastive Loss Contrastive loss was based upon a combined loss function involving both TM-score (allowing for global structural sensitivity) and lDDT score (allowing for local structural sensitivity). Pairs with true TM-scores less than 0.1 were ignored since their structures are too dissimilar to extract useful patterns. Pairs with true TM-scores less than 0.7 utilized only TM-score loss since local structural similarity requires sufficient global alignment between proteins. Pairs with true TM-scores at least 0.7 used both lDDT and TM-score loss. The specific loss function we used is detailed below: (In our model, α = 0.7, β = 0.3. θ t are the parameters at time t .) Predicted TM-scores and lDDT scores were computed using cosine similarity, and weights were updated using the Adam optimizer [ 19 ]. Predicted per-residue lDDT values were calculated from the per-residue embeddings using TM-align’s sequence alignments. Given residue i in protein 1 is aligned with residue j in protein 2, with respective embeddings $y^{(1)}_i$ and $y^{(2)}_j$, then $\widehat{\text{lDDT}}_{ij} = \cos\left(y^{(1)}_i, y^{(2)}_j\right) = \frac{y^{(1)}_i \cdot y^{(2)}_j}{\lVert y^{(1)}_i \rVert \, \lVert y^{(2)}_j \rVert}$. Predicted TM-scores were calculated directly from the pooled global embeddings $z_1$ and $z_2$ for proteins 1 and 2 as $\widehat{\text{TM}} = \cos(z_1, z_2) = \frac{z_1 \cdot z_2}{\lVert z_1 \rVert \, \lVert z_2 \rVert}$. Figure 3 shows the architecture of one of the two neural networks, including the subsequent prediction and loss steps. (Projections are omitted from this diagram.) Download figure Open in new tab Figure 3: The structure of a single neural network. There are two transformer encoder layers with four attention heads and two feedforward layers each. The per-residue embeddings outputted by the transformer layers are used to generate predicted lDDT scores via cosine similarity using the sequence alignment generated by TM-align. After being processed by the transformer, the per-residue embeddings are pooled into a global embedding that generates a predicted TM-score via cosine similarity. The loss function uses both TM-score and lDDT scores. 2.4 Training Training occurred over 5 epochs of the SWISS-MODEL dataset. 
The parquet files were shuffled prior to each epoch to prevent overfitting. The batch size was 16 pairs. V100 GPUs from Pittsburgh Supercomputing Center’s Bridges-2 were used to accelerate the speed of training. 2.5 Comparison to Prior Work Here, we compare our framework with other well-known structural prediction tools. The differences are summarized in Table 1 . All time complexities are given for the task of comparing two proteins of lengths m and n . View this table: View inline View popup Download powerpoint Table 1: Comparison of our model’s framework against selected PLMs. Hamamsy et al. (2023) [ 11 ] designed TM-Vec, which is also a PLM in the form of a transformer-based Siamese neural network. Hence, it has a time complexity of O (1). In Hamamsy’s paper, TM-Vec was used in conjunction with DeepBLAST, a structural alignment algorithm. TM-Vec predicts the structural homologs of a query protein within a vector database, which are then passed into DeepBLAST. DeepBLAST utilizes differential sequence alignment algorithms to predict areas of local structural alignment in those homologs. Unlike our model, TM-Vec is only trained to predict TM-scores. We will be using TM-Vec as a baseline to evaluate the performance of our model’s TM-score prediction. Holm et al. (1993) [ 7 ] created DALI, a superposition-based global alignment algorithm. DALI creates a C α distance matrix for both proteins, and attempts to string together local alignments of 6 × 6 submatrices (corresponding to hexapeptide fragments) to form a global alignment. Each pair of fragments is assigned a similarity score that indicates how well the submatrices align. A graph is constructed using residues as nodes and strong fragment pairs as edges, slowly building up a global alignment. The algorithm utilizes Monte Carlo optimization [ 20 ], and is hence stochastic. Unlike our PLM, which utilizes sequences as input, DALI utilizes coordinates as input. 
Furthermore, because it must compare the distance matrices of both proteins, its time complexity is $\mathcal{O}(m^2 n^2)$, much slower than our model’s 𝒪 (1). Zhang et al. (2005) [ 6 ] developed TM-align, another superposition-based global alignment algorithm. This algorithm works similarly to DALI, first generating multiple possible initial alignments by matching local fragments (essentially contact patterns, small-scale C α distance matrices) and secondary structures. Unlike DALI, TM-align uses dynamic programming, allowing the algorithm to try attaching different local fragments onto the current alignment. For each possible alignment, the optimal superposition is computed using the Kabsch-Umeyama algorithm [ 21 ]. Scoring matrices (essentially C α distance matrices in each alignment) are computed, and the alignment maximizing the TM-score is chosen as the final alignment. Like DALI, TM-align also utilizes coordinates as an input. However, this method was much faster than DALI for two reasons: first, it was deterministic, and second, the Kabsch-Umeyama algorithm utilizes singular value decomposition, which has a time complexity of 𝒪 (1). Despite this, the score matrix generation and dynamic programming steps still have a time complexity of 𝒪 ( mn ), resulting in an overall time complexity of 𝒪 ( mn ). This is slower than our PLM’s 𝒪 (1). Konc et al. (2010) [ 8 ] developed ProBiS, a tool for identifying structurally similar protein surfaces or binding sites. ProBiS represents the solvent-accessible surface of each protein as a 3D graph, and uses a maximum clique algorithm to find maximal common subgraphs (MCS) between two proteins. The MCS is used to generate superpositions. A maximum clique search theoretically has an exponential time complexity (in fact, the most efficient maximum clique algorithm has a time complexity of $\mathcal{O}(2^{n/4})$ [ 22 ]), although the practical runtime for this algorithm is typically near-polynomial. 
At the very least, creation of the 3D graph is an $\mathcal{O}(n^2)$ process. This is much slower than our PLM. Furthermore, ProBiS does not capture similarity in the global fold or internal structural features, which can often be important, while our model addresses both global and local similarity at every region of the protein. 3 Results We evaluated the performance of our model to predict both TM-score for protein pairs and lDDT score for aligned residues against the two testing datasets mentioned in 2.1.1, comparing our TM-score prediction capability against TM-Vec. 3.1 TM-Vec Dataset For the 886 protein pairs from the TM-Vec dataset, our performance is evaluated against TM-Vec’s in Table 2 . The error plots are shown in Figure 4 . In Figure 5 , we plot both our model and TM-Vec’s performance at each TM-score interval. View this table: View inline View popup Download powerpoint Table 2: Performance of our model and TM-Vec on the TM-Vec dataset. Download figure Open in new tab Figure 4: (a) The lDDT score error histograms across all residues. (b) The TM-score error histograms across all protein pairs for both our model and TM-Vec. Download figure Open in new tab Figure 5: The box plots for our model and TM-Vec’s errors at different TM-score intervals. When performing Welch’s t-test for our model’s TM-score prediction and TM-Vec’s TM-score prediction with α = 0.05, we obtain the following test statistic and degrees of freedom: giving p ≈ 0.313 > α . Thus, across the TM-Vec dataset, there is no statistically significant difference between our model’s and TM-Vec’s TM-score prediction. 3.2 VIPUR Dataset For the 350 wild-type/mutant pairs in the VIPUR dataset, our accuracy is evaluated against TM-Vec’s in Table 3 . The error plots are shown in Figure 6 . View this table: View inline View popup Download powerpoint Table 3: Performance of our model and TM-Vec on the VIPUR dataset. Download figure Open in new tab Figure 6: (a) The lDDT score error histograms across all residues. 
(b) The TM-score error histograms across all mutants for both our model and TM-Vec. Once again performing Welch’s t-test, we obtain giving p ≈ 0.652 > α . Thus, across the VIPUR dataset, there is no statistically significant difference between our model’s and TM-Vec’s TM-score prediction. 3.3 Overall Findings It has been demonstrated that our model’s capability for TM-score prediction is statistically indistinguishable from TM-Vec’s on both the TM-Vec dataset and the VIPUR dataset across all TM-score predictions. However, Figure 5 indicates our model is slightly more consistent at predicting low or high TM-scores (TM ∈ [0, 0.4) ∪ [0.8, 1]), and slightly worse at predicting medium TM-scores (TM ∈ [0.4, 0.8]). The improved results for true TM-scores within [0.8, 1] is possibly due to the fact that the loss function utilizes combined lDDT- and TM-score loss for high TM-scores, resulting in the model better capturing structural features at higher levels of global similarity. However, it is also likely that the differences in performance at each TM-score interval are statistically insignificant. For lDDT score prediction, our model is accurate on both datasets. 4 Discussion 4.1 Limitations While our model has promising test results, it is important to acknowledge the current methodology’s shortcomings, including limited dataset size for both training and testing. 4.2 Future Work Increasing the dataset sizes, extending training to different databases (e.g. CATH), and expanding the testing to other types of mutations would validate our model’s efficacy against a wide range of cases. Another possible line of future study is incorporating topological priors into the model. For example, one can design a new hierarchical feature extractor that directly captures protein motifs at different neighborhood sizes (e.g. 
using 1D or 2D convolutions [23]), based on the prior that proteins have multiple levels of structure (residue → secondary structure → domain → global fold [24]). In this way, predicting local similarity via embeddings is no longer dependent on direct sequence alignments (like lDDT scores require). 5 Conclusion State-of-the-art structural alignment algorithms, most of which rely on direct superposition, are algorithmically complex and have high time complexities. Currently, no PLMs that attempt to calculate structural similarity encapsulate both global and local structural similarity, although both are important in determining protein function, homology, and more. This study developed a framework for a sequence-based PLM, consisting of a transformer-based Siamese neural network, which produced locally and globally structure-aware embeddings. Our model was trained to predict both TM-score, a global similarity metric, and lDDT scores, a per-residue similarity metric. Our testing results confirmed the plausibility of our framework, as our model performed similarly in TM-score prediction when compared to highly accurate models like TM-Vec while also producing accurate lDDT scores. This dual capability makes the model a potential tool in downstream research tasks, particularly mutation analysis, where it can aid in the identification of deleterious mutations as well as the recognition of affected subdomains. 6 Code Availability My data preprocessing, training, and testing scripts are all located at https://github.com/Brainana/GLASS-PRIMES. Acknowledgements I would like to thank Dr. Shaojun Pei, Dr. Gil Alterovitz, and Dr. Ning Xie for their invaluable mentorship in shaping the direction of the research. I would specifically like to thank Dr. Shaojun Pei for guiding me towards a case study on local structural changes induced by mutations, and providing me with computing resources and software that aided the development of my model.
I would also like to thank the MIT PRIMES program for providing the opportunity to perform this research. References [1]. ↵ S. F. Altschul , W. Gish , W. Miller , E. W. Myers , and D. J. Lipman , “ Basic local alignment search tool ,” Journal of Molecular Biology , vol. 215 , no. 3 , p. 403 – 410 , Oct . 1990 . [Online]. Available : doi: 10.1016/S0022-2836(05)80360-2 OpenUrl CrossRef PubMed Web of Science [2]. ↵ M. Steinegger and J. Söding , “ Mmseqs2 enables sensitive protein sequence searching for the analysis of massive data sets ,” Nature Biotechnology , vol. 35 , no. 11 , p. 1026 – 1028 , Oct . 2017 . [Online]. Available : doi: 10.1038/nbt.3988 OpenUrl CrossRef PubMed [3]. ↵ S. Tomiuk and K. Hofmann , “ Sequence similarity in structurally dissimilar proteins ,” Current Biology , vol. 13 , no. 4 , p. R124 – R125 , Feb . 2003 . [Online]. Available : doi: 10.1016/S0960-9822(03)00070-8 OpenUrl CrossRef PubMed [4]. ↵ M. Kosloff and R. Kolodny , “ Sequence-similar, structure-dissimilar protein pairs in the pdb ,” Proteins: Structure, Function, and Bioinformatics , vol. 71 , no. 2 , p. 891 – 902 , Nov . 2007 . [Online]. Available : doi: 10.1002/PROT.21770 OpenUrl CrossRef [5]. ↵ M. Banach , L. Konieczny , and I. Roterman , “ Dissimilar sequence: similar structure of proteins ,” Bio-Algorithms and Med-Systems , vol. 12 , no. 3 , p. 117 – 121 , Aug . 2016 . [Online]. Available : doi: 10.1515/bams-2016-0014 OpenUrl CrossRef [6]. ↵ Y. Zhang , “ Tm-align: a protein structure alignment algorithm based on the tm-score ,” Nucleic Acids Research , vol. 33 , no. 7 , p. 2302 – 2309 , Apr . 2005 . [Online]. Available : doi: 10.1093/nar/gki524 OpenUrl CrossRef PubMed Web of Science [7]. ↵ L. Holm and C. Sander , “ Protein structure comparison by alignment of distance matrices ,” Journal of Molecular Biology , vol. 233 , no. 1 , p. 123 – 138 , Sep . 1993 . [Online]. Available : doi: 10.1006/jmbi.1993.1489 OpenUrl CrossRef PubMed Web of Science [8]. ↵ J. Konc and D. 
Janežič, “ Probis algorithm for detection of structurally similar protein binding sites by local structural alignment ,” Bioinformatics , vol. 26 , no. 9 , p. 1160 – 1168 , Mar . 2010 . [Online]. Available : doi: 10.1093/bioinformatics/btq100 OpenUrl CrossRef PubMed Web of Science [9]. ↵ L. Wang , X. Li , H. Zhang , J. Wang , D. Jiang , Z. Xue , and Y. Wang , “A comprehensive review of protein language models,” 2025 . [Online]. Available: https://arxiv.org/abs/2502.06881 [10]. ↵ C. B. Anfinsen , “ Principles that govern the folding of protein chains ,” Science , vol. 181 , no. 4096 , p. 223 – 230 , Jul . 1973 . [Online]. Available : doi: 10.1126/science.181.4096.223 OpenUrl FREE Full Text [11]. ↵ T. Hamamsy , J. T. Morton , R. Blackwell , D. Berenberg , N. Carriero , V. Gligorijevic , C. E. M. Strauss , J. K. Leman , K. Cho , and R. Bonneau , “ Protein remote homology detection and structural alignment using deep learning ,” Nature Biotechnology , vol. 42 , no. 6 , p. 975 – 985 , Sep . 2023 . [Online]. Available : doi: 10.1038/s41587-023-01917-2 OpenUrl CrossRef [12]. ↵ Y. Zhang and J. Skolnick , “ Scoring function for automated assessment of protein structure template quality ,” Proteins: Structure, Function, and Bioinformatics , vol. 57 , no. 4 , p. 702 – 710 , Oct . 2004 . [Online]. Available : doi: 10.1002/prot.20264 OpenUrl CrossRef PubMed Web of Science [13]. ↵ Y. Li , C. L. P. Chen , and T. Zhang , “ A survey on siamese network: Methodologies, applications, and opportunities ,” IEEE Transactions on Artificial Intelligence , vol. 3 , no. 6 , p. 994 – 1014 , Dec . 2022 . [Online]. Available : doi: 10.1109/TAI.2022.3207112 OpenUrl CrossRef [14]. ↵ V. Mariani , M. Biasini , A. Barbato , and T. Schwede , “ lddt: a local superposition-free score for comparing protein structures and models using distance difference tests ,” Bioinformatics , vol. 29 , no. 21 , p. 2722 – 2728 , Aug . 2013 . [Online]. 
Available : doi: 10.1093/bioinformatics/btt473 OpenUrl CrossRef PubMed Web of Science [15]. ↵ E. H. Baugh , R. Simmons-Edler , C. L. Müller , R. F. Alford , N. Volfovsky , A. E. Lash , and R. Bonneau , “ Robust classification of protein variation using structural modelling and large-scale data integration ,” Nucleic Acids Research , vol. 44 , no. 6 , p. 2501 – 2513 , Feb . 2016 . [Online]. Available : doi: 10.1093/nar/gkw120 OpenUrl CrossRef PubMed [16]. ↵ A. Elnaggar , M. Heinzinger , C. Dallago , G. Rehawi , Y. Wang , L. Jones , T. Gibbs , T. Feher , C. Angerer , M. Steinegger , D. Bhowmik , and B. Rost , “ Prottrans: Toward understanding the language of life through self-supervised learning ,” IEEE Transactions on Pattern Analysis and Machine Intelligence , vol. 44 , no. 10 , p. 7112 – 7127 , Oct . 2022 . [Online]. Available : doi: 10.1109/TPAMI.2021.3095381 OpenUrl CrossRef PubMed [17]. ↵ J. L. Ba , J. R. Kiros , and G. E. Hinton , “Layer normalization,” 2016 . [Online]. Available: https://arxiv.org/abs/1607.06450 [18]. ↵ N. Srivastava , G. Hinton , A. Krizhevsky , I. Sutskever , and R. Salakhutdinov , “ Dropout: A simple way to prevent neural networks from overfitting ,” Journal of Machine Learning Research , vol. 15 , no. 56 , pp. 1929 – 1958 , 2014 . [Online]. Available: http://jmlr.org/papers/v15/srivastava14a.html OpenUrl [19]. ↵ D. P. Kingma and J. Ba , “Adam: A method for stochastic optimization,” 2014 . [Online]. Available: https://arxiv.org/abs/1412.6980 [20]. ↵ S. Kirkpatrick , C. D. Gelatt , and M. P. Vecchi , “ Optimization by simulated annealing ,” Science , vol. 220 , no. 4598 , p. 671 – 680 , May 1983 . [Online]. Available : doi: 10.1126/science.220.4598.671 OpenUrl Abstract / FREE Full Text [21]. ↵ J. Lawrence , J. Bernal , and C. Witzgall , “ A purely algebraic justification of the kabsch-umeyama algorithm ,” Journal of Research of the National Institute of Standards and Technology , vol. 124 , Oct . 2019 . [Online]. 
Available : doi: 10.6028/jres.124.028 OpenUrl CrossRef [22]. ↵ J. M. Robson , “Finding a maximum independent set in time $O(2^{n/4})$,” 2001 . [23]. ↵ M. M. Bronstein , J. Bruna , Y. LeCun , A. Szlam , and P. Vandergheynst , “Geometric deep learning: going beyond euclidean data,” 2016 . [Online]. Available: https://arxiv.org/abs/1611.08097 [24]. ↵ C. Orengo , A. Michie , S. Jones , D. Jones , M. Swindells , and J. Thornton , “ Cath – a hierarchic classification of protein domain structures ,” Structure , vol. 5 , no. 8 , p. 1093 – 1109 , Aug . 1997 . [Online]. Available : doi: 10.1016/S0969-2126(97)00260-8 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 27, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A Unified Protein Embedding Model with Local and Global Structural Sensitivity Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions.
Share A Unified Protein Embedding Model with Local and Global Structural Sensitivity Jerry Xu , Shaojun Pei , Gil Alterovitz bioRxiv 2025.10.27.684815; doi: https://doi.org/10.1101/2025.10.27.684815 Share This Article: Copy Citation Tools A Unified Protein Embedding Model with Local and Global Structural Sensitivity Jerry Xu , Shaojun Pei , Gil Alterovitz bioRxiv 2025.10.27.684815; doi: https://doi.org/10.1101/2025.10.27.684815 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41913) Biophysics (21436) Cancer Biology (18578) Cell Biology (25482) Clinical Trials (138) Developmental Biology (13372) Ecology (19889) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15599) Genomics (22483) Immunology (17728) Microbiology (40365) Molecular Biology (17163) Neuroscience (88540) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15130) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9818) Zoology (2269)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.