Full text
51,446 characters
· extracted from
preprint-html
· click to expand
Protein Embeddings and Local Alignments | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Protein Embeddings and Local Alignments Julia Malec , G. Brian Golding , View ORCID Profile Lucian Ilie doi: https://doi.org/10.1101/2025.07.23.666343 Julia Malec 1 Department of Computer Science, University of Western Ontario , London, N6A 5B7, Ontario, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site G. Brian Golding 2 Department of Biology, McMaster University , Hamilton, L8S 4K1, Ontario, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lucian Ilie 1 Department of Computer Science, University of Western Ontario , London, N6A 5B7, Ontario, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lucian Ilie For correspondence: ilie{at}uwo.ca Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Background The advent of protein embeddings has revolutionized bioinformatics by providing contextual representations that capture functional and evolutionary patterns. They have become, alongside sequence alignments, the cornerstone of bioinformatics. While embeddings cannot replace alignments, they can greatly help improving their quality. Our goal is replacing the BLOSUM matrices, the decades-old standard scoring system for protein alignment, with an embedding-based scoring method. Results We introduce a new scoring function and algorithm for local alignment of protein sequences, we offer a new comprehensive framework for evaluating local alignments. The score between two residues is given by the cosine similarity of their Ankh-embedding vectors and the algorithm uses dynamic programming with affine penalty. For the evaluation, we built multiple datasets, using both natural and inserted sequences, from the Conserved Domain Database, BAL-iBASE, and GPCRdb, designed a new algorithm for local alignment extraction, localization and quality evaluation, and employed five distance metrics to evaluate the similarity with the true alignment. We performed nearly one and a half million tests to compare the new algorithm with the best BLOSUM matrices, specialized GPCRtm matrices, and top programs, such as PEbA, ProtT5-score, DEDAL, vcMSA and pLM-BLAST. Regarding the protein embedding models, Ankh not only surpasses the best combination of ProtT5 and ESM2, but appears to better understand the “language” of proteins, as it behaves much better on natural sequences compared to artificial ones obtained by inserting domains in random protein sequences. Also, while ProtT5 and ESM2 combine to produce better results, Ankh does not combine well with other embeddings. Conclusions The new Ankh-score-based program is vastly superior to the BLOSUM matrices and clearly superior to all existing methods. New light shed on the protein embeddings can guide future improvements. In order to facilitate the use of the new method and protocol, they are freely available as a web server at e-score.csd.uwo.ca and as source code at github.com/lucian-ilie/E-score . Background The advent of protein embeddings such as ProtT5 ( Elnaggar et al. 2021 ), ESM2 ( Rives et al. 2021 ), or Ankh ( Elnaggar et al. 2023 ), has revolutionized bioinformatics by providing contextual representations that capture functional and evolutionary patterns. They are helping to create state-of-the-art solutions for many problems in proteomics, such as structure prediction ( Senior et al. 2020 ; Yang et al. 2020 ; Jumper et al. 2021 ), function prediction ( Kulmanov and Hoehndorf 2020 ; Gligorijević et al. 2021 ; Lai and Xu 2022 ), interaction site prediction ( Manfredi et al. 2023 ; Hosseini et al. 2024 ), etc. Embeddings have become, alongside sequence alignments, the cornerstone of bioinformatics. Local alignments are crucial for comparing protein sequences as they discover regions of similarity within different sequences that may be functionally or evolutionarily related. This can help identify conserved domains, motifs, or functional regions, providing insights into protein function, structural characteristics, and evolutionary relationships. Detecting these conserved subsequences helps in annotating uncharacterized proteins, understanding protein families, and inferring potential functional roles of unknown proteins. Local alignment search is the most widely used procedure in bioinformatics and biological sciences. The well known BLAST program is one of the most cited contributions in the history of science; the two BLAST papers ( Altschul et al. 1990 , 1997 ) combined exceed 205,000 citations on Google Scholar (as of Apr. 2025). In spite of the thorough investigation of this problem over many years, alignment of distantly related protein sequences remains challenging. Accurate local alignment is particularly important when searching for homologous sequences that have diverged significantly, making detection of relevant similarities difficult. More precise alignment algorithms increase the chance of identifying subtle similarities, allowing researchers to uncover distant evolutionary relationships and functional conservation across species. The key feature in all local alignments algorithms is the scoring method that evaluates the similarity between amino acid residues. BLOSUM matrices ( Henikoff and Henikoff 1992 ) provided the standard scoring procedure for more than three decades, with another popular method provided by PAM matrices ( Dayhoff et al. 1978 ). Different BLOSUM matrices, such as BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, and BLOSUM90, are optimized for detecting homology at various sequence similarity levels, with lower numbers matrices being designed to discover more distantly related sequences. Recent developments in deep learning provide the ideas for improvement. Techniques from Natural Language Processing involving self-supervised learning on very large unlabelled data helped create word embeddings, that is, numerical vectors associated with words, which can preserve their meaning ( Mikolov et al. 2013 ; Pennington et al. 2014 ; Peters et al. 2018 ; Devlin et al. 2018 ; Liu et al. 2019 ; Yang et al. 2019 ; Raffel et al. 2020 ). These ideas were applied to protein sequences, with entire proteins seen as sentences and residues as words, by training on many protein sequences from UniProt ( UniProt Consortium 2018 ). Top protein embeddings include: ProtVec ( Asgari and Mofrad 2015 ), SeqVec ( Heinzinger et al. 2019 ), PRoBERTa ( Nambiar et al. 2020 ), MSA-transformer ( Rao et al. 2021 ), ESM-1b and ESM2 ( Rives et al. 2021 ), ProtAlbert, ProtBert, ProtT5, ProtElectra, ProtXLNet ( Elnaggar et al. 2021 ), Ankh ( Elnaggar et al. 2023 ). Alignments will continue to provide the most basic sequence-level information, critical for many applications, while embeddings provide the most basic functional or contextual information, ideal for predictive analyses. While embeddings cannot replace alignments they can help improving their quality. The cosine similarity of embedding vectors associated with amino acid residues has been used recently as a successful scoring scheme by several independent studies, for similarity search with pLM-BLAST ( Kaminski et al. 2023 ), multiple sequence alignment with vcMSA ( McWhite et al. 2023 ), pairwise alignment with PEbA ( Iovino and Ye 2024 ) and E-score ( Ashrafzadeh et al. 2024 ). Our goal here is two fold. First, we give the best local alignment algorithm for protein sequences, together with a new comprehensive framework for evaluating local alignments. The algorithm uses dynamic programming ( Smith and Waterman 1981 ; Gotoh 1982 ) with the score provided by the cosine similarity of embedding vectors associated with amino acid residues by Ankh. To assess its performance, we introduce a thorough protocol for the evaluation of local alignment computation, including building multiple datasets, suitable for local alignment computation, from the Conserved Domain Database (CDD) ( Marchler-Bauer et al. 2015 ), BALiBASE database ( Thompson et al. 2005 ), and GPCRdb ( Pándy-Szekeres et al. 2022 ), designing an algorithm for local alignment extraction, localization and quality evaluation, and using five distance metrics for evaluating the similarity with the true alignment. Second, our investigations shed new, important light on the protein embeddings themselves. Ankh not only surpasses the best combination of ProtT5 and ESM2, but appears to better understand the “language” of proteins, as it behaves much better on natural sequences compared to artificial ones obtained by inserting domains in random protein sequences. Also, while ProtT5 and ESM2 combine to produce better results, Ankh does not combine well with other embeddings. We hope that using the new method for computing local alignments will have a significant impact in the fields of bioinformatics and evolutionary biology, improving on how protein sequences are aligned, compared, and interpreted, enhancing the precision of identifying homologous sequences, especially for distantly related proteins, where BLOSUM matrices often struggle. It should improve the detection of functional domains, rare motifs, and evolutionary relationships, which are critical for annotating proteins in newly sequenced genomes. This could have direct implications in various fields, such as drug discovery, protein engineering, and functional genomics. We hope as well that our findings regarding protein embedding methods indicate the possibility of future improvement and can help guide such developments. Methods Local alignment evaluation setup A successful method for local alignment computation has to correctly address two aspects: - alignment localization : identification of the subsequences participating in the local alignment, and - alignment quality : alignment of the subsequences involved, as measured by the distance to the reference alignment. This basic setup is used below in designing the datasets and algorithms used for proper evaluation. Datasets A domain consists of a set of similar sequences for which a reference multiple sequence alignment (MSA) is known. A test consists of two protein sequences containing two similar subsequences from the same domain. Ideally, these two subsequences are the only significant similarities between the two protein sequences. We have imposed two additional conditions on the data. First, each domain occurrence inside a protein sequence has to be away from the sequence ends (usually we require at least 30 residues away from either end), to be considered a proper local alignment; otherwise the alignment is essentially global. Second, in order to have statistically significant p-values, we used, whenever possible, domains with at least 15 sequences, such that the number of tests is over 100. (We made exceptions for the BALiBASE datasets RV11 and RV12, otherwise almost all domains would be eliminated.) We constructed several datasets with these properties. Natural datasets have the domain sequences naturally occurring inside protein sequences. Insert datasets are built by inserting domain sequences into random protein sequences. All datasets are summarized in Table 1 . Note the wide amino acid identity range for the entire collection of data. View this table: View inline View popup Download powerpoint Table 1. Dataset summary. For each dataset, we give the number of domains, sequences, tests performed, average protein and domain length and amino-acid identity range. * Total tests are calculated using the number of times each dataset was used multiplied by the number of distances calculated. We have produced, according to the above requirements, four natural datasets, from the Conserved Domain Database ( Marchler-Bauer et al. 2015 ) and the BALiBASE ( Thompson et al. 2005 ) datasets RV11 -highly divergent, RV12 - moderately divergent, and RV30 - divergent families. Their details are given in Table 2 . Then, for greater variety, we have produced several insert datasets. Three are from the BAL-iBASE linear motifs datasets RV911 - highly divergent, RV912 - moderately divergent, and RV913 - closely related sequences, one is from the GPCRdb database ( Pándy-Szekeres et al. 2022 ), for comparison against specialized scoring matrices, and we use the CDD dataset also in insert mode to compare with the natural one. The details of all datasets are given in Supplementary Tables 1-12. View this table: View inline View popup Download powerpoint Table 2. Natural dataset details. From left to right, domain ID, number of sequences, average length of protein and domain sequences, average sequence length before and after domain occurrences, and amino acid percentage identity. We used multiple testing datasets to evaluate our method under a variety of situations. These included datasets from NCBI’s Conserved Domain Database as the alignment of these sequences are curated by hand according to experimental data of the function of the proteins. The test datasets also included datasets from BAliBASE, as again it is a curated dataset designed to be a dataset of alignment standards. From this we choose subsets that covered a range of divergences. Alignment computation Given a protein embedding method, E , and two protein sequences, we use the classic dynamic programming Smith and Waterman (1981) algorithm to compute the local alignment, using the affine gap penalty variant of Gotoh (1982) . The score between two residues a 1 and a 2 is the E -score we introduced in Ashrafzadeh et al. (2024) , given by the cosine similarity between the vectors computed by E for the two residues: For a fixed method E , such an Ankh, we obtain Ankh-score. The penalties are empirically set at −1 for gap opening and −0.1 for gap extension. Note that the scoring function of Iovino and Ye (2024) is essentially the same; they multiply the cosine by 10, but keeping the original cosine and dividing their penalties by 10 produces the same results. (They use slightly different gap penalties, −11 and −1, which correspond to our −1.1 and −.1.) In order to correctly evaluate the quality of a given local alignment, we need to calculate its distance to the reference alignment using a proper distance metric. Distance metrics require the alignments to be between the same sequences. Therefore, we need an algorithm that computes, from any alignment produced by some method, a local alignment between the same sequences as the reference. The idea is to eliminate sequences outside reference and also pad the computed alignment, if necessary, on either side with the worst alignment of the missing subsequences, denoted Worst(·,·), which is the alignment of length the sum of the lengths of the two sequences, with each sequence contiguous and aligned with gaps only, e.g., The local alignment computation algorithm is given below. LocalAlignment Download figure Open in new tab For clarity, we provide below an example of running the algorithm. Consider the following protein sequences p 1 and p 2 containing the domain sequences, s 1 and s 2 , shown in red, having the reference alignment : Download figure Open in new tab Assume that the E -score local alignment algorithm produces the alignment between the substrings t 1 and t 2 , indicated by curly braces above and below the sequences; the red parts indicate the overlaps, o 1 and o 2 , with the reference domains: Download figure Open in new tab The alignment, , of the overlaps induced by , the extra sequences to the left and to the right, and the final alignment, , of the domain sequences, s 1 and s 2 , induced by the E -score alignment are given below: Download figure Open in new tab The evaluation of the quality of the alignment produced by this process consists of measuring the distance between the alignment and the reference alignment . Note that both of these alignments are between the same reference domain sequences s 1 and s 2 . Comparison procedure Assume a fixed protein embedding method. Given a domain and two methods for computing local alignment, we perform a number of specified tests, each consisting of a pair of sequences. The local alignment is computed using both methods and then the results are compared according to their distances to the reference alignment, as induced by the domain’s MSA for the given sequences. Potentially the alignments could differ in multiple ways and hence we used five distance metrics to evaluated the difference in the alignments. Three of those are from Blackburne and Whelan (2012) – d ssp , d seq , d pos – and two newly introduced in our global alignment study ( Ashrafzadeh et al. 2024 ) – d d , d cc . The former three are true metrics inspired from the sum-of-pairs score, differing from each other in the way they consider gaps: ignore, label by sequence, and label by sequence and position, resp. This makes d pos the most relevant of the three. The other two are very different; d d , relative displacement distance, evaluates all position differences of all residues; d cc , closest context distance, takes the middle ground, by measuring the distance to the closest position with the same context. After evaluating all tests as explained above, we report, for each domain and each distance, two values for each method: (i) the average distance between local alignments computed by the method to the corresponding reference alignment and (ii) the number of tests won by the method. Except for rare cases, a method prevails with respect to both the average distance and the number of tests won. Methods are always compared in pairs, in order to enable the use of the Wilcoxon signed-rank test; we consider p-values smaller than 0.01 significant. Values higher than this threshold are shown in red. Score modifications We have explained above how alignment quality is evaluated. Concerning the alignment localization, the above evaluation penalizes the situation when a computed alignment misses part of the reference alignment by replacing the missing part with worst alignment of the subsequences missed. If, however, the computed alignment extends outside the reference alignment, the length of the overhangs cannot be measured properly because there is no golden standard for that. Instead, we offer the user the possibility of adjusting the size of the overhangs. If we make the scores more negative by shifting them down by a fixed value, then the size of the overhangs decreases. Fig. 1 shows how various shifts affect the amount of alignment outside the reference compared to what BLOSUM matrices produce. The quality of the alignments is little affected by using the shifts, therefore this is a good way for the user to adjust the alignment length, if desired. Download figure Open in new tab Fig. 1. Alignment overhangs. Average lengths of the alignment overhangs for the CDD natural dataset are given for all BLOSUM matrices and several shift values of Ankh-score. Results Model selection We have first tested several embedding methods to see which one works best for our task. Supplementary Table 13 gives the comparison of seven top methods on the CDD natural dataset. For all five distances, Ankh-score and ProtT5-score are the best, with Ankh-score usually first, followed at some distance by ESM2. Therefore, we are going to use Ankh-score as our scoring function. Note that the statistical significance of the comparison between Ankh-score and ProtT5-score is assessed thoroughly later in this section, when we compare the two on eight datasets. Competing methods The candidates for comparison include the standard for scoring protein alignments, the BLOSUM matrices, which we feel should be replaced by the new method, specialized matrices such as GPCRtm, and existing top methods, PEbA ( Iovino and Ye 2024 ), ProtT5-score ( Ashrafzadeh et al. 2024 ), DEDAL ( Llinares-López et al. 2023 ), vcMSA ( McWhite et al. 2023 ), and pLM-BLAST ( Kaminski et al. 2023 ). The first evaluation we did was to assess how much of the alignments are missed, on the average, by each method. We used the CDD natural dataset for this purpose and the results are plotted in Fig. 2 ; all values are given in Supplementary Table 14. We do not compare with vcMSA because it is a heuristic version of PEbA or E-score, therefore generally weaker, as it is intended for multiple sequence alignments, therefore unable to use dynamic programming. Also, as already mentioned PEbA uses esentially the same score as ProtT5-score, therefore, comparing against ProtT5-score will cover both. We are not comparing with pLM-BLAST due to its poor performance covering alignments, as explained above. Download figure Open in new tab Fig. 2. Missed alignments. Average percentage of the alignments missed by each scoring method on the CDD natural dataset. For pLM-BLAST we tested several values (100, 60, 20) for the minimum alignment parameter, attempting to reduce the amount of alignments missed. Comparison against BLOSUM First, we give an idea of what the scores look like in Fig. 3 , where we plotted the actual scores for both methods, Ankh-score and BLOSUM, for two typical examples, one natural example and one insert. It is quite striking how clearly visible the alignments are in Ankh-score plots, whereas in the BLOSUM plots, for the insert case the alignment is barely visible, while in the natural case it cannot be detected visually. Download figure Open in new tab Fig. 3. Score examples. Ankh-scores (left column) and BLOSUM scores (right column) are given for two examples, a natural one (top row) and an insert one (bottom row). As we aim to replace BLOSUM matrices by Ankh-score, we perform a thorough comparison against the five common BLOSUM matrices – 45, 50, 62, 80 and 90 – on all datasets from Table 1 (except GPCR). We give as an example the comparison on the CDD natural datasets for the first three distances in Table 3 . Full results for all datasets and all distances are given in Supplementary Tables 15-30, including also the BLOSUM matrices that produced the best results. View this table: View inline View popup Download powerpoint Table 3. Ankh-score vs best BLOSUM on CDD natural dataset, for the first three distances. The full results are summarized by the dot plots in Fig. 4 . It can be seen from the plots that Ankh-score clearly dominates the best BLOSUM scores for each case. Download figure Open in new tab Fig. 4. Ankh-score vs best BLOSUM. One row for each natural dataset gives the distribution of each of the five distances, for the alignments produced using Ankh-score (blue) and the best BLOSUM matrix for each test (red). The legend gives the number of tests won by each. We give also the summaries for the number of domains won by each in Table 4 and the average percentage of the tests won for each dataset in Table 5 . The number of BLOSUM matrices, by dataset, achieving the best results is given in Table 6 . As expected, more higher order BLOSUM matrices produce the best performance for datasets with higher amino acid identity level. View this table: View inline View popup Download powerpoint Table 4. Summary of Ankh-score vs best BLOSUM for all datasets: number of domains won by each method for each dataset. View this table: View inline View popup Download powerpoint Table 5. Summary of Ankh-score vs best BLOSUM for all datasets: number of domains won by each method for each dataset. View this table: View inline View popup Download powerpoint Table 6. Best BLOSUMs. The number of BLOSUM matrices, by dataset, achieving the best results. Comparison against GPCRtm We compared first the GPCRtm matrix with the best BLOSUM matrix and GPCRtm is the winner; see Supplementary Tables 31 and 32. A more fair comparison is against a single BLOSUM matrix. As seen in Supplementary Table 32, BLOSUM62 is most often the best matrix. The comparison between GPCRtm and BLOSUM62 is given in Supplementary Table 33 and indicates GPCRtm as a very clear winner. Finally, we compared Ankh-score against GPCRtm in Supplementary Table 34 and it wins by a very large margin. Comparison against ProtT5-score and PEbA We already mentioned that the PEbA and ProtT5-score produce the same results. ProtT5-score is the closest competitor, therefore we compared Ankh-score against ProtT5-score on all eight datasets, natural and insert. The dot plots are shown in Supplementary Fig. 1 and all they indicate is that the performances are close. We need the actual numbers to decide the winner. Full comparison is given in Supplementary Tables 34-42. We give here the summaries for the number of domains won by each in Table 7 and the average percentage of the tests won for each dataset in Table 8 . The numbers present a very interesting picture. Ankh-score is very clearly the winner for the natural datasets, which are the most relevant. For the insert data, Ankh-score is still winning, but not as clearly. This behaviour is considered further in the Discussion section. View this table: View inline View popup Download powerpoint Table 7. Summary of Ankh-score vs ProtT5-score for all datasets: number of domains won by each method for each dataset. View this table: View inline View popup Download powerpoint Table 8. Summary of Ankh-score vs ProtT5 for all datasets: number of domains won by each method for each dataset. Comparison against DEDAL The comparison of Ankh-score against DEDAL is given as dot plots in Supplementary Fig. 2 for all natural datasets, with full details in Supplementary Tables 43-46 and they show that Ankh-score is clearly the winner. The comparisons indicate a similar situation as with BLOSUM matrices, therefore we have not performed the comparison using the insert data. Comparison against BLOSUM on common overlap In the light of our results for missed alignments in Fig. 2 and considering that our rigorous comparison of alignments penalizes for such missed alignment segments, we need to evaluate whether the advantage of Ankh-score over BLOSUM matrices is due to this penalization or actual alignment quality. We compared Ankh-score with the commonly best matrix, BLOSUM45, on the common alignments. That is, the alignments were first computed by each method and only the intersection among both computed alignments and the reference alignment was considered for comparison. (Note that we need to first fix a BLOSUM matrix in order to be able to compute the common alignments.) The results are given in Supplementary Table 47 and they indicate fairly similar performance with the original comparison. Protein embeddings improvements We have tried to improve the performance of the embedding vectors by combining them in various ways. We obtained a successful combination between ProtT5 and ESM2 models, using the average of the normalized vectors. As shown in Supplementary Table 48, ProtT5+ESM2-score is better than ProtT5-score. Ankh-score still outperforms the combination quite clearly, as seen in Supplementary Table 49. Interestingly, Ankh does not appear to combine well with other methods. Discussion We have compared the new algorithm, that uses Ankh-score, rigorously using a large number of tests against the current standard of BLOSUM matrices, as well as the closest competitor, ProtT5-score/PEbA. The datasets include natural data, where domains occur naturally inside protein sequences, and insert data, where we insert domains inside random protein sequences. This was done to cover a wide range of amino acid identity levels, 8-66%; see Table 1 . The tests were evaluated using the Wilcoxon signed-rank test for statistical significance, with the p-value threshold set conservatively at 0.01. Ankh-score shows far better performance than the best BLOSUM matrices, winning all natural data cases. Only for three domains of the BALiBASE RV11 dataset and a single case of BALiBASE RV12, the p-values are not significant, shown in red in Supplementary Tables 17 and 19. Of these, only two cases have p-value above 0.05. Ankh-score performs much better than best BLOSUM matrices also for the insert data, the winning margin becoming smaller for the cases with higher identity. This is to be expected as BLOSUM matrices are good at handling higher identity cases, as there are fewer divergent regions and the contextual and long-range dependencies captured by Ankh are less critical. This is captured well by the top row of Fig. 5 . The plots indicate the percentage of wins of Ankh-score over best BLOSUM for, from left to right, natural, insert, and all data. The difference is the largest for natural data, then decreases for insert data and with increasing identity level. Importantly, the performance of Ankh-score stays higher than that of the best BLOSUM matrices at all identity levels tested. Note also the different identity range of the first plot. The magnitude of the advantage of Ankh-score is shown in the top row of Fig. 6 ; note the different range of both the abscissa and ordinate axes for the first plot. Download figure Open in new tab Fig. 5. Wins percentage over identity level. The plots show the percentage of wins of Ankh-score vs best BLOSUM (top row) and ProtT5-score (bottom row), for natural (left), insert (middle), and all data (right). Two curves are given for each of the five distances, one solid curve for Ankh-score and a dotted of the same colour for the competitor. Download figure Open in new tab Fig. 6. Distance ratio over identity level. The plots show the distance ratio, for all five distance, between Ankh-score vs best BLOSUM (top row) and ProtT5-score (bottom row), for natural (left), insert (middle), and all data (right). Ratio higher than 1 means Ankh-score is better than the competitor. Compared against the closest competitor, Ankh-score performs, remarkably, consistently better than ProtT5-score. The number of cases that are not statistically significant is much higher than in the BLOSUM comparison – see the p-values in red in Supplementary Tables 35-42. Ankh-score dominates ProtT5-score at all identity levels, as seen in the bottom row of Fig. 5 . The magnitude of the advantage is shown in the bottom row of Fig. 6 . Of the five distances considered, the last three, d pos , d seq , d ssp , are true metric distances built on the sum-of-pairs idea. The difference is that d ssp ignores gaps, d seq considers gaps labelled by sequences, whereas d pos labels gaps by both sequence and position. Therefore, d pos is the most relevant of the three, however their behaviour is fairly consistent. The other two distances are very different and they exhibit different behaviour from the three, as well as from each other. The five distances together give a better general picture of the comparison. As seen in Tables 7 and 8 , Ankh-score is winning everywhere for the last three distance, but ProtT5-score has some wins for the first two distance, particularly d d . The full meaning of this behaviour is yet to be determined. Our results have interesting implications on the behaviour of the two models, Ankh and ProtT5, themselves. Comparing the first two plots of the bottom row of Fig. 5 , we see that the difference is much smaller between the Ankh and ProtT5 sets of curves, indicating that Ankh behaves significantly better for the natural domains, compared with insert ones. This phenomenon is most visible when comparing the ‘CDD natural’ and ‘CDD insert’ rows of Table 7 . A possible interpretation is as follows. The embedding models are trained in unsupervised manner on very large number of (natural) protein sequences. It is therefore to be expected that the insert sequences may confuse the models somewhat, as being unnatural. Our results show that Ankh is significantly more confused than ProtT5 is. This can be interpreted as indication that Ankh better captures the true “language” of proteins. Conclusions The methods discussed here are a massive improvement for the analysis of biological sequences. The reconstruction of accurate protein alignments are a first step and a basic step in almost all studies. Alignments are necessary to detect homologues between species and to employ comparative data. These, in turn, aid structural predictions, functional annotations, and domain identification. Active site identification and annotation of pathogenic mutations rely on precise alignments. This knowledge contributes to deeper insights in areas like drug design and disease diagnostics. The detection of remote homologs is a prerequisite for evolutionary studies. Accurate local protein sequence alignments are the foundation of virtually every bioinformatics and comparative-genomics analysis. A global alignment is restricted to end-to-end alignments regardless of the homology present. The ends of sequences are known to evolve rapidly with little selective constraint. This leads to fragments of irrelevant sequences at the ends of sequences and hinders the correct alignment of subsequences. Similarly, proteins are often composed of multiple functional domains. But their alignment is needlessly complicated by global alignments that must match the entire sequence rather than local alignments that align just the functional domains. Most alignment methods use a scoring matrix to evaluate the differences in potential alignments. The most commonly used of these matrices is the empirically determined BLOSUM matrix. This is a static matrix that was determined decades ago from active site blocks. All such empirical matrices suffer from amino acid biases dues to over/under-represented amino acids in the training blocks. BLOSUM assumes uniform substitution rates and assumes no context dependencies. Here we have used embedding vectors from every known protein sequence. These vectors capture far more of the biology than just what amino acid is present. It includes context for each amino acid and knowledge from similar proteins as well as from distantly related proteins. This information reflects not just amino acid presence but substitution patterns and selective constraints. Moving beyond generic substitution matrices allows alignments to more faithfully reflect the true, context-dependent evolutionary and structural constraints acting on proteins, thereby enhancing every downstream inference. The alignments using embedding vectors clearly provide alignments that are as good as the standard methods used and, in almost all cases, far superior. This is due to embedding vectors providing a vast amount of information that relate the biology to the alignment. Alignments are constrained to by the function of the protein and by the phylogenetic history between the two species. With a slightly different function, the proteins can explore more of sequence space without compromising their function. With a great divergence between the species the sequences can explore greater levels of structural divergence without a loss of function. The embedding vectors incorporate individual amino acid differences (as done in a BLOSUM matrix) but in addition, they also include contextual information including some sense of the three-dimensional structure. This information and more is combined into the embedding vector to produce alignments that respect the biological constraints and can therefore produce very accurate alignments. The last comment concerns the interplay between structure prediction and alignments. AlphaFold ( Jumper et al. 2021 ; Abramson et al. 2024 ) does a much better job at the alignment of distantly related sequences than does BLOSUM ( Lesk and Konagurthu 2022 ; Rajapaksa et al. 2023 ). But embedding vectors do a similar or better job at this same problem. In addition, embedding vectors can do a better job when less distant sequences are aligned and, arguably, a better job than BLOSUM with very similar sequences. The reason for this is that embedding vectors incorporate features of the structures from the proteins from which they were constructed and also the finer features of individual amino acid differences. Furthermore, AlphaFold provides an estimate of the structure but often has levels of uncertainty in these structures that are otherwise ignored. Even with a well supported structure, such a structure is not static and can change with different environmental conditions or as the protein binds other moieties to function. Last but not least, AlphaFold relies on sequence alignments to generate its highly accurate protein structure predictions. Availability of data and materials The new method, protocol, and data are freely available as a web server at e-score.csd.uwo.ca and as source code at github.com/lucian-ilie/E-score. Supplementary information Supplementary information is available online: localAlign supp.pdf. Author contributions J.M. selected the data, wrote the software and web server, performed all tests, including installing and running the necessary libraries, and contributed to the methodology and analysis. G.B.G. advised on choosing the data and contributed to performing and writing the biological analysis. L.I. proposed the study, designed the methodology, analyzed the results, and wrote the manuscript. Funding This work was supported by NSERC Discovery [RGPIN 2021-03978 to L.I., RGPIN-2020-05733 to G.B.G.]. Declarations Ethics approval and consent to participate Not applicable. Consent for publication Not applicable. Competing interests The authors declare no competing interests Acknowledgements All our computations were performed on Digital Research Alliance of Canada servers. We would like to thank Paul Moore for helping with some of the code for automating testing on the server. Funder Information Declared Natural Sciences and Engineering Research Council , RGPIN 2021-03978 , RGPIN-2020-05733 Footnotes https://github.com/lucian-ilie/E-score https://e-score.csd.uwo.ca/ References ↵ Abramson J , Adler J , Dunger J , Evans R , Green T , Pritzel A , et al. Accurate structure prediction of biomolecular interactions with AlphaFold 3 . Nature . 2024 ; 630 ( 8016 ): 493 – 500 . OpenUrl CrossRef PubMed ↵ Altschul SF , Gish W , Miller W , Myers EW , Lipman DJ . Basic local alignment search tool . Journal of molecular biology . 1990 ; 215 ( 3 ): 403 – 410 . OpenUrl CrossRef PubMed Web of Science ↵ Altschul SF , Madden TL , Schäffer AA , Zhang J , Zhang Z , Miller W , et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs . Nucleic acids research . 1997 ; 25 ( 17 ): 3389 – 3402 . OpenUrl CrossRef PubMed Web of Science ↵ Asgari E , Mofrad MR . Continuous distributed representation of biological sequences for deep proteomics and genomics . PloS one . 2015 ; 10 ( 11 ): e0141287 . OpenUrl CrossRef PubMed ↵ Ashrafzadeh S , Golding GB , Ilie S , Ilie L. Scoring alignments by embedding vector similarity . Briefings in Bioinformatics . 2024 ; 25 ( 3 ): bbae178 . OpenUrl PubMed ↵ Blackburne BP , Whelan S. Measuring the distance between multiple sequence alignments . Bioinformatics . 2012 ; 28 ( 4 ): 495 – 502 . OpenUrl CrossRef PubMed Web of Science ↵ Dayhoff M , Schwartz R , Orcutt B. 22 a model of evolutionary change in proteins . Atlas of protein sequence and structure . 1978 ; 5 : 345 – 352 . OpenUrl ↵ Devlin J , Chang MW , Lee K , Toutanova K. Bert: Pre-training of deep bidirectional transformers for language understanding . arXiv preprint arXiv:181004805. 2018 ;. ↵ Elnaggar A , Essam H , Salah-Eldin W , Moustafa W , Elkerdawy M , Rochereau C , et al. Ankh: Optimized protein language model unlocks general-purpose modelling . arXiv preprint arXiv:230106568. 2023 ;. ↵ Elnaggar A , Heinzinger M , Dallago C , Rehawi G , Wang Y , Jones L , et al. Prottrans: Toward understanding the language of life through self-supervised learning . IEEE transactions on pattern analysis and machine intelligence . 2021 ; 44 ( 10 ): 7112 – 7127 . OpenUrl ↵ Gligorijević V , Renfrew PD , Kosciolek T , Leman JK , Berenberg D , Vatanen T , et al. Structure-based protein function prediction using graph convolutional networks . Nature communications . 2021 ; 12 ( 1 ): 3168 . OpenUrl PubMed ↵ Gotoh O. An improved algorithm for matching biological sequences . Journal of molecular biology . 1982 ; 162 ( 3 ): 705 – 708 . OpenUrl CrossRef PubMed Web of Science ↵ Heinzinger M , Elnaggar A , Wang Y , Dallago C , Nechaev D , Matthes F , et al. Modeling aspects of the language of life through transfer-learning protein sequences . BMC bioinformatics . 2019 ; 20 ( 1 ): 1 – 17 . OpenUrl CrossRef PubMed ↵ Henikoff S , Henikoff JG . Amino acid substitution matrices from protein blocks . Proceedings of the National Academy of Sciences . 1992 ; 89 ( 22 ): 10915 – 10919 . OpenUrl Abstract / FREE Full Text ↵ Hosseini S , Golding GB , Ilie L. Seq-InSite: sequence supersedes structure for protein interaction site prediction . Bioinformatics . 2024 01; 40 ( 1 ): btad738 . OpenUrl PubMed ↵ Iovino BG , Ye Y. Protein embedding based alignment . BMC bioinformatics . 2024 ; 25 ( 1 ): 85 . OpenUrl CrossRef PubMed ↵ Jumper J , Evans R , Pritzel A , Green T , Figurnov M , Ronneberger O , et al. Highly accurate protein structure prediction with AlphaFold . Nature . 2021 ; 596 ( 7873 ): 583 – 589 . OpenUrl CrossRef PubMed ↵ Kaminski K , Ludwiczak J , Pawlicki K , Alva V , Dunin-Horkawicz S. pLM-BLAST: distant homology detection based on direct comparison of sequence representations from protein language models . Bioinformatics . 2023 ; 39 ( 10 ): btad579 . OpenUrl CrossRef PubMed ↵ Kulmanov M , Hoehndorf R. DeepGOPlus: improved protein function prediction from sequence . Bioinformatics . 2020 ; 36 ( 2 ): 422 – 429 . OpenUrl CrossRef PubMed ↵ Lai B , Xu J. Accurate protein function prediction via graph attention networks with predicted structure information . Briefings in Bioinformatics . 2022 ; 23 ( 1 ): bbab502 . OpenUrl CrossRef PubMed ↵ Lesk AM , Konagurthu AS . Protein structure prediction improves the quality of amino-acid sequence alignment . Proteins: Structure, Function, and Bioinformatics . 2022 ; 90 ( 12 ): 2144 – 2147 . OpenUrl ↵ Liu Y , Ott M , Goyal N , Du J , Joshi M , Chen D , et al. Roberta: A robustly optimized bert pretraining approach . arXiv preprint arXiv:190711692. 2019 ;. ↵ Llinares-López F , Berthet Q , Blondel M , Teboul O , Vert JP . Deep embedding and alignment of protein sequences . Nature methods . 2023 ; 20 ( 1 ): 104 – 111 . OpenUrl PubMed ↵ Manfredi M , Savojardo C , Martelli PL , Casadio R. ISPRED-SEQ: Deep neural networks and embeddings for predicting interaction sites in protein sequences . Journal of Molecular Biology . 2023 ; 435 ( 14 ): 167963 . OpenUrl CrossRef PubMed ↵ Marchler-Bauer A , Derbyshire MK , Gonzales NR , Lu S , Chitsaz F , Geer LY , et al. CDD: NCBI’s conserved domain database . Nucleic acids research . 2015 ; 43 ( D1 ): D222 – D226 . OpenUrl CrossRef PubMed ↵ McWhite CD , Armour-Garb I , Singh M. Leveraging protein language models for accurate multiple sequence alignments . Genome Research . 2023 ; 33 ( 7 ): 1145 – 1153 . OpenUrl Abstract / FREE Full Text ↵ Mikolov T , Chen K , Corrado G , Dean J. Efficient estimation of word representations in vector space . arXiv preprint arXiv:13013781. 2013 ;. ↵ Nambiar A , Heflin M , Liu S , Maslov S , Hopkins M , Ritz A. Transforming the language of life: transformer neural networks for protein prediction tasks . In: Proceedings of the 11th ACM international conference on bioinformatics, computational biology and health informatics ; 2020 . p. 1 – 8 . ↵ Pándy-Szekeres G , Esguerra M , Hauser AS , Caroli J , Munk C , Pilger S , et al. The G protein database, GproteinDb . Nucleic Acids Research . 2022 ; 50 ( D1 ): D518 – D525 . OpenUrl CrossRef PubMed ↵ Pennington J , Socher R , Manning CD . Glove: Global vectors for word representation . In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP) ; 2014 . p. 1532 – 1543 . ↵ Peters ME , Neumann M , Iyyer M , Gardner M , Clark C , Lee K , et al. Deep contextualized word representations . arXiv:180205365. 2018 ;. ↵ Raffel C , Shazeer N , Roberts A , Lee K , Narang S , Matena M , et al. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer . arXiv preprint arXiv:191010683. 2020 ;. ↵ Rajapaksa S , Konagurthu AS , Lesk AM . Sequence and structure alignments in post-AlphaFold era . Current Opinion in Structural Biology . 2023 ; 79 : 102539 . OpenUrl CrossRef PubMed ↵ Rao RM , Liu J , Verkuil R , Meier J , Canny J , Abbeel P , et al. MSA transformer . In: International Conference on Machine Learning PMLR ; 2021 . p. 8844 – 8856 . ↵ Rives A , Meier J , Sercu T , Goyal S , Lin Z , Liu J , et al. Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences . Proceedings of the National Academy of Sciences . 2021 ; 118 ( 15 ): e2016239118 . OpenUrl Abstract / FREE Full Text ↵ Senior AW , Evans R , Jumper J , Kirkpatrick J , Sifre L , Green T , et al. Improved protein structure prediction using potentials from deep learning . Nature . 2020 ; 577 ( 7792 ): 706 – 710 . OpenUrl CrossRef PubMed ↵ Smith TF , Waterman MS . Identification of common molecular subsequences . Journal of molecular biology . 1981 ; 147 ( 1 ): 195 – 197 . OpenUrl CrossRef PubMed Web of Science ↵ Thompson JD , Koehl P , Ripp R , Poch O. BAliBASE 3.0: latest developments of the multiple sequence alignment benchmark . Proteins: Structure, Function, and Bioinformatics . 2005 ; 61 ( 1 ): 127 – 136 . OpenUrl ↵ UniProt Consortium T . UniProt: the universal protein knowledgebase . Nucleic acids research . 2018 ; 46 ( 5 ): 2699 – 2699 . OpenUrl CrossRef PubMed ↵ Yang J , Anishchenko I , Park H , Peng Z , Ovchinnikov S , Baker D. Improved protein structure prediction using predicted interresidue orientations . Proceedings of the National Academy of Sciences . 2020 ; 117 ( 3 ): 1496 – 1503 . OpenUrl Abstract / FREE Full Text ↵ Yang Z , Dai Z , Yang Y , Carbonell J , Salakhutdinov RR , L. QV. XLNet: Generalized autoregressive pretraining for language understanding . Advances in neural information processing systems . 2019 ; 32 . View the discussion thread. Back to top Previous Next Posted July 27, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Protein Embeddings and Local Alignments Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Protein Embeddings and Local Alignments Julia Malec , G. Brian Golding , Lucian Ilie bioRxiv 2025.07.23.666343; doi: https://doi.org/10.1101/2025.07.23.666343 Share This Article: Copy Citation Tools Protein Embeddings and Local Alignments Julia Malec , G. Brian Golding , Lucian Ilie bioRxiv 2025.07.23.666343; doi: https://doi.org/10.1101/2025.07.23.666343 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41936) Biophysics (21452) Cancer Biology (18588) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15153) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.