Structome-TM: Complementing dataset assembly for structural phylogenetics by addressing size-based biases

preprint OA: closed CC-BY-NC-ND-4.0
Full text 21,675 characters · extracted from preprint-html · click to expand
Structome-TM: Complementing dataset assembly for structural phylogenetics by addressing size-based biases | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Structome-TM: Complementing dataset assembly for structural phylogenetics by addressing size-based biases View ORCID Profile Ashar J. Malik , View ORCID Profile David B. Ascher doi: https://doi.org/10.1101/2025.02.08.637224 Ashar J. 
Malik 1 School of Chemistry and Molecular Biosciences, The University of Queensland , Brisbane, Australia 2 Australian Centre for Ecogenomics, The University of Queensland , Brisbane, Australia 3 Computational Biology and Clinical Informatics, Baker Heart and Diabetes Institute , Melbourne, Victoria, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ashar J. Malik For correspondence: ashar.malik{at}uq.edu.au d.ascher{at}uq.edu.au David B. Ascher 1 School of Chemistry and Molecular Biosciences, The University of Queensland , Brisbane, Australia 2 Australian Centre for Ecogenomics, The University of Queensland , Brisbane, Australia 3 Computational Biology and Clinical Informatics, Baker Heart and Diabetes Institute , Melbourne, Victoria, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for David B. Ascher For correspondence: ashar.malik{at}uq.edu.au d.ascher{at}uq.edu.au Abstract Full Text Info/History Metrics Preview PDF Abstract Harnessing the explosion of protein structure data to uncover deep evolutionary relationships requires effective comparison methods. While widely used global alignment techniques are powerful, they can fail to identify homologous structures that differ significantly in size or domain architecture. To address this limitation, we introduce Structome-TM, a web resource for assembling datasets for distance-based phylogenetic reconstruction. Making use of the TM-score to prioritize local structural similarity, Structome-TM excels at identifying these otherwise obscured relationships, allowing users to build a comprehensive structural neighbourhood of proteins suitable for comparison. To facilitate this dataset assembly, the resource accepts both RCSB PDB identifiers and protein sequences as inputs.
When querying using a protein sequence, protein structures are predicted in real-time and their respective neighbourhoods determined, enabling analysis where experimentally determined structures may not be available. Through its user-friendly interface, Structome-TM provides a powerful and necessary approach for a more comprehensive exploration of protein evolution. This resource is freely available at: https://biosig.lab.uq.edu.au/structome_tm/ . Introduction The large-scale availability of protein structure data has revolutionized the exploration of deep phylogenetic relationships [ 1 , 2 ]. These relationships are typically determined in a two-part process involving: (1) gathering structures to define the taxon set and (2) comparing these structures to infer evolutionary relationships. The Structome suite is intended to streamline this process. The first tool developed in this suite, Structome-Q (originally published as Structome [ 3 ]), uses the Q-score metric from GESAMT [ 4 ] to assess global structural similarity. It efficiently identifies related proteins and performs the necessary pairwise comparisons to construct a neighbour-joining tree. The Q-score metric is defined as: $$Q = \frac{N_{\mathrm{align}}^{2}}{\left(1 + \left(\mathrm{RMSD}/R_{0}\right)^{2}\right) N_{1} N_{2}}$$ where $N_{\mathrm{align}}$ is the number of aligned residue pairs, RMSD is the root-mean-square deviation of the alignment, $R_{0}$ is a scaling parameter (set to 3.0 Å), and $N_{1}$ and $N_{2}$ are the amino acid counts of the two structures. The normalisation by the product of the protein lengths ($N_{1} N_{2}$) makes the Q-score a robust measure of global structural similarity. However, this inherently penalizes alignments where structures share only localized regions of similarity, such as proteins with partial domain matches. As a result, these biologically significant homologues may rank poorly and could be overlooked.
In contrast, the TM-score [ 5 ] adopts a different approach: $$\mathrm{TM\text{-}score} = \max\left[\frac{1}{L_{\mathrm{target}}} \sum_{i=1}^{N} \frac{1}{1 + \left(d_{i}/d_{0}\right)^{2}}\right]$$ where $L_{\mathrm{target}}$ is the length of the target protein, $N$ is the number of aligned amino acids, $d_{i}$ is the distance between aligned amino acid pairs, and $$d_{0} = 1.24\,\sqrt[3]{L_{\mathrm{target}} - 15} - 1.8$$ is a length-dependent scaling distance. The implementation of this metric within Foldseek [ 6 ] prioritizes local similarity by normalizing for alignment length rather than overall protein size. This is particularly advantageous for identifying meaningful alignments between multi-domain proteins that may only share a single domain with the query, highlighting relationships underestimated by the Q-score. The latest resource in this suite, Structome-TM, therefore provides this powerful local alignment methodology. By offering complementary search strategies, the Structome suite provides researchers with a more comprehensive and nuanced toolkit for structural phylogenetics. To clarify the distinctions between these web resources, their features are summarized in Table 1 . View this table: View inline View popup Download powerpoint Table 1: Comparison of Structome-Q with Structome-TM. The table highlights the key differences in methodology and functionality between the original published tool, the updated Structome-Q, and the new Structome-TM. Methods The architecture of Structome-TM builds upon the framework established by Structome-Q [ 3 ]. The methodology can be divided into three main components: dataset construction, the search algorithm, and the web server implementation. Dataset Construction The foundation of Structome-TM is a representative, non-redundant set of protein structures derived from the RCSB Protein Data Bank [ 7 ] (PDB; www.rcsb.org ). To curate this dataset, all PDB entries were first filtered to remove short peptides, retaining only proteins longer than 50 amino acids. This collection was then clustered at 90% sequence identity using USEARCH [ 8 ]. Each resulting cluster is represented by a single centroid structure, which serves as a proxy for all its members.
This approach makes the subsequent all-versus-all comparison computationally tractable. Structural Search and Analysis To enable rapid querying, an all-versus-all pairwise comparison of all centroids in the database was performed using Foldseek [ 6 ], with the resulting TM-scores stored. When a user submits a PDB structure as a query, it is first mapped to its pre-calculated cluster centroid for which the pre-computed results are returned. To broaden accessibility, Structome-TM also accepts a protein sequence. A submitted sequence is folded into a 3D model in real-time using ESMFold [ 9 ]. The resulting predicted structure is then used as the query against the full RCSB PDB. In addition to returning hits, a dedicated column is populated indicating whether the target protein is a member of the core Structome-TM dataset. Additionally, results for both structure- and sequence-based queries are annotated with data from several key resources, including SCOP [ 10 ], CATH [ 11 ], ECOD [ 12 ], and the NCBI taxonomy database [ 13 ]. Web Server Implementation The web application is built using the Flask Python framework and is deployed within Docker containers managed by an Nginx web server. The front-end is developed with standard HTML, CSS, and JavaScript, utilizing AJAX. Interactive 3D visualizations of protein structures are rendered using the Mol* viewer [ 14 ]. For phylogenetic analysis, neighbour-joining trees are generated from the “1 - TM-score” distance metric using Biopython and are rendered as interactive diagrams in the browser via the D3.js library. Usage Structome-TM offers two query modes, structure-based and sequence-based, each tailored to a different input type, but both leading to a common interface for interactive analysis and phylogenetic tree generation. Structure-based Query When a PDB identifier is submitted, the search is performed against the curated set of Structome-TM centroids. Results are displayed in a sortable table listing hits with a TM-score > 0.1.
Clicking any row expands it to show the full list of PDB IDs within that cluster and displays an interactive 3D superposition of the query structure against the hit centroid using the Mol* visualiser. Sequence-based Query For protein sequence submissions, the resource first predicts the query’s structure and then searches it against the complete PDB dataset (as of June 2025). The resulting table includes an additional, sortable column with a binary marker. This marker indicates whether a given hit is also a representative centroid within the core Structome-TM dataset. Statistics and Analysis For both query modes, an accompanying histogram summarizes the distribution of TM-scores, providing a rapid overview of result quality. Users can then select hits via checkboxes to assemble a custom dataset for phylogenetic analysis. A key difference between the two modes is the composition of this dataset: the structure-based search allows only curated centroids to be used for tree reconstruction, whereas the sequence-based search allows any target protein to be included, regardless of its centroid status. Upon submission, a neighbour-joining tree is generated using “1 - TM-score” as the distance metric. The final tree is rendered as an interactive diagram, allowing inspection of individual leaf labels and download in Newick format for further annotation. Case Study: Identifying Homology in Multi-Domain Proteins To demonstrate how Structome-TM captures meaningful alignments missed by global methods, hemoglobin from Anser indicus (a single globin domain) was compared with flavohemoglobin from Saccharomyces cerevisiae ( Figure 1 ). Flavohemoglobin is a larger, multi-domain protein, comprising a globin domain fused to an FAD-binding domain [ 15 ]. Download figure Open in new tab Figure 1.
Protein structure alignment between hemoglobin from Anser indicus (RCSB PDB accession 1a4f, chain A, 141 amino acids, shown in green) and flavohemoglobin from Saccharomyces cerevisiae (RCSB PDB accession 4g1b, chain A, 399 amino acids, shown in yellow). The left panel illustrates the alignment from Structome-Q using the Q-score metric, while the right panel shows the alignment from Structome-TM using TM-score. The Q-score for this alignment is 0.213, whereas the TM-score is 0.803. The lower similarity score from Q-score reflects its global normalization accounting for overall protein sizes. A score of 0.213 indicates that this hit will probably be lost in the background noise. Due to this size mismatch, the global alignment approach assigns a low Q-score of 0.213. This score is low enough to be indistinguishable from background noise, meaning this significant homologous relationship would likely be missed. In contrast, by focusing on the shared local similarity, a high TM-score of 0.803 is obtained. This result demonstrates how Structome-TM can successfully build a more comprehensive structural neighbourhood. It ensures that valuable homologous relationships within multi-domain proteins are correctly identified, enabling more complete and accurate downstream phylogenetic analyses. Outlook Structome-Q and Structome-TM are designed to streamline the initial, and often most challenging, step of structural phylogenetics: the assembly of a relevant protein structure dataset. This dataset assembly is the essential first step for using both distance-based methods included within these resources and character-based phylogenetic methods, which use 3Di structural characters to infer maximum likelihood trees [ 16 ] and helper tools like Structome-AlignViewer [ 17 ]. Thus, while these resources provide a direct route to distance-based trees, their fundamental purpose is to empower researchers with the curated data needed for any downstream structural evolutionary analysis. 
Looking ahead, the data exploration process will be enhanced by incorporating advances in generative models. Future developments will explore the use of protein language model (pLM) embeddings as an alternative feature for structural comparison, alongside the planned integration of generative AI. This latter capability will enable automated textual summaries of search results, immediately highlighting common functional or taxonomic features within a structural neighbourhood. Such enhancements promise to transform this resource into an interactive knowledge discovery platform, accelerating insights into the evolution of protein structure and function. Availability and implementation Structome-TM is available at https://biosig.lab.uq.edu.au/structome_tm/ Funding DBA is supported by the investigator grant from the National Health and Medical Research Council (NHMRC) of Australia [GNT1174405] and the Victorian government Operational Infrastructure Support program. Footnotes This version adds: 1) a new feature of using protein sequence as query input which is folded at run-time using ESMFold and results are generated for it. This is documented. 2) a comprehensive table is added which documents improvements offered in this resource. Other changes include minor refinements in language. References [1]. ↵ Malik , A. J. , Poole , A. M. , and Allison , J. R. “ Structural phylogenetics with confidence ”. Molecular biology and evolution 37 . 9 ( 2020 ), pp. 2711 – 2726 . OpenUrl CrossRef PubMed [2]. ↵ Puente-Lelievre , C. , Malik , A. J. , and Douglas , J. “ Protein Structural Phylogenetics ”. Genome Biology and Evolution ( 2025 ). in press. doi: 10.1093/gbe/evaf139 . OpenUrl CrossRef [3]. ↵ Malik , A. J. et al. “ Structome: a tool for the rapid assembly of datasets for structural phylogenetics ”. Bioinformatics Advances 3 . 1 ( 2023 ), vbad134 . OpenUrl [4]. ↵ Krissinel , E. “ Enhanced fold recognition using efficient short fragment clustering ”.
Journal of molecular biochemistry 1 . 2 ( 2012 ), p. 76 . OpenUrl PubMed [5]. ↵ Zhang , Y. and Skolnick , J. “ TM-align: a protein structure alignment algorithm based on the TM-score ”. Nucleic acids research 33 . 7 ( 2005 ), pp. 2302 – 2309 . OpenUrl CrossRef PubMed Web of Science [6]. ↵ Kempen , M. van et al. “ Foldseek: fast and accurate protein structure search ”. Biorxiv ( 2022 ), pp. 2022 – 02 . [7]. ↵ Berman , H. M. et al. “ The protein data bank ”. Biological Crystallography 58 . 6 ( 2002 ), pp. 899 – 907 . OpenUrl PubMed [8]. ↵ Edgar , R. C. “ Search and clustering orders of magnitude faster than BLAST ”. Bioinformatics 26 . 19 ( 2010 ), pp. 2460 – 2461 . OpenUrl CrossRef PubMed Web of Science [9]. ↵ Lin , Z. et al. “ Evolutionary-scale prediction of atomic-level protein structure with a language model ”. Science 379 . 6637 ( 2023 ), pp. 1123 – 1130 . OpenUrl CrossRef PubMed [10]. ↵ Fox , N. K. , Brenner , S. E. , and Chandonia , J.-M. “ SCOPe: Structural Classification of Proteins—extended, integrating SCOP and ASTRAL data and classification of new structures ”. Nucleic acids research 42 . D1 ( 2014 ), pp. D304 – D309 . OpenUrl CrossRef PubMed Web of Science [11]. ↵ Sillitoe , I. et al. “ CATH: increased structural coverage of functional space ”. Nucleic acids research 49 . D1 ( 2021 ), pp. D266 – D273 . OpenUrl CrossRef PubMed [12]. ↵ Schaeffer , R. D. et al. “ ECOD: new developments in the evolutionary classification of domains ”. Nucleic acids research 45 . D1 ( 2017 ), pp. D296 – D302 . OpenUrl CrossRef PubMed [13]. ↵ Schoch , C. L. et al. “ NCBI Taxonomy: a comprehensive update on curation, resources and tools ”. Database 2020 ( 2020 ), baaa062 . OpenUrl CrossRef PubMed [14]. ↵ Sehnal , D. et al. “ Mol* Viewer: modern web app for 3D visualization and analysis of large biomolecular structures ”. Nucleic acids research 49 . W1 ( 2021 ), W431 – W437 . OpenUrl CrossRef PubMed [15]. ↵ Schuster , C. D. et al.
“ Globin phylogeny, evolution and function, the newest update ”. Proteins: Structure, Function, and Bioinformatics 92 . 6 ( 2024 ), pp. 720 – 734 . OpenUrl [16]. ↵ Puente-Lelievre , C. et al. “ Tertiary-interaction characters enable fast, model-based structural phylogenetics beyond the twilight zone ”. bioRxiv ( 2023 ), pp. 2023 – 12 . [17]. ↵ Malik , A. J. et al. “ Structome-AlignViewer: On Confidence Assessment in Structure-Aware Alignments ”. bioRxiv ( 2025 ), pp. 2025 – 05 . View the discussion thread. Back to top Previous Next Posted August 18, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Structome-TM: Complementing dataset assembly for structural phylogenetics by addressing size-based biases Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Structome-TM: Complementing dataset assembly for structural phylogenetics by addressing size-based biases Ashar J. Malik , David B. Ascher bioRxiv 2025.02.08.637224; doi: https://doi.org/10.1101/2025.02.08.637224 Share This Article: Copy Citation Tools Structome-TM: Complementing dataset assembly for structural phylogenetics by addressing size-based biases Ashar J. Malik , David B.
Ascher bioRxiv 2025.02.08.637224; doi: https://doi.org/10.1101/2025.02.08.637224 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41937) Biophysics (21452) Cancer Biology (18588) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15156) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-23T02:00:01.238055+00:00
License: CC-BY-NC-ND-4.0