Full text
53,747 characters
· extracted from
preprint-html
· click to expand
InteracTor: A new integrative feature extraction toolkit for improved characterization of protein structural properties | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results InteracTor: A new integrative feature extraction toolkit for improved characterization of protein structural properties View ORCID Profile Jose Cleydson F. Silva , View ORCID Profile Layla Schuster , Nick Sexson , View ORCID Profile Matias Kirst , View ORCID Profile Marcio F. R. Resende , View ORCID Profile Raquel Dias doi: https://doi.org/10.1101/2024.10.07.616705 Jose Cleydson F. Silva 1 Department of Microbiology and Cell Science, Institute of Food and Agricultural Sciences, University of Florida , 1052 Museum Road, 32611-0700, Gainesville, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jose Cleydson F. Silva Layla Schuster 1 Department of Microbiology and Cell Science, Institute of Food and Agricultural Sciences, University of Florida , 1052 Museum Road, 32611-0700, Gainesville, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Layla Schuster Nick Sexson 1 Department of Microbiology and Cell Science, Institute of Food and Agricultural Sciences, University of Florida , 1052 Museum Road, 32611-0700, Gainesville, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Matias Kirst 3 Genetics Institute, University of Florida , Gainesville, FL, USA 4 School of Forest, Fisheries and Geomatic Sciences, University of Florida , Gainesville, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Matias Kirst Marcio F. R. Resende 2 Horticultural Sciences Department, University of Florida , Gainesville, FL, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Marcio F. R. Resende Raquel Dias 1 Department of Microbiology and Cell Science, Institute of Food and Agricultural Sciences, University of Florida , 1052 Museum Road, 32611-0700, Gainesville, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Raquel Dias For correspondence: raquel.dias{at}ufl.edu Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Understanding the structural and functional diversity of protein families is crucial for elucidating their biological roles. Traditional analyses often focus on primary and secondary structures, which include amino acid sequences and local folding patterns like alpha helices and beta sheets. However, primary and secondary structures alone may not fully represent the complex interactions within proteins. To address this limitation, we developed a new algorithm (InteracTor) to analyze proteins by extracting features from their three-dimensional (3D) structures. The toolkit extracts interatomic interaction features such as hydrogen bonds, van der Waals interactions, and hydrophobic contacts, which are crucial for understanding protein dynamics, structure, and function. Incorporating 3D structural data and interatomic interaction features provides a more comprehensive understanding of protein structure and function, potentially enhancing downstream predictive modeling capabilities. By using the extracted features in Mutual Information scoring (MI), Principal Component Analysis (PCA), t-distributed Stochastic Neighbor Embedding (t-SNE), Uniform Manifold Approximation and Projection (UMAP), and hierarchical clustering analysis as use cases, we identified clear separations among protein structural families, highlighting distinct functional aspects. Our analysis revealed that interatomic interaction features were more informative than protein secondary structure features, providing insights into potential structural and functional properties. These findings underscore the significance of considering tertiary structure in protein analysis, offering a robust framework for future studies aiming at enhancing the capabilities of models for protein function prediction and drug discovery. Introduction In recent decades, high-throughput sequencing techniques have dramatically expanded protein sequence databases, while advances in cryo-electron microscopy and deep-learning-based computational structure determination methods, including AlphaFold [ 1 ] and RoseTTAFold [ 2 ], have transformed protein structure elucidation. This surge in available sequence and structural data has catalyzed the development of machine- and deep-learning techniques for predictive modeling. This data has been leveraged to address a variety of challenges, such as identifying non-classical secreted proteins [ 3 , 4 ], predicting binding affinity [ 5 – 7 ], and engineering proteins for novel functions [ 8 , 9 ]. Central to these algorithms is the selection of feature encodings, aimed at converting protein sequences and physiochemical properties into machine-readable formats. Ideally, this process captures the attributes most relevant to the predictive targets of interest. Sequence-based feature representations are among the most widely utilized, including amino acid composition, chemical property-based features, k-mers, and alignment-based embeddings. These descriptors effectively simplify sequence information and reduce the data dimensionality while still highlighting broader functional characteristics, sequence patterns, and evolutionary relationships. However, sequence-based methods can suffer from high dimensionality and data sparsity and are limited in their ability to capture critical properties influencing protein function. The incorporation of 3D structural data into the suite of available encodings allows predictive models to have a deeper layer of biological context that can give insight into functional dynamics. This has spurred the development of more comprehensive feature extraction platforms such as iFeatureOmega [ 10 ], Pfeature [ 11 ], and ProFeatX [ 12 ]. which incorporate both sequence and structural descriptors. While these descriptors have been foundational for many types of predictive modeling, the encoding of interaction features represents a significant yet underexploited strategy. These features, including dispersion forces, Van der Waals interactions, hydrophobic interactions, and repulsive interactions, add another dimension to the protein representation. Beyond informing protein-protein and protein-ligand interactions, interaction features within the same protein can significantly enhance the capabilities of models that predict protein function. Furthermore, such interaction-based descriptions may also enhance drug-discovery capabilities and provide insights into protein dynamics, folding, and stability. Here we present InteracTor, a new toolkit for the extraction of three types of protein feature encodings: interaction features, physicochemical features, and compositional features. Interaction features include hydrogen bonds, hydrophobic contacts, repulsive interactions, and van der Waals interactions, each encoding aspects of molecular dynamics that play an important role in governing protein function. Specifically, hydrogen bonds and hydrophobic contacts are important for stabilizing secondary and tertiary structures [ 13 , 14 ]. Van der Waals interactions influence molecular complementarity, which is crucial for substrate binding, and mediate transient interactions that can facilitate or destabilize protein structures and complexes [ 15 ]. Physiochemical property features include accessible solvent area, hydrophobicity, and surface tension, which are implicated in protein folding, stability, solubility, and protein-protein interactions. Compositional features include mono-, di-, and tripeptide composition and amino acid side chain chemical property (CPAASC) frequencies [ 16 , 17 , 18 ]. These features determine local spatial arrangements (secondary structure) and the overall 3D folded conformation (tertiary structure) of a protein through the formation of alpha helices, beta sheets, loops, and structural motifs. Our integrative feature encoding approach underscores the complexity and richness of proteins, pointing toward a more holistic approach in computational biology. Structural and sequence-based features are complementary and provide a more comprehensive representation of the protein, ultimately improving the performance of tasks like protein structure and function characterization. Materials and Methods Algorithm description The InteracTor algorithm consists of a sequence of steps for analyzing protein structures and calculating various features from proteins’ interatomic interactions, physicochemical properties of residues, and peptide composition ( Figure 1 ). The following is a description of each step: Download figure Open in new tab Figure 1: Pipeline of the InteracTor Algorithm. (A) Extensive database of PDB files, where each chain is separated into an individual PDB file. These files are converted to MOL2 format. (B) Molecular representation with carbon atoms in gray, oxygen in red, and nitrogen in blue. The following are calculated: Atom-atom distances (d) and van der Waals radii (r); Bond angles (a) and geometric centers (g); Interaction and atom types. (C) The following features are calculated: Interaction features ( Table 1 ); Mono-, di-, and tripeptide composition; Side chain physicochemical properties. (D) The resulting data are saved in table format, which can be used later for feature selection, dimensionality reduction, clustering, visualization, and machine learning. View this table: View inline View popup Table 1: Protein structural features based interatomic interactions and protein physicochemical properties. Step 1 . Extract atom, residue, and sequence information from PDB file ( Figure 1A ): This step involves parsing the Protein Data Bank (PDB) file to obtain the atomic types, 3D coordinates, and the amino acid sequence of the protein. Step 2 . Extract additional information from MOL2 file ( Figure 1A ): In this step, we extract additional information from the protein’s atoms, including covalent bond mapping and types (e.g., single, double, triple bonds, aromatic rings, etc.) and sp 2 hybridization. Step 3. Calculate atom-atom distances ( Figure 1B ): In this step, the algorithm calculates the distances between pairs of atoms not covalently bound within the protein structure, which is used to identify potential interatomic interactions within the protein structure. A distance threshold, as described by Dias et al., [ 5 ] is applied to determine whether atoms are interacting. The minimum allowed distance between each pair of atoms is the sum of their van der Waals (VDW) radii minus 0.7, and the maximum allowed distance is the sum of their radii plus 0.7. Step 4 . Calculate geometric centers ( Figure 1B ): Additional properties are extracted from atoms selected as potentially participating in non-covalent interactions (step 3) and covalently bound to multiple atoms (step 2). This includes the calculation of geometric centers between the selected atoms and their respective covalently bound atoms. These centers are utilized to calculate angles between hydrogen bond donor and acceptor atoms (step 5). Step 5. Calculate angles ( Figure 1B ): Here, the algorithm computes the angles between atoms or groups of atoms in the protein structure, which is used to evaluate hydrogen bonding geometry. Step 6 . Compute interatomic interaction features and protein physicochemical properties described in Table 1 and Figure 1C . Step 7. Extract compositional features ( Figure 1C ): InteracTor calculates the frequencies of single amino acids (monopeptide), pairs of amino acids (dipeptide), and triplets of amino acids (tripeptide) in the protein sequence. Step 8 . Extract CPAASC frequencies ( Table 2 and Figure 1C ): This step involves calculating the frequencies of various physicochemical properties associated with the side chains of the amino acids in the protein sequence. View this table: View inline View popup Table 2: Protein physicochemical features based on chemical properties of amino acid side chains (CPAASC). Step 9. Write results and postprocessing ( Figure 1D ): The final step is to write the features computed in steps 6, 7 and 8 to an output file for further analysis or use in other applications. The toolkit also includes example scripts for downstream analyses such as feature selection via MI scoring, dimensionality reduction via PCA, t-SNE, and UMAP, hierarchical clustering and visualization using heatmaps, and training machine learning algorithms. Implementation of protein structural features We implemented a toolkit for the extraction and encoding of protein structural features based on physico-chemical properties, amino acids composition, and interatomic interactions. By building upon our previous methods for predicting protein-ligand binding affinity [ 5 ], we modified the algorithm to analyze residue-residue interactions within protein structures ( Table 1 ). We also included the calculation of CPAASC frequencies [ 16 ] ( Table 2 ), monopeptide [ 18 ], dipeptide, and tripeptide composition [ 17 ] ( Table 3 ). In addition to the 20 classic amino acids, our algorithm supports rare residues found in distinct biological systems, including selenocysteine, pyrrolysine, and N-methylvaline. These amino acids, although rare, play important roles in specific biological processes [ 19 , 20 ]. View this table: View inline View popup Table 3: Protein sequence composition by k-mers frequencies features. Structural datasets and Preprocessing We used 20,877 protein structures from the PDB-REDO database [ 45 ] eliminated redundant (100% sequence similarity) and small proteins (<50 residues) using Biopython and in-house scripts (see Code and Data Availability). PDB files were converted to MOL2 format using Open Babel [ 48 ]. We then applied the InteracTor algorithm to extract features from both PDB and MOL2 files. Data selected for demonstration of use cases We utilized UniProt’s application programming interface (API) to extract Gene Ontology (GO) terms and protein family names based on the PDB accession number to facilitate analysis and demonstration of use cases [ 49 , 47 ]. We selected the most abundant protein families and GO terms in the dataset for further demonstration of use cases. Mapping scripts are available in GitHub as well (see Code and Data Availability). Feature selection We applied MI scoring to quantify the relevance of each feature in distinguishing between different protein families and GO categories [ 50 ]. We ranked the features by their respective MI scores and selected the top 100 most informative features for further analyses. Analysis of protein structural families We applied PCA to reduce the dimensionality of the data generated by our toolkit and evaluated how the primary components capture the variability among protein families and GO terms [ 51 ]. In addition to PCA, we also applied t-SNE [ 52 , 53 , 54 ] and UMAP [ 55 ] to reduce dimensionality while preserving nonlinear relationships and local structures in data. We used violin plots to visualize the distribution and density of the extracted features across different protein families and GO terms, providing insights into the distribution and central tendencies of the data [ 52 ]. We performed hierarchical clustering using Pearson correlation as distance metric and the complete linkage method for cluster formation [ 52 ]. This approach allowed us to identify and visualize the hierarchical relationships between protein families based on similarities among the features extracted by our toolkit. We then visualized the dimensionality reduction and clustering results with FreeViz [ 53 , 54 ]. For data analysis and visualization, the Python Data Mining Toolbox package was utilized [ 52 ]. Results InteracTor computes 11 different interatomic interaction features and 8 distinct CPAASC that are key for characterizing structure and function of proteins ( Table 1 and Table 2 ). In addition to interaction and structural features, our toolkit also extracts classic protein sequence composition features such as k-mer frequencies ( Table 3 ), resulting in a total of 18,296 multimodal features extracted ( Table 4 ). This provides a comprehensive representation of protein structure, function, and sequence characteristics, enabling in-depth analysis of protein properties across various scales of molecular organization. View this table: View inline View popup Download powerpoint Table 4: Overview of protein features extracted. InteracTor accurately characterizes the variability among distinct protein families We conducted PCA across all 18,296 features extracted from 20,877 protein structures representing the most abundant protein families and GO terms in the PDB REDO database ( Supplementary Table 1 ) to evaluate the overall utility of InteracTor in the characterization of variability among protein families and Gene Ontology (GO) terms. Figure 2 shows the first two principal components, which captured 1.72% of the variance across protein families ( Figure 2B ), and 1.0% of the variance across GO terms ( Figure 2A ). Except for Peptidase S1 and the Glycosyl hydrolase 5 (cellulase A), the protein families exhibited well-defined clusters in the two-dimensional space ( Figure 2D ), whereas GO terms exhibited less separation overall ( Figure 2A ). This difference may be attributed to the inconsistent accuracy of GO annotations across non-model organisms [ 56 ]. Furthermore, features extracted from 3D structures could provide more accurate annotations compared to those based solely on sequence data, potentially improving the clustering and functional interpretation of protein families. Unlike the two-dimensional projections illustrated in Figure 2A and Figure 2B , the three-dimensional projections provide enhanced separation and clearer clustering of the data, allowing a distinct visualization of relationships among the protein structures. The first three principal components captured 2.40% of the variance across protein families ( Figure 2C , Figure S1A ), as well as 1.38% of the variance across GO terms ( Figure 2D , Figure S1B) . Besides that, the 3D visualization improved the clustering and separation of Peptidase S1 and Glycosyl hydrolase 5 families, as well as the separation of GO categories, allowing for a more nuanced interpretation of functional relationships among groups ( Figure 2C ). Download figure Open in new tab Figure 2: Analysis of Protein Families and Their Molecular Functions. A) Clustering of three representative groups based on molecular functions, emphasizing key functional categories and their interrelationships. B) Clustering of the most prominent protein family, showcasing distinct traits unique to that family. C) Three-dimensional PCA visualization of molecular functions, capturing the variance and complexity within different functional categories. D) Three-dimensional PCA representation of protein families, showing clear separations and relationships among them. Mutual Information scoring identifies key features across protein families and GOs In the feature selection process, all features associated with the selected protein families were ranked using the MI score. The histogram of MI scores used for feature selection shows a bimodal distribution ( Figure 3A ). The primary mode, or large peak, corresponds to features with low MI scores (MI=0.2) represent a subset of 354 features with informational content effectively distinguished from the background noise peak. Among the top 100 high MI scoring features are 9 interatomic interactions, 2 CPAASC features, and 89 sequence composition features ( Supplementary Table 3 ). Figure 3B displays the distribution of MI scores across all features, highlighting the spread and density of these scores. Among the 12 most highly ranked features across protein families are hydrogen bonds (MI=0.775), total surface tension (MI=0.763), london dispersion forces (MI=0.758), repulsive interactions (MI=0.722), internal tension (MI=0.708), ASA (MI=0.694), hydrophobic contacts (MI=0.561), TG frequency (MI=0.562), internal hydrophobicity (MI=0.561), VN frequency (MI=0.556), total hydrophobicity (MI=0.539), and GG frequency (MI=0.509). For GO terms, the MI score analysis revealed values ranging from approximately 0.06 to 0.1, as shown in Figure 3C . While several peaks are well-defined, indicating key features, the MI scores are notably lower compared to those for protein families. Figure 3D presents a violin plot illustrating the distribution of MI scores, with a high density of values between 0.1 and 0.2. High MI scoring features include total surface tension (MI=0.202), london dispersion forces (MI=0.165), total hydrophobicity (MI=0.146), ASA (MI=0.142), EL frequence (MI=0.127), VDW interactions (MI=0.123), EG, NL, KT, GN, AN, (MI=0.113, 0.111, 0.109, 0.105, and 0.103, respectively) and repulsive interactions (MI=0.100). Despite the lower MI scores, these features demonstrate a significant (Wilcoxon test, p-value < 2.2 −16 ) influence on the differentiation of protein family and GO categories, highlighting their relevance in the context of ligand/substrate interactions and functional protein family. Download figure Open in new tab Figure 3: Analysis of Mutual Information (MI) Scores Across Protein Families and Molecular Functions. A) Frequency distribution of MI scores for each protein family B) Density distribution of MI scores within protein families, focusing on the top 12 ranked features that significantly contribute to family differentiation. C) Frequency distribution of MI scores for molecular functions. D) Density distribution of MI scores for molecular functions, emphasizing the top 12 ranked features. Selected features reveal unique and distinct distributions across protein families and GOs Figure 4A shows consistent distribution of london dispersion forces and total surface tension features across multiple protein families, including Peptidase S1, Cytochrome P450, and Glycosyl Hydrolase 5. In Figure 4B and 4D , the repulsive interactions and hydrogen bonds features show similar patterns among Bacterial Solute Binding Protein 2, Enoyl-CoA Hydratase/Isomerase, and Glycosyl Hydrolase families, displaying a bimodal distribution. Further analysis of the repulsive interactions and hydrogen bond features ( Figure 4B and D , respectively) revealed similar distribution patterns across the analyzed families (Spearman correlation, r = 0.99, p = 0.968, Supplementary Figure S2). Bacterial Solute Binding Protein 2, Enoyl-CoA Hydratase/Isomerase, and Glycosyl Hydrolase families displayed bimodal distributions. Download figure Open in new tab Figure 4: Distribution and Frequency of Key Features Across Protein Families. A) Distribution and frequency of the London dispersion feature among various protein families. B) Distribution and frequency of the repulsive interaction feature across different protein families. C) Distribution and frequency of the total surface tension feature for each protein family. D) Distribution and frequency of the total hydrogen bond count within protein families. Clustering of selected features reveals distinct patterns and relationships among protein families Hierarchical clustering identified 19 distinct groups across eight families ( Figure 5A ), revealing a complex internal structure within the dataset. Complementary t-SNE clustering analysis ( Figure 5B ) detected 9 groups and not only corroborated these groupings but also unveiled finer details of inter-group relationships. While hierarchical clustering effectively segmented proteins with similar functions, t-SNE provided a more nuanced separation, highlighting occasional cross-family similarities, with some datapoints from different families appearing in unexpected clusters. Multivariate visualization ( Figure 5C ) further highlighted the role of interatomic interaction features, which were predominantly observed in the Cytochrome P450 and Glycosyl Hydrolase 5 (cellulase A) families (Wilcoxon p ≤ 0.00049, See Supplementary Table 4 ). In contrast, dipeptide features were more prevalent among other protein families, indicating a varied functional landscape shaped by distinct sequence composition patterns. (Wilcoxon p ≤ 0.03125, See Supplementary Table 4 ). Download figure Open in new tab Figure 5: Advanced Clustering and Visualization of Protein Families. A) Hierarchical clustering of protein families, depicting relationships and groupings based on structural and functional similarities. B) t-SNE plot offering a refined visualization of inter-family relationships within a reduced-dimensional space. C) FreeViz plot highlighting the most significant features for each group, illustrating key attributes that differentiate protein families. Discussion A Novel Algorithm for Improved Characterization of Protein Structural Properties Our protein feature extraction algorithm distinguishes itself by focusing on the tertiary structure of proteins, particularly on interatomic interactions, unlike other approaches that rely on primary and secondary structures derived from amino acid sequences [ 10 , 57 – 58 ]. By analyzing three-dimensional relationships, the algorithm captures intramolecular interactions that underpin protein structure and function. Extracting features directly from the tertiary structure allows the algorithm to uncover additional patterns in the chemical properties of amino acids and their interactions, offering a comprehensive understanding of the structural and functional dynamics of proteins [ 59 ]. This enhanced perspective provides new insights into the complex mechanisms governing protein behavior, facilitating advancements in protein engineering and drug design [ 60 ]. The graphical representations ( Figures 2 , 5 , 6 , and 7 ) provide a comprehensive overview of protein characteristics as well, assisting in the visualization and interpretation of complex structural and physicochemical properties across protein families. Download figure Open in new tab Figure 6: Heatmap of Protein Family and Feature Clustering Heatmap displays the clustering of protein families and features, emphasizing interatomic interaction characteristics. Z-scores were applied to standardize the data, highlighting distinct clusters based on these pivotal interactions. Download figure Open in new tab Figure 7: Two-Dimensional Projection of Protein Families. This UMAP projection visualizes the clustering of protein families, showcasing distinct groupings informed by structural and functional characteristics. The feature selection process, guided by MI scores, was crucial in identifying the most effective features for distinguishing protein families, as shown in Figure 3A and B . Prominent peaks in MI scores indicate that features such as hydrogen bonds, total surface tension, and contact hydrophobicity are particularly influential in differentiating protein families, underscoring their critical role in our analysis. This prioritization of key features enhances our ability to effectively distinguish between various protein structural and functional groups. The identification of 9 out of 11 interatomic interactions features among the top 12 features with the highest MI scores underscores their critical role in differentiating protein families. This finding defines the importance of specific molecular interactions, such as hydrogen bonds and hydrophobic contacts, in defining protein structure and function, providing valuable insights into the fundamental principles governing protein family diversity. Figure 6 revealed clear definitions of specific clusters, highlighting the effectiveness of the approach in identifying grouping patterns among protein families. The groupings not only reflected complex interatomic relationships between different features but also demonstrated the formation of distinct clusters within dipeptide feature groups. The Peptidase S1 family, Glycosyl hydrolase 5 (cellulase A) family and Short-chain dehydrogenases/reductases (SDR) family demonstrated clustering of dipeptide feature groups. This result confirms the method’s ability to highlight relevant functional patterns and distinguish between different protein categories based on the observed interatomic interactions. A bimodal distribution of hydrogen bonding in a protein family may suggest the presence of two distinct subtypes of structural or functional groups within that family ( Figure 4D ). This variability might arise from differences in the protein’s environment or structure, such as variations in hydrophobicity or the presence of neighboring groups that influence hydrogen bond strength and stability. For instance, hydrogen bonds can be stronger in hydrophobic environments due to lower dielectric constants, and their contributions to stability can vary significantly depending on their distance, geometry, and surrounding interactions. Additionally, a bimodal distribution might suggest the presence of a miss annotation or two subfamilies within the protein family, each characterized by distinct hydrogen bonding patterns that could be related to specific structural or functional adaptations ( Figure 4 ). Visualization of the two-dimensional projection generated via UMAP analysis ( Figure 7 ) separated key protein families, including Short-chain dehydrogenases/reductases (SDR), Cytochrome P450, Enoyl-CoA hydratase/isomerase, Bacterial solute-binding protein 2, Class-I aminoacyl-tRNA synthetase, Glycosyl hydrolase 5 (cellulase A), Peptidase S1, and FPP/GGPP synthase. Each family formed distinct clusters, indicating that UMAP effectively distinguished these groups based on the protein features extracted by InteracTor. This clear separation underscores the method’s ability to reveal meaningful patterns and relationships within the dataset, offering valuable insights into the functional dynamics of the protein families. Case Study on Multi-dimensional Clustering Reveals Complex Structure and Functional Diversity in Protein Families Hierarchical clustering effectively delineated differences among protein families such as short-chain dehydrogenases/reductases (SDR), Cytochrome P450, Enoyl-CoA hydratase/isomerase, Bacterial solute-binding protein 2, Class-I aminoacyl-tRNA synthetase, Glycosyl hydrolase 5 (cellulase A), Peptidase S1, and FPP/GGPP synthase. Complementary analysis with t-SNE allowed the detection of subtle similarities and differences among the protein families and offering deeper insights into their functional connections ( Figure 5 , See supplementary Table 4 ). Multivariate visualization allowed dynamic adjustments to identify unique aspects of protein families, including specific features contributing to the clustering process, yielding valuable insights into the functional diversity within and between protein families ( Figure 5 , 6 , 7 ). Additionally, UMAP captured both local and global structures in a reduced-dimensional space, allowing a comprehensive understanding of the clustering and distribution of protein families, highlighting both shared and similar structural characteristics. This integrative approach led to a deeper understanding of the complex relationships and functional diversity inherent across protein families, revealing details on the interplay of intermolecular interactions, physicochemical properties, and composition of proteins. This holistic methodology not only enhances our comprehension of protein family dynamics but also establishes a robust framework for future research. The extracted features have the potential to directly impact the performance of machine learning and deep learning models by offering a new feature engineering tool. By providing new insights on protein structural and functional properties, our approach supports advancements in structural biology, synthetic biology and in the characterization proteins of unknown function (PUFs) [ 61 , 62 , 63 , 64 ]. Additionally, these features can contribute to emerging fields such as quantum biology [ 64 , 65 , 66 , 67 , 68 ], facilitating a deeper understanding of biological roles and interactions across diverse protein families. Key Points This paper provides a comprehensive toolkit for extracting and encoding sequence, physicochemical, and structural features from proteins. The proposed toolkit introduces a novel algorithm for extracting interatomic interaction features from protein structures. This work also provides detailed use cases, demonstrating how our toolkit can enhance protein family characterization of distinct protein families by leveraging several types of clustering algorithms. Our comprehensive feature extraction and analytical methods revealed new insights into protein structure-function relationships. The new toolkit is available as an open-source software on GitHub, promoting accessibility, reproducibility and collaboration in research. Conflicts of Interest The authors declare no conflicts of interest. Funding This work was supported by UF LIFT AI funding of University of Florida’s Institute of Food and Agricultural Sciences (UF/IFAS). Author Contributions R.D. designed and supervised the study and contributed to manuscript writing. M.F.R.R and M.K. contributed to manuscript writing. J.C.F.S. developed the algorithm, built the computational tools, performed the analysis, and wrote the manuscript. L.S and N.S. were responsible for the data preprocessing and contributed to manuscript writing. All authors read and approved the final manuscript. Code and Data Availability InteracTor is provided as an open-source python in GitHub: https://github.com/Dias-Lab/InteracTor Supplementary Figures Figure S1. Principal component of protein families and GO terms. (A) PCA plot illustrating the variance captured by the first three principal components across different protein families. (B) PCA plot depicting the variance explained by the first three principal components across Gene Ontology (GO) terms. Figure S2. Analysis of correlation between repulsive interactions and hydrogen bonds. (A) Correlation analysis based on the features of repulsive interactions and hydrogen bonds. (B) Average values of hydrogen bonds across different conditions. (C) Average values of repulsive interactions across different conditions. Supplementary Tables View this table: View inline View popup Download powerpoint Supplementary Table 1: Distribution and quantification of selected protein families View this table: View inline View popup Download powerpoint Supplementary Table 2: Distribution and quantification of selected gene ontology Supplemental Table 3. MI Scores for protein families and GO Terms. Supplemental Table 4. Wilcoxon p-values for feature pairwise comparisons between protein families. Acknowledgements The authors gratefully acknowledge UF Research Computing for providing computational resources and support that have contributed to the research reported in this publication ( http://www.rc.ufl.edu ) and the financial support provided by Launching Innovative Faculty Teams in AI (LIFT AI) program. References 1. ↵ Varadi M , Anyango S , Deshpande M , et al. AlphaFold Protein Structure Database: massively expanding the structural coverage of protein-sequence space with high-accuracy models . Nucleic Acids Res 2021 ; 50 : D439 – D444 . OpenUrl 2. ↵ Baek M , DiMaio F , Anishchenko I , et al. Accurate prediction of protein structures and interactions using a three-track neural network . Science 2021 ; 373 : 871 – 876 . OpenUrl Abstract / FREE Full Text 3. ↵ Wang X , Li F , Xu J , et al. ASPIRER: a new computational approach for identifying non-classical secreted proteins based on deep learning . Brief Bioinformatics 2022 ; 23 . 4. ↵ Zhang Y , Yu S , Xie R , et al. PeNGaRoo, a combined gradient boosting and ensemble learning framework for predicting non-classical secreted proteins . Bioinformatics 2019 ; 36 : 704 – 712 . OpenUrl 5. ↵ Dias R , Kolazckowski B . Different combinations of atomic interactions predict protein-small molecule and protein-DNA/RNA affinities with similar accuracy . Proteins . 2015 ; 83 : 2100 – 2114 . OpenUrl 6. Hong X , Tong X , Xie J , et al. An updated dataset and a structure-based prediction model for protein–RNA binding affinity . Proteins . 2023 ; 91 : 1245 – 1253 . OpenUrl 7. ↵ Zhang H , Saravanan KM , Zhang JZH . DeepBindGCN: Integrating Molecular Vector Representation with Graph Convolutional Neural Networks for Protein–Ligand Interaction Prediction . Molecules 2023 ; 28 : 4691 . OpenUrl 8. ↵ Shroff R , Cole AW , Diaz DJ , et al. Discovery of Novel Gain-of-Function Mutations Guided by Structure-Based Deep Learning . ACS synth. biol . 2020 ; 9 : 2927 – 2935 . OpenUrl 9. ↵ Torng W , Altman RB . 3D deep convolutional neural networks for amino acid environment similarity analysis . BMC Bioinformatics 2017 ; 18 . 10. ↵ Chen Z , Liu X , Zhao P , et al. iFeatureOmega:an integrative platform for engineering, visualization and analysis of features from molecular sequences, structural and ligand data sets . Nucleic Acids Res 2022 ; 50 : W434 – W447 . OpenUrl 11. ↵ Pande A , Patiyal S , Lathwal A , et al. Pfeature: A Tool for Computing Wide Range of Protein Features and Building Prediction Models . Journal of Computational Biology 2023 ; 30 : 204 – 222 . OpenUrl CrossRef 12. ↵ Guevara-Barrientos D , Kaundal R . ProFeatX: A parallelized protein feature extraction suite for machine learning . Comput Structural Biotechnol J 2023 ; 21 : 796 – 801 . OpenUrl 13. ↵ Gao J , Bosco DA , Powers ET , et al. Localized thermodynamic coupling between hydrogen bonding and microenvironment polarity substantially stabilizes proteins . Nat Struct Mol Biol 2009 ; 16 : 684 – 690 . OpenUrl CrossRef PubMed 14. ↵ Coulocheri SA , Pigis DG , Papavassiliou KA , et al. Hydrogen bonds in protein–DNA complexes: Where geometry meets plasticity . Biochimie 2007 ; 89 : 1291 – 1303 . OpenUrl CrossRef PubMed Web of Science 15. ↵ Nilofer C , Sukhwal A , Mohanapriya A , et al. Protein-protein interfaces are vdW dominant with selective H-bonds and (or) electrostatics towards broad functional specificity . Bioinformation 2017 ; 13 : 164 – 173 . OpenUrl 16. ↵ Carvalho TFM , Silva JCF , Calil IP , et al. Rama: a machine learning approach for ribosomal protein prediction in plants . Sci Rep 2017 ; 7 . 17. ↵ Silva JCF , Ferreira MA , Carvalho TFM , et al. RLPredictiOme, a Machine Learning-Derived Method for High-Throughput Prediction of Plant Receptor-like Proteins, Reveals Novel Classes of Transmembrane Receptors . Int J Mol Sci 2022 ; 23 : 12176 . OpenUrl 18. ↵ Silva JCF , Carvalho TFM , Fontes EPB , et al. Fangorn Forest (F2): a machine learning approach to classify genes and genera in the family Geminiviridae . BMC Bioinformatics 2017 ; 18 . 19. ↵ Brugère J-F , Atkins JF , O’Toole PW , et al. Pyrrolysine in archaea: a 22nd amino acid encoded through a genetic code expansion . Emerging Topics in Life Sciences 2018 ; 2 : 607 – 618 . OpenUrl Abstract / FREE Full Text 20. ↵ Turanov AA , Lobanov AV , Fomenko DE , et al. Genetic Code Supports Targeted Insertion of Two Amino Acids by One Codon . Science 2009 ; 323 : 259 – 261 . OpenUrl Abstract / FREE Full Text 21. Pace CN , Fu H , Fryar KL , et al. Contribution of Hydrophobic Interactions to Protein Stability . Journal of Molecular Biology 2011 ; 408 : 514 – 528 . OpenUrl CrossRef PubMed 22. Sung S . Peptide folding driven by Van der Waals interactions . Protein Science 2015 ; 24 : 1383 – 1388 . OpenUrl 23. Bao G . Protein Mechanics: A New Frontier in Biomechanics . Experimental Mechanics 2008 ; 49 : 153 – 164 . OpenUrl 24. Sung S . Peptide folding driven by Van der Waals interactions . Protein Science 2015 ; 24 : 1383 – 1388 . OpenUrl 25. Walters ET , Mohebifar M , Johnson ER , et al. Evaluating the London Dispersion Coefficients of Protein Force Fields Using the Exchange-Hole Dipole Moment Model . The Journal of Physical Chemistry B 2018 ; 122 : 6690 – 6701 . OpenUrl 26. van Gils JHM , Gogishvili D , van Eck J , et al. How sticky are our proteins? Quantifying hydrophobicity of the human proteome. Bioinformatics Advances 2022 ; 2 . 27. Wei A-P , Herron JN , Andrade JD . The Role of Protein Structure in Surface Tension Kinetics . Developments in Biotherapy 1990 ; 305 – 313 . 28. Sashi P , Ramakrishna D , Bhuyan AK . Dispersion Forces and the Molecular Origin of Internal Friction in Protein . Biochemistry 2016 ; 55 : 4595 – 4602 . OpenUrl 29. Bosco Ho , https://github.com/boscoh/pdbremix/blob/master/pdbremix/asa.py . 2014 . 30. Shrake A , Rupley JA . Environment and exposure to solvent of protein atoms . Lysozyme and insulin. J Mol Bio 1973 ; 79 : 351 – 371 . OpenUrl 31. Mayol E , Campillo M , Cordomí A , et al. Inter-residue interactions in alpha-helical transmembrane proteins . Bioinformatics 2018 ; 35 : 2578 – 2584 . OpenUrl 32. Anjana R , Vaishnavi MK , Sherlin D , et al. Aromatic-aromatic interactions in structures of proteins and protein-DNA complexes: a study based on orientation and distance . Bioinformation 2012 ; 8 : 1220 – 1224 . OpenUrl CrossRef PubMed 33. Stollar EJ , Smith DP . Uncovering protein structure . Essays in Biochemistry 2020 ; 64 : 649 – 680 . OpenUrl 34. Zhou H-X , Pang X . Electrostatic Interactions in Protein Structure, Folding, Binding, and Condensation . Chemical Reviews 2018 ; 118 : 1691 – 1741 . OpenUrl CrossRef PubMed 35. Ortega G , Aguilar MA , Gautam BK , et al. The effect of charged residue substitutions on the thermodynamics of protein-surface interactions . Protein Science 2021 ; 30 : 2408 – 2417 . OpenUrl 36. Wilcox PE , Kraut J , Wade RD , et al. The molecular weight of a-chymotrypsinogen . Biochimica et Biophysica Acta 1957 ; 24 : 72 – 78 . OpenUrl PubMed 37. Nelson DL , Cox MM . Lehninger Principles of Biochemistry . 7th ed . New York : W.H. Freeman and Company ; 2017 . 38. Zamyatnin AA . Protein volume in solution . Prog Biophys Mol Biol . 1972 ; 24 : 107 – 123 . OpenUrl CrossRef PubMed 39. Ponnuswamy PK , Prabhakaran M , Manavalan P . Hydrophobic packing and spatial arrangement of amino acid residues in globular proteins . Biochimica et Biophysica Acta (BBA) - Protein Structure 1980 ; 623 : 301 – 316 . OpenUrl 40. Gasteiger E , Hoogland C , Gattiker A , et al. Protein Identification and Analysis Tools on the ExPASy Server . The Proteomics Protocols Handbook 2005 ; 571 – 607 . 41. Kumar S , Tsai C-J , Nussinov R . Factors enhancing protein thermostability. Protein Engineering , Design and Selection 2000 ; 13 : 179 – 191 . OpenUrl 42. Ghadimi M , Heshmati E , Khalifeh K . Distribution of dipeptides in different protein structural classes: an effort to find new similarities . European Biophysics Journal 2017 ; 47 : 31 – 38 . OpenUrl 43. Ghadimi M , Heshmati E , Khalifeh K . Distribution of dipeptides in different protein structural classes: an effort to find new similarities . European Biophysics Journal 2017 ; 47 : 31 – 38 . OpenUrl 44. Dodson G . Catalytic triads and their relatives . Trends in Biochemical Sciences 1998 ; 23 : 347 – 352 . OpenUrl CrossRef PubMed Web of Science 45. ↵ Joosten RP , Long F , Murshudov GN , et al. ThePDB_REDOserver for macromolecular structure model optimization . IUCrJ 2014 ; 1 : 213 – 220 . OpenUrl CrossRef PubMed 46. Berman HM . The Protein Data Bank . Nucleic Acids Research 2000 ; 28 : 235 – 242 . OpenUrl CrossRef PubMed Web of Science 47. ↵ Ashburner M , Ball CA , Blake JA , et al. Gene Ontology: tool for the unification of biology . Nature Genetics 2000 ; 25 : 25 – 29 . OpenUrl CrossRef PubMed Web of Science 48. ↵ O’Boyle NM , Banck M , James CA , et al. Open Babel: An open chemical toolbox . J Cheminform 2011 ; 3 . 49. ↵ Bateman A , Martin M-J , Orchard S , et al. UniProt: the universal protein knowledgebase in 2021 . Nucleic Acids Research 2020 ; 49 : D480 – D489 OpenUrl CrossRef 50. ↵ Kraskov A , Stögbauer H , Grassberger P . Estimating mutual information . Phys Rev E 2004 ; 69 . 51. ↵ Jolliffe IT , Cadima J . Principal component analysis: a review and recent developments. Philosophical Transactions of the Royal Society A: Mathematical , Physical and Engineering Sciences 2016 ; 374 : 20150202 . OpenUrl CrossRef PubMed 52. ↵ Demšar , Janez , et al. “ Orange: data mining toolbox in Python .” the Journal of machine Learning research 2013 ; 14 . 1 : 2349 – 2353 . OpenUrl 53. ↵ Belkina AC , Ciccolella CO , Anno R , et al. Automated optimized parameters for T-distributed stochastic neighbor embedding improve visualization and analysis of large datasets . Nature Communications 2019 ; 10 . 54. ↵ Demšar J , Leban G , Zupan B . FreeViz—An intelligent multivariate visualization approach to explorative analysis of biomedical data . Journal of Biomedical Informatics 2007 ; 40 : 661 – 671 . OpenUrl CrossRef PubMed 55. ↵ McInnes L , Healy J , Melville J. UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction . arXiv.org 2018 . 56. ↵ Gaudet P , Dessimoz C . Gene Ontology: Pitfalls, Biases, and Remedies . Methods in Molecular Biology 2016 ; 189 – 205 57. ↵ Markin CJ , Mokhtari DA , Sunden F , et al. Revealing enzyme functional architecture via high-throughput microfluidic enzyme kinetics . Science 2021 ; 373 . 58. ↵ Cao J , Xiong L . Protein Sequence Classification with Improved Extreme Learning Machine Algorithms . BioMed Research International 2014 ; 2014 : 1 – 12 . OpenUrl 59. ↵ Holehouse AS , Kragelund BB . The molecular basis for cellular function of intrinsically disordered protein regions . Nature Reviews Molecular Cell Biology 2023 ; 25 : 187 – 21 . OpenUrl 60. ↵ de Juan D , Pazos F , Valencia A . Emerging methods in protein co-evolution . Nature Reviews Genetics 2013 ; 14 : 249 – 261 . OpenUrl CrossRef PubMed 61. ↵ J. Karn Park J , Lappe M , Teichmann SA . Mapping protein family interactions: intramolecular and intermolecular protein family interaction repertoires in the PDB and yeast1 Edited by J. Karn . Journal of Molecular Biology 2001 ; 307 : 929 – 938 . OpenUrl CrossRef PubMed Web of Science 62. ↵ Hoffmann SA , Cai Y . Engineering stringent genetic biocontainment of yeast with a protein stability switch . Nature Communications 2024 ; 15 . 63. ↵ Nadzirin N , Firdaus-Raih M . Proteins of Unknown Function in the Protein Data Bank (PDB): An Inventory of True Uncharacterized Proteins and Computational Tools for Their Analysis . International Journal of Molecular Sciences 2012 ; 13 : 12761 – 12772 OpenUrl 64. ↵ Poudel S , Cope AL , O’Dell KB , et al. Identification and characterization of proteins of unknown function (PUFs) in Clostridium thermocellum DSM 1313 strains as potential genetic engineering targets . Biotechnology for Biofuels 2021 ; 14 : 65. ↵ Ha P , Kwak JH , Zhang Y , et al. Bisphosphonate conjugation enhances the bone-specificity of NELL-1-based systemic therapy for spaceflight-induced bone loss in mice . npj Microgravity 2023 ; 9 . 66. ↵ Doga H , Raubenolt B , Cumbo F , et al. A Perspective on Protein Structure Prediction Using Quantum Computers . Journal of Chemical Theory and Computation 2024 ; 20 : 3359 – 3378 . OpenUrl 67. ↵ Menezes AA , Cumbers J , Hogan JA , et al. Towards synthetic biological approaches to resource utilization on space missions . Journal of The Royal Society Interface 2015 ; 12 : 20140715 . OpenUrl 68. ↵ Mokhtari M , Khoshbakht S , Ziyaei K , et al. New classifications for quantum bioinformatics: Q-bioinformatics, QCt-bioinformatics, QCg-bioinformatics, and QCr-bioinformatics . Briefings in Bioinformatics 2024 ; 25 . 69. Pal S , Bhattacharya M , Lee S-S , et al. Quantum Computing in the Next-Generation Computational Biology Landscape: From Protein Folding to Molecular Dynamics . Molecular Biotechnology 2023 ; 66 : 163 – 178 . OpenUrl View the discussion thread. Back to top Previous Next Posted October 11, 2024. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following InteracTor: A new integrative feature extraction toolkit for improved characterization of protein structural properties Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share InteracTor: A new integrative feature extraction toolkit for improved characterization of protein structural properties Jose Cleydson F. Silva , Layla Schuster , Nick Sexson , Matias Kirst , Marcio F. R. Resende , Raquel Dias bioRxiv 2024.10.07.616705; doi: https://doi.org/10.1101/2024.10.07.616705 Share This Article: Copy Citation Tools InteracTor: A new integrative feature extraction toolkit for improved characterization of protein structural properties Jose Cleydson F. Silva , Layla Schuster , Nick Sexson , Matias Kirst , Marcio F. R. Resende , Raquel Dias bioRxiv 2024.10.07.616705; doi: https://doi.org/10.1101/2024.10.07.616705 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7651) Biochemistry (17746) Bioengineering (13928) Bioinformatics (42066) Biophysics (21499) Cancer Biology (18650) Cell Biology (25579) Clinical Trials (138) Developmental Biology (13409) Ecology (19947) Epidemiology (2067) Evolutionary Biology (24374) Genetics (15633) Genomics (22557) Immunology (17775) Microbiology (40505) Molecular Biology (17217) Neuroscience (88796) Paleontology (667) Pathology (2845) Pharmacology and Toxicology (4836) Physiology (7664) Plant Biology (15179) Scientific Communication and Education (2047) Synthetic Biology (4304) Systems Biology (9839) Zoology (2272)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.