Manually weighted taxonomy classifiers improve species-specific rumen microbiome analysis compared to unweighted or average weighted taxonomy classifiers

doi:10.1101/2025.03.12.642789

Manually weighted taxonomy classifiers improve species-specific rumen microbiome analysis compared to unweighted or average weighted taxonomy classifiers

2025 · doi:10.1101/2025.03.12.642789

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 51,738 characters · extracted from preprint-html · click to expand

Manually weighted taxonomy classifiers improve species-specific rumen microbiome analysis compared to unweighted or average weighted taxonomy classifiers | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Manually weighted taxonomy classifiers improve species-specific rumen microbiome analysis compared to unweighted or average weighted taxonomy classifiers View ORCID Profile Ryukseok Kang , View ORCID Profile Zhongtang Yu , View ORCID Profile Tansol Park doi: https://doi.org/10.1101/2025.03.12.642789 Ryukseok Kang 1 Department of Animal Science and Technology, Chung-Ang University , Anseong-si, Gyeonggi-do 17546, Republic of Korea Find this author on Google Scholar Find this 
author on PubMed Search for this author on this site ORCID record for Ryukseok Kang Zhongtang Yu 2 Department of Animal Sciences, The Ohio State University , Columbus, OH 43210, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zhongtang Yu Tansol Park 1 Department of Animal Science and Technology, Chung-Ang University , Anseong-si, Gyeonggi-do 17546, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tansol Park For correspondence: tansol{at}cau.ac.kr Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Previous research has demonstrated that applying taxonomic weights to shotgun metagenomic data can improve species identification in 16S rRNA gene-based microbiome analysis. However, such an approach does not allow for accurate analysis of samples collected from less studied habitats, such as rumen. In the present study, we developed a method to incorporate taxonomic weights based on relative abundance of species identified from shotgun sequencing and amplicon sequencing data derived from rumen. Using this weighting method, we evaluated latest versions of five prominent databases—SILVA, Greengenes2, RDP, NCBI RefSeq, and GTDB—against the BLAST 16S rRNA database, assessing classification counts, fully classified ratios (proportion of ASVs classified to a known genus and species), and error rates. Our results revealed that the use of the weighting method significantly improved both classification counts and fully classified ratios, along with a substantial ( P < 0.05) reduction in error rates compared to unweighted taxonomy classifier. 
While GG2 and SILVA struggled with accurate classification at the species level owing to their inherent database characteristics, GTDB consistently improved all metrics using the manually weighted taxonomy classifier, achieving up to an 8% error rate reduction at the species level. NCBI RefSeq and RDP also exhibited remarkable improvement in the classification counts and fully classified ratios, along with substantial error rate reductions of up to 47% at the species level. These findings demonstrate that amplicon sequencing datasets can enhance rumen microbiome analyses through effective weighting methods. While SILVA is commonly used in metataxonomic analyses of the rumen microbiome, we recommend NCBI RefSeq for species-level classification due to its superior accuracy and minimal ambiguous classification (e.g., “uncultured” or “sp.”) in future metataxonomic studies. Background Ruminants host a diverse and highly specialized microbiome in their rumen, which enables them to digest a wide range of feedstuffs, particularly fibrous materials that would otherwise be indigestible [ 1 , 2 ]. Accurately identifying the key microbes and understanding their contributions to various rumen fermentation processes are crucial for improving animal nutrition and productivity and mitigating the environmental impact associated with this unique animal group. The rumen microbiome has been intensively studied using culture-based and culture-independent nucleic acid-based techniques, with recent efforts predominantly leveraging the latter that utilize sequencing approaches like metataxonomics, metagenomics, and metatranscriptomics. Advances in sequencing technologies and the expansion of reference databases have significantly improved the accuracy of microbiome characterization, including taxonomic classification. However, most gut microbiome databases are disproportionally dominated by human microbiome data [ 3 ]. 
Such database bias may reduce the accuracy of analyses for non-human microbiomes, including the rumen microbiome. To address this challenge, several initiatives have been proposed, including the Global Rumen Census project, the “Hungate1000,” “Holoruminant,” and meta-omic approaches aimed at enhancing comprehensive characterization of the rumen microbiome [ 4 – 7 ]. Despite these efforts, the accuracy of 16S rRNA gene-based metataxonomic analysis of the rumen microbiome remains uncertain. Recent advances, such as weighted taxonomy classifiers, have shown promise in improving taxonomic resolution. For example, Kaehler et al. (2019) utilized q2-clawback and the Earth Microbiome Project Ontology (EMPO) 3 datasets to apply weighting to taxonomy classifiers. Specifically, this approach assigns varying weights to individual taxa: higher weights to taxa prevalent in specific environments and lower weights to rare or absent taxa [ 8 ]. This approach improved species-level identification and abundance estimation accuracy. It was also suggested that weighted taxonomy classifiers could further improve taxonomic resolution when applied to metagenomic analyses. However, since the ruminal microbiome is not included in the EMPO habitat types, the proposed classifiers probably cannot be applied to analyses of rumen samples. The distinct rumen microbiome profiles observed across ruminant species [ 9 , 10 ] suggest that generalized taxonomic weights may compromise classification accuracy when applied to a specific ruminant species. Therefore, creating and implementing ruminant species-specific taxonomy classifiers would be essential for achieving greater classification accuracy. Additionally, while shotgun sequencing-based metagenomics has been increasingly used for improved resolution, 16S rRNA gene amplicon-based metataxonomics remains widely used for rumen microbiome profiling due to its cost-effectiveness. 
To achieve greater accuracy in metataxonomic analyses of the rumen microbiome, we utilized multiple datasets of partial and full-length 16S rRNA gene sequences along with shotgun sequences obtained from native Korean Hanwoo cattle in developing weighted taxonomic classifiers. Furthermore, unlike the earlier studies by Kaehler et al. (2019) that used an outdated version of Greengenes for validating weighted methods, we used the most recent Greengenes database (GG2). We also included the latest versions of other commonly used databases—SILVA, NCBI RefSeq, GTDB, and RDP—to explore how the choice of databases affects classification accuracy. We developed an enhanced taxonomy classification method that integrates weight assignment, additional preprocessing steps, and q2-clawback by utilizing data from native Korean Hanwoo cattle. This study indicates that applying taxonomic weights can significantly improve rumen microbiome analyses. Methods Constructing an unweighted and weighted taxonomy classifier All data were analyzed using the QIIME2-amplicon version 2024.10 [ 11 ]. To construct the unweighted taxonomy classifier (UWTC), the NCBI reference sequence database (RefSeq), SILVA, and GTDB databases were collected through the q2-rescript [ 12 ]. The NCBI RefSeq was downloaded on October 14, 2024, and used for archaeal and bacterial 16S rRNA sequences. The SILVA versions 138.1 and 138.2 [ 13 ], GTDB version 220.0 [ 14 ], along with the GG2 versions 2022.10 and 2024.09 [ 15 ] and the RDP bacterial and archaeal hierarchy model (version 2.14, August 2023) [ 16 ], were used to build the taxonomy classifier. Archaeal sequences shorter than 900 bp and bacterial sequences shorter than 1,200 bp were filtered out from all the databases to enhance taxonomic reliability and resolution. Additionally, all the database sequences were dereplicated to eliminate redundancy. 
To construct an average weighted taxonomy classifier (AWTC) as described in the QIIME2 resource ( https://resources.qiime2.org/ ), metadata were developed using the Earth Microbiome Project Ontology (EMPO) [ 17 ] for assigning average taxonomic weights. Publicly available V4 region (150 bases) of the 16S rRNA gene sequences and associated EMPO3 metadata were obtained from Qiita using q2-clawback [ 8 ]. The following keywords were used to search and download the EMPO3 metadata: ‘animal-corpus,’ ‘animal-distal-gut,’ ‘animal-proximal-gut,’ ‘animal-secretion,’ ‘animal-surface,’ ‘plant-corpus,’ ‘plant-rhizosphere,’ ‘plant-surface,’ ‘sediment-non-saline,’ ‘sediment-saline,’ ‘soil-non-saline,’ ‘surface-saline,’ ‘water-non-saline,’ ‘water-saline,’ ‘animal-non-saline,’ and ‘animal-saline.’ Metagenomic shotgun and 16S rRNA gene amplicon datasets derived from rumen fluid samples collected directly from Hanwoo cattle were used to construct the manually weighted taxonomy classifier (MWTC). The characteristics of each dataset are presented in Table 1 . The metagenomic shotgun dataset from Hanwoo steers (36 samples) and cows (5 samples) was preprocessed using fastp [ 18 ] for quality control, followed by sequence filtering with Bowtie2 (version 2.5.4) [ 19 ] to remove host sequences by aligning them to the Hanwoo genome (GCA_028973685.2) and feed sequences by aligning them to the genomes of the feed ingredients consumed by each animal group. For Hanwoo steers, feed sequences were filtered out using the genomes of oat hay (GCA_916181665.1), corn (GCF_902167145.1), rice straw (GCF_001433935.1), wheat (GCF_018294505.1), and palm kernel meal (GCF_000442705.1). For cows, filtering was performed using oat hay, corn, and rice straw genomes. The filtering ensured the isolation of microbial DNA sequences by removing host and feed-related sequences. The filtered metagenomic datasets were processed using SortMeRNA (version 4.3.7) [ 20 ] to extract the rRNA gene sequences. 
The resultant rRNA sequences were directly imported into QIIME2, and a feature table was generated after sequence dereplication. For the 16S rRNA gene amplicon sequencing data, raw reads were imported directly into QIIME2, followed by merging the paired-end amplicon sequences using FLASH2 (version 2.2.00) [ 21 ] with a minimum overlap of 20 bp. The sequences were then denoised using Deblur [ 22 ] with default options. View this table: View inline View popup Download powerpoint Table 1. Summary of the datasets used to generate the manual weighted taxonomy classifier. A custom Python script was used to combine the feature tables and the corresponding representative sequences to generate the input file for taxonomy weight generation. Sequences associated with chloroplasts and mitochondria were removed before generating the classifier to ensure accuracy. Weighted datasets were then generated using q2-clawback, utilizing 16S rRNA gene sequences from the EMPO dataset. These datasets incorporated all databases used to create the classifier, including UWTC generated with the naïve Bayes algorithm using the ‘feature-classifier’ plugin [ 23 ]. The process for constructing the weighted taxonomy classifier is detailed in Fig. 1 . Download figure Open in new tab Fig. 1. The workflow used to construct each weighted taxonomy classifier. The bioinformatics tools used were italicized. Amplicon datasets for validation of taxonomy classifier 16S rRNA gene amplicon sequencing data were generated from rumen samples of Hanwoo cattle to validate the taxonomy classifiers. Briefly, rumen fluid samples were directly collected via stomach tubing from the rumen following an in vivo experiment and in vitro fermentation for 24 h. 
The nearly full-length and the V3-V4 hypervariable region of the 16S rRNA gene were amplified using the primers 27F (5′-AGRGTTYGATYMTGGCTCAG-3′) and 1492R (5′-GYTACCTTGTTACGACTT-3′) and 341F (5′-CCTACGGGNGGCWGCAG-3′) and 805R (5′-GACTACHVGGGTATCTAATCC-3′), producing amplicons of full-length and V3-V4 regions, respectively. A total of 47 in vivo samples and five in vitro samples were used for validating the classifiers with the full-length amplicons, while 30 in vivo samples and four in vitro samples were analyzed for the V3-V4 amplicons. Data processing The amplicon sequences were classified into 21 classifiers using scikit-learn [ 24 ], employing three classification methods—UWTC, AWTC, and MWTC—applied across the five databases. For comparisons of the overall microbiota composition between the databases and among UWTC, AWTC, and MWTC, principal component analysis (PCA) was conducted using Bray-Curtis dissimilarity matrices at both the genus and species levels for both the full-length and the V3-V4 amplicon sequences. Alpha diversity metrics, including observed features [ 25 ], evenness [ 26 ], Shannon [ 27 ], Simpson, and inversed Simpson diversity indices [ 28 ] were also computed at these levels. Prior to these analyses, the data from all samples were normalized using Total Sum Scaling (TSS). Classification counts were calculated for all amplicon sequence variants (ASVs) only when there was no blank classification at each taxonomic level, from the phylum to the species. To address cases where certain ASVs were not fully classified at a specific taxonomic level, the ratio of fully classified taxa was calculated as the number of fully classified taxa at each level relative to the total taxa count. This metric, referred to as the fully classified ratio, was evaluated for each database at both the genus and species levels to assess classification efficiency. 
Error rate estimation Error rates for taxonomic classification were calculated following the approach outlined by Kaehler et al. [ 8 ] to determine the proportion of ASVs misclassified by classifiers. The BLAST 16S ribosomal RNA database [ 29 ] was used as a reference, and all ASVs were classified to the species level without considering confidence scores. This ensured that every ASV was classified to a species regardless of classification confidence. However, significant updates in taxonomy occurred after the release of GG2 2022.10 and SILVA 138.1, and in these updates some taxa were reclassified at both the phylum (e.g., Firmicutes to Bacillota , Bacteroidetes to Bacteroidota , Proteobacteria to Pseudomonadota , and Actinobacteria to Actinomycetota ) and the genus levels (e.g., certain Prevotella species reclassified to Xylanibacter and Segatella , and Propionibacterium acnes renamed as Cutibacterium acnes ) [ 30 , 31 ]. Thus, error rates were not calculated for these databases. Statistical analysis The effects of classifier types on overall microbiota composition analysis were statistically evaluated using permutational multivariate analysis of variance (PERMANOVA). This analysis used the vegan and pairwiseAdonis packages in R (version 4.3.3) with 9,999 random permutations [ 32 , 33 ]. Multiple testing adjustments were applied using the Benjamini-Hochberg correction method [ 34 ]. Normality of alpha diversity metrics, classification counts, fully classified ratios, and error rates was assessed using the Shapiro-Wilk test, while homogeneity of variances was evaluated using Levene’s test. All data satisfied the assumption of homogeneity of variances. Statistical analysis was conducted using PROC MIXED in SAS 9.4 (SAS Institute Inc., Cary, NC, USA) for normally distributed data, whereas PROC GLIMMIX was used for the data that did not follow a normal distribution. Classifier type was treated as a fixed effect. 
For the alpha diversity metrics, classification counts and fully classified ratios, each amplicon dataset was included as a random effect, while for the error rates, both amplicon dataset and database types were treated as random effects. Statistical significance was declared at a threshold of P ≤ 0.05. Results The taxonomy classifier evaluation method was validated using microbiota datasets derived from both in vitro experiments and in vivo samples collected from the rumen of Hanwoo cows. The in vitro fermentation experiment involved the collection of ruminal fluid and incubation with a mixed microbial substrate under controlled conditions, enabling the study of fermentation dynamics and microbial composition. This approach is widely used in ruminant research [ 35 ]. A recent study validated this approach by demonstrating that in vitro batch cultures of rumen fluid effectively maintained the ruminal microbiome for up to 48 h [ 36 ]. Based on this evidence, both in vivo and in vitro datasets were selected to validate the taxonomy classifiers. Classifier evaluation using in vivo datasets The classifiers were comparatively evaluated for diversity analyses and taxonomic identification of rumen microbes. To compare within-sample diversity and evaluate different classifiers, alpha diversity analysis was performed at the genus and species levels (Tables S1 and S2). Observed features were statistically different ( P < 0.05) across all classifiers and taxonomy levels, except for GTDB with full-length amplicon sequences at the species level ( P = 0.0526). The MWTC exhibited the highest values of observed features compared to the UWTC and AWTC with NCBI RefSeq and RDP ( P < 0.01) but either lower or comparable diversity metric values with GG2, SILVA, and GTDB ( P < 0.01). 
With the full-length amplicon sequences, MWTC showed the lowest evenness, Shannon index, and Simpson index compared to UWTC with NCBI RefSeq and RDP ( P < 0.01), but with the V3-V4 amplicon sequences, these alpha diversity metrics were higher or comparable. When GTDB was used, AWTC and MWTC yielded higher Shannon, Simpson, and Inverse Simpson indices at the genus level compared to UWTC ( P < 0.01) but lower indices at the species level ( P < 0.01). To assess differences in microbiota identified by the three taxonomy classifiers (i.e., UWTC, AWTC, and MWTC), we performed beta diversity analysis using PERMANOVA. Significant differences ( P = 0.001) were noted in microbiota composition depicted by the full-length amplicon sequences at the genus level with NCBI RefSeq and RDP, while no significant differences were noted with the other databases ( Fig. 2A ). At the species level, however, the revealed microbiota compositions were significantly affected ( P = 0.001) by the taxonomy classifier used irrespective of databases ( Fig. 2B ). With respect to the microbiota composition revealed by the V3-V4 amplicon sequences, significant differences were identified at both the genus and the species levels across all databases ( P = 0.001) ( Fig. 2C, D ). Pairwise comparisons revealed that with the GG2, microbiota profiles classified using UWTC and MWTC were similar, whereas AWTC produced distinct microbiota profiles. Significant differences in microbiota profiles were observed among UWTC, AWTC, and MWTC with all other databases (Table S3). Download figure Open in new tab Fig. 2. PCA plots showing differences in rumen microbiomes among taxonomy classifier types within each database at genus and species levels for full-length amplicon sequences (A) and for V3-V4 amplicon sequences (B) in an in vivo study. Significant differences were observed for all classifiers except for NCBI RefSeq and RDP at the genus level. 
The classifiers were evaluated for taxonomic classification of key rumen microbial taxa and the proportion of successfully annotated taxa, focusing on the top 20 most dominant genera and species. Across all databases, MWTC yielded higher proportion of the top 20 dominant taxa compared to UWTC and AWTC, while maintaining the same taxonomic composition ( Fig. 3 , S2). With NCBI RefSeq and RDP, AWTC assigned some of the full-length and V3-V4 amplicon sequences to Segatella copri , whereas MWTC and UWTC assigned these sequences to Xylanibacter ruminicola ( Fig. 3A , S2B). With other databases, UWTC and MWTC identified similar dominant taxa. With GG2 and GTDB, over 70% of the top 20 taxa were classified as "sp.," whereas with SILVA, more than 80% of the top 20 taxa were annotated as "uncultured." Notably, with SILVA at the species level, AWTC predominantly assigned uncultured microorganisms as "bacterium," whereas MWTC annotated them as "rumen." Download figure Open in new tab Fig. 3. Taxonomic barplot showing the top 20 taxa for each taxonomy classifier in an in vivo study. Taxonomy classification was performed at the species level using full-length amplicon sequences (A) and at the genus level using V3-V4 amplicon sequences (B). In terms of classification counts and fully classified ratios, MWTC outperformed UWTC and AWTC with both the full-length amplicon sequences and V3-V4 amplicon sequences across all databases ( Fig. 4 , 5 ). Specifically, MWTC exhibited similar or higher classification counts at the genus level although it showed lower or no difference in classification counts at the phylum and family levels compared to UWTC and AWTC with the GG2 and SILVA databases. 
At the species level, MWTC increased classification counts by over 5% and 40% with the GG2 and the SILVA databases, respectively, compared with UWTC ( P 20% increase in higher classification counts and fully classified ratios over UWTC and even greater increases over AWTC at both the genus and species levels ( P < 0.001) (Tables S2 and S3). Notably, with SILVA 138.1 version, AWTC performed worse than UWTC with the full-length amplicon sequences. With the V3-V4 amplicon sequences, AWTC also performed worse than UWTC at the genus level but slightly better than MWTC at the species level (Tables S4 and S5). Download figure Open in new tab Fig. 4. Counts of classified ASVs at the phylum, family, genus, and species levels for each database (A) and the fully classified ratios (the proportion of completely classified features relative to the total ASVs) (B) for each database identified using the full-length amplicon sequences in an in vivo study. Classifiers were categorized as follows: UWTC, a classifier without any adjustments; AWTC, a classifier based on average data from the EMPO3 dataset; and MWTC, a classifier manually curated using metagenomic and amplicon sequencing data. Download figure Open in new tab Fig. 5. Counts of classified ASVs at the phylum, family, genus, and species levels for each database (A) and the fully classified ratios (the proportion of completely classified features relative to the total ASVs) (B) for each database identified using the V3-V4 amplicon sequences in an in vivo study. Classifiers were categorized as follows: UWTC, a classifier without any adjustments; AWTC, a classifier based on average data from the EMPO3 dataset; and MWTC, a classifier manually curated using metagenomic and amplicon sequencing data. Classifiers evaluation using in vitro experimental samples In vitro experiments are often used to simulate the rumen environment when conducting preliminary comparisons or when large-scale animal trials are not feasible. 
Therefore, since 16S rRNA amplicon sequences derived from in vitro experiments also require improved resolution through MWTC, we validated MWTC using in vitro experimental samples. The alpha diversity indices demonstrated notable differences across classifiers (Tables S6 and S7). Observed features were statistically significant across all classifiers ( P < 0.05), except for GTDB with full-length amplicon sequences. Consistent with the in vivo datasets, NCBI RefSeq and RDP demonstrated higher observed features with MWTC compared to UWTC ( P < 0.05), whereas GG2 and SILVA exhibited lower or similar values. Shannon, Simpson, and inverse Simpson indices were significant in both full-length and V3-V4 amplicon sequences, except for GG2 and SILVA at the genus level in full-length sequences ( P < 0.05). When used with NCBI RefSeq and RDP, MWTC yielded higher values in these indices compared to UWTC and AWTC, irrespective of sequencing types or taxonomic levels. In contrast, coupled with GG2 and SILVA, MWTC exhibited lower or similar values than UWTC. AWTC displayed inconsistent trends, with the values fluctuating, relative to UWTC. The beta diversity analysis revealed no significant differences as determined with the full-length amplicon sequences at the genus level with the two SILVA database releases (Fig. S1A). In contrast, other genus-level data, except for the SILVA databases, and all species-level data were statistically different in beta diversity ( P < 0.05) (Fig. S1A, B). Both genus- and species-level data determined by the V3-V4 amplicon sequences differed statistically significantly across all databases (Fig. S1C, D), consistent with the in vivo results (Table S8). Similar to the in vivo dataset, MWTC achieved the highest proportion accounted for by the top 20 dominant taxa compared to UWTC (Fig. S3, S4). MWTC showed an overall higher microbial abundance than UWTC. 
Across all the classifiers, at the species level, with GG2 and GTDB, 75% of the top 20 taxa were classified as "sp.," whereas with SILVA, 90% of the top 20 taxa were annotated as "uncultured" or "unidentified." Regarding classification counts and fully classified ratios, MWTC consistently outperformed UWTC at both the genus and species levels (Fig. S5, S6). With the V3-V4 amplicon sequences, AWTC and MWTC yielded lower classification counts than UWTC at the phylum and family levels when the GG2 and SILVA databases were used. Similarly, with the full-length sequences and the V3-V4 amplicon sequences, AWTC showed lower values in classification counts than UWTC at the genus and species levels when GG2 was used, with a 3.4–5.5% reduction at the genus level and a 5.6–9.5% reduction at the species level. Similarly, when used together with SILVA, AWTC showed a 0.9–9.8% decrease in classification counts at the genus level. In contrast, while MWTC demonstrated lower values in classification counts than UWTC, these were statistically comparable to UWTC (Tables S9 and S10). Interestingly, with the NCBI RefSeq and RDP, MWTC exhibited a higher fully classified ratio at the species level, with improvements ranging from 19.2–64.6% in NCBI RefSeq and 22.8–58.3% in RDP, which exceeded that achieved by UWTC. Error rate estimate The error rates were thoroughly analyzed, taking into account variations across all databases for each amplicon dataset resulting from both in vivo and in vitro environments ( Fig. 6 ). With the NCBI RefSeq, MWTC showed a relative decrease in error rate compared to UWTC, averaging 33.9–62.6% at the genus level and 22.6–47.5% at the species level. Similarly, with RDP, reductions of 25.2–53.5% and 15.7–24.9% were observed at the genus and species levels, respectively. The GTDB database yielded a decrease of 1.7–4.4% at the genus level and 2.9–8.0% at the species level. 
With GG2 and SILVA, reductions were minimal, with a maximum decrease of only 1.2% at the species level. Overall, MWTC exhibited lower error rates than UWTC ( P < 0.05), except at the phylum level with the V3-V4 amplicon sequences. Notably, the average error rate at the genus and species levels was higher with AWTC than with UWTC but lower with MWTC (Table S11). Download figure Open in new tab Fig. 6. Error rate plots for each classification count, including in vivo experimental datasets from full-length amplicon sequences (A) and the V3-V4 amplicon sequences (B) and in vitro datasets from the full-length amplicon sequences (C) and the V3-V4 amplicon sequences (D). Error rates were calculated by comparing the results with those of the BLAST 16S ribosomal RNA database, considering both unnamed and misnamed classifications as errors. Discussion Advancements in sequencing technologies and taxonomy classifiers have improved microbiome characterization, yet existing databases remain heavily human-centric, limiting analysis accuracies of non-human microbiomes like the rumen. While weighted taxonomy classifiers enhance microbial identification, their application to rumen samples may be less precise due to the absence of ruminal microbiota in EMPO habitat types and the unique microbial profiles influenced by ruminant species and diet. This underscores the need for ruminant-specific taxonomy classifiers to improve classification accuracy. To enhance the accuracy of taxonomic classification in rumen microbiome analyses, we suggested assigning weights during the classification process. However, since many microbiome databases do not contain sufficient rumen-specific data, we proposed the development of manually weighted datasets using both shotgun metagenomics and amplicon sequencing data from specific ruminant breeds. 
The MWTC constructed by integrating both shotgun and amplicon datasets could enhance taxonomic resolution. MWTC achieved higher classification counts, fully classified ratios, and a lower error rate than UWTC, demonstrating its capability to provide more accurate annotations, particularly at lower taxonomic levels. This suggests that even with the same ASV dataset, beta diversity analysis results could vary depending on the classifier used, which assigns different taxonomy. This further highlighted that classifiers with different taxonomy weighting methods influence the microbial community profiles differently. MWTC preserves or increases the relative abundance of key microbial taxa present in the rumen samples. With NCBI RefSeq and RDP, AWTC reclassified Xylanibacter ruminicola, classified by UWTC, as Segatella copri. Segatella copri accounts for approximately 30% of the human gut microbiome [37, 38], but it is not dominant in the rumen. In contrast, MWTC retained the annotation of Xylanibacter ruminicola and further enhanced its relative abundance. Additionally, MWTC showed Xylanibacter, Succiniclasticum, Succinivibrio, Sodaliphilus, Ruminococcoides, Butyrivibrio, Treponema, and Fibrobacter, all of which are common genera found in the rumen [9, 39, 40], without a decline in abundance compared to UWTC. These findings suggest that MWTC provides a more accurate classification for rumen microbiota compared to the weighting approach used in AWTC. A previous study had shown that using shotgun metagenome data as a weighting factor increased the taxonomy detection rate [8]. In this study, by incorporating not only shotgun sequencing datasets but also amplicon sequencing datasets as taxonomic weights, MWTC demonstrated higher classification counts and fully classified ratios, along with reduced error rates, than UWTC and AWTC. This indicated that amplicon sequencing datasets can be utilized to provide taxonomic weights while maintaining high resolution. 
However, not all the amplicon datasets are suitable for this purpose. To construct the MWTC, amplicon datasets derived from DNA collected directly from the ruminal fluid of individual animals were utilized. In contrast, amplicon datasets obtained from the in vitro fermentation process may not be ideal for use as weighting datasets, since they originate from mixed cultures gathered from multiple individuals [41, 42] and may include microorganisms introduced during the process of preparing buffer solutions [43, 44]. The selection of taxonomy classifier databases is crucial at lower taxonomic levels. When GG2 and SILVA were used, no differences in beta diversity or error rate were noted. This finding could be attributed to the similar taxonomies implemented in GG2 and SILVA. In GG2, ambiguous annotations, such as ‘CAG,’ ‘RUG (Rumen uncultured genus)’ and ‘sp.’ are frequently observed, while in SILVA, annotations like ‘uncultured,’ ‘unidentified,’ or ‘unknown organism’ are common, both presenting challenges in identifying microbes at lower taxonomic levels and compromising taxonomic and phylogenetic rigor (Supplementary data) [12, 45]. Although AWTC and MWTC showed higher classification counts and fully classified ratios than UWTC, the error rate did not decrease in the GG2 and SILVA databases. The AWTC showed increased error rates compared to the UWTC at both the genus and species levels. However, the MWTC demonstrated lower error rates than the AWTC and overall showed similar or lower error rates than UWTC. This suggests that the MWTC maintained a resolution comparable to that of the UWTC when GG2 and SILVA were used. The GTDB, which serves as the underlying database for GG2, shares the same classification framework as GG2 [15]. 
Similar to GG2, GTDB reflects the characteristics of the database with annotations like ‘CAG,’ ‘RUG (Rumen uncultured genus)’ and ‘spp.’ However, unlike GG2 and SILVA, GTDB did not yield lower classification counts at the phylum or family levels and showed a slight improvement in error rates. There were significant improvements in the V3-V4 amplicon sequences, which may have a lower resolution than the full-length amplicon sequences. Therefore, integration of MWTC with GTDB as the database could be used in future studies. In the cases of NCBI RefSeq and RDP, both classification count and fully classified ratio showed a significant increase with MWTC, accompanied by a substantial reduction in error rates. Although the two databases showed lower classification counts by up to two to three times at the genus and species levels compared to other databases when UWTC was used, applying MWTC narrowed the gap in classification counts, improving to a level achieved by the other databases. Furthermore, the error rate was remarkably improved, exceeding the performance of the other databases. The NCBI RefSeq used in this analysis contained only 16S rRNA gene sequences annotated at the species level, excluding the genomes of uncultured microbes. Similarly, RDP was derived from genomic data provided by NCBI GenBank, EMBL, and DDBJ [46], and its results were highly consistent with those of NCBI RefSeq. Thus, we confirmed that using a weighted taxonomy classifier is highly effective, especially in those that exclude uncultured genomes, and its impact becomes even more pronounced when applied to species-specific analyses, such as ruminants. Unlike other databases that predominantly contain uncultured microbes, NCBI RefSeq and RDP enable the assignment of microbial individuals to lower taxonomic levels, thereby being suitable for increasingly detailed studies on microbes and their communities. 
Conclusion Overall, the use of MWTC improved taxonomic classification resolution in 16S rRNA gene-based microbiome analysis over AWTC, which applies taxonomic weights based on the EMPO database. Notably, the current study confirmed that taxonomic weights could be applied using both shotgun metagenome and amplicon sequence datasets. In particular, the NCBI RefSeq, known for its strict taxonomy and low resolution, showed a significant improvement in taxonomic resolution when MWTC was applied. This enhancement facilitated species-level microbial analysis, making it more effective than other databases that rely on ambiguous annotations. Thus, we recommend using NCBI RefSeq for species-level classification, and its performance will significantly improve when applying MWTC. Data availability All codes and prebuilt classifiers used for data analysis are available at https://github.com/6seok/rumanclass . Declarations Competing Interests The authors declare no competing interests. Author’s Contributions R. K. contributed to the conception, methodology, and data curation and wrote the original manuscript. Z. Y. and T. P. reviewed and edited the manuscript. All authors reviewed the final version. Author details 1 Department of Animal Science and Technology, Chung-Ang University, Anseong-si, Gyeonggi-do 17546, Republic of Korea 2 Department of Animal Sciences, The Ohio State University, Columbus, OH 43210, USA Acknowledgments Special thanks to Prof. Jakyeom Seo and Dr. Hanbeen Kim for providing the Hanwoo amplicon sequencing dataset used for validation. Footnotes https://github.com/6seok/rumanclass/ List of abbreviations EMPO Earth Microbiome Project Ontology PERMANOVA Permutational multivariate analysis of variance UWTC Unweighted taxonomy classifier AWTC Average weighted taxonomy classifier MWTC Manually weighted taxonomy classifier GG2 most recent Greengenes database (GG2) References 1. ↵ Xu , Q. , et al. 
, Gut Microbiota and Their Role in Health and Metabolic Disease of Dairy Cow . Frontiers in Nutrition , 2021 . 8 . 2. ↵ Russell , J.B. , R.E. Muck , and P.J. Weimer , Quantitative analysis of cellulose degradation and growth of cellulolytic bacteria in the rumen . FEMS Microbiology Ecology , 2009 . 67 ( 2 ): p. 183 – 197 . OpenUrl CrossRef PubMed Web of Science 3. ↵ Pollock , J. , et al. , The Madness of Microbiome: Attempting To Find Consensus “Best Practice” for 16S Microbiome Studies . Applied and Environmental Microbiology , 2018 . 84 ( 7 ): p. e02627 – 17 . OpenUrl PubMed 4. ↵ Seshadri , R. , et al. , Cultivation and sequencing of rumen microbiome members from the Hungate1000 Collection . Nature Biotechnology , 2018 . 36 ( 4 ): p. 359 – 367 . OpenUrl CrossRef PubMed 5. HoloRuminant Consortium , HoloRuminant – Understanding microbiomes of the ruminant holobiont . 2021 , European Commission: European Union . p. 2021 – 2025 . 6. Wilkinson , T.J. , et al. , CowPI: A Rumen Microbiome Focussed Version of the PICRUSt Functional Inference Software . Frontiers in Microbiology , 2018 . 9 . 7. ↵ McGovern , E. , et al. , Evaluating Established Methods for Rumen 16S rRNA Amplicon Sequencing With Mock Microbial Populations . Frontiers in Microbiology , 2018 . 9 . 8. ↵ Kaehler , B.D. , et al. , Species abundance information improves sequence taxonomy classification accuracy . Nature Communications , 2019 . 10 ( 1 ): p. 4643 . OpenUrl CrossRef PubMed 9. ↵ Henderson , G. , et al. , Rumen microbial community composition varies with diet and host, but a core microbiome is found across a wide geographical range . Scientific Reports , 2015 . 5 ( 1 ):p. 14567 . OpenUrl CrossRef PubMed 10. ↵ Islam , M. , et al. , Holstein and Jersey Steers Differ in Rumen Microbiota and Enteric Methane Emissions Even Fed the Same Total Mixed Ration . Frontiers in Microbiology , 2021 . 12 . 11. ↵ Bolyen , E. , et al. 
, Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2 . Nature Biotechnology , 2019 . 37 ( 8 ): p. 852 – 857 . OpenUrl CrossRef PubMed 12. ↵ Robeson , M.S. , et al. , RESCRIPt: Reproducible sequence taxonomy reference database management . PLoS Comput Biol , 2021 . 17 ( 11 ): p. e1009581 . OpenUrl CrossRef PubMed 13. ↵ Quast , C. , et al. , The SILVA ribosomal RNA gene database project: improved data processing and web-based tools . Nucleic Acids Research , 2012 . 41 ( D1 ): p. D590 – D596 . OpenUrl CrossRef PubMed Web of Science 14. ↵ Parks , D.H. , et al. , GTDB: an ongoing census of bacterial and archaeal diversity through a phylogenetically consistent, rank normalized and complete genome-based taxonomy . Nucleic Acids Research , 2021 . 50 ( D1 ): p. D785 – D794 . OpenUrl 15. ↵ McDonald , D. , et al. , Greengenes2 unifies microbial data in a single reference tree . Nature Biotechnology , 2024 . 42 ( 5 ): p. 715 – 718 . OpenUrl CrossRef PubMed 16. ↵ Wang , Q. and J.R. Cole , Updated RDP taxonomy and RDP Classifier for more accurate taxonomic classification . Microbiology Resource Announcements , 2024 . 13 ( 4 ): p. e01063 – 23 . OpenUrl 17. ↵ Thompson , L.R. , et al. , A communal catalogue reveals Earth’s multiscale microbial diversity . Nature , 2017 . 551 ( 7681 ): p. 457 – 463 . OpenUrl CrossRef PubMed 18. ↵ Chen , S. , et al. , fastp: an ultra-fast all-in-one FASTQ preprocessor . Bioinformatics , 2018 . 34 ( 17 ): p. i884 – i890 . OpenUrl CrossRef PubMed 19. ↵ Langmead , B. and S.L. Salzberg , Fast gapped-read alignment with Bowtie 2 . Nature Methods , 2012 . 9 ( 4 ): p. 357 – 359 . OpenUrl CrossRef PubMed 20. ↵ Kopylova , E. , L. Noé , and H. Touzet , SortMeRNA: fast and accurate filtering of ribosomal RNAs in metatranscriptomic data . Bioinformatics , 2012 . 28 ( 24 ): p. 3211 – 3217 . OpenUrl CrossRef PubMed Web of Science 21. ↵ Magoč , T. and S.L. 
Salzberg , FLASH: fast length adjustment of short reads to improve genome assemblies . Bioinformatics , 2011 . 27 ( 21 ): p. 2957 – 2963 . OpenUrl CrossRef PubMed Web of Science 22. ↵ Amir , A. , et al. , Deblur Rapidly Resolves Single-Nucleotide Community Sequence Patterns . mSystems , 2017 . 2 ( 2 ): p. doi: 10.1128/msystems.00191-16 . OpenUrl CrossRef 23. ↵ Wang , Q. , et al. , Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy . Appl Environ Microbiol , 2007 . 73 ( 16 ): p. 5261 – 7 . OpenUrl Abstract / FREE Full Text 24. ↵ Pedregosa , F. , et al. , Scikit-learn: Machine Learning in Python . Journal of Machine Learning Research , 2011 . 12 ( null ): p. 2825 – 2830 . OpenUrl 25. ↵ DeSantis , T.Z. , et al. , Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB . Appl Environ Microbiol , 2006 . 72 ( 7 ): p. 5069 – 72 . OpenUrl Abstract / FREE Full Text 26. ↵ Pielou , E.C ., The measurement of diversity in different types of biological collections . Journal of theoretical biology , 1966 . 13 : p. 131 – 144 . OpenUrl CrossRef Web of Science 27. ↵ Shannon , C.E ., A mathematical theory of communication . The Bell system technical journal , 1948 . 27 ( 3 ): p. 379 – 423 . OpenUrl CrossRef Web of Science 28. ↵ Simpson , E. , Measurement of Diversity . Nature , 1949 . 163 . 29. ↵ Camacho , C. , et al. , BLAST+: architecture and applications . BMC Bioinformatics , 2009 . 10 ( 1 ): p. 421 . OpenUrl CrossRef PubMed 30. ↵ National Center for Biotechnology Information . Prokaryote phyla added to the NCBI taxonomy database . 2021 ; Available from: https://ncbiinsights.ncbi.nlm.nih.gov/2021/12/10/ncbi-taxonomy-prokaryote-phyla-added/ . 31. ↵ Hitch , T.C.A. , et al. , A taxonomic note on the genus Prevotella: Description of four novel genera and emended description of the genera Hallella and Xylanibacter . Syst Appl Microbiol , 2022 . 45 ( 6 ): p. 126354 . OpenUrl CrossRef 32. ↵ Martinez Arbizu , P. 
, pairwiseAdonis: Pairwise multilevel comparison using adonis . R package version 0.4 , 2020 . 1 . 33. ↵ Dixon , P ., VEGAN, a package of R functions for community ecology . Journal of vegetation science , 2003 . 14 ( 6 ): p. 927 – 930 . OpenUrl CrossRef Web of Science 34. ↵ Benjamini , Y. and Y. Hochberg , Controlling the False Discovery Rate - a Practical and Powerful Approach to Multiple Testing . Journal of the Royal Statistical Society Series B-Statistical Methodology , 1995 . 57 ( 1 ): p. 289 – 300 . OpenUrl CrossRef PubMed Web of Science 35. ↵ Menke , K.H. , et al. , The estimation of the digestibility and metabolizable energy content of ruminant feedingstuffs from the gas production when they are incubated with rumen liquor in vitro . The Journal of Agricultural Science , 1979 . 93 ( 1 ): p. 217 – 222 . OpenUrl CrossRef 36. ↵ Shaw , C.A. , et al. , A Comparison of Three Artificial Rumen Systems for Rumen Microbiome Modeling . Fermentation , 2023 . 9 ( 11 ): p. 953 . OpenUrl CrossRef 37. ↵ Xiao , X. , et al. , Segatella copri strains adopt distinct roles within a single individual’s gut . bioRxiv , 2024 : p. 2024.05.20.595015 . 38. ↵ Panwar , D. , et al. , Transcriptional delineation of polysaccharide utilization loci in the human gut commensal Segatella copri DSM18205 and co-culture with exemplar Bacteroides species on dietary plant glycans . Appl Environ Microbiol , 2025 . 91 ( 1 ): p. e0175924 . OpenUrl CrossRef PubMed 39. ↵ Qi , W. , et al. , — Invited Review — Understanding the functionality of the rumen microbiota: searching for better opportunities for rumen microbial manipulation . Anim Biosci , 2024 . 37 ( 2 ): p. 370 – 384 . OpenUrl CrossRef PubMed 40. ↵ Buckner , A.M. , et al. , The selective culture and enrichment of major rumen bacteria on three distinct anaerobic culture media . bioRxiv , 2025 : p. 2024.08.23.608987 . 41. ↵ Kang , R. , et al. 
, Effects of diets for three growing stages by rumen inocula donors on in vitro rumen fermentation and microbiome . Journal of Animal Science and Technology , 2023 . 42. ↵ Jeong , J. , et al. , Application of propionate-producing bacterial consortium in ruminal methanogenesis inhibited environment with bromoethanesulfonate as a methanogen direct inhibitor . Frontiers in Veterinary Science , 2024 . 11 . 43. ↵ McDougall , E.I ., Studies on ruminant saliva. 1. The composition and output of sheep’s saliva . Biochem J , 1948 . 43 ( 1 ): p. 99 – 109 . OpenUrl FREE Full Text 44. ↵ Goering , H. , Forage fiber analyses (apparatus, reagents, procedures, and some applications) . 1970 . 45. ↵ Ceccarani , C. and M. Severgnini , A comparison between Greengenes, SILVA, RDP, and NCBI reference databases in four published microbiota datasets . bioRxiv , 2023 : p. 2023.04.12.535864 . 46. ↵ Cole , J.R. , et al. , The Ribosomal Database Project (RDP-II): previewing a new autoaligner that allows regular updates and the new prokaryotic taxonomy . Nucleic Acids Res , 2003 . 31 ( 1 ): p. 442 – 3 . OpenUrl CrossRef PubMed Web of Science 47. Kang , R. , et al. , Impact of Forage Sources on Ruminal Bacteriome and Carcass Traits in Hanwoo Steers During the Late Fattening Stages . Microorganisms , 2024 . 12 ( 10 ): p. 2082 . OpenUrl CrossRef PubMed 48. Song , J. , et al. , Effects of Sampling Techniques and Sites on Rumen Microbiome and Fermentation Parameters in Hanwoo Steers . Journal of Microbiology and Biotechnology , 2018 . 28 ( 10 ): p. 1700 – 1705 . OpenUrl CrossRef PubMed 49. Kim , M. , et al. , Association between Rumen Microbiota and Marbling Score in Korean Native Beef Cattle . Animals , 2020 . 10 ( 4 ): p. 712 . OpenUrl CrossRef PubMed 50. Bharanidharan , R. , et al. , Feeding Systems and Host Breeds Influence Ruminal Fermentation, Methane Production, Microbial Diversity and Metagenomic Gene Abundance . Frontiers in Microbiology , 2021 . 12 . View the discussion thread. 
Back to top Previous Next Posted March 14, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Manually weighted taxonomy classifiers improve species-specific rumen microbiome analysis compared to unweighted or average weighted taxonomy classifiers Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Manually weighted taxonomy classifiers improve species-specific rumen microbiome analysis compared to unweighted or average weighted taxonomy classifiers Ryukseok Kang , Zhongtang Yu , Tansol Park bioRxiv 2025.03.12.642789; doi: https://doi.org/10.1101/2025.03.12.642789 Share This Article: Copy Citation Tools Manually weighted taxonomy classifiers improve species-specific rumen microbiome analysis compared to unweighted or average weighted taxonomy classifiers Ryukseok Kang , Zhongtang Yu , Tansol Park bioRxiv 2025.03.12.642789; doi: https://doi.org/10.1101/2025.03.12.642789 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7616) Biochemistry (17625) Bioengineering (13851) Bioinformatics (41824) Biophysics (21397) Cancer Biology (18524) Cell Biology (25417) Clinical Trials (138) Developmental Biology (13350) Ecology (19858) Epidemiology (2067) Evolutionary Biology (24277) Genetics (15580) Genomics 
(22459) Immunology (17698) Microbiology (40278) Molecular Biology (17134) Neuroscience (88400) Paleontology (666) Pathology (2823) Pharmacology and Toxicology (4812) Physiology (7632) Plant Biology (15106) Scientific Communication and Education (2042) Synthetic Biology (4281) Systems Biology (9807) Zoology (2266)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00