Optimization of denoising and filtering parameters of DADA2 for QIIME2 amplicon metagenomics data analysis

doi:10.1101/2025.06.24.661437

Optimization of denoising and filtering parameters of DADA2 for QIIME2 amplicon metagenomics data analysis

2025 · doi:10.1101/2025.06.24.661437

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 36,793 characters · extracted from preprint-html · click to expand

Optimization of denoising and filtering parameters of DADA2 for QIIME2 amplicon metagenomics data analysis | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Optimization of denoising and filtering parameters of DADA2 for QIIME2 amplicon metagenomics data analysis View ORCID Profile Moirangthem Goutam Singh , View ORCID Profile Romi Wahengbam doi: https://doi.org/10.1101/2025.06.24.661437 Moirangthem Goutam Singh 1 Center for Infectious Diseases, Biological Sciences and Technology Division, CSIR-North East Institute of Science and Technology , Jorhat 785006, Assam, India 2 Academy of Scientific and Innovative Research (AcSIR) , Ghaziabad 201002, Uttar Pradesh, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Moirangthem Goutam Singh Romi Wahengbam 1 Center for Infectious Diseases, Biological Sciences and Technology Division, CSIR-North East Institute of Science and Technology , Jorhat 785006, Assam, India 2 Academy of Scientific and Innovative Research (AcSIR) , Ghaziabad 201002, Uttar Pradesh, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Romi Wahengbam For correspondence: romiwahengbam{at}neist.res.in Abstract Full Text Info/History Metrics Preview PDF Abstract Background High-throughput sequencing generates vast data, often containing low-quality bases, chimeras, and artifacts that can mislead taxonomic classification and diversity assessments. DADA2 enhances taxonomic resolution by excluding low-quality bases and optimizing ASV inference. Proper truncation reduces computational load while maintaining key hypervariable regions for accurate classification. In this study, we examine the effect of various truncation lengths during the DADA2 analysis in ensuring statistical robustness and improving the reliability of microbial community profiling in ecological and environmental studies. Results Truncation of read length improves the quality reads recovery rate, and preserves microbial diversity in the V4 hypervariable region of Illumina paired-end reads. Conclusion Incorporating the best truncation length strategy optimizes the reads recovery and preserves the richness and evenness of microbial communities. Background In microbial ecology and environmental biology, analyzing microbial communities through high-throughput sequencing has revolutionized our understanding of biodiversity, ecological interactions, and ecosystem functioning. QIIME 2 (Quantitative insights into microbial ecology), one of the most widely used platforms for analyzing and interpreting microbiome data, offers comprehensive tools for processing and visualizing large sets of sequencing data. Among the various preprocessing steps, trimming the sequence length of reads is crucial to ensure the accuracy and reliability of downstream analyses [ 1 , 2 ]. High-throughput sequencing technologies, such as Illumina sequencing, generate vast amounts of sequencing data that provide insights into the composition and function of microbial communities. However, raw sequences often come with inherent issues such as low-quality bases, chimeras, and non-target (host or contaminant) sequences. These factors can dramatically affect the quality of the data and, consequently, the interpretations drawn from it [ 3 ]. Quality control is a fundamental aspect of sequence analysis that determines the integrity and validity of results. Trimmed sequences help polish the dataset’s quality, allowing for better detection of true microbial diversity and accurate taxonomic assignment [ 4 ]. Low-quality sequences can introduce noise into the dataset, potentially leading researchers to erroneous conclusions regarding the composition and structure of microbial communities. By trimming sequences, researchers can remove parts of reads that do not meet quality standards, significantly increasing the overall data quality [ 5 ]. Sequencing artifacts, including errors introduced during the sequencing process, can result in misinterpretations of microbial community structure. This can involve substitutions, insertions, deletions, or biases associated with specific sequences [ 6 ]. Trimming identified low-quality bases from the beginning and end of sequences helps eliminate potential mismatches during alignment and reduces the likelihood of assigning incorrect taxonomy to operational taxonomic units (OTUs) or amplicon sequence variants (ASVs) [ 7 ]. Effective trimming can also reduce the creation of chimeric sequences, which arise when two sequences join together, leading to false inflation of biodiversity estimates [ 8 ]. Performance in downstream statistical analyses relies heavily on the quality of input data. Tools used for diversity analyses, such as alpha and beta diversity metrics, depend on accurately representing community richness and evenness. If sequences are left untrimmed, analyses can produce misleading results, with potential increases in false positives and negatives [ 9 ]. For instance, alpha diversity measures such as Shannon or Simpson indices can be disproportionately influenced if low-quality sequences are included in these calculations. Addressing these concerns through effective trimming ensures that all statistical assumptions and models are based on high-quality, reliable data. The specifics of trimming often involve determining an appropriate “cut-off” length for sequences. This cut-off is typically chosen based on several criteria. The quality score associated with each base in a sequence is a critical determinant in the decision to trim. The Phred score, commonly used in sequencing data, provides a logarithmic representation of base call accuracy. Sequences that fall below a predetermined threshold (for example, Q20 or Q30) are often candidates for trimming or exclusion [ 10 ]. Sequences that are shorter than a defined minimum length after trimming may not contain enough information for accurate taxonomic assignment. Trimming sequences to a consistent length ensures that subsequent analyses within a study are based on comparable data breadth across samples [ 2 ]. For paired-end sequencing, overlapping regions between forward and reverse reads may need to be trimmed to yield optimal merged reads. This step enhances the accuracy of read merging and helps reduce the generation of uninformative sequences [ 8 ]. In many studies, particular regions (like the 16S rRNA gene) are targeted for amplification. Trimming ensures that only the regions of interest are retained, mitigating the influence of extraneous information that can arise from untrimmed reads [ 3 ]. Within the QIIME 2 framework, DADA2 plays a crucial role in the analysis pipeline, facilitating accurate taxonomic classification and community structure analysis. The integration of DADA2 in QIIME enables researchers to obtain more reliable and reproducible results in ecological studies of microbial populations [ 2 ]. Truncating sequences to a specific length during DADA2 analysis in QIIME 2 is an essential strategy for improving the quality and accuracy of microbial community profiling. This process involves shortening the sequences, particularly the ends where low-quality bases may occur, which can lead to sequencing errors and affect downstream analyses. The effectiveness of this technique is underscored by several key factors that support its application in microbiome research. One of the primary benefits of truncation is the significant reduction of sequencing errors. High-throughput sequencing technologies, while powerful, often generate low-quality bases at the ends of the read, especially in the context of longer reads. These base call inaccuracies can mislead taxonomic assignments and inflate measures of diversity. DADA2 improves its performance by using quality scores to inform truncation length decisions [ 2 ]. By defining a truncation length that excludes low-quality bases, researchers ensure that only the most reliable data are retained for analysis. DADA2 employs a denoising algorithm that is sensitive to the quality of the input sequences. When poor-quality bases are present, the algorithm may struggle to accurately distinguish true sequences from artifacts or noise [ 7 ]. Truncating the sequences to a specific length helps create a more stable input by excluding these problematic ends, thereby allowing DADA2 to generate more reliable amplicon sequence variants (ASVs) representative of the actual microbial community. It has been demonstrated that appropriate truncation, in conjunction with quality filtering, significantly enhances the accuracy of ASV inference [ 1 ]. Truncating sequences to a length that maintains the core hypervariable regions (e.g., V3-V4 or V4) enables effective taxonomic resolution while reducing the likelihood of including less informative or error-prone sequence data. Many studies have shown that retaining specific length ranges where the sequence variability is high contributes positively to the precision of taxonomic classification across complex microbial communities [ 10 ]. This approach is particularly useful for distinguishing closely related taxa, allowing for more robust interpretations of microbial diversity and community structure. Truncating sequences also streamlines the computational burden in downstream analyses. Longer reads can lead to increased processing times and memory usage. By specifying an optimal truncation length, researchers can reduce the complexity of the data, making analyses such as diversity assessments or statistical comparisons more manageable [ 11 ]. This efficiency is especially important during large-scale microbiome studies, where datasets can become quite extensive. This study aims to contribute to the body of knowledge by conducting a comprehensive analysis of truncation length through a comparative lens. We will identify best practices by assessing already published datasets and provide evidence-based recommendations for determining optimal truncation lengths. The discussion will highlight how these practices enhance the clarity and precision of research findings and ensure that valuable information is not inadvertently discarded. Results Quality control Assessment A rigorous quality assessment of the sequences was conducted before the execution of DADA2 and subsequent analyses ( Fig. 1 ). Forward reads exhibited high-quality sequence bases at the onset of the sequencing cycle, with a notable decline in quality towards the end ( Fig. 1a ). In contrast, reverse reads displayed comparatively lower base quality throughout the cycle ( Fig. 1b ). To evaluate the impact of trimming, a series of truncation lengths—specifically 0, 170, 175, 180, 185, 190, 195, and 200 bases—were applied to both forward and reverse reads. Download figure Open in new tab Fig. 1 Representative box plot of quality scores over positions in sequenced reads generated using a random sampling of 10000 out of 2277091 sequences without replacement. ( a ) quality distribution for forward reads ( b) quality distribution for reverse reads. The upper whisker, top of box, middle box, bottom box, and lower whisker represent 91 st , 75 th , 50 th , 25 th , and 9 th percentile quality scores, respectively. Good reads count The DADA2 analysis unequivocally indicated that the parameter with no truncation (trunc_length_0) produced the highest counts of filtered reads, denoised reads, merged reads, and non-chimeric reads ( Tab. 1 ). Subsequent truncation lengths (170, 175, 180, 185, 190) resulted in progressively reduced read counts, with trunc_len_200 yielding the lowest. The percentage difference between input and filtered reads was remarkably minimal for trunc_length_0 at 25.18%, while approximately 45% of denoised reads were lost during the merging of paired-end sequences ( Fig. 2 ). Importantly, trunc_len_175 and trunc_length_170 accounted for losses of 60.31% and 59.33% of input reads, respectively, during the filtering phase. Therefore, the no-truncation parameter decisively yielded the maximum counts of non-chimeric reads View this table: View inline View popup Table 1. Filtered statistics of reads after DADA2 filtering. Download figure Open in new tab Fig. 2 Percentage difference between different filtering step in DADA2 denoising. OTU clustering, taxonomic classification, removal of singleton, and doubleton, and filtering of low-frequency and low-abundant features and frequencies The quantification of frequencies and features was methodically assessed at every stage of OTU clustering, taxonomic classification, removal of singletons and doubletons, and filtering of low-frequency and low-abundant features. Trunc_len_0 emerged with the highest number of frequencies, followed closely by trunc_len_170 and trunc_len_175, while trunc_len_200, as expected, recorded the lowest ( Tab. 2 ). Furthermore, the maximum number of features was definitively observed at trunc_len_195, featuring 359 identifiable features ( Tab. 3 ). View this table: View inline View popup Download powerpoint Table. 2: Number of feature frequencies after various filtering step viz: DADA2 denoising, OUT Clustering, Taxonomic classification, singleton and doubleton filtering, low frequency filtering and low abundant filtering. View this table: View inline View popup Table. 3: Number of features after different filtering steps viz: DADA2 denoising, OUT Clustering, Taxonomic classification, singleton and doubleton filtering, low-frequency filtering, and low abundant filtering. Diversity analysis Alpha diversity analysis is a pivotal component in microbiome research, offering insights into the richness and evenness of microbial communities within individual samples. High diversity is often associated with ecosystem stability, resilience, and functionality, particularly in gut microbiota studies, whereas low diversity has been linked to dysbiosis and various diseases, including inflammatory bowel disease (IBD), obesity, and diabetes. In this study, while trunc_len_0 yielded the highest number of good reads, it correspondingly revealed the lowest alpha diversity ( Fig. 3a ), as also indicated by the rarefaction curve presented in Fig. 4a . Alpha diversity was observed to be greater in group 1 than in group 2, a finding that diverges from conventional expectations. The remaining truncation parameters displayed comparable alpha diversity indices, and the rarefaction curves illustrated a consistent pattern in the relationship between richness and sequencing depth ( Fig. 3 and 4 ). Download figure Open in new tab Fig. 3 Distribution of alpha diversity indices of different truncation length during DADA2 analysis ( a ) truncation length at 0 ( b ) truncation length at 200 ( c ) truncation length at 195 ( d ) truncation length at 190 ( e ) truncation length at 185 ( f ) truncation length at 180 ( g ) truncation length at 175 ( h ) truncation length at 170. Download figure Open in new tab Fig. 4 Alpha rarefaction curves ( a ) truncation length at 0 ( b ) truncation length at 200 ( c ) truncation length at 195 ( d ) truncation length at 190 ( e ) truncation length at 185 ( f ) truncation length at 180 ( g ) truncation length at 175 ( h ) truncation length at 170. Taxa abundance Relative abundance at the phyla level revealed three distinct and significant patterns across the various truncation parameters. In the absence of truncation (trunc_len_0), the phylum Firmicutes_D was prominently dominant, followed by Bacteroidota and Firmicutes_A. Conversely, for trunc_len_200, trunc_len_195, trunc_len_190, trunc_len_185, and trunc_len_180, Firmicutes_A was the predominant phylum, succeeded by Firmicutes_D and Bacteroidota. Remarkably, in trunc_len_175 and 170, Bacteroidota emerged as the more prominent phylum compared to Firmicutes_D ( Fig. 5 ). ANCOM comparisons conclusively revealed significant abundance of Bacteroidota and Firmicutes_D in trunc_len_0, whereas Bacteroidota and Firmicutes_B were significantly more prevalent in trunc_len_200, 185, 180, 175, and 170 for group 2 ( Fig. 6 a, b, e, f, g, h). Conversely, the phyla Proteobacteria and Verrucomicrobiota showed significant reductions across all truncation parameters except for trunc_len_0 ( Fig. 6 ). This disparity at the phylum level was further substantiated by the analysis of the top 20 species. The highest species-level relative abundances were definitively documented in trunc_len_0, followed by trunc_len_170, 200, 175, 180, 185, 195, and 190. The increased relative abundance in trunc_len_0 predominantly stemmed from the significant presence of species such as Phocaeicola sartorii , Cryptobacteroidetes sp.009774765, and CAG-485 sp002493045 from the phylum Bacteroidota, as well as Limosilactobacillus spp. and Faecalibaculum rodentium from the phylum Firmicutes_D ( Fig. 7 ). This underscores the critical influence of sequencing parameters on microbial community structure and diversity, emphasizing the need for careful consideration of truncation strategies in microbiome studies. Download figure Open in new tab Fig. 5 Ribbon chart showing the relative abundance of different truncation lengths at phylum level. Download figure Open in new tab Fig. 6 ANCOM volcano plot of differential abundance between group 1 and group 2 data set ( a ) truncation length at 0 ( b ) truncation length at 200 ( c ) truncation length at 195 ( d ) truncation length at 190 ( e ) truncation length at 185 ( f ) truncation length at 180 ( g ) truncation length at 175 ( h ) truncation length at 170. Download figure Open in new tab Fig. 7 Barplot of relative abundance of different truncation lengths at species level. Discussion The analysis of different truncation lengths in sequencing data is integral to optimizing the accuracy and completeness of microbiome studies. In our research, we systematically evaluated multiple truncation lengths, concluding that trunc_len_175 strikes an ideal balance between maintaining high read recovery rates and preserving microbial diversity. A similar study by Lee et al. [ 12 ] demonstrates the trimming condition for DADA2 analysis in QIIME2. In the study, they used the V3-V4 region of human oral microbiota samples and optimized the minimum overlapping length at 16 bps with a Phred quality score of 20. In our study, we have kept the Phred quality score at 18 [ 13 ] and the overlapping length was kept as default so that maximum overlap can be achieved and did not compromise with richness of the microbial community. This finding aligns with and expands upon existing literature, demonstrating that careful parameter selection is essential for interpreting microbial community structures effectively. Previous studies have highlighted the impact of truncation lengths on data quality and community representation. Callahan et al. [ 2 ] emphasized that excessive truncation can lead to significant reductions in the number of usable sequences, thereby diminishing the presence of less abundant taxa. Their analysis found that a truncation length of 220 bases optimized read quality while still capturing the richness of the microbial community. Similarly, McMurdie and Holmes [ 14 ] found that more stringent truncation adversely affected alpha diversity, as high-throughput sequencing often fails to recover rare taxa that contribute to overall community dynamics, which mirrors our findings where trunc_len_0 produced the most sequences but led to low diversity metrics. Our results showed that trunc_len_0 yielded the highest overall read counts but was associated with the lowest alpha diversity ( Fig. 3a ), indicating that while this parameter captured a broad spectrum of reads, it disproportionately favored highly abundant OTUs. This phenomenon is consistent with conclusions drawn from McKnight et al. [ 15 ], which noted that untrimmed reads may skew results toward dominant taxa, potentially masking the presence of low-abundance species that could be crucial for ecological insights. In contrast, trunc_len_200, aimed at improving data quality by filtering out shorter reads, resulted in a significant loss of data. This finding echoes the work of Bokulich et al. [ 1 ], who established that overly strict filtering criteria hindered the recovery of diverse microbial populations. Their research illustrated that essential taxa might be omitted if truncation thresholds are set too stringently, emphasizing the importance of balancing quality control with adequate representation of microbial diversity. Our analysis identified trunc_len_175 as the most effective strategy. This parameter not only maintained a substantial number of filtered and non-chimeric reads ( Tab. 1 ) but also positively impacted the representation of key phyla. For instance, we observed that Bacteroidota became significantly more prominent at this truncation length, which is consistent with findings from Turnbaugh et al. [ 16 ]. Their extensive work on the human gut microbiome underscored the importance of appropriately balanced truncation lengths to characterize community structures accurately, especially when working with abundant and rare taxa alike. Furthermore, the significant shifts in taxa abundance observed at trunc_len_175 align closely with the microbial dynamics reported in studies of similar environments. For example, in a study by Lahti et al. [ 17 ] , a balanced approach to truncation directly influenced the clarity and interpretability of the results regarding Bacteroidota and Firmicutes composition in various hosts. The ability to discern these shifts is critical for understanding the ecological roles that different taxa play in health and disease contexts. Conclusion Based on our findings and corroborated by established literature, trunc_len_175 emerges as the best truncation strategy for microbial sequencing studies. This length not only optimizes read recovery but also preserves the richness and evenness of microbial communities, facilitating more nuanced interpretations of ecological dynamics. Future investigations should incorporate these insights and benchmark against established datasets to refine truncation protocols further. By ensuring that methodological choices in high-throughput sequencing are informed by empirical evidence, researchers can enhance the reliability and applicability of microbiome research outcomes. Methods Sample data The sample data set consists of 20 sequences of mice fecal samples selected from our study, 11 samples from the group fed with atherogenic diet (Group1) and nine samples from the group fed with normal chow diet (Group2) (Bioproject no. PRJNA1143515). These samples were sequenced in Illumina MiSeq using 2X250bp paired-end technology. We randomly selected samples of the V4 hypervariable region with accession codes SRS22237203, SRS22237228, SRS22237243, SRS22237244, SRS22237247, SRS22237252, SRS22237255, SRS22237256, SRS22237295, SRS22237296, SRS22237298, SRS22237303, SRS22237304, SRS22237316, SRS22237325, SRS22237326, SRS22237327, SRS22237328, SRS22237329, SRS22237432. QIIME2 analysis QIIME2 [ 18 ](Version 2023.7) was used for this study. The raw reads fastq files were imported using the input format “PairedEndFastqManifestPhred33V2” in QIIME2 artifact file .qza. The artifact files were quality filtered, denoised, joined reads, chimera removed, and dereplicated similar sequences into amplicon sequence variants (ASVs) in DADA2 [ 19 ]. In the DADA2 analysis, the Phred score was kept at 18 [ 13 ], the V4 primer sequences were trimmed, the truncation length was set at 0, 200, 195, 190, 185, 180, 175, and 170 for both forward and reverse reads. The rest of the parameters were set by default. OTU clustering was done using the 16S_full-length Greengenes2 (version 2022.10) database at 99% similarity. Taxonomic classification was performed based on the Naïve Bayes classifier Greengenes2 2022.10 515F/806R V4 region. The classified features were subjected to stringent quality filters. Host DNA including the mitochondria/ chloroplast, features having one or two frequencies, and features and frequencies less than 0.01% were removed during the filtering steps. Microbiome composition (ANCOM) was analysed on the phylum level to determine differential abundance among the groups of various truncation length parameters. The alpha-rarefaction curve was analyzed for the richness matrix with a maximum sequencing depth of 10000, and the rest of the parameters were set by default. Alpha diversity and statistical analysis The feature ASVs table, taxonomy file, and metadata were imported as .csv files in MicrobiomAnalyst 2.0 [ 20 ] and alpha diversity index was calculated. Bar graph, box plot, ribbon plot, and volcano plot were prepared using OriginPro, version 2024 (OriginLab Corporation, Northampton, MA, USA.). Alpha rarefaction curve data was imported into R and plotting was done using ggplot2 [ 21 ], tidyr [ 22 ] and dplyr [ 23 ]. Abbreviation ASV Amplicon Sequence variants DADA Divisive Amplicon Denoising Algorithm OTU Operational Taxonomic Unit QIIME Qualitative insights into microbial ecology Author contribution W.R. developed the detailed protocols and oversaw all stages of the study, including critical inputs and the manuscript drafting process. M.G.S. conceptualized the study, performed data analysis, generated the empirical data, and wrote the manuscript. All authors reviewed and edited the final draft of the manuscript and collectively approved the decision to submit the manuscript for publication consideration. Funding Not applicable Availability of data and materials Raw data are available in NCBI SRA under Bioproject No. PRJNA1143515 Ethics approval and consent to participate Not applicable Consent for publication Not applicable Competing interests The authors declare that they have no competing interests. Acknowledgement This study received funding from the Indian Council of Medical Research (ICMR), New Delhi, India, under its Junior Research Fellowship program, awarded to M.G.S. (JRF No.: 3/1/3/JRF-2019/HRD-070(31035), Project No. GAP0797). The Council of Scientific and Industrial Research (CSIR), New Delhi, and the CSIR-NEIST, Jorhat, Assam, are gratefully acknowledged for providing the necessary facilities, thereby facilitating the smooth progression of the study. The authors also thank the Publication & Intellectual Property Rights Committee, CSIR-NEIST, Jorhat for approving the manuscript for publication (Manuscript No: CSIR-NEIST/PUB/2025/026, dated 11-02-2025). Reference 1. ↵ Bokulich NA , Subramanian S , Faith JJ , Gevers D , Gordon JI , Knight R , et al. Quality-filtering vastly improves diversity estimates from Illumina amplicon sequencing . Nat Methods . 2013 ; 10 : 57 – 9 . OpenUrl CrossRef PubMed Web of Science 2. ↵ Callahan BJ , McMurdie PJ , Rosen MJ , Han AW , Johnson AJA , Holmes SP . DADA2: High-resolution sample inference from Illumina amplicon data . Nat Methods . 2016 ; 13 : 581 – 3 . OpenUrl CrossRef PubMed 3. ↵ Kozich JJ , Westcott SL , Baxter NT , Highlander SK , Schloss PD . Development of a dual-index sequencing strategy and curation pipeline for analyzing amplicon sequence data on the MiSeq Illumina sequencing platform . Appl Environ Microbiol . 2013 ; 79 : 5112 – 20 . OpenUrl Abstract / FREE Full Text 4. ↵ Caporaso JG , Kuczynski J , Stombaugh J , Bittinger K , Bushman FD , Costello EK , et al. QIIME allows analysis of high-throughput community sequencing data . Nat Methods . 2010 ; 7 : 335 – 6 . OpenUrl CrossRef PubMed Web of Science 5. ↵ Schirmer M , Ijaz UZ , D’Amore R , Hall N , Sloan WT , Quince C . Insight into biases and sequencing errors for amplicon sequencing with the Illumina MiSeq platform . Nucleic Acids Res . 2015 ; 43 : e37 . OpenUrl CrossRef PubMed 6. ↵ Amir A , McDonald D , Navas-Molina JA , Kopylova E , Morton JT , Zech Xu Z , et al. Deblur Rapidly Resolves Single-Nucleotide Community Sequence Patterns . mSystems . 2017 ; 2 : doi: 10.1128/msystems.00191-16 . OpenUrl CrossRef 7. ↵ Edgar RC . UPARSE: highly accurate OTU sequences from microbial amplicon reads . Nat Methods . 2013 ; 10 : 996 – 8 . OpenUrl CrossRef PubMed Web of Science 8. ↵ Haas BJ , Gevers D , Earl AM , Feldgarden M , Ward DV , Giannoukos G , et al. Chimeric 16S rRNA sequence formation and detection in Sanger and 454-pyrosequenced PCR amplicons . Genome Res . 2011 ; 21 : 494 – 504 . OpenUrl Abstract / FREE Full Text 9. ↵ Lundberg DS , Yourstone S , Mieczkowski P , Jones CD , Dangl JL . Practical innovations for high-throughput amplicon sequencing . Nat Methods . 2013 ; 10 : 999 – 1002 . OpenUrl CrossRef PubMed 10. ↵ Schloss PD , Westcott SL , Ryabin T , Hall JR , Hartmann M , Hollister EB , et al. Introducing mothur: open-source, platform-independent, community-supported software for describing and comparing microbial communities . Appl Environ Microbiol . 2009 ; 75 : 7537 – 41 . OpenUrl Abstract / FREE Full Text 11. ↵ Hu T , Chitnis N , Monos D , Dinh A . Next-generation sequencing technologies: An overview . Hum Immunol . 2021 ; 82 : 801 – 11 . OpenUrl CrossRef PubMed 12. ↵ Lee S-Y , Yu Y , Chung J , Na HS . Trimming conditions for DADA2 analysis in QIIME2 platform . International Journal of Oral Biology . 2021 ; 46 : 146 – 53 . OpenUrl CrossRef 13. ↵ Mohsen A , Park J , Chen Y-A , Kawashima H , Mizuguchi K . Impact of quality trimming on the efficiency of reads joining and diversity analysis of Illumina paired-end reads in the context of QIIME1 and QIIME2 microbiome analysis frameworks . BMC Bioinformatics . 2019 ; 20 : 581 . OpenUrl CrossRef PubMed 14. ↵ McMurdie PJ , Holmes S. phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data . PLoS One . 2013 ; 8 : e61217 . OpenUrl CrossRef PubMed 15. ↵ McKnight DT , Huerlimann R , Bower DS , Schwarzkopf L , Alford RA , Zenger KR . Methods for normalizing microbiome data: An ecological perspective . Methods in Ecology and Evolution . 2019 ; 10 : 389 – 400 . OpenUrl CrossRef 16. ↵ Pj T , Re L , M H , Cm F-L , R K , Ji G . The human microbiome project . Nature . 2007 ; 449 . 17. ↵ Lahti L , Salojärvi J , Salonen A , Scheffer M , de Vos WM . Tipping elements in the human intestinal ecosystem . Nat Commun . 2014 ; 5 : 4344 . OpenUrl CrossRef PubMed 18. ↵ Bolyen E , Rideout JR , Dillon MR , Bokulich NA , Abnet CC , Al-Ghalith GA , et al. Author Correction: Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2 . Nat Biotechnol . 2019 ; 37 : 1091 . OpenUrl CrossRef 19. ↵ Callahan BJ , Sankaran K , Fukuyama JA , McMurdie PJ , Holmes SP . Bioconductor Workflow for Microbiome Data Analysis: from raw reads to community analyses . F1000Res . 2016 ; 5 : 1492 . OpenUrl 20. ↵ Dhariwal A , Chong J , Habib S , King IL , Agellon LB , Xia J . MicrobiomeAnalyst: a web-based tool for comprehensive statistical, visual and meta-analysis of microbiome data . Nucleic Acids Res . 2017 ; 45 Web Server issue: W180 – 8 . OpenUrl CrossRef PubMed 21. ↵ Wickham H. ggplot2: Elegant Graphics for Data Analysis . 2nd ed. 2016 . Cham : Springer International Publishing : Imprint: Springer; 2016. 22. ↵ Wickham H , Vaughan , Davis , Girlich, Max. tidyr: Tidy Messy Data . 2024 . 23. ↵ Wickham H , François R , Henry L , Müller K , Vaughan , Davis . dplyr: A Grammr of Data Manipulation . 2023 . View the discussion thread. Back to top Previous Next Posted June 27, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Optimization of denoising and filtering parameters of DADA2 for QIIME2 amplicon metagenomics data analysis Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Optimization of denoising and filtering parameters of DADA2 for QIIME2 amplicon metagenomics data analysis Moirangthem Goutam Singh , Romi Wahengbam bioRxiv 2025.06.24.661437; doi: https://doi.org/10.1101/2025.06.24.661437 Share This Article: Copy Citation Tools Optimization of denoising and filtering parameters of DADA2 for QIIME2 amplicon metagenomics data analysis Moirangthem Goutam Singh , Romi Wahengbam bioRxiv 2025.06.24.661437; doi: https://doi.org/10.1101/2025.06.24.661437 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7621) Biochemistry (17645) Bioengineering (13867) Bioinformatics (41873) Biophysics (21420) Cancer Biology (18550) Cell Biology (25443) Clinical Trials (138) Developmental Biology (13360) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24289) Genetics (15587) Genomics (22472) Immunology (17707) Microbiology (40318) Molecular Biology (17142) Neuroscience (88457) Paleontology (666) Pathology (2826) Pharmacology and Toxicology (4815) Physiology (7634) Plant Biology (15111) Scientific Communication and Education (2042) Synthetic Biology (4285) Systems Biology (9813) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00