BugBuster: A novel automatic and reproducible workflow for metagenomic data analysis

doi:10.1101/2025.02.24.639915

BugBuster: A novel automatic and reproducible workflow for metagenomic data analysis

2025 · doi:10.1101/2025.02.24.639915

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

⚙ AI-generated deep summary by claude@2026-06, 2026-06-24 · read from full text ⓘ

This paper presents BugBuster, a modular metagenomic data analysis workflow implemented in Nextflow (DSL2) designed to be automatic, reproducible, and portable via containerized dependencies. Across major steps—reads processing with Fastp, human/PhiX decontamination with Bowtie2, taxonomic profiling in reads with Kraken2/Sourmash, and read-level prediction of antibiotic resistance genes and variants using KARGA/KARGVA—BugBuster unifies outputs into generated reports. The workflow includes options such as different assembly strategies (per-sample vs co-assembly) and provides multiple customization points across 61 modules grouped into read, assembly, binning, and contig-level analysis steps. A key caveat is that reproducibility is addressed through containers and modularity, but performance and interoperability still depend on the specific tool versions and data types used within the pipeline. The paper does not explicitly discuss endometriosis or adenomyosis; it was included in the corpus via a keyword match in the upstream search index.

Read from the paper's body, not the abstract. Not a substitute for reading the paper. No clinical advice. How this works

Full text 34,108 characters · extracted from preprint-html · click to expand

BugBuster: A novel automatic and reproducible workflow for metagenomic data analysis | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results BugBuster: A novel automatic and reproducible workflow for metagenomic data analysis Francisco Fuentes-Santander , Carolina Curiqueo , Rafael Araos , Juan A. 
Ugalde doi: https://doi.org/10.1101/2025.02.24.639915 Francisco Fuentes-Santander 1 Center for Bioinformatics and Integrative Biology, Facultad de Ciencias de la Vida , República 330, Santiago, Chile Find this author on Google Scholar Find this author on PubMed Search for this author on this site Carolina Curiqueo 1 Center for Bioinformatics and Integrative Biology, Facultad de Ciencias de la Vida , República 330, Santiago, Chile Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rafael Araos 2 Genomics & Resistant Microbes group (GeRM), Instituto de Ciencias e Innovación en Medicina (ICIM), Facultad de Medicina, Clínica Alemana, Universidad del Desarrollo , Av Plaza 680, Santiago, Chile Find this author on Google Scholar Find this author on PubMed Search for this author on this site Juan A. Ugalde 1 Center for Bioinformatics and Integrative Biology, Facultad de Ciencias de la Vida , República 330, Santiago, Chile Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: juan{at}ugalde.bio Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Summary In metagenomic sequencing, large volumes of data are obtained with all the genetic information present in a sample, allowing valuable data to be obtained about microbial communities. The software and processes necessary to obtain quality results have become increasingly complex and sophisticated, limiting the accessibility of biologists who try to use them. To facilitate the analysis of this data, a modular and reproducible workflow was developed using the Nextflow workflow orchestrator named BugBuster. The pipeline is easy to implement because all its dependencies are provided within containers, it is reproducible, modular and portable. 
BugBuster consists of different processes that allow data analysis at the level of reads, contigs and MAGs, also including modules for resistome characterization and taxonomic profiling. Availability and implementation BugBuster was written in Nextflow DSL2 syntax. The program applications, user manual, example data and code are freely available at https://github.com/gene2dis/BugBuster . 1 Introduction Metagenomics has established itself as a fundamental tool in diverse areas of microbiology, including pathogen identification, antimicrobial resistance monitoring, and industrial process optimization ( Bashir et al., 2014 ). Processing a metagenomic sample involves multiple steps, such as removing low-quality sequences and contaminants, taxonomic profiling, genome assembly, gene annotation, and binning. While various tools exist to perform each step, their outputs and standards are often incompatible or not directly comparable, adding complexity to the analysis of metagenomic datasets ( Tamames & Puente-Sánchez, 2018 ). The large volumes of data generated require not only advanced computational tools but also expertise in high-performance infrastructure to run large-scale analyses efficiently. Numerous pipelines and workflows have been developed to process microbial data following specific protocols for different analytical stages ( Navgire et al., 2022 ). Nevertheless, experienced users frequently assemble custom hybrid workflows by selecting the analyses that best fit their requirements, adding complexity and challenges to reproducibility. Despite their flexibility, these hybrid approaches pose significant challenges owing to the lack of standardized interoperability between tools, which requires manual adjustments to integrate inputs and outputs. 
Additionally, the various and evolving dependencies associated with these workflows make them difficult to implement, even for experienced users, and are nearly inaccessible to beginners, particularly in unfamiliar computational environments ( Kesh & Raghupathi, 2004 ). Updating these workflows to incorporate new methodologies often requires rewriting scripts, further complicating maintenance and usability. In addition to these concerns regarding interoperability and usability, reproducibility has become a key concern in computational biology. Publishing data and scripts on platforms such as GitHub or GitLab is becoming standard practice and even a requirement in some cases. However, making these resources available does not guarantee reproducibility because platform-specific dependencies can hinder deployment across different environments ( Baykal et al., 2024 ). In addition, differing versions of libraries and tools on HPC systems can further complicate reproducibility. A potential solution to these challenges is the use of container applications, such as Docker, which allow users to run applications in isolated environments by packaging all the necessary components, including files, libraries, and operating systems ( Docker, 2020 ). When combined with a workflow management system, such as Nextflow, these containers enable the creation of automated, scalable, reproducible, efficient, and portable workflows ( Di Tommaso et al., 2017 ). To address these challenges, we present BugBuster, a fully automated workflow for metagenomic data processing that covers all stages of analysis, from initial quality control to resistome detection and characterization. BugBuster was developed in Nextflow using the DSL2 syntax, providing a modular and flexible structure. Each process was encapsulated in Docker containers to ensure easy installation and high reproducibility. 
Additionally, it includes specialized modules for identifying antibiotic resistance genes that can be directly associated with specific taxa. 2 BugBuster pipeline BugBuster is written in Nextflow using the DSL2 syntax, which allows for a modularized pipeline structure. It consists of 61 modules that facilitate customization during execution. These 61 modules can be grouped into six steps that address metagenome processing: i) Reads processing; ii) Taxonomic profiling in reads; iii) Prediction of antibiotic resistance genes (ARGs) and resistance-causing gene variants (ARGVs) in reads; iv) Assembly; v) Binning; vi) Taxonomy prediction and resistance genes in contigs. Some of the customization options currently available are: Assembly mode: assembly, in which all metagenomes are processed individually, or co-assembly, in which the metagenomes are grouped and a single assembly is performed. Software used for taxonomic profiling: Kraken2 ( Wood et al., 2019 ) and Sourmash ( Titus Brown & Irber, 2016 ). Prediction of resistance genes and gene variants that cause antibiotic resistance in reads. Functional annotation, taxonomic prediction, and resistance genes prediction from contigs. Metagenomic binning. 2.1 Description of BugBuster modules 2.1.1 Read processing Read quality filtering is performed using FastP v. 0.23.2 ( Chen et al., 2018 ) with the following parameters: --unqualified_percent_limit=10, --cut_front, --cut_front_window_size=4, --cut_front_mean_quality=20, --cut_right, --cut_right_window_size=4, --cut_right_mean_quality=20, --detect_adapter_for_pe, --n_base_limit=5 and --trim_poly_g; these are the parameters set by default in BugBuster. Subsequently, the samples containing the minimum number of reads specified by the user are filtered. Additionally, contaminating reads are discarded by mapping against the human reference genome T2T-CHM13v2.0 (Accession number: GCA_000001405.1) and the PhiX genome ( Accession number: NC_001422 ) with Bowtie2 v. 
2.5.3 ( Langmead & Salzberg, 2012 ) using the parameters -N=1, -L=20, -score-min=‘G,15,6’, -R=2, -i ‘S,1,0.75’. Data from the filtered reads during all steps are collected using a Bash script, and a report and plots are generated using an R script. 2.1.2 Resistance gene prediction in reads Prediction of ARGs and ARGVs at the read level is performed with KARGA v. 1.02 ( Prosperi & Marini, 2021 ) and KARGVA v. 1.0 ( Marini et al., 2023 ), both using a k-mer length of 17. The predicted ARG genes are filtered by ≥90% gene coverage, and ARGV genes are filtered by ≥80% gene coverage and at least 2 KmerSNPHits. Predicted genes are normalized by estimating the number of cells with ARGs-OAP v. 3.2.4 ( Yin et al., 2023 ) using the default options. All the generated results are unified in a report using an R script. 2.1.3 Taxonomic profiling in reads Kraken2 workflow: Taxonomic prediction and abundance estimation at the read level is performed using Kraken2 v. 2.1.3 in conjunction with Bracken v. 2.9 ( Lu et al., 2017 ). The results generated by Bracken are unified using Kraken-Biom v. 1.2.0 ( Dabdoub, 2016 ). Subsequently, the generated file in the biom format is transformed into a Phyloseq object ( McMurdie & Holmes, 2013 ). Sourmash workflow: An alternative to Kraken2 is Sourmash, which uses MinHash sketches to represent the taxonomic signatures of large sequence sets, allowing for more efficient storage, reducing RAM and CPU usage, and allowing more extensive reference databases. Taxonomic prediction and abundance estimation are performed using Sourmash v. 4.8.11. Estimation of the k-mer content in the reads is performed using the Sourmash sketch dna function with different parameters depending on the requested taxonomic resolution: Genus: -p k=21,scaled=1000,abund; Species: -p k=31,scaled=1000,abund; Strain: -p k=51,scaled=1000,abund. 
The minimum metagenome coverage estimation is then performed using the Sourmash gather function with its default parameters, changing the k-mer length according to the requested taxonomic resolution (Genus: k21; Species: k31; Strain: k51). Subsequently, the Sourmash tax annotation function is used to obtain the taxonomy assigned to the reads, and the generated file is used to create a Phyloseq object with an R script. Finally, data on the proportion of taxonomically classified reads is collected for both workflows using a Bash script. These results are used to generate relative abundance plots using the microViz package v. 0.12.3 ( Barnett et al., 2021 ) and bar plots with the proportion of classified reads using ggplot2 v. 3.5.0. 2.1.4 Assembly Read assembly is performed using MegaHit v. 1.2.9 ( Li et al., 2015 ), in two different modes: per sample, where each metagenome is processed individually; and co-assembly, where all the samples are grouped, and a single assembly is performed. Contigs smaller than 1,000 bp are filtered using BBmap version 39.06 ( Bushnell, 2014 ), and a report is generated with assembly metrics. 2.1.5 Taxonomic annotation and antibiotic resistance genes identification in contigs Taxonomic annotation of contigs is first performed with a search using Blastn v. 2.15.0 ( Altschul et al., 1990 ) against the NT database. Then, the search result is used to assign taxonomy with BlobTools v.1.1.1 ( Laetsch & Blaxter, 2017 ) using the default parameters. Identification of antibiotic resistance genes at the contig level is performed with DeepARG v. 1.0.4 ( Arango-Argoty et al., 2018 ) using the default parameters. The results generated with both tools are unified in a CSV file and visualized with an R script. 2.1.6 ORF Prediction and Functional Annotation in Contigs ORF prediction in contigs is performed using Prodigal v. 2.6.3 ( Hyatt et al., 2010 ) with the metagenomics option. 
Functional annotation at the contig level is performed using MetaCerberus v. 1.2.1 ( Figueroa et al., 2024 ), using, by default, the KOFam_all, COG, VOG, PHROG, and CAZy HMMs available in its database. 2.1.7 Binning and bin refinement Metagenome binning is performed using Metabat2 v. 2.15 ( Kang et al., 2019 ), Semibin2 v. 2.1.0 ( Pan et al., 2023 ) using the human-intestine trained model by default (modifiable by the user) and Comebin v. 1.0.4 ( Wang et al., 2024 ) with 3 attempts, the first with the default options, followed by reduction of the embedding size in case there is a failure on the process, using the following parameters: -b 896, -e 1792, -c 1792 for the second attempt, and -b 512, -e 1024, -c 1024 for a final attempt. Subsequently, the MAGs generated are refined with MetaWrap v.1.2 ( Uritskiy et al., 2018 ), using default thresholds of a minimum completeness of 50% and a maximum contamination of 10%. 2.1.8 Quality estimation and taxonomic prediction of bins The quality prediction of unrefined and refined bins is performed with CheckM2 v. 1.0.1 ( Chklovski et al., 2024 ) using the default parameters. The refined bins are taxonomically classified using GTDB-TK v. 2.4.0 ( Chaumeil et al., 2022 ) against the GTDB database release 220. Finally, the results are unified in a single CSV file, which is used to generate quality and taxonomy graphs using an R script. In the co-assembly mode, the coverage in each sample is also calculated separately for each refined bin using Bedtools v. 2.31.1 ( Quinlan & Hall, 2010 ) and Samtools v. 1.17 ( Danecek et al., 2021 ), and the coverage data are unified with the taxonomy and quality data in a CSV file. 3 Tests dataset To illustrate the results of the BugBuster pipeline, we used 9 simulated samples of metagenomic data from the human gastrointestinal tract from CAMI ( Fritz et al., 2019 ). 
Using this data, we tested the pipeline with these options: --assembly_mode “assembly” --taxonomic_profiler “sourmash” --read_arg_prediction --contig_tax_and_arg --include_binning. All the data was processed on an 80-CPU Intel Xeon E7-4820 v4 server with 2TB RAM. 4 Results 4.1 Reads preprocessing The processing of simulated gut microbiota reads shows a progressive decrease in the total number of reads throughout the filtering steps. A total of 1.62% of reads were removed based on quality criteria, and no human contamination was detected (Figure S1). 4.2 Taxonomic profiling and abundance estimation The results show a comparison between Sourmash and Kraken using the GTDB release 207 database and the simulated data. On average, Kraken classified approximately 96.5% of the reads, whereas Sourmash classified approximately 62.5% (Figure S2). We kept the proportions of taxa in the two workflows with respect to the CAMI data set at the phylum level, observing differences only in the taxonomic names assigned by the respective databases due to different naming conventions between NCBI and GTDB (Figure S3). In particular, variations in the naming of taxonomic groups within the phylum Firmicutes are mainly due to the number of genomes included in the GTDB database and their detailed taxonomic classification at the strain level (Parks et al., 2022). As a result, the GTDB database assigns additional identifiers, such as letter codes, after the phylum name (e.g., Firmicutes_B) to distinguish the genomes of unique species. 4.3 Resistance gene prediction in reads Processed reads can also be utilized to predict antibiotic resistance genes (ARGs) and their variants (ARGVs). For ARG prediction, we used the Megares v3.0 database ( Bonin et al., 2023 ), identifying 824 genes in total (Table S1). For ARGV prediction, we employed the KARGVA v5 database ( Marini et al., 2023 ), which led to the identification of 358 predicted genes in total (Table S2). 
4.4 Assembly, taxonomic prediction, and resistance gene prediction in contigs Each sample was assembled individually, and contigs larger than 1 KB were kept for further analysis. Results show the taxonomy of the contigs present in all samples using blobplots; we obtained an average N50 of 2.175 KB and taxonomically classified 99.88% of the generated contigs throughout the entire set of samples ( Figure 2 , top and bottom panel). The proportion of phyla in the contigs obtained by BugBuster was similar to that provided by the simulated CAMI data, with Bacillota being the most abundant phylum (Table S3). All filtered contigs were used to search for resistance genes using DeepARG, identifying 1706 predicted genes throughout the entire set of samples (Figure S6). Download figure Open in new tab Figure 1. Workflow of the modules and customizable decisions during execution. The execution performs: 1) Quality filtering of reads; 2) Filtering of samples containing a minimum number of reads specified by the user; 3) Filtering of contaminant reads; 4) Prediction of antibiotic resistance genes and gene variants causing resistance at the read level; 5) Normalization of predicted genes by estimating the number of cells; 6) Taxonomic prediction and abundance estimation at the read level; 7) Reports for tracking reads and taxonomic reports; 8) Read assembly; 9) Taxonomic annotation of contigs; 10) Functional annotation of contigs; 11) ORF prediction in contigs; 12) Prediction of resistance genes at the contig level; 13) Contig report and two-dimensional scatter plots; 14) Contig filtering; 15) Metagenomic binning; 16) Refinement of assembled genomes in metagenomes (MAGs); 17) Prediction of MAG quality; 18) Taxonomic prediction of MAGs; 19) MAG results report. Download figure Open in new tab Figure 2. Summary of the metrics obtained during the taxonomic classification of the contigs across all sample sets. 
A) Blobplot with contig metrics and taxonomy assigned using Blastn, Blobtools and the NT database. B) Relative abundance of the taxonomy at phylum level in the contigs, classified with Blastn, BlobTools, and the NT database. 4.5 Binning, quality estimation, and taxonomic prediction in bins For binning, we obtained 112, 72, and 95 high-quality bins; 61, 66, and 73 medium-quality bins; and 221, 135, and 169 low-quality bins from Comebin, MetaBAT2, and SemiBin2, respectively. This set of bins was used to generate 115 high-quality, 73 medium-quality, and 27 low-quality bins with MetaWRAP (Figure S4). In total, we reconstructed 215 metagenome-assembled genomes (MAGs) of 266 genomes provided in the CAMI dataset. All these 215 MAGs were successfully classified taxonomically at species level (Figure S5). 5 Conclusions BugBuster provides an easy-to-implement and highly reproducible workflow covering pre-processing, taxonomic classification, antibiotic resistance gene prediction, assembly, and MAG refinement. It includes documentation for users ( https://github.com/gene2dis/BugBuster ) and generates visual outputs for a better interpretation of the results. BugBuster offers modular flexibility, allowing users to select specific modules and optimize configurations for their research needs. The pipeline will be continuously updated to integrate the latest analysis methods. Its DSL2-based modularity enables the efficient incorporation of new tools, including future enhancements for mobile genetic element detection, resistance gene clustering, and optimized execution on limited computational resources. Funding This study was supported by the Agencia Nacional de Investigación y Desarrollo (ANID) of Chile through various grants: Fondecyt Regular 1221209 to JAU, Anillo ATE220061 to JAU and RA, Fondef IDeA ID23I10402 to CC, and National Doctorate Scholarship 21241355 to FF. Supplementary data Figure S1 . Tracking of the reads at each filtering step. 
Bowtie PhiX, and Bowtie Human represent the number of reads that passed the filtering for PhiX phage and human reads, respectively. Figure S2 . Proportion of reads taxonomically classified with both read taxonomic classification workflows. A) Reads classified with kraken 2 with confidence of 0.1 and GTDB release 207. B) Reads classified with sourmash at species-level configuration and GTDB release 207. Figure S3 . Relative abundance of the simulated gut microbial communities. A) Relative abundance for the simulated communities, provided in CAMI challenge. B) Relative abundance with BugBuster, using Kraken 2 with confidence of 0.1 and GTDB release 207. C) Relative abundance with BugBuster, using Sourmash at species level configuration and GTDB release 207. Figure S4 . Quality evaluation of the generated MAGs. Raw MAGs refers to the MAGs generated with Comebin, Semibin2, and Metabat2 across all samples. Refined MAGs represent the total number of refined MAGs reconstructed across all samples. Figure S5 . Summary of the phylum-level taxonomy of the MAGs generated for the entire sample set. Figure S6 . Summary of the contigs metrics mixed with ARG predictions across all sample sets. A) Blobplot with contig metrics and ARGs predicted using Deeparg. B) Relative abundance of ARGs found in contigs using Deeparg. Table S1 BugBuster merged results from KARGA and ARGs-OAP. Table S2 BugBuster merged results from KARGVA and ARGs-OAP. Table S3 Comparison between BugBuster predicted taxonomy of contigs vs real taxonomy provided by CAMI dataset References ↵ Altschul , S. F. , Gish , W. , Miller , W. , Myers , E. W. , & Lipman , D. J. ( 1990 ). Basic local alignment search tool . Journal of Molecular Biology , 215 ( 3 ), 403 – 410 . OpenUrl CrossRef PubMed Web of Science ↵ Arango-Argoty , G. , Garner , E. , Pruden , A. , Heath , L. S. , Vikesland , P. , & Zhang , L. ( 2018 ). DeepARG: a deep learning approach for predicting antibiotic resistance genes from metagenomic data . 
Microbiome , 6 ( 1 ), 23 . OpenUrl CrossRef PubMed ↵ Barnett , D. , Arts , I. , & Penders , J. ( 2021 ). microViz: an R package for microbiome data visualization and statistics . Journal of Open Source Software , 6 ( 63 ), 3201 . OpenUrl CrossRef ↵ Bashir , Y. , Pradeep Singh , S. , & Kumar Konwar , B. ( 2014 ). Metagenomics: An application based perspective . Chinese Journal of Biology , 2014 , 1 – 7 . OpenUrl ↵ Baykal , P. I. , Labaj , P. P. , Markowetz , F. , Schriml , L. M. , Stekhoven , D. J. , Mangul , S. , & Beerenwinkel , N. ( 2024 ). Genomic reproducibility in the bioinformatics era . Genome Biology , 25 ( 1 ), 213 . OpenUrl CrossRef PubMed ↵ Bonin , N. , Doster , E. , Worley , H. , Pinnell , L. J. , Bravo , J. E. , Ferm , P. , Marini , S. , Prosperi , M. , Noyes , N. , Morley , P. S. , & Boucher , C. ( 2023 ). MEGARes and AMR++, v3.0: an updated comprehensive database of antimicrobial resistance determinants and an improved software pipeline for classification using high-throughput sequencing . Nucleic Acids Research , 51 ( D1 ), D744 – D752 . OpenUrl CrossRef PubMed ↵ Bushnell , B. ( 2014 ). BBMap: A Fast, Accurate, Splice-Aware Aligner . https://escholarship.org/uc/item/1h3515gn ↵ Chaumeil , P.-A. , Mussig , A. J. , Hugenholtz , P. , & Parks , D. H. ( 2022 ). GTDB-Tk v2: memory friendly classification with the genome taxonomy database . Bioinformatics , 38 ( 23 ), 5315 – 5316 . OpenUrl CrossRef PubMed ↵ Chen , S. , Zhou , Y. , Chen , Y. , & Gu , J. ( 2018 ). fastp: an ultra-fast all-in-one FASTQ preprocessor . Bioinformatics , 34 ( 17 ), i884 – i890 . OpenUrl CrossRef PubMed ↵ Chklovski , A. , Parks , D. H. , Woodcroft , B. J. , & Tyson , G. W. ( 2024 ). Author Correction: CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning . Nature Methods , 21 ( 4 ), 735 . OpenUrl CrossRef PubMed ↵ Dabdoub , S. ( 2016 ). 
kraken-biom: Enabling interoperative format conversion for Kraken results (Version 1.2)[Software] . ↵ Danecek , P. , Bonfield , J. K. , Liddle , J. , Marshall , J. , Ohan , V. , Pollard , M. O. , Whitwham , A. , Keane , T. , McCarthy , S. A. , Davies , R. M. , & Li , H. ( 2021 ). Twelve years of SAMtools and BCFtools . GigaScience , 10 ( 2 ). doi: 10.1093/gigascience/giab008 OpenUrl CrossRef PubMed ↵ Di Tommaso , P. , Chatzou , M. , Floden , E. W. , Barja , P. P. , Palumbo , E. , & Notredame , C. ( 2017 ). Nextflow enables reproducible computational workflows . Nature Biotechnology , 35 ( 4 ), 316 – 319 . OpenUrl CrossRef PubMed ↵ Docker , I. ( 2020 ). https://www.aeris-consulting.com/wp-content/uploads/2022/04/Docker.pdf ↵ Figueroa , J. L. , Iii Dhungel , E. , Bellanger , M. , Brouwer , C. R. , & White , R.A. , Iii. ( 2024 ). MetaCerberus: distributed highly parallelized HMM-based processing for robust functional annotation across the tree of life . Bioinformatics , 40 ( 3 ). doi: 10.1093/bioinformatics/btae119 OpenUrl CrossRef PubMed ↵ Fritz , A. , Hofmann , P. , Majda , S. , Dahms , E. , Dröge , J. , Fiedler , J. , Lesker , T. R. , Belmann , P. , DeMaere , M. Z. , Darling , A. E. , Sczyrba , A. , Bremges , A. , & McHardy , A. C. ( 2019 ). CAMISIM: simulating metagenomes and microbial communities . Microbiome , 7 ( 1 ), 17 . OpenUrl CrossRef PubMed ↵ Hyatt , D. , Chen , G.-L. , Locascio , P. F. , Land , M. L. , Larimer , F. W. , & Hauser , L. J. ( 2010 ). Prodigal: prokaryotic gene recognition and translation initiation site identification . BMC Bioinformatics , 11 , 119 . OpenUrl CrossRef PubMed ↵ Kang , D. D. , Li , F. , Kirton , E. , Thomas , A. , Egan , R. , An , H. , & Wang , Z. ( 2019 ). MetaBAT 2: an adaptive binning algorithm for robust and efficient genome reconstruction from metagenome assemblies . PeerJ , 7 , e7359 . OpenUrl CrossRef PubMed ↵ Kesh , S. , & Raghupathi , W. ( 2004 ). Critical issues in bioinformatics and computing . 
Perspectives in Health Information Management , 1 , 9 . OpenUrl PubMed ↵ Laetsch , D. , & Blaxter , M. ( 2017 ). BlobTools: Interrogation of genome assemblies . F1000Research , 6 , 1287 –. OpenUrl ↵ Langmead , B. , & Salzberg , S. L. ( 2012 ). Fast gapped-read alignment with Bowtie 2 . Nature Methods , 9 ( 4 ), 357 – 359 . OpenUrl CrossRef PubMed ↵ Li , D. , Liu , C.-M. , Luo , R. , Sadakane , K. , & Lam , T.-W. ( 2015 ). MEGAHIT: an ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph . Bioinformatics , 31 ( 10 ), 1674 – 1676 . OpenUrl CrossRef PubMed ↵ Lu , J. , Breitwieser , F. P. , Thielen , P. , & Salzberg , S. L. ( 2017 ). Bracken: estimating species abundance in metagenomics data . PeerJ. Computer Science , 3 ( e104 ), e104 . OpenUrl CrossRef ↵ Marini , S. , Boucher , C. , Noyes , N. , & Prosperi , M. ( 2023 ). The K-mer antibiotic resistance gene variant analyzer (KARGVA) . Frontiers in Microbiology , 14 , 1060891 . OpenUrl CrossRef PubMed ↵ McMurdie , P. J. , & Holmes , S. ( 2013 ). phyloseq: an R package for reproducible interactive analysis and graphics of microbiome census data . PloS One , 8 ( 4 ), e61217 . OpenUrl CrossRef PubMed ↵ Navgire , G. S. , Goel , N. , Sawhney , G. , Sharma , M. , Kaushik , P. , Mohanta , Y. K. , Mohanta , T. K. , & Al-Harrasi , A. ( 2022 ). Analysis and Interpretation of metagenomics data: an approach . Biological Procedures Online , 24 ( 1 ), 18 . OpenUrl CrossRef PubMed ↵ Pan , S. , Zhao , X.-M. , & Coelho , L. P. ( 2023 ). SemiBin2: self-supervised contrastive learning leads to better MAGs for short- and long-read sequencing . Bioinformatics , 39 ( 39 Suppl 1 ), i21 – i29 . OpenUrl CrossRef PubMed ↵ Prosperi , M. , & Marini , S. ( 2021 ). KARGA: Multi-platform Toolkit for k-mer-based Antibiotic Resistance Gene Analysis of High-throughput Sequencing Data . IEEE-EMBS International Conference on Biomedical and Health Informatics. 
IEEE-EMBS International Conference on Biomedical and Health Informatics , 2021 . doi: 10.1109/bhi50953.2021.9508479 OpenUrl CrossRef ↵ Quinlan , A. R. , & Hall , I. M. ( 2010 ). BEDTools: a flexible suite of utilities for comparing genomic features . Bioinformatics , 26 ( 6 ), 841 – 842 . OpenUrl CrossRef PubMed Web of Science ↵ Tamames , J. , & Puente-Sánchez , F. ( 2018 ). SqueezeMeta, A highly portable, fully automatic metagenomic analysis pipeline . Frontiers in Microbiology , 9 , 3349 . OpenUrl PubMed ↵ Titus Brown , C. , & Irber , L. ( 2016 ). sourmash: a library for MinHash sketching of DNA . Journal of Open Source Software , 1 ( 5 ), 27 . OpenUrl CrossRef ↵ Uritskiy , G. V. , DiRuggiero , J. , & Taylor , J. ( 2018 ). MetaWRAP—a flexible pipeline for genome-resolved metagenomic data analysis . Microbiome , 6 ( 1 ), 1 – 13 . OpenUrl CrossRef PubMed ↵ Wang , Z. , You , R. , Han , H. , Liu , W. , Sun , F. , & Zhu , S. ( 2024 ). Effective binning of metagenomic contigs using contrastive multi-view representation learning . Nature Communications , 15 ( 1 ), 585 . OpenUrl CrossRef PubMed ↵ Wood , D. E. , Lu , J. , & Langmead , B. ( 2019 ). Improved metagenomic analysis with Kraken 2 . Genome Biology , 20 ( 1 ), 257 . OpenUrl CrossRef PubMed ↵ Yin , X. , Zheng , X. , Li , L. , Zhang , A.-N. , Jiang , X.-T. , & Zhang , T. ( 2023 ). ARGs-OAP v3.0: Antibiotic-Resistance Gene Database Curation and Analysis Pipeline Optimization . Proceedings of the Estonian Academy of Sciences: Engineering , 27 , 234 – 241 . OpenUrl View the discussion thread. Back to top Previous Next Posted February 28, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following BugBuster: A novel automatic and reproducible workflow for metagenomic data analysis Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share BugBuster: A novel automatic and reproducible workflow for metagenomic data analysis Francisco Fuentes-Santander , Carolina Curiqueo , Rafael Araos , Juan A. Ugalde bioRxiv 2025.02.24.639915; doi: https://doi.org/10.1101/2025.02.24.639915 Share This Article: Copy Citation Tools BugBuster: A novel automatic and reproducible workflow for metagenomic data analysis Francisco Fuentes-Santander , Carolina Curiqueo , Rafael Araos , Juan A. Ugalde bioRxiv 2025.02.24.639915; doi: https://doi.org/10.1101/2025.02.24.639915 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7624) Biochemistry (17651) Bioengineering (13871) Bioinformatics (41884) Biophysics (21424) Cancer Biology (18566) Cell Biology (25463) Clinical Trials (138) Developmental Biology (13365) Ecology (19867) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15590) Genomics (22477) Immunology (17714) Microbiology (40331) Molecular Biology (17148) Neuroscience (88487) Paleontology (666) Pathology (2828) Pharmacology and Toxicology (4817) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citing papers typically take a year or two to appear, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00