CAGEcleaner: reducing genomic redundancy in gene cluster mining

preprint OA: gold CC-BY-NC-ND-4.0
📄 Open PDF Full text JSON View at publisher
Full text 20,752 characters · extracted from preprint-html · click to expand
CAGEcleaner: reducing genomic redundancy in gene cluster mining | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results CAGEcleaner: reducing genomic redundancy in gene cluster mining View ORCID Profile Lucas De Vrieze , Miguel Biltjes , Sofya Lukashevich , Kodai Tsurumi , View ORCID Profile Joleen Masschelein doi: https://doi.org/10.1101/2025.02.19.639057 Lucas De Vrieze 1 Department of Biology, KU Leuven , Heverlee, Belgium 2 Laboratory for Biomolecular Discovery & Engineering, VIB-KU Leuven Centre for Microbiology , Heverlee, Belgium Find this author on Google Scholar Find this author on 
PubMed Search for this author on this site ORCID record for Lucas De Vrieze Miguel Biltjes 1 Department of Biology, KU Leuven , Heverlee, Belgium 2 Laboratory for Biomolecular Discovery & Engineering, VIB-KU Leuven Centre for Microbiology , Heverlee, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sofya Lukashevich 1 Department of Biology, KU Leuven , Heverlee, Belgium 2 Laboratory for Biomolecular Discovery & Engineering, VIB-KU Leuven Centre for Microbiology , Heverlee, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kodai Tsurumi 1 Department of Biology, KU Leuven , Heverlee, Belgium 2 Laboratory for Biomolecular Discovery & Engineering, VIB-KU Leuven Centre for Microbiology , Heverlee, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site Joleen Masschelein 1 Department of Biology, KU Leuven , Heverlee, Belgium 2 Laboratory for Biomolecular Discovery & Engineering, VIB-KU Leuven Centre for Microbiology , Heverlee, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joleen Masschelein For correspondence: joleen.masschelein{at}kuleuven.be Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Summary Mining homologous biosynthetic gene clusters (BGCs) typically involves searching colocalised genes against large genomic databases. However, the high degree of genomic redundancy in these databases often propagates into the resulting hit sets, complicating downstream analyses and visualisation. To address this challenge, we present CAGEcleaner, a Python-based tool with auxiliary bash scripts designed to reduce redundancy in gene cluster hit sets by dereplicating the genomes that host these hits. 
CAGEcleaner integrates seamlessly with widely used gene cluster mining tools, such as cblaster and CAGECAT, enabling efficient filtering and streamlining BGC discovery workflows. Availability and implementation Source code and documentation is available at GitHub ( https://github.com/LucoDevro/CAGEcleaner ) and at Zenodo ( https://doi.org/10.5281/zenodo.14726119 ) under an MIT license. CAGEcleaner comes with its own Conda environment but can also be installed from the Python Package Index ( https://pypi.org/project/cagecleaner/ ). Contact lucas.devrieze{at}kuleuven.be or joleen.masschelein{at}kuleuven.be Supplementary information Supplementary data are available at Bioinformatics online. Introduction Biosynthetic pathways are often driven by multiple genes that are physically grouped together in the genome. Across all kingdoms of life, such gene clusters are extensively studied for their ability to direct the biosynthesis of diverse metabolites and proteins in a streamlined fashion. They play a central role in various biological processes, such as secondary metabolism, virulence, toxin production and drug resistance. Comparative analysis of gene clusters provides valuable insights into their evolutionary trajectories, and the functional and biosynthetic diversity of the pathways they encode. Tools such as MultiGeneBlast ( Medema, Takano and Breitling 2013 ), antiSMASH ( Blin et al. 2023 ) and, most recently, cblaster ( Gilchrist et al. 2021 ) and CAGECAT ( van den Belt et al. 2023 ), have facilitated such large-scale comparative analyses. These tools can provide a wide view on gene cluster diversity by querying large public genome databases, such as those hosted by NCBI. However, these large databases contain substantial genomic redundancy due to the deposition of (re)sequenced (nearly) identical genomes, as well as from continuous sequencing efforts in the context of pathogen surveillance. 
This redundancy tends to propagate into the output of gene cluster mining tools, often yielding hundreds of quasi-identical clusters. As a result, a tedious curation process is typically required before meaningful downstream analyses and visualisations can be performed. Here, we present CAGEcleaner, a Python-based pipeline with auxiliary bash scripts that rapidly dereplicates gene cluster sets by assessing the genomic similarity of their host genomes. In addition, it can preserve a certain degree of genomic redundancy when justified by sufficient gene cluster diversity. Designed primarily as a post-processing tool for cblaster, CAGEcleaner serves as an intermediate filtering step, streamlining downstream analyses and visualisations, including those facilitated by cblaster’s sister package clinker ( Gilchrist and Chooi 2021 ). Implementation The CAGEcleaner workflow is outlined in Figure 1 and consists of three parts. As input, CAGEcleaner requires a cblaster session file in json-format, obtained after running a search query. Download figure Open in new tab Figure 1: Schematic overview of the CAGEcleaner pipeline. A cblaster search query of three colocalised genes (red, blue and black rectangles) returned several redundant cluster hits. Some of these are from the same species as the query (green lines), while other hits are from another species (purple lines). Some of the same-species hits are hosted by strains that are only remotely related to the query strain, which, for example, shows in the presence of point mutations (black crosses). Starting from the session file of this search, CAGEcleaner dereplicates the genomes that host these hits, and returns a filtered session file that can readily be used for downstream analyses and visualisations. 
It preserves some degree of genomic redundancy when justified by (1) outlier cblaster homology scores (hits with transparent rectangles), or (2) a different number of query gene homologs than in the query cluster (hits with two or no black rectangles). In the first part, the genome assemblies associated with the cblaster hits are retrieved. The scaffold NCBI Nucleotide IDs are first extracted from the cblaster session file and mapped to NCBI Assembly IDs using the Entrez-Direct utilities, executed via an auxiliary bash script. Scaffold entries that are part of an NCBI Whole Genome Shotgun (WGS) project are first redirected to their respective WGS master records before being mapped to Assembly IDs. The mapped assemblies are then downloaded as gzipped nucleotide FASTA files using the NCBI Datasets CLI ( O’Leary et al. 2024 ) by another auxiliary bash script. To speed up ID mapping, we make the Entrez-Direct utilities process large batches of 5000 scaffold IDs at once. However, the Entrez-Direct utilities do not preserve the mapping between scaffold IDs and assembly IDs when mapping in batches. Since maintaining this mapping is critical for pinpointing scaffolds hosting gene cluster hits retained after genome dereplication, CAGEcleaner reconstructs this mapping locally by matching each scaffold ID extracted from the cblaster session file to a scaffold ID contained within one of the retrieved assembly FASTA files. The second part of the workflow involves the dereplication of the genome assemblies. This process is performed using skDER, a fast genome dereplication tool which clusters genome assemblies based on pairwise average nucleotide identity (ANI), aligned fraction (AF), assembly N50 and connectedness metrics, and selects a representative genome for each genome cluster ( Salamzade and Kalan 2023 ). In an auxiliary bash script, skDER is run in greedy mode on the locally downloaded genome assemblies using a user-defined ANI threshold. 
The resulting output tables are then parsed to identify the skDER representative genome assemblies. The third part of the workflow identifies the gene cluster hits to be retained by mapping the IDs of the representative genome assemblies back to gene cluster scaffold IDs using the earlier reconstructed scaffold-assembly ID mapping. In addition, it recovers gene clusters within redundant, non-representative genome assemblies that exhibit sufficient cluster diversity to justify their retention in the final gene cluster hit set. This ensures the preservation of gene cluster diversity within highly similar genomes, which may have arisen through recent genomic reorganisation or horizontal gene transfer. Such hits are detected using two strategies. The first strategy evaluates the contents of each gene cluster by subdividing each skDER-identified genome cluster into subgroups based on the number of homologs for each gene in the query cluster. A new representative genome is then randomly selected from each subgroup and retained in the hit set, skipping the subgroup including the earlier retained skDER representative. The second strategy assesses the cblaster homology scores of each gene cluster hit. Within each subgroup, hits with outlier scores as identified using z-scoring, are retained, ensuring that functionally distinct gene clusters are not removed. Finally, CAGEcleaner generates seven output text files summarising the dereplicated gene cluster hit sets. Intermediate output, such as downloaded genomes or skDER results, can also be returned upon request. These seven output text files are described in Table 1 . View this table: View inline View popup Download powerpoint Table 1: Description of the seven CAGEcleaner output text files Example cases We evaluated the running time, disk usage and RAM usage of CAGEcleaner for two concrete example cases. In each case, we executed the workflow using 20 CPU cores (Intel Core i7-13700k) and provided 32 GB of RAM. 
In case 1, we queried four genes - the two core biosynthetic genes and their direct neighbours - from the actinorhodin biosynthetic gene cluster from Streptomyces coelicolor A3(2) (MIBiG ( Terlouw et al. 2023 ) entry BGC0000194) against the NCBI RefSeq Protein database using cblaster at default settings. This yielded 8934 gene cluster hits in the binary table. After running CAGEcleaner at its default settings (ANI threshold of 99%), this hit set was reduced to 4847 hits, representing a 1.84-fold reduction. Among the retained hits, 170 were recovered by cluster content and 11 by outlier score. This run required 1 hour 29 minutes, consuming 28.5 GB of disk space and 27.6 GB of memory. In case 2, we aimed to have more redundancy in the hit set. We performed a generic query of three colocalised Staphylococcus genes against NCBI RefSeq Protein using cblaster at default settings and applied an Entrez query filter “Staphylococcus[orgn]”. This yielded 1146 gene cluster hits, which CAGEcleaner reduced to just 22 hits, a 52-fold reduction. The run required 10 minutes, 1.2 GB of disk space and 1.7 GB of memory. The inputs and outputs for these two example cases are provided as Supplementary Material. Discussion Hit redundancy is a persisting challenge in genome mining, often complicating visualisation and downstream analyses. CAGEcleaner is the first tool capable of tackling this issue in an automated manner. By leveraging efficient genome dereplication strategies, it significantly reduces large gene cluster hit sets in a limited amount of time. CAGEcleaner has been deliberately designed to dereplicate on the host genome level, rather than on the gene cluster level. As such, it can discern different host species and/or strains, and preserve the genomic evolutionary signal throughout the dereplication process, opening up new avenues for downstream analysis. 
For example, by contrasting genome-level clustering from skDER with the clustering output from gene cluster-level analysis tools like BiG-SCAPE (Navarro-Muñoz et al. 2020), it may provide insights into horizontal gene transfer events, or uncover gene clusters that evolve at different rates compared to the overall genome. CAGEcleaner is implemented in Python 3 and bash, and can be freely installed from GitHub and the Python Package Index ( https://pypi.org/project/cagecleaner/ ). Source code and documentation is available on the GitHub page ( https://github.com/LucoDevro/CAGEcleaner ). Funding information This work was supported by the European Union (ERC, MiStiC, 101078461). Conflict of interest none declared. Acknowledgement The authors thank Cameron Gilchrist for his helpful advice on seamlessly integrating CAGEcleaner with cblaster. Footnotes https://github.com/LucoDevro/CAGEcleaner References ↵ van den Belt , Matthias , Gilchrist , Cameron , Booth Thomas J. , Chooi , Yit-Heng , Medema Marnix H. , and Alanjary , Mohammad , ‘ CAGECAT: The CompArative GEne Cluster Analysis Toolbox for Rapid Search and Visualisation of Homologous Gene Clusters’ , BMC Bioinformatics , 24/ 1 ( 2023 ), 181 OpenUrl CrossRef PubMed ↵ Blin , Kai , Shaw , Simon , Augustijn Hannah E. , Reitz Zachary L. , Biermann , Friederike , Alanjary , Mohammad , et al. , ‘ AntiSMASH 7.0: New and Improved Predictions for Detection, Regulation, Chemical Structures and Visualisation’ , Nucleic Acids Research , 51/ W1 ( 2023 ), W46 – 50 OpenUrl CrossRef PubMed ↵ Gilchrist , Cameron L M , Booth Thomas J , van Wersch , Bram , van Grieken , Liana , Medema Marnix H , and Chooi , Yit-Heng , ‘ Cblaster: A Remote Search Tool for Rapid Identification and Visualization of Homologous Gene Clusters’ , Bioinformatics Advances , 1/ 1 ( 2021 ) Gilchrist , Cameron L.M. 
, and Chooi, Yit Heng , ‘ Clinker & Clustermap.Js: Automatic Generation of Gene Cluster Comparison Figures’ , Bioinformatics , 37/ 16 ( 2021 ), 2473 – 75 OpenUrl CrossRef PubMed ↵ Medema , Marnix H. , Takano , Eriko , and Breitling , Rainer , ‘ Detecting Sequence Homology at the Gene Cluster Level with MultiGeneBlast’ , Molecular Biology and Evolution , 30/ 5 ( 2013 ), 1218 – 23 OpenUrl CrossRef PubMed Web of Science Navarro-Muñoz Jorge C. , Selem-Mojica , Nelly , Mullowney Michael W. , Kautsar Satria A. , Tryon James H. , Parkinson Elizabeth I. , et al. , ‘ A Computational Framework to Explore Large-Scale Biosynthetic Diversity’ , Nature Chemical Biology , 16/ 1 ( 2020 ), 60 – 68 OpenUrl CrossRef PubMed ↵ O’Leary Nuala A. , Cox , Eric , Holmes, J. Bradley , Anderson, W. Ray , Falk , Robert , Hem , Vichet , et al. , ‘ Exploring and Retrieving Sequence and Metadata for Species across the Tree of Life with NCBI Datasets’ , Scientific Data , 11/ 1 ( 2024 ), 732 OpenUrl CrossRef PubMed ↵ Salamzade , Rauf , and Kalan Lindsay R. , ‘ SkDER: Microbial Genome Dereplication Approaches for Comparative and Metagenomic Applications’ , BioRxiv , 2023 ↵ Terlouw , Barbara R , Blin , Kai , Navarro-Muñoz Jorge C , Avalon Nicole E , Chevrette Marc G , Egbert , Susan , et al. , ‘ MIBiG 3.0: A Community-Driven Effort to Annotate Experimentally Validated Biosynthetic Gene Clusters’ , Nucleic Acids Research , 51/ D1 ( 2023 ), D603 – 10 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted February 20, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following CAGEcleaner: reducing genomic redundancy in gene cluster mining Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share CAGEcleaner: reducing genomic redundancy in gene cluster mining Lucas De Vrieze , Miguel Biltjes , Sofya Lukashevich , Kodai Tsurumi , Joleen Masschelein bioRxiv 2025.02.19.639057; doi: https://doi.org/10.1101/2025.02.19.639057 Share This Article: Copy Citation Tools CAGEcleaner: reducing genomic redundancy in gene cluster mining Lucas De Vrieze , Miguel Biltjes , Sofya Lukashevich , Kodai Tsurumi , Joleen Masschelein bioRxiv 2025.02.19.639057; doi: https://doi.org/10.1101/2025.02.19.639057 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7622) Biochemistry (17648) Bioengineering (13868) Bioinformatics (41876) Biophysics (21422) Cancer Biology (18552) Cell Biology (25458) Clinical Trials (138) Developmental Biology (13364) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15589) Genomics (22475) Immunology (17711) Microbiology (40325) Molecular Biology (17144) Neuroscience (88469) Paleontology (666) Pathology (2826) Pharmacology and Toxicology (4815) Physiology (7635) Plant Biology (15113) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9814) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-21T05:10:58.409756+00:00
License: CC-BY-NC-ND-4.0