Full text
69,322 characters
· extracted from
preprint-html
· click to expand
METAHIT enables comprehensive and flexible genome-resolved microbiome analysis with metagenomic Hi-C | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results METAHIT enables comprehensive and flexible genome-resolved microbiome analysis with metagenomic Hi-C Shiyuan Wang , Zhen Qin , Hang Yu , Ruishan Liu , Yong Ge , Maitreya Dutta , Luan Vu , Yuxuan Du doi: https://doi.org/10.1101/2025.10.12.681839 Shiyuan Wang 1 Department of Electrical Engineering, University of Texas at San Antonio , San Antonio, 78249, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zhen Qin 1 
Department of Electrical Engineering, University of Texas at San Antonio , San Antonio, 78249, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Hang Yu 1 Department of Electrical Engineering, University of Texas at San Antonio , San Antonio, 78249, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ruishan Liu 2 Department of Computer Science, University of Southern California , Los Angeles, 90089, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yong Ge 3 Department of Microbiology, Immunology & Molecular Genetics, University of Texas Health San Antonio , San Antonio, 78229, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Maitreya Dutta 1 Department of Electrical Engineering, University of Texas at San Antonio , San Antonio, 78249, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Luan Vu 4 Department of Molecular Microbiology and Immunology, University of Texas at San Antonio , San Antonio, 78249, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yuxuan Du 1 Department of Electrical Engineering, University of Texas at San Antonio , San Antonio, 78249, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: yuxuan.du{at}utsa.edu Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Metagenomic Hi-C (metaHi-C) augments shotgun sequencing with in-cell proximity information, enabling genome-resolved analysis of complex communities. 
However, computational tools for metaHi-C remain fragmented and rarely offer end-to-end, comprehensive analysis, and existing pipelines use only chimeric Hi-C pairs while ignoring non-chimeric reads, which often constitute a large fraction of Hi-C libraries. Here, we present METAHIT, an accessible and modular workflow that standardizes metaHi-C from raw reads to genome-resolved outputs. The pipeline provides alignment-derived, assumption-light quality checks, integrates three state-of-the-art Hi-C-based binners by consolidating their outputs into a single, non-redundant metagenome-assembled genome (MAG) set, and, for the first time, reuses informative intra-contig read pairs that other Hi-C workflows discard by identifying shotgun-like reads with an EM model on gap distances and using them for per-bin reassembly. METAHIT also supports Hi-C-guided scaffolding, focused visualizations for scaffold structure, MAG annotation, and mobile genetic element (MGE)–host interactions. Across six habitats spanning host-associated and environmental microbiomes, METAHIT increases the recovery of near-complete and high-quality MAGs relative to established Hi-C baselines, while per-bin reassembly lowers contamination and maintains completeness. Applied to a single sheep-gut long-read metaHi-C sample, METAHIT recovered 929 high-quality genomes, representing, to our knowledge, the highest species richness reported from a single sample, and revealed expanded diversity within Erysipelotrichales . In the human gut, METAHIT improved contiguity for an abundant Bacteroides vulgatus MAG via Hi-C–guided scaffolding, identified candidate novel Faecalibacterium lineages, and resolved MGE–host links involving F. prausnitzii and the novel Faecalibacterium MAG. Together, METAHIT delivers standardized, inspection-ready, genome-resolved outputs for comparative, hypothesis-driven microbiome studies across protocols and sequencing modalities. 
1 Introduction Metagenomics profiles microbial communities directly from environmental or host-associated samples without isolating or culturing individual taxa, enabling analyses of community structure and metabolic potential in situ [ 1 – 4 ]. The recent integration of high-throughput chromosome conformation capture (Hi-C) with whole-metagenome shotgun (WMG) sequencing has added a spatial dimension to these studies: by recording physical co-localization of DNA within intact cells, metagenomic Hi-C (metaHi-C) provides long-range linkage that can associate fragments originating from the same cell and inform downstream genome-resolved analysis [ 5 – 10 ]. In a typical metaHi-C workflow, shotgun sequencing extracts and sequences DNA fragments from a single microbial sample, while a parallel Hi-C protocol on the same material crosslinks chromatin, performs proximity ligation to join spatially co-located loci, and yields paired-end reads capturing within-cell contacts. In this context, metaHi-C datasets can be categorized by the sequencing strategy of the shotgun library: short-read metaHi-C (e.g., Illumina) versus long-read metaHi-C (e.g., PacBio or Nanopore). Shotgun reads are assembled de novo into contiguous sequences, termed contigs, and Hi-C read pairs are then aligned to these contigs. The number of Hi-C pairs that bridge any two contigs defines a contig-by-contig contact matrix that reflects their relative spatial proximity within the cell. Because raw contact counts are influenced by non-biological factors, such as contig length and coverage, restriction-site density, GC content, and mappability, normalization is required before quantitative interpretation [ 11 – 15 ]. Once appropriately normalized, metaHi-C contacts supply long-range signal, which can be used to group contigs into metagenome-assembled genomes (MAGs) [ 16 ] via Hi-C–guided binning and to associate mobile genetic elements (e.g., viruses and plasmids) with their microbial hosts. 
These capabilities have enabled the construction of large genome compendia from complex communities and opened avenues to study species diversity, ecological interactions, and MGE–host relationships within single samples. Routine metaHi-C analysis remains technically demanding. A typical workflow spans adapter and chimeric-read handling, read mapping, contact construction, bias mitigation, binning, and downstream interpretation, each step invoking different tools with distinct dependencies and file formats [ 17 – 20 ]. In practice, investigators must reconcile library/version conflicts, convert intermediate outputs with custom scripts, and tune parameters that are reported inconsistently across studies. This fragmentation raises the barrier to entry and complicates reproducibility, especially for laboratories without dedicated computational support. Genome-resolved analysis with metaHi-C seeks to recover metagenome-assembled genomes (MAGs) informed by Hi-C contacts and to evaluate their quality by completeness and contamination. Multiple Hi-C–aware binners exist, but none performs consistently best across habitats and assemblies, making principled refinement of results from multiple methods a practical necessity [ 21 ]. At the same time, most pipelines prioritize inter-contig contacts and underuse informative intra-contig pairs that could be systematically recruited to strengthen the MAG reconstruction process. Beyond genome recovery, support for downstream steps is limited: MAG scaffolding from contact maps, standardized taxonomic and abundance summaries, Hi-C-based MGE-host linking, and integrated visualization are not well served by current tools. Therefore, it is imperative to develop new computational methods to fill these gaps. Here we introduce METAHIT, an accessible, modular, and reproducible workflow for metaHi-C that directly addresses these gaps. 
METAHIT (i) standardizes the path from raw reads to genome-resolved outputs with alignment-derived, assumption-light quality checks and documented defaults; (ii) integrates three state-of-the-art Hi-C–based binners, including bin3C [ 17 ], MetaCC [ 20 ], and ImputeCC [ 22 ], and consolidates their bin sets into a single, non-redundant MAG collection to refine MAG quality and yield across datasets; (iii) extracts shotgun-like intra-contig read pairs by an expectation–maximization (EM) model [ 23 ] on gap distances and supplies them to per-bin reassembly to reduce contamination while maintaining completeness; and (iv) extends analysis beyond binning with Hi-C–guided scaffolding, GTDB-Tk–based taxonomic assignment [ 24 ], and identification of MGE–host interaction. Software versions and analysis settings, including parameters and thresholds, are comprehensively documented for METAHIT. Together, these components make metaHi-C practical across habitats and sequencing modalities, turning raw proximity information into inspection-ready genomes and community summaries suitable for comparative and hypothesis-driven studies. We evaluate METAHIT on six habitats spanning host-associated (human [ 13 ], sheep [ 25 ], and pig [ 26 ] gut; bovine skin [ 27 ]) and environmental (wastewater [ 10 ]; hydrothermal mats [ 28 ]) microbiomes. Across datasets, METAHIT consistently increases the recovery of near-complete and high-quality MAGs relative to established Hi-C–based baselines, and per-bin reassembly lowers contamination while maintaining completeness. Bray–Curtis dissimilarities [ 29 ] of METAHIT-derived community profiles recapitulated habitat-specific structure, while the recovered MAGs reveal biologically informative signals, including candidate novel Faecalibacterium lineages in the human gut and expanded diversity within Erysipelotrichales in sheep gut. 
A focused case study shows that Hi-C–guided scaffolding improves contiguity for an abundant Bacteroides vulgatus MAG, and MGE–host analyses link phage-like elements to F. prausnitzii and to the novel Faecalibacterium MAG. Taken together, these results demonstrate that METAHIT standardizes metaHi-C analysis across protocols and sequencing modalities and delivers inspection-ready, genome-resolved outputs suitable for comparative and hypothesis-driven studies. 2 Results 2.1 Overview of METAHIT METAHIT is a modular, extensible pipeline for comprehensive Hi-C–based metagenomic analysis, providing integrated workflows for raw read processing, Hi-C contact construction and bias normalization, contig binning, metagenome-assembled genome (MAG) reassembly, scaffolding, annotation, and MGE–host interaction analysis ( Fig. 1 ). Implemented in Python with flexible command-line interfaces and built-in visualizations, METAHIT supports both short-read and long-read metaHi-C datasets across sequencing platforms. Unlike prior pipelines that discard non-chimeric pairs from the Hi-C library [ 17 – 20 ], METAHIT (i) integrates outputs from multiple Hi-C–aware binners into a single, non-redundant MAG set, (ii) identifies shotgun-like intra-contig pairs via an EM model and supplies them alongside shotgun reads for per-bin reassembly, (iii) optionally scaffolds MAGs using Hi-C contacts, and (iv) supports downstream MGE–host analysis (viruses and plasmids). Full implementation details are provided in Methods. Download figure Open in new tab Fig. 1: Overview of METAHIT. Inputs are a shotgun library (short- or long-read) and a matched Hi-C library. After preprocessing and quality control, shotgun reads are assembled to contigs, coverage is estimated, and a Hi-C contact matrix is built. Three Hi-C–aware binners (bin3C, MetaCC, ImputeCC) independently generate bin sets, which METAHIT consolidates into a single non-redundant MAG collection.
Intra-contig gap distances are modeled to identify non-chimeric reads from the Hi-C library; these are merged with shotgun reads, recruited per bin, and reassembled. The resulting MAGs feed downstream modules for Hi-C–guided scaffolding, GTDB-Tk–based annotation, and MGE analysis. 2.2 Benchmarking metaHi-C datasets from diverse environments To evaluate METAHIT across heterogeneous communities, we assembled a cross-habitat benchmark comprising six metaHi-C datasets spanning host-associated and environmental microbiomes: human gut [ 13 ], sheep gut [ 25 ], pig gut [ 26 ], bovine skin [ 27 ], wastewater [ 10 ], and hydrothermal mats [ 28 ]. Five datasets rely on short-read shotgun libraries paired with short-read Hi-C libraries (human gut, pig gut, bovine skin, wastewater, and hydrothermal mats), whereas the sheep gut dataset uses a PacBio HiFi long-read shotgun library [ 30 ] combined with a short-read Hi-C library. An overview of all benchmarking datasets is provided in Supplementary Table S1. In the human gut dataset [ 13 ], Illumina sequencing produced 125.4 million paired-end shotgun reads (37.9 Gbp) and 85.9 million paired-end Hi-C reads (25.9 Gbp). The pig gut dataset [ 26 ], sequenced on an Illumina HiSeq 4000, is of comparable scale, with 86.9 million shotgun read pairs (26.1 Gbp) and 99.3 million Hi-C read pairs (29.8 Gbp). The bovine skin dataset [ 27 ] contains 179.5 million shotgun read pairs (53.8 Gbp) and 130.6 million Hi-C read pairs (39.2 Gbp). Among the environmental samples, the wastewater dataset [ 10 ] comprises 269.3 million shotgun read pairs (81.3 Gbp) and 95.3 million Hi-C read pairs (28.8 Gbp). For hydrothermal mats [ 28 ], we analyzed a sample sequenced on an Illumina NovaSeq 6000, yielding 259.3 million shotgun read pairs (77.8 Gbp) and 138.7 million Hi-C read pairs (41.6 Gbp). For the long-read dataset, we used the sheep gut metaHi-C sample [ 25 ], which includes PacBio HiFi shotgun sequencing and Illumina HiSeq 2000 Hi-C sequencing.
The matched Hi-C library of the sheep gut dataset contained 107.7 million read pairs (32.3 Gbp). Restriction enzymes used to construct each Hi-C library are listed in Supplementary Table S1. This collection spans multiple hosts, environments, sequencing technologies, and library sizes, providing a stringent and representative testbed for metaHi-C analysis. Shotgun reads from the five short-read metaHi-C datasets (human gut, bovine skin, pig gut, wastewater, and hydrothermal mats) were assembled into contigs with MEGAHIT (v1.2.9) [ 31 ] while the sheep-gut long-read dataset (PacBio HiFi) was assembled with metaFlye (v2.9) [ 32 ] (see Methods, Subsection 4.2). Assembly statistics for all six datasets are summarized in Supplementary Table S2. 2.3 Comprehensive Hi-C library quality assessment Because downstream analyses depend critically on the strength of Hi-C signal, METAHIT reports two complementary library-level indicators. First, the 3D ratio [ 8 ], reported by the alignment module, quantifies enrichment for inter-contig contacts: the number of read pairs with both mates mapping unambiguously to different contigs divided by the total number of read pairs (see Methods, Subsection 4.2). Second, the informative fraction, reported by the following reassembly module of METAHIT, summarizes the share of proximity-ligation (PL) pairs in a library by combining observed inter-contig pairs with a model-based estimate of the PL proportion among intra-contig pairs (see Methods, Subsection 4.5). We reported the 3D ratio for all six datasets and the informative fraction for the five short-read metaHi-C datasets. Across the short-read datasets, the two indicators were strongly correlated (Pearson’s r = 0.78; Supplementary Table S3), indicating consistent assessments of long-range signal. As expected for the highly contiguous long-read assembly (sheep gut), the 3D ratio was lower because many ligations remain within single contigs rather than spanning contig boundaries. 
2.4 Hi-C–informed ensemble binning improves MAG yield and quality across habitats We evaluated binning performance using CheckM2-estimated completeness and contamination [ 33 ], reporting the number of near-complete MAGs (completeness ≥90%, contamination < 5%) and high-quality MAGs (completeness ≥50%, contamination < 10%). In head-to-head comparisons against bin3C [ 17 ], MetaCC [ 20 ], and ImputeCC [ 22 ] across six environments, the METAHIT binning module (hereafter simply METAHIT) consistently produced higher MAG counts under both thresholds ( Fig. 2 ). Download figure Open in new tab Fig. 2: Benchmarking METAHIT binning across six habitats. Numbers of MAGs meeting quality thresholds (contamination < 5% or < 10%; completeness ≥50%, ≥70%, or ≥90%) for METAHIT, bin3C, MetaCC, and ImputeCC in A human gut, B sheep gut (long-read), C pig gut, D bovine skin, E wastewater, and F hydrothermal mats. Quality was assessed with CheckM2. Across datasets and thresholds, METAHIT outperforms the other three methods. In the human gut dataset, METAHIT recovered 69 near-complete and 131 high-quality MAGs, exceeding the next best method (ImputeCC) by 10 (16.9%) and 21 (19.1%), respectively. It also outperformed ImputeCC by retrieving 12.5% more high-quality bins in the pig gut dataset and 5.7% more near-complete bins in the bovine skin dataset. In environmental samples, METAHIT assembled 238 (wastewater) and 124 (hydrothermal mats) high-quality MAGs, improving on ImputeCC by 29 (13.9%) and 25 (25.3%), respectively. The largest gain was observed in the long-read sheep gut dataset. METAHIT generated 487 near-complete MAGs, surpassing bin3C, MetaCC, and ImputeCC by 237 (94.8%), 231 (90.2%), and 97 (24.9%), respectively. It also produced 929 high-quality MAGs, exceeding the same methods by 526 (130.5%), 397 (74.6%), and 48 (5.4%), respectively. To the best of our knowledge, this is the highest number of high-quality and near-complete MAGs reported from a single sample.
To assess whether higher counts translate into broader biological coverage, we annotated all high-quality MAGs using the METAHIT annotation module through GTDB-Tk [ 24 ] (see Methods, Subsection 4.6). Across every dataset, METAHIT recovered more distinct GTDB-defined taxa [ 24 ] at the species, genus, family, and order levels than the Hi-C–based alternatives ( Fig. 3 and Supplementary Fig. S1), indicating that the additional MAGs expand taxonomic breadth rather than duplicating closely related bins. We further quantified redundancy and uniqueness among binners utilizing all-vs-all Mash distances. We calculated distances between high-quality MAGs produced by the METAHIT binning module and other Hi-C–based tools using Mash (v2.3) [ 34 ] with the ‘-s 10,000’ parameter. Two MAGs were considered to represent the same underlying genome if their Mash distance was at or below 0.01 [ 35 , 36 ]. Across all datasets, the majority of MAGs reported by other methods had a match in METAHIT’s set, while METAHIT also contributed a substantial number of unique MAGs not retrieved elsewhere ( Fig. 4 and Supplementary Fig. S2). Together, these analyses show that the METAHIT binning module increases the yield of high-quality MAGs and broadens taxonomic coverage across habitats. Download figure Open in new tab Fig. 3: METAHIT binning broadens taxonomic breadth across short- and long-read metaHi-C datasets. Counts of distinct GTDB-defined taxa (species, genus, family, order) from high-quality MAGs (completeness ≥50%, contamination < 10%) in pairwise comparisons between METAHIT binning and each Hi-C–based method (bin3C, MetaCC, ImputeCC). Bars are partitioned into taxa shared by both methods (grey) and taxa unique to one method (colors as indicated). Panels: A human gut, B pig gut, C wastewater. In all datasets, most taxa reported by the alternative methods are also recovered by METAHIT binning, which additionally contributes more unique taxa at all ranks. Download figure Open in new tab Fig.
4: METAHIT binning recovers most overlapping bins and contributes additional unique bins. Sankey diagrams show overlap of high-quality MAGs (completeness ≥50%, contamination < 10%) between METAHIT binning and each Hi-C–based method (bin3C, MetaCC, ImputeCC). Side bars represent all MAGs recovered by each method, while middle bars represent MAGs unique to each method. Panels: A human gut, B pig gut, C wastewater. In all datasets, most bins reported by the alternative methods are also recovered by METAHIT binning, which additionally contributes a substantial number of unique bins. 2.5 MAG reassembly reduces contamination while preserving completeness We evaluated the METAHIT reassembly module across five short-read metaHi-C environments; the long-read sheep-gut sample was excluded because the procedure targets short-read metaHi-C datasets (see Methods, Subsection 4.5). In all five datasets, reassembly reduced contamination while yielding small positive gains in completeness ( Fig. 5A ). On average, contamination decreased by 25.9% (human gut), 13.5% (pig gut), 25.8% (bovine skin), 23.0% (wastewater), and 20.8% (hydrothermal mats), and the percentage of bins with lower contamination was 42.7%, 33.3%, 52.5%, 39.5%, and 47.6%, respectively. Completeness increased slightly on average (mean change from 82.65 to 82.74 across datasets), and reductions in contamination were most pronounced for bins with higher pre-reassembly contamination (Supplementary Fig. S3). Consistent with Fig. 5A , contamination decreases were statistically significant ( p ≤ 0.05) in all environments except pig gut, and no dataset showed a significant loss of completeness. Download figure Open in new tab Fig. 5: Effects of reassembly and contribution of residual assemblies across short-read datasets. 
A Box plots compare contamination (left) and completeness (right) of high-quality MAGs before (Original) and after (Reassembled) per-bin reassembly in human gut, pig gut, bovine skin, wastewater, and hydrothermal mats. Asterisks indicate significant differences between Original and Reassembled ( p ≤ 0.05); NS indicates non-significant differences ( p > 0.05). Contamination decreases are significant in all environments except pig gut, while completeness shows no significant decline in any dataset. B Residual assemblies are enriched for mobile genetic elements (MGEs). For each environment, pie charts show the share of viral (blue) and plasmid (red) contigs identified by geNomad in contigs from bin-specific reassemblies (light wedges) versus contigs assembled from reads not recruited to any bin (darker wedges). Across all five environments, residual assemblies contain more MGE contigs than bin-derived assemblies. To avoid losing episomes and other mobile elements that do not co-assemble within chromosomal bins, the METAHIT reassembly module also assembles the residual reads that are not recruited to any bin and merges those contigs with bin-specific reassemblies into a single contig set for downstream analyses (see Methods, Subsection 4.5). Applying geNomad (v1.11.0, default parameters) [ 37 ] to both sources showed that, across all five environments, residual contigs contain markedly more viral and plasmid sequences than the bin-derived contigs ( Fig. 5B ). This enrichment indicates that much of the MGE sequence content resides outside MAG bins, and relying only on binned contigs for MGE–host linking would therefore miss substantial signal. Assembling and screening the residual read set is thus essential for MGE discovery and downstream MGE–host analysis.
Finally, beyond contamination and completeness, assembly contiguity improved at cohort scale: across all five environments, N50 (length at which 50% of the assembly is in scaffolds of at least this size) increased and L50 (number of scaffolds needed to cover 50% of the assembly) decreased after reassembly (Supplementary Table S4). The N50 gains ranged from roughly 6% to 28%, and L50 decreased by approximately 6%–26%, indicating more contiguous assemblies and improved bin quality overall. 2.6 Habitat-level structure and novel lineages across environments Using GTDB-Tk annotations [ 24 ] and coverage for high-quality reassembled MAGs (completeness ≥ 50%, contamination < 10%) produced by the annotation and coverage module, METAHIT profiles diversity, novelty, and community organization across habitats. Species-level richness was highest in sheep gut and wastewater (271 and 162 distinct GTDB species), followed by human and pig gut (130 and 113), hydrothermal mats (43), and bovine skin (23). The fraction of bins without a GTDB species label was lowest in human gut (0.8%) and highest in bovine skin (71.2%), indicating substantial unexplored diversity in non-gut and extreme environments. Community organization showed clear ecological signatures ( Fig. 6A ). The three gut habitats were dominated by Bacillota A and Bacteroidota , consistent with fiber- and mucin-associated fermentation [ 38 , 39 ]; wastewater displayed a mixed profile consistent with heterogeneous inputs and sewer biofilms [ 40 , 41 ]; hydrothermal mats were enriched for Campylobacterota and Desulfobacterota , consistent with sulfur cycling [ 42 , 43 ]; bovine skin featured Bacteroidota and Spirochaetota , in line with prior reports of Treponema -rich communities [ 44 ]. Phylum-level Bray–Curtis dissimilarities [ 29 ] placed human and sheep gut closest (0.196) and pig gut farthest from hydrothermal mats (0.822), indicating that host-associated guts cluster more closely than hydrothermal systems ( Fig. 6B ). 
Download figure Open in new tab Fig. 6: Habitat structure and a scaffolding case study from METAHIT outputs. A Phylum-level community composition (GTDB-Tk annotations) for high-quality reassembled MAGs (completeness ≥50%, contamination < 10%) across six habitats; phyla outside the top eight are grouped as “others.” Gut habitats are dominated by Bacillota A and Bacteroidota , wastewater shows a mixed profile, hydrothermal mats are enriched for Campylobacterota and Desulfobacterota , and bovine skin features Bacteroidota and Spirochaetota . B Bray–Curtis dissimilarities among habitats based on phylum-level profiles: human and sheep gut are most similar, whereas pig gut differs most from hydrothermal mats. C Intra-MAG contact map of Bin47 ( Bacteroides vulgatus ) from the human gut dataset after Hi-C–guided scaffolding; a clear main diagonal with minimal off-diagonal signal indicates structural coherence. Within prominent clades, METAHIT also recovers candidate novel taxa. In the sheep-gut long-read dataset, we retrieved eight Erysipelotrichales MAGs, an order implicated in host physiology and disease [ 45 ]. Four received species-level GTDB assignments, one resolved to genus, and three only to family, indicating under-sampled lineages and likely diversity expansion within this group. Targeted cultivation, strain-resolved sequencing, and phenotypic assays will be needed to confirm these candidates and clarify their roles. 2.7 METAHIT improved scaffolding and revealed novel Faecalibacterium –phage links in the human gut In the human gut dataset, METAHIT improved contiguity for a dominant Bacteroides vulgatus MAG and resolved Faecalibacterium phage–host networks, including links to a candidate novel lineage and MGEs recovered from the residual assembly. We first applied the scaffolding module to Bin47 , the most abundant high-quality MAG. 
The annotation module assigns it to Bacteroides vulgatus , a prevalent gut commensal involved in complex-carbohydrate metabolism [ 39 , 46 ]. Scaffolding improved contiguity, increasing N50 from 96,021 to 216,519 and decreasing L50 from 18 to 9. The intra-MAG contact map ( Fig. 6C ) showed a clear main diagonal with minimal off-diagonal signal, supporting structural coherence. METAHIT recovered seven Faecalibacterium MAGs, a key butyrate-producing genus often depleted in inflammatory bowel disease (IBD) [ 47 , 48 ], including a candidate novel lineage ( Bin32 ) without a GTDB species assignment. Bin32 was genetically distinct from the other Faecalibacterium MAGs (Mash distance 0.10–0.20), and inter-bin Hi-C contacts among these MAGs were sparse, supporting their interpretation as distinct genomes ( Fig. 7A ). Download figure Open in new tab Fig. 7: Phylum distribution of MGE links, Faecalibacterium contact structure, and host–phage networks from the human gut dataset. A Hi-C contact heatmap among seven Faecalibacterium MAGs showing strong intra-bin signal and sparse inter-bin contacts, consistent with distinct genomes. The MAGs are ordered by the number of contigs they contain. B Counts of viral (blue) and plasmid (red) contigs linked by Hi-C to MAGs from the four most abundant phyla; the majority of MGE links involve Bacillota A MAGs. The phyla are ordered by their total number of associated viral and plasmid contigs. C–D Representative host–phage networks for F. prausnitzii (C) and the novel Faecalibacterium MAG (D). Nodes represent viral orders (Virgo annotations; parentheses show summed contact support per order), and edges connect viral contigs to the host MAG; shorter edges denote stronger normalized Hi-C support. The novel Faecalibacterium MAG engages with a broader and more diverse phage repertoire than F. prausnitzii , highlighting intra-genus differences in phage association patterns. 
Applying the MGE module on the human gut dataset, geNomad (v1.11.0) [ 37 ] identified viral and plasmid sequences among assembled contigs; contigs annotated as provirus were excluded from the viral set. After removing 158 proviruses, 2,652 viral and 1,749 plasmid contigs remained. Notably, 74% of viral contigs and 50% of plasmid contigs originated from the residual assembly, indicating that a large fraction of MGE signal lay outside MAG bins and motivating assembly of the residual read set for downstream MGE analysis. Viral contigs were evaluated with CheckV (v1.0.3, database v1.5) [ 49 ], yielding 12 complete and 25 high-quality viral genomes. Contigs labeled “not determined” were removed, leaving 2,239 viral contigs for host attribution. Host links were then called from the bias-corrected, normalized Hi-C matrix (Methods, Subsection 4.7). Most MGE–host contacts involved MAGs in Bacillota A ( Fig. 7B ), consistent with its abundance in the human gut. Within Bacillota A , two Faecalibacterium MAGs showed active phage interactions: 27 candidate phages associated with F. prausnitzii and 86 candidate phages linked to the novel Faecalibacterium MAG. These phages were taxonomically annotated at the order level with Virgo (v1.0.0) [ 50 ]; representative host–phage networks were shown in Fig. 7C–D and probed intra-genus diversity within Faecalibacterium . The F. prausnitzii MAG displayed a sparse set of links, whereas the novel Faecalibacterium MAG connected to a broader and more diverse repertoire, including contigs assigned to Herpesvirales [ 51 ], which had been reported to infect animals, including humans. Additional evidence will be needed to determine whether these signals indicated true infection or arose from extra-cellular, non-infectious ligation. Viral contigs annotated as Methanobavirales showed strong linkage to F. prausnitzii but weak linkage to the novel Faecalibacterium MAG, indicating preferential association with one host over the other. 
This difference in Hi-C interaction patterns suggested ecological distinction within Faecalibacterium : some engaged with a limited phage set suggestive of a relatively stable genome context, while others exhibited extensive connections consistent with greater temperate-phage involvement and potential gene flow. By resolving such fine-scale host–phage structure from Hi-C data, METAHIT nominated concrete phage–host pairs for validation and provided hypotheses about genome plasticity and functional adaptation in the human gut. 3 Discussion METAHIT provides an end-to-end, modular workflow that standardizes metaHi-C analysis from raw reads to genome-resolved outputs. Across six habitats, it increased recovery of near-complete and high-quality MAGs over established Hi-C baselines, while per-bin reassembly reduced contamination with little impact on completeness. Habitat-level patterns derived from length-weighted profiles aligned with ecological expectations, and focused cases highlighted biologically informative genomes and interactions. Two design choices underpin these gains. First, treating Hi-C–informed binning as an ensemble task and consolidating multiple bin sets into a non-redundant collection avoids dependence on any single method and improves yield and breadth. Second, reusing intra-contig signal by extracting shotgun-like pairs via an EM classifier on gap distances and supplying them to reassembly offers a practical route to cleaner drafts. Alignment-derived indicators (3D ratio [ 8 ]) and the model-based informative fraction provide early, assumption-light checks of long-range signal and help separate library effects from assembly continuity. METAHIT also carries analysis beyond binning: Hi-C–guided scaffolding improves contiguity where warranted, standardized taxonomic annotation supports downstream interpretation, and identification of MGE–host associations facilitates inspection and hypothesis generation. 
In practice, reassembly is compute-intensive and benefits from adequate coverage; METAHIT therefore supports selective use (e.g., prioritizing higher-contamination or high-value MAGs) to match available resources. Where resources are constrained, users may skip this step and proceed with the consolidated bins from the METAHIT binning module for downstream analyses, although our results show that reassembly significantly reduces contamination of retrieved MAGs and increases contiguity. By default, MGE–host links are called from all normalized contacts after spurious-contact filtering; users may increase stringency by adjusting the filtering cutoff in the normalization module or by adding auxiliary criteria when warranted. Looking ahead, useful extensions include strain-aware binning and phasing, longitudinal tracking of MAGs and MGE–host edges, and data-driven thresholding for spurious-contact removal. These areas are active and evolving, so we do not prescribe them here; instead, METAHIT is structured to accommodate such advances as they mature. In summary, by combining ensemble binning, EM-guided read reuse, and streamlined downstream interpretation within a reproducible framework, METAHIT turns raw proximity information into inspection-ready, genome-resolved outputs suitable for comparative and hypothesis-driven microbiome studies, such as validating predicted phage–host pairs, prioritizing novel MAGs for cultivation and functional tests, quantifying condition-driven shifts in MAG abundances and MGE networks, and tracking MGE dynamics in longitudinal cohorts. 4 Methods 4.1 Implementation METAHIT is implemented as a comprehensive pipeline combining shell and Python scripts specifically designed for metagenomic Hi-C data analysis. The pipeline integrates multiple analytical modules in a sequential yet modular manner, providing flexibility, efficiency, and reproducibility in processing metaHi-C datasets. 
Each METAHIT module addresses specific analytical tasks by leveraging both newly developed methodologies and existing state-of-the-art tools, including raw data processing, Hi-C contact matrix generation and normalization, contig binning, MAG reassembly, scaffolding and visualization, taxonomy annotation, and detection and analysis of MGE. 4.2 Processing Raw Data METAHIT first provides an integrated workflow for processing raw metaHi-C sequencing reads, comprising four modules: Preprocessing, Assembly, Alignment, and Coverage. A consolidated summary of software versions, parameters, and exact command lines is provided in Supplementary Table S5. Preprocessing module Read cleaning is required before aligning Hi-C pairs because adaptor/linker sequences, low-quality bases, and PCR duplicates confound downstream analyses. METAHIT cleans shotgun and Hi-C libraries with BBduk (BBTools v38.95) [ 52 ], using (i) adaptor removal (default options: ktrim=r, k=23, mink=11, hdist=1), (ii) end-quality trimming (default options: qtrim=r, trimq=10, ftm=5), and (iii) a fixed 5’ hard-trim (default option: 10 bp). A minimum post-trim read length (default option: 50 bp) is enforced at each step. Subsequently, FastQC (v0.11.9) [ 53 ] is employed to assess read quality and evaluate the effectiveness of preprocessing. Assembly module METAHIT assembles short-read shotgun libraries with MEGAHIT [ 31 ] using a broad k-mer sweep (v1.2.9, default options: --k-min 21, --k-max 141, --k-step 12, --merge-level 20,0.95). metaSPAdes (v4.2.0) [ 54 ] is provided as an alternative. For long-read shotgun libraries, assemblies are generated with metaFlye (v2.9) [ 32 ]. 
Alignment module METAHIT aligns preprocessed paired-end Hi-C reads to assembled contigs using BWA-MEM (v0.7.17) [ 55 ] with the ‘-5SP’ option, which disables pairing mode and retains the alignment with the lowest read-coordinate as primary [ 56 ]; post-alignment filtering with SAMtools (v1.17) [ 57 ] removes unmapped reads and excludes secondary, supplementary, and low-quality alignments (default options: nucleotide match length < 30 or mapping score < 30), ensuring accurate downstream analyses. METAHIT also reports a library-level 3D ratio [ 8 ] to quantify enrichment for between-contig contacts. It is defined as the ratio of primary read pairs whose mates map to two different contigs to the total number of primary read pairs. Coverage module METAHIT computes the coverage of contigs using the MetaBAT2 (v2.12.1) [ 58 ] script ‘jgi_summarize_bam_contig_depths’ with default parameters. 4.3 Generating raw and normalized Hi-C contacts Contact module The assembled contigs, Hi-C read alignment results, and restriction enzyme(s) information serve as inputs to the contact module. Short contigs with limited Hi-C signals and few restriction sites typically exhibit higher variance, reducing stability in downstream analyses [ 17 ]. Therefore, METAHIT imposes filtering criteria, excluding contigs shorter than a specified minimum length (default option: 1,000 bp), containing fewer than a minimum number of restriction sites (default option: one), or having insufficient Hi-C contacts (default option: two across-contig Hi-C contacts). The raw Hi-C contact matrix is constructed based on the alignment of paired-end Hi-C reads, with diagonal entries representing intra-contig contacts and off-diagonal entries indicating inter-contig contacts. Since raw metaHi-C contacts contain systematic biases and noise that compromise the reliability of microbial interaction networks, normalization is essential prior to downstream analyses [ 14 ]. 
METAHIT contact module adopts a two-step normalization procedure: bias correction followed by spurious-contact filtering. By default, bias correction uses NormCC (v1.2.0) [ 20 ]; alternative options are available, including the approaches used in bin3C [ 17 ], HiCzin [ 59 ], and MetaTOR [ 18 ] (see Supplementary Note 1). After bias correction, the lowest k percent (default option: 5) of normalized Hi-C contacts are removed to filter out potential spurious interactions [ 14 ], thereby enhancing the quality and reliability of subsequent analyses. 4.4 Hi-C-informed ensemble binning and MAG refinement Binning module The binning module of METAHIT integrates multiple Hi-C-based binning approaches to produce a refined set of high-quality metagenome-assembled genomes (MAGs). Specifically, METAHIT combines the complementary strengths of state-of-the-art Hi-C-informed binning methods, including bin3C (v0.1.1a) [ 17 ], MetaCC (v1.2.0) [ 20 ], and ImputeCC (v1.0.0) [ 22 ], each specifically optimized to leverage genomic interaction signals from Hi-C data and run with default parameters. Since bin3C was originally implemented in Python 2, we ported it to Python 3 for compatibility with METAHIT. The outputs of these individual binning methods are systematically combined using a hybrid refinement strategy inspired by [ 60 ]. First, hybridized bin sets are generated by taking pairwise and collective intersections of the initial binning predictions using Binning refiner (v1.2, default parameters) [ 61 ]. This process carefully resolves contigs assigned inconsistently across different binning tools, resulting in hybrid bins that integrate complementary strengths from multiple predictions. Subsequently, bins from all three original and four hybrid sets are evaluated with CheckM2 (v1.0.1, default parameters) [ 33 ] for completeness and contamination. 
Only bins that meet the thresholds for minimum completeness (default option: 50) and maximum contamination (default option: 10) are retained. Remaining sets are iteratively consolidated by identifying bins that consistently share the same contigs across different binners, and consensus bins are retained only if their total length exceeds a minimum size threshold (default option: 500 kb). Within each duplicate group, the representative bin is selected by maximizing the score $S = \text{completeness} - r \times \text{contamination}$, where $r$ is a penalty weight (default option: 5). Ties are resolved by preferring higher completeness, then lower contamination, and finally larger assembly size. After selecting one representative per group, the consolidated set is formed. Finally, any contig appearing in more than one retained bin is assigned uniquely to the highest-scoring bin under the same criterion, and CheckM2 is rerun on the resulting non-redundant collection to refresh quality metrics. 4.5 Classifying shotgun-like pairs for reassembly via EM on intra-contig distances Reassembly module Recent studies have revealed that Hi-C libraries frequently contain short-read pairs that are in fact whole-genome shotgun (WGS) reads and do not originate from proximity ligation (PL) events due to the imperfect efficiency of the Hi-C enrichment process [ 62 , 63 ]. Although these reads do not capture long-range interactions between contigs and are typically discarded in conventional metaHi-C analyses, they may still carry valuable genomic information. METAHIT identifies such pairs and reuses them to bolster MAG-specific reassembly. Specifically, we first treat all Hi-C read pairs mapped to different contigs as PL-derived products and focus only on distinguishing read pairs that map to the same contig. The mapped positions of read pairs are extracted from the BAM files generated by the alignment module, and we define $d_i$ as the gap distance between the two aligned reads of the $i$-th read pair mapped to the same contig. 
The empirical distribution of $d_i$ is assumed to show two regimes: shorter inserts consistent with shotgun-like pairs and longer inserts compatible with proximity-ligation chimeras. To separate these regimes, METAHIT fits a two-component Gaussian mixture to the gap distances, $$f(d_i) = \pi_C\,\mathcal{N}(d_i \mid \mu_C, \sigma_C^2) + \pi_N\,\mathcal{N}(d_i \mid \mu_N, \sigma_N^2), \qquad \pi_C + \pi_N = 1,$$ where component C models proximity-ligation chimeras and component N models shotgun-like pairs. Parameters are estimated by expectation–maximization (EM) [ 23 ] with robust quantile-based initialization and a log-likelihood convergence criterion (default options: log-likelihood change < 0.01, maximum 100 iterations). The E-step computes the posterior for the chimeric component, $$\gamma_i = \frac{\pi_C\,\mathcal{N}(d_i \mid \mu_C, \sigma_C^2)}{\pi_C\,\mathcal{N}(d_i \mid \mu_C, \sigma_C^2) + \pi_N\,\mathcal{N}(d_i \mid \mu_N, \sigma_N^2)},$$ and the M-step updates $$\pi_C = \frac{1}{n}\sum_{i=1}^{n}\gamma_i, \qquad \mu_C = \frac{\sum_i \gamma_i d_i}{\sum_i \gamma_i}, \qquad \sigma_C^2 = \frac{\sum_i \gamma_i (d_i - \mu_C)^2}{\sum_i \gamma_i},$$ with the analogous updates for component N obtained by replacing $\gamma_i$ with $1 - \gamma_i$. Pairs are labeled non-chimeric (shotgun-like) if their posterior for C falls below a threshold $\tau$ [the defining expression for $\tau$ is missing from this extracted text and should be restored from the typeset manuscript], where $Z_{0.05}$ and $Z_{0.95}$ denote the 5th and 95th percentile z-scores of the standard normal distribution, respectively. Equivalently, METAHIT uses the intersection of the two fitted densities on $d$ as a data-driven cutoff and labels pairs below this cutoff as non-chimeric. All inter-contig pairs are treated as proximity-ligation products and are not reused for reassembly. The fitted mixture weight $\hat{\pi}_C$ is reported as a model-based indicator of the PL share within intra-contig pairs. An overall informative fraction for the Hi-C library is then estimated as $$\widehat{\mathrm{IF}} = \frac{m + \hat{\pi}_C\, n}{m + n},$$ where $m$ is the number of inter-contig pairs and $n$ is the number of intra-contig pairs considered by the model. This estimate is diagnostic rather than ground truth and is interpreted alongside 3D ratio [ 8 ], the other alignment-derived indicator reported in the alignment module of METAHIT. Read identifiers for non-chimeric pairs are exported and merged with shotgun reads. METAHIT then performs per-bin reassembly using both sources, employing the reassembly framework from [ 60 ] to improve MAG quality; implementation details and full parameterization are provided in Supplementary Note 2. 
After per-bin read recruitment and reassembly, residual reads that are not assigned to any bin are pooled and assembled de novo with MEGAHIT [ 31 ] (v1.2.9; default options: --k-min 21, --k-max 141, --k-step 12, --merge-level 20,0.95). This residual assembly preserves episomes and other mobile elements that do not co-assemble within chromosomal bins and would otherwise be lost. Finally, METAHIT merges contigs from (i) the bin-specific reassemblies and (ii) the residual assembly into a single contig set for downstream analyses. Notably, the reassembly module of METAHIT is intended for short-read metaHi-C datasets and is not applied to long-read datasets to avoid mixing assembly paradigms; a comparable hybrid refinement for long-read data is outside the scope of this study. 4.6 MAG scaffolding and annotation Scaffolding module METAHIT scaffolds and visualizes MAGs using Hi-C contacts. Given a MAG’s contigs and the corresponding Hi-C alignments (BAM), METAHIT first runs YaHS (v1.2.2; default parameters) to infer scaffold order and orientation from contact patterns [ 64 ]. After scaffolding, per-MAG Hi-C contact heatmaps are rendered from the scaffolded assembly and the Hi-C contact matrix at a user-selectable resolution (default option: 10 kb). Here, the resolution denotes the genomic bin size used to aggregate contacts: scaffolds are partitioned into fixed-length bins (e.g., 10 kb per bin), and each heatmap cell reports the contacts between bin pairs. These views enable rapid inspection of scaffold structure such as a continuous main diagonal and facilitate detection of potential mis-joins or gaps. Annotation module METAHIT provides an annotation module with GTDB-Tk (v2.4.0) [ 24 ] using the ‘classify_wf’ workflow and GTDB database (release r220), providing standardized, rank-consistent taxonomic assignments that enhance biological interpretability. 
4.7 Detecting and analyzing mobile genetic elements MGE module METAHIT detects and analyzes mobile genetic elements (MGEs), including viruses and plasmids, and their host associations. MGE discovery is performed with geNomad (v1.11.0; default parameters) [ 37 ], which identifies and annotates viral and plasmid sequences among assembled contigs. Contigs (or regions) annotated by geNomad as provirus are excluded from the standalone MGE set to avoid conflating host MAG sequence with MGEs. Host linkage then uses the bias-corrected, normalized Hi-C contact matrices generated by METAHIT’s contact module (spurious-contact filtering applied). By default, a linkage is reported whenever a normalized contact exists between an MGE contig and a non-MGE MAG (host bin). Here, host bins are defined after excluding contigs labeled as MGEs. For transparency, each reported link is accompanied by its normalized contact value. Declarations Declaration of interests The authors declare no competing interests. Consent for publication All authors have approved the manuscript for submission. Data Availability All datasets used for benchmarking METAHIT in this study are publicly accessible through the NCBI Sequence Read Archive (SRA; http://www.ncbi.nlm.nih.gov/sra ). The human gut dataset can be found under the following accession numbers: shotgun library SRR6131123, and Hi-C libraries SRR6131122 and SRR6131124. As a long-read dataset, the sheep gut assembly was used in this study. It is publicly available at https://doi.org/10.5281/zenodo.5228989 under the file name flye.v29.sheep_gut.hifi.250g.fasta.gz. The Hi-C libraries of the sheep gut dataset can be found under the following accession numbers: SRR14350344. The pig gut dataset can be found under the following accession numbers: shotgun library ERR7197595-ERR7197599, and Hi-C libraries ERR7197655. The bovine skin dataset can be found under the following accession numbers: shotgun library SRR13765540, and Hi-C libraries SRR13765539. 
The wastewater dataset can be found under the following accession numbers: shotgun library SRR8239393, and Hi-C libraries SRR8239392. The hydrothermal mats dataset can be found under the following accession numbers: shotgun library SRR21545383, and Hi-C libraries SRR22355230. All other datasets utilized by the tool are described in the article. Code Availability The METAHIT software is available at https://github.com/dyxstat/METAHIT . Ethics approval and consent to participate Not applicable. Acknowledgements Y.D. is partially supported by the University of Texas Systems STARs Program. Funder Information Declared The University of Texas Systems STARs Program References [1]. ↵ Handelsman , J. : Metagenomics: application of genomics to uncultured microorganisms . Microbiology and molecular biology reviews 68 ( 4 ), 669 – 685 ( 2004 ) OpenUrl Abstract / FREE Full Text [2]. Streit , W.R. , Schmitz , R.A. : Metagenomics–the key to the uncultured microbes . Current opinion in microbiology 7 ( 5 ), 492 – 498 ( 2004 ) OpenUrl CrossRef PubMed Web of Science [3]. Hugenholtz , P. , Tyson , G.W. : Metagenomics . Nature 455 ( 7212 ), 481 – 483 ( 2008 ) OpenUrl CrossRef PubMed Web of Science [4]. ↵ Simon , C. , Daniel , R. : Metagenomic analyses: past and future trends . Applied and environmental microbiology 77 ( 4 ), 1153 – 1161 ( 2011 ) OpenUrl Abstract / FREE Full Text [5]. ↵ Yaffe , E. , Relman , D.A. : Tracking microbial evolution in the human gut using Hi-C reveals extensive horizontal gene transfer, persistence and adaptation . Nature microbiology 5 ( 2 ), 343 – 353 ( 2020 ) OpenUrl PubMed [6]. Kent , A.G. , Vill , A.C. , Shi , Q. , Satlin , M.J. , Brito , I.L. : Widespread transfer of mobile antibiotic resistance genes within individual gut microbiomes revealed through bacterial Hi-C . Nature communications 11 , 4379 ( 2020 ) OpenUrl PubMed [7]. Chen , Y. , Wang , Y. , Paez-Espino , D. , Polz , M.F. , Zhang , T. 
: Prokaryotic viruses impact functional microorganisms in nutrient removal and carbon cycle in wastewater treatment plants . Nature communications 12 , 5398 ( 2021 ) OpenUrl PubMed [8]. ↵ Marbouty , M. , Thierry , A. , Millot , G.A. , Koszul , R. : MetaHiC phage-bacteria infection network reveals active cycling phages of the healthy human gut . elife 10 , 60608 ( 2021 ) OpenUrl [9]. Du , Y. , Fuhrman , J.A. , Sun , F. : ViralCC retrieves complete viral genomes and virus-host pairs from metagenomic Hi-C data . Nature Communications 14 , 502 ( 2023 ) OpenUrl PubMed [10]. ↵ Stalder , T. , Press , M.O. , Sullivan , S. , Liachko , I. , Top , E.M. : Linking the resistome and plasmidome to the microbiome . The ISME journal 13 , 2437 – 2446 ( 2019 ) OpenUrl PubMed [11]. ↵ Burton , J.N. , Liachko , I. , Dunham , M.J. , Shendure , J. : Species-level deconvolution of metagenome assemblies with Hi-C–based contact probability maps . G3: Genes, Genomes, Genetics 4 ( 7 ), 1339 – 1346 ( 2014 ) OpenUrl PubMed [12]. Beitel , C.W. , Froenicke , L. , Lang , J.M. , Korf , I.F. , Michelmore , R.W. , Eisen , J.A. , Darling , A.E. : Strain- and plasmid-level deconvolution of a synthetic metagenome by sequencing proximity ligation products . PeerJ 2 , 415 ( 2014 ) OpenUrl CrossRef [13]. ↵ Press , M.O. , Wiser , A.H. , Kronenberg , Z.N. , Langford , K.W. , Shakya , M. , Lo , C.-C. , Mueller , K.A. , Sullivan , S.T. , Chain , P.S. , Liachko , I. : Hi-C deconvolution of a human gut microbiome yields high-quality draft genomes and reveals plasmid-genome interactions . biorxiv , 198713 ( 2017 ) [14]. ↵ Du , Y. , Laperriere , S.M. , Fuhrman , J. , Sun , F. : Normalizing metagenomic Hi-C data and detecting spurious contacts using zero-inflated negative binomial regression . Journal of Computational Biology 29 ( 2 ), 106 – 120 ( 2022 ) OpenUrl CrossRef PubMed [15]. ↵ McCallum , G.E. , Rossiter , A.E. , Quraishi , M.N. , Iqbal , T.H. , Kuehne , S.A. , Schaik , W. 
: Noise reduction strategies in metagenomic chromosome confirmation capture to link antibiotic resistance genes to microbial hosts . Microbial genomics 9 ( 6 ), 001030 ( 2023 ) OpenUrl [16]. ↵ Hugerth , L.W. , Larsson , J. , Alneberg , J. , Lindh , M.V. , Legrand , C. , Pinhassi , J. , Andersson , A.F. : Metagenome-assembled genomes uncover a global brackish microbiome . Genome biology 16 , 279 ( 2015 ) OpenUrl CrossRef PubMed [17]. ↵ DeMaere , M.Z. , Darling , A.E. : bin3C: exploiting Hi-C sequencing data to accurately resolve metagenome-assembled genomes . Genome biology 20 , 46 ( 2019 ) OpenUrl CrossRef PubMed [18]. ↵ Baudry , L. , Foutel-Rodier , T. , Thierry , A. , Koszul , R. , Marbouty , M. : MetaTOR: a computational pipeline to recover high-quality metagenomic bins from mammalian gut proximity-ligation (meta3C) libraries . Frontiers in genetics 10 , 753 ( 2019 ) OpenUrl PubMed [19]. Du , Y. , Sun , F. : HiCBin: binning metagenomic contigs and recovering metagenome-assembled genomes using Hi-C contact maps . Genome biology 23 , 63 ( 2022 ) OpenUrl CrossRef PubMed [20]. ↵ Du , Y. , Sun , F. : MetaCC allows scalable and integrative analyses of both long-read and short-read metagenomic Hi-C data . Nature Communications 14 , 6231 ( 2023 ) OpenUrl PubMed [21]. ↵ Jia , L. , Wu , Y. , Dong , Y. , Chen , J. , Chen , W.-H. , Zhao , X.-M. : A survey on computational strategies for genome-resolved gut metagenomics . Briefings in Bioinformatics 24 ( 3 ), 162 ( 2023 ) OpenUrl [22]. ↵ Du , Y. , Zuo , W. , Sun , F. : ImputeCC enhances integrative Hi-C-based metagenomic binning through constrained random-walk-based imputation . In: International Conference on Research in Computational Molecular Biology , pp. 99 – 114 ( 2024 ). Springer [23]. ↵ Dempster , A.P. , Laird , N.M. , Rubin , D.B. : Maximum likelihood from incomplete data via the EM algorithm . 
Journal of the royal statistical society: series B (methodological) 39 , 1 – 22 ( 1977 ) OpenUrl CrossRef Web of Science [24]. ↵ Chaumeil , P.-A. , Mussig , A.J. , Hugenholtz , P. , Parks , D.H. : GTDB-Tk v2: memory friendly classification with the genome taxonomy database . Bioinformatics 38 ( 23 ), 5315 – 5316 ( 2022 ) OpenUrl CrossRef PubMed [25]. ↵ Bickhart , D.M. , Kolmogorov , M. , Tseng , E. , Portik , D.M. , Korobeynikov , A. , Tolstoganov , I. , Uritskiy , G. , Liachko , I. , Sullivan , S.T. , Shin , S.B. , et al : Generating lineage-resolved, complete metagenome-assembled genomes from complex microbial communities . Nature biotechnology 40 ( 5 ), 711 – 719 ( 2022 ) OpenUrl CrossRef PubMed [26]. ↵ Kalmar , L. , Gupta , S. , Kean , I.R. , Ba , X. , Hadjirin , N. , Lay , E.M. , Vries , S.P. , Bateman , M. , Bartlet , H. , Hernandez-Garcia , J. , et al : HAM-ART: an optimised culture-free Hi-C metagenomics pipeline for tracking antimicrobial resistance genes in complex microbial communities . PLoS Genetics 18 ( 3 ), 1009776 ( 2022 ) OpenUrl [27]. ↵ Beyi , A.F. , Hassall , A. , Phillips , G.J. , Plummer , P.J. : Tracking reservoirs of antimicrobial resistance genes in a complex microbial community using metagenomic Hi-C: The case of bovine digital dermatitis . Antibiotics 10 ( 2 ), 221 ( 2021 ) OpenUrl PubMed [28]. ↵ Hwang , Y. , Roux , S. , Coclet , C. , Krause , S.J. , Girguis , P.R. : Viruses interact with hosts that span distantly related microbial domains in dense hydrothermal mats . Nature Microbiology 8 ( 5 ), 946 – 957 ( 2023 ) OpenUrl PubMed [29]. ↵ Bray , J.R. , Curtis , J.T. : An ordination of the upland forest communities of southern Wisconsin . Ecological monographs 27 ( 4 ), 326 – 349 ( 1957 ) OpenUrl CrossRef Web of Science [30]. ↵ Rhoads , A. , Au , K.F. : PacBio sequencing and its applications . Genomics, proteomics & bioinformatics 13 ( 5 ), 278 – 289 ( 2015 ) OpenUrl CrossRef PubMed [31]. ↵ Li , D. , Liu , C.-M. , Luo , R. , Sadakane , K. 
, Lam , T.-W. : MEGAHIT: an ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph . Bioinformatics 31 ( 10 ), 1674 – 1676 ( 2015 ) OpenUrl CrossRef PubMed [32]. ↵ Kolmogorov , M. , Bickhart , D.M. , Behsaz , B. , Gurevich , A. , Rayko , M. , Shin , S.B. , Kuhn , K. , Yuan , J. , Polevikov , E. , Smith , T.P. , et al : metaFlye: scalable long-read metagenome assembly using repeat graphs . Nature methods 17 ( 11 ), 1103 – 1110 ( 2020 ) OpenUrl PubMed [33]. ↵ Chklovski , A. , Parks , D.H. , Woodcroft , B.J. , Tyson , G.W. : CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning . Nature methods 20 ( 8 ), 1203 – 1212 ( 2023 ) OpenUrl PubMed [34]. ↵ Ondov , B.D. , Treangen , T.J. , Melsted , P. , Mallonee , A.B. , Bergman , N.H. , Koren , S. , Phillippy , A.M. : Mash: fast genome and metagenome distance estimation using MinHash . Genome biology 17 , 132 ( 2016 ) OpenUrl CrossRef PubMed [35]. ↵ Nissen , J.N. , Johansen , J. , Allesøe , R.L. , Sønderby , C.K. , Armenteros , J.J.A. , Grønbech , C.H. , Jensen , L.J. , Nielsen , H.B. , Petersen , T.N. , Winther , O. , et al : Improved metagenome binning and assembly using deep variational autoencoders . Nature biotechnology 39 ( 5 ), 555 – 560 ( 2021 ) OpenUrl CrossRef PubMed [36]. ↵ Pan , S. , Zhu , C. , Zhao , X.-M. , Coelho , L.P. : A deep siamese neural network improves metagenome-assembled genomes in microbiome datasets across different environments . Nature communications 13 , 2326 ( 2022 ) OpenUrl PubMed [37]. ↵ Camargo , A.P. , Roux , S. , Schulz , F. , Babinski , M. , Xu , Y. , Hu , B. , Chain , P.S. , Nayfach , S. , Kyrpides , N.C. : Identification of mobile genetic elements with geNomad . Nature biotechnology 42 ( 8 ), 1303 – 1312 ( 2024 ) OpenUrl CrossRef PubMed [38]. ↵ Martens , E.C. , Koropatkin , N.M. , Smith , T.J. , Gordon , J.I. 
: Complex glycan catabolism by the human gut microbiota: the Bacteroidetes Sus-like paradigm . Journal of Biological Chemistry 284 ( 37 ), 24673 – 24677 ( 2009 ) OpenUrl Abstract / FREE Full Text [39]. ↵ Flint , H.J. , Scott , K.P. , Duncan , S.H. , Louis , P. , Forano , E. : Microbial degradation of complex carbohydrates in the gut . Gut microbes 3 ( 4 ), 289 – 306 ( 2012 ) OpenUrl CrossRef PubMed [40]. ↵ Shanks , O.C. , Newton , R.J. , Kelty , C.A. , Huse , S.M. , Sogin , M.L. , McLellan , S.L. : Comparison of the microbial community structures of untreated wastewaters from different geographic locales . Applied and environmental microbiology 79 ( 9 ), 2906 – 2913 ( 2013 ) OpenUrl Abstract / FREE Full Text [41]. ↵ Newton , R.J. , McLellan , S.L. , Dila , D.K. , Vineis , J.H. , Morrison , H.G. , Eren , A.M. , Sogin , M.L. : Sewage reflects the microbiomes of human populations . MBio 6 ( 2 ), 10 – 1128 ( 2015 ) OpenUrl CrossRef [42]. ↵ Zhou , Z. , St. John , E. , Anantharaman , K. , Reysenbach , A.-L. : Global patterns of diversity and metabolism of microbial communities in deep-sea hydrothermal vent deposits . Microbiome 10 , 241 ( 2022 ) OpenUrl CrossRef PubMed [43]. ↵ Flieder , M. , Buongiorno , J. , Herbold , C.W. , Hausmann , B. , Rattei , T. , Lloyd , K.G. , Loy , A. , Wasmund , K. : Novel taxa of Acidobacteriota implicated in seafloor sulfur cycling . The ISME journal 15 ( 11 ), 3159 – 3180 ( 2021 ) OpenUrl PubMed [44]. ↵ Wilson-Welder , J.H. , Alt , D.P. , Nally , J.E. : Digital dermatitis in cattle: current bacterial and immunological findings . Animals 5 ( 4 ), 1114 – 1135 ( 2015 ) OpenUrl PubMed [45]. ↵ Kaakoush , N.O. : Insights into the role of Erysipelotrichaceae in the human host . Frontiers in cellular and infection microbiology 5 , 84 ( 2015 ) OpenUrl [46]. ↵ Wexler , H.M. : Bacteroides: the good, the bad, and the nitty-gritty . Clinical microbiology reviews 20 ( 4 ), 593 – 621 ( 2007 ) OpenUrl Abstract / FREE Full Text [47]. ↵ De Filippis , F. 
, Pasolli , E. , Ercolini , D. : Newly explored Faecalibacterium diversity is connected to age, lifestyle, geography, and disease . Current Biology 30 ( 24 ), 4932 – 4943 ( 2020 ) OpenUrl PubMed [48]. ↵ Fabbrini , M. , Candela , M. , Turroni , S. , Brigidi , P. , Rampelli , S. : Exploring clade differentiation of the Faecalibacterium prausnitzii complex . Iscience 25 ( 12 ) ( 2022 ) [49]. ↵ Nayfach , S. , Camargo , A.P. , Schulz , F. , Eloe-Fadrosh , E. , Roux , S. , Kyrpides , N.C. : Checkv assesses the quality and completeness of metagenome-assembled viral genomes . Nature biotechnology 39 ( 5 ), 578 – 585 ( 2021 ) OpenUrl CrossRef PubMed [50]. ↵ Riccardi , C. , Wang , Y. , Yooseph , S. , Sun , F. : Bidirectional subsethood of shared marker profiles enables accurate virus classification . Microbiome 13 , 170 ( 2025 ) OpenUrl PubMed [51]. ↵ Mettenleiter , T.C. , Klupp , B.G. , Granzow , H. : Herpesvirus assembly: an update . Virus research 143 ( 2 ), 222 – 234 ( 2009 ) OpenUrl CrossRef PubMed [52]. ↵ Bushnell , B. : BBMap: A Fast, Accurate, Splice-Aware Aligner . Technical Report LBNL-7065E, Lawrence Berkeley National Laboratory , Berkeley, CA ( 2014 ). LBNL Report. https://escholarship.org/uc/item/1h3515gn [53]. ↵ Andrews , S. : FastQC: A Quality Control Tool for High Throughput Sequence Data . http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ . Babraham Bioinformatics, Babraham Institute ( 2010 ) [54]. ↵ Nurk , S. , Meleshko , D. , Korobeynikov , A. , Pevzner , P.A. : metaSPAdes: a new versatile metagenomic assembler . Genome research 27 ( 5 ), 824 – 834 ( 2017 ) OpenUrl Abstract / FREE Full Text [55]. ↵ Li , H. : Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM . arXiv preprint arXiv: 1303.3997 ( 2013 ) [56]. ↵ Wang , Y. , Zuo , W. , Huang , J. , Sun , F. , Du , Y. : Benchmarking Alignment Strategies for Hi-C Reads in Metagenomic Hi-C Data . bioRxiv ( 2025 ) doi: 10.1101/2025.07.30.667754 OpenUrl Abstract / FREE Full Text [57]. 
↵ Li , H. , Handsaker , B. , Wysoker , A. , Fennell , T. , Ruan , J. , Homer , N. , Marth , G. , Abecasis , G. , Durbin , R. , 1000 Genome Project Data Processing Subgroup : The Sequence Alignment/Map format and SAMtools . Bioinformatics 25 ( 16 ), 2078 – 2079 ( 2009 ) OpenUrl CrossRef PubMed Web of Science [58]. ↵ Kang , D.D. , Li , F. , Kirton , E. , Thomas , A. , Egan , R. , An , H. , Wang , Z. : MetaBAT 2: an adaptive binning algorithm for robust and efficient genome reconstruction from metagenome assemblies . PeerJ 7 , 7359 ( 2019 ) OpenUrl CrossRef [59]. ↵ Du , Y. , Laperriere , S.M. , Fuhrman , J. , Sun , F. : HiCzin: Normalizing metagenomic Hi-C data and detecting spurious contacts using zero-inflated negative binomial regression . bioRxiv , 2021 – 03 ( 2021 ) [60]. ↵ Uritskiy , G.V. , DiRuggiero , J. , Taylor , J. : MetaWRAP—a flexible pipeline for genome-resolved metagenomic data analysis . Microbiome 6 , 158 ( 2018 ) OpenUrl CrossRef PubMed [61]. ↵ Song , W.-Z. , Thomas , T. : Binning refiner: improving genome bins through the combination of different binning programs . Bioinformatics 33 ( 12 ), 1873 – 1875 ( 2017 ) OpenUrl CrossRef PubMed [62]. ↵ Liu , M. , Darling , A. : Metagenomic Chromosome Conformation Capture (3C): techniques, applications, and challenges . F1000Research 4 , 1377 ( 2015 ) OpenUrl [63]. ↵ DeMaere , M.Z. , Darling , A.E. : Sim3C: simulation of Hi-C and Meta3C proximity ligation sequencing technologies . GigaScience 7 ( 2 ), 103 ( 2018 ) OpenUrl [64]. ↵ Zhou , C. , McCarthy , S.A. , Durbin , R. : YaHS: yet another Hi-C scaffolding tool . Bioinformatics 39 ( 1 ), 808 ( 2023 ) OpenUrl View the discussion thread. Back to top Previous Next Posted October 14, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article.
Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following METAHIT enables comprehensive and flexible genome-resolved microbiome analysis with metagenomic Hi-C Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share METAHIT enables comprehensive and flexible genome-resolved microbiome analysis with metagenomic Hi-C Shiyuan Wang , Zhen Qin , Hang Yu , Ruishan Liu , Yong Ge , Maitreya Dutta , Luan Vu , Yuxuan Du bioRxiv 2025.10.12.681839; doi: https://doi.org/10.1101/2025.10.12.681839 Share This Article: Copy Citation Tools METAHIT enables comprehensive and flexible genome-resolved microbiome analysis with metagenomic Hi-C Shiyuan Wang , Zhen Qin , Hang Yu , Ruishan Liu , Yong Ge , Maitreya Dutta , Luan Vu , Yuxuan Du bioRxiv 2025.10.12.681839; doi: https://doi.org/10.1101/2025.10.12.681839 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7633) Biochemistry (17680) Bioengineering (13889) Bioinformatics (41928) Biophysics (21445) Cancer Biology (18585) Cell Biology (25491) Clinical Trials (138) Developmental Biology (13373) Ecology (19897) Epidemiology (2067) Evolutionary Biology (24308) Genetics (15606) Genomics (22496) Immunology (17736) Microbiology (40385) Molecular Biology (17175) Neuroscience (88583) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4822) Physiology (7641) Plant Biology (15149) Scientific Communication and Education (2045) Synthetic Biology (4293) Systems Biology (9822) Zoology 
(2271)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.