Full text
36,634 characters
· extracted from
preprint-html
· click to expand
BandHiC: a memory-efficient and user-friendly Python package for organizing and analyzing Hi-C matrices down to sub-kilobase resolution | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results BandHiC: a memory-efficient and user-friendly Python package for organizing and analyzing Hi-C matrices down to sub-kilobase resolution View ORCID Profile Weibing Wang , Junping Li , Yusen Ye , View ORCID Profile Lin Gao doi: https://doi.org/10.1101/2025.10.16.682752 Weibing Wang 1 Department of Computer Science, School of Computer Science and Technology, Xidian University , Xi’an, Shaanxi, China Find this author on Google Scholar Find this author on PubMed Search for this 
author on this site ORCID record for Weibing Wang Junping Li 1 Department of Computer Science, School of Computer Science and Technology, Xidian University , Xi’an, Shaanxi, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yusen Ye 1 Department of Computer Science, School of Computer Science and Technology, Xidian University , Xi’an, Shaanxi, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: lgao{at}mail.xidian.edu.cn ysye{at}xidian.edu.cn Lin Gao 1 Department of Computer Science, School of Computer Science and Technology, Xidian University , Xi’an, Shaanxi, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lin Gao For correspondence: lgao{at}mail.xidian.edu.cn ysye{at}xidian.edu.cn Abstract Full Text Info/History Metrics Preview PDF Abstract Recent advances in high-resolution Hi-C and Micro-C technologies have enabled finer-scale characterization of 3D genome architecture, but they also introduce substantial computational challenges, as the size of dense contact matrices scales quadratically with resolution, resulting in prohibitive memory demands. To address this, we developed BandHiC, a memory-efficient and user-friendly Python package for organizing and analyzing Hi-C matrices down to sub-kilobase resolution. BandHiC adopts a banded storage strategy that preserves only a configurable diagonal bandwidth of the dense contact matrix, reducing memory usage by up to 99% while maintaining fast random access and intuitive indexing operations. In addition, it provides flexible masking mechanisms to handle missing values, outliers, and unmappable regions, and supports efficient vectorized operations optimized with NumPy, thereby enabling scalable analysis of ultra-high-resolution Hi-C datasets. 
Introduction High-throughput chromosome conformation capture (Hi-C) [ 1 – 4 ] and its variants, such as Micro-C [ 5 – 7 ], have substantially advanced our understanding of genome architecture by enabling genome-wide mapping of chromatin interactions at progressively higher resolutions. Hi-C data typically measures the interaction frequency between evenly spaced chromatin segments, represented mathematically as a contact matrix, where each bin corresponds to a genomic interval and the bin size defines the resolution of the contact map. Over the past decade, improvements in sequencing throughput and experimental protocols have enabled a dramatic increase in Hi-C data resolution, from early megabase (Mb)-scale maps to kilobase (kb)- and even sub-kilobase (e.g., 500 bp, 250 bp) contact matrices [ 4 , 6 ]. More recently, single-cell Micro-C has achieved contact maps at resolutions as high as 5 kb [ 8 ]. These advances in resolution have facilitated the discovery of finer-scale chromatin structures and their regulatory roles in gene expression [ 3 , 6 , 7 , 9 , 10 ]. However, higher-resolution data generated by these assays pose significant computational challenges, particularly in terms of memory consumption [ 11 , 12 ]. For instance, loading a dense Hi-C matrix into Random Access Memory (RAM) at 1 kb resolution for the human genome (∼3 billion base pairs) would require approximately $(3 \times 10^{9} / 10^{3})^{2} \times 8$ bytes $= 7.2 \times 10^{13}$ bytes $= 72$ terabytes of memory (≈ 66 tebibytes), assuming double-precision floating-point representation (8 bytes per entry). Such memory demands far exceed the capacities of most computational environments. This issue becomes even more pronounced at sub-kilobase or single-cell resolutions, rendering dense matrix representations impractical for many real-world applications. These limitations highlight the need for more memory-efficient data structures tailored for high-resolution Hi-C data. 
Many current methods for identifying genome structural patterns, such as chromatin loops and topologically associating domains (TADs), rely heavily on dense matrix representations to facilitate rapid, random data access during computation [ 13 ]. Tools, such as TopDom [ 14 ], MSTD [ 15 ], DeTOKI [ 16 ] and SnapHiC [ 17 ], exhibit high memory consumption associated with dense matrices, which limits their scalability to higher-resolution Hi-C datasets. Although conventional sparse matrix formats can reduce memory usage, they lack efficient random-access capability, significantly slowing downstream analyses and complicating algorithm design, particularly for tasks requiring frequent element-wise access or submatrix extraction. Other methods, including Mustache [ 18 ] and Chromosight [ 19 ], attempt to circumvent this limitation by partitioning the full dense matrix into smaller blocks that are loaded into memory on demand. However, these methods introduce added implementation complexity and require careful memory management to maintain computational efficiency. Notably, these pattern-identification methods typically leverage high-resolution Hi-C data predominantly within short-range genomic distances (typically within 10 Mb), as chromatin loops and TADs are generally constrained to such scales [ 3 ]. Long-range interactions are increasingly sparse at high resolutions and often irrelevant to local structures such as loops and TADs. Thus, focusing on short-range contacts is both computationally and biologically justified. Therefore, there is an urgent need for a novel memory scheme tailored specifically for short-range higher-resolution Hi-C data, one that dramatically reduces memory usage while retaining the fast random-access capabilities of dense matrices. 
Built upon NumPy [ 20 , 21 ], a fundamental library for numerical computing in Python, we developed BandHiC, a memory-efficient Python package specifically designed for organizing and analyzing short-range contacts of Hi-C data down to sub-kilobase resolutions. BandHiC adopts a banded storage scheme that stores only a configurable diagonal bandwidth of the full Hi-C contact matrix. Importantly, it preserves efficient random-access capabilities by employing a direct index mapping between the banded and dense matrix representations, while supporting familiar NumPy-style indexing semantics (slicing, Boolean array indexing, integer array indexing) to facilitate user-friendly and efficient data access. BandHiC also integrates masking functionality akin to NumPy’s MaskedArray module, enabling straightforward handling of gaps, outliers, and other aberrant values in Hi-C matrices. Finally, BandHiC supports diverse numerical operations optimized through NumPy’s efficient vectorized computations, thus offering both memory efficiency and high computational performance essential for practical high-resolution Hi-C data analysis. Design and implementation Data representation To address the increasing memory demands posed by high-resolution Hi-C data, we introduce band_hic_matrix, the core class implemented in the BandHiC package. Given a Hi-C contact matrix $A \in \mathbb{R}^{n \times n}$ at resolution $r$, band_hic_matrix retains only the diagonals within a user-defined bandwidth $k$, yielding a compact representation $D \in \mathbb{R}^{n \times k}$ (Fig 1). This format ensures that each column in $D$ corresponds to a fixed diagonal of $A$, such that the mapping $A[i, j] = D[i, j - i]$ holds for $|i - j| < k$. Download figure Open in new tab Fig 1. Data model of BandHiC. Schematic illustration of converting a dense symmetric matrix A into a banded representation consisting of a data matrix D, an element-wise mask matrix M, a row/column mask matrix X, and a default value d for out-of-band entries. 
Diagonal elements from A are reorganized into columns of D; M marks missing or outlier entries; X indicates masked rows or columns. The memory efficiency achieved by this strategy is substantial. When $k \ll n$, the memory footprint of band_hic_matrix is reduced from $\mathcal{O}(n^{2})$ to $\mathcal{O}(nk)$. For example, assuming a resolution of 1 kb and a bandwidth of 2 Mb ($k = 2000$), the representation of chromosome 1 of the human genome (∼249 Mb) requires 3.7 GiB of memory, less than 1% of the memory required by the dense matrix (∼461.9 GiB). This compression makes high-resolution Hi-C data accessible even on commodity hardware, without compromising the efficiency of random data access. To further enhance the flexibility of usage, band_hic_matrix supports an optional two-layer masking mechanism. An element-wise mask matrix $M \in \{0,1\}^{n \times k}$ allows users to selectively ignore missing or outlier contacts, enabling robust statistical estimation on unmasked subsets. Additionally, a bin-level mask $X \in \{0,1\}^{n}$ supports the exclusion of entire rows or columns, particularly useful for removing repetitive genomic regions lacking valid Hi-C signals. These masking features facilitate downstream tasks such as estimation of average contact intensity at specific genomic distances, while preserving statistical validity. Lastly, a scalar default value d is defined to fill in the undefined entries of A not covered by the banded matrix D. This default is typically set to 0, consistent with the assumption that long-range interactions are negligibly sparse. The advantage of using default values is that, in addition to treating the band_hic_matrix as a full dense matrix for indexing and conversion with a dense matrix, it also allows out-of-band entries to participate in mathematical operations. For example, when adding 1 to the band_hic_matrix object, not only are the in-band entries incremented, but the default value representing the out-of-band entries also increases by 1. 
Together, the components D, M, X, and d allow for seamless reconstruction of the dense matrix A when required. Overall, band_hic_matrix provides an efficient, flexible data representation for scalable Hi-C data analysis. BandHiC package BandHiC is distributed as an open-source Python package under the MIT license. It is compatible with Python version 3.11 or higher and can be deployed on Linux and macOS platforms. The BandHiC package relies solely on four dependencies: NumPy, SciPy, cooler, and hic-straw. NumPy and SciPy serve as the fundamental backbone of BandHiC, supporting the construction of the banded matrix data structure and implementing its core computational operations. In addition, BandHiC wraps the file-reading functions of cooler and hic-straw, allowing it to directly read .hic, .cool, or .mcool files as inputs for creating a band_hic_matrix object (Fig 2A). Download figure Open in new tab Fig 2. Overview of the BandHiC package. (A) Example of a band_hic_matrix object in the BandHiC package. (B) Indexing methods supported by BandHiC. (C) Example of computation methods supported by BandHiC. BandHiC primarily defines a matrix class, band_hic_matrix, which represents Hi-C contact data in a banded matrix representation. Each instance contains a numerical NumPy array of shape (bin_num, diag_num), together with Boolean arrays mask and mask_row_col that record element-wise and row/column-wise exclusions. Elements outside the stored bandwidth are represented by a scalar default_value, allowing the matrix to behave as a dense symmetric array while avoiding redundant storage. Objects can be constructed from .hic or .mcool files, or from triplet-form contact records (rows, columns, and contact frequencies) (Fig 2A). The package fully supports NumPy-style indexing (Fig 2B), including item-wise, slice, Boolean array, and integer array indexing. 
It also provides a series of methods and functions for constructing, manipulating, and performing computations on the band_hic_matrix objects (Fig 2C). The design and implementation of the indexing and computational operations supported by BandHiC are described in detail in the following two subsections. Indexing operations A key feature of band_hic_matrix is its direct coordinate mapping between the banded matrix B and the full dense matrix A (Fig 1). For any pair of genomic loci $(i, j)$ satisfying the band constraint $|i - j| < k$, the interaction frequency $A[i, j]$ can be accessed in constant time via $D[i, j - i]$. This mapping ensures random access in $\mathcal{O}(1)$ time, which is critical for performance-sensitive Hi-C analyses, particularly when memory constraints preclude the use of fully dense matrices. $\mathcal{O}(1)$ random access means that any element can be retrieved in constant time, independent of dataset size, because its memory location is computed directly from the index. This property is particularly important for matrix-like data structures such as Hi-C contact matrices, where efficient access to arbitrary entries is essential for large-scale analysis. Data access in band_hic_matrix is fully consistent with that of a dense matrix, as each entry is accessed via $B[i, j] = D[i, j - i] = A[i, j]$. As a result, users can interact with a band_hic_matrix object as if it were a standard dense array, without needing to consider the underlying storage details. In computer science, indexing is the process of locating and accessing elements through their position or key, providing direct and efficient data retrieval compared to sequential search. Leveraging this random-access capability, band_hic_matrix supports full NumPy-style indexing semantics, including slicing, Boolean array, and integer array indexing (Fig 2B). Slicing selects contiguous ranges of data, such as A[2:5]. 
Boolean array indexing extracts elements that meet specific conditions, for example A[A > 0]. Integer array indexing allows arbitrary selection using arrays of integer indices, such as A[[0, 2, 4]]. This design allows users to easily query local chromatin contacts and provides a flexible and efficient framework for data manipulation in scientific computing. For instance, a slice operation such as B[i:j, i:j] or B[i:j] retrieves a banded submatrix. Combined with the todense operation, this enables reconstruction of the dense submatrix for downstream analysis or visualization. For a band_hic_matrix object with a mask matrix, indexing operations return either a ma.MaskedArray object or the masked constant. In NumPy, a MaskedArray stores numerical data together with a Boolean mask that marks missing or invalid entries. This ensures that such values are automatically excluded from computations while maintaining the array structure, making it well-suited for handling incomplete or noisy datasets. In practice, indexing a band_hic_matrix behaves as if operating directly on a MaskedArray, thereby providing users with considerable flexibility. Since indexing may refer to elements outside the predefined diagonal bandwidth, such out-of-band entries do not raise errors but are instead filled with a default value (e.g., zero or other user-specified constants). This design further enhances the robustness and flexibility of BandHiC in practical applications. Numerical computation In addition to flexible data access, band_hic_matrix also supports a wide range of numerical operations, including element-wise mathematical operations and reduction operations (Fig 2C). BandHiC is built on top of NumPy, which provides the foundation for efficient numerical computation in Python. NumPy offers high-performance, C-optimized universal functions (ufuncs) that perform element-wise operations with support for broadcasting and type casting. 
By leveraging these ufuncs, BandHiC implements 71 element-wise mathematical operations directly on the band_hic_matrix ( Table 1 ). This design eliminates the need for explicit Python loops, ensures full compatibility with the NumPy ecosystem, and significantly improves computational speed. Consequently, BandHiC inherits the scalability and efficiency of NumPy, enabling fast and memory-efficient analysis of large genomic contact matrices. Moreover, NumPy provides interfaces for defining custom array-like objects while maintaining seamless integration with NumPy, enabling BandHiC to implement specialized matrix types efficiently and flexibly. By building on NumPy in this way, BandHiC inherits both its computational efficiency and its flexible programming model, making it well-suited for scalable analysis of large-scale Hi-C data. View this table: View inline View popup Download powerpoint Table 1. Universal functions that BandHiC supports. Reduction operations refer to functions that aggregate multiple values into a single result. They can be applied globally to all elements of a matrix, or along specific axes to summarize rows or columns. Examples include sum , min , max , and mean . BandHiC supports ten such reduction operations ( Table 2 ), which work along conventional axes (rows or columns) in the same way as NumPy. In addition, BandHiC extends these operations to the diagonal axis—a feature not available in NumPy. This diagonal reduction is particularly useful for Hi-C data, as it allows interaction frequencies to be summarized by genomic distance, thereby supporting distance-dependent normalization and analyses such as distance-decay profiling. All operations remain fully compatible with masked band_hic_matrix objects, ensuring robust handling of missing or low-quality data in large-scale Hi-C analysis. View this table: View inline View popup Download powerpoint Table 2. Reduction functions that BandHiC supports. 
The implementation of reduction operations in BandHiC is designed to behave equivalently to those on a dense matrix, but without explicitly constructing the dense matrix, which would otherwise consume substantial memory. During computation, BandHiC automatically fills out-of-band entries with the default value, symmetrizes interactions in the lower-triangular part of the matrix, and excludes entries masked by either element-wise or row/column masks. Taken together, band_hic_matrix combines the memory efficiency of a banded storage model with the expressiveness of NumPy’s interface. By mimicking both NumPy’s ndarray and MaskedArray behaviors, it provides an intuitive and powerful interface for users, substantially lowering the barrier to adoption and enabling seamless integration into existing Hi-C data analysis pipelines. Please refer to BandHiC’s website for a detailed list of all supported functions and tutorials. Results Usage Examples BandHiC can serve as an alternative to the NumPy package when managing and manipulating Hi-C matrices, aiming to address the issue of excessive memory usage caused by storing dense matrices with NumPy’s ndarray. At the same time, BandHiC supports masking operations like NumPy’s ma.MaskedArray module, with enhancements tailored for Hi-C data. Users can leverage their experience with NumPy when using the BandHiC package, so it is recommended that users have some basic knowledge of NumPy. Here are some code examples providing a quick guide and demonstration of the core functionalities of BandHiC: Download figure Open in new tab Download figure Open in new tab The defined band_hic_matrix object mat corresponds to the example shown in Fig 2. The example is presented in the IPython-style interactive format, in which “In [n]:” indicates input commands and “Out[n]:” indicates the corresponding outputs. 
Application: Reducing TopDom memory consumption While the previous section demonstrates the core functionality and syntax of BandHiC, here we evaluate its practical utility by integrating it with the TAD-calling algorithm TopDom [ 14 ]. TopDom is an efficient and robust algorithm for detecting topologically associating domains (TADs). It uses a sliding window approach on Hi-C contact matrices to identify local minima in contact intensity, which mark potential TAD boundaries. Owing to its robustness and reproducibility, TopDom was rated as the best-performing TAD detection method in a benchmark study [ 9 ]; however, its reliance on dense matrices makes it difficult to apply to sub-kilobase resolution Hi-C data. The original TopDom algorithm was developed in the R language [ 14 ]. In BandHiC, we provide a Python implementation of TopDom as a built-in function, ensuring functional consistency with the original method while extending support for both NumPy’s ndarray and BandHiC’s band_hic_matrix class. This integration not only facilitates direct identification of TADs from high-resolution Hi-C data within the BandHiC but also enables a fair comparison of memory usage and runtime performance between dense and banded representations. To evaluate the effectiveness of BandHiC, we benchmarked the TopDom algorithm on chromosome 1 of mouse embryonic stem cell (mESC) Micro-C data across multiple resolutions using both dense matrix and band_hic_matrix . As shown in Fig 3A , BandHiC substantially reduces memory usage at all tested resolutions. At 1000 bp and 500 bp resolution, the dense matrices require over 72 GiB of memory, exceeding the available system memory (60 GiB RAM and 12 GiB swap space), and causing the program to terminate due to memory allocation failure. In contrast, the band_hic_matrix version of TopDom completes successfully, with memory usage of only 5,989 MiB and 23,902 MiB at 1000 bp and 500 bp resolution, respectively. 
The banded matrix introduces a modest increase in runtime compared to the dense matrix ( Fig 3B ). This overhead arises not from masking operations (which were disabled for this evaluation) but from index computation performed during element access within the band_hic_matrix object. These results indicate that band_hic_matrix enables TAD-calling algorithms like TopDom to process high-resolution Hi-C data on a standard personal computer, making large-scale analysis feasible without incurring significant computational cost. Owing to BandHiC’s NumPy-like API, other Hi-C-based pattern identification methods can be reimplemented with minimal modifications to the original code, thereby significantly improving their scalability and adaptability to higher-resolution Hi-C datasets. Download figure Open in new tab Fig 3. Evaluation of BandHiC. (A) Memory usage comparison between the banded and dense matrices when running TopDom on mouse embryonic stem cell (mESC) Micro-C data (chromosome 1) at various resolutions. (B) Runtime comparison for the same task. At 1,000 bp and 500 bp resolution, the dense representations failed due to memory overflow. Availability and Future Directions The source code of the BandHiC Python package is publicly available on GitHub ( https://github.com/xdwwb/BandHiC-Master ), and comprehensive documentation is provided at https://xdwwb.github.io/BandHiC-Master/ . Installation can be performed conveniently through Python’s pip package manager: $ pip install bandhic BandHiC alleviates the computational challenges of high-resolution Hi-C analysis through a banded storage scheme that reduces memory usage to ∼1% of dense matrices while preserving constant-time random access. This enables domain-calling algorithms, such as TopDom, to run at sub-kilobase resolution on standard hardware. 
Seamless integration with the NumPy ecosystem, including support for universal functions, reductions, and Hi-C–specific diagonal operations, facilitates efficient distance-dependent analyses and easy adoption in existing pipelines. Furthermore, BandHiC’s flexible programming model—featuring masking, scalar defaults, and robust handling of noisy or unmappable regions—provides an extensible framework for downstream applications ranging from loop and TAD detection to single-cell Hi-C analysis. Nevertheless, BandHiC’s current design focuses on short-range, cis interactions, which may limit its applicability for studying long-range or inter-chromosomal contacts that can also carry biological relevance. Future work could address this limitation by introducing hybrid storage strategies that combine banded and sparse formats. In fact, BandHiC can already be applied to single-cell Hi-C analysis by constructing a separate band_hic_matrix object for each cell. A more direct and scalable solution would be to extend the underlying numpy.ndarray used for banded storage from two to three dimensions, thereby introducing a “cell axis”. This modification would further enhance the utility of BandHiC for single-cell Hi-C data, providing users with a more convenient and powerful tool for large-scale single-cell 3D genome studies. Additionally, integrating BandHiC with visualization tools or downstream analysis pipelines could further broaden its applicability in exploring multi-scale chromatin organization. Taken together, BandHiC represents both a practical solution to the pressing computational challenges posed by ultra-high-resolution Hi-C data and a flexible foundation for future methodological advances. By combining memory efficiency, computational scalability, and a user-friendly interface, BandHiC has the potential to become a core component of the bioinformatics toolkit for 3D genomics. 
Funding This work was supported by the National Natural Science Foundation of China [Grant Nos. 62502361 to W.W.; 62550005 and 62132015 to L.G.; 62573335 to Y.Y.]. Conflict of Interest none declared . Data availability Micro-C data for the mouse embryonic stem cell (mESC) line were obtained from the NCBI GEO database (accession number: GSE130275). Acknowledgements We thank all the members of Prof. Gao’s lab for helpful comments. Reference 1. ↵ Lieberman-Aiden E , van Berkum NL , Williams L , Imakaev M , Ragoczy T , Telling A , et al. Comprehensive mapping of long-range interactions reveals folding principles of the human genome . Science . 2009 ; 326 : 289 – 293 . doi: 10.1126/science.1181369 OpenUrl Abstract / FREE Full Text 2. ↵ Dixon JR , Selvaraj S , Yue F , Kim A , Li Y , Shen Y , et al. Topological domains in mammalian genomes identified by analysis of chromatin interactions . Nature . 2012 ; 485 : 376 – 380 . doi: 10.1038/nature11082 OpenUrl CrossRef PubMed Web of Science 3. ↵ Rao SSP , Huntley MH , Durand NC , Stamenova EK , Bochkov ID , Robinson JT , et al. A 3D Map of the Human Genome at Kilobase Resolution Reveals Principles of Chromatin Looping . Cell . 2014 ; 159 : 1665 – 1680 . doi: 10.1016/j.cell.2014.11.021 OpenUrl CrossRef PubMed Web of Science 4. ↵ Akgol Oksuz B , Yang L , Abraham S , Venev SV , Krietenstein N , Parsi KM , et al. Systematic evaluation of chromosome conformation capture assays . Nat Methods . 2021 ; 18 : 1046 – 1055 . doi: 10.1038/s41592-021-01248-7 OpenUrl CrossRef PubMed 5. ↵ Hsieh T-HS , Weiner A , Lajoie B , Dekker J , Friedman N , Rando OJ . Mapping Nucleosome Resolution Chromosome Folding in Yeast by Micro-C . Cell . 2015 ; 162 : 108 – 119 . doi: 10.1016/j.cell.2015.05.048 OpenUrl CrossRef PubMed 6. ↵ Krietenstein N , Abraham S , Venev SV , Abdennur N , Gibcus J , Hsieh T-HS , et al. Ultrastructural Details of Mammalian Chromosome Architecture . Molecular Cell . 2020 ; 78 : 554 - 565.e7 . 
doi: 10.1016/j.molcel.2020.03.003 OpenUrl CrossRef PubMed 7. ↵ Sun L , Zhou J , Xu X , Liu Y , Ma N , Liu Y , et al. Mapping nucleosome-resolution chromatin organization and enhancer-promoter loops in plants using Micro-C-XL . Nat Commun . 2024 ; 15 : 35 . doi: 10.1038/s41467-023-44347-z OpenUrl CrossRef PubMed 8. ↵ Wu H , Zhang J , Tan L , Xie XS . Single-cell Micro-C profiles 3D genome structures at high resolution and characterizes multi-enhancer hubs . Nat Genet . 2025 ; 57 : 1777 – 1786 . doi: 10.1038/s41588-025-02247-6 OpenUrl CrossRef PubMed 9. ↵ Zufferey M , Tavernari D , Oricchio E , Ciriello G. Comparison of computational methods for the identification of topologically associating domains . Genome Biol . 2018 ; 19 : 217 . doi: 10.1186/s13059-018-1596-9 OpenUrl CrossRef PubMed 10. ↵ Goel VY , Huseyin MK , Hansen AS . Region Capture Micro-C reveals coalescence of enhancers and promoters into nested microcompartments . Nat Genet . 2023 ; 55 : 1048 – 1056 . doi: 10.1038/s41588-023-01391-1 OpenUrl CrossRef PubMed 11. ↵ Dekker J , Belmont AS , Guttman M , Leshyk VO , Lis JT , Lomvardas S , et al. The 4D nucleome project . Nature . 2017 ; 549 : 219 – 226 . doi: 10.1038/nature23884 OpenUrl CrossRef PubMed 12. ↵ Raffo A , Paulsen J. The shape of chromatin: insights from computational recognition of geometric patterns in Hi-C data . Briefings in Bioinformatics . 2023 ; 24 : bbad302 . doi: 10.1093/bib/bbad302 OpenUrl CrossRef PubMed 13. ↵ Xu J , Xu X , Huang D , Luo Y , Lin L , Bai X , et al. A comprehensive benchmarking with interpretation and operational guidance for the hierarchy of topologically associating domains . Nat Commun . 2024 ; 15 : 4376 . doi: 10.1038/s41467-024-48593-7 OpenUrl CrossRef 14. ↵ Shin H , Shi Y , Dai C , Tjong H , Gong K , Alber F , et al. TopDom: an efficient and deterministic method for identifying topological domains in genomes . Nucleic Acids Research . 2016 ; 44 : e70 . doi: 10.1093/nar/gkv1505 OpenUrl CrossRef PubMed 15. 
↵ Ye Y, Gao L, Zhang S. MSTD: an efficient method for detecting multi-scale topological domains from symmetric and asymmetric 3D genomic maps. Nucleic Acids Research. 2019; 47: e65–e65. doi: 10.1093/nar/gkz201 OpenUrl CrossRef PubMed 16. ↵ Li X, Zeng G, Li A, Zhang Z. DeTOKI identifies and characterizes the dynamics of chromatin TAD-like domains in a single cell. Genome Biol. 2021; 22: 217. doi: 10.1186/s13059-021-02435-7 OpenUrl CrossRef PubMed 17. ↵ Yu M, Abnousi A, Zhang Y, Li G, Lee L, Chen Z, et al. SnapHiC: a computational pipeline to identify chromatin loops from single-cell Hi-C data. Nat Methods. 2021; 18: 1056–1059. doi: 10.1038/s41592-021-01231-2 OpenUrl CrossRef PubMed 18. ↵ Roayaei Ardakany A, Gezer HT, Lonardi S, Ay F. Mustache: multi-scale detection of chromatin loops from Hi-C and Micro-C maps using scale-space representation. Genome Biology. 2020; 21: 256. doi: 10.1186/s13059-020-02167-0 OpenUrl CrossRef PubMed 19. ↵ Matthey-Doret C, Baudry L, Breuer A, Montagne R, Guiglielmoni N, Scolari V, et al. Computer vision for pattern detection in chromosome contact maps. Nat Commun. 2020; 11: 5795. doi: 10.1038/s41467-020-19562-7 OpenUrl CrossRef PubMed 20. ↵ Van Der Walt S, Colbert SC, Varoquaux G. The NumPy Array: A Structure for Efficient Numerical Computation. Comput Sci Eng. 2011; 13: 22–30. doi: 10.1109/MCSE.2011.37 OpenUrl CrossRef 21. ↵ Harris CR, Millman KJ, van der Walt SJ, Gommers R, Virtanen P, Cournapeau D, et al. Array programming with NumPy. Nature. 2020; 585: 357–362. doi: 10.1038/s41586-020-2649-2 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 16, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article.
Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following BandHiC: a memory-efficient and user-friendly Python package for organizing and analyzing Hi-C matrices down to sub-kilobase resolution Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share BandHiC: a memory-efficient and user-friendly Python package for organizing and analyzing Hi-C matrices down to sub-kilobase resolution Weibing Wang , Junping Li , Yusen Ye , Lin Gao bioRxiv 2025.10.16.682752; doi: https://doi.org/10.1101/2025.10.16.682752 Share This Article: Copy Citation Tools BandHiC: a memory-efficient and user-friendly Python package for organizing and analyzing Hi-C matrices down to sub-kilobase resolution Weibing Wang , Junping Li , Yusen Ye , Lin Gao bioRxiv 2025.10.16.682752; doi: https://doi.org/10.1101/2025.10.16.682752 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17697) Bioengineering (13894) Bioinformatics (41951) Biophysics (21455) Cancer Biology (18593) Cell Biology (25509) Clinical Trials (138) Developmental Biology (13380) Ecology (19903) Epidemiology (2067) Evolutionary Biology (24322) Genetics (15611) Genomics (22509) Immunology (17737) Microbiology (40398) Molecular Biology (17183) Neuroscience (88619) Paleontology (667) Pathology (2833) Pharmacology and Toxicology (4825) Physiology (7644) Plant Biology (15158) Scientific Communication and Education (2046) Synthetic Biology (4296) Systems Biology (9825) 
Zoology (2271)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.