A new compression strategy to reduce the size of nanopore sequencing data

preprint OA: closed CC-BY-4.0
📄 Open PDF Full text JSON View at publisher
Full text 49,152 characters · extracted from preprint-html · click to expand
A new compression strategy to reduce the size of nanopore sequencing data | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results A new compression strategy to reduce the size of nanopore sequencing data Kavindu Jayasooriya , Sasha P. Jenner , Pasindu Marasinghe , Udith Senanayake , Hassaan Saadat , View ORCID Profile David Taubman , View ORCID Profile Roshan Ragel , View ORCID Profile Hasindu Gamaarachchi , View ORCID Profile Ira W. 
Deveson doi: https://doi.org/10.1101/2024.10.02.616377 Kavindu Jayasooriya 1 Genomics and Inherited Disease Program, Garvan Institute of Medical Research , Sydney, NSW, Australia 2 Centre for Population Genomics, Garvan Institute of Medical Research and Murdoch Children’s Research Institute , Australia 3 School of Computer Science and Engineering, University of New South Wales , Sydney, NSW, Australia 4 Department of Computer Engineering, University of Peradeniya , Peradeniya, Sri Lanka Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sasha P. Jenner 1 Genomics and Inherited Disease Program, Garvan Institute of Medical Research , Sydney, NSW, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Pasindu Marasinghe 4 Department of Computer Engineering, University of Peradeniya , Peradeniya, Sri Lanka Find this author on Google Scholar Find this author on PubMed Search for this author on this site Udith Senanayake 4 Department of Computer Engineering, University of Peradeniya , Peradeniya, Sri Lanka Find this author on Google Scholar Find this author on PubMed Search for this author on this site Hassaan Saadat 5 School of Electrical Engineering and Telecommunications, University of New South Wales , Sydney, NSW, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site David Taubman 5 School of Electrical Engineering and Telecommunications, University of New South Wales , Sydney, NSW, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for David Taubman Roshan Ragel 4 Department of Computer Engineering, University of Peradeniya , Peradeniya, Sri Lanka Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Roshan Ragel Hasindu Gamaarachchi 3 School of Computer Science and Engineering, 
University of New South Wales , Sydney, NSW, Australia 1 Genomics and Inherited Disease Program, Garvan Institute of Medical Research , Sydney, NSW, Australia 2 Centre for Population Genomics, Garvan Institute of Medical Research and Murdoch Children’s Research Institute , Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hasindu Gamaarachchi For correspondence: hasindu{at}garvan.org.au i.deveson{at}garvan.org.au Ira W. Deveson 1 Genomics and Inherited Disease Program, Garvan Institute of Medical Research , Sydney, NSW, Australia 2 Centre for Population Genomics, Garvan Institute of Medical Research and Murdoch Children’s Research Institute , Australia 6 St Vincent’s Clinical School, Faculty of Medicine, University of New South Wales , Sydney, NSW, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ira W. Deveson For correspondence: hasindu{at}garvan.org.au i.deveson{at}garvan.org.au Abstract Full Text Info/History Metrics Supplementary material Preview PDF ABSTRACT Nanopore sequencing is an increasingly central tool for genomics. Despite rapid advances in the field, large data volumes and computational bottlenecks continue to pose major challenges. Here we introduce ex-zd , a new data compression strategy that helps address the large size of raw signal data generated during nanopore experiments. Ex-zd encompasses both a lossless compression method, which modestly outperforms all current methods for nanopore signal data compression, and a ‘lossy’ method, which can be used to achieve dramatic additional savings. The latter component works by reducing the number of bits used to encode signal data. We show that the three least significant bits in signal data generated on instruments from Oxford Nanopore Technologies (ONT) predominantly encode noise. 
Their removal reduces file sizes by half without impacting downstream analyses, including basecalling and detection of DNA methylation. Ex-zd compression saves hundreds of gigabytes on a single ONT sequencing experiment, thereby increasing the scalability, portability and accessibility of nanopore sequencing. BACKGROUND Nanopore sequencing enables high-throughput sequencing of native DNA or RNA molecules of any length. Platform updates from Oxford Nanopore Technologies (ONT) have enabled increasingly cost-effective and scalable sequencing in recent years ( Wang et al. 2021 ; Marx 2023 ). As the technology continues to improve, there is a need for ongoing improvement in data management, storage and analysis methods to match. An ONT device measures the displacement of ionic current as a DNA or RNA molecule passes through a nanoscale protein pore. Time-series current signal data is recorded and ‘basecalled’ into sequence reads, and can be analysed directly to identify ‘modified’ DNA ( Simpson et al. 2017 ; Zhang et al. 2023 ) or RNA ( Jain et al. 2022 ) nucleotides, DNA damage ( An et al. 2015 ), RNA secondary structures ( Stephenson et al. 2022 ; Bizuayehu et al. 2022 ), or other features beyond the primary nucleotide sequence ( Wan et al. 2022 ). Because algorithms for ONT basecalling and other signal-level analysis processes are continually evolving, it is common practice to retain raw signal data for future re-analysis ( Wan et al. 2022 ). Raw data retention is also critical for reproducibility, standardisation and open science. We previously introduced a new file format for the storage and analysis of nanopore raw signal data called SLOW5 (and its binary equivalent BLOW5), one benefit of which was an average ∼25% smaller file size compared to ONT’s original native file format, called FAST5 ( Gamaarachchi et al. 2022 ). 
This reduction was achieved by addressing metadata redundancy and inefficient space allocation, and similar improvements were subsequently adopted by ONT in a new file format for signal data called POD5 ( https://github.com/nanoporetech/pod5-file-format ). BLOW5 and POD5 also employ similar lossless data compression methods, which reduce the size of the chain of sequential signal values that make up a raw nanopore read. Despite these savings, signal data in both formats remain ∼10x larger than their corresponding basecalled reads, or ∼1.7 TiB for a typical human genome sample at ∼40× coverage ( Table S1 ). The large size of raw ONT signal data creates several challenges. Long-term storage is expensive; a major consideration both for ONT users and for government-funded data repositories. Upload, download or transfer of signal datasets is slow, may incur large egress costs and is often non-feasible in low-bandwidth settings, such as field studies or remote clinical sites. Large file sizes also create analysis bottlenecks, as data typically needs to be co-located with compute resources during the execution of analysis software, or even during sequencing, as data production on an ONT sequencing device rapidly consumes all disk space on the accompanying computer. To alleviate these challenges, we have developed a new nanopore signal data compression strategy called ex-zd , which delivers further space savings over existing methods. In doing so, we demonstrate how ONT signal data is amenable to ‘lossy’ data compression methods (Zaidi et al.), in which a portion of data is removed to greatly reduce file size with no impact on the utility of the data. We provide new ex-zd lossless and lossy compression methods for the nanopore community, via our open source libraries slow5lib, pyslow5 and data toolkit slow5tools ( Samarakoon et al. 2023b ). 
RESULTS Lossless data compression with ex-zd We developed a new compression strategy, called ex-zd , with the goal of improving nanopore signal data file sizes. Ex-zd can be used, among several alternate compression methods supported in slow5lib, pyslow5 and slow5tools (version 1.3.0 or later) ( Samarakoon et al. 2023b ), to reduce the size of data stored in BLOW5 format ( Gamaarachchi et al. 2022 ). Ex-zd compresses the chain of sequential signal data values that make up a read, and should therefore be equally applicable to raw data written in ONT’s FAST5 or POD5 format. By default, ex-zd is a ‘lossless’ compression method, meaning data is identical following compression and subsequent decompression. The lossless component of ex-zd builds upon an existing method, called VBZ ( https://github.com/nanoporetech/vbz_compression ), which is the current state-of-the-art for ONT data compression. A key element of VBZ is the transformation of each chain of raw signal values into a chain of differences between sequential values. Because most adjacent values are of similar magnitude, the differences or ‘zigzag deltas’ are small compared to the raw values. Ex-zd extends this concept, taking advantage of the high density of one-byte zig-zag deltas, which are encoded verbatim and separately from the two-byte data to achieve further savings (see Methods ). To evaluate this strategy, we applied lossless ex-zd compression to a typical human genome ONT sequencing dataset generated with current standard protocols (HG002-Prom5K chr22 subset; see Table S1 ). We compared the compression ratio achieved by ex-zd on this dataset to a wide range of other possible lossless compression methods ( n = 44), including VBZ. Ex-zd achieved the highest compression ratio (2.35) of any method tested ( Figure 1 ; Table S2 ). 
This translated to a 2.23% reduction in file size for a BLOW5 file compressed with ex-zd when compared to VBZ , 2.35% when compared to a native POD5 file, or a saving of 39 GiB on a typical human genome sequencing dataset at ∼40× coverage ( Table S1 ). We also observed that ex-zd compression adds minimal additional overhead in terms of computational time and RAM usage ( Table S3 ). Download figure Open in new tab Figure 1. Comparison of alternative lossless compression methods. Bar chart shows compression ratios achieved when applying different lossless compression methods to a typical ONT PromethION signal dataset (HG002-Prom5K chr22 subset; see Table S1 ). Compression ratio is calculated as follows: Compression ratio = Uncompressed size / Compressed size. A wide range of alternative methods (n = 44) was tested, most of which combine multiple algorithms. Algorithms are indicated in shorthand with “_” separators on the vertical axis and Table S2 provides a full summary of the algorithms used. Although ex-zd showed best-in-class performance, it produced a relatively modest saving over existing alternatives. Furthermore, based on the small differences observed between the best performing methods tested above ( Figure 1 ) we believe we are approaching the limit of what is practically achievable with lossless compression methods. Lossy data compression with ex-zd To further reduce the size of signal data, ex-zd combines a ‘lossy’ compression method, which can be optionally applied prior to the lossless encoding described above. Lossy compression methods, in which some portion of the starting data is non-reversibly removed to reduce the footprint, are common in other domains, such as image or audio processing (Zaidi et al.). One previous study considered the potential utility of lossy compression for nanopore sequencing data, with promising results ( Chandak et al. 2021 ). 
However, there is currently no usable implementation of a lossy compression method available to ONT users and further exploration is warranted. Ex-zd lossy compression uses a simple bit-reduction strategy, which was motivated by the following observations regarding ONT signal data properties. Signal data generated on an ONT PromethION instrument is currently recorded using 11 bits. When plotting a frequency distribution of current signal values in their native 11-bit format, the distribution is not smooth, but characterised by sporadic ‘spikes’ where the frequencies of adjacent values differ substantially ( Figure 2A ). These spikes occur reproducibly at specific signal values across independent reads and datasets, and tend to occur on signal values when the two least significant bits of the values transition from $11_2$ to $00_2$ (e.g. $011_2$-to-$100_2$, $0111_2$-to-$1000_2$, etc.). It is highly unlikely that this unusual pattern reflects natural biomolecular and/or electrophysical dynamics at play during the sequencing process. It is much more likely that this is an artifact of the analog to digital converter (ADC), or another hardware component, used in ONT devices, and could be erased without compromising the molecular information encoded in the data. Importantly, we observed that this pattern of spikes was reduced when the same dataset was represented with fewer than 11 bits, with a smooth bimodal frequency distribution obtained when data was encoded in just 7 bits ( Figure 2A ). Download figure Open in new tab Figure 2. Evaluating ex-zd bit-reduction strategy for lossy compression of ONT PromethION data. ( A ) Frequency distributions for raw signal values in a typical ONT PromethION dataset (HG002-Prom5K chr22 subset; see Table S1 ) represented in native 11-bit encoding (red) or encoded with a smaller number of bits (10–5 bits). 
( B ) Bar chart shows relative file sizes for the same dataset in BLOW5 format with current lossless compression methods (grey bars) compared to lossy ex-zd compression with decreasing numbers of bits (native 11-bit down to 5-bit). Sizes are shown as percentages relative to zlib-svb-zd, which is currently the default compression method used in slow5tools/slow5lib. Native POD5 format, which uses zstd-svb12-zd compression, is shown for comparison (dark grey bar). ( C ) Bar chart shows basecalling accuracy, as measured by mean read:reference identity, for the same dataset and bit-reduced encodings as above. Basecalling accuracies are shown separately for ONT’s Dorado (light grey) vs Guppy (dark grey) software and SUP (upper) vs HAC (lower) models. ( D ) Density scatter plots show read:reference identities for individual basecalled reads. The left plot compares native 11-bit data vs bit-reduced 8-bit data, both basecalled with Guppy SUP model. The right plot shows native 11-bit data basecalled with Guppy vs Dorado software, using a matched SUP basecalling model. This analysis suggests that the three or even four least significant bits in 11-bit signal data from an ONT PromethION primarily encode technical noise, rather than useful signal. Therefore, file sizes may be reduced by decreasing the number of bits used to encode signal values, without compromising the data. As an analogy, this is akin to reducing the number of decimal places used for each number when writing a list of numbers; fewer digits are required to produce the list but there is little impact on the values encoded or the differences between successive values. Prompted by these observations, we implemented a flexible bit-reduction strategy within ex-zd , in which the user can optionally reduce the number of bits used to encode signal values in a BLOW5 file from the default 11 bits for PromethION data down to 5 bits (or from 13 bits down to 7 bits for MinION data; see below). 
The N least significant bits are zeroed by rounding them down to 0 or up to $2^N$, depending on which is closer. Ex-zd lossless compression is then applied to the bit-reduced data. The two methods are synergistic because bit-reduction increases the density of one-byte zigzag deltas, allowing the lossless algorithm to achieve higher compression ratios (see Methods ). This results in proportional reductions to the BLOW5 file size, with a >10% saving for each additional bit removed ( Figure 2B ). For example, a BLOW5 file with 8-bit ex-zd compression is 44% smaller than native 11-bit POD5, or 737 GBytes smaller for a human genome sequencing dataset ( Table S1 ). Validation of ex-zd lossy compression It is critical that the space savings from ex-zd lossy compression do not come at the cost of data integrity. That is, we should see no meaningful impact on the outcomes of basecalling or other signal-level analysis when using bit-reduced data. We first assessed the outcomes of ONT basecalling on a human genome sequencing dataset encoded with decreasing numbers of bits, testing ONT’s Dorado and Guppy basecalling software with both ‘high accuracy’ (HAC) and ‘super accuracy’ (SUP) models (see Methods ). By comparison to the 11-bit (i.e. lossless) encoding, we saw no reduction in basecalling accuracy for 10-bit, 9-bit or 8-bit encoding, as assessed by mean, median or modal read:reference identities ( Figure 2C ). A small 0.3% mean reduction occurred at 7-bit, followed by a steep decline in basecalling accuracy when fewer than seven bits were used ( Figure 2C ; Table S4 ). Scatter plots showing read:reference identities for individual reads between datasets with different encodings indicated highly similar outcomes at 8-bit or above ( Figure 2D ). While not all reads are identical (i.e. 
some stray from the diagonal), we observed a greater degree of difference between identical 11-bit data basecalled with Dorado vs Guppy software using the same underlying models ($R^2$ = 0.988) than between an 8-bit vs 11-bit ex-zd encoding ($R^2$ = 0.991; Figure 2D ). Therefore, the small degree of difference seen in this read-level analysis reflects inherent stochasticity in the basecalling process, not a result of ex-zd lossy compression, and is implicitly tolerated by the nanopore community. We next considered the impact of ex-zd lossy compression on 5-methylcytosine (5mC) DNA methylation profiling. We assessed performance by comparison of 5mC frequencies at CpG sites ascertained by Guppy or Dorado on ONT data to matched reference data generated with whole-genome bisulphite sequencing (wgBS; see Methods ). We observed no reduction in the correlation of ONT vs wgBS results across global CpG sites for encodings of 8-bit or greater ( Figure 3A,B ; Table S5 ). As was observed for basecalling accuracy, individual reads showed highly similar methylation states between different encodings, and a greater degree of difference between Dorado vs Guppy ($R^2$ = 0.929) than the 8-bit vs 11-bit encoding ($R^2$ = 0.977; Figure 3C ). These results were recapitulated when using open source methylation profiling software f5c ( Gamaarachchi et al. 2020 ) as an alternative to Guppy or Dorado ( Table S5 ). All basecalling and methylation profiling results were also recapitulated as above using a dataset generated with a 4KHz (rather than 5KHz) data sampling rate, as was used on ONT devices prior to 2023, and with data generated with the previous generation of ONT flow cells (R9.4.1; Tables S6 , S7 ). Download figure Open in new tab Figure 3. Impact of ex-zd bit-reduction on ONT DNA methylation profiling. 
( A ) Bar chart shows the correlation of 5mC methylation frequencies at global CpG sites recorded with whole-genome bisulfite sequencing (wgBS) vs ONT methylation profiling on a matched sample (HG002 genome reference sample). ONT methylation profiling was performed with either Dorado (light grey) vs Guppy (dark grey) software and SUP (left) vs HAC (right) models on the same raw signal dataset (HG002-Prom5K chr22 subset; see Table S1 ) encoded with a decreasing number of bits (native 11-bit down to 5-bit). ( B ) Density scatter plots compare global 5mC profiles from native 11-bit data and bit-reduced 8-bit data to wgBS data, as per the above comparisons. ( C ) Density scatter plots show 5mC methylation frequencies for individual basecalled reads; i.e. the fraction of CpG bases within a given read that are called as being methylated. The left plot compares native 11-bit data vs bit-reduced 8-bit data, both basecalled with Guppy SUP model. The right plot shows native 11-bit data basecalled with Guppy vs Dorado software, using a matched SUP basecalling model. Data generated on an ONT MinION device is natively encoded with 13 bits, rather than 11 bits for PromethION data. Using a typical MinION dataset (HG002-Min5K; see Table S1 ), we next confirmed that ex-zd lossy compression is also effective on MinION data. We found that up to three bits could be removed with no impact on basecalling or 5mC profiling, delivering a space saving of 44% at a 10-bit vs 13-bit encoding ( Supplementary Figure 1A-C ; Table S8 ). Finally, we assessed the suitability of ex-zd lossy compression on RNA sequencing data generated using ONT’s RNA004 protocol on a PromethION device (UHRR-Prom; see Table S1 ). Similar to DNA sequencing, we found that up to three bits could be removed with no impact on basecalling, delivering a space saving of ∼40% for the 8-bit vs 11-bit encoding ( Supplementary Figure 2A-C ; Table S9 ). 
In summary, we observed no meaningful impact in the quality of basecalling or detection of DNA methylation when applying ex-zd lossy compression with up to three bits removed. DISCUSSION With the breadth of ONT sequencing adoption and the scale of datasets growing ( Alonge et al. 2020 ; Beyter et al. 2021 ; Chen et al. 2021 ; Gustafson et al. 2024; Reis et al. 2023 ; Schloissnig et al. 2024), there is a need for new and efficient methods for data storage and data sharing. Ex-zd is a new compression strategy that can be used to reduce file sizes of raw nanopore signal data to help address this challenge. Ex-zd encompasses both a lossless compression method, which modestly outperforms other available methods, and a lossy bit-reduction method, with the two working in tandem to deliver substantial savings. While lossy compression methods are popular in other domains, they are not currently used for nanopore data and are rare in the genomics field. Lossy methods irreversibly transform the underlying data and are generally avoided in scenarios where it is more important to maximize precision than to reduce the storage footprint of the data (Zaidi et al.). However, we demonstrate above that ONT PromethION signal data can be reduced from 11-bit to 8-bit encoding with no negative impact on analysis outcomes for either basecalling or detection of modified bases (e.g. 5mC), thereby delivering space savings without a tradeoff in precision. In fact, our analyses indicate that the three or even four least significant bits in native ONT data primarily encode noise. Given that 8-bit PromethION data with ex-zd compression is ∼45% smaller than 11-bit native POD5 format, this is an important development for the field. Moreover, this provides the basis to evaluate and/or develop alternative lossless or lossy compression strategies, which may be applied on top of bit-reduction to deliver greater savings. 
For example, our preliminary observations suggest the Free Lossless Audio Codec (FLAC) algorithm, commonly used for audio-compression, may be well suited for compression of bit-reduced ONT signal data (see Supplementary Note 1 ). While our results demonstrate the promise of lossy compression methods for nanopore data, any lossy method must be rigorously evaluated and applied with care, as their misuse can permanently compromise the user’s data. Our results show equivalent basecalling accuracy with bit-reduced 8-bit PromethION data compared to native 11-bit, and just a small (0.3%) reduction in accuracy with 7-bit data. It is interesting to note that the ONT basecalling models used here are neural network models, trained on 11-bit data. Given the characteristic differences seen between 11-bit vs 7-bit data (see Figure 2A ), we were surprised at the strong performance on 8-bit and 7-bit data. This opens the intriguing possibility that basecalling performance could be improved via re-training on bit-reduced data. We hypothesise that the removal of noise from the signal data, which appears to be optimal for the 7-bit encoding, may have analytic advantages. File size reductions delivered by ex-zd or other future lossless methods will have many benefits for the community. The most obvious will be proportional reductions in the cost of data storage, which are a major expense both for everyday users and for public data repositories, such as EBI’s European Nucleotide Archive (ENA) or NCBI’s Sequence Read Archive (SRA). The time and cost required to upload/download data from these repositories will be similarly reduced, encouraging open data sharing of raw signal data. This complements our recent tool slow5curl ( Wong et al. 2024 ) which allows a user to quickly fetch specific reads (e.g. for a gene of interest) from a nanopore signal dataset on a remote server, such as ENA or SRA, without downloading the entire dataset. 
Smaller file sizes will facilitate data transfer between sites with limited bandwidth, which can be a major obstacle for remote field studies enabled by portable ONT devices ( Quick et al. 2016 ). The less obvious impact of file size reductions will be to increase sequencing throughput on ONT devices, such as the PromethION P48, where available storage can currently accommodate only around half of the maximum theoretical data generation capacity. Applying ex-zd compression to each new batch of reads generated during sequencing would increase the sequencing throughput that is practically achievable by almost two-fold (given the 44% space saving with 3-bit reduction), without any further updates to the hardware. Finally, smaller file sizes can also address a common analysis bottleneck for ONT users, wherein disk space required to hold data during analysis is the limiting resource, rather than compute capacity. In such a scenario, a pedantic user may choose to apply lossy compression to their dataset to alleviate space constraints during analysis, while retaining an original lossless copy in their archive for long-term storage. Ex-zd is the latest innovation in the SLOW5 data ecosystem ( https://hasindu2008.github.io/slow5/ ), which includes the SLOW5/BLOW5 file format itself ( Gamaarachchi et al. 2022 ); software libraries for reading/writing files ( https://github.com/hasindu2008/slow5lib ); a toolkit for working with SLOW5/BLOW5 files ( Samarakoon et al. 2023b ); the slow5curl utility for remote data access ( Wong et al. 2024 ); BLOW5-enabled basecalling software ( Samarakoon et al. 2023a ); packages for simulation (Gamaarachchi et al. 2023) and visualisation ( Samarakoon et al. 2024 ) of signal data; and a range of other open source tools ( Zhang et al. 2021 ; Firtina et al. 2024 ; Kovaka et al. 2024 ; Guo et al. 2024 ; Gamaarachchi et al. 2020 ; Shih et al. 2022 ; Senanayake et al. 2023 ; Simpson et al. 2017 ). 
Ex-zd compression is now supported within slow5lib, pyslow5 and slow5tools , and all methods and formats are open source, in case ONT or other future nanopore vendors want to adopt them. METHODS & IMPLEMENTATION Ex-zd compression strategy Ex-zd is a new compression strategy for nanopore signal data, which separately encodes one-byte and two-byte zig-zag delta transformed data. The ex-zd strategy is illustrated in Figure 4 and mathematical derivations are provided in Supplementary Note 2 . Download figure Open in new tab Figure 4. Schematic overview of ex-zd lossless compression strategy. Schematic illustrates the structure of the raw signal values for a single nanopore sequencing read encoded with ex-zd. Orange blocks represent the ex-zd metadata; blue block represents one-byte data; purple blocks represent two-byte exception data. Assuming exceptions exist, the exception data structure can take two forms (shown below), depending whether the number of exceptions $n_x > 1$ or $n_x = 1$. If there are no exceptions, $n_x = 0$ and $X = 0$ (i.e. purple block is absent). The ex-zd encoding begins by writing the version number using one byte, followed by the number of signal samples written using eight bytes, then the number of bits eliminated during the lossy encoding using one byte ( Figure 4 ). Next, each signal sample is bit-shifted to the right by the smallest length of successive zero least significant bits (which is greater than or equal to the number of bits eliminated during lossy compression). Next, the zig-zag delta transformation is applied. In this transformation, the first signal sample followed by the consecutive differences (deltas) are zig-zag encoded, meaning positive integers are doubled and the absolute value of each negative integer is doubled and then reduced by one. The first signal sample after zig-zag encoding is then written using two bytes ( Figure 4 ). 
Afterwards, the data is divided into two groups: integers which fit into one byte (the one-byte values) and those which require two bytes (the exceptions). 256 is then subtracted from each exception (256 is the minimum value that an exception can have). The number of exceptions is written using four bytes ( Figure 4 ). If there is only one exception, the exception’s position and the exception are both written using four bytes each. When there are zero exceptions no exception data is written (the purple box in Figure 4 would not exist). If there is more than one exception, the positions of the exceptions are encoded as follows: the first position is left unchanged whilst the remainder are delta encoded and then reduced by 1; finally all the integers are streamvbyte encoded. The size of this encoding is written using four bytes, followed by the encoding itself. Next, the exceptions are streamvbyte encoded. As before, the size of this encoding is written using four bytes, followed by the encoding itself. Finally, each data point in the one-byte data is written using one byte (blue box in Figure 4 ). Bit elimination during ex-zd lossy compression Ex-zd lossy compression is based on a simple bit-reduction strategy, in which the user can specify the number of bits to be eliminated from their signal dataset. If n bits are to be eliminated, for each signal value x , the following bit-wise rounding operation is applied that will zero the n least significant bits: [equation not captured in this text extraction; see the publisher copy] When performing the bit-reduction, the number of bits eliminated is stored as described above and the signal values are bit shifted to the right. When decoding, the values are left-shifted by this same amount. During lossless encoding, this will be zero and no shifting is performed. Benchmark experiments Datasets The datasets used for the experiments are listed in Table S1 . 
HG002-Prom5K is a DNA sequencing experiment run on the popular human genome reference sample HG002, sequenced on an ONT PromethION device with an R10.4.1 flowcell and the data was collected at 5kHz sampling rate. HG002-Prom4K is similar except that the data was collected at 4kHz. HG002-Min5K is sequenced on a MinION R10.4.1 at 5 kHz. UHRR-Prom is a direct RNA sequencing experiment run on the human transcriptome reference sample, Universal Human Reference RNA (Agilent). This was sequenced on a PromethION using the latest RNA004 kit and flowcell for direct RNA sequencing. Similar HG001 and UHRR datasets were also available from the previous generation R9.4.1 PromethION flowcell version. For many experiments, a limited subset of the full dataset was used to minimise compute resources. For DNA sequencing data, this was achieved by subsetting reads corresponding to human chr22. Subsets were generated by basecalling the signal data, aligning the reads to the hg38 reference using minimap2 and then extracting those reads using samtools and slow5tools . The RNA 500K subset was generated by randomly picking 500,000 reads from the signal dataset. File size and performance measurement The experiments for measuring the file sizes and performance were executed on a server with an Intel Xeon Silver 4114 CPU (20 cores, 40 threads), 376 GiB RAM and an HDD-based network-attached storage (12 spinning disks configured with RAID 10) mounted via Network File System (NFS). The system was running Ubuntu 18.04.5 as the operating system. File sizes were measured using the du command ( Supplementary Note 3 ). The runtime and peak RAM were measured using the GNU time utility. Converting to/from lossless ex-zd was performed using the slow5tools view (v1.3.0) command. Lossy compression was performed using slow5tools degrade . The disk I/O cache (pagecache, dentries and inodes) was cleaned before runtime measurement experiments. Details of the commands and software versions are in Supplementary Note 3 . 
Accuracy evaluation Basecalling and methylation calling were performed using Guppy (via Buttery-eel ) and Dorado (via slow5-dorado ), with full commands and versions provided in Supplementary Note 3 . Basecalled reads were aligned to the reference (hg38 with no alternate contigs for DNA data and Gencode v40 human transcriptome for RNA data) using minimap2 . For measuring the basecalling accuracies, blast-like identity scores were calculated for primary alignments using paftools.js in the minimap2 package (blast-like identity score = 10th column divided by 11th column in a PAF file). To measure the 5mC calling accuracy, we first mapped the basecalls with methylation tags using minimap2 , sorted them using samtools and then the methylation frequencies were extracted using modkit (v0.1.13) ( Supplementary Note 3 ). The 5mC methylation frequencies were compared to publicly available data from whole-genome bisulfite sequencing (see Data Availability statement) using the compare_methylation.py script associated with nanopolish/f5c . To assess per-read modification calling, we extracted the modification calls per site using modkit extract ( Supplementary Note 3 ). Then we extracted the modification type of interest (mod_code ‘m’ for 5mC). Then, for each read, we calculated the modification frequency across the read, taking modification probability > 0.8 as ‘modified’ and <0.2 as ‘unmodified’. The modification frequency of a given read was calculated as modified calls / (modified calls + unmodified calls). DATA & CODE AVAILABILITY The large datasets HG002-Prom5K, HG002-Prom4K and UHRR-Prom used for benchmarking experiments are available under the European Nucleotide Archive (ENA) at Bioproject PRJEB64652 (Runs ERR12997168, ERR11777845, and ERR12997170, respectively). These are also available as part of the AWS Open Data Program ( https://registry.opendata.aws/gtgseq/ ) in the gtgseq S3 bucket ( https://gtgseq.s3.amazonaws.com/index.html ). 
The smaller datasets HG002-Prom5K (chr22 subset), HG002-Prom4K (chr22 subset), HG002-Min5K, UHRR-Prom (500K read subset), HG001-PromR9 (chr22 subset) and UHRR-PromR9 are available through the Dryad dataset 10.5061/dryad.1vhhmgr3p. Bisulphite data was downloaded from publicly available sources: for HG001 from Encode (ENCFF835NTC) and for HG002 from ONT open-data AWS repository (s3://ont-open-data/gm24385_mod_2021.09/bisulphite/cpg). Ex-zd compression implementation is available through slow5lib ( https://github.com/hasindu2008/slow5lib ) and slow5tools ( https://github.com/hasindu2008/slow5tools ) version 1.3.0 onwards. Commands and software versions used for executing benchmark experiments are available in Supplementary Note 3 . DECLARATIONS I.W.D. manages a fee-for-service sequencing facility at the Garvan Institute of Medical Research and is a customer of Oxford Nanopore Technologies but has no further financial relationship. H.G., and I.W.D. have previously received travel and accommodation expenses from Oxford Nanopore Technologies. I.W.D. has paid consultant roles with Sequin PTY. H.G. has paid consultant roles with Sequin PTY and Swan Genomics PTY. The authors declare no other competing financial or nonfinancial interests. CONTRIBUTIONS All authors contributed to the conception, design and benchmarking of ex-zd . K.J., S.P.J., & H.G. implemented ex-zd and integrated into slow5lib and slow5tools . K.J., S.P.J., & H.G. performed benchmarking experiments. S.P.J., H.G. & I.W.D prepared the figures and manuscript. ACKNOWLEDGEMENTS We acknowledge the following funding support: Australian Medical Research Futures Fund grants MRF2016008, and MRF2023126 (to I.W.D.), Australian Research Council DECRA Fellowship DE230100178 and Australian Research Council’s Discovery Project DP230100651 (to H.G). The views expressed herein are those of the authors and are not necessarily those of the Australian Government or the Australian Research Council. 
We thank Sri Parameswaran, John Stavrakakis (University of Sydney) for insightful discussions. We also thank James Ferguson (Garvan Institute) for assistance with buttery-eel and pyslow5. Footnotes ↵ * Joint-first authors; ↵ # Joint-senior authors; REFERENCES ↵ Alonge M , Wang X , Benoit M , Soyk S , Pereira L , Zhang L , Suresh H , Ramakrishnan S , Maumus F , Ciren D , et al. 2020 . Major Impacts of Widespread Structural Variation on Gene Expression and Crop Improvement in Tomato . Cell 182 : 145 – 161.e23 . OpenUrl CrossRef ↵ An N , Fleming AM , White HS , Burrows CJ . 2015 . Nanopore detection of 8-oxoguanine in the human telomere repeat sequence . ACS Nano 9 : 4296 – 4307 . OpenUrl ↵ Beyter D , Ingimundardottir H , Oddsson A , Eggertsson HP , Bjornsson E , Jonsson H , Atlason BA , Kristmundsdottir S , Mehringer S , Hardarson MT , et al. 2021 . Long-read sequencing of 3,622 Icelanders provides insight into the role of structural variants in human diseases and other traits . Nat Genet 53 : 779 – 786 . OpenUrl CrossRef ↵ Bizuayehu TT , Labun K , Jakubec M , Jefimov K , Niazi AM , Valen E. 2022 . Long-read single-molecule RNA structure sequencing using nanopore . Nucleic Acids Res 50 : e120 . OpenUrl ↵ Chandak S , Tatwawadi K , Sridhar S , Weissman T. 2021 . Impact of lossy compression of nanopore raw signal data on basecalling and consensus accuracy . Bioinformatics 36 : 5313 – 5321 . OpenUrl ↵ Chen Y , Davidson NM , Wan YK , Patel H , Yao F , Low HM , Hendra C , Watten L , Sim A , Sawyer C , et al. 2021 . A systematic benchmark of Nanopore long read RNA sequencing for transcript level analysis in human cell lines . bioRxiv doi.org/ 10.1101/2021.04.21.440736 . OpenUrl Abstract / FREE Full Text ↵ Firtina C , Soysal M , Lindegger J , Mutlu O. 2024 . RawHash2: mapping raw nanopore signals using hash-based seeding and adaptive quantization . Bioinformatics 40 : btae478 . OpenUrl Gamaarachchi H , Ferguson JM , Samarakoon H , Liyanage K , Deveson IW . 2024 . 
Squigulator: simulation of nanopore sequencing signal data with tunable noise parameters . Genome Res 34 : 778 – 783 . OpenUrl Abstract / FREE Full Text ↵ Gamaarachchi H , Lam CW , Jayatilaka G , Samarakoon H , Simpson JT , Smith MA , Parameswaran S. 2020 . GPU accelerated adaptive banded event alignment for rapid comparative nanopore signal analysis . BMC Bioinformatics 21 : 343 . OpenUrl ↵ Gamaarachchi H , Samarakoon H , Jenner SP , Ferguson JM , Amos TG , Hammond JM , Saadat H , Smith MA , Parameswaran S , Deveson IW . 2022 . Fast nanopore sequencing data analysis with SLOW5 . Nat Biotechnol 40 : 1026 – 1029 . OpenUrl ↵ Guo Z , Ni Y , Tan L , Shao Y , Ye L , Chen S , Li R. 2024 . Nanopore Current Events Magnifier (nanoCEM): a novel tool for visualizing current events at modification sites of nanopore sequencing . NAR Genomics and Bioinformatics 6 : qae052 . OpenUrl Gustafson , JA , et al. 2024 . Nanopore sequencing of 1000 Genomes Project samples to build a comprehensive catalog of human genetic variation . bioRxiv doi.org/10.1101/2024.03.05.24303792 ↵ Jain M , Abu-Shumays R , Olsen HE , Akeson M. 2022 . Advances in nanopore direct RNA sequencing . Nat Methods 19 : 1160 – 1164 . OpenUrl CrossRef ↵ Kovaka S , Hook PW , Jenike KM , Shivakumar V , Morina LB , Razaghi R , Timp W , Schatz MC . 2024 . Uncalled4 improves nanopore DNA and RNA modification detection via fast and accurate signal alignment . bioRxiv doi.org/ 10.1101/2024.03.05.583511 OpenUrl Abstract / FREE Full Text ↵ Marx V. 2023 . Method of the year: long-read sequencing . Nat Methods 20 : 6 – 11 . OpenUrl CrossRef ↵ Quick J , Loman NJ , Duraffour S , Simpson JT , Severi E , Cowley L , Bore JA , Koundouno R , Dudas G , Mikhail A , et al. 2016 . Real-time, portable genome sequencing for Ebola surveillance . Nature 530 : 228 – 232 . 
OpenUrl CrossRef PubMed ↵ Reis ALM , Rapadas M , Hammond JM , Gamaarachchi H , Stevanovski I , Ayuputeri Kumaheri M , Chintalaphani SR , Dissanayake DSB , Siggs OM , Hewitt AW , et al. 2023 . The landscape of genomic structural variation in Indigenous Australians . Nature 624 : 602 – 610 . OpenUrl ↵ Samarakoon H , Ferguson JM , Gamaarachchi H , Deveson IW . 2023a . Accelerated nanopore basecalling with SLOW5 data format . Bioinformatics 39 . doi: 10.1093/bioinformatics/btad352 . OpenUrl CrossRef ↵ Samarakoon H , Ferguson JM , Jenner SP , Amos TG , Parameswaran S , Gamaarachchi H , Deveson IW . 2023b . Flexible and efficient handling of nanopore sequencing signal data with slow5tools . Genome Biol 24 : 69 . OpenUrl CrossRef ↵ Samarakoon H , Liyanage K , Ferguson JM , Parameswaran S , Gamaarachchi H , Deveson IW . 2024 . Interactive visualisation of raw nanopore signal data with Squigualiser . Bioinformatics 40 : btae501 . OpenUrl Schloissnig , S. et al. 2024 . Long-read sequencing and structural variant characterization in 1,019 samples from the 1000 Genomes Project . bioRxiv doi.org/ 10.1101/2024.04.18.590093 OpenUrl Abstract / FREE Full Text ↵ Senanayake A , Gamaarachchi H , Herath D , Ragel R. 2023 . DeepSelectNet: deep neural network based selective sequencing for oxford nanopore sequencing . BMC Bioinformatics 24 : 31 . OpenUrl CrossRef ↵ Shih PJ , Saadat H , Parameswaran S , Gamaarachchi H. 2022 . Efficient real-time selective genome sequencing on resource-constrained devices . Gigascience 12 . doi: 10.1093/gigascience/giad046 . ↵ Simpson JT , Workman RE , Zuzarte PC , David M , Dursi LJ , Timp W. 2017 . Detecting DNA cytosine methylation using nanopore sequencing . Nat Methods 14 : 407 – 410 . OpenUrl CrossRef PubMed ↵ Stephenson W , Razaghi R , Busan S , Weeks KM , Timp W , Smibert P. 2022 . Direct detection of RNA modifications and structure using single-molecule nanopore sequencing . Cell Genom 2 . doi: 10.1016/j.xgen.2022.100097 . 
OpenUrl CrossRef ↵ Wang Y , Zhao Y , Bollas A , Wang Y , Au KF . 2021 . Nanopore sequencing technology, bioinformatics and applications . Nat Biotechnol 39 : 1348 – 1365 . OpenUrl CrossRef PubMed ↵ Wan YK , Hendra C , Pratanwanich PN , Göke J. 2022 . Beyond sequencing: machine learning algorithms extract biology hidden in Nanopore signal data . Trends Genet 38 : 246 – 257 . OpenUrl CrossRef ↵ Wong B , Ferguson JM , Do JY , Gamaarachchi H , Deveson IW . 2024 . Streamlining remote nanopore data access with slow5curl . Gigascience 13 . doi: 10.1093/gigascience/giae016 . OpenUrl CrossRef Zaidi HN , Luo WQ , Shiri I. 2006 . Fundamental Data Compression . Butterworth-Heinemann . doi: 10.1016/B978-075066310-6/50004-0 OpenUrl CrossRef ↵ Zhang H , Li H , Jain C , Cheng H , Au KF , Li H , Aluru S. 2021 . Real-time mapping of nanopore raw signals . Bioinformatics 37 : i477 – i483 . OpenUrl CrossRef ↵ Zhang Y , Zhang Q , Yang X , Gu X , Chen J , Shi T. 2023 . 6mA DNA Methylation on Genes in Plants Is Associated with Gene Complexity, Expression and Duplication . Plants 12 . doi: 10.3390/plants12101949 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted October 03, 2024. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A new compression strategy to reduce the size of nanopore sequencing data Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share A new compression strategy to reduce the size of nanopore sequencing data Kavindu Jayasooriya , Sasha P. Jenner , Pasindu Marasinghe , Udith Senanayake , Hassaan Saadat , David Taubman , Roshan Ragel , Hasindu Gamaarachchi , Ira W. Deveson bioRxiv 2024.10.02.616377; doi: https://doi.org/10.1101/2024.10.02.616377 Share This Article: Copy Citation Tools A new compression strategy to reduce the size of nanopore sequencing data Kavindu Jayasooriya , Sasha P. Jenner , Pasindu Marasinghe , Udith Senanayake , Hassaan Saadat , David Taubman , Roshan Ragel , Hasindu Gamaarachchi , Ira W. Deveson bioRxiv 2024.10.02.616377; doi: https://doi.org/10.1101/2024.10.02.616377 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7651) Biochemistry (17746) Bioengineering (13928) Bioinformatics (42066) Biophysics (21499) Cancer Biology (18650) Cell Biology (25579) Clinical Trials (138) Developmental Biology (13409) Ecology (19947) Epidemiology (2067) Evolutionary Biology (24374) Genetics (15633) Genomics (22557) Immunology (17775) Microbiology (40505) Molecular Biology (17217) Neuroscience (88796) Paleontology (667) Pathology (2845) Pharmacology and Toxicology (4836) Physiology (7664) Plant Biology (15179) Scientific Communication and Education (2047) Synthetic Biology (4304) Systems Biology (9839) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-24T02:00:01.246996+00:00
License: CC-BY-4.0