Leveraging Basecaller’s Move Table to Generate a Lightweight k-mer Model

doi:10.1101/2024.06.30.601452

Leveraging Basecaller’s Move Table to Generate a Lightweight k-mer Model

2024 · doi:10.1101/2024.06.30.601452

preprint OA: closed CC-BY-4.0

📄 Open PDF Full text JSON View at publisher

Full text 32,880 characters · extracted from preprint-html · click to expand

Leveraging Basecaller’s Move Table to Generate a Lightweight k-mer Model | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Leveraging Basecaller’s Move Table to Generate a Lightweight k-mer Model View ORCID Profile Hiruna Samarakoon , Yuk Kei Wan , View ORCID Profile Sri Parameswaran , View ORCID Profile Jonathan Göke , View ORCID Profile Hasindu Gamaarachchi , View ORCID Profile Ira W. Deveson doi: https://doi.org/10.1101/2024.06.30.601452 Hiruna Samarakoon 1 School of Computer Science and Engineering, University of New South Wales , Sydney, NSW, Australia 2 Genomics and Inherited Disease Program, Garvan Institute of Medical Research , Sydney, NSW, Australia 3 Centre for Population Genomics, Garvan Institute of Medical Research and Murdoch Children’s Research Institute , Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hiruna Samarakoon For correspondence: hiruna{at}unsw.edu.au Yuk Kei Wan 4 Genome Institute of Singapore, A*STAR , Singapore, Singapore 5 Yong Loo Lin School of Medicine, National University of Singapore , Singapore, Singapore Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sri Parameswaran 6 School of Electrical and Information Engineering, University of Sydney , Sydney, NSW, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sri Parameswaran Jonathan Göke 4 Genome Institute of Singapore, A*STAR , Singapore, Singapore 7 Department of Statistics and Data Science, National University of Singapore , Singapore, Singapore 8 National Cancer Center of Singapore , Singapore, Singapore Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jonathan Göke Hasindu Gamaarachchi 1 School of Computer Science and Engineering, University of New South Wales , Sydney, NSW, Australia 2 Genomics and Inherited Disease Program, Garvan Institute of Medical Research , Sydney, NSW, Australia 3 Centre for Population Genomics, Garvan Institute of Medical Research and Murdoch Children’s Research Institute , Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hasindu Gamaarachchi Ira W. Deveson 2 Genomics and Inherited Disease Program, Garvan Institute of Medical Research , Sydney, NSW, Australia 3 Centre for Population Genomics, Garvan Institute of Medical Research and Murdoch Children’s Research Institute , Australia 9 St Vincent’s Clinical School, Faculty of Medicine, University of New South Wales , Sydney, NSW, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ira W. Deveson For correspondence: i.deveson{at}garvan.org.au Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Nanopore sequencing by Oxford Nanopore Technologies (ONT) enables direct analysis of DNA and RNA by capturing raw electrical signals. Different nanopore chemistries have varied k-mer lengths, current levels, and standard deviations, which are stored in k-mer models. Particularly in cases where official models are lacking or unsuitable for specific sequencing conditions, tailored k-mer models are crucial to ensure precise signal-to-sequence alignment and interpretation. The process of transforming raw signals into nucleotide sequences, known as basecalling, is a fundamental step in nanopore sequencing. In this study, we leverage the basecaller’s move table to create a lightweight denovo k-mer model for RNA004 chemistry. We showcase the effectiveness of our custom k-mer model through high alignment rates (97.48%) compared to larger default models. Additionally, our 5-mer model exhibits similar performance as the default 9-mer models in m6A methylation detection. Introduction Nanopore sequencing allows for the direct examination of unaltered DNA [ 1 ],[ 2 ], RNA[ 3 ], and protein molecules [ 4 ], offering numerous possibilities in various life science fields. Instruments developed by Oxford Nanopore Technologies (ONT) detect changes in ionic current as these molecules traverse a tiny protein pore at the nanoscale. These devices capture time-series data of the current signal, often called ‘squiggle’ data, which can then be converted into sequence reads through basecalling or analyzed directly [ 5 ]. Extracting meaningful biological information from squiggle data requires converting it into interpretable sequences. This conversion process, known as basecalling, utilizes advanced algorithms, often involving neural networks, to identify and assign nucleotide bases based on the specific signal patterns [ 6 ]. One key challenge in nanopore sequencing lies in accurately aligning the raw electrical signals with the corresponding nucleotide sequence. This alignment process is crucial for correctly interpreting the data. In this context, k-mers, short nucleotide sequences of a defined length (e.g., 5-mers), are utilized in the alignment. Event alignment aims to match basecalled k-mers to their corresponding “events” – specific current levels observed in the raw signal that represent k-mers passing through the nanopore at different times. Modern basecallers uses Connectionist Temporal Classifiers (CTCs) to produce a crude signal-to-base alignments [ 7 ]. For example, in handwritten images, CTCs map sequential data such as strokes or characters to their image counterparts, ensuring accurate recognition [ 8 ]. Similarly, in basecalling, CTCs help align event data from raw signals to their basecalled sequences [ 9 ]. This alignment output, known as the “move table,” provides a basic mapping, albeit crude, of events to their corresponding basecalled sequences, facilitating the initial alignment of nucleotide sequences to the electrical data. Different nanopore chemistries exhibit distinct pore specifications, resulting in variations in k-mer lengths, current levels, and standard deviations. These details are compiled and stored in a file known as the k-mer model or table, sometimes also referred to as the pore model or table. These models assume a specific length for k-mers, which may or may not precisely match the actual k-mer length within the nanopore [ 10 ]. Typically, a basic k-mer model includes 4 k different k-mers, where ‘k’ represents the length of the k-mer, reflecting the four nucleotides (A, C, G, T) in DNA. Each k-mer, treated as an event, is associated with an expected current level and a standard deviation, capturing the variability in current levels observed for different k-mers passing through the nanopore. Various signal alignment methods, such as Nanopolish/F5c event-align [ 1 ][ 11 ], Uncalled4 event-align [ 12 ], Nanopolish signal projection, Squigulator simulation [ 13 ], Sigmap signal mapping [ 14 ], and Sigfish dtw [ 15 ], rely on these k-mer models for accurate alignment of raw signal data to their corresponding nucleotide sequences. These signal alignment methods are vital in downstream analysis pipelines [ 1 ]. For example, the m6A modification detection tool - m6Anet uses the signal alignment produced by Nanopolish/F5c . It’s important to note that while all bases within a k-mer influence the current or voltage level at a given moment, their contributions may vary. This variability in contribution is accounted for in the k-mer model, allowing for a more nuanced understanding of how different nucleotide combinations affect the observed electrical signals during nanopore sequencing. Creating a denovo k-mer model is useful, especially in scenarios where an official model is not readily available or optimized for specific sequencing contexts. Usually, when Oxford Nanopore Technologies (ONT) releases a new sequencing chemistry, they also provide a corresponding k-mer model [ 16 ]. However, The k-mer model generation software is proprietary [ 17 ] and in instances such as r10.4.1 and RNA004 chemistries saw delays in the release of their k-mer models. Without a suitable k-mer model, event alignment algorithms reliant on such models become ineffective or less reliable, emphasizing the need for a tailored k-mer model for accurate signal alignment and interpretation. Moreover, the official ONT k-mer models often have exact k-mer lengths, leading to large models due to the exponential growth of possible k-mers (e.g., RNA004 with a 9-mer model has 4 9 possible k-mers). Deducing lightweight k-mer models that maintain similar performance metrics is advantageous for computational efficiency and resource utilization. Furthermore, similar to basecalling models that may require retraining for different sequencing contexts beyond the initial focus on human samples, k-mer models should also be trained or adapted for specific sequencing contexts to ensure better alignment accuracy and signal interpretation tailored to the unique characteristics of the sample being sequenced. In this work, we leverage the basecaller’s move table to deduce a lightweight denovo k-mer model for RNA004 chemistry. We gather a substantial number of samples for each k-mer using information from the move table and then calculate the mean and standard deviation. To ensure the quality of our model, we employ various filtering techniques to capture only the most reliable samples. Detailed methodologies regarding sample collection and filtering techniques are outlined in the Methods section. The results of our k-mer model creation process, including the performance metrics and comparisons with existing models, are presented in the Results section. Methods Our methodology for k-mer model creation utilizes a custom program named Poregen . This program extracts current samples for each k-mer based on a provided alignment. The alignment can be either a signal-to-read alignment, such as a move table, or a signal-to-reference alignment, like the one generated by Nanopolish/F5c event-align ( Fig. 1 ). When a k-mer model isn’t readily available for a new nanopore chemistry, Poregen can utilize the move table alignment ( Fig. 1 ). The move table can be either the direct signal-to-read alignment or a signal-to-reference alignment derived using Squigualiser reform and realign [ 18 ]. Poregen takes the raw signal in SLOW5 format [ 19 ][ 20 ], the sequence in FASTA format, and the signal-to-sequence in SAM or PAF formats. Download figure Open in new tab Figure 1. Schematic diagram summarises data preprocessing steps and alternative workflow paths to generate custom k-mer models using Poregen . ( a ) The raw signal is basecalled and aligned to the reference. the move table is a signal-to-read alignment that does not depend on a k-mer model. Alternatively, the user may perform signal-to-reference alignment with a variety of external software, including F5c, Nanopolish or Uncalled4 which in most cases require a k-mer model. ( b ) The signal-to-sequence encoded using ‘ss’ format is filtered and fed to Poregen which does the sampling of k-mers. Then the mean and standard deviation are calculated for each k-mer to create a new k-mer model. Data preparation Poregen requires the alignment to be in a specific format, denoted as the ss format. For example, ss:Z:7,2D3,4I,5, is translated to: 7 consecutive signal samples match, 2 base deletions, 3 consecutive signal samples match, 4 signal sample insertions followed by a final match of 5 signal samples along the read or reference sequence Before processing the alignment, the datasets must be cleaned based on the metrics such as read quality score (qscore), alignment score (if it is a signal-to-reference alignment), and read length. This ensures a baseline quality for the analyzed data. During the sampling process, Poregen can further refine the raw signal event samples using several techniques: ▪ Minimum and maximum dwell time thresholds: This eliminates samples considered too short or too long (potentially representing noise). ▪ Standard deviation filtering: Samples with excessively high standard deviation are discarded, indicating an unstable event. ▪ Indel skipping (optional): When signal-to-reference alignments are used as input, alignments around insertions and deletions (indels) can be skipped for a user-specified number of positions from either end of the indel. This helps to minimize the influence of potentially noisy indel regions on the k-mer model. Algorithm 1 Finding the significant base indices of a k-mer model Download figure Open in new tab Algorithm 2 Creating a new k-mer model Download figure Open in new tab Identifying the most significant bases within a k-mer to create a lighter k-mer model While all bases within a k-mer contribute to the observed current signal in nanopore sequencing, some bases may exert a stronger influence. The density plots created using the Squigualiser calculate_offset subtool and the Algorithm 1 can be utilized to pinpoint the most significant bases within a k-mer. Poregen can be configured to generate k-mer models that only capture these most significant bases effectively reducing the size of the k-mer model (see Section: Generating a new k-mer model). The algorithm 1 is designed to identify significant base indices within a k-mer model. It starts by creating 1-mer models for each base index in the k-mer (lines 1-7). These 1-mer models store the current levels associated with each base (A, C, G, T) and each 1-mer has 4 k−mer _ length− 1 current levels. The algorithm then calculates the Pearson correlation between the histograms of current levels for different 1-mer pairs of a 1-mer model (lines 8-17). If the average similarity value for a 1-mer model falls below a specified threshold (0.96), the corresponding base index is considered significant (lines 18-19). Generating a new k-mer model The generation of a new k-mer model is explained in Algorithm 2. It iterates through each alignment (line 3) and extracts a user-specified number of current samples for each k-mer (lines 9-11). Each sample represents a single event that is essentially a series of current values. The length of the event is further filtered using min _ dur and max _ dur (line 9). Finally, for each k-mer, the program calculates either the mean (or the median) and the standard deviation (or median absolute deviation (MAD)) to build the k-mer model (lines 14-16). In the absence of an existing k-mer model, the user can use a sufficiently larger k-mer length ( k −mer _ length = 9) first to create a k-mer model which can then be used to deduce the most significant bases (line 17) as explained in Algorithm 1. Results Determining the optimal k-mer length As described in Methods, we analysed the density plots generated using Squigualiser’s calculate_offsets subtool in conjunction with the Algorithm 1 to determine the significant base indices of each available k-mer model. Each subplot in FigS1-5 illustrates the ability of each base position within the k-mer to discriminate between the four nucleotides (A, C, G, and T/U). Table 1 presents the most significant base indices obtained using the Algorithm 1. The density plots and indices corroborate each other’s findings. For instance, in the ONT 5-mer (r9.4.1 DNA) model, the last four bases emerge as the most significant indices, a finding echoed in the ONT 6-mer (r9.4.1 DNA) model. The ONT 5-mer (r9.4.1 RNA) model, on the other hand, exhibits significance across all its bases. Its density plot reveals a stronger significance around the central base that diminishes towards the ends (FigS3). View this table: View inline View popup Download powerpoint Table 1: Significant base indices (0-based) for different k-mer models Interestingly, the ONT 9-mer (r10.4.1 DNA) model displays two sets of most significant base indices (1,2 and 4,5,6,7), affirming the presence of the two reader heads within the pore. For ONT 9-mer (RNA004) model, the most significant base indices are 2,3,4,5,and 6, suggesting that a 5-mer model offers a lightweight yet effective model. Table 2 presents the Pearson correlation scores between each RNA model. In order to calculate the correlation between the 5-mer and 9-mer models, 9-mers were collapsed to their central 5-mers by taking their mean current value. Notably, all models, including the ONT 5-mer (r9.4.1 RNA) model, exhibit strong correlations with each other, validating our decision to opt for a 5-mer model. In the following section, we evaluate a 5-mer model generated using the move table information for the latest ONT RNA chemistry (RNA004). We used a dataset sequenced using the Universal Human Reference RNA to generate the model. View this table: View inline View popup Download powerpoint Table 2: Mean current level correlation between k-mer models Signal Event-Alignment Nanopolish/F5c conducts signal-to-reference alignment utilizing the k-mer model. The success of alignment is directly influenced by the accuracy of the k-mer model employed ( Table 3 ). The F5c event-alignment statistics reveal that all RNA004 models, including the generated 5-mer model and notably the r9.4.1 RNA model, achieve alignment rates well exceeding 97%. In contrast, the r9.4.1 DNA 5-mer model, with a 0% alignment rate, was deliberately included as a control test to verify the impact of the k-mer model on the alignment process. View this table: View inline View popup Download powerpoint Table 3: F5c event-alignment statistics Moreover, the resultant signal-to-reference alignments can be visually assessed at a per-base level using Squigualiser for a more comprehensive evaluation (see Fig. 2 ). Notably, the signal pileups illustrate highly consistent eventalignments across all three RNA004 models. Download figure Open in new tab Figure 2. Visualisation of F5c event-align output using three Squigualiser pileups for the same six RNA004 reads. The k-mer models employed for each event-align step are, in order: (1) F5c ’s built-in 9-mer (RNA004), (2) ONT’s default 9-mer (RNA004), and (3) a custom 5-mer (RNA004) model generated by Poregen . Methylation Detection For specific use cases involving methylation detection (e.g., m6A in RNA), the performance of the custom k-mer model can be compared to the default model using methylatin detection tools like m6Anet [ 21 ]. We used F5c event-alignment to align the current signal of a HEK293T RNA004 sample from the Singapore Nanopore Expression project [ 22 ] with each k-mer model ( F5c inbuilt 9-mer RNA004 model, ONT 9-mer RNA004 model, and poregen 5-mer RNA004 model). Each event-alignment output was used to predict the presence of m6A in the sample with m6Anet’s inbuilt RNA004 neural network model. We determined the m6A prediction performance using the m6ACE-seq-detected m6A sites as ground truth. Table 4 and Figure. 3 summarises the results. It can be observed that poregen 5-mer performs similarly to the F5c 9-mer model and both models could detect more than twice the number of sites detected by ONT’s 9-mer model. View this table: View inline View popup Download powerpoint Table 4: Comparison of m6A sites detected Download figure Open in new tab Figure 3. ROC curves for m6Anet results using different k-models Discussion Creating tailored k-mer models is crucial for accurate signal alignment and interpretation in nanopore sequencing. Our study focused on developing a denovo, ligh-weight k-mer model for the RNA004 chemistry, employing the basecaller’s move table with data cleaning, sampling techniques, and significant base identification to ensure model quality and effectiveness. A key finding was the determination of the optimal 5-mer length for RNA004, balancing computational efficiency and discriminatory power. This finding is pivotal, especially in resource-constrained settings, facilitating efficient signal interpretation and alignment. The refinement of initial k-mer models is an important step towards enhancing their accuracy and robustness. An initial k-mer model created using a move table can be further refined through an iterative process by using it as a custom model in the signal-to-reference alignment process (e.g., Nanopolish/F5c event-align ). The newly generated signal-to-reference alignment serves as the input for Poregen in a subsequent run. This allows Poregen to extract samples based on a potentially more accurate alignment, leading to a refined k-mer model. Optionally, the user can consider employing the Nanopolish training subtool. This tool iteratively fits a Gaussian mixture model (GMM) to the events detected for each k-mer, potentially leading to a more robust k-mer model after each iteration. Uncalled4 has independently developed a method for generating k-mer models in parallel to our work. However, this work did not investigate the performance of light-weight k-mer models beyond the default size. Overall, our work contributes methodological insights that advance nanopore sequencing by enabling lightweight and effective k-mer models tailored to specific chemistries, even in the absence of official models. These models play a pivotal role in improving data analysis and understanding complex biological phenomena at the molecular level. Author Contributions H.S., H.G. and I.W.D. conceived and designed Poregen , devised the experiments and prepared the manuscript, with support from all authors. H.S. implemented Porgen . H.S. and Y.K.W. conducted the experiments. H.S., Y.K.W. and I.W.D. generated the figures. All authors contributed to testing the software, read and approved the final manuscript. Declarations I.W.D. manages a fee-for-service sequencing facility at the Garvan Institute of Medical Research and is a customer of Oxford Nanopore Technologies but has no further financial relationship. H.G. and I.W.D. have previously received travel and accommodation expenses from Oxford Nanopore Technologies. J.G. received reimbursement for travel and accommodation from Oxford Nanopore Technologies to present at the Nanopore Community Meeting in San Francisco in 2018. The authors declare no other competing financial or non-financial interests. Data & code availability Poregen ( https://github.com/hiruna72/poregen ) is free and open source with an MIT licence. The RNA004 dataset along with the bash scripts to reproduce the 5-mer model generation is available at zenodo.10966311 ). The dataset used for F5c event-align is available at ENA: ERR12997170. Download figure Open in new tab FigS1: ONT 5-mer (r9.4.1 DNA) model Download figure Open in new tab FigS2: ONT 6-mer (r9.4.1 DNA) model Download figure Open in new tab FigS3: ONT 5-mer (r9.4.1 RNA) model Download figure Open in new tab FigS4: ONT 9-mer (r10.4.1 DNA) model Download figure Open in new tab FigS5: ONT 9-mer (RNA004) model Acknowledgements We thank Jared Simpson and Sam Kovaka for helping with understanding the k-mer model training. We acknowledge the following funding support: Australian Medical Research Futures Fund grants MRF1173594, and MRF2023126 (to I.W.D.), Australian Research Council DECRA Fellowship DE230100178 (to H.G.) and Australian Research Council’s Discovery Project DP230100651 (to H.G and S.P). Y.K.W is supported by the Singapore International Graduate Award from Agency for Science, Technology and Research. The views expressed herein are those of the authors and are not necessarily those of the Australian Government or the Australian Research Council. Footnotes https://zenodo.org/records/10966311 References [1]. ↵ Jared T Simpson et al. “ Detecting DNA cytosine methylation using nanopore sequencing ”. In: Nature methods 14 . 4 ( 2017 ), pp. 407 – 410 . OpenUrl [2]. ↵ Yue Zhang et al. “ 6mA DNA Methylation on Genes in Plants Is Associated with Gene Complexity, Expression and Duplication ”. In: Plants 12 . 10 ( 2023 ), p. 1949 . OpenUrl [3]. ↵ Miten Jain et al. “ Advances in nanopore direct RNA sequencing ”. In: Nature Methods 19 . 10 ( 2022 ), pp. 1160 – 1164 . OpenUrl [4]. ↵ Zheng-Li Hu et al. “ Biological nanopore approach for single-molecule protein sequencing ”. In: Angewandte Chemie 133 . 27 ( 2021 ), pp. 14862 – 14873 . OpenUrl [5]. ↵ Yunhao Wang et al. “ Nanopore sequencing technology, bioinformatics and applications ”. In: Nature biotechnology 39 . 11 ( 2021 ), pp. 1348 – 1365 . OpenUrl [6]. ↵ Ryan R Wick , Louise M Judd , and Kathryn E Holt . “ Performance of neural network basecalling tools for Oxford Nanopore sequencing ”. In: Genome biology 20 ( 2019 ), pp. 1 – 10 . OpenUrl CrossRef [7]. ↵ Alex Graves et al. “ Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks ”. In: Proceedings of the 23rd international conference on Machine learning . 2006 , pp. 369 – 376 . [8]. ↵ Hongjian Zhan , Qingqing Wang , and Yue Lu . “ Handwritten digit string recognition by combination of residual network and RNN-CTC ”. In: Neural Information Processing: 24th International Conference, ICONIP 2017, Guangzhou, China, November 14–18, 2017, Proceedings, Part VI 24 . Springer . 2017 , pp. 583 – 591 . [9]. ↵ Oxford Nanopore Technologies . Oxford Nanopore’s Basecaller - dorado . https://github.com/nanoporetech/dorado/commit/7602541ac756c79ef7bb4467b9405ec741de7533 . 2024 . [10]. ↵ Hongxu Ding et al. “ Towards inferring nanopore sequencing ionic currents from nucleotide chemical structures ”. In: Nature Communications 12 . 1 ( 2021 ), p. 6545 . OpenUrl [11]. ↵ Hasindu Gamaarachchi et al. “ GPU accelerated adaptive banded event alignment for rapid comparative nanopore signal analysis ”. In: BMC bioinformatics 21 ( 2020 ), pp. 1 – 13 . OpenUrl [12]. ↵ Sam Kovaka et al. “ Uncalled4 improves nanopore DNA and RNA modification detection via fast and accurate signal alignment ”. In: bioRxiv ( 2024 ), pp. 2024 – 03 . [13]. ↵ Hasindu Gamaarachchi et al. “ Simulation of nanopore sequencing signal data with tunable parameters ”. In: vGenome Research ( 2024 ), gr–278730. [14]. ↵ Haowen Zhang et al. “ Real-time mapping of nanopore raw signals ”. In: Bioinformatics 37 . Supplement_1 ( 2021 ), pp. i477 – i483 . OpenUrl [15]. ↵ Po Jui Shih et al. “ Efficient real-time selective genome sequencing on resource-constrained devices ”. In: Giga-Science 12 ( 2023 ), giad046 . OpenUrl [16]. ↵ Oxford Nanopore Technologies . kmer_models . https://github.com/nanoporetech/kmer_models/commit/4e56daed7fbb79b538f58e41262d5c54b07356ea . 2023 . [17]. ↵ Oxford Nanopore Technologies . Oxford Nanopore’s Methylation detector - emora . https://github.com/nanoporetech/remora/commit/04fb7e5f68f9a5642ae59e63bfb3ff83a19dfdf4 . 2023 . [18]. ↵ Hiruna Samarakoon et al. “ Interactive visualisation of raw nanopore signal data with Squigualiser ”. In: Biorxiv ( 2024 ), pp. 2024 – 02 . [19]. ↵ Hasindu Gamaarachchi et al. “ Fast nanopore sequencing data analysis with SLOW5 ”. In: Nature biotechnology 40 . 7 ( 2022 ), pp. 1026 – 1029 . OpenUrl [20]. ↵ Hiruna Samarakoon et al. “ Flexible and efficient handling of nanopore sequencing signal data with slow5tools ”. In: Genome Biology 24 . 1 ( 2023 ), p. 69 . OpenUrl [21]. ↵ Christopher Hendra et al. “ Detection of m6A from direct RNA sequencing using a multiple instance learning framework ”. In: Nature methods 19 . 12 ( 2022 ), pp. 1590 – 1598 . OpenUrl [22]. ↵ Ying Chen et al. “ A systematic benchmark of Nanopore long read RNA sequencing for transcript level analysis in human cell lines ”. In: BioRxiv ( 2021 ), pp. 2021 – 04 . View the discussion thread. Back to top Previous Next Posted July 01, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Leveraging Basecaller’s Move Table to Generate a Lightweight k-mer Model Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Leveraging Basecaller’s Move Table to Generate a Lightweight k-mer Model Hiruna Samarakoon , Yuk Kei Wan , Sri Parameswaran , Jonathan Göke , Hasindu Gamaarachchi , Ira W. Deveson bioRxiv 2024.06.30.601452; doi: https://doi.org/10.1101/2024.06.30.601452 Share This Article: Copy Citation Tools Leveraging Basecaller’s Move Table to Generate a Lightweight k-mer Model Hiruna Samarakoon , Yuk Kei Wan , Sri Parameswaran , Jonathan Göke , Hasindu Gamaarachchi , Ira W. Deveson bioRxiv 2024.06.30.601452; doi: https://doi.org/10.1101/2024.06.30.601452 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7646) Biochemistry (17728) Bioengineering (13917) Bioinformatics (42038) Biophysics (21489) Cancer Biology (18637) Cell Biology (25554) Clinical Trials (138) Developmental Biology (13403) Ecology (19941) Epidemiology (2067) Evolutionary Biology (24368) Genetics (15624) Genomics (22547) Immunology (17764) Microbiology (40475) Molecular Biology (17208) Neuroscience (88756) Paleontology (667) Pathology (2842) Pharmacology and Toxicology (4834) Physiology (7659) Plant Biology (15175) Scientific Communication and Education (2047) Synthetic Biology (4304) Systems Biology (9835) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-23T02:00:01.238055+00:00

License: CC-BY-4.0