Accurate and Reproducible Whole-Genome Genotyping for Bacterial Genomic Surveillance with Nanopore Sequencing Data

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 42,310 characters · extracted from preprint-html · click to expand
Accurate and Reproducible Whole-Genome Genotyping for Bacterial Genomic Surveillance with Nanopore Sequencing Data | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Accurate and Reproducible Whole-Genome Genotyping for Bacterial Genomic Surveillance with Nanopore Sequencing Data View ORCID Profile K. Prior , View ORCID Profile K. Becker , C. Brandt , A. Cabal Rosel , J. Dabernig-Heinz , C. Kohler , View ORCID Profile M. Lohde , View ORCID Profile W. Ruppitsch , F. Schuler , View ORCID Profile G.E. Wagner , View ORCID Profile A. Mellmann doi: https://doi.org/10.1101/2025.02.24.639834 K. Prior a Department of Periodontology and Operative Dentistry, University Hospital Münster , Münster, Germany ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for K. Prior K. Becker b Friedrich Loeffler-Institute of Medical Microbiology, University Medicine Greifswald , F.-Sauerbruch-Str., Greifswald, Germany ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for K. Becker C. Brandt c Institute for Infectious Diseases and Infection Control, Jena University Hospital , Jena, Germany ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site A. Cabal Rosel d Institute for Surveillance and Infectious Disease Epidemiology , Austrian Agency for Health and Food Safety, Vienna, Austria ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site J. Dabernig-Heinz e Diagnostic and Research Institute of Hygiene, Microbiology and Environmental Medicine, Medical University of Graz , Graz, Austria Find this author on Google Scholar Find this author on PubMed Search for this author on this site C. Kohler b Friedrich Loeffler-Institute of Medical Microbiology, University Medicine Greifswald , F.-Sauerbruch-Str., Greifswald, Germany ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site M. Lohde c Institute for Infectious Diseases and Infection Control, Jena University Hospital , Jena, Germany ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for M. Lohde W. Ruppitsch d Institute for Surveillance and Infectious Disease Epidemiology , Austrian Agency for Health and Food Safety, Vienna, Austria ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for W. Ruppitsch F. Schuler f Institute of Medical Microbiology, University Hospital Münster , Münster, Germany ; Find this author on Google Scholar Find this author on PubMed Search for this author on this site G.E. Wagner e Diagnostic and Research Institute of Hygiene, Microbiology and Environmental Medicine, Medical University of Graz , Graz, Austria Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for G.E. Wagner A. Mellmann g Institute of Hygiene, University Hospital Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for A. Mellmann For correspondence: mellmann{at}uni-muenster.de Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Despite recent advances in error rate reduction, until recently Oxford Nanopore Technology (ONT) sequences lacked the accuracy required for fine scale bacterial genomic analysis. Here, recent software improvements of ONT and the ONT-cgMLST-Polisher within the SeqSphere + software were evaluated. We used short-(Illumina) and long-read ONT sequences of 80 multidrug-resistant bacteria (MDROs) for benchmarking. Illumina reads were de-novo -assembled using SKESA. For ONT, Dorado super accurate (SUP) model 4.3 or 5.0 basecalled reads were assembled with Flye and then polished with Medaka v1.12 m4.3 or Medaka v2.0 bacterial methylation model. In addition, the ONT-cgMLST-Polisher was run over all assemblies. The ‘ground truth’ (GT) hybrid assemblies were created using Hybracter v0.10.0. Sixteen isolates from four species out of the original 80 isolates were sent to six laboratories for a ring trial. The 80 MDROs basecalled with SUP m4.3 had an average cgMLST allele distance (AD) to the GT of 4.94 with Medaka v1.12 and 1.78 with Medaka v2.0, respectively. After further polishing the Medaka v2.0 data with the ONT-cgMLST-Polisher, the AD dropped to 0.09. Using data basecalled with SUP m5.0 with Medaka v2.0 further reduced the AD significantly to 0.04. While the ring trial data basecalled with Dorado SUP m4.3 showed more variability and insufficient results for some samples, model 5.0 data resulted in average ADs of 0.36 and 0.17 without and with the ONT-cgMLST-Polisher, respectively. In conclusion, recent ONT Dorado and Medaka models combined with the ONT-cgMLST-Polisher improved ONT sequencing accuracy and made it sufficiently reproducible for genomic surveillance of bacteria. Importance Oxford Nanopore Technologies (ONT) sequencing methodology is especially attractive for small and medium-sized laboratories due to its relatively low capital investment and price per sample consumable costs. However, until recently it lacked accuracy and reproducibility for bacterial genomic genotyping. Here, we present an evaluation of the most recent ONT bioinformatic (basecalling and polishing of consensus) improvements and a new ONT-cgMLST-Polisher tool. We demonstrate that by applying those procedures ONT whole-genome genotyping-based surveillance of bacteria is finally accurate and reproducible enough for routine application even in small laboratories Introduction Whole-genome sequencing (WGS)-based genotyping is an essential part of infectious disease control ( 1 ). However, reliable results require a low error-rate and high reproducibility. Short-read sequencing (i. e., Illumina technology [Illumina Inc., San Diego, CA, USA]) has been the gold standard for many years due to its high accuracy, but lacks the ability to fully reconstruct mobile genetic elements, which play a major role in the dissemination of antimicrobial resistance (AMR) ( 2 , 3 ). Long-read sequencing overcomes this limitation, however, until recently, the available technologies were either hampered by the capital investment for routine laboratories (PacBio sequencing [Pacific Biosciences Inc., Menlo Park, CA, USA]) or did not provide the required accuracy (Oxford Nanopore sequencing [Oxford Nanopore Technologies, Oxford, UK]). However, recent advances in Oxford Nanopore Technologies (ONT) sequencing chemistry and flow cells as well as basecalling models significantly reduced their error rates. In 2022, Sereika et al. ( 4 ) reported that the quality of assemblies of ONT R10.4 sequences was comparable to Illumina data. Similarly, a study comparing Illumina and ONT sequencing of 356 multidrug-resistant organisms (MDRO) found ONT sequence accuracy generally suitable for genomic surveillance with the exception of Pseudomonas aeruginosa ( 5 ). In contrast, Lerminiaux et al. ( 6 ) found ONT R10.4 assemblies to be only sufficient for plasmid characterization and AMR detection but still recommended short-read polishing for fine-scale analyses such as single nucleotide polymorphisms (SNP) detection. Another study found extensive base errors caused by methylation sites, which led to discrepancies in core-genome multi-locus sequence typing (cgMLST) and resulted in false exclusions from outbreak clusters in Klebsiella pneumoniae ( 7 ). In a multicenter study, Dabernig-Heinz et al. ( 8 ) analyzed cgMLST results of four public health-relevant species and found highly strain-specific typing errors related to consistent DNA methylation motifs. Again, these errors led to discrepancies in cgMLST outbreak clusters in some cases. The authors were able to improve the results by PCR preamplification, more recent basecalling models and optimizing the polishing step. However, they concluded that improvements are still necessary to gain the accuracy needed for bacterial genomic surveillance and that computational approaches have the potential to deliver these improvements if strain-specific differences are taken into account when training new models. Since this study was conducted, several advances in the bioinformatic tools used for processing ONT data became available. First, a new basecalling model, Dorado SUP m5.0, was published, which increases the accuracy compared to Dorado SUP m4.3 but is much slower and not (yet) part of MinKNOW, the software which controls the sequencing and enables real-time basecalling. Further, with Medaka v2.0, a new polishing model was introduced, which was specifically trained to reduce errors resulting from bacterial methylation. Finally, SeqSphere + , a widely used software for cgMLST-based bacterial genotyping ( 9 ), includes a new ONT-cgMLST-Polisher. To determine the remaining error rates, we sequenced 80 MDRO including all ESKAPE species ( Enterococcus faecium, Staphylococcus aureus, Klebsiella pneumoniae, Acinetobacter baumannii, Pseudomonas aeruginosa, and Enterobacter species ), the leading causes of nosocomial infections throughout the world, with Illumina and ONT and evaluated the various bioinformatics solutions mentioned above. Further, to analyze the reproducibility, we conducted a ring trial with six laboratories 103 that were sent DNA from 16 challenging samples of this set of microorganisms. Materials and Methods Strains and DNA isolation of evaluation dataset A total of 80 MDRO belonging to eleven species (Table S1) were grown overnight on Columbia Agar at 35±2°C (ambient air). Genomic DNA (gDNA) was extracted using the ZymoBiomics DNA Miniprep Kit (Zymo Research, USA) following the manufacturer’s instructions. Quantity and quality of the extracted gDNA were assessed using the Qubit 2.0 dsDNA BR Kit (Thermo Fisher Scientific, USA) and a NanoDrop spectrophotometer (Thermo Fisher Scientific, USA), respectively. Short read sequencing Illumina Library was prepared with the Illumina DNA Prep Kit and UDI Indexes according to the manufacturer’s instructions (Illumina Inc.). Sequencing was performed using an Illumina NextSeq 2000 platform (Illumina Inc.) generating 2×100 bp paired-end reads. Adapter-trimmed FASTQ files were downsampled to 100x and de novo assembled with SKESA v2.3.0 ( 10 ) in combination with BWA-MEM v0.7.15 ( 11 ) for read mapping and consensus calling with no coverage minimum. A consensus call read support threshold of 60% was applied. Long-read sequencing The ONT MinION in combination with the v14 chemistry (SQK-RBK114.24) was used for long-read sequencing. Library preparation was conducted with the Rapid Barcoding Kit v14 (RBK, ONT) and basecalling was performed using Dorado v0.7.1 with super accurate mode (SUP) models version 4.3 (ONT, [email protected] ) and version 5.0 (ONT; [email protected] ). Chopper v0.7.0 ( 12 ) was used for read trimming with the following parameters: quality threshold of 10 and minimum read length of 500 bp. Reads were downsampled to 100x coverage in deterministic mode using Rasusa v0.8.0 ( 13 ). Flye v2.9.3-b1797 ( 14 ) was used for de novo assembly with the parameters --nano-hq --deterministic. Medaka v1.12 (ONT) in combination with the fitting version 4.3 polishing model and the bacterial methylation model of Medaka v2.0.1 (ONT; https://github.com/nanoporetech/medaka ) were applied for assembly polishing. Ground truth assemblies Hybrid assemblies representing the ‘ground truth’ (GT) for all comparisons were generated using Hybracter v0.10.0 ( 15 ) and Medaka v2.0.1 with the bacterial methylation model, utilizing ONT SUP model m4.3 datasets combined with the Illumina short-read data. cgMLST data analysis All assemblies were analyzed with SeqSphere + version 10.5 (Ridom GmbH, Münster Germany) using the available public cgMLST schemes ( https://www.cgmlst.org/ ) for the following species: Acinetobacter baumannii, Enterococcus faecium, Escherichia coli, K. pneumoniae, P. aeruginosa, Serratia marcescens, and Staphylococcus aureus. New public cgMLST schemes were generated for Enterobacter hormaechei , Morganella morganii , and Proteus mirabilis , while a stable but not public scheme with 2,839 cgMLST targets was created for Citrobacter freundii . Priority AMR target analysis The NCBI AMR Finder Plus v3.11.26 ( 16 ) task template included in the SeqSphere + software was used to find specific genes from the NCBI Bacterial Antimicrobial Resistance Reference Gene Database. Targets that might confer resistance to carbapenems, colistin, vancomycin, or methicillin or that contain ESBL or AmpC in their name were highlighted as priority AMR targets. ONT-cgMLST-Polisher All ONT datasets were analyzed with and without the proprietary ONT-cgMLST-Polisher v1.0 that is part of the SeqSphere + v10.5 ONT Data Assembly module. This tool maps Dorado SUP basecalled FASTQ reads (≥ model 4.2) to a Medaka-derived assembly consensus FASTA sequence using Minimap2 ( 17 ). The alignment is then scanned for potential methylation-related sequencing errors in core and accessory genome MLST genes, e. g., differing strand-specific majority consensus calls. Those ‘ambiguous’ positions are then compared against a sequence with a closely related cgMLST allelic profile. Finally, based on the comparison the consensus sequence of ambiguous positions is either confirmed or masked with an ‘N’ call. Thereby, core genome MLST target genes that contain an ‘N’ are regarded as missing targets as they fail the quality control that does not allow for ambiguous bases by default. Strain selection and procedure for the ring trial dataset For the ring trial, three Gram negative and one Gram positive bacterial species of the evaluation dataset ( K. pneumoniae, E. hormaechei, M. morganii, and E. faecium ) with chromosome sizes of approximately 5.7, 4.9,4.1, and 2.9 Mb, respectively, were selected focusing on isolates exhibiting large distances to the GT. From each species, four isolates were chosen, including both high- and low-distance samples (Table S1, isolates in bold). This selection included the five ‘worst’ isolates. To rule out the possibility that the allelic differences were a result of contamination, the datasets of the selected isolates were analyzed with CheckM2 ( 18 ). Cultivation, isolation, and quality control of the isolated gDNA was performed as described for the evaluation data set. Aliquots from the same DNA preparations were sent to the six participating laboratories. Library preparation was performed with RBK v14 chemistry according to the manufacturer’s recommendations. Three laboratories used a fixed 200 ng gDNA input per sample, whereas the other three adjusted input according to genome size: 150 ng for ∼3 Mb genomes, 200 ng for ∼4 Mb genomes, 250 ng for ∼5 Mb genomes, and 300 ng for ∼6 Mb genomes. All participating laboratories performed one ONT MinION or GridION sequencing run for 72 hours using an ONT FLOMIN-114 flow cell (R10.4.1 pores). Initial basecalling was done using the Dorado server included in MinKNOW v24.06 in SUP mode with the default model 4.3 and a minimum Q-score of 10. Re-basecalling of raw data was subsequently performed using Dorado v0.8.3 with the SUP m5.0. Following basecalling, all datasets underwent the same analysis pipeline described for the evaluation dataset with two exceptions. Datasets were not downsampled prior to assembly and only the bacterial methylation model of Medaka v2.0.1 was used for polishing. Each dataset was analyzed with and without the ONT-cgMLST-Polisher and compared against the GT. Data Availability Illumina and ONT read datasets of the evaluation data set as well as of the ring trial data of all participants have been deposited under Study accession nos. PRJEB86764 (evaluation dataset) and PRJEB86767 (ring trial data) in the European Nucleotide Archive (ENA), respectively. Results Evaluation dataset cgMLST error analysis In total, 80 isolates were sequenced with Illumina and ONT, with all datasets achieving a minimum sequencing depth of > 50-fold, using a de novo hybrid assembly approach thereby generating the GT for all subsequent comparisons. Of the Illumina data, only one sample deviated by a single allele from the GT ( Table 1 ). The ONT data generated with the Dorado SUP m4.3 were subjected to several analysis pipelines. A Flye-only assembly, without any polishing, resulted in 37 samples exhibiting differences to the GT, with a maximum allele distance (AD) of 98 alleles (range: 0-98, median, 0). Additional polishing with Medaka v1.12 reduced the number of deviating samples to 33, although the maximum AD remained at 98. Further improvement was observed after polishing the Flye assembly with Medaka v2.0, where only 13 samples showed a deviation from the GT and the maximum AD was reduced to 57 (range: 0-57, median: 0). Moreover, incorporating the ONT-cgMLST-Polisher into the workflow led to a substantial reduction in ADs - yielding maximum ADs of 4 for both the Flye-only (median: 0) and Flye plus Medaka v1.12 pipelines (median: 0), and 3 for the Flye plus Medaka v2.0 pipeline (median: 0). Notably, when used without the ONT-cgMLST-Polisher, Medaka v2.0 reduced the average cgMLST AD from approximately 4.5 (Flye only) to 1.8, whereas its combination with the ONT-cgMLST-Polisher lowered this average to 0.09. View this table: View inline View popup Download powerpoint TABLE 1 Error analysis results of the 80 sequenced isolates from the evaluation The ONT-cgMLST-Polisher masks ambiguous positions by substituting them with an ‘N’. In the Dorado SUP m4.3 datasets, this resulted in an average of 18.75 Ns for the Flye-only assemblies, 20.14 Ns for the Flye plus Medaka v1.12 assemblies, and 12.13 Ns for the Flye plus Medaka v2.0 assemblies. Data obtained using the ONT Dorado SUP m5.0 model (version 0.8.3) - where basecalling takes two to three times longer than with model m4.3 and currently does not support live basecalling in MinKNOW - showed a dramatic reduction in both the number of samples deviating from the GT and the maximum cgMLST AD across all analysis setups. When combined with the ONT-cgMLST-Polisher, the maximum AD was reduced to one in every setting. The average cgMLST AD was lowered to 0.05 for the Flye-only assembly and to 0.03 when Medaka v1.12 or Medaka v2.0 was additionally employed. Moreover, the average number of Ns introduced by the ONT-cgMLST-Polisher was lowest (7.14) in the Flye plus Medaka v2.0 pipeline and highest (9.25) in the Flye plus Medaka v1.12 pipeline but in all m5.0 setups approximately 50% lower than the values observed in the m4.3 datasets. Evaluation dataset priority AMR target analysis In our study, the recovery and localization of priority AMR targets was evaluated by comparing Illumina short-read data with ONT data processed using the SUP 4.3 and SUP 5.0 basecalling models against the GT ( Table 2 ). The ONT datasets demonstrated excellent performance with nearly 99% of AMR targets correctly placed either as chromosomal or plasmid-borne, while Illumina data achieved a correct placement rate of only 84%. When assessing misassignments on plasmids, ONT SUP m4.3 showed the best performance with only 3% of targets wrongly placed, followed by ONT SUP m5.0 at 6% and Illumina at 9%. The misplaced targets were mainly genes encoding for resistance against beta-lactams or vancomycin. Furthermore, ONT data did not exhibit any missing AMR targets, whereas Illumina results lacked 6% of the expected targets. The number of excess AMR targets was also lowest with ONT SUP m4.3 compared to both ONT SUP m5.0 and Illumina. Finally, while only 6% of the samples processed with both ONT basecalling model showed errors, Illumina short-read sequencing was erroneous in 19% of the samples. View this table: View inline View popup Download powerpoint TABLE 2 Priority AMR target recovery and chromosomal or plasmid-borne location results of the evaluation dataset (n=80 multidrug-resistant isolates). Total numbers as well as the numbers per specific target or species are given. Polishing of ONT data was performed with Medaka v2 bacterial methylation model. Ring trial run statistics The total number of called bases ranged from 5.83 to 17.02 Gb for the six labs, while the N50 values varied between 6.35 and 9.91 kb, and the average depth of coverage assembled was between 78 and 266 ( Table 3 ). Only in one lab, four out of the 16 samples did not achieve the recommended sequencing depth of 50-fold ( https://a.storyblok.com/f/196663/x/f90d1fe1a4/workflow-bacterial-isolates.pdf ). Interestingly, neither the number of total (range 978-1.528) nor the number of sequencing (442–511) pores at the start correlated with the output. However, a higher number of total and sequencing pores after 90 minutes did correspond to a higher final output. In the present study, three participating labs applied genome size compensation (GSC) to account for differences in bacterial genome sizes and to normalize coverage distribution across sequencing runs. In general, the average coverage assembled ranged from 78-fold to 266-fold. Notably, except for Lab 1, which exhibited an overall lower coverage, GSC led to a more even coverage distribution, as demonstrated by the narrower standard deviation of the datasets where GSC was applied ( Table 3 ). View this table: View inline View popup Download powerpoint TABLE 3 Run statistics and coverage metrics of the ring trial data (16 isolates per laboratory). Ring trial cgMLST error analysis Evaluation datasets of the 16 selected ring trial isolates were examined with CheckM2 to assess potential contamination. The analysis revealed that 11 isolates exhibited no contamination (< 0.5% contamination) whereas five samples (A24994, A26728, A27739, A31008, A33124) showed only low-level contamination (< 5%), indicating that discrepancies from the GT could not be attributed to gDNA contamination. To analyze the reproducibility using the different software tools, we compared the cgMLST ADs of the 16 ring trial samples against the GT for each participating lab. We analyzed the results of data basecalled with Dorado SUP m4.3 ( Table 4a ) and Dorado SUP m5.0 ( Table 4b ), as well as with and without the ONT-cgMLST-Polisher. The detailed results, broken down by sample ID and participant, can be found in Table S2. For the data basecalled with Dorado SUP m4.3, the mean average cgMLST AD over all labs was 10.29 (range: 8.38-12.94) without the ONT-cgMLST-Polisher and 1.26 (range: 0.56-1.81) with the ONT-cgMLST-Polisher ( Table 4a ). The mean average number of missing cgMLST targets increased from 24.11 to 39.85 when the ONT-cgMLST-Polisher was applied, while the GT itself had an average of 14.31 missing targets. Notably, not all samples were equally affected. Of the 16, only three to six samples had a distance to the GT at all, but a quite high maximum AD of 49 to 75 without and 4 to 14 with the ONT-cgMLST-Polisher. Overall, the errors were either species-specific, e.g., the E. faecium samples had hardly any inaccuracy, or strain-specific for the other three species distributed (Fig. S1-S4). View this table: View inline View popup Download powerpoint TABLE 4 Error analysis results of the 16 ring trial isolates stratified by participating laboratories. Basecalling was performed with Dorado SUP m4.3 (4a) and Dorado SUP m5.0 (4b), respectively. All datasets were analyzed with and without the ONT-cgMLST-Polisher. Distances were calculated against the GT. For data basecalled with Dorado SUP m5.0, the results improved substantially ( Table 4b ). Here, the mean average AD was 0.36 without and 0.17 with the ONT-cgMLST-Polisher and the mean average number of missing data reduced to 15.21 and 17.17 without and with the ONT-cgMLST-Polisher, respectively. 295 This is in line with the average number of Ns called by the ONT-cgMLST-Polisher, which decreased from 42.02 to 6.40. The maximum cgMLST AD in a sample was reduced from four to one allele. Most recently, we tested the novel (January 2025) Dorado release v0.9.1, for which a significant basecalling speed improvement is claimed ( https://github.com/nanoporetech/dorado/releases ), using the ring trial dataset of laboratory 4 with 17.02 Gb basecalled bases ( Table 3 ). Indeed, we could detect a significant speed improvement for model 5.0 basecalling with 18.7 hours (version 0.9.1) versus 46.3 hours (version 0.8.3) for re-basecalling (Table S3). Discussion In this study, we first used a set of 80 MDRO to test the most recent available bioinformatics tools for ONT data processing. We found that Medaka v2.0 with the bacterial methylation model results in a substantial error reduction in comparison to Medaka v1.12 with both Dorado SUP m4.3 and m5.0 data. Still, the error rate for data basecalled with Dorado SUP m4.3 is occasionally too high, while data basecalled with Dorado SUP m5.0 had almost no errors in our samples. In line with this, the ONT-cgMLST-Polisher improved Dorado SUP m4.3 data substantially, for the price of a slightly higher number of missing targets, whereas no effect on the error-rate was noted with Dorado SUP m5.0 data. We are not the first claiming to have achieved accurate bacterial genomic genotyping results from ONT data with a tertiary correction. Chiou et al. developed a quite compute-intensive reference-based-method, Modpolish, for correcting ONT base-modification-mediated errors ( 19 ). When evaluating this tool with some of our data, we got sometimes polished sequences with (nearly) no errors but often sequences with newly introduced errors. The authors of a more recent study used a deep learning variant caller, Clair3, to process ONT data from 14 diverse bacterial species for SNP and indel variant calling ( 20 , 21 ). When reanalyzing the 14 ONT raw datasets, it turned out that they showed hardly any errors caused by methylation. Finally, Fuchs et al. again used - among others - Clair3 in their new tool, NanoCore, to calculate and visualize cgMLST-like sample distances ( 22 ). Also, this tool is quite compute-intensive and produced not always sufficient results with our data (data not shown). With respect to the recovery of chromosomal or plasmid-borne location of AMR targets, ONT was superior to Illumina data ( Table 2 ). Interestingly ONT data called with Dorado SUP m4.3 performed better than with Dorado SUP m5.0. Here, however, not the accuracy but the contiguity of assemblies was evaluated. To assess the reproducibility of our findings, we organized a ring trial with six laboratories that performed ONT sequencing on the 16 isolates including some of the most challenging samples of the 80 MDRO isolates. Again, we found that data basecalled with Dorado SUP m4.3 occasionally contained too many errors even after polishing with Medaka v2.0 and the ONT-cgMLST-Polisher. Whereas the 16 samples chosen from the evaluation dataset exhibited an average AD to the GT of 0.375 using Dorado SUP m4.3, higher averages were observed with quite some variation between the laboratories ( Table 4a ). For data basecalled with Dorado SUP m5.0 and polished with Medaka v2.0, assemblies could be further improved by applying the ONT-cgMLST-Polisher and resulted in nearly perfect data ( Table 4b ) with only marginally higher numbers of missing targets. Overall, using the ONT-cgMLST-Polisher safeguards for the possibility of methylation-patterns that the Dorado and Medaka models were not trained for. In addition to testing the reproducibility of post-processing tools, the ring trial provided some insights into raw data acquisition. Neither the total number nor the sequencing number of pores at the start are reliable indicators for final data output in terms of bases called. However, the total number and/or sequencing number of pores after 90 minutes seem to be such a predictor. Further, applying genome size compensation seems to mitigate the effects of uneven average depth of coverage. We have shown that with the newest polishing tools cgMLST-based analysis of ONT data is now comparable to Illumina sequencing with respect to the error-rate. Moreover, ONT outperforms short-read data in terms of AMR target recovery and correct chromosomal or plasmid-borne localization, which might affect infection control measures in the hospital setting. In January 2025, ONT released Dorado v0.9.1 that shows a significant speed improvement of about 60% for model 5.0 basecalling with newer graphical processing units (GPU; Table S3). Thereby, live basecalling in SUP mode becomes possible with recent GPUs like the Nvidia RTX 4090. For such a ‘gamer’ desktop computer an investment of about 5,000 € is required. Therefore, it would be very desirable that ONT includes Dorado v0.9.1 or newer SUP m5 in MinKNOW soon and ‘freezes’ the R10.4.1 flowcell and at least RBK chemistry for CLIA/ISO accredited laboratories. Our study has some limitations. First, we sent only DNA instead of living organisms to the six laboratories that participated in the ring trial, and thereby did not test the influence of DNA extraction methods. Second, the accuracy was only determined for cgMLST targets. In accordance with recent practices in public health and clinical microbiology, the intergenic regions were not controlled here, although they are also corrected by the ONT-cgMLST-Polisher. Third, the base patterns of the nucleotide modifications were not analyzed in detail. However, this was done recently by some of the co-authors and there is no reason to believe that something has changed since then ( 7 , 8 ). Finally, a proprietary tertiary correction tool was evaluated. However, just the improvements of the primary (basecaller) and secondary (Medaka) correction would warrant an own publication. In conclusion, Dorado SUP 4.3 and even more Dorado SUP 5.0 model combined with Medaka v2.0 and the ONT-cgMLST-Polisher improve ONT sequencing accuracy substantially and make it finally accurate and sufficiently reproducible for whole-genome genotyping based surveillance of bacteria. The rather low capital investment and per-sample consumable costs provide opportunities also for smaller 374 laboratories to perform WGS-based surveillance on a routine basis. Conflict of Interest All authors declare no conflict of interest. Authors contributions Karola Prior and Alexander Mellmann conceptualized the study, analyzed the results, and drafted the manuscript. All authors contributed to the laboratory ring trial work and contributed to the correction of the draft and agreed with its content. Footnotes Apart from minor changes in the manuscript, the main reason for revision is the addition of the European nucleotide archive (ENA) study numbers to ensure data availability. References 1. ↵ Mellmann A , Bletz S , Böking T , Kipp F , Becker K , Schultes A , Prior K , Harmsen D . 2016 . Real-time genome sequencing of resistant bacteria provides precision infection control in an institutional setting . Journal of Clinical Microbiology 54 : 2874 – 2881 . OpenUrl Abstract / FREE Full Text 2. ↵ Sheppard AE , Stoesser N , Wilson DJ , Sebra R , Kasarskis A , Anson LW , Giess A , Pankhurst LJ , Vaughan A , Grim CJ , Cox HL , Yeh AJ , Modernising Medical Microbiology (MMM) Informatics Group, Sifri CD, Walker AS, Peto TE, Crook DW, Mathers AJ. 2016 . Nested Russian doll-like genetic mobility drives rapid dissemination of the carbapenem resistance gene blaKPC . Antimicrob Agents Chemother 60 : 3767 – 3778 . OpenUrl Abstract / FREE Full Text 3. ↵ Paganini JA , Plantinga NL , Arredondo-Alonso S , Willems RJL , Schürch AC . 2021 . Recovering Escherichia coli plasmids in the absence of long-read sequencing data . Microorganisms 9 : 1613 . OpenUrl CrossRef PubMed 4. ↵ Sereika M , Kirkegaard RH , Karst SM , Michaelsen TY , Sørensen EA , Wollenberg RD , Albertsen M . 2022 . Oxford Nanopore R10.4 long-read sequencing enables the generation of near-finished bacterial genomes from pure cultures and metagenomes without short-read or reference polishing . Nat Methods 19 : 823 – 826 . OpenUrl CrossRef PubMed 5. ↵ Landman F , Jamin C , de Haan A , Witteveen S , Bos J , van der Heide HGJ , Schouls LM , Hendrickx APA , Dutch CPE/MRSA surveillance study group. 2024 . Genomic surveillance of multidrug-resistant organisms based on long-read sequencing . Genome Med 16 : 137 . OpenUrl CrossRef PubMed 6. ↵ Lerminiaux N , Fakharuddin K , Mulvey MR , Mataseje L . 2024 . Do we still need Illumina sequencing data? Evaluating Oxford Nanopore Technologies R10.4.1 flow cells and the Rapid v14 library prep kit for Gram negative bacteria whole genome assemblies . Can J Microbiol 70 : 178 – 189 . OpenUrl CrossRef PubMed 7. ↵ Lohde M , Wagner GE , Dabernig-Heinz J , Viehweger A , Braun SD , Monecke S , Diezel C , Stein C , Marquet M , Ehricht R , Pletz MW , Brandt C . 2024 . Accurate bacterial outbreak tracing with Oxford Nanopore sequencing and reduction of methylation-induced errors . Genome Res 34 : 2039 – 2047 . OpenUrl Abstract / FREE Full Text 8. ↵ Dabernig-Heinz J , Lohde M , Hölzer M , Cabal A , Conzemius R , Brandt C , Kohl M , Halbedel S , Hyden P , Fischer MA , Pietzka A , Daza B , Idelevich EA , Stöger A , Becker K , Fuchs S , Ruppitsch W , Steinmetz I , Kohler C , Wagner GE . 2024 . A multicenter study on accuracy and reproducibility of nanopore sequencing-based genotyping of bacterial pathogens . Journal of Clinical Microbiology 62 : e00628 – 24 . OpenUrl PubMed 9. ↵ Jünemann S , Sedlazeck FJ , Prior K , Albersmeier A , John U , Kalinowski J , Mellmann A , Goesmann A , von Haeseler A , Stoye J , Harmsen D. 2013 . Updating benchtop sequencing performance comparison . Nature biotechnology 31 : 294 – 296 . OpenUrl CrossRef PubMed 10. ↵ Souvorov A , Agarwala R , Lipman DJ . 2018 . SKESA: strategic k-mer extension for scrupulous assemblies . Genome Biol 19 : 153 . OpenUrl CrossRef PubMed 11. ↵ Li H . 2013 . Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM (2) . arXiv doi: 10.48550/ARXIV.1303.3997 . OpenUrl CrossRef 12. ↵ De Coster W, Rademakers R. 2023 . NanoPack2: population-scale evaluation of long-read sequencing data . Bioinformatics 39 :btad311. 13. ↵ Hall M . 2022 . Rasusa: Randomly subsample sequencing reads to a specified coverage . JOSS 7 : 3941 . OpenUrl CrossRef 14. ↵ Kolmogorov M , Yuan J , Lin Y , Pevzner PA . 2019 . Assembly of long, error-prone reads using repeat graphs . Nat Biotechnol 37 : 540 – 546 . OpenUrl CrossRef PubMed 15. ↵ Bouras G , Houtak G , Wick RR , Mallawaarachchi V , Roach MJ , Papudeshi B , Judd LM , Sheppard AE , Edwards RA , Vreugde S . 2024 . Hybracter: enabling scalable, automated, complete and accurate bacterial genome assemblies . Microbial Genomics 10 . 16. ↵ Feldgarden M , Brover V , Haft DH , Prasad AB , Slotta DJ , Tolstoy I , Tyson GH , Zhao S , Hsu CH , McDermott PF , Tadesse DA , Morales C , Simmons M , Tillman G , Wasilenko J , Folster JP , Klimke W . 2019 . Validating the AMRFINder tool and resistance gene database by using antimicrobial resistance genotype-phenotype correlations in a collection of isolates . Antimicrobial Agents and Chemotherapy 63 : 1 – 19 . OpenUrl CrossRef 17. ↵ Li H . 2018 . Minimap2: pairwise alignment for nucleotide sequences . Bioinformatics 34 : 3094 – 3100 . OpenUrl CrossRef PubMed 18. ↵ Chklovski A , Parks DH , Woodcroft BJ , Tyson GW . 2023 . CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning . Nat Methods 20 : 1203 – 1212 . OpenUrl CrossRef PubMed 19. ↵ Chiou C-S , Chen B-H , Wang Y-W , Kuo N-T , Chang C-H , Huang Y-T . 2023 . Correcting modification-mediated errors in nanopore sequencing by nucleotide demodification and reference-based correction . Commun Biol 6 : 1215 . OpenUrl CrossRef PubMed 20. ↵ Hall MB , Wick RR , Judd LM , Nguyen AN , Steinig EJ , Xie O , Davies M , Seemann T , Stinear TP , Coin L . 2024 . Benchmarking reveals superiority of deep learning variant callers on bacterial nanopore sequence data . Elife 13 : RP98300 . OpenUrl CrossRef PubMed 21. ↵ Zheng Z , Li S , Su J , Leung AW-S , Lam T-W , Luo R . 2022 . Symphonizing pileup and full-alignment for deep learning-based long-read variant calling . Nat Comput Sci 2 : 797 – 803 . OpenUrl CrossRef PubMed 22. ↵ Fuchs SA , Hülse L , Tamayo T , Kolbe-Busch S , Pfeffer K , Dilthey AT . 2024 . NanoCore: core-genome-based bacterial genomic surveillance and outbreak detection in healthcare facilities from Nanopore and Illumina data . mSystems 0 : e01080 – 24 OpenUrl View the discussion thread. Back to top Previous Next Posted May 07, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Accurate and Reproducible Whole-Genome Genotyping for Bacterial Genomic Surveillance with Nanopore Sequencing Data Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Accurate and Reproducible Whole-Genome Genotyping for Bacterial Genomic Surveillance with Nanopore Sequencing Data K. Prior , K. Becker , C. Brandt , A. Cabal Rosel , J. Dabernig-Heinz , C. Kohler , M. Lohde , W. Ruppitsch , F. Schuler , G.E. Wagner , A. Mellmann bioRxiv 2025.02.24.639834; doi: https://doi.org/10.1101/2025.02.24.639834 Share This Article: Copy Citation Tools Accurate and Reproducible Whole-Genome Genotyping for Bacterial Genomic Surveillance with Nanopore Sequencing Data K. Prior , K. Becker , C. Brandt , A. Cabal Rosel , J. Dabernig-Heinz , C. Kohler , M. Lohde , W. Ruppitsch , F. Schuler , G.E. Wagner , A. Mellmann bioRxiv 2025.02.24.639834; doi: https://doi.org/10.1101/2025.02.24.639834 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41937) Biophysics (21452) Cancer Biology (18589) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15156) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00