NSeqVerify: An Easy-to-Use Desktop Suite for Integrated NGS Data Analysis, from Raw Reads to Taxonomic Assignment

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 24,712 characters · extracted from preprint-html · click to expand
NSeqVerify: An Easy-to-Use Desktop Suite for Integrated NGS Data Analysis, from Raw Reads to Taxonomic Assignment | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results NSeqVerify: An Easy-to-Use Desktop Suite for Integrated NGS Data Analysis, from Raw Reads to Taxonomic Assignment View ORCID Profile Roberto Reinosa Fernández doi: https://doi.org/10.1101/2025.10.31.685854 Roberto Reinosa Fernández 1 HIV-1 Molecular Epidemiology Laboratory, Microbiology Department, Ramon y Cajal University Hospital-Ramon y Cajal Institute for Health Research (IRYCIS) , Madrid, Spain Find this author on Google Scholar Find this author on PubMed Search for 
this author on this site ORCID record for Roberto Reinosa Fernández For correspondence: roberto117343{at}gmail.com Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Motivation The proliferation of next-generation sequencing (NGS) data has created a computational bottleneck, especially for researchers lacking specialized bioinformatics training. Standard analysis workflows require mastering multiple command-line tools, hindering exploratory data analysis and delaying scientific discovery. Results This work presents NSeqVerify , a new cross-platform, open-source desktop software developed in Java, designed to overcome these barriers. NSeqVerify implements a fully integrated genomic workflow within a single, intuitive graphical user interface (GUI). The suite includes: (1) a preprocessing module for quality control and filtering of FASTQ files; (2) a de novo assembler employing a sophisticated De Bruijn graph algorithm with an iterative multi-k-mer strategy to maximize contiguity; and (3) a taxonomic assignment module that automates BLAST searches against NCBI databases and displays the results in an easily interpretable tabular format. The tool was validated through controlled use cases, demonstrating its ability to accurately reconstruct reference viral genomes (HIV-1) and to deconvolute metagenomic mixtures (HIV-1 and SARS-CoV-2). The final test consisted of analyzing a real elephant fecal virome (SRA: SRR35776009), where NSeqVerify successfully assembled contigs — two of which overlapped and appeared to form a partial 1555 bp genome of a putative Smacovirus , enabling the identification of its capsid protein and the prediction of its 3D structure using AlphaFold. Conclusion and Availability NSeqVerify democratizes NGS data analysis, providing a robust “all-in-one” solution that empowers molecular biologists, students, and clinicians to perform end-to-end genomic analyses. 
The software is freely available under the GNU GPLv3 license at ( https://github.com/roberto117343/NSeqVerify ). Contact roberto117343{at}gmail.com 1. Introduction Next-generation sequencing (NGS) has revolutionized the life sciences, enabling unprecedented-scale research in fields such as metagenomics, virology, clinical genomics, and evolutionary biology [ 1 ]. However, the resulting “data deluge” has shifted the main challenge from data generation to analysis and interpretation. A typical short-read genome analysis workflow is a multi-step process requiring substantial computational expertise. This canonical process begins with rigorous quality control (QC) and preprocessing of raw data (FASTQ format), using tools such as FastQC [ 2 ] and Trimmomatic [ 3 ]. Next, the filtered reads are assembled de novo to reconstruct the original genome — a computationally intensive task for which sophisticated algorithms have been developed, such as those used by SPAdes [ 4 ], MEGAHIT [ 5 ], or Velvet [ 6 ]. Finally, the resulting contigs must be annotated to determine their taxonomic origin and potential function, often involving large-scale homology searches using BLAST [ 7 ] against public databases. Each of these steps not only requires a different command-line tool but also dependency management, format conversion, and parameter optimization. This “command-line barrier” represents a significant obstacle for the research community. To bridge this gap, NSeqVerify was developed based on three guiding principles: (1) Integration , combining critical stages into a coherent workflow; (2) Accessibility , providing an intuitive GUI; and (3) Reproducibility , allowing users to easily document the parameters used. 2. System and Methods NSeqVerify is a standalone desktop software built in Java, using the Swing library for its interface. This design ensures portability across Windows, macOS , and Linux . 2.2.1. 
Module 1: FASTQ Preprocessing (Preprocess FASTQ) This module is the first step toward ensuring a high-quality assembly. It accepts a FASTQ file and performs user-configurable operations such as Phred quality filtering, end trimming, reverse complement generation, and read subsampling. 2.2.2. Module 2: De Novo Assembly (Assemble) The core of NSeqVerify is its assembler, which implements a De Bruijn graph (DBG) algorithm [ 8 ]. Its key features include: Multi-K-mer Strategy: Assembly is performed iteratively using a series of user-defined k-mer sizes. The use of multiple k-mers has been shown to significantly improve assembly contiguity, especially in metagenomic data [ 9 ]. The contigs from one round are used as “super-reads” in the next. Advanced Graph Simplification: Heuristics are applied to prune “tips” and resolve “bubbles” — graph topologies commonly arising from sequencing errors and genomic variants [ 10 ]. 2.2.3. Module 3: Taxonomic Assignment (Classify nt/Classify aa) This final module provides an initial biological characterization of the contigs. It automates the submission of each contig to the NCBI web API for BLASTn or BLASTp searches [ 7 ]. The results are automatically parsed to extract key metrics from the best hit and are presented in a tab-delimited text file (TSV). Contigs with no significant matches are isolated, highlighting them as candidates for further investigation. 3. Results and Validation The performance and accuracy of NSeqVerify were evaluated through a series of rigorous case studies using both simulated and real metagenomic data. 3.1 Case Study 1: Accuracy in Reconstructing a Reference Genome (HIV-1) To assess the assembler’s accuracy, 100,000 fragments of 150 bp were generated in silico from the HIV-1 reference genome (HXB2). 
Using a multi-k-mer strategy with odd k-mer values from 21 to 91, a minimum k-mer frequency of 5, and a minimum contig length of 200 bp, NSeqVerify reconstructed a pair of contigs, with the main one covering most of the reference genome. The assembly was nearly perfect, except for the terminal regions. 3.2 Case Study 2: Resolution of a Mixed Viral Metagenome To test the tool’s ability to deconvolute complex data, a synthetic metagenome was created by mixing 75,000 150 bp reads from the SARS-CoV-2 genome with 75,000 reads from HIV-1. The same parameters were used as in the previous case, except for the k-mers (odd lengths 21–81). NSeqVerify successfully separated the mixture, producing two distinct long contigs. Each contig corresponded to one of the original viral genomes, demonstrating the algorithm’s effectiveness in resolving and assembling individual genomes from a mixed sample. As in the previous case, the genomes were complete except for the terminal ends. 3.3 Case Study 3: Detection of a Viral Genome in a Real Virome To validate NSeqVerify in a realistic discovery scenario, a public dataset from an elephant fecal virome (SRA: SRR35776009) was analyzed. The data (Illumina NovaSeq 6000, 250 bp paired-end) were preprocessed using NSeqVerify (trim ends: 30, max reads: 50,000, reverse complement: enabled). De novo assembly was performed using a multi-k-mer strategy (odd k-mers: 21–91), a minimum k-mer frequency of 5, and a minimum contig length of 200 bp. For filtering, the minimum quality was set to 0 since this value was not available in the FASTQ file. This analysis produced numerous contigs. Among them, two overlapping contigs — Contig_4 (1,471 bp) and Contig_5 (1,421 bp) — showed nucleotide-level homology to viruses of the Smacoviridae family. Given their overlap, a final consensus sequence of 1,555 bp was generated using the EpiMolBio tool [ 12 ]. The open reading frames (ORFs) of this consensus sequence were analyzed using OrfViralScan [ 13 ]. 
The resulting protein sequences were classified with the BLASTp module of NSeqVerify, revealing a strong homology for one of the ORFs with Smacovirus capsid proteins ( ORF_6, 334 aa ). The closest hit was the capsid protein of Smacoviridae sp . (Accession: WCR62194.1), showing 51.66% amino acid identity and an E-value of 1e-102 over 90% of the sequence. The low identity percentages with the nearest relatives (48–55%) strongly suggest that this contig may represent a member of a new viral species within the Smacoviridae family. This workflow — from the initial assembly to manual merging and final characterization — demonstrates the potential of NSeqVerify to produce high-quality contigs that can serve as the foundation for potential genomic discoveries in complex metagenomic samples. The partial genome sequence and its ORF are presented in Appendix A . 4. Discussion NSeqVerify stands as a robust and validated solution for NGS data analysis, focused on accessibility . The goal is not to replace high-performance command-line tools, which will remain the gold standard for large-scale analyses on computational clusters. Instead, NSeqVerify fills a crucial niche: accessible desktop analysis . When compared to the existing bioinformatics tool ecosystem, its main competitive advantage lies in its vertical integration within a GUI . While other graphical suites offer a broader range of features, they are often more complex and not specifically focused on this workflow. NSeqVerify specializes in guiding the user from raw FASTQ files to classified contigs in the most straightforward way possible. 4.1. Limitations and Future Directions To provide a transparent evaluation of the tool, it is essential to acknowledge its current limitations, which define the path for future development. It should be noted that NSeqVerify is currently in an alpha version , but given its present utility, the release of the tool is more than justified. 
Scalability and Performance: De novo assembly is a memory-intensive process. NSeqVerify is currently limited to processing tens of thousands of reads . Dependence on the BLAST Web Service: The classification module relies exclusively on the NCBI web service , making it slow for thousands of contigs and requiring an internet connection. Handling of Paired-End Read Data: The current workflow does not explicitly use paired-end read information, thereby losing valuable spatial data for scaffolding . Lack of Command-Line Interface (CLI): The GUI-centered design limits automation and large-scale reproducibility . Exclusive Support for Illumina Reads: The assembler is specifically designed for short reads (Illumina technology) . In conclusion, NSeqVerify is a tool that significantly reduces the entry barrier to genomic analysis. By packaging powerful algorithms into an easy-to-use interface, it serves as an excellent “first-look” and exploratory analysis tool , enabling a broader community of scientists to obtain meaningful insights from their own data. 5. Availability NSeqVerify is open-source software distributed under the terms of the GNU General Public License v3 . It is available on GitHub at the following address: https://github.com/roberto117343/NSeqVerify 7. Figures and Appendices Download figure Open in new tab Figure 1. Graphical User Interface of NSeqVerify. Download figure Open in new tab Figure 2. Predicted 3D Structure of the Putative Smacovirus Capsid Protein. Download figure Open in new tab Figure 3. Predicted 3D Structure of the Putative Smacovirus Capsid Protein. Prediction of the assembly of 15 subunits (maximum allowed by AlphaFold), View 1. Download figure Open in new tab Figure 4. Predicted 3D Structure of the Putative Smacovirus Capsid Protein. Prediction of the assembly of 15 subunits (maximum allowed by AlphaFold), View 2. 
[APPENDIX A] Consensus Genome and ORF6 (Capsid Protein) Sequence of the Putative Smacovirus (1,555 bp) Putative Identification Partial genome of a presumably uncharacterized Smacovirus . Contig Sequence (FASTA format) >Smacovirus_consensus_length_1555 ACATATTCTTGCCTCTGTCAAGTTCTCGTCAAAGACGGTACAGCGCTTGACAATTATTAACGGAGGAACTA CTATGTATGGTTTTCGTCGTAGAAGGTATGGTTATCGTAGGAAGTCTAGGTATTCTAGGCGTAGGAGGTACTACTGATGGTTGTCCATCCTGTCCTGACGGCCGTAGGTCTTGCCGGTCTCGGAGTTTCCGCCGGTGCTAACGTCTACGCCCAGTATCGTCAGAGGCAGTTGTACCGCCAACAGGCTAATGCTTATTCTAACCTCCATCGTGGATACACAAAATATCTGAAATCCCATGGCAGACAAATCAACCCGGACCGCGCTTGGACGTCGTATTATGGCCAGTATCAGAGAGCATTGGCCAATTATGAGAGCAGTTATGCTGGTAGTTTTGGTACTGTCGGAGGTTCTGTCGGAGCTGGTTCAGCTATTGCGCAGCATTCTCTTAGATCCACGAATGGAACATTTAGGAGGTTACCCAGATGAAATACACATTTCAGCACTATATCGATATCAGTACCTCGGCTGAATCGATGCAGATCATTTCGGTTAATGCCGGTGGTCAGTATCTGATTAATCGTTGCAGACATCTTCTTGGAACTTACAAATACTACAAGCTCGGAAAGGTTTCCATTAGGCTCGTTCCGGCTTCTACTCTTCCTGTGGACCCTCTCGGCCTGTCCTATGCCGATACGGATCCGCAGACCGTCGACCCTCGCGACCAGCTTAATCCCGGTCTTGTCCGTATTACCAACGGTGAAGATTTCCAGTATTCGATTGATGGGGTCTCTAGTGCCTCACAGGACGAAATTTACAAAGCTATGATGCTCGACCCTCGTTGGTCTAAGTTCATGCTTCAGAGTGGCTTTAGGCGTTCAGCATCGCCCCTCTTCTGGTCCGTCGGTCAGCTCCACCAGGACGCATACCCCGGTTCAACTGTCAACGTCTTCACTAAAGGTACTGGACTACCCCAGACTAATTCTTGGATGTTTTCTTCTACTCGTTCCGGTACTACTGATGTTTCTGCTGCAATGAAGAATCTCGGTTCCGAGGGCATTAGGGTTAACTGTCAGGATTCGGATCCTCACGGATTTTTCCAGACTGGTCACCGTCAGCGTATGTCTTGGTTACCTACCGATATGCTGCAGCAGTTTGCCGGTGGTTCTTCTATTGCTTCCACGGATATGTTTACAATGGCCGGCCTTAATCCCATCTTGGCCCCCAATATTATCACGTGCATCCTGCCTAGAGCCTACAAGACACTGTACTACTACCGTCTTTTCATTACTGAAACTGTCTACTTCAGCGGTATCAAGAATGTTGGACTTGGCATTGAGGAGGCTGAAAGCTTGTACGAATACAACGGACTTGATAACTTCACTAACCCGATGTTCCCGACTGGGGTTAATCCGGTAAACGGAATTCAGATCCTGAAGACTTATTCGGAGCTTGTTACTCCGCCTAATGACGGTGATTCGAATGACTGAAATCAAGATCTATACATCTCTCGGTGTAGTCACCGCCAAGCCTATGAGCTTGGTAG >Smacovirus_consensus_length_1555_rev_compl 
CTACCAAGCTCATAGGCTTGGCGGTGACTACACCGAGAGATGTATAGATCTTGATTTCAGTCATTCGAATCACCGTCATTAGGCGGAGTAACAAGCTCCGAATAAGTCTTCAGGATCTGAATTCCGTTTACCGGATTAACCCCAGTCGGGAACATCGGGTTAGTGAAGTTATCAAGTCCGTTGTATTCGTACAAGCTTTCAGCCTCCTCAATGCCAAGTCCAACATTCTTGATACCGCTGAAGTAGACAGTTTCAGTAATGAAAAGACGGTAGTAGTACAGTGTCTTGTAGGCTCTAGGCAGGATGCACGTGATAATATTGGGGGCCAAGATGGGATTAAGGCCGGCCATTGTAAACATATCCGTGGAAGCAATAGAAGAACCACCGGCAAACTGCTGCAGCATATCGGTAGGTAACCAAGACATACGCTGACGGTGACCAGTCTGGAAAAATCCGTGAGGATCCGAATCCTGACAGTTAACCCTAATGCCCTCGGAACCGAGATTCTTCATTGCAGCAGAAACATCAGTAGTACCGGAACGAGTAGAAGAAAACATCCAAGAATTAGTCTGGGGTAGTCCAGTACCTTTAGTGAAGACGTTGACAGTTGAACCGGGGTATGCGTCCTGGTGGAGCTGACCGACGGACCAGAAGAGGGGCGATGCTGAACGCCTAAAGCCACTCTGAAGCATGAACTTAGACCAACGAGGGTCGAGCATCATAGCTTTGTAAATTTCGTCCTGTGAGGCACTAGAGACCCCATCAATCGAATACTGGAAATCTTCACCGTTGGTAATACGGACAAGACCGGGATTAAGCTGGTCGCGAGGGTCGACGGTCTGCGGATCCGTATCGGCATAGGACAGGCCGAGAGGGTCCACAGGAAGAGTAGAAGCCGGAACGAGCCTAATGGAAACCTTTCCGAGCTTGTAGTATTTGTAAGTTCCAAGAAGATGTCTGCAACGATTAATCAGATACTGACCACCGGCATTAACCGAAATGATCTGCATCGATTCAGCCGAGGTACTGATATCGATATAGTGCTGAAATGTGTATTTCATCTGGGTAACCTCCTAAATGTTCCATTCGTGGATCTAAGAGAATGCTGCGCAATAGCTGAACCAGCTCCGACAGAACCTCCGACAGTACCAAAACTACCAGCATAACTGCTCTCATAATTGGCCAATGCTCTCTGATACTGGCCATAATACGACGTCCAAGCGCGGTCCGGGTTGATTTGTCTGCCATGGGATTTCAGATATTTTGTGTATCCACGATGGAGGTTAGAATAAGCATTAGCCTGTTGGCGGTACAACTGCCTCTGACGATACTGGGCGTAGACGTTAGCACCGGCGGAAACTCCGAGACCGGCAAGACCTACGGCCGTCAGGACAGGATGGACAACCATCAGTAGTACCTCCTACGCCTAGAATACCTAGACTTCCTACGATAACCATACCTTCTACGACGAAAACCATACATAGTAGTTCCTCCGTTAATAATTGTCAAGCGCTGTACCGTCTTTGACGAGAACTTGACAGAGGCAAGAATATGT ORF6 Sequence (FASTA format) > ORF6 Capsid Protein MKYTFQHYIDISTSAESMQIISVNAGGQYLINRCRHLLGTYKYYKLGKVSIRLVPASTLPVDPLGLSYADTDPQTVDPRDQLNPGLVRITNGEDFQYSIDGVSSASQDEIYKAMMLDPRWSKFMLQSGFRRSASPLFWSVGQLHQDAYPGSTVNVFTKGTGLPQTNSWMFSSTRSGTTDVSAAMKNLGSEGIRVNCQDSDPHGFFQTGHRQRMSWLPTDMLQQFAGGSSIASTDMFTMAGLNPILAPNIITCILPRAYKTLYYYRLFITETVYFSGIKNVGLGIEEAESLYEYNGLDNFTNPMFPTGVNPVNGIQILKTYSELVTPPNDGDSND Footnotes https://github.com/roberto117343/NSeqVerify 6. 
References [1]. ↵ Metzker ML . Sequencing technologies — the next generation . Nat Rev Genet . 2010 ; 11 ( 1 ): 31 – 46 . doi: 10.1038/nrg2626 . PMID: 19997069 . OpenUrl CrossRef PubMed Web of Science [2]. ↵ Andrews S. FastQC: A quality control tool for high throughput sequence data . Babraham Bioinformatics, Babraham Institute ; 2010 . Available from: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ [3]. ↵ Bolger AM , Lohse M , Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data . Bioinformatics . 2014 ; 30 ( 15 ): 2114 – 2120 . doi: 10.1093/bioinformatics/btu170 . PMID: 24695404 . OpenUrl CrossRef PubMed Web of Science [4]. ↵ Bankevich A , Nurk S , Antipov D , Gurevich AA , Dvorkin M , Kulikov AS , et al. SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing . J Comput Biol . 2012 ; 19 ( 5 ): 455 – 477 . doi: 10.1089/cmb.2012.0021 . PMID: 22506599 . OpenUrl CrossRef PubMed [5]. ↵ Li D , Liu CM , Luo R , Sadakane K , Lam TW . MEGAHIT: an ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph . Bioinformatics . 2015 ; 31 ( 10 ): 1674 – 1676 . doi: 10.1093/bioinformatics/btv033 . PMID: 25609793 . OpenUrl CrossRef PubMed [6]. ↵ Zerbino DR , Birney E. Velvet: algorithms for de novo short read assembly using de Bruijn graphs . Genome Res . 2008 ; 18 ( 5 ): 821 – 829 . doi: 10.1101/gr.074492.107 . PMID: 18349386 . OpenUrl Abstract / FREE Full Text [7]. ↵ Altschul SF , Gish W , Miller W , Myers EW , Lipman DJ . Basic local alignment search tool . J Mol Biol . 1990 ; 215 ( 3 ): 403 – 410 . doi: 10.1016/S0022-2836(05)80360-2 . PMID: 2231712 . OpenUrl CrossRef PubMed Web of Science [8]. ↵ Compeau PEC , Pevzner PA , Tesler G. How to apply de Bruijn graphs to genome assembly . Nat Biotechnol . 2011 ; 29 ( 11 ): 987 – 991 . doi: 10.1038/nbt.2023 . PMID: 22068540 . OpenUrl CrossRef PubMed [9]. ↵ Peng Y , Leung HCM , Yiu SM , Chin FYL . 
IDBA-UD: a de novo assembler for single-cell and metagenomic sequencing data with highly uneven depth . Bioinformatics . 2012 ; 28 ( 11 ): 1420 – 1428 . doi: 10.1093/bioinformatics/bts174 . PMID: 22495754 . OpenUrl CrossRef PubMed Web of Science [10]. ↵ Simpson JT , Wong K , Jackman SD , Schein JE , Jones SJM , Birol I. ABySS: a parallel assembler for short read sequence data . Genome Res . 2009 ; 19 ( 6 ): 1117 – 1123 . doi: 10.1101/gr.089532.108 . PMID: 19251739 . OpenUrl Abstract / FREE Full Text [11]. Jumper J , Evans R , Pritzel A , Green T , Figurnov M , Ronneberger O , et al. Highly accurate protein structure prediction with AlphaFold . Nature . 2021 ; 596 ( 7873 ): 583 – 589 . doi: 10.1038/s41586-021-03819-2 . PMID: 34265844 ; PMCID: PMC8371605 . OpenUrl CrossRef PubMed [12]. ↵ Reinosa R , Troyano-Hernáez P , Valadés-Alcaraz A , Holguín Á. EpiMolBio: A novel user-friendly bioinformatic program for genetic variability analysis . Comput Struct Biotechnol J . 2025 ; 27 : 2968 – 2975 . doi: 10.1016/j.csbj.2025.06.034 . PMID: 40687996 ; PMCID: PMC12273208 . OpenUrl CrossRef PubMed [13]. ↵ Reinosa RF . OrfViralScan 3.0: An intuitive tool for the identification and tracking of open reading frames in viral genomes . bioRxiv . 2025 Apr 26:2025.04.26.650794. doi: 10.1101/2025.04.26.650794 . OpenUrl Abstract / FREE Full Text View the discussion thread. Back to top Previous Next Posted November 03, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following NSeqVerify: An Easy-to-Use Desktop Suite for Integrated NGS Data Analysis, from Raw Reads to Taxonomic Assignment Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share NSeqVerify: An Easy-to-Use Desktop Suite for Integrated NGS Data Analysis, from Raw Reads to Taxonomic Assignment Roberto Reinosa Fernández bioRxiv 2025.10.31.685854; doi: https://doi.org/10.1101/2025.10.31.685854 Share This Article: Copy Citation Tools NSeqVerify: An Easy-to-Use Desktop Suite for Integrated NGS Data Analysis, from Raw Reads to Taxonomic Assignment Roberto Reinosa Fernández bioRxiv 2025.10.31.685854; doi: https://doi.org/10.1101/2025.10.31.685854 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7633) Biochemistry (17680) Bioengineering (13889) Bioinformatics (41927) Biophysics (21445) Cancer Biology (18585) Cell Biology (25491) Clinical Trials (138) Developmental Biology (13373) Ecology (19897) Epidemiology (2067) Evolutionary Biology (24308) Genetics (15606) Genomics (22494) Immunology (17736) Microbiology (40385) Molecular Biology (17175) Neuroscience (88583) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4822) Physiology (7641) Plant Biology (15149) Scientific Communication and Education (2045) Synthetic Biology (4293) Systems Biology (9822) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00