Ragnarok: a flexible and RApid GeNe Annotation (ROcKs) pipeline deployed through Nextflow

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 62,149 characters · extracted from preprint-html · click to expand
Ragnarok: a flexible and RApid GeNe Annotation (ROcKs) pipeline deployed through Nextflow | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Ragnarok: a flexible and RApid GeNe Annotation (ROcKs) pipeline deployed through Nextflow View ORCID Profile Ryan D. Kuster , View ORCID Profile Zane C. Smith , View ORCID Profile Lauren Whitt , View ORCID Profile Margaret Staton , View ORCID Profile Ben N. Mansfeld , View ORCID Profile Christopher Gottschalk doi: https://doi.org/10.1101/2025.10.03.680343 Ryan D. Kuster 1 Department of Entomology and Plant Pathology, University of Tennessee , Knoxville, TN, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ryan D. Kuster For correspondence: christopher.gottschalk{at}usda.gov Zane C. Smith 1 Department of Entomology and Plant Pathology, University of Tennessee , Knoxville, TN, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zane C. Smith Lauren Whitt 2 Appalachian Fruit Research Station, Agriculture Research Service, United States Department of Agriculture , Kearneysville, WV, USA 3 HudsonAlpha Institute for Biotechnology , Huntsville, AL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lauren Whitt Margaret Staton 1 Department of Entomology and Plant Pathology, University of Tennessee , Knoxville, TN, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Margaret Staton Ben N. Mansfeld 4 Department of Biology, Washington University in St. Louis , St. Louis, MO Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ben N. Mansfeld Christopher Gottschalk 2 Appalachian Fruit Research Station, Agriculture Research Service, United States Department of Agriculture , Kearneysville, WV, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Christopher Gottschalk For correspondence: christopher.gottschalk{at}usda.gov Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Motivation High-quality genome assemblies and pangenomes are increasingly accessible and achievable due to advances in third-generation sequencing and assembly algorithms, but genome annotation remains a critical bottleneck. Existing gene annotation pipelines often require complex installations, multiple steps, long runtimes, and produce variable results, which impede quality and the downstream usage of the annotations. Results We developed RAGNAROK (RApid GeNe Annotation ROcKs), a modular annotation pipeline built on Nextflow and Apptainer that integrates ab initio prediction, transcriptome and protein evidence, repeat annotation, functional assignment, and quality assessment into a single reproducible workflow. Moreover, RAGNAROK utilizes GPU acceleration and parallelization to produce high-quality gene annotations. We benchmarked against BRAKER3 across five diverse, reference-quality plant genomes and demonstrated that RAGNAROK achieved higher sensitivity, precision, and F1 scores at the exon, transcript, and gene levels. Furthermore, we demonstrated RAGNAROK’s improvement in re-annotating a suite of five Rosaceae genomes that were previously annotated using MAKER. Overall, RAGNAROK produced more ideal mono:multi-exonic gene ratios, improved BUSCO completeness scores, and reduced missing gene content compared to other pipelines. Additionally, RAGNAROK consistently outperformed BRAKER3 in runtime, scaling efficiently from small to gigabase-scale genomes. RAGNAROK provides a flexible, rapid, scalable, and accurate solution for de novo and re-annotation of plant genomes. Its modular design and workflow scalability lay the foundation for future extensions to animal, fungal, and other eukaryotic genomes. Availability and Implementation RAGNAROK is available as a GitHub repository at https://github.com/ryandkuster/ragnarok . Introduction With recent advancements in third-generation sequencing technologies and improved genome assembly software, the ability to assemble high-quality eukaryotic genomes has become increasingly accessible and achievable ( Li and Durbin, 2024 ). These advances have rapidly increased genomic resources for species with large, heterozygous, and repetitive genomes, including plants ( Li and Harkess, 2018 ; Gladman et al ., 2023 ). However, genome assembly is only the first stage of generating a high-quality genomic resource. Annotation of genic and repeat-rich regions is currently the most time-intensive and computationally demanding step ( Jung et al ., 2020 ; Kim and Kim, 2022 ; Dominguez Del Angel et al ., 2018 ). The requirements of annotation are amplified in the pan-genomic context, where uniformly annotating several to tens of complex genomes becomes crucial. Despite recent advances, producing accurate, reproducible, and accessible genome annotations remains a challenge. The process of genome annotation begins within the intergenic space, which consists of low-complexity sequences that harbor transposable elements (TE), simple sequence repeats, pseudogenes, and other non-coding features. Many tools have been developed to annotate specific features within genomes, such as Generic Repeat Find, LTR_FINDER, and HelitronScanner ( Xiong et al ., 2014 ; Xu and Wang, 2007 ; Shi and Liang, 2019 ). Outputs of these repeat-identifying tools are often used as inputs into other software, such as LTR_retriever and RepeatModeler2 ( Ou and Jiang, 2018 ; Flynn et al ., 2020 ), to broaden their utility by identifying new or more complex related features. Over time, all-in-one pipelines or online servers that rely on such software have become available, providing greater ease-of-use ( Ou et al ., 2019 ; Humann et al ., 2019 ; Su et al ., 2021 ; Baril et al ., 2024 ; The Galaxy Community, 2024 ). Apart from the biological importance of TE annotation, the accurate annotation of repetitive space is critical for gene annotation, as it allows masking of repeats that would otherwise contain thousands of untranslated pseudogenes, generated from TE activity in the genome. Repeat masking is typically handled by either the TE annotation pipelines themselves ( e . g ., EDTA) or by a separate program such as RepeatMasker ( Ou et al ., 2019 ; Su et al ., 2021 ; Smit et al ., 2013 ). While the sensitivity and precision of TE annotators are critical to annotating a genome, the success of this step hinges on appropriate masking following TE identification. Masking typically comes in two forms, hard- or soft-masking. Hard masking replaces the nucleotides of the annotated repeats with Ns, whereas soft-masking swaps the capitalized nucleotides of the FASTA format with lowercase. Additional methods include partial hard-masking, in which hundreds to thousands of nucleotides in the left and right flanks of TEs remain unmasked in an attempt to balance between over- and under-masking. However, masking in its forms may introduce additional bias and could hinder annotating important genes or more complex features ( Bayer et al ., 2020 ). For example, nucleotide-binding leucine-rich repeat genes (commonly referred to as R genes) contain repetitive sequences intrinsically, while also commonly grouped in repetitive clusters within repeat-rich regions ( Young, 2000 ; Hammond-Kosack and Jones, 1997 ; Ellis et al ., 2000 ; Bayer et al ., 2018 ). These arrays can be further confounded by the presence of sequence-similar pseudogenes, which can serve as regulatory elements ( Balakirev and Ayala, 2003 ; Pink et al ., 2011 ). This complexity within perceived low-complexity regions creates additional challenges to gene annotation. To overcome these challenges, R gene-specific annotation pipelines such as NLR-parser, NLR-Annotator, and FindPlantNLRs have been developed ( Chen et al ., 2023a ; Steuernagel et al ., 2015 , 2020 ). Typically, these pipelines are executed on an unmasked genomic sequence to identify R genes. However, these tools have not yet been integrated into mainstream gene annotation pipelines, despite their ability to identify novel R genes missing from traditional gene annotations ( Larson et al ., 2025 ; Liu et al ., 2024 ). While low-complexity repetitive space remains challenging to annotate, high-complexity genic space is even more onerous. Many tools have been developed to recognize sequence features of protein-coding genes and predict gene annotations without evidence, such as Augustus, GeneMark, SNAP, and Helixer ( Brůna et al ., 2024 ; Stanke et al ., 2004 ; Korf, 2004 ; Stiehler et al ., 2021 ). These ab initio gene predictors can improve their accuracy and sensitivity for identifying features when iteratively trained ( i . e ., Augustus, GeneMark, and SNAP) ( Stanke et al ., 2004 ; Korf, 2004 ; Brůna et al ., 2024 ). However, this process is time-consuming and requires substantial user input and operations. In comparison, some ab initio predictors, such as Helixer, rely on a preset inference dataset and require execution once ( Stiehler et al ., 2021 ). In addition to ab initio predictions, annotation can also leverage evidence-based inputs such as a de novo or reference-guided transcriptome assembly. Transcriptome assemblers, such as Trinity and StringTie2, can assemble transcriptional models from short-read RNA-seq data either de novo or by aligning the RNA-seq reads to the genome, respectively ( Pertea et al ., 2016 ; Grabherr et al ., 2011 ). With the advent of third-generation sequencers, long-read RNA-seq ( i . e ., iso-seq) can also be employed in transcriptome assembly by programs such as StringTie2 and RNA-bloom2 ( Kovaka et al ., 2019 ; Nip et al ., 2023 ; Shumate et al ., 2022 ). Beyond transcripts, protein sequence-based evidence annotation is also useful. Tools such as Exonerate’s protein2genome and miniprot are capable and fast protein aligners ( Li, 2023 ; Slater and Birney, 2005 ). Ultimately, most published genome reports utilize a combination of ab initio predictions and evidence-based annotations. This combination method for gene annotation has been integrated into all-in-one annotation pipelines such as MAKER, BRAKER, and FUNANNOTATE ( Campbell et al ., 2014 ; Palmer and Stajich, 2020 ; Gabriel, Brůna, et al ., 2024). Although aimed at simplifying the genome annotation process, these all-in-one annotation pipelines can be challenging to install, cumbersome to execute, complex to parameterize, slow to run, and difficult to reproduce. For example, de novo annotations using BRAKER, where RNA-seq and protein data are available, require genome annotations generated independently by BRAKER1 (RNA-seq) and BRAKER2 (protein), followed by the transcript selection tool TSEBRA ( Gabriel et al ., 2021 ). However, this procedure has since been integrated into a singular pipeline in BRAKER3 (Gabriel, Brůna, et al ., 2024). Furthermore, under- and over-annotation can be observed when using these pipelines. For example, the MAKER and BRAKER pipelines are often challenged by large, highly repetitive plant genomes and a lack of available evidence-based data ( Vuruputoor et al ., 2023 ). Vuruputoor et al. (2023) demonstrated these flaws by reannotating the repetitive and large Liriodendron genome using various combinations of RNA-seq evidence in the BRAKER and BRAKER + TSBERA pipelines. Vuruputoor et al. (2023) reported a total of 30,035 to 51,804 annotated genes, compared to the published annotation of 35,261, when using different RNA-seq inputs and filtering methods ( Vuruputoor et al ., 2023 ; Chen et al ., 2019 ), demonstrating the high variability in annotation. To overcome the challenges associated with annotation, we developed a Nextflow pipeline ( Langer et al ., 2025 ) that utilizes Apptainer ( Kurtzer et al ., 2017 ) to combine a suite of fast and accurate, ab initio and evidence-based gene annotators into a novel, all-in-one annotation pipeline ( Langer et al ., 2025 ). This pipeline, named RApid GeNe Annotation (ROcKs) or RAGNAROK ( https://github.com/ryandkuster/ragnarok ), aims to be a rapid, flexible, accurate, and reproducible option for de novo and re-annotation of plant genomes. Moreover, RAGNAROK’s implementation also allows for the optional incorporation of repeat annotation using EDTA before gene annotation. Furthermore, the pipeline performs integrated gene model filtering, functional annotation, and quality assessment. We benchmarked our pipeline, RAGNAROK, against other annotation pipelines (MAKER and BRAKER) on both model and non-model genomes to demonstrate its higher accuracy, precision, and overall annotation quality. By deploying our pipeline through the increasingly popular Nextflow workflow management system ( Langer et al ., 2025 ), we eliminate the need for users to install all dependencies, thereby creating a reproducible and scalable workflow. Methods Pipeline development and implementation RAGNAROK annotation is split into several parallel processes for annotation: 1) ab initio gene prediction, 2) evidence-based transcriptome assembly, 3) protein database mapping, and 4) previous annotation liftover ( Fig. 1 ). For ab initio gene prediction, RAGNAROK leverages the graphical processing unit (GPU) accelerated Helixer gene predictor ( Stiehler et al ., 2021 ), which maintains support for the CPU implementation of Helixer at the expense of additional processing time. Here, the masked genome file and the Helixer scoring file are used as inputs into Helixer with default plant parameters ( Stiehler et al ., 2021 ; Holst et al ., 2023 ). Download figure Open in new tab Figure 1. Flowchart of critical RAGNAROK pipeline stages. Evidence-based steps are represented in yellow, and all steps post-Mikado are represented in blue. For evidence-based transcriptome assembly, we utilize the StringTie2 pipeline with short- and/or long-reads ( Pertea et al ., 2016 ; Kovaka et al ., 2019 ; Shumate et al ., 2022 ). With short-read data, reads are first assessed for quality using FASTQC and MULTIQC, and processed for adapters and trimmed by FASTP ( Chen et al ., 2018 ; Andrews, 2010 ; Ewels et al ., 2016 ). A second assessment of the resulting trimmed reads is conducted with FASTQC and MULTIQC. The masked genome file is then indexed using the STAR aligner, and trimmed reads are mapped onto the genome using the following parameters --outSAMtype BAM Unsorted --outSAMstrandField intronMotif --alignIntronMax 10000 ( Dobin et al ., 2013 ). With long-read data, the masked genome is indexed using minimap2, and reads are aligned with the following parameters -ax splice:hq -uf enabled ( Li, 2018 ). All aligned reads are then sorted using SAMtools and used as input into StringTie2, with the hybrid mode enabled if long reads were used, to perform transcript assembly ( Pertea et al ., 2016 ; Kovaka et al ., 2019 ; Shumate et al ., 2022 ; Danecek et al ., 2021 ; Pertea et al ., 2015 ). The resulting GTF annotation file is subsequently converted to GFF3 format using gffread ( Pertea and Pertea, 2020 ). We additionally use the StringTie2 GTF as input into TransDecoder ( https://github.com/TransDecoder/ ). TransDecoder aims to extract the longest open-reading frame transcripts from the StringTie2 output. Here, we first execute gtf_genome_to_cdna_fasta.pl using the StringTie2 GTF and masked genome file. We then execute TransDecoder . LongOrfs and TransDecoder . Predict using the output from gtf_genome_to_cdna_fasta.pl. The StringTie2 GTF is then used again as input into gtf_to_alignment_gff3 . pl . The last step of the TransDecoder steps was cdna_alignment_orf_to_genome_orf . pl using the outputs from all three prior steps as suggested by TransDecoder’s documentation ( https://github.com/TransDecoder/ ). Our second evidence-based annotation comes from alignments of a reference protein data set using miniprot ( Li, 2023 ). First, an index is created from the masked genome file. Next, the reference protein fasta is used as input into miniprot with the flag for –gff evoked for output ( Li, 2023 ). We also added an additional, optional step for evidence-based annotation via LiftOff (1.6.3) to lift over existing reference annotations from a similar species by selecting optimal exon alignments ( Shumate and Salzberg, 2021 ). LiftOff is also particularly useful for transferring pre-existing gene annotations from previous annotation versions. This step was executed using the default parameters and gene copy search, considering only alignments with at least 95% identity. Additionally, RAGNAROK flexibly supports the inclusion of user-generated GFFs from other tools, e . g ., cufflinks ( Trapnell et al ., 2012 ) or prothint ( Brůna et al ., 2020 ), or other annotation pipelines. However, the user must include their own weights for Mikado transcript selection. Following gene predictions and evidence-based annotations, we use the Mikado transcript selector to reduce redundancy and select the best representative transcript models ( Venturini et al ., 2018 ). Although written for polishing of reference annotations, Mikado serves usefully in selecting a consensus annotation from de novo and ab initio sources. The first step in transcript selection by Mikado was the design of the input list file. This tab-delimited file includes all the resulting GTF/GFFs from all previous annotation steps as described by Mikado’s documentation. These inputs included the StringTie2 GTF, miniprot GFF, Helixer GFF, TransDecoder GFF, and optionally the LiftOff GFF. The miniprot, StringTie2, and LiftOff datasets were flagged as reference annotations. Our scoring for all inputs was scaled by our confidence in the annotation tools (StringTie2=10, miniprot=10, Helixer=1, TransDecoder=-0.5). With the list file created, we then executed Mikado configure with the list file as the –list input, the masked genome file as the –reference input, the Mikado prepared plants.yaml as the –scoring input, the UniProt Swiss-Prot protein sequences ( The UniProt Consortium, 2025 ) as the –bt input, and the Mikado provided configuration.yaml file as the final input. We then executed mikado prepare --json-conf using the mikado prepared configuration.yaml ( Venturini et al ., 2018 ). Once completed, we then prepared a DIAMOND database using the UniProt Swiss-Prot proteins ( Buchfink et al ., 2021 ). DIAMOND was then executed on the Mikado prepared transcript fasta file with the following flags: blastx -k 5 -f 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore ppos btop ( Buchfink et al ., 2021 ). Next, we used TransDecoder.LongOrfs on the Mikado prepared transcript fasta file. We were then able to continue with the Mikado pipeline. First, Mikado serialise was executed using the Mikado prepared configuration.yaml as the --json-conf input, the Mikado prepared fasta as --transcripts input, the DIAMOND output at the --xml input, and the TransDecoder.LongOrfs output was prepared on the Mikado transcript fasta as the --orfs input, and finally, the UniProt Swiss-Prot proteins were used as the --blast_targets input ( Venturini et al ., 2018 ). The final step was to execute the Mikado pick with the masked genome fasta as the --fasta input, Mikado prepared the configuration.yaml as the --configuration input, and the --subloci-out and --loci-out outputs were set. The resulting loci-out file was treated as a final, unfiltered gene annotation file ( Venturini et al ., 2018 ). To further process and increase the evidence that a gene is annotated correctly, RAGNAROK carries out functional annotation. This functional annotation is conducted using EnTAP (2.2.0) ( Hart et al ., 2020 ). Briefly, EnTAP conducts functional annotation by performing similarity searches of the protein to several databases using DIAMOND ( Buchfink et al ., 2021 ). We specifically use annotations from Swiss-Prot and NCBI RefSeq Plant databases as references. Gene Ontology (GO) terms are additionally assigned using EggNOG-mapper (v.2) and the EggNOG database (v.6) ( Cantalapiedra et al ., 2021 ; Hernández-Plaza et al ., 2022 ). Following functional annotation, we created a custom script to further filter the annotations from Mikado2 to remove pseudogenes and monoexonic features that lack transcribed proteins. This custom filtering script relies on AGAT (v1.4.1) subset functionality ( Dainet, 2019 ). To assess the quality of the resulting RAGNAROK annotation, we integrated several assessment tools. First, to determine the accuracy of annotating universal single-copy orthologs and utilize the benchmarking program BUSCO (5.8.2) ( Simão et al ., 2015 ; Manni et al ., 2021 ). BUSCO is implemented to utilize the latest OrthoDB v12 databases ( Tegenfeldt et al ., 2025 ). Complementary, we also implemented Compleasm, which is functionally similar to BUSCO with a reportedly higher sensitivity ( Huang and Li, 2023 ). Benchmarking To evaluate the performance of RAGNAROK for de novo annotation, we set out to compare against BRAKER3 on several plant genomes with either heavily curated reference gene annotations or that have undergone extensive annotation parameter testing (Gabriel, Brůna, et al ., 2024). These genomes include Arabidopsis thaliana TAIR10, Prunus persica ‘Lovell 2D’ v2, Malus domestica GDDH13, Zea mays B73, and Liriodendron chinense ( Chen et al ., 2019 ; Lamesch et al ., 2012 ; Wu et al ., 2023 ; Verde et al ., 2013 , 2017 ; Daccord et al ., 2017 ; Cheng et al ., 2017 ; Hufford et al ., 2021 ). To directly compare against BRAKER3, we utilized the same TE annotation and evidence datasets ( e . g ., RNA-seq and protein databases). This was accomplished by using the EDTA mod.MAKER.masked (FASTA) output generated within RAGNAROK as masked genome input into BRAKER3, thus ensuring both pipelines annotated the same masked sequence. All corresponding commands for RAGNAROK and BRAKER3 for these five genomes can be found in Supplemental File 1. Precision and sensitivity statistics of the two resulting de novo annotations per species were obtained using Mikado2’s stats function and BUSCO using OrthoDB v10 and v12 ( Venturini et al ., 2018 ; Simão et al ., 2015 ; Manni et al ., 2021 ; Tegenfeldt et al ., 2025 ). Comparisons between the two de novo annotations and the reference annotation of each species were obtained using Mikado2 compare feature ( Venturini et al ., 2018 ). Genomes, reference annotations, evidence datasets, and versions for each of the five subject species can be found in Supplement Table 1 ( Chen et al ., 2019 ; Lamesch et al ., 2012 ; Verde et al ., 2013 , 2017 ; Daccord et al ., 2017 ; Hufford et al ., 2021 ). View this table: View inline View popup Download powerpoint Table 1. Annotation statistics between RAGANAROK, BRAKER3, and reference annotation on five reference quality genomes. To compare run times between RAGNAROK and BRAKER3, all five reference quality genomes were annotated in a paired fashion on the same computing hardware. For A. thaliana, P. persica, M. domestica , and L. chinense , their RAGNAROK and BRAKER3 annotations were generated on a local server. For RAGNAROK, the nextflow script was executed using the profile local and eight (SFile 1). This profile allowed for the greatest parallelization of jobs on a server outfitted with dual CPUs capable of 96 threads, 200 GB RAM, and twin NVIDIA RTX 4500 GPUs (Santa Clara, CA). For BRAKER3, all four datasets run on local hardware were executed with the threads parameter set to the maximum allowed of 48 (SFile 1). Z. mays was executed on a high-performance computing cluster due to the computational demands for a large, complex genome. Here, RAGNAROK was executed using the profile slurm and default settings (SFile 1). For BRAKER3, it was again set to a maximum thread parameter of 48. We also evaluated RAGANROK for re-annotation of several non-model species that were previously annotated using the MAKER pipeline ( Campbell et al ., 2014 ). These genomes included Malus angustifolia, Malus fusca, Prunus persica ‘Lovell 2D’ v3 and ‘Lovell 5D’, and Pyrus communis ‘d’Anjou’ ( Gottschalk et al ., 2025 ; Mansfeld, Yocca, et al ., 2023; Mansfeld, Ou, et al ., 2023; Yocca et al ., 2024 ). Similar to the de novo annotation, the same masked genome file and evidence datasets were used as input. Statistics for the RAGNAROK and MAKER annotations were obtained using Mikado2 stats function and BUSCO using OrthoDB v10 and v12 ( Venturini et al ., 2018 ; Simão et al ., 2015 ; Manni et al ., 2021 ; Tegenfeldt et al ., 2025 ). Genomes, reference annotations, evidence datasets, and versions for each of the five subject species can also be found in Supplement Table 1 . Statistical analysis was conducted using R (v.4.5.1) ( R Core Team, 2024 ). Figure images were generated using ggplot2 ( Wickham, 2016 ). Results Comparing de novo annotation of RAGNAROK vs BRAKER3 To evaluate the performance of RAGNAROK, we compared it directly to BRAKER3, the most widely used eukaryotic annotation pipeline (Gabriel, Brůna, et al ., 2024). Here, we selected five reference-quality or thoroughly annotated plant genomes - Arabidopsis thaliana, Prunus persica, Malus domestica, Zea mays, and Liriodendron chinense to serve as independent cases. Each genome was treated as being de novo annotated using publicly available data that was either presented in the original publication of the genome or in previous evaluations of BRAKER3 (S Table 1 ). To compare the quality of the annotations, the separate annotations generated by RAGNAROK and BRAKER3 were compared to the reference annotations (S Table 1 ). Overall, we observed higher sensitivity and precision with RAGNAROK compared to BRAKER3 at the base, exon, transcript, and gene level when compared to the reference annotation of the genome ( Fig. 2 ; STable 2). Significantly higher sensitivity was observed for RAGNAROK at the exon, transcript, and gene levels. This trend continued for precision, with RAGNAROK being significantly higher in precision at the transcript and gene level. As a result, the F1 values, which are dependent on sensitivity and precision, were also found to be significantly higher for RAGNAROK annotations at the transcript and gene level (Fig.3; S Table 2 ). View this table: View inline View popup Download powerpoint Table 2. Re-annotation statistics between MAKER and RAGNAROK. Download figure Open in new tab Figure 2. Precision and sensitivity at different levels of annotation from BRAKER3 and RAGNAROK compared to reference annotation for five genomes. A) Exon level, B) Transcript level, C) Gene level We also compared the resulting RAGNAROK, BRAKER3, and reference annotations for each of the five species for the total number of genes annotated, mono-exonic, and mono:multi-exonic rates ( Table 1 ; Fig. 4 ). Here, neither RAGNAROK nor BRAKER3 produced significantly different annotated total genes compared to the reference. However, total monoexonic genes and the ratio of mono:multi-exonic annotated genes were found to be significant (ANOVA p- value = 0.00169 and 0.000817, respectively). In monoexonic gene annotations, RAGNAROK generally exhibited similar numbers to the reference annotation. In contrast, BRAKER3 was significantly higher compared to RAGANROK ( p- value = 0.00228) and the reference ( p- value = 0.00472). As a result, the ratio of mono:multi-exonic annotated genes was also significantly different for BRAKER3 compared to RAGNAROK ( p- value = 0.00108) and the reference annotations ( p- value = 0.00258). Furthermore, RAGNAROK consistently had a lower number of exons and genes missed compared to BRAKER3. With the exception being BRAKER3 had lower missed genes than RAGNAROK for Z. mays . However, the differences were only significant at the exon level ( p- value = 0.00725). BRAKER3 also exhibited higher numbers of novel genes annotated; however, the difference was not significant ( Table 1 ; Fig. 4 ). Download figure Open in new tab Figure 3. F1 scores between BRAKER3 and RAGNAROK when compared to the reference annotation for five genomes. ** denotes significant differences between the F1 score distributions using a paired t-test and Bonferroni adjustment. Download figure Open in new tab Figure 4. Annotation metrics between BRAKER3 and RAGANAROK compared to the reference annotation for five genomes. Dotted lines connect the paired datasets. To further evaluate the performance of RAGNAROK compared to BRAKER3 beyond statistical metrics, we compared BUSCO values of the resulting transcriptomes from each annotation method to the reference ( Table 1 ; Fig. 5 ). BRAKER3 and RAGNAROK exhibited lower BUSCO complete scores compared to the reference annotation for A. thaliana, P. persica , and Z. mays . In all cases except A. thaliana and Z. mays , RAGNAROK was higher in the BUSCO complete and fragmented scores over BRAKER3. This trend was inversely consistent with the BUSCO missing category, with RAGNAROK being lower than BRAKER3 except for Z. mays ( Table 1 ; Fig. 5 ). Download figure Open in new tab Figure 5. Comparisons of BUSCO scores of BRAKER3, RAGNAROK, and the reference annotations. Dotted lines connect the paired datasets. Improvement in reannotation of MAKER-annotated genomes Our group and others have published and/or released several MAKER-generated annotations for Rosaceae crops ( Gottschalk et al ., 2025 ; Mansfeld, Ou, et al ., 2023; Mansfeld, Yocca, et al ., 2023; Yocca et al ., 2024 ). We took this opportunity to re-annotate these five genome assemblies using RAGNAROK to compare against MAKER annotations ( Table 2 ). There was no trend with either annotation method yielding more or fewer genes consistently than the other. However, RAGNAROK annotated significantly higher mono-exonic genes compared to MAKER ( p- value =0.00781). The resulting ratio of mono:multi-exonic genes was significantly lower in the MAKER annotations compared to RAGNAROK ( p- value=0.00992) ( Table 2 ). When comparing BUSCO scores using either the general Embryophyta (OBD10) or Rosaceae (OBD12) specific databases, RAGNAROK consistently outperformed MAKER ( Fig. 6 ). RAGNAROK produced significantly higher BUSCO complete, as well as significantly lower fragmented and missing BUSCO genes ( Fig. 6 ). Download figure Open in new tab Figure 6. Comparisons of the improvements when reannotating genomes from MAKER to RAGNAROK. Dotted lines connect the paired datasets. * and ** denote significant differences in means using a paired t-test. Run-time comparisons We recorded the run times for annotating each dataset with BRAKER3 and RAGNAROK. For comparisons, we did not consider the time allocated to TE annotation via EDTA in RAGNAROK to provide a comparable comparison with BRAKER3, which does not include a TE annotation feature ( Table 3 ). Overall, RAGNAROK was faster than BRAKER3, regardless of genome size ( Fig. 7 ). When comparing the regression run time models ( Fig. 7 ), we found the interaction between annotation method and genome size to be significant (ANOVA p- value = 0.0268); both BRAKER3 and RAGNAROK run times scaled linearly. The slope of the regression model for RAGNAROK was significantly lower than BRAKER3 ( p- value = 0.0268) ( Fig. 7 ). View this table: View inline View popup Download powerpoint Table 3. Run-time comparisons between RAGNAROK and BRAKER3. Download figure Open in new tab Figure 7. Comparing the run times between BRAKER3 and RAGNAROK annotation pipelines on five datasets. Regression lines were determined using a linear model. Discussion Installation We aimed to develop an annotation pipeline that is easy to operate and install. Several other annotation pipelines, such as BRAKER3 and Funannotate, offer a user-friendly installation method using Docker or Apptainer images ( Palmer and Stajich, 2020 ; Gabriel, Brůna, et al ., 2024; Kurtzer et al ., 2017 ; Merkel, 2014 ). However, both of these pipelines require users to manually download and install the GeneMark predictor, which requires a license agreement and continually updated permission keys to operate ( Brůna et al ., 2024 ). MAKER requires local installation of the pipeline and all of its dependencies ( Campbell et al ., 2014 ). To overcome these hurdles and allow for greater accessibility, we elected to rely on Nextflow and Apptainer to manage operations and virtual installation of the dependencies for RAGNAROK ( Langer et al ., 2025 ; Kurtzer et al ., 2017 ). To install and run RAGNAROK, a user will only be required to have Nextflow and Apptainer installed on their system. Furthermore, Nextflow facilitates ease of scalability, which allows RAGNAROK to be run on local machines or on a high-performance computing center (HPCC) ( Langer et al ., 2025 ). Performance against other annotation pipelines To compare performance against BRAKER3, the current top annotation pipeline, we annotated five reference-quality genomes (Gabriel, Brůna, et al ., 2024). These genomes include the small genome model plant Arabidopsis , as well as larger fruit crops of peach and apple, and lastly, Gb-scale genomes of maize and L. chinense ( Table 1 ) ( Chen et al ., 2019 ; Lamesch et al ., 2012 ; Verde et al ., 2013 , 2017 ; Daccord et al ., 2017 ; Hufford et al ., 2021 ). We undertook a similar approach as the BRAKER3 authors by limiting the amount of input data to reflect real-world conditions (Gabriel, Brůna, et al ., 2024). We found that RAGNAROK produced a higher level of exon, transcript, and gene-level annotation accuracy compared to BRAKER3 ( Fig. 2 ). In most annotation levels (exon, transcript, gene), RAGNAROK was significantly higher (S Table 2 ). As a result, RAGNAROK produced significantly higher transcript and gene F1 scores (S Table 2 ). Furthermore, RAGNAROK demonstrated lower monoexonic gene annotations than BRAKER3 ( Table 2 ; Fig. 4 ). Other authors have found similar high rates of monoexonic gene annotations when using BRAKER ( Venturini et al ., 2018 ). We observed mono:multi-exonic ratios of ∼0.17 with RAGNAROK, which is close to the suggested ideal ratio of 0.2 ( Venturini et al ., 2018 ; Jain et al ., 2008 ). Several of the reference annotations used for benchmarking were previously generated using BRAKER3, which may have contributed to the observed concordance in gene model predictions in maize and the general trend of mono-exonic:multi-exonic ratios greater than 0.2 in the reference ( Table 1 ). In all cases except A. thaliana and Z. mays , RAGNAROK was higher in the BUSCO complete and fragmented scores over BRAKER3 ( Table 1 ; Fig. 5 ). This result could be attributed to methodological consistency of these annotations with those found in the BUSCO lineage databases, including monoexonic genes. This will require further analysis to conclude. BRAKER3 and RAGNAROK exhibited lower BUSCO complete scores compared to the reference annotation for A. thaliana, P. persica, and Z. mays . However, the differences were minimal, with the maximum difference in Complete Gene percentage being <2% ( Table 1 ). This is not a surprising result, as all three of these species have undergone manual curation to their annotation and utilized more evidential datasets during their annotation ( Lamesch et al ., 2012 ; Verde et al ., 2013 , 2017 ; Hufford et al ., 2021 ). We also compared RAGANROK to MAKER2 annotations for a set of Rosaceae genomes ( Gottschalk et al ., 2025 ; Mansfeld, Yocca, et al ., 2023; Mansfeld, Ou, et al ., 2023; Yocca et al ., 2024 ). Since these were not reference genome annotations, sensitivity, precision, and F1 scores could not be assessed. However, we did carry out comparisons on annotated gene number, monoexonic annotations, ratios, and BUSCO. In contrast to the comparisons with BRAKER, MAKER2 produced significantly fewer monoexonic gene annotations than RAGANROK. However, the average ratio for MAKER2 (∼0.11) was below the suggested ratio of 0.2 ( Venturini et al ., 2018 ; Jain et al ., 2008 ). Whereas, RAGNAROK maintained a ratio of ∼0.19. When comparing BUSCO scores between MAKER2 and RAGNAROK, RAGNAROK consistently outperformed MAKER2 ( Fig. 6 ; Table 2 ). This result was demonstrated using the >10k Rosaceae-specific ODB12 BUSCO dataset ( Tegenfeldt et al ., 2025 ). Here, RAGNAROK achieved a complete score averaging ∼96.8% compared to 93.2% for MAKER2, a significant increase ( Fig. 6 ). Run-time RAGNAROK demonstrated consistent and significantly faster run times across five plant genomes of various lengths compared to BRAKER3 ( Fig. 7 ). We observed slightly faster BRAKER3 runtimes than what was demonstrated in the BRAKER3 publication (Gabriel, Brůna, et al ., 2024). For example, that publication reported a BRAKER3 runtime of 5h 37m for Arabidopsis compared to the 4h 58m report in this study (Gabriel, Brůna, et al ., 2024). This improvement could be associated with revisions to the BRAKER3 pipeline since its release. However, the resource allocation between BRAKER3 and RAGNAROK can conflate the performance in favor of RAGNAROK. For example, BRAKER3 is thread-limited to 48 threads due to a max level in the optimize_augustus.pl script. In contrast, RAGNAROK can be scaled to whatever is available on the local system or the HPCC, and some of the subprocesses ( i . e ., Helixer) are GPU-accelerated. Future Directions We aim to introduce further improvements and broaden the usage of RAGNAROK in future versions. Currently, we have limited RAGNAROK to plants, but anticipate adding flexibility for other Eukaryotes such as animals and fungi. This will be achieved through the implementation of user-selected specific Helixer models released by the software’s developers ( Stiehler et al ., 2021 ; Holst et al ., 2023 ). There are further opportunities to allow user-generated Helixer models trained on RNA-seq data. Similarly, as other novel tools and models become available, we envision adding support for additional annotation software ( e . g ., Tiberius) (Gabriel, Becker, et al ., 2024) to continue to leverage the advantages of both traditional and deep-learning-based approaches. The modular implementation of RAGNAROK and the parallel processing efficiency of Nextflow will continue to support the ongoing development and expansion of pipeline features. Nevertheless, the current implementation of RAGNAROK is already capable of incorporating gene models from such emerging tools with the addition of manual weighting from the end-user. We also plan to incorporate additional subparameters to enhance user-specified intron length during the RNA-seq alignment step performed by STAR ( Dobin et al ., 2013 ). Currently, it is encoded at a 10Kb max intron size to support what is commonly observed in plants. However, allowing flexibility will be required for use with other Eukaryotes. For example, when annotating a fungal genome, the max intron length should be reduced from the current 10kb to ∼2kb ( Kupfer et al ., 2004 ). We also plan to integrate the FindPlantNLR pipeline as an additional annotation process ( Chen et al ., 2023b ). Our team has found FindPlantNLR offers improvements in annotating complex nucleotide-binding domain (NBD) leucine-rich repeat (LRR) proteins (NLRs), which are commonly found in complex tandem duplications ( Larson et al ., 2025 ; Liu et al ., 2025 ). These annotation tools will further serve in preserving annotations when conducting reannotation of genomes. We also envision some improvements to decrease run time. For example, there are other emerging GPU-accelerated implementations of standard bioinformatic pipelines through programs such as Parabricks from NVIDIA (Santa Clara, CA)( Franke and Crowgey, 2020 ). Tasks such as RNAseq mapping using STAR and minimap have already been made available through Parabricks and would be complementary to the GPU-accelerated Helixer gene prediction. Lastly, we expect to integrate hybrid transcript assembly from mixed short-read and long-read data as implemented in StringTie2 ( Shumate et al ., 2022 ). Availability To download and utilize the RAGNAROK pipeline, please refer to its repository on GitHub at the following web address: https://github.com/ryandkuster/ragnarok . Contributions CG conceptualized the study and developed the initial methodology. RK advanced the methodology, implementation using NextFlow, and maintains the pipeline. ZS, LW, MS, and BNM contributed methodology improvements, testing, and validation of the pipeline. All authors contributed to the writing, editing, and reviewing of the manuscript. Funding MS and RK received funding support from USDA 2023-70029-41315, ARS-USDA grant 58-6062-6, and The Nature Conservancy. ZS was supported by the National Science Foundation Graduate Research Fellowship (NSF 22-614 2439861) and The Nature Conservancy. CG received funding support from USDA-ARS National Program 301 Plant Genetic Resources, Genomics and Genetic Improvement, project number 8080-21000-033-000D. Supplemental Supplemental Table 1. Genomes, reference annotations, evidence datasets, and versions for different species used in the comparisons between annotation pipelines. Supplemental Table 2. Precision and sensitivity for gene predictions by RAGNAROK and BRAKER3 compared to the reference annotation for five reference quality genomes. Supplemental File 1. Commands executed in the comparisons between RAGNAROK, BRAKER3, and MAKER. Acknowledgments We appreciate input and feedback from Dr. Chris Dardick and Shade Niece during the development of the pipeline and evaluation of its performance. References ↵ Andrews , S. ( 2010 ) FastQC: A Quality Control tool for High Throughput Sequence Data . ↵ Balakirev , E.S. and Ayala , F.J. ( 2003 ) Pseudogenes: Are They “Junk” or Functional DNA? Annu. Rev. Genet ., 37 , 123 – 151 . OpenUrl CrossRef PubMed Web of Science ↵ Baril , T. et al. ( 2024 ) Earl Grey: A Fully Automated User-Friendly Transposable Element Annotation and Analysis Pipeline . Mol. Biol. Evol ., 41 , msae068 . OpenUrl CrossRef PubMed ↵ Bayer , P.E. et al. ( 2018 ) Bias in resistance gene prediction due to repeat masking . Nat. Plants , 4 , 762 – 765 . OpenUrl PubMed ↵ Bayer , P.E. et al. ( 2020 ) Plant pan-genomes are the new reference . Nat. Plants , 6 , 914 – 920 . OpenUrl PubMed ↵ Brůna , T. et al. ( 2020 ) GeneMark-EP+: eukaryotic gene prediction with self-training in the space of genes and proteins . NAR Genomics Bioinforma ., 2 , lqaa026 . OpenUrl ↵ Brůna , T. et al. ( 2024 ) GeneMark-ETP significantly improves the accuracy of automatic annotation of large eukaryotic genomes . Genome Res ., 34 , 757 – 768 . OpenUrl Abstract / FREE Full Text ↵ Buchfink , B. et al. ( 2021 ) Sensitive protein alignments at tree-of-life scale using DIAMOND . Nat. Methods , 18 , 366 – 368 . OpenUrl CrossRef PubMed ↵ Campbell , M.S. et al. ( 2014 ) Genome Annotation and Curation Using MAKER and MAKER-P . Curr. Protoc. Bioinforma ., 48 , 4.11.1-4.11.39. ↵ Cantalapiedra , C.P. et al. ( 2021 ) eggNOG-mapper v2: Functional Annotation, Orthology Assignments, and Domain Prediction at the Metagenomic Scale . Mol. Biol. Evol ., 38 , 5825 – 5829 . OpenUrl CrossRef PubMed ↵ Chen , J. et al. ( 2019 ) Liriodendron genome sheds light on angiosperm phylogeny and species–pair differentiation . Nat. Plants , 5 , 18 – 25 . OpenUrl PubMed ↵ Chen , S. et al. ( 2018 ) fastp: an ultra-fast all-in-one FASTQ preprocessor . Bioinformatics , 34 , i884 – i890 . OpenUrl CrossRef PubMed ↵ Chen , S.H. et al. ( 2023a ) A high-quality pseudo-phased genome for Melaleuca quinquenervia shows allelic diversity of NLR-type resistance genes . GigaScience , 12 , giad102 . OpenUrl CrossRef Chen , S.H. et al. ( 2023b ) A high-quality pseudo-phased genome for Melaleuca quinquenervia shows allelic diversity of NLR-type resistance genes . GigaScience , 12 , giad102 . OpenUrl CrossRef ↵ Cheng , C.-Y. et al. ( 2017 ) Araport11: a complete reannotation of the Arabidopsis thaliana reference genome . Plant J ., 89 , 789 – 804 . OpenUrl CrossRef PubMed ↵ Daccord , N. et al. ( 2017 ) High-quality de novo assembly of the apple genome and methylome dynamics of early fruit development . Nat. Genet ., 49 , 1099 – 1106 . OpenUrl CrossRef PubMed ↵ Dainet , J. ( 2019 ) AGAT: Another Gff Analysis Toolkit to handle annotations in any GTF/GFF format . ↵ Danecek , P. et al. ( 2021 ) Twelve years of SAMtools and BCFtools . GigaScience , 10 , giab008 . OpenUrl CrossRef PubMed ↵ Dobin , A. et al. ( 2013 ) STAR: ultrafast universal RNA-seq aligner . Bioinformatics , 29 , 15 – 21 . OpenUrl CrossRef PubMed Web of Science ↵ Dominguez Del Angel , V. et al. ( 2018 ) Ten steps to get started in Genome Assembly and Annotation . ↵ Ellis , J. et al. ( 2000 ) Structure, function and evolution of plant disease resistance genes . Curr. Opin. Plant Biol ., 3 , 278 – 284 . OpenUrl CrossRef PubMed Web of Science ↵ Ewels , P. et al. ( 2016 ) MultiQC: summarize analysis results for multiple tools and samples in a single report . Bioinformatics , 32 , 3047 – 3048 . OpenUrl CrossRef PubMed ↵ Flynn , J.M. et al. ( 2020 ) RepeatModeler2 for automated genomic discovery of transposable element families . Proc. Natl. Acad. Sci ., 117 , 9451 – 9457 . OpenUrl Abstract / FREE Full Text ↵ Franke , K.R. and Crowgey , E.L. ( 2020 ) Accelerating next generation sequencing data analysis: an evaluation of optimized best practices for Genome Analysis Toolkit algorithms . Genomics Inform ., 18 , e10 . OpenUrl CrossRef Gabriel , L. , Brůna , T. , et al. ( 2024 ) BRAKER3: Fully automated genome annotation using RNA-seq and protein evidence with GeneMark-ETP, AUGUSTUS and TSEBRA . BioRxiv Prepr. Serv. Biol ., 2023.06.10.544449. Gabriel , L. , Becker , F. , et al. ( 2024 ) Tiberius: end-to-end deep learning with an HMM for gene prediction . Bioinformatics , 40 , btae685 . OpenUrl CrossRef PubMed ↵ Gabriel , L. et al. ( 2021 ) TSEBRA: transcript selector for BRAKER . BMC Bioinformatics , 22 , 566 . OpenUrl CrossRef PubMed ↵ Gladman , N. et al. ( 2023 ) Era of Gapless Plant Genomes: Innovations in Sequencing and Mapping Technologies Revolutionize Genomics and Breeding . Curr. Opin. Biotechnol ., 79 , 102886 . OpenUrl CrossRef PubMed ↵ Gottschalk , C. et al. ( 2025 ) Improved genome assembly of double haploid Prunus persica siblings ‘Lovell 2D’ and ‘Lovell 5D’ and the peach NLRome . 2025.06.03.657495. ↵ Grabherr , M.G. et al. ( 2011 ) Full-length transcriptome assembly from RNA-Seq data without a reference genome . Nat. Biotechnol ., 29 , 644 – 652 . OpenUrl CrossRef PubMed ↵ Hammond-Kosack , K.E. and Jones , J.D.G. ( 1997 ) PLANT DISEASE RESISTANCE GENES . Annu. Rev. Plant Physiol. Plant Mol. Biol ., 48 , 575 – 607 . OpenUrl CrossRef Web of Science ↵ Hart , A.J. et al. ( 2020 ) EnTAP: Bringing faster and smarter functional annotation to non-model eukaryotic transcriptomes . Mol. Ecol. Resour ., 20 , 591 – 604 . OpenUrl CrossRef PubMed ↵ Hernández-Plaza , A. et al. ( 2022 ) eggNOG 6.0: enabling comparative genomics across 12 535 organisms . Nucleic Acids Res ., 51 , D389 – D394 . OpenUrl CrossRef ↵ Holst , F. et al. ( 2023 ) Helixer–de novo Prediction of Primary Eukaryotic Gene Models Combining Deep Learning and a Hidden Markov Model . 2023.02.06.527280. ↵ Huang , N. and Li , H. ( 2023 ) compleasm: a faster and more accurate reimplementation of BUSCO . Bioinformatics , 39 , btad595 . OpenUrl CrossRef PubMed ↵ Hufford , M.B. et al. ( 2021 ) De novo assembly, annotation, and comparative analysis of 26 diverse maize genomes . Science , 373 , 655 – 662 . OpenUrl Abstract / FREE Full Text ↵ Humann , J.L. et al. ( 2019 ) Structural and Functional Annotation of Eukaryotic Genomes with GenSAS . Methods Mol. Biol. Clifton NJ , 1962 , 29 – 51 . OpenUrl ↵ Jain , M. et al. ( 2008 ) Genome-wide analysis of intronless genes in rice and Arabidopsis . Funct. Integr. Genomics , 8 , 69 – 78 . OpenUrl CrossRef PubMed ↵ Jung , H. et al. ( 2020 ) Twelve quick steps for genome assembly and annotation in the classroom . PLOS Comput. Biol ., 16 , e1008325 . OpenUrl CrossRef PubMed ↵ Kim , J. and Kim , C. ( 2022 ) A beginner’s guide to assembling a draft genome and analyzing structural variants with long-read sequencing technologies . STAR Protoc ., 3 , 101506 . OpenUrl PubMed ↵ Korf , I. ( 2004 ) Gene finding in novel genomes . BMC Bioinformatics , 5 , 59 . OpenUrl CrossRef PubMed ↵ Kovaka , S. et al. ( 2019 ) Transcriptome assembly from long-read RNA-seq alignments with StringTie2 . Genome Biol ., 20 , 278 . OpenUrl CrossRef PubMed ↵ Kupfer , D.M. et al. ( 2004 ) Introns and Splicing Elements of Five Diverse Fungi . Eukaryot. Cell , 3 , 1088 – 1100 . OpenUrl Abstract / FREE Full Text ↵ Kurtzer , G.M. et al. ( 2017 ) Singularity: Scientific containers for mobility of compute . PLOS ONE , 12 , e0177459 . OpenUrl CrossRef PubMed ↵ Lamesch , P. et al. ( 2012 ) The Arabidopsis Information Resource (TAIR): improved gene annotation and new tools . Nucleic Acids Res ., 40 , D1202 – D1210 . OpenUrl CrossRef PubMed Web of Science ↵ Langer , B.E. et al. ( 2025 ) Empowering bioinformatics communities with Nextflow and nf-core . Genome Biol ., 26 , 228 . OpenUrl PubMed ↵ Larson , D.A. et al. ( 2025 ) A haplotype-resolved reference genome of Quercus alba sheds light on the evolutionary history of oaks . New Phytol ., 246 , 331 – 348 . OpenUrl PubMed ↵ Li , F. and Harkess , A. ( 2018 ) A guide to sequence your favorite plant genomes . Appl. Plant Sci ., 6 , e1030 . OpenUrl ↵ Li , H. ( 2018 ) Minimap2: pairwise alignment for nucleotide sequences . Bioinformatics , 34 , 3094 – 3100 . OpenUrl CrossRef PubMed ↵ Li , H. ( 2023 ) Protein-to-genome alignment with miniprot . Bioinformatics , 39 , btad014 . OpenUrl CrossRef PubMed ↵ Li , H. and Durbin , R. ( 2024 ) Genome assembly in the telomere-to-telomere era . Nat. Rev. Genet ., 25 , 658 – 670 . OpenUrl CrossRef PubMed ↵ Liu , J. et al. ( 2024 ) Deep R-gene discovery in HLB resistant wild Australian limes uncovers evolutionary features and potentially important loci for hybrid breeding . Front. Plant Sci ., 15 , 1503030 . OpenUrl PubMed ↵ Liu , J. et al. ( 2025 ) Deep R-gene discovery in HLB resistant wild Australian limes uncovers evolutionary features and potentially important loci for hybrid breeding . Front. Plant Sci ., 15 . ↵ Manni , M. et al. ( 2021 ) BUSCO: Assessing Genomic Data Quality and Beyond . Curr. Protoc ., 1 , e323 . OpenUrl CrossRef Mansfeld , B.N. , Yocca , A. , et al. ( 2023 ) A haplotype resolved chromosome-scale assembly of North American wild apple Malus fusca and comparative genomics of the fire blight Mfu10 locus . Plant J ., 116 , 989 – 1002 . OpenUrl CrossRef PubMed Mansfeld , B.N. , Ou , S. , et al. ( 2023 ) Genome of the North American wild apple species Malus angustifolia . 2023.11.16.567428. ↵ Merkel , D. ( 2014 ) Docker: Lightweight Linux Containers for Consistent Development and Deployment . Linux J . ↵ Nip , K.M. et al. ( 2023 ) Reference-free assembly of long-read transcriptome sequencing data with RNA-Bloom2 . Nat. Commun ., 14 , 2940 . OpenUrl CrossRef PubMed Ou , S. et al. ( 2019 ) Benchmarking transposable element annotation methods for creation of a streamlined, comprehensive pipeline . Genome Biol ., 20 , 275 . OpenUrl CrossRef PubMed ↵ Ou , S. and Jiang , N. ( 2018 ) LTR_retriever: A Highly Accurate and Sensitive Program for Identification of Long Terminal Repeat Retrotransposons . Plant Physiol ., 176 , 1410 – 1422 . OpenUrl Abstract / FREE Full Text ↵ Palmer , J.M. and Stajich , J. ( 2020 ) Funannotate v1.8.1: Eukaryotic genome annotation . ↵ Pertea , G. and Pertea , M. ( 2020 ) GFF Utilities: GffRead and GffCompare . F1000Research , 9 , ISCB Comm J-304. ↵ Pertea , M. et al. ( 2015 ) StringTie enables improved reconstruction of a transcriptome from RNA-seq reads . Nat. Biotechnol ., 33 , 290 – 295 . OpenUrl CrossRef PubMed ↵ Pertea , M. et al. ( 2016 ) Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown . Nat. Protoc ., 11 , 1650 – 1667 . OpenUrl CrossRef PubMed ↵ Pink , R.C. et al. ( 2011 ) Pseudogenes: Pseudo-functional or key regulators in health and disease? RNA , 17 , 792 – 798 . OpenUrl Abstract / FREE Full Text ↵ R Core Team ( 2024 ) R: A Language and Environment for Statistical Computing . Foundation for Statiscal Computing , Vienna, Austria . ↵ Shi , J. and Liang , C. ( 2019 ) Generic Repeat Finder: A High-Sensitivity Tool for Genome-Wide De Novo Repeat Detection . Plant Physiol ., 180 , 1803 – 1815 . OpenUrl Abstract / FREE Full Text ↵ Shumate , A. et al. ( 2022 ) Improved transcriptome assembly using a hybrid of long and short reads with StringTie . PLOS Comput. Biol ., 18 , e1009730 . OpenUrl CrossRef PubMed ↵ Shumate , A. and Salzberg , S.L. ( 2021 ) Liftoff: accurate mapping of gene annotations . Bioinformatics , 37 , 1639 – 1643 . OpenUrl CrossRef PubMed ↵ Simão , F.A. et al. ( 2015 ) BUSCO: assessing genome assembly and annotation completeness with single-copy orthologs . Bioinformatics , 31 , 3210 – 3212 . OpenUrl CrossRef PubMed ↵ Slater , G.S.C. and Birney , E. ( 2005 ) Automated generation of heuristics for biological sequence comparison . BMC Bioinformatics , 6 , 31 . OpenUrl CrossRef PubMed ↵ Smit , A.F. et al. ( 2013 ) RepearMasker Open-4.0 . ↵ Stanke , M. et al. ( 2004 ) AUGUSTUS: a web server for gene finding in eukaryotes . Nucleic Acids Res ., 32 , W309 – 312 . OpenUrl CrossRef PubMed Web of Science ↵ Steuernagel , B. et al. ( 2015 ) NLR-parser: rapid annotation of plant NLR complements . Bioinformatics , 31 , 1665 – 1667 . OpenUrl CrossRef PubMed ↵ Steuernagel , B. et al. ( 2020 ) The NLR-Annotator Tool Enables Annotation of the Intracellular Immune Receptor Repertoire1 [OPEN] . Plant Physiol ., 183 , 468 – 482 . OpenUrl Abstract / FREE Full Text ↵ Stiehler , F. et al. ( 2021 ) Helixer: cross-species gene annotation of large eukaryotic genomes using deep learning . Bioinformatics , 36 , 5291 – 5298 . OpenUrl CrossRef PubMed Su , W. et al. ( 2021 ) A Tutorial of EDTA: Extensive De Novo TE Annotator . Methods Mol. Biol. Clifton NJ , 2250 , 55 – 67 . OpenUrl ↵ Tegenfeldt , F. et al. ( 2025 ) OrthoDB and BUSCO update: annotation of orthologs with wider sampling of genomes . Nucleic Acids Res ., 53 , D516 – D522 . OpenUrl CrossRef PubMed ↵ The Galaxy Community ( 2024 ) The Galaxy platform for accessible, reproducible, and collaborative data analyses: 2024 update . Nucleic Acids Res ., 52 , W83 – W94 . OpenUrl CrossRef PubMed ↵ The UniProt Consortium ( 2025 ) UniProt: the Universal Protein Knowledgebase in 2025 . Nucleic Acids Res ., 53 , D609 – D617 . OpenUrl CrossRef PubMed ↵ Trapnell , C. et al. ( 2012 ) Differential gene and transcript expression analysis of RNA-seq experiments with TopHat and Cufflinks . Nat. Protoc ., 7 , 562 – 578 . OpenUrl CrossRef PubMed ↵ Venturini , L. et al. ( 2018 ) Leveraging multiple transcriptome assembly methods for improved gene structure annotation . GigaScience , 7 , giy093 . OpenUrl PubMed ↵ Verde , I. et al. ( 2013 ) The high-quality draft genome of peach (Prunus persica) identifies unique patterns of genetic diversity, domestication and genome evolution . Nat. Genet ., 45 , 487 – 494 . OpenUrl CrossRef PubMed ↵ Verde , I. et al. ( 2017 ) The Peach v2.0 release: high-resolution linkage mapping and deep resequencing improve chromosome-scale assembly and contiguity . BMC Genomics , 18 , 225 . OpenUrl CrossRef PubMed ↵ Vuruputoor , V.S. et al. ( 2023 ) Welcome to the big leaves: Best practices for improving genome annotation in non-model plant genomes . Appl. Plant Sci ., 11 , e11533 . OpenUrl ↵ Wickham , H. ( 2016 ) ggplot2: Elegant Graphics for Data Analysis Springer-Verlag , New York, NY . ↵ Wu , H. et al. ( 2023 ) Re-annotation of the Liriodendron chinense genome identifies novel genes and improves genome annotation quality . Tree Genet. Genomes , 19 , 30 . OpenUrl ↵ Xiong , W. et al. ( 2014 ) HelitronScanner uncovers a large overlooked cache of Helitron transposons in many plant genomes . Proc. Natl. Acad. Sci ., 111 , 10263 – 10268 . OpenUrl Abstract / FREE Full Text ↵ Xu , Z. and Wang , H. ( 2007 ) LTR_FINDER: an efficient tool for the prediction of full-length LTR retrotransposons . Nucleic Acids Res ., 35 , W265 – W268 . OpenUrl CrossRef PubMed Web of Science ↵ Yocca , A. et al. ( 2024 ) A chromosome-scale assembly for ‘d’Anjou’ pear . G3 GenesGenomesGenetics , 14 , jkae003 . OpenUrl ↵ Young , N.D. ( 2000 ) The genetic architecture of resistance . Curr. Opin. Plant Biol ., 3 , 285 – 290 . OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted October 04, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Ragnarok: a flexible and RApid GeNe Annotation (ROcKs) pipeline deployed through Nextflow Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Ragnarok: a flexible and RApid GeNe Annotation (ROcKs) pipeline deployed through Nextflow Ryan D. Kuster , Zane C. Smith , Lauren Whitt , Margaret Staton , Ben N. Mansfeld , Christopher Gottschalk bioRxiv 2025.10.03.680343; doi: https://doi.org/10.1101/2025.10.03.680343 Share This Article: Copy Citation Tools Ragnarok: a flexible and RApid GeNe Annotation (ROcKs) pipeline deployed through Nextflow Ryan D. Kuster , Zane C. Smith , Lauren Whitt , Margaret Staton , Ben N. Mansfeld , Christopher Gottschalk bioRxiv 2025.10.03.680343; doi: https://doi.org/10.1101/2025.10.03.680343 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17633) Bioengineering (13856) Bioinformatics (41841) Biophysics (21399) Cancer Biology (18529) Cell Biology (25422) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24282) Genetics (15582) Genomics (22462) Immunology (17700) Microbiology (40295) Molecular Biology (17140) Neuroscience (88419) Paleontology (666) Pathology (2823) Pharmacology and Toxicology (4813) Physiology (7632) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00