The avocado pangenome reveals dynamic clustering and lineage-specific diversity of  NLR  genes

doi:10.1101/2025.10.28.684993

The avocado pangenome reveals dynamic clustering and lineage-specific diversity of NLR genes

2025 · doi:10.1101/2025.10.28.684993

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 86,690 characters · extracted from preprint-html · click to expand

The avocado pangenome reveals dynamic clustering and lineage-specific diversity of NLR genes | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results The avocado pangenome reveals dynamic clustering and lineage-specific diversity of NLR genes Robert Backer , View ORCID Profile Alicia Clarke , Alicia Vermeulen , View ORCID Profile Aureliano Bombarely , View ORCID Profile Noёlani van den Berg doi: https://doi.org/10.1101/2025.10.28.684993 Robert Backer 1 Department of Biochemistry , Genetics and Microbiology, University of Pretoria , Pretoria, Gauteng, South Africa 2 Hans Merensky Chair in Avocado Research, Forestry and 
Agricultural Biotechnology Institute, University of Pretoria , Pretoria, Gauteng, South Africa Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alicia Clarke 1 Department of Biochemistry , Genetics and Microbiology, University of Pretoria , Pretoria, Gauteng, South Africa 2 Hans Merensky Chair in Avocado Research, Forestry and Agricultural Biotechnology Institute, University of Pretoria , Pretoria, Gauteng, South Africa Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alicia Clarke Alicia Vermeulen 1 Department of Biochemistry , Genetics and Microbiology, University of Pretoria , Pretoria, Gauteng, South Africa 2 Hans Merensky Chair in Avocado Research, Forestry and Agricultural Biotechnology Institute, University of Pretoria , Pretoria, Gauteng, South Africa Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aureliano Bombarely 3 Instituto de Biología Molecular y Celular de Plantas , Consejo Superior de Investigaciones Científicas- Universitat Politècnica de València (IBMCP-CSIC-UPV), Valencia, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Aureliano Bombarely Noёlani van den Berg 1 Department of Biochemistry , Genetics and Microbiology, University of Pretoria , Pretoria, Gauteng, South Africa 2 Hans Merensky Chair in Avocado Research, Forestry and Agricultural Biotechnology Institute, University of Pretoria , Pretoria, Gauteng, South Africa Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Noёlani van den Berg For correspondence: noelani.vdberg{at}fabi.up.ac.za Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Avocado is an economically important perennial tree crop with a complex domestication history, yet modern genomic resources 
are limited. We present the first high-quality avocado pangenome, spanning seven diverse accessions across Mexican, Guatemalan, and West-Indian lineages. PacBio HiFi, partially phased assemblies deliver near chromosome-level continuity (N50 ~57 Mb; BUSCO >96%), and functional annotations show enrichment of immune-related families. Core–accessory partitioning indicates adaptive functions — pathogen response and secondary metabolism — are overrepresented in accessory genes. Nucleotide-binding Leucine-rich repeat ( NLR ) genes were catalogued (226–256 per accession), showing lineage-specific expansions, diverse domain architectures, and frequent chromosomal clustering. Structural variants concentrate within NLR loci, marking hotspots of pathogen-detection diversification. Comparative analyses show ~54% of NLRs are shared, with extensive functional and sequence diversity among accessions. Together, these results define the avocado NLRome and its core– accessory interplay, providing a graph-based framework to accelerate discovery of resistance loci and breeding for durable disease resistance. Introduction Genomic resources for perennial tree crops trail those of annuals, hampered by biological and technical challenges. Many fruit and nut trees are long-lived, with extended juvenility and long generation times, making conventional breeding arduous 1 – 4 . Their genomes are often large, structurally complex, and highly heterozygous from outcrossing, complicating de novo assembly 5 . Consequently, whereas annuals such as rice 6 , common bean 7 , and maize 8 have large pangenomes, most tree crops lack comparable references. Notably, perennials such as apple 9 , peach 10 and citrus 11 only recently moved beyond single references to first pangenomes. This gap hampers discovery of structural-variants (SVs) and trait-linked alleles in tree crops 12 – 14 , underscoring the need for deeper catalogues of perennial diversity. 
Avocado ( Persea americana ) exemplifies these challenges. Cultivated avocados derive from three landraces – Mexican, Guatemalan, and West Indian – but genomics focused on a few elite cultivars within the admixed Hass lineage 15 – 17 . The first reference genome was only published in 2019 16 , followed by a chromosome-scale Hass assembly in 2022 17 . However, no pangenome captures diversity across landraces and hybrids. This need is evident from studies of defence-related genes 18 – 20 , reporting large gene-number differences across assemblies, highlighting the need for an avocado pangenome. Variation in defence-gene sequences can influence immune activation and disease outcome 21 – 23 . Many defence proteins activate immunity upon recognising pathogen-associated patterns or pathogen effectors. Effector recognition is highly specific; thus, sequence differences may alter protein function 24 . Nucleotide-binding Leucine-rich repeat (NLR) receptor proteins, for example, mainly recognise effectors via a C-terminal Leucine-rich repeat region 25 . Mutations within this domain have been linked with pathogen susceptibility 26 . Furthermore, NLR copy number variants are frequently associated with quantitative trait loci linked to disease resistance 27 – 29 . Understanding population-level NLR variation will clarify plant-pathogen interactions, resistance, and the evolutionary pressures shaping this protein family 30 . NLRs often occur in clusters — homogeneous tandem arrays or heterogeneous groups from segmental duplication — that serve as evolutionary hotspots, generating immune diversity 30 . Pangenome approaches are therefore valuable, capturing conserved and variable NLRs across avocado accessions and enabling comprehensive study of NLR repertoires and lineage-specific diversification 12 – 14 . Here we present the first high-quality, partially phased avocado pangenome spanning all major horticultural lineages — a foundation for trait discovery and improvement. 
We delineate core and accessory gene sets; the accessory compartment is enriched for defence- and secondary-metabolism GO terms and shows expansions of NLR-linked Pfam domains. Using a graph pangenome, we map genome-wide structural variation, with inversions, deletions and insertions enriched in NLR -associated regions. Comparative analyses reveal differences among accessions in NLR domain organisation and sequence conservation, providing candidates for functional validation. The identification of accession-specific variants also offers candidate NLRs for functional validation. Ultimately, this pangenome reveals extensive NLRome diversity and lays the groundwork for durable resistance strategies. Results Nuclear assemblies We generated high-quality, partially phased genome assemblies for seven P. americana accessions, including the previously published Hass genome 17 , spanning the major landraces and hybrid groups. PacBio HiFi sequencing on Revio yielded ~26–36 Gb per accession (mean read length >15 kb, Q>27). Assemblies built with Hifiasm produced primary and alternate haplotypes; primaries were the most contiguous. After filtering organelle and other non-nuclear sequences, primary assemblies ranged from 862 Mb (Leola™) to 915 Mb (Hass; Table S1). Contig counts varied from 56 (Choquette) to 810 (Ashdot; Fig. S1), yet L90 values of 16 to 28, indicated that most of each assembly was captured in relatively few contigs. N50 values ranged from 3.6 Mb (Hass) to 57 Mb (Ashdot). For the previously published Hass dataset, available raw data (~17.5 Gb) were substantially lower than the ~47.9 Gb reported in the original study 17 , likely contributing to its reduced contiguity relative to the newly sequenced accessions. BUSCO completeness for Hass (96.7%) matched our assemblies (96.3–96.7%). Merqury confirmed comparable quality, with Hass QV 61.3 and completeness of 98.0% versus QV 63.3–71.6 and completeness of 98.8–99.3% for the newly sequenced accessions (Table S1). 
To assess structural consistency, primary assemblies were aligned to the West-Indian (WI) pure accession genome. Dotplots showed broad syntenic concordance with limited lineage-specific rearrangements (Fig. S2). The main exception was Choquette contig 0003, in which WI chromosome 6 had been appended to chromosome 4; we corrected this misjoin by manually splitting the contig. Aside from this case, structural differences were largely confined to regions of lower concordance — most notably central chromosomal regions — none of which could be confidently classified as assembly errors. Very few contigs showed concordance with chromosome 0 of the WI reference (Fig. S2). Earlier assemblies used “chromosome 0” for unplaced sequences that often represented contaminants, organelle fragments, or other artefacts. Our workflow used stringent contaminant and organelle filtering and, without explicitly targeting “chromosome 0”, effectively excluded such sequences. Organelle assemblies Complete circular chloroplast genomes were recovered for all seven accessions (152,609–152,763 bp; Ashdot; Fig. S3, S4, Table S2), consistent with prior reports 31 . Mitochondrial assemblies were more variable; Hass, Ashdot, and Mike each yielded two circular contigs totalling ~850 kb, similar to the recently published complete P. americana mitogenome 32 . Dusa ® , Choquette, and Leola™ produced near-complete but non-circular assemblies, and no contiguous mitogenome was reconstructed for Gottfried. This pattern likely reflects nuclear DNA isolation minimizing organellar carry-over. Repeat landscape and transposable element composition All assemblies displayed similar repeat architectures, with repetitive DNA comprising ~59–60% of the genome space (Fig. S5; Table S3). Long terminal repeat (LTR) retrotransposons dominated (~35%): ~18–19% LTR/unknown (unclassified), ~10–11% Gypsy, and ~5% Copia. 
DNA transposons accounted for ~10–11% (primarily Mutator-like, ~5–6%, and hAT, ~2%), long interspersed nuclear elements (LINEs) for ~6–7%, and miniature inverted-repeat transposable elements (MITEs) and other low-complexity repeats contributed <5%. Approximately 3% of repeats could not be assigned to known families. Divergence (Kimura) landscapes revealed a pronounced wave of recent LTR expansion peaking at ~10% divergence, largely driven by unclassified and Gypsy elements, with broader shoulders indicating older bursts ( Fig. 1 ). Temporal patterns were highly conserved across accessions: overlaid landscapes ( Fig. 1h ) were nearly indistinguishable, indicating shared historical retrotransposon activity rather than recent accession-specific proliferation. Download figure Open in new tab Figure 1. Kimura (percent-substitution) divergence landscapes of broad transposable-element (TE) groups for seven Persea americana primary assemblies. a–g ) Per-assembly TE divergence profiles (x axis = Kimura substitution level, 0–50%; y axis = genome fraction % in each 1% bin) for long terminal repeat (LTR) retrotransposons, DNA transposons, LINEs (long interspersed nuclear elements), MITEs (miniature inverted-repeat transposable elements), RC/Helitron (rolling-circle Helitrons), and Other. h ) Overlays of the same profiles from all assemblies to facilitate direct comparison. Divergence was calculated from RepeatMasker alignments and binned at 1% resolution; values were normalized to each assembly’s size. The plots highlight a conserved, low-divergence peak (~10% Kimura) dominated by LTRs, with older shoulders indicating ancient TE activity. Gene annotation and pangenome content Annotation quality and gene space recovery Structural annotation of primary assemblies yielded 34,215 (Leola™) to 36,931 (Ashdot) predicted protein-coding genes per accession (Table S4). BUSCO completeness ranged from 97.4% (Hass) to 98.7% (Ashdot). 
Mean exons per transcript (~5.5) and median gene length (3,953–4,227 bp) were consistent across accessions, and Annotation Edit Distance (AED) distributions indicated well-supported models (Fig. S6). Alternate assemblies also captured substantial gene space: BUSCO scores 85.3–97.6% and 31,392 to 34,819 predicted genes. In some cases (e.g. Choquette), the alternate assembly encoded more genes than the primary despite slightly lower BUSCO completeness, illustrating that alternates, though less contiguous, can encode comparable gene sets. Functional annotation Approximately 86% of gene models received functional descriptions (Table S5). Abundant Pfam domains included those linked to immunity — particularly NLR proteins combining NB-ARC (PF00931; ~12/1k genes) with multiple LRR classes (PF13855, PF00560, PF08263; ~15–29/1k; Fig. S7, Table S6). These NLR-associated domains ranked alongside protein kinases (PF00069; ~42/1k), pentatricopeptide repeats (PPR; PF01535, PF13041; ~22–25/1k), and F-box proteins (PF00646; ~11–12/1k). Cytochrome P450s (PF00067) and UDP-glucosyltransferases (PF00201) were also prominent. Pangenome partitioning We partitioned gene space into core, soft-core, shell, and cloud compartments based on orthogroup presence across haplotypes. Because partially phased assemblies capture allelic and structural diversity, each haplotype (primary and alternate) was treated as an independent unit; counts and percentages are reported per haplotype. Across 15 haplotypes (7 accessions x 2 haplotypes plus the WI reference), we identified 38,448 orthogroups: 14,246 core (≥95% of haplotypes), 9,005 soft-core (85–94%), 11,393 shell (20–84%), and 3,804 cloud (≤19%; Fig. 2 , Fig. S8, Table S7). These corresponded to 17,417–18,155 core genes, 8,107–11,197 soft-core genes, 4,946–6,947 shell genes, and 326–631 cloud genes per assembly. Download figure Open in new tab Figure 2. Orthogroup presence–absence variation (PAV) across the avocado pangenome. 
A binary PAV matrix was derived from orthogroup assignments: a genome is scored present (1) if any of its genes belong to the orthogroup, otherwise absent (0). Colours are consistent across panels by category (Core = green, Soft-core = orange, Shell = purple, Cloud = magenta). Orthogroups were classified by cohort prevalence for n = 15 genomes (West-Indian pure genome; Ashdot, Leola TM , Mike, Dusa ® , Choquette, Gottfried, Hass — each with primary then alternate haplotypes treated separately) using thresholds: Core ≥95% (15/15), Soft-core 85–94% (13–14/15), Shell 20–84% (3–12/15), Cloud <20% (1–2/15). a) UpSet-style summary of the most frequent intersection patterns, grouped by category. Bars are coloured by class and the y-axis is split to accommodate the large Core intersection. The dot matrix indicates which genomes contribute to each intersection (row order as above, primary followed by alternate haplotypes). b) Per-accession stacked composition of orthogroups present in each genome, coloured by category. c) Presence/absence heat map (rows = genomes, columns = orthogroups) sorted by category and prevalence. For legibility, Core and Soft-core are width-compressed; Shell and Cloud are shown at full density. The coloured header strip marks category boundaries. d) Pie-chart of the overall pangenome orthogroup category composition. Relative to the WI reference, 9,107 orthogroups were absent, corresponding to at least 19% novel genes (assuming one gene per orthogroup; Fig. S9), and up to 27.5% novel gene content when accounting for multi-gene orthogroups (13,149 additional genes). Orthogroup accumulation curves indicated an open pangenome (β = 0.109), consistent with continued discovery of novel genes as additional haplotypes are added (Fig. S10). In contrast, sequence-based modelling with Panacus yielded α = 1.138, suggesting that while new orthogroups continue to emerge, broader sequence space is nearing saturation (Fig. S11). 
Functional enrichment of accessory genome Gene ontology (GO) enrichment of the accessory genome (shell + cloud) revealed strong over-representation of defence- and stress-related processes. Notably, response to oomycetes, defence response, defence response to Gram-negative bacterium, programmed cell death, and interspecies interactions were enriched (Fig. S12, Table S8). Consistent with immune signalling, calcium-mediated signalling was among the strongest signals, with further enrichment for positive regulation of hydrogen peroxide metabolism and calcineurin–NFAT- related pathways. Specialised metabolism was also prominent, including secondary-metabolite biosynthesis and sesquiterpene pathways, mirrored at the molecular-function level by quercetin glucosyltransferase and sesquiterpene synthase activities. Together with the observed expansions of NLR-associated Pfam domains, these results demonstrate that adaptive immunity and pathogen-response pathways are disproportionately concentrated within the accessory genome. Diversity of NLR repertoires across accessions NLR identification and classification NLR genes identified with NLRtracker exceeded the 161 NLRs previously described in the WI pure accession genome 19 , with 226 (Mike) to 256 (Ashdot) complete NLR s ( Fig. 3a ). Subfamily distributions varied substantially between accessions. Coiled-coil NLRs ( CNLs ) dominated; Dusa ® encoded the largest set (169) and Hass the smallest (149). Toll/interleukin-1 receptor NLRs ( TNLs ) were rare but present in all accessions, with three in Ashdot and Leola™ versus two in all others. Meanwhile, RPW8-like NLRs ( RNLs ) were only absent from Choquette. Download figure Open in new tab Figure 3. Nucleotide-binding Leucine-rich repeat ( NLR ) gene numbers and domain architectures across seven Persea americana accessions. a) Total number of genes encoding NLR subfamilies identified in each accession. b) Number of CNL-encoding genes partitioned by domain organization. 
c) Number of NLR-type encoding genes carrying alternative domain organizations. Domains were identified with NLRtracker. (C, coiled-coil; N, Nucleotide-binding (NB-ARC); L, Leucine-rich repeat; O, other, integrated or non-canonical domains). Created in BioRender. van den berg, N. (2026) https://BioRender.com/t3s2tsy . Domain architectures also diverged. Among CNLs, the predominant configuration was C–N– O–L; only a single canonical C–N–L (Dusa ® ) was observed ( Fig. 3b ). Four accessions (Hass, Mike, Dusa ® , Choquette) each carried a gene encoding a “C–N–O–L–O–C–N–O–L–O” architecture, suggestive of duplication producing a double NLR within a single protein. All such genes were classified as “cat2” during gene annotation curation, supporting their authenticity. For NLs lacking canonical N-terminal motifs, the most frequent arrangement was N–O–L–O, with Dusa ® uniquely carrying N–L–O–L–O ( Fig. 3c ). Collectively, although copy numbers differ by only a few dozen genes, encoded architectures vary strikingly. Structural variation concentrates at NLR pangenes Intersecting variants with pangene loci showed markedly elevated SV density within NLR windows relative to matched genomic windows ( Fig. 4 ). We observed ~0.35 SV/kb, exceeding the null distribution ( Fig. 4a ). Single- and multi-nucleotide polymorphisms (SNPs/MNPs) showed a smaller but significant increase of ~15/kb against the null ( Fig. 4b ). Per-type stratification ( Fig. 4c ) indicated enrichment driven primarily by inversions and deletions, with insertions also enriched. Download figure Open in new tab Figure 4. Variation within ± 5 kb of Nucleotide-binding Leucine-rich repeat ( NLR ) loci relative to matched genomic nulls. a) The null distribution of structural-variant (SV) density (SVs per kb) in length- and chromosome-matched random windows; the dashed line marks the observed density within ± 5 kb of NLR pangenes; p from a right-tailed bootstrap (N = 2,000; add-one smoothing). 
b) Null distribution as in panel a for substitutions (SNPs + MNPs). Panel c displays the log 2 fold-change of the observed density over the null median for each variant class (log 2 fold change value above bars); SNP (single nucleotide polymorphism), multi nucleotide polymorphism (MNP), insertion (INS), deletion (DEL), inversion (INV). Analysis of NLR organization within genomic clusters revealed that, on average, 60% of NLRs were clustered ( Fig. 5a ). Gottfried had the fewest clustered NLRs (133; 57%), whereas Dusa ® had the most (169; 67%). Cluster distribution also varied across chromosomes, with the largest concentrations on chromosome 7; only Ashdot and Dusa ® harboured clusters on chromosome 12, and none were detected on chromosome 9 ( Fig. 5b ). Accessions differed strongly on chromosome 2: Hass carried 32 clustered NLRs compared with 16 in Mike. Similarly, Leola™ lacked clusters on chromosome 3, while other accessions averaged ~12 genes. Download figure Open in new tab Figure 5. Distribution and organization of clustered Nucleotide-binding Leucine-rich repeat ( NLR ) genes. a) Proportion of NLRs located within genomic clusters per accession. Dark-coloured segments indicate clustered NLRs ; transparent segments indicate non-clustered NLRs . b) Chromosomal distribution of clustered NLRs , shown as the total number per chromosome in each accession. c) Gene organization of the largest cluster on chromosome 7, compared between all accessions. For each accession, gene identifiers are provided, with grey arcs showing high sequence similarity (>85%) between NLRs within a given accession, while yellow connectors highlight NLRs with high sequence similarity between different accessions. Grey dash lines are shown for intron regions which span across NLR genes. 
True gene identifiers for different avocado accessions are: PeameHass#1 – Hass; PeameRB001#1 – Ashdot; PeameRB002#1 – Leola™; PeameRB003#1 – Mike; PeameRB004#1 – Dusa ® ; PeameRB005#1 – Choquette; PeameRB006#1 – Gottfried. The largest individual cluster on chromosome 7 comprised 7–14 NLRs depending on accession and showed substantial variation in gene order and orientation ( Fig. 5c ). Comparative sequence analysis indicated multiple, independent duplications within accessions. For example, in Ashdot, two genes (PeameRB003#1_g215650 and PeameRB003#1_g215660) exhibited >98% sequence similarity to other NLRs in the same cluster, but <85% similarity to syntenic NLRs in other accessions, suggesting accession-specific duplication. Finally, no significant correlation was detected between NLR cluster density and local TE density (r = 0.012, p > 0.05), suggesting that recent NLR duplications are not primarily driven by TE activity (Fig. S13). Core and accessory NLR repertoires To test whether NLRs skew toward the accessory genome, we quantified NLR occupancy across pangenome compartments. On average, 54% of NLRs per accession were core (119 in Mike to 140 in Choquette; Fig. 6a ), with the remainder consisting of ~35% soft-core, ~10% shell, and ~0.4% cloud; Hass and Mike contained no cloud NLRs . NLR orthogroup accumulation curves indicated an open repertoire (β = 0.147), implying slow but continuing discovery of novel NLR orthogroups (Fig. S14). Download figure Open in new tab Figure 6. Shared Nucleotide-binding Leucine-rich repeat ( NLR ) genes, functional repertoires, and NLR sequence diversity across avocado accessions. a) Number of core, soft-core, shell, and cloud NLR genes per accession, classified with GENESPACE. Core genes are present in ≥95% of accessions, soft-core in 85–94%, shell in 20–84%, and cloud in ≤19%. b) UpSet plot showing the number of Gene Ontology (GO) terms shared across different combinations of accessions and accession-specific terms. 
The set size indicates the total number of GO terms identified in each accession. c) Multigene species phylogenetic tree inferred with OrthoFinder from the NLR proteome orthogroups. d) Average Shannon entropy score per GO term, shown for the top 50 GO terms shared across all accessions. Shannon entropy reflects per-residue amino acid variation within GO term–associated NLRs. Gene ontology revealed 90 terms shared across all accessions, with the remaining 94 unevenly distributed ( Fig. 6b ). Each accession thus harbours a partially distinct NLR repertoire with limited overlap in accession-specific terms. Phylogenetic analysis of GO term profiles grouped accessions according to genetic relationships: Dusa ® and Leola™ clustered together, Hass, Mike, Choquette, and Ashdot formed a second clade, and Gottfried branched separately ( Fig. 6c ). Average Shannon entropy scores for the top 50 shared GO terms exceeded 2 bits, reflecting substantial amino acid variability ( Fig. 6d ). Thus, even NLRs that share identical GO annotations across accessions display substantial sequence divergence. Discussion Single reference genomes capture only a portion of a species’ gene space. Crop pangenomes — including tomato 33 , common bean 34 and apple 9 — show that reference-only views overlook thousands of genes. By assembling seven P. americana accessions spanning all major landraces and hybrid groups and generating near chromosome-scale, partially phased genomes, we reveal an open pangenome: 19–27.5% of genes are absent from the WI reference. Orthogroup accumulation curves indicate continued discovery as additional accessions are added, consistent with other outcrossing species. Avocado’s diversity, shaped by multiple domestication histories 15 , remains unsaturated. Thus, continued germplasm sampling will uncover additional, potentially valuable traits. Core genes encode essential functions, whereas accessory compartments often harbour adaptive traits 35 . 
In avocado, shell and cloud genes are strongly enriched for defence- and stress-related responses — programmed cell death, pathogen response, and calcium signalling — with similar patterns reported in crops such as pea 36 and apple 9 . Thus, a disproportionate share of biotic-stress capacity likely resides in the accessory genome. Underutilised accessions may carry unique resistance or metabolic traits absent from industry-preferred cultivars and rootstocks, underscoring the importance of conserving broad genetic diversity. Transposable elements drive genome evolution 12 , yet whether their activity differs among avocado lineages was previously unknown. Across accessions we observed highly conserved repeat content (~59–60%), dominated by LTR retrotransposons, and concordant Kimura divergence profiles. Most LTR activity therefore predates landrace divergence; present-day structural differences largely reflect rearrangements of shared repeats rather than recent bursts, in contrast to maize 37 . This stability provides a robust genomic scaffold while still allowing localized variation. Linear reference genomes collapse alternative haplotypes, obscuring complex SVs 35 . Our graph-based pangenome embeds SNPs, MNPs, inversions, deletions, and insertions directly in the graph. Meanwhile, orthogroup analyses capture presence/absence variation, revealing diversity often invisible in linear models. Similar graph-based approaches in potato 38 and wheat 39 have underlined this strategy for resolving SV-driven trait variation. In avocado, SVs are disproportionately concentrated within NLR clusters, highlighting an evolutionary hotspot of immune diversification and paralleling observations in cacao 30 . By representing alternative haplotypes, the graph retains key immune-related SVs rather than collapsing them. Although overall NLR counts are broadly comparable across accessions, structural, organisational, and sequence-level features vary extensively. 
Domain architectures diverge, including rare canonical or multi-domain configurations that differ between accessions 40 , echoing observations in Arabidopsis 41 and rice 42 . This indicates that the avocado NLRome is shaped not simply by expansions in gene number but also through the reorganization of protein domain structures and the retention of unusual configurations that may provide functional novelty. Most NLRs occurred in clusters, yet cluster locations, sizes, and arrangements differ; gene order and orientation frequently vary, consistent with independent duplication and rearrangement events 43 . Importantly, these duplication events appear unrelated to TE activity: repeat landscapes are conserved and NLR cluster density does not correlate with local TE density, unlike in Arabidopsis 43 . Instead, localized duplication mechanisms seem to have played a greater role in shaping the NLR repertoire. Functional annotation adds resolution: many NLRs are shared at the orthogroup level, yet GO terms are unevenly distributed. Even widely shared functions exhibit high Shannon entropy scores, indicating substantial amino acid variation. Reported entropy values exceed those for whole NLR sequences in maize 44 , Arabidopsis and Brachypodium distachyon 45 . Thus, proteins with the same annotations are far from uniform across accessions, and diversification within conserved functional classes may be critical for expanding pathogen effector recognition capacity 43 . Phylogenetic analyses of functional profiles mirrored genetic relationships: related accessions grouped together — consistent with shared domestication and hybridization histories — whereas distant ones followed independent evolutionary trajectories. These patterns emphasize the mosaic nature of the avocado NLRome, where conserved elements coexist with lineage-specific innovations. Collectively, avocado NLR evolution reflects substantial structural heterogeneity with significant functional consequences. 
Differences in domain organization, cluster dynamics, duplication histories and sequence diversity produce a repertoire richer and more complex than any single genome can capture. Integrating multiple accessions resolves a conserved backbone alongside lineage-specific innovations, providing the most comprehensive view to date. The avocado pangenome thus reveals a dual architecture: a stable core of genes and conserved repeats, alongside adaptive zones enriched for accessory genes and SVs at NLR clusters. This combination provides both resilience and evolutionary flexibility. For breeding, the path forward is to mine accessory diversity and SV variation to improve disease resistance to key pathogens such as Phytophthora cinnamomi and Dematophora necatrix 46 , 47 . Future work should expand sampling to wild relatives, integrate expression and epigenomic data, and experimentally validate candidate loci. Adopting a graph framework ensures that full diversity is represented for association studies and marker design, establishing an integrated avocado pangenome across all major horticultural groups. Methods Plant material, DNA extraction and sequencing Six diverse Persea americana accessions were selected to capture major genetic backgrounds based on a prior genotyping-by-sequencing (GBS) survey 48 and in-house genotyping with a 384-marker SNP chip 49 : Ashdot (RB001; West-Indian), Leola™ (RB002; Mexican), Mike (RB003; Guatemalan), Dusa ® (RB004; Guatemalan x Mexican), Choquette (RB005; Guatemalan x West-Indian), and Gottfried (RB006; Mexican x West-Indian). To broaden the reference space and benchmark the assembly pipeline, we also incorporated previously generated PacBio HiFi reads from a Hass (Guatemalan x Mexican; Accessions: SRR13510945/6) accession 17 . Young recently expanded leaf flushes were sampled from shaded canopy positions within the germplasm blocks of Westfalia ® Fruit Estate, Tzaneen, Limpopo, South Africa. 
High-molecular-weight (HMW) DNA was extracted for long-read sequencing using a slightly modified plant HMW DNA extraction protocol 50 . In the modified protocol, (i) DNA was precipitated with chilled isopropanol and incubated overnight at −20°C; (ii) following precipitation, samples were centrifuged at 3,000 x g for 60 minutes at 4°C; (iii) RNase A was added, followed by incubation at 37°C for 15 minutes. The resulting HMW DNA was shipped to Macrogen Inc. (Seoul, South Korea) for SMRTbell library preparation and PacBio Revio sequencing. Libraries were run two samples per flow cell, yielding ~30–40 Gb per sample. Raw HiFi reads were returned as compressed FASTQ files (.fastq.gz) for downstream assembly and analyses. Genome assembly Raw HiFi reads were quality-assessed with FastQC v0.12.1 51 and reports were aggregated with MultiQC v1.29 52 . Although PacBio HiFi reads are expected to be free of adapter contamination, we verified and removed any residual adapters or artefacts with HiFiAdapterFilt v2.0.0 53 using the pbadapterfilt.sh wrapper. Read-level summary statistics were generated with fastq-stats (ea-utils 54 ), and k-mer spectra were computed with Jellyfish v2.2.10 55 (k = 20) to produce histograms. The homozygous-coverage peak estimated from these histograms was subsequently used to inform downstream assembly parameters. Assemblies were generated with Hifiasm v0.25.0 56 using the computed --hom-cov value and the options -u 1 --telo-m TTTAGGG --telo-s 700 --telo-d 5000 --primary, to recover phased primary and alternate contig sets. The GFA outputs (*.p_ctg.gfa, *.a_ctg.gfa) were converted to FASTA using gfatools gfa2fa v0.5 57 . Resulting primary and alternate FASTA files were carried forward to contaminant removal, repeat annotation, masking and gene prediction. 
Organelle genome assembly and filtering Organelle genomes (plastid and mitochondrial) were assembled de novo from filtered PacBio HiFi reads using OATK v1.0 58 with the embryophyta_mito.fam and embryophyta_pltd.fam reference families. Mean genome-wide coverage was estimated by aligning reads to both primary and alternate assemblies with minimap2 v2.29 59 (map-hifi mode, --secondary=no) followed by depth profiling with samtools v1.22 60 . Organelle-enriched reads were identified using a coverage threshold ≥5x the mean. The resulting sequences were annotated with GeSeq 61 (Chlorobox) and visualised with OGDRAW 62 . To remove organelle-associated contigs from the genome assemblies, the mitochondrial and plastid genomes were concatenated into separate reference databases. Nuclear genome contigs were screened against these databases using BLAST+ v2.16.0 63 (blastn, E-value [expected number of chance matches] cutoff = 1e-5, max_target_seqs=1). Contigs with query coverage ≥80%, identity ≥90%, and E-value ≤1e-10 were classified as organelle-associated contigs and removed using seqtk v1.4 64 . Filtering was performed iteratively, first excluding mitochondrial-, then plastid-associated contigs. Contaminant identification and filtering The remaining contigs were taxonomically annotated by protein homology searches against the UniProt Knowledgebase (UniProtKB) SwissProt protein database 65 (release 2025_01) using DIAMOND v2.1.12 66 (blastx, ultra-sensitive mode; minimum score 60; ORF ≥30 aa; identity ≥30%; E-value ≤1e-5; top 10 hits per query). The DIAMOND database was built with embedded NCBI taxonomy identifiers 67 , 68 . To reduce redundancy, overlapping hits on each contig were collapsed into non-overlapping intervals, retaining only the highest-scoring representative per region. 
Consensus taxonomy was then assigned per contig by scoring candidate taxonomic identifiers according to unique hit count (2 points), maximum bitscore (1 point), maximum alignment length (0.5 points), and minimum E-value (0.5 points). The taxid with the highest cumulative score was selected as the consensus; in the case of ties, the taxid corresponding to the first best-scoring hit in the filtered BLAST output was chosen. Contaminant removal was performed with BlobToolKit v4.4.5 69 . For each assembly, BlobToolKit databases were built from organelle-filtered contigs ( blobtools create ), supplemented with coverage profiles from PacBio HiFi read alignments ( minimap2 v2.29; samtools v1.22) and taxonomic assignments from DIAMOND consensus hits ( blobtools add ). To ensure correct coverage profiles, BAM files were regenerated after removing organellar contigs. Assemblies were then filtered to retain only contigs assigned to Viridiplantae or lacking a taxonomic hit ( blobtools filter with --query-string “bestsumorder_kingdom--Inv=Viridiplantae,no-hit”). BlobToolKit outputs included cleaned and contaminant FASTA files, tabular summaries of contig length, GC content, coverage, and phylum-level taxonomy, as well as interactive GC–coverage–taxonomy plots ( blobtools view --plot ) for manual inspection of contamination profiles. The resulting cleaned assembly FASTA files, devoid of organellar and taxonomic contaminants, were used in subsequent analyses. Assembly quality assessment and benchmarking Assembly-level quality was assessed using multiple complementary approaches. Basic assembly statistics, including N50, L50, total assembly length, GC content, and k-mer spectrum profiles, were generated with QUAST v5.3.0 70 . For each accession, raw PacBio HiFi reads were counted with Meryl v1.3 71 (k=20), and compared against concatenated primary and alternate assemblies in Merqury v1.3 71 to estimate consensus QV, completeness, and copy-number spectra. 
Structural consistency relative to the WI pure reference genome was assessed with minimap2 v2.29 (asm5 preset, --secondary=no). Alignments were converted to sorted BAM files with samtools v1.22, and PAF alignment files were generated with paftools.js v2.29 (sam2paf) for dotplot visualization. Dotplots were inspected with D-GENIES 72 to evaluate large-scale structural concordance and detect potential misassemblies. Assembly gene-space completeness was assessed with BUSCO v5.8.3 73 in genome mode (-m genome) using the embryophyta_odb10 lineage dataset. BUSCO was run with long mode enabled (--long), Augustus species parameter set to Theobroma cacao (closest available model), and protein alignment refinement with miniprot. BUSCO full tables were then integrated into BlobToolKit databases for interactive visualization alongside taxonomic and coverage information. Annotation of repetitive elements Repetitive regions in all genome assemblies were annotated using a combination of EDTA v2.2.2 37 , panEDTA, and RepeatMasker v4.1.5 74 . High-confidence coding sequence (CDS) evidence for TE annotation was prepared from the WI pure accession genome (article in preparation) and provided to EDTA to reduce false annotations in gene regions. Repetitive elements were annotated for each assembly with EDTA (--species others --sensitive 1 --anno 1). A pangenome TE library was then generated by merging individual EDTA annotations with panEDTA, using the high-confidence CDS from the WI pure accession as evidence. The resulting merged library was classified with TEsorter v1.4.7 75 and subsequently used as a custom repeat database for masking. All assemblies were soft-masked with RepeatMasker, using the curated panEDTA TE library as input. RepeatMasker outputs were processed in R v4.4.3 76 with the repeatR package 77 to quantify and visualize repeat composition. 
For each assembly we computed (i) the total masked proportion, (ii) TE class composition, and (iii) Kimura substitution profiles by aggregating per-hit divergence (p_sub) into 1% bins from 0–50% and normalizing by assembly size to report genome fraction per bin. For cross-assembly comparisons, subclasses were collapsed into six groups (LTR, DNA, LINE, MITE, RC/Helitron, and Other). For visualization, plots were generated using the ggplot2 78 and RColorBrewer 79 packages. RNA-sequencing data acquisition, processing, and normalisation RNA-seq libraries were retrieved from the NCBI Sequence Read Archive (SRA) database using esearch/efetch (sra-tools v3.2.1 80 ), with the query “Persea americana [Organism] AND illumina [Platform] AND RNA-Seq [Strategy] AND paired [Layout]”. Runs with fewer than 15 million spots were excluded. Sequence data were downloaded with prefetch (sra-tools) and converted to paired-end FASTQ files using fastq-dump (sra-tools). Adapter trimming and quality filtering (Q ≥30, minimum length ≥50 bp) were performed with fastq-mcf v1.04.807 (ea-utils) using Illumina adapter sequences. Read quality was then assessed with fastq-stats v1.01 (ea-utils), and filtered libraries were retained if they contained ≥15 million reads, mean read length ≥75 bp, mean base quality ≥30, and ≤50% duplication. Runs failing these criteria were discarded. The RNA-seq datasets used are listed in Table S9. RNA-seq libraries were normalized individually with Trinity v2.15.2 81 (insilico_read_normalization.pl) using strandedness information curated from metadata and verified with the check_strandedness function in Kallisto v0.44.0 82 . Libraries were normalized to a maximum coverage of 100x to (i) prevent any single library from dominating downstream analyses, (ii) balance contributions across samples, and (iii) reduce overall computational load. Libraries were processed separately to preserve sample integrity and maintain a clean output structure for subsequent alignment. 
Structural annotation RNA-seq alignments were performed using STAR v2.7.11b 83 with a two-pass approach on individually normalized libraries. STAR genome indices were built for both primary and alternate assemblies, and resulting alignments were sorted by coordinate with non-canonical splice junctions removed. For gene prediction, we applied two complementary approaches: BRAKER3 v3.0.8 84 , which integrated RNA-seq alignments and protein homology from OrthoDB v12 using the Viridiplantae partition 85 , and Helixer v0.3.5 86 , which performed ab initio prediction using the land_plant model. Annotation quality was assessed with AED scores generated with InGenAnnot v0.0.15 87 . AED values were calculated relative to RNA-seq transcript assemblies (StringTie2 v2.2.1 88 ) and protein alignments against the proteomes of closely related species and the UniProtKB SwissProt reviewed protein database (miniprot v0.17 89 ; Table S10), producing both transcript- and protein-based AED metrics. Because Helixer predictions do not account for TEs, Helixer gene models were filtered with an in-house Python script (Filter_repeat_region_genes.py, available: https://github.com/RobBacker/Genome_assembly_and_annotation_tools ). The script calculates CDS-level overlap with repeat regions (RepeatMasker annotation file), merges CDS and repeat intervals to avoid double-counting, and removes models with >25% CDS overlap with repeats unless rescued by transcript-based AED (aed_ev_tr < 0.75). BRAKER3 and Helixer annotations were subsequently merged into a unified gene set with AGAT v1.4.2 90 (agat_sp_complement_annotations.pl), setting the BRAKER3 annotation as the reference. RNA-seq-based transcript evidence was then clustered, ranked, and UTRs were added using InGenAnnot (clusterize, isoform_ranking, and utr_refine), guided by strandedness-aware BAM inputs. Gene model curation Gene model quality was evaluated with PSAURON v1.0.6 91 using default settings. 
To guide automated curation, an in-house Python script (aed_psauron_curation.py) combined AED scores with support inferred from PSAURON. Each transcript was assigned to one of six categories (cat1–cat6) ranging from high confidence (cat1: strong transcript and protein support) to ab initio only (cat6: no external support). Categories cat1–cat4 were automatically retained, whereas cat5–cat6 were retained only if PSAURON support was detected. This produced an annotation file with per-transcript support categories and PSAURON tags. Diagnostic plots of transcript vs. protein AED distributions were generated to visualize category assignments (curated_aed_psauron_scatter.py). Curated annotations were then filtered with AGAT utilities to remove low-confidence or pseudogenic models, and summary statistics were generated. For downstream benchmarking, the longest isoform per gene was extracted (agat_sp_keep_longest_isoform.pl) and assessed with BUSCO. Annotation of TE-related genes was performed using DeTEnGA 92 , which integrates sequence similarity searches against curated TE protein families. For each assembly (primary and alternate), DeTEnGA was run via the GAQET2 wrapper with the plant-specific REXdb database (“rexdb-plant”), using default parameters. The curated GFF3 annotation file served as input, alongside the repeat-masked genome assembly. DeTEnGA produced TE annotation summaries for each assembly, which were subsequently used to tag candidate TE-related models using a custom Python script (tag_detenga.py). Functional annotation Functional annotation of predicted proteins was performed using EnTAP v2.3.0 93 . Longest isoform CDS from both primary and alternate assemblies were used as input. EnTAP was run with a Viridiplantae taxonomic scope, designating bacterial, fungal, insect, and viral sequences as contaminants. 
Reference databases included RefSeq Plant and UniProt SwissProt, while domain-based annotation employed InterProScan v5.74-105.0 94 (databases: Pfam, Panther, SMART, Superfamily, Gene3D, PrositeProfiles, and PRINTS). EnTAP was executed with ontology assignment enabled (GO and KEGG). Functional annotations were parsed and integrated with structural annotation features using AGAT (agat_sp_manage_functional_annotations.pl). Pfam domain summaries were generated from the EnTAP tables using a custom script (entap_pfam_qc.py), which collapses isoforms, counts unique gene–Pfam pairs, and normalizes counts to per-1,000 genes. Pangenome graph assembly and assessment Assemblies and annotations were reformatted to the PanSN-spec naming scheme, with headers encoded as sample#haplotype#contigID (haplotypes: 1 = primary, 2 = alternate, 0 = reference). Gene models were updated accordingly, and multi-isoform loci were collapsed to the longest isoform using AGAT. A whole-genome alignment–based pangenome graph was then constructed with Minigraph-Cactus v8.1.0b1 95 , using the WI pure accession as the reference backbone and incorporating both haplotypes of each assembly in order of k-mer completeness. Runs were performed with the --lastTrain option and produced multiple formats (--odgi, --gfa full clip, --gbz, --vcf, --giraffe, --viz, --vcfwave). Graph growth statistics were calculated with Panacus v0.3.3 96 from GFA walk records at the bp level, using quorum thresholds (-q 1,0.5,0.1) and coverage levels (-l 1,3,6,7), both with and without sample merging (-S). Curves were visualized with panacus-visualize. Orthology and synteny were resolved with GENESPACE v1.3.1 97 , using BED files derived from GFF3 annotations and peptide FASTAs (longest isoform per gene). Primary and alternate haplotypes were treated as independent genomes to maximise discovery of allelic variation across accessions, with the WI reference processed in parallel. 
Orthogroup presence/absence was analysed with a custom workflow (pangenome_pav_support.py), built around OrthoFinder v2.5.5 98 outputs from GENESPACE. This produced: (i) a binary presence/absence variation (PAV) matrix, (ii) an orthogroup classification table, and (iii) per-assembly gene-level support matrices from curated GFF3 annotations. Category cutoffs were set to match whole-haplotype counts for our dataset (n = 15): Core ≥95% (15/15), Soft-core 85–94% (13–14/15), Shell 20–84% (3–12/15), Cloud <20% (1–2/15). Cloud orthogroups were further divided into “Cloud-supported” and “Cloud-unsupported” based on AED curation tags. Plots were generated from the binary orthogroup PAV matrix with a custom Python script (upset_orthogroups.py), which: (i) selects the top 15 intersections from each non-core category for the UpSet-style panel (split y-axis), (ii) draws per-accession stacked compositions, and (iii) renders a presence/absence heat map sorted by category and prevalence. Pangenome accessory genes (Cloud and Shell) were tested for GO enrichment with topGO v2.58.0 99 (classic Fisher’s exact test; BP, MF, CC ontologies), restricting terms to the Viridiplantae lineage using the GO taxon constraints ontology (go_taxon_constraints.owl 100 ). Statistical significance was adjusted using the Benjamini–Hochberg false discovery rate (FDR) procedure. Results were simplified with clusterProfiler v4.14.6 101 (semantic similarity, cutoff = 0.7) and visualized as dotplots with enrichplot. Pan- and core-genome accumulation curves were derived from the binary orthogroup PAV matrix using 1,000 random genome order permutations. At each increment, mean, median, and 95% confidence intervals of pan- and core-genome sizes were computed. The pangenome was modeled by Heaps’ law (Pan(n) = S·n^β) using log–log regression, and the core-genome with a Tettelin-style exponential decay model (Core(n) = Ω + k·e^(−n/τ)). Model parameters and statistics are reported in Table S11. 
Curves, model fits, and inset log–log regressions were visualized in R v4.4 with ggplot2. Pangenome graph structural variant analysis From each assembly, the longest gene model per locus was mapped to the WI reference in the Minigraph-Cactus graph using halLiftover v2.2 102 . Annotations were standardised with awk and converted to BED with bedtools v2.31.1 103 , after which each mapped gene was reduced to a single span. Pangene loci were defined as regions where these spans overlapped on the same strand across assemblies, with locus coordinates taken as the union of all member spans. From this, we generated a binary PAV matrix and a pangene–gene ID map. Graph variants were obtained from the Minigraph-Cactus WAVE VCF and processed with bcftools v1.22 60 to extract SVs (bcftools view -i ‘INFO/INV=1 || INFO/LEN=50’) and substitutions (SNPs and MNPs; bcftools view -i ‘INFO/TYPE=“snp” || INFO/TYPE=“mnp”’). Records were exported to BED8 (one line per alternate allele) with TYPE and LEN fields. Pangene loci were then intersected with these variant BEDs using bedtools to obtain per-locus overlap tables and type-stratified summaries for enrichment analyses and figures. This workflow is generalised in the bash script, extract_pan_gene_loci.sh. Variation in and around NLRs was assessed by comparing observed variant density within ±5 kb windows around NLR-associated pangene loci to a length- and chromosome-matched bootstrap null. For each NLR window, a random genomic window of identical length was sampled, with any overlap with the observed NLR set excluded. The bootstrap null was generated from 2,000 replicates and, per replicate, overlaps with SVs, SNPs and MNPs were counted with bedtools intersect, reporting densities as variants per kb and stratifying by type. Right-tail significance used add-one smoothing: p = (1 + #(null ≥ obs)) / (N + 1). SV and SNP+MNP bootstraps shared random seeds for directly comparable nulls. 
Analyses and plotting were performed with a custom Python script, sv_enrichment.py. NLR data analysis NLR-encoding genes were identified from predicted protein sequences using NLRtracker 40 . Genes were classified as complete NLRs if they contained at least a central Nucleotide-binding (NB-ARC) domain and a C-terminal Leucine-rich repeat (LRR). N-terminal domains were further categorized as coiled-coil (CC), Toll/interleukin-1 receptor (TIR), or other (O) non-canonical integrated domains (classified as “Other” by NLRtracker). Domain composition schematics were generated with BioRender, and summary statistics were visualized in Microsoft Excel. To assess genomic clustering, GFF3 annotations were used to map NLR positions. A cluster was defined as ≥2 NLR genes located within 250 kb of one another with <5 non-NLR genes between neighbouring NLRs. Chromosomal assignment of clusters was determined using the contig–chromosome alignments from whole-genome scaffolding. Detailed analysis of chromosome 7 clusters was performed with BLAST+ v2.13.0. Genes were considered sequence-similar if they shared >85% nucleotide identity. Local duplication patterns and gene orientations were visualised using custom Python scripts (chromosome_viz.py). To investigate relationships between NLRs and TEs, we developed a custom Python script (gff3_density_analyzer.py). The script parsed GFF3 annotation files (for main gene annotations, NLRs, and TEs) and determined chromosome boundaries from annotated genes. NLR and TE densities were then calculated in non-overlapping sliding 50 kb windows, with values smoothed using a Gaussian filter. A scatter plot of paired NLR and TE densities across windows was generated, and the Pearson correlation coefficient was computed to quantify the association between NLR and TE distributions. 
NLRs were assigned to core, soft-core, shell, or cloud compartments using orthogroup classifications from GENESPACE v0.9.0 in combination with functional annotation derived from GO analysis. GO terms were also used to construct an UpSet plot, summarizing shared and accession-specific functional annotations. Classification thresholds followed the same definitions applied to the whole-genome pangenome (core ≥95% of haplotypes; soft-core 85–94%; shell 20–84%; cloud ≤19%). Orthogroups of NLRs from all accessions were further analysed with OrthoFinder v2.5.5 to infer a species-level phylogeny. The analysis was automated with a custom Python pipeline (orthofinder_phylogeny.py), which validated input FASTA files, executed OrthoFinder with default parameters, and produced species trees. Where applicable, IQ-TREE was used for tree inference. The resulting tree was visualized using the iTOL webtool. To assess amino acid diversity within functional groups, protein sequences of NLRs annotated with the same GO term were aligned using MUSCLE v5.3. Shannon entropy was calculated for each alignment using ShannonEnt (available at: https://github.com/wldolan/shannon-entropy ), and average entropy scores per GO term were computed. These values were used for graph construction using a custom Python script (orthogroup_pipeline.sh), and to compare sequence variability across shared functional categories. Data Availability Raw long-read sequencing data have been deposited in the Sequence Read Archive (SRA) under BioProject PRJNA1331494 (sample accessions: SRR35509374–SRR35509379). Nuclear genome assemblies are available in GenBank under accessions JBRKCI000000000, JBRKCJ000000000, JBRKCK000000000, JBRKCL000000000, JBRKCM000000000, JBRKCN000000000, JBRKCO000000000, JBRKCP000000000, JBRKCR000000000, JBRKCS000000000, JBRKCT000000000, JBRKCU000000000. 
Code Availability Custom Python scripts referenced throughout this article are available on GitHub at https://github.com/RobBacker/Genome_assembly_and_annotation_tools . Author Contributions NB, AC, AV, AB and RB conceived the study. AV optimized DNA extraction and performed all wet-lab experiments. RB generated genome assemblies, developed analysis scripts and conducted primary analyses; AC performed fine-grained NLR analyses. AB provided initial conceptual guidance and mentoring. NB provided resources, supervision, project administration and funding acquisition. RB and AC wrote the original draft. All authors (RB, AC, AV, AB, NB) reviewed and edited the manuscript. Competing Interests The authors have no competing commercial or financial interests relevant to this study. Supplementary Information Supplementary materials can be found in Supplementary_Tables.xlsx and Supplementary_Figures.docx. Acknowledgements We thank the Avocado Genome Consortium for providing pre-publication access to genome resources. We gratefully acknowledge Westfalia ® Fruit Estate for providing access to their germplasm and authorizing its research use, and ZZ2 for authorizing the research use of Ashdot material. Institutional support was provided by the University of Pretoria and the Forestry and Agricultural Biotechnology Institute (FABI). This research was funded by the Hans Merensky Legacy Foundation. References 1. ↵ Gélinas Bélanger J . Taming the wild: domesticating untapped northern fruit tree and shrub resources in the era of high-throughput technologies . AoB Plants 17 , plae074 ( 2025 ). doi: 10.1093/aobpla/plae074 OpenUrl CrossRef PubMed 2. Vahdati K , Sarikhani S , Arab MM , Leslie CA , Dandekar AM , Aletà N , et al. Advances in Rootstock Breeding of Nut Trees: Objectives and Strategies . Plants 10 , 2234 ( 2021 ). doi: 10.3390/plants10112234 OpenUrl CrossRef PubMed 3. Iwata H , Minamikawa MF , Kajiya-Kanegae H , Ishimori M , Hayashi T . 
Genomics- assisted breeding in fruit trees . Breeding Science 66 , 100 – 115 ( 2016 ). doi: 10.1270/jsbbs.66.100 OpenUrl CrossRef PubMed 4. ↵ McClure KA , Sawler J , Gardner KM , Money D , Myles S . Genomics: a potential panacea for the perennial problem . American Journal of Botany 101 , 1780 – 1790 ( 2014 ). doi: 10.3732/ajb.1400143 OpenUrl Abstract / FREE Full Text 5. ↵ Kong W , Wang Y , Zhang S , Yu J , Zhang X . Recent Advances in Assembly of Complex Plant Genomes . Genomics, Proteomics & Bioinformatics 21 , 427 – 439 ( 2023 ). doi: 10.1016/j.gpb.2023.04.004 OpenUrl CrossRef PubMed 6. ↵ Guo D , Li Y , Lu H , Zhao Y , Kurata N , Wei X , et al. A pangenome reference of wild and cultivated rice . Nature 642 , 662 – 671 ( 2025 ). doi: 10.1038/s41586-025-08883-6 OpenUrl CrossRef PubMed 7. ↵ Wang X , Yan M , Cui S , Li F , Zhao Q , Wang Q , et al. Common bean pan-genome reveals abundant variation patterns and relationships of stress response genes and pathways . BMC Genomics 26 , 495 ( 2025 ). doi: 10.1186/s12864-025-11662-2 OpenUrl CrossRef PubMed 8. ↵ Gui S , Wei W , Jiang C , Luo J , Chen L , Wu S , et al. A pan- Zea genome map for enhancing maize improvement . Genome Biology 23 , 178 ( 2022 ). doi: 10.1186/s13059-022-02742-7 OpenUrl CrossRef PubMed 9. ↵ Wang T , Duan S , Xu C , Wang Y , Zhang X , Xu X , et al. Pan-genome analysis of 13 Malus accessions reveals structural and sequence variations associated with fruit traits . Nature Communications 14 , 7377 ( 2023 ). doi: 10.1038/s41467-023-43270-7 OpenUrl CrossRef PubMed 10. ↵ Li Y , Arús P , Wu J , Zhu G , Fang W , Chen C , et al. Panvariome and pangenome of 1,020 global peach accessions shed light on evolution patterns, hidden natural variations, and efficient gene discovery . Molecular Plant 18 , 995 – 1013 ( 2025 ). doi: 10.1016/j.molp.2025.04.009 OpenUrl CrossRef PubMed 11. ↵ Huang Y , He J , Xu Y , Zheng W , Wang S , Chen P , et al. 
Pangenome analysis provides insight into the evolution of the orange subfamily and a key gene for citric acid accumulation in citrus fruits . Nature Genetics 55 , 1964 – 1975 ( 2023 ). doi: 10.1038/s41588-023-01516-6 OpenUrl CrossRef PubMed 12. ↵ Yuan Y , Bayer PE , Batley J , Edwards D . Current status of structural variation studies in plants . Plant Biotechnology Journal 19 , 2153 – 2163 ( 2021 ). doi: 10.1111/pbi.13646 OpenUrl CrossRef PubMed 13. Shi T , Zhang X , Hou Y , Jia C , Dan X , Zhang Y , et al. The super-pangenome of Populus unveils genomic facets for its adaptation and diversification in widespread forest trees . Molecular Plant 17 , 725 – 746 ( 2024 ). doi: 10.1016/j.molp.2024.03.009 OpenUrl CrossRef PubMed 14. ↵ Hämälä T , Wafula EK , Guiltinan MJ , Ralph PE , dePamphilis CW , Tiffin P. Genomic structural variants constrain and facilitate adaptation in natural populations of Theobroma cacao , the chocolate tree . Proceedings of the National Academy of Sciences 118 , e2102914118 ( 2021 ). doi: 10.1073/pnas.2102914118 OpenUrl Abstract / FREE Full Text 15. ↵ Solares E , Morales-Cruz A , Balderas RF , Focht E , Ashworth VETM , Wyant S , et al. Insights into the domestication of avocado and potential genetic contributors to heterodichogamy . G3 Genes|Genomes|Genetics 13 , ( 2022 ). doi: 10.1093/g3journal/jkac323 OpenUrl CrossRef 16. ↵ Rendón-Anaya M , Ibarra-Laclette E , Méndez-Bravo A , Lan T , Zheng C , Carretero-Paulet L , et al. The avocado genome informs deep angiosperm phylogeny, highlights introgressive hybridization, and reveals pathogen-influenced gene space adaptation . Proceedings of the National Academy of Sciences 116 , 17081 – 17089 ( 2019 ). doi: 10.1073/pnas.1822129116 OpenUrl Abstract / FREE Full Text 17. ↵ Nath O , Fletcher SJ , Hayward A , Shaw LM , Masouleh AK , Furtado A , et al. A haplotype resolved chromosomal level avocado genome allows analysis of novel avocado genes . Horticulture Research 9 , uhac157 ( 2022 ). 
doi: 10.1093/hr/uhac157 OpenUrl CrossRef 18. ↵ Harvey A , van den Berg N , Swart V. In silico characterisation of the avocado WAK/WAKL gene family with a focus on genes involved in defence against Phytophthora cinnamomi . Frontiers in Plant Science 15 , 1474781 ( 2025 ). doi: 10.3389/fpls.2024.1474781 OpenUrl CrossRef PubMed 19. ↵ Fick A , Swart V , Backer R , Bombarely A , Engelbrecht J , van den Berg N. Partially resistant avocado rootstock Dusa® shows prolonged upregulation of Nucleotide binding- Leucine rich repeat genes in response to Phytophthora cinnamomi infection . Frontiers in Plant Science 13 , ( 2022 ). doi: 10.3389/fpls.2022.793644 OpenUrl CrossRef 20. ↵ Yang T , Cai Y , Huang T , Yang D , Yang X , Yin X , et al. A telomere-to-telomere gap-free reference genome assembly of avocado provides useful resources for identifying genes related to fatty acid biosynthesis and disease resistance . Horticulture Research 11 , uhae119 ( 2024 ). doi: 10.1093/hr/uhae119 OpenUrl CrossRef 21. ↵ Segretin ME , Pais M , Franceschetti M , Chaparro-Garcia A , Bos JIB , Banfield MJ , et al. Single amino acid mutations in the potato immune receptor R3a expand response to Phytophthora effectors . Molecular Plant-Microbe Interactions 27 , 624 – 637 ( 2014 ). doi: 10.1094/MPMI-02-14-0040-R OpenUrl CrossRef PubMed 22. Liu G , Fang Y , Liu X , Jiang J , Ding G , Wang Y , et al. Genome-wide association study and haplotype analysis reveal novel candidate genes for resistance to powdery mildew in soybean . Frontiers in Plant Science 15 , 1369650 ( 2024 ). doi: 10.3389/fpls.2024.1369650 OpenUrl CrossRef PubMed 23. ↵ De la Concepcion JC , Franceschetti M , MacLean D , Terauchi R , Kamoun S , Banfield MJ . Protein engineering expands the effector recognition profile of a rice NLR immune receptor . eLife 8 , e47713 ( 2019 ). doi: 10.7554/eLife.47713 OpenUrl CrossRef PubMed 24. ↵ Adachi H , Derevnina L , Kamoun S . 
NLR singletons, pairs, and networks: evolution, assembly, and regulation of the intracellular immunoreceptor circuitry of plants . Current Opinion in Plant Biology 50 , 121 – 131 ( 2019 ). doi: 10.1016/j.pbi.2019.04.007 OpenUrl CrossRef PubMed 25. ↵ Contreras MP , Lüdke D , Pai H , Toghani A , Kamoun S. NLR receptors in plant immunity: making sense of the alphabet soup . EMBO Reports 24 , e57495 ( 2023 ). doi: 10.15252/embr.202357495 OpenUrl CrossRef PubMed 26. ↵ He H , Guo R , Gao A , Chen Z , Liu R , Liu T , et al. Large-scale mutational analysis of wheat powdery mildew resistance gene Pm21 . Frontiers in Plant Science 13 , ( 2022 ). doi: 10.3389/fpls.2022.988641 OpenUrl CrossRef 27. ↵ Dolatabadian A , Yuan Y , Bayer PE , Petereit J , Severn-Ellis A , Tirnaz S , et al. Copy Number Variation among Resistance Genes Analogues in Brassica napus . Genes (Basel) 13 , ( 2022 ). doi: 10.3390/genes13112037 OpenUrl CrossRef 28. Van Ghelder C , Parent GJ , Rigault P , Prunier J , Giguère I , Caron S , et al. The large repertoire of conifer NLR resistance genes includes drought responsive and highly diversified RNLs . Scientific Reports 9 , 11614 ( 2019 ). doi: 10.1038/s41598-019-47950-7 OpenUrl CrossRef PubMed 29. ↵ Rivera-Burgos L , VanGessel C , Guedira M , Smith J , Marshall D , Jin Y , et al. Fine mapping of stem rust resistance derived from soft red winter wheat cultivar AGS2000 to an NLR gene cluster on chromosome 6D . Theoretical and Applied Genetics 137 , 206 ( 2024 ). doi: 10.1007/s00122-024-04702-0 OpenUrl CrossRef PubMed 30. ↵ Winters NP , Wafula EK , Timilsena PR , Ralph PE , Maximova SN , dePamphilis CW , et al. Local gene duplications drive extensive NLR copy number variation across multiple genotypes of Theobroma cacao . G3 Genes|Genomes|Genetics 15 , jkaf147 ( 2025 ). doi: 10.1093/g3journal/jkaf147 OpenUrl CrossRef 31. ↵ Ge Y , Dong X , Wu B , Wang N , Chen D , Chen H , et al. 
Evolutionary analysis of six chloroplast genomes from three Persea americana ecological races: Insights into sequence divergences and phylogenetic relationships . PLOS ONE 14 , e0221827 ( 2019 ). doi: 10.1371/journal.pone.0221827 OpenUrl CrossRef PubMed 32. ↵ Liu Y , Zhang D , Huang J , Zhu W , Yang S , Song Y . The complete sequence and comparative analysis of avocado mitochondrial genomes . Tree Genetics & Genomes 21 , 25 ( 2025 ). doi: 10.1007/s11295-025-01702-z OpenUrl CrossRef 33. ↵ Gao L , Gonda I , Sun H , Ma Q , Bao K , Tieman DM , et al. The tomato pan-genome uncovers new genes and a rare allele regulating fruit flavor . Nature Genetics 51 , 1044 – 1051 ( 2019 ). doi: 10.1038/s41588-019-0410-2 OpenUrl CrossRef PubMed 34. ↵ Cortinovis G , Vincenzi L , Anderson R , Marturano G , Marsh JI , Bayer PE , et al. Adaptive gene loss in the common bean pan-genome during range expansion and domestication . Nature Communications 15 , 6698 ( 2024 ). doi: 10.1038/s41467-024-51032-2 OpenUrl CrossRef PubMed 35. ↵ Jayakodi M , Shim H , Mascher M . What Are We Learning from Plant Pangenomes? Annual Review of Plant Biology 76 , 663 – 686 ( 2025 ). doi: 10.1146/annurev-arplant-090823-015358 OpenUrl CrossRef 36. ↵ Yang T , Liu R , Luo Y , Hu S , Wang D , Wang C , et al. Improved pea reference genome and pan-genome highlight genomic features and evolutionary characteristics . Nature Genetics 54 , 1553 – 1563 ( 2022 ). doi: 10.1038/s41588-022-01172-2 OpenUrl CrossRef PubMed 37. ↵ Ou S , Scheben A , Collins T , Qiu Y , Seetharam AS , Menard CC , et al. Differences in activity and stability drive transposable element variation in tropical and temperate maize . Genome Research 34 , 1140 – 1153 ( 2024 ). doi: 10.1101/gr.278131.123 OpenUrl Abstract / FREE Full Text 38. ↵ Zhu X , Yang R , Liang Q , Yu Y , Wang T , Meng L , et al. Graph-based pangenome provides insights into structural variations and genetic basis of metabolic traits in potato . Molecular Plant 18 , 590 – 602 ( 2025 ). 
doi: 10.1016/j.molp.2025.01.017 OpenUrl CrossRef PubMed 39. ↵ Cheng H , Kong L , Zhu K , Zhao H , Li X , Zhang Y , et al. Structural variation-based and gene-based pangenome construction reveals untapped diversity of hexaploid wheat . Journal of Genetics and Genomics 52 , 774 – 785 ( 2025 ). doi: 10.1016/j.jgg.2025.03.015 OpenUrl CrossRef 40. ↵ Kourelis J , Sakai T , Adachi H , Kamoun S. RefPlantNLR is a comprehensive collection of experimentally validated plant disease resistance proteins from the NLR family . PLOS Biology 19 , e3001124 ( 2021 ). doi: 10.1371/journal.pbio.3001124 OpenUrl CrossRef PubMed 41. ↵ Van de Weyer A-L , Monteiro F , Furzer OJ , Nishimura MT , Cevik V , Witek K , et al. A species-wide inventory of NLR genes and alleles in Arabidopsis thaliana . Cell 178 , 1260 – 1272.e1214 ( 2019 ). doi: 10.1016/j.cell.2019.07.038 OpenUrl CrossRef PubMed 42. ↵ Wang Z , Fan Y , Sun J , Ma S , Wang Z , Li J , et al. Pan-analysis of intra- and inter-species diversity reveals a group of highly variable immune receptor genes in rice . The Plant Journal 122 , e70163 ( 2025 ). doi: 10.1111/tpj.70163 OpenUrl CrossRef PubMed 43. ↵ Teasdale LC , Murray KD , Collenberg M , Contreras-Garrido A , Schlegel T , van Ess L , et al. Pangenomic context reveals the extent of intraspecific plant NLR evolution . Cell Host & Microbe 33 , 1291 – 1305.e1299 ( 2025 ). doi: 10.1016/j.chom.2025.07.011 OpenUrl CrossRef PubMed 44. ↵ Thatcher S , Jung M , Panangipalli G , Fengler K , Sanyal A , Li B , et al. The NLRomes of Zea mays NAM founder lines and Zea luxurians display presence-absence variation, integrated domain diversity, and mobility . Molecular Plant Pathology 24 , 742 – 757 ( 2023 ). doi: 10.1111/mpp.13319 OpenUrl PubMed 45. ↵ Prigozhin DM , Krasileva KV . Analysis of intraspecies diversity reveals a subset of highly variable plant immune receptors and predicts their binding sites . The Plant Cell 33 , 998 – 1015 ( 2021 ).
doi: 10.1093/plcell/koab013 OpenUrl CrossRef PubMed 46. ↵ Fick A , Swart V , Backer R , Bombarely A , Engelbrecht J , van den Berg N . Partially resistant avocado rootstock Dusa® shows prolonged upregulation of nucleotide binding- leucine rich repeat genes in response to Phytophthora cinnamomi infection . Front Plant Sci 13 , 793644 ( 2022 ). doi: 10.3389/fpls.2022.793644 OpenUrl CrossRef 47. ↵ Zumaquero A , Martínez-Ferri E , Matas AJ , Reeksting B , Olivier NA , Pliego-Alfaro F , et al. Rosellinia necatrix infection induces differential gene expression between tolerant and susceptible avocado rootstocks . PLOS ONE 14 , e0212359 ( 2019 ). doi: 10.1371/journal.pone.0212359 OpenUrl CrossRef PubMed 48. ↵ Talavera A , Soorni A , Bombarely A , Matas AJ , Hormaza JI . Genome-Wide SNP discovery and genomic characterization in avocado ( Persea americana Mill .). Scientific Reports 9 , 20137 ( 2019 ). doi: 10.1038/s41598-019-56526-4 OpenUrl CrossRef PubMed 49. ↵ Kuhn DN , Groh A , Rahaman J , Freeman B , Arpaia ML , Van den Berg N , et al. Creation of an avocado unambiguous genotype SNP database for germplasm curation and as an aid to breeders . Tree Genetics and Genomes 15 , 71 ( 2019 ). doi: 10.1007/s11295-019-1374-1 OpenUrl CrossRef 50. ↵ Li Z , Parris S , Saski CA . A simple plant high-molecular-weight DNA extraction method suitable for single-molecule technologies . Plant Methods 16 , 38 ( 2020 ). doi: 10.1186/s13007-020-00579-4 OpenUrl CrossRef PubMed 51. ↵ Andrews S. FastQC: a quality control tool for high throughput sequence data . Babraham Institute ( 2010 ). Available from: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ . 52. ↵ Ewels P , Magnusson M , Lundin S , Käller M . MultiQC: summarize analysis results for multiple tools and samples in a single report . Bioinformatics 32 , 3047 – 3048 ( 2016 ). doi: 10.1093/bioinformatics/btw354 OpenUrl CrossRef PubMed 53. ↵ Sim SB , Corpuz RL , Simmonds TJ , Geib SM .
HiFiAdapterFilt, a memory efficient read processing pipeline, prevents occurrence of adapter sequence in PacBio HiFi reads and their negative impacts on genome assembly . BMC Genomics 23 , 157 ( 2022 ). doi: 10.1186/s12864-022-08375-1 OpenUrl CrossRef PubMed 54. ↵ Aronesty E. ea-utils: Command-line tools for processing biological sequencing data. GitHub ( 2011 ). Available from: https://expressionanalysis.github.io/ea-utils/ . 55. ↵ Marçais G , Kingsford C . A fast, lock-free approach for efficient parallel counting of occurrences of k-mers . Bioinformatics 27 , 764 – 770 ( 2011 ). doi: 10.1093/bioinformatics/btr011 OpenUrl CrossRef PubMed Web of Science 56. ↵ Cheng H , Concepcion GT , Feng X , Zhang H , Li H . Haplotype-resolved de novo assembly using phased assembly graphs with hifiasm . Nature Methods 18 , 170 – 175 ( 2021 ). doi: 10.1038/s41592-020-01056-5 OpenUrl CrossRef 57. ↵ Li H , Feng X , Chu C . The design and construction of reference pangenome graphs with minigraph . Genome Biology 21 , 265 ( 2020 ). doi: 10.1186/s13059-020-02168-z OpenUrl CrossRef PubMed 58. ↵ Zhou C , Brown M , Blaxter M , McCarthy SA , Durbin R , Darwin Tree of Life Project C. Oatk: a de novo assembly tool for complex plant organelle genomes . Genome Biology 26 , 235 ( 2025 ). doi: 10.1186/s13059-025-03676-6 OpenUrl CrossRef PubMed 59. ↵ Li H . Minimap2: pairwise alignment for nucleotide sequences . Bioinformatics 34 , 3094 – 3100 ( 2018 ). doi: 10.1093/bioinformatics/bty191 OpenUrl CrossRef PubMed 60. ↵ Danecek P , Bonfield JK , Liddle J , Marshall J , Ohan V , Pollard MO , et al. Twelve years of SAMtools and BCFtools . GigaScience 10 , giab008 ( 2021 ). doi: 10.1093/gigascience/giab008 OpenUrl CrossRef PubMed 61. ↵ Tillich M , Lehwark P , Pellizzer T , Ulbricht-Jones ES , Fischer A , Bock R , et al. GeSeq – versatile and accurate annotation of organelle genomes . Nucleic Acids Research 45 , W6 – W11 ( 2017 ). doi: 10.1093/nar/gkx391 OpenUrl CrossRef PubMed 62. 
↵ Greiner S , Lehwark P , Bock R . OrganellarGenomeDRAW (OGDRAW) version 1.3.1: expanded toolkit for the graphical visualization of organellar genomes . Nucleic Acids Research 47 , W59 – W64 ( 2019 ). doi: 10.1093/nar/gkz238 OpenUrl CrossRef 63. ↵ Camacho C , Coulouris G , Avagyan V , Ma N , Papadopoulos J , Bealer K , et al. BLAST+: architecture and applications . BMC Bioinformatics 10 , 421 ( 2009 ). doi: 10.1186/1471-2105-10-421 OpenUrl CrossRef PubMed 64. ↵ Li H . Seqtk . GitHub ( 2023 ). Available from: https://github.com/lh3/seqtk . 65. ↵ The UniProt Consortium . UniProt: the Universal Protein Knowledgebase in 2023 . Nucleic Acids Research 51 , D523 – D531 ( 2022 ). doi: 10.1093/nar/gkac1052 OpenUrl CrossRef PubMed 66. ↵ Buchfink B , Reuter K , Drost H-G . Sensitive protein alignments at tree-of-life scale using DIAMOND . Nature Methods 18 , 366 – 368 ( 2021 ). doi: 10.1038/s41592-021-01101-x OpenUrl CrossRef PubMed 67. ↵ Schoch CL , Ciufo S , Domrachev M , Hotton CL , Kannan S , Khovanskaya R , et al. NCBI Taxonomy: a comprehensive update on curation, resources and tools . Database (Oxford) 2020 , baaa062 ( 2020 ). doi: 10.1093/database/baaa062 OpenUrl CrossRef PubMed 68. ↵ NCBI Taxonomy database and accession2taxid mapping files [Online Database]. National Library of Medicine (US) ( 2025 ) [accessed: 10/06/2025 ]. Available from: ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/ . 69. ↵ Challis R , Richards E , Rajan J , Cochrane G , Blaxter M . BlobToolKit - interactive quality assessment of genome assemblies . G3 (Bethesda) 10 , 1361 – 1374 ( 2020 ). doi: 10.1534/g3.119.400908 OpenUrl Abstract / FREE Full Text 70. ↵ Mikheenko A , Prjibelski A , Saveliev V , Antipov D , Gurevich A . Versatile genome assembly evaluation with QUAST-LG . Bioinformatics 34 , i142 – i150 ( 2018 ). doi: 10.1093/bioinformatics/bty266 OpenUrl CrossRef PubMed 71. ↵ Rhie A , Walenz BP , Koren S , Phillippy AM .
Merqury: reference-free quality, completeness, and phasing assessment for genome assemblies . Genome Biology 21 , 245 ( 2020 ). doi: 10.1186/s13059-020-02134-9 OpenUrl CrossRef PubMed 72. ↵ Cabanettes F , Klopp C . D-GENIES: dot plot large genomes in an interactive, efficient and simple way . PeerJ 6 , e4958 ( 2018 ). doi: 10.7717/peerj.4958 OpenUrl CrossRef PubMed 73. ↵ Manni M , Berkeley MR , Seppey M , Simão FA , Zdobnov EM . BUSCO update: novel and streamlined workflows along with broader and deeper phylogenetic coverage for scoring of eukaryotic, prokaryotic, and viral genomes . Molecular Biology and Evolution 38 , 4647 – 4654 ( 2021 ). doi: 10.1093/molbev/msab199 OpenUrl CrossRef PubMed 74. ↵ Smith AFA , Hubley R , Green P. RepeatMasker Open-4.0. ( 2013 ). Available from: http://www.repeatmasker.org . 75. ↵ Zhang R-G , Li G-Y , Wang X-L , Dainat J , Wang Z-X , Ou S , et al. TEsorter: An accurate and fast method to classify LTR-retrotransposons in plant genomes . Horticulture Research 9 , uhac017 ( 2022 ). doi: 10.1093/hr/uhac017 OpenUrl CrossRef 76. ↵ R Core Team . R: A language and environment for statistical computing . Vienna, Austria: R Foundation for Statistical Computing ; 2021 . Available from: https://www.r-project.org/ . 77. ↵ Winter D. repeatR: Read and analyse RepeatMasker output in R. GitHub ( 2021 ). Available from: https://github.com/dwinter/repeatR . 78. ↵ Wickham H. ggplot2: elegant graphics for data analysis : Springer-Verlag New York ; 2016 . 79. ↵ Neuwirth E . RColorBrewer: ColorBrewer Palettes. ( 2022 ). Available from: https://CRAN.R-project.org/package=RColorBrewer . 80. ↵ National Center for Biotechnology Information (NCBI) . Tools for accessing data from the INSDC Sequence Read Archive (SRA). National Library of Medicine (US) ( 2024 ). Available from: https://github.com/ncbi/sra-tools . 81. ↵ Grabherr MG , Haas BJ , Yassour M , Levin JZ , Thompson DA , Amit I , et al. Full-length transcriptome assembly from RNA-Seq data without a reference genome .
Nature Biotechnology 29 , 644 - 652 ( 2011 ). doi: 10.1038/nbt.1883 OpenUrl CrossRef PubMed 82. ↵ Bray NL , Pimentel H , Melsted P , Pachter L. Near-optimal probabilistic RNA-seq quantification . Nature Biotechnology 34 , 525 - 527 ( 2016 ). doi: 10.1038/nbt.3519 OpenUrl CrossRef PubMed 83. ↵ Dobin A , Davis CA , Schlesinger F , Drenkow J , Zaleski C , Jha S , et al. STAR: ultrafast universal RNA-seq aligner . Bioinformatics 29 , 15 – 21 ( 2012 ). doi: 10.1093/bioinformatics/bts635 OpenUrl CrossRef PubMed Web of Science 84. ↵ Gabriel L , Brùna T , Hoff KJ , Ebel M , Lomsadze A , Borodovsky M , et al. BRAKER3: Fully automated genome annotation using RNA-seq and protein evidence with GeneMark-ETP, AUGUSTUS, and TSEBRA . Genome Res 34 , 769 - 777 ( 2024 ). doi: 10.1101/gr.278090.123 OpenUrl Abstract / FREE Full Text 85. ↵ Kuznetsov D , Tegenfeldt F , Manni M , Seppey M , Berkeley M , Kriventseva Evgenia V , et al. OrthoDB v11: annotation of orthologs in the widest sampling of organismal diversity . Nucleic Acids Research 51 , D445 - D451 ( 2022 ). doi: 10.1093/nar/gkac998 OpenUrl CrossRef 86. ↵ Stiehler F , Steinborn M , Scholz S , Dey D , Weber APM , Denton AK. Helixer: cross-species gene annotation of large eukaryotic genomes using deep learning . Bioinformatics 36 , 5291 - 5298 ( 2020 ). doi: 10.1093/bioinformatics/btaa1044 OpenUrl CrossRef 87. ↵ Lapalu N. InGenAnnot: Inspection of Gene Annotation. GitLab repository ( 2025 ). Available from: https://forgemia.inra.fr/bioger/ingenannot . 88. ↵ Kovaka S , Zimin AV , Pertea GM , Razaghi R , Salzberg SL , Pertea M. Transcriptome assembly from long-read RNA-seq alignments with StringTie2 . Genome Biology 20 , 278 ( 2019 ). doi: 10.1186/s13059-019-1910-1 OpenUrl CrossRef PubMed 89. ↵ Li H. Protein-to-genome alignment with miniprot . Bioinformatics 39 , btad014 ( 2023 ). doi: 10.1093/bioinformatics/btad014 OpenUrl CrossRef 90. ↵ Dainat J-S. AGAT: Another Gff Analysis Toolkit to handle annotations in any GTF/GFF format . 
Zenodo ( 2019 ). Available from: https://github.com/NBISweden/AGAT . 91. ↵ Sommer Markus J , Zimin Aleksey V , Salzberg Steven L . PSAURON: a tool for assessing protein annotation across a broad range of species . NAR Genomics and Bioinformatics 7 , lqae189 ( 2025 ). doi: 10.1093/nargab/lqae189 OpenUrl CrossRef 92. ↵ Burgos VG-C. Detection of Transposable Elements (TEs) on Gene Annotations (DeTEnGA). GitHub, Inc.; 2025 . Available from: https://github.com/victorgcb1987/DeTEnGA . 93. ↵ Hart AJ , Ginzburg S , Xu M , Fisher CR , Rahmatpour N , Mitton JB , et al. EnTAP: Bringing faster and smarter functional annotation to non-model eukaryotic transcriptomes . Molecular Ecology Resources 20 , 591 – 604 ( 2020 ). doi: 10.1111/1755-0998.13106 OpenUrl CrossRef 94. ↵ Jones P , Binns D , Chang H-Y , Fraser M , Li W , McAnulla C , et al. InterProScan 5: genome-scale protein function classification . Bioinformatics 30 , 1236 - 1240 ( 2014 ). doi: 10.1093/bioinformatics/btu031 OpenUrl CrossRef PubMed Web of Science 95. ↵ Hickey G , Monlong J , Ebler J , Novak AM , Eizenga JM , Gao Y , et al. Pangenome graph construction from genome alignments with Minigraph-Cactus . Nature Biotechnology 42 , 663 - 673 ( 2024 ). doi: 10.1038/s41587-023-01793-w OpenUrl CrossRef PubMed 96. ↵ Parmigiani L , Garrison E , Stoye J , Marschall T , Doerr D. Panacus: fast and exact pangenome growth and core size estimation . Bioinformatics 40 , btae720 ( 2024 ). doi: 10.1093/bioinformatics/btae720 OpenUrl CrossRef PubMed 97. ↵ Lovell JT , Sreedasyam A , Schranz ME , Wilson M , Carlson JW , Harkess A , et al. GENESPACE tracks regions of interest and gene copy number variation across multiple genomes . eLife 11 , e78526 ( 2022 ). doi: 10.7554/eLife.78526 OpenUrl CrossRef PubMed 98. ↵ Emms DM , Kelly S. OrthoFinder: phylogenetic orthology inference for comparative genomics . Genome Biology 20 , 238 ( 2019 ). doi: 10.1186/s13059-019-1832-y OpenUrl CrossRef PubMed 99. ↵ Alexa A , Rahnenfuhrer J.
topGO: enrichment analysis for gene ontology. ( 2024 ). Available from: https://bioconductor.org/packages/topGO . 100. ↵ GO taxon constraints (go_taxon_constraints.owl) [Online Database]. Gene Ontology ( 2025 ) [accessed: 2025/08/14 ]. Available from: https://release.geneontology.org/2025-07-22/ontology/imports . 101. ↵ Yu G. Thirteen years of clusterProfiler . Innovation (Camb) 5 , 100722 ( 2024 ). doi: 10.1016/j.xinn.2024.100722 OpenUrl CrossRef PubMed 102. ↵ Hickey G , Paten B , Earl D , Zerbino D , Haussler D. HAL: a hierarchical format for storing and analyzing multiple genome alignments . Bioinformatics 29 , 1341 - 1342 ( 2013 ). doi: 10.1093/bioinformatics/btt128 OpenUrl CrossRef PubMed Web of Science 103. ↵ Quinlan AR , Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features . Bioinformatics 26 , 841 - 842 ( 2010 ). doi: 10.1093/bioinformatics/btq033 OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted October 28, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following The avocado pangenome reveals dynamic clustering and lineage-specific diversity of NLR genes Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions.
Share The avocado pangenome reveals dynamic clustering and lineage-specific diversity of NLR genes Robert Backer , Alicia Clarke , Alicia Vermeulen , Aureliano Bombarely , Noëlani van den Berg bioRxiv 2025.10.28.684993; doi: https://doi.org/10.1101/2025.10.28.684993 Share This Article: Copy Citation Tools The avocado pangenome reveals dynamic clustering and lineage-specific diversity of NLR genes Robert Backer , Alicia Clarke , Alicia Vermeulen , Aureliano Bombarely , Noëlani van den Berg bioRxiv 2025.10.28.684993; doi: https://doi.org/10.1101/2025.10.28.684993 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Plant Biology Subject Areas All Articles Animal Behavior and Cognition (7636) Biochemistry (17704) Bioengineering (13898) Bioinformatics (41967) Biophysics (21460) Cancer Biology (18599) Cell Biology (25525) Clinical Trials (138) Developmental Biology (13384) Ecology (19909) Epidemiology (2067) Evolutionary Biology (24326) Genetics (15613) Genomics (22512) Immunology (17740) Microbiology (40423) Molecular Biology (17191) Neuroscience (88645) Paleontology (667) Pathology (2835) Pharmacology and Toxicology (4825) Physiology (7646) Plant Biology (15158) Scientific Communication and Education (2046) Synthetic Biology (4302) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00