Full text
95,900 characters
· extracted from
preprint-html
· click to expand
High-Fidelity Long-Read Sequencing of an Avian Herpesvirus Reveals Extensive Intrapopulation Diversity in Tandem Repeat Regions | bioRxiv
Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results High-Fidelity Long-Read Sequencing of an Avian Herpesvirus Reveals Extensive Intrapopulation Diversity in Tandem Repeat Regions View ORCID Profile Alejandro Ortigas-Vasquez , Christopher D. Bowen , Daniel W. Renner , Susan J. Baigent , Yaoyao Zhang , Yongxiu Yao , Venugopal Nair , David A. Kennedy , View ORCID Profile Moriah L. 
Szpara doi: https://doi.org/10.1101/2025.02.10.637388 Alejandro Ortigas-Vasquez 1 Departments of Biology, Center for Infectious Disease Dynamics and the Huck Institutes of the Life Sciences, Pennsylvania State University, University Park , Pennsylvania 16802, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alejandro Ortigas-Vasquez Christopher D. Bowen 1 Departments of Biology, Center for Infectious Disease Dynamics and the Huck Institutes of the Life Sciences, Pennsylvania State University, University Park , Pennsylvania 16802, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Daniel W. Renner 1 Departments of Biology, Center for Infectious Disease Dynamics and the Huck Institutes of the Life Sciences, Pennsylvania State University, University Park , Pennsylvania 16802, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Susan J. Baigent 3 Viral Oncogenesis Group, The Pirbright Institute , Woking, UK , GU24 0NF Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yaoyao Zhang 3 Viral Oncogenesis Group, The Pirbright Institute , Woking, UK , GU24 0NF Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yongxiu Yao 3 Viral Oncogenesis Group, The Pirbright Institute , Woking, UK , GU24 0NF Find this author on Google Scholar Find this author on PubMed Search for this author on this site Venugopal Nair 3 Viral Oncogenesis Group, The Pirbright Institute , Woking, UK , GU24 0NF Find this author on Google Scholar Find this author on PubMed Search for this author on this site David A. 
Kennedy 1 Departments of Biology, Center for Infectious Disease Dynamics and the Huck Institutes of the Life Sciences, Pennsylvania State University, University Park , Pennsylvania 16802, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Moriah L. Szpara 1 Departments of Biology, Center for Infectious Disease Dynamics and the Huck Institutes of the Life Sciences, Pennsylvania State University, University Park , Pennsylvania 16802, USA 2 Biochemistry and Molecular Biology, Center for Infectious Disease Dynamics and the Huck Institutes of the Life Sciences, Pennsylvania State University, University Park , Pennsylvania 16802, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Moriah L. Szpara For correspondence: moriah{at}psu.edu Abstract Full Text Info/History Metrics Preview PDF Summary Comparative genomic studies of Marek’s disease virus (MDV) have suggested that attenuated and virulent strains share >98% sequence identity. However, these estimates fail to account for variation in regions of the MDV genome harboring tandem repeats. To resolve these loci and enable assessments of intrapopulation diversity, we used a PacBio Sequel II platform to sequence MDV strains CVI988/Rispens (attenuated), HPRS-B14 (virulent), Md5 (very virulent) and 675A (very virulent plus). This approach enabled us to identify patterns of variation in tandem repeat regions that are consistent with known phenotypic differences across these strains. We also found CVI988/Rispens variants showing a 4.3-kb deletion in the Unique Short (US) region, resulting in the loss of five genes. These findings support a potential link between MDV tandem repeats and phenotypic traits like virulence and attenuation, and demonstrate that DNA viruses can harbor high levels of intrapopulation diversity in tandem repeat regions. 
Introduction Marek’s disease virus (MDV) is an alphaherpesvirus that causes a lymphoproliferative and demyelinating disease in poultry ( Mardivirus gallidalpha 2 , Genus Mardivirus ; Family Herpesviridae ) ( 1 ). Since the first description of Marek’s disease (MD) in 1907, MDV has undergone three major shifts in virulence ( 2 – 4 ). As a result, currently circulating strains of MDV have been classified into four main groups, or “pathotypes”, depending on their ability to bypass the protection induced by commercial vaccines: mild (m), virulent (v), very virulent (vv) or very virulent plus (vv+) ( 5 ). The current “gold standard” commercial vaccine against MDV is a live-attenuated Mardivirus gallidalpha 2 vaccine known as CVI988 or Rispens. This vaccine was produced through serial passage in duck-embryo fibroblast cells, and has successfully protected commercial flocks for the past three decades ( 6 , 7 ). However, a number of studies have recently reported the emergence of MDV strains capable of overcoming the protection conferred by the CVI988/Rispens vaccine ( 8 – 10 ). To facilitate the development of novel vaccines that can protect commercial flocks against future outbreaks, there is a pressing need to understand the molecular basis of MDV attenuation and the factors contributing to the overall virulence of MDV strains ( 11 , 12 ). In addition, a better understanding of MDV vaccines and circulating strains could help to inform the development of vaccine candidates for human alphaherpesviruses such as herpes simplex virus (HSV) and varicella-zoster virus (VZV). The MDV genome is 170-181-kb in length and shows a structure typical of alphaherpesviruses, which consists of a Unique Long (UL) and a Unique Short (US) region (∼115-kb and ∼12-kb in length, respectively) that are each flanked by inverted structural repeat regions. 
Comparative analyses of MDV consensus genomes performed in the last two decades have suggested that the CVI988/Rispens vaccine is antigenically and genetically >98% identical to virulent strains ( 13 ). However, these estimates fail to account for variation in 5 MDV genomic features associated with long stretches of tandem repeats, including: the MDV006.5/MDV075.2 transcripts, the proline-rich region of MDV049/UL36 (UL36-PRR), the multiple telomeric repeats (mTMR) region of the a -like sequence, the promoter region of the latency-associated transcript (LAT), and the proline-rich region of the meq oncogene (Meq-PRR) ( 14 ). Most of these features occur in the structural repeat regions flanking the UL and US regions, giving rise to 9 distinct genomic locations that harbor tandem repeats ( Figure 1D ). While available sequencing data suggests that MDV tandem repeats can be highly polymorphic, they have been excluded from most comparative genomic studies due to the difficulty of resolving them using Illumina-based approaches ( 15 – 18 ). The few studies that have attempted to resolve MDV tandem repeats relied mostly on a combination of PCR and Sanger sequencing, but such an approach is low-throughput, cost ineffective, and cannot reliably resolve repeats >800-bp in length ( 19 ). As a result, these regions have remained relatively understudied compared to the rest of the MDV genome, and it is currently unknown whether and/or how they correlate with attenuation or virulence. Download figure Open in new tab Figure 1: Using PacBio HiFi reads to resolve tandem repeats in four MDV strains and assess their intrapopulation diversity A) Four MDV strains with known phenotypic differences (att = attenuated, v = virulent, vv = very virulent, vv+ = very virulent plus) were sequenced using a PacBio Sequel II platform. 
Histograms (left) show the cumulative length distribution of quality-filtered, MDV-specific PacBio HiFi reads used to generate viral consensus genomes for each strain. The mean read length is displayed at the top right corner of each histogram. Coverage graph (right) shows the number of PacBio HiFi reads overlapping each position of the strain-matched de novo -assembled Illumina reference genome for each strain. B) To resolve tandem repeats, only reads >5-kb in length that fully contained the repeated motifs and extended at least 100-bp into non-repetitive sequences at either side were considered. Reads ending or starting with repeated motifs, as well as reads that only contained repetitive sequence, were excluded regardless of length. C) Tandem repeats can be classified based on the complexity of their repetitive patterns. Simple repeats (top, circle symbol) consist of perfect reiterations of a single repeating unit (e.g., CAG). Compound repeats (middle, triangle symbols) involve an alternative version of the repeating unit or interruptions in the repeat sequence by another motif. Complex repeats (bottom, star symbol) involve multiple alternative repeat versions and/or the presence of multiple repeating units in a variety of different patterns. D) The MDV genome consists of two large unique segments (Unique Long = UL, Unique Short = US), each flanked by inverted structural repeats (Terminal/Internal Repeat Long = TRL/IRL, or Short = IRS/TRS). A total of 5 genomic features are associated with tandem repeats, including: the MDV006.5/MDV075.2 transcripts, the proline-rich region of MDV049/UL36 (UL36-PRR), the multiple telomeric repeats (mTMR) region of the a -like sequence, the promoter region of the latency-associated transcript (LAT), and the proline-rich region of the meq oncogene (Meq-PRR). These features give rise to 9 distinct genomic locations in the MDV genome, indicated with arrow heads. 
For each genomic feature, a graphic representation of its characteristic repetitive patterns is shown (additional details in Figures 2 - 6 ). Symbol shapes indicate the complexity of the observed patterns (circle = simple, triangle = compound, star = complex). In recent years, the development of sequencing technologies capable of generating longer reads than those produced by Illumina platforms has made it increasingly practical for comparative genomic studies to account for variation in tandem repeats ( 20 , 21 ). Sequencing platforms relying on Single-Molecule Real-Time (SMRT) and Nanopore technologies have already been successfully used to resolve tandem repeats >800-bp in length ( 22 , 23 ). However, early iterations of long-read sequencing technologies suffered from relatively high error rates (>25%), making them unsuitable for the detection of viral genomic variants present at low frequencies (i.e., minor variants) ( 24 ). More recently, Pacific Biosciences (PacBio) developed a high-fidelity (HiFi) long-read sequencing method known as circular consensus sequencing (CCS), which can generate reads 5-25 kb in length with error rates comparable to Illumina platforms ( 25 ). In this study, we sought to take advantage of the length and accuracy of PacBio HiFi reads to resolve tandem repeat regions in the MDV genome and perform assessments of intrapopulation diversity at these loci. We used a PacBio Sequel II platform to sequence cultured viral stocks of MDV strains CVI988/Rispens (attenuated), HPRS-B14 (v), Md5 (vv) and 675A (vv+). PacBio HiFi reads were then mapped to a strain-matched, de novo -assembled Illumina reference genome and visualized in a genome browser to facilitate manual curation and variant calling. Using this approach, we identified several patterns of tandem repeat variation potentially associated with known phenotypic differences across these strains. 
The use of PacBio HiFi reads also enabled us to identify three distinct repeating units in the Meq-PRR, which differs from past descriptions of this locus. These three repeating units can account for structural differences across all Meq “isoforms”, including a new variant of Meq identified as part of this study. We also detected CVI988/Rispens genomes containing a 4.3-kb deletion in the US region, which resulted in the loss of 5 genes. These findings showcase the ability of PacBio HiFi reads to accurately resolve tandem repeats in herpesvirus genomes, and support a potential role of MDV tandem repeat loci in virulence and attenuation. Methods Cells, virus master stocks, and virus working stocks Primary chick embryo fibroblasts (CEF) were prepared from 10-day old embryos and maintained in M199 medium (Thermo Fisher Scientific, Waltham, MA, USA) supplemented with 5% fetal bovine serum (FBS, Sigma-Aldrich, Darmstadt, Germany), 100 units/mL of penicillin and streptomycin (Thermo Fisher Scientific), and 10% tryptose phosphate broth (Sigma). Virus master stocks for MDV strains HPRS-B14, 675A, and Md5 (all 7th duck embryo fibroblast passage stocks) were kindly provided by Dr. A. M. Fadly (Avian Disease and Oncology Laboratory, USA). To amplify these stocks, 5-day-old Rhode Island Red chickens were inoculated intra-abdominally with 1000 plaque forming units (pfu) of virus. Lymphocytes isolated from spleens harvested at 14 days post-infection were cultured with CEF, the cell-associated virus was harvested at 7 days when cytopathic effect was clearly visible, and then further passaged in CEF every 5 to 6 days to produce working stocks of virus. A commercial CVI988/Rispens vaccine was sourced in the UK (Poulvac Marek CVI vaccine; Zoetis). Virus cultures and DNA isolation for Illumina sequencing of MDV strains HPRS-B14 and 675A The MDV working stocks we refer to as “HPRS-B14-Illumina” and “675A-Illumina” were derived from the master stocks as described above. 
DNA was prepared from approximately 5 × 10^6 cells of these working stocks (5th passage CEF stock for HPRS-B14, and 4th passage CEF stock for 675A), using the DNeasy-96 kit (Qiagen, Hilden, Germany), according to the manufacturer’s instructions, and eluted in DNase-free water. Virus cultures and DNA isolation for PacBio HiFi sequencing The MDV working stock we refer to as “CVI988-HiFi” was obtained from the commercial CVI988/Rispens vaccine following 2 passages in CEF with an infection level of 0.01 pfu/cell. The MDV working stocks for HPRS-B14 and 675A, as well as the MDV stock we refer to as “Md5-HiFi”, were derived from the master stocks as described above. DNA was prepared from approximately 5 × 10^6 cells of these working stocks (2nd passage CEF stock for CVI988/Rispens, 9th passage CEF stock of Md5, 5th passage CEF stock for HPRS-B14, and 4th passage CEF stock for 675A), using the DNeasy-96 kit as described above. DNA library preparation and Illumina sequencing DNA libraries for MDV strains HPRS-B14 and 675A were prepared for next generation sequencing using an Illumina Miseq platform as previously described ( 26 – 28 ). Briefly, extracted DNA from MDV cultures was quantified for total DNA by Qubit (Thermo Fisher) and total viral DNA by a qPCR assay targeting viral gene pp38 ( 29 ). Total DNA was then acoustically sheared using a Covaris M220, with settings at 60s duration, peak power 50, 10% duty cycle, at 4 °C. Sheared DNA fragments were processed for library preparation using the KAPA HyperPrep Library Amplification kit (KAPA Biosystems) according to the manufacturer’s specifications. Custom MDV-specific oligonucleotide primers (Arbor Biosciences) were used to enrich for virus-specific material ( 30 ). Enriched libraries were amplified by PCR (14 cycles) using the KAPA HiFi HotStart Library Amplification Kit (KAPA Biosystems). 
Finally, libraries were quantified again by Qubit and qPCR as well as a KAPA-specific qPCR (KAPA Biosystems) and Tapestation (Agilent Technologies). Libraries were then sequenced using 300 x 300 bp paired end reads v3 chemistry on an Illumina Miseq (Illumina). DNA library preparation and PacBio HiFi sequencing The SMRTbell Prep Kit 3.0 was used to prepare indexed templates according to the manufacturer’s protocol (Pacific Biosciences of California, Inc.). DNA concentration was measured by Qubit and MDV genome copy number was quantified by qPCR, as above. Templates were pooled and subsequently bound to polymerase using the Sequel II Binding Kit 2.0 (Pacific Biosciences of California, Inc.). Templates were sequenced with a 30-hour movie time on a Sequel IIe using a Sequel II Sequencing Plate 2.0 (Pacific Biosciences of California, Inc.). Processing of Illumina reads and genome assembly MDV-specific Illumina reads were identified using Kraken2 with default settings and extracted into a separate file using a custom Python script ( 31 ). The extracted reads were then subjected to the quality control and preprocessing step (Step 1) of our published viral genome assembly (VirGA) workflow, which performs adaptor trimming and trimming of low-quality bases ( 32 ). These properly paired reads were then used for de novo assembly using metaSPAdes v3.14.0 ( 33 ). The resulting file containing the metaSPAdes scaffolds in FASTA format served as input for the remaining steps of VirGA (Steps 3-4), which include genome linearization, annotation and post-assembly quality assessments. Processing of PacBio HiFi reads and genome assembly MDV-specific PacBio HiFi reads were identified using Kraken2 with default settings and extracted into a separate file using a custom Python script. For each strain, PacBio HiFi reads were mapped to a strain-matched, full-length Illumina reference using Minimap2 v2.28 ( 34 ). 
Reads with mapping quality of zero (MAPQ = 0) and supplementary alignments were removed using Samtools v1.16. Mapped reads were visualized without clipping using the Integrative Genomics Viewer (IGV) v2.12.3 ( 35 ). In non-repetitive regions, variant calling was conducted by identifying positions with at least 2x coverage and >50% disagreement between mapped reads and the reference sequence. Illumina templates of each strain were then manually modified using Geneious Prime v2024.0.5 to generate PacBio HiFi consensus genomes ( 36 ). To resolve repetitive loci, reads of length >5-kb that fully contained the corresponding repeat cluster and also extended partially (>100-bp) into non-repetitive regions upstream and downstream were aligned using MAFFT v7.505 and imported into Geneious Prime for manual re-alignment ( 37 ). The “Generate Consensus Sequence” tool in Geneious Prime was then used to obtain the consensus sequence for each repetitive locus. To account for low read coverage in TRL and TRS, the reverse complements of the PacBio HiFi-resolved IRL and IRS regions were used. The final full-length genomes were deposited into GenBank under accession numbers: PV035744 , PV035745 , PV035746 and PV035747 . Whole-genome pairwise identity and genetic distance comparisons Full-length PacBio HiFi consensus genomes of all four strains (n = 4) and previously published MDV consensus genomes with PubMed Identifiers (n = 39, see Supplementary Table 3 for list of GenBank accessions) were aligned using MAFFT. Gaps were masked using the “Mask Alignment” tool in Geneious Prime. A neighbor-joining (NJ) tree with gaps excluded was constructed using the Geneious Tree Builder tool with a Tamura-Nei genetic distance model and no outgroup. The genome alignment file is deposited at doi:10.26207/e0fh-0c45. 
MDV049/UL36 proline-rich region pairwise identity and genetic distance comparisons The amino-acid sequence of the MDV049/UL36 proline-rich region (UL36-PRR) for the four strains sequenced using PacBio HiFi reads (n = 4) and for MDV strains with previously published consensus genomes with PubMed Identifiers (n = 39, Supplementary Table 3 ) was extracted and aligned using the “Geneious Alignment” tool in Geneious Prime. The resulting alignment was visually inspected and manually re-aligned. Strains exhibiting stretches of Ns in the UL36-PRR or completely lacking this region were excluded (n = 3). The remaining sequences (n = 40) were used to generate a UPGMA tree using a Jukes-Cantor genetic distance model. Results Whole-genome sequencing of four MDV strains using high-fidelity long reads After quality control and filtering steps, the total number of MDV-specific PacBio HiFi reads for all four strains ranged from 245 to 17,548 ( Table 1 ). Read lengths ranged from 0.1-14 kb and averaged ∼7 kb in each of the four sequenced samples ( Figure 1A , left ). To generate viral consensus genomes, PacBio HiFi reads were mapped to a strain-matched Illumina reference genome ( Figure 1A , right ). For MDV strains CVI988/Rispens and Md5, a previously published consensus genome of the same strain was used as the reference (“CVI988-UK” and “Md5-UK”, respectively, see Supplementary Table 3 for GenBank accessions). The lack of published consensus genomes of MDV strains HPRS-B14 and 675A was addressed by generating new Illumina consensus genomes to serve as reference for each of these two strains (see Methods ). View this table: View inline View popup Download powerpoint Table 1: Sequencing metrics for MDV strains sequenced using PacBio HiFi The resulting alignment was then visually inspected to identify disagreements between the mapped PacBio HiFi reads and the Illumina reference sequence. 
As part of this process, we noticed that a high proportion (>50%) of reads mapping to tandem repeat loci contained large insertions and/or deletions (indels) relative to the reference. For tandem repeats involving a single repeating unit, indel lengths were found to correlate with discrete changes in repeat copy numbers (i.e., repeat expansions/contractions). Most reads contained indels of lengths that were exact multiples of the repeat period size. A minority of reads contained indels that were 1-3 bp away from a length that was exactly divisible by the period size. These small deviations were found to exclusively result from changes in homopolymer lengths, and were highly associated with misaligned reads. For tandem repeats involving multiple repeating units (i.e., complex repeats), motif compositions could not be inferred from indel sizes alone. To account for this discrepancy, reads of length >5-kb that fully contained the corresponding repetitive region and also extended partially (>100-bp) into non-repetitive regions upstream and downstream were extracted and manually re-aligned ( Figure 1B , Supplementary Table 1 ). For each strain, final consensus genomes were generated by manually modifying each initial reference genome to reflect and encompass all sites where >50% of the mapped PacBio HiFi reads differed from the reference. Phylogenomic analyses based on a multiple-sequence alignment with gaps excluded of the 4 newly assembled genomes alongside 39 previously published consensus genomes of MDV revealed that strain 675A is closest genetically to 648A, another vv+ strain isolated in North America. The nearest neighbor to strain HPRS-B14 (v), although still on a distinct branch, was strain CU-2 (mild), a low virulence strain ( Supplementary Figure 1 ). The new PacBio HiFi genomes of strains CVI988/Rispens (CVI988-HiFi) and Md5 (Md5-HiFi) clustered near prior sequences of the same strains. 
To account for tandem repeat variation in the four MDV strains sequenced using PacBio HiFi, we first characterized the observed repetitive patterns (i.e., number of repeating units involved, presence of alternative and/or interrupting motifs, etc.) ( Figure 1C ). We then investigated the underlying patterns of intrapopulation diversity found at each MDV tandem repeat locus in these strains ( Figure 1D ). MDV strains CVI988-HiFi and 675A exhibit intrapopulation diversity in the 132-bp repeats overlapping the MDV006.5/MDV075.2 transcripts The MDV006.5/MDV075.2 transcripts are located in Terminal Repeat Long (TRL) and Internal Repeat Long (IRL), respectively. While their function is still unknown, these transcripts have been shown to be variable in length due to partially overlapping a set of 132-bp tandem repeats ( 38 , 39 ). To date, two versions of the 132-bp repeating unit have been identified ( 14 ). The most common version contains a cytosine (based on MDV075.2) in position 67, and is typically present regardless of pathotype. However, some very virulent (vv) and very virulent plus (vv+) strains have been shown to contain an alternative version with a thymine at this position. Both nucleotides encode alanine (i.e., synonymous) in the resulting protein ( Figure 2A ). We refer to these repeated motifs as 132A (cytosine) and 132B (thymine). At the consensus level, the attenuated vaccine strain CVI988-HiFi was found to contain 10 copies of the 132A repeat ( Figure 2B ). Strains HPRS-B14 (v) and Md5 (vv) each contained 2 copies of the 132A repeat, while strain 675A (vv+) contained one 132A repeat followed by a 132B repeat. For CVI988-HiFi, a total of 684 PacBio HiFi reads were found to completely contain the 132-bp repeats and extend into the Unique Long (UL) region, enabling comparisons between MDV006.5 (n = 355) and MDV075.2 (n = 329) 132-bp repeat copy numbers. 
A total of 352 reads (99.2%) mapping to MDV006.5 and 325 reads mapping to MDV075.2 (98.8%) showed copy number variation relative to the Illumina reference. CVI988-HiFi reads mapping to MDV006.5 contained 2-37 copies of the 132A repeat, while reads mapping to MDV075.2 contained 3-31 copies ( Figure 2C ). For both loci, the most frequent (i.e., consensus) copy number was 10, which was represented by 40 reads (11.3%) in MDV006.5 and 30 reads (9.1%) in MDV075.2. We found no sequencing reads of CVI988-HiFi containing 132B repeats. Additionally, no reads with partial copies of the repeating unit were identified. A Mann-Whitney U test showed a significant difference in 132-bp repeat copy numbers between MDV006.5 and MDV075.2 (U ( 1 ) = 53129, p = 0.0408). Out of the three virulent strains, only vv+ strain 675A exhibited reads with copy number variation relative to the Illumina reference, albeit at lower proportions (MDV006.5 = 2 out of 7 (28.6%) reads, MDV075.2 = 6 out of 27 (22.6%) reads). PacBio HiFi reads of MDV strain 675A mapping to both loci exhibited copy numbers ranging from 2-3, but always contained at least one copy of the 132B repeat ( Figure 2D ). The majority genotype of strain 675A consisted of one 132A repeat followed by one 132B repeat (MDV006.5 = 62.5%, MDV075.2 = 77.2%), and was also shared across both loci. Conversely, a genotype consisting of two copies of the 132B repeat and no 132A repeats was only found in reads mapping to MDV006.5 (14.3%). Similarly, a genotype consisting of one 132A repeat followed by two 132B repeats was only found in reads mapping to MDV075.2 (3.9%). Additionally, we found no reads with more than two copies of the 132B repeat. 
Download figure Open in new tab Figure 2: High copy number heterogeneity and presence of an alternative motif in the MDV006.5/MDV075.2 transcripts are associated with phenotypic extremes A) The MDV006.5/MDV075.2 transcripts are found in TRL and IRL, respectively, and have been shown to vary in length due to their partial overlap with a set of 132-bp tandem repeats (light blue). Two versions of the 132-bp repeating unit are known to exist, distinguished only by a C>T mutation at nucleotide position 67 within the repeating unit (132A = light blue, 132B = fuchsia). The synonymous codons associated with this position both encode an alanine (Ala). B ) The modal copy number was used to determine the consensus sequence of the MDV006.5/MDV075.2 transcripts for CVI988-HiFi, HPRS-B14, Md5-HiFi and 675A. C ) PacBio HiFi reads of MDV strain CVI988-HiFi mapping to MDV006.5 and MDV075.2 exclusively contained the 132A version (light blue) of the repeating unit. Copy numbers ranged from 2-37 copies and 3-32 copies, respectively. For both loci, a modal copy number of 10 copies was supported by less than 12% of the total reads mapping to each location. D ) For MDV strain 675A, PacBio HiFi reads mapping to these loci showed patterns of variation involving both versions of the 132-bp repeating unit, but always including the 132B version. A total of four genotypes with repeat copy numbers ranging from 2-3 were identified. These include “AB”, “BB”, “ABB”, and “AAB”, where A indicates the 132A version of the repeat (light blue) and B indicates the 132B version (fuchsia). In both MDV006.5 and MDV075.2, the modal genotype was found to be “AB”. The proline-rich region of MDV049/UL36 is highly variable across MDV strains and involves six alternative versions of a 6-AA repeating unit The MDV049/UL36 gene encodes a large tegument protein (∼3300-AA) that contains a proline-rich region (UL36-PRR) near its carboxyl terminus. 
This locus, which constitutes ∼6% of the MDV049/UL36 coding sequence, harbors the only set of tandem repeats in the MDV genome that are found outside of the structural repeat regions ( Figure 1D ). Due to the presence of silent mutations at the nucleotide level, the repetitive patterns of the UL36-PRR are typically described in terms of their amino-acid composition ( Figure 3A ) ( 40 ). The general pattern at this locus consists of 6-AA repeated motifs interrupted at irregular intervals by 10-AA motifs. High-fidelity long-read sequencing of the four MDV strains revealed six different versions of the 6-AA repeating unit: “6A” (SPAPKP), “6B” (SPASKP), “6C” (TPAPKP), “6D” (PPAPKP), “6E” (PPASKP), and “6F” (TPASKP) ( Figure 3B ). We also found three different versions of the 10-AA interrupting motif: “10A” (KPPPDPDFKP), “10B” (KPPPAPDSKP), and “10C” (KPPPTPDSKP). At the consensus level, the versions and copy numbers of 6-AA repeats occurring between the 10-AA motifs were found to be highly variable across all four strains, with CVI988-HiFi being the only one lacking 6B-type repeats and containing 6D- and 6E-type repeats ( Figure 3B ). For CVI988-HiFi, a total of 640 reads were found to completely contain the UL36-PRR ( Supplementary Table 3 ). Of these, 638 (99.7%) showed copy number variation relative to the Illumina reference, specifically in terms of the number of 6A-type repeats occurring between the third and fourth 10-AA motifs (4-22 copies) ( Figure 3B , blue histogram). For MDV strain Md5-HiFi, only a single read (out of a total of eight) mapping to this region showed evidence of variation relative to the reference. This read was found to contain five copies of the 6C-type repeat prior to the first 10-AA motif, as opposed to the six copies of the 6C-type repeat displayed by the reference sequence. MDV strain 675A had 26 reads mapping to this region, of which a total of 3 (11.5%) showed indels relative to the reference. 
Two of these reads (7.7%) showed a large 240-bp deletion, resulting in a repetitive pattern consisting of a single 10A motif along with two sets of 6-AA repeats (i.e., Variant 1) ( Figure 3B ). The remaining read showed a 6F repeat inserted into the first set of 6-AA repeats (i.e., Variant 2). MDV strain HPRS-B14 showed no intrapopulation diversity associated with the UL36-PRR in a total of 43 reads mapping to this locus. To add context to the extent of sequence diversity we found at this locus, we examined the 39 published consensus genomes of MDV used in earlier genetic distance comparisons, which were all sequenced using either Illumina- or Sanger-based methods ( Supplementary Figure 2, Supplementary Table 3 ). Published consensus genomes of MDV strains Md5-USDA, 814 and HC/0803 were found to lack the UL36-PRR and were excluded from further analyses. Phylogenetic analysis of the amino-acid sequence of the remaining 40 proline-rich regions resulted in three distinct clusters ( Figure 3C ). Cluster 1 (pink) included mostly Eurasian strains, which were found to always contain a single 6D repeat and lack 6C repeats in their final stretch ( Supplementary Figure 2 ). Cluster 2 (blue) included mostly North American strains, which were characterized by a lack of 6D repeats and the presence of a 6C repeat in their final stretch of 6-AA repeats ( Supplementary Figure 2 ). Cluster 3 (green) included all four consensus genomes of CVI988/Rispens sequenced to date (CVI988-GenBank, CVI988-UK, CVI988-USDA, CVI988-HiFi) and six Chinese strains (CH/10, DH/1307, DH/1504, HNLC503, Hrb/1504, SY/1209) previously described as natural recombinants of CVI988/Rispens. Strains in this cluster followed the exact repetitive pattern of CVI988-HiFi, with the only differences being in the number of copies of 6A repeats ( Supplementary Figure 2 ). 
Analysis of the previously published genomes also revealed two additional versions of the 10-AA interrupting motif: “10D” (KPPPAPDFKP) and “10E” (KPPPDPDSKP) ( Supplementary Figure 2 ). Download figure Open in new tab Figure 3: The proline-rich region of MDV049/UL36 shows distinct patterns of variation in CVI988/Rispens and virulent strains of Eurasian and North American origin A) The MDV049/UL36 gene is ∼10-kb in length and encodes a large tegument protein (∼3300-AA). The N-terminus corresponds to the viral deubiquitinase (DUB) domain. A proline-rich repetitive region (UL36-PRR, dark red) is found near the C-terminus. B) The proline-rich region of MDV049/UL36 exhibits complex repetitive patterns that are typically described at the amino-acid level. The legend depicts the repeating units that occur at this locus, which include six different versions of a 6-AA tandem repeat (named 6A-6F) and three different versions of a 10-AA non-tandem repeat (named 10A-C). The consensus sequence of the MDV049/UL36 proline-rich region is shown for CVI988-HiFi, HPRS-B14, Md5-HiFi, and 675A, with the number of reads for each shown at the right. Strains showing intrapopulation diversity at this locus (bold) included CVI988-HiFi and 675A. CVI988-HiFi variants differed only in terms of the number of copies of 6A repeats in a single segment (bracket and histogram), with copy numbers ranging from 4-22 copies and a modal copy number of 9 copies. PacBio HiFi reads of MDV strain 675A contained 2 minor variants, which either lacked a large segment of repeats (Variant 1, 7.7%) or included a single copy of a 6F repeat between a 6B and a 6C repeat in the first set of tandem repeats (Variant 2, 3.8%). C) Phylogenetic analysis of the MDV049/UL36 PRR for 40 MDV strains. These include the four strains sequenced using PacBio HiFi and 36 additional MDV strains with published consensus genomes (Supplementary Table 3). 
A dendrogram based on the PRR amino-acid sequence of the 40 strains grouped them into three distinct clusters. The lower-left inset shows a summary sequence representative of the strains within each cluster, with names color-coded in the dendrogram on the right (Cluster 1 = pink, Cluster 2 = green, Cluster 3 = blue) (see Supplementary Figure 2 for a more detailed view of PRR alignment). Sequence elements that discriminate between clusters are highlighted in the inset diagram (black boxes). Strains lacking one or more cluster-defining sequence elements are shown in black in the dendrogram (i.e., RB-1B, J-1). The geographic origin of each strain is listed to its right. Dendrogram was generated using the Geneious Tree Builder tool and calculated with UPGMA using a Jukes-Cantor model. MDV strain CVI988-HiFi variants completely lack the LAT promoter region and exhibit deletions of variable sizes in the 5’ end of LAT MDV genomes contain a family of non-protein-coding transcripts that are overexpressed during latency, collectively known as the latency-associated transcripts (LATs) ( 41 ). These transcripts are all derived from an ∼11-kb region known as the LAT gene, with the different transcripts being the product of alternative-splicing events. Due to its size, the LAT gene overlaps several MDV open-reading frames (ORFs), including the major transcriptional regulatory protein MDV084/ICP4, in the antisense direction ( 42 ). The 5’ end of LAT is associated with a set of 60-bp tandem repeats that act as the LAT promoter ( Figure 4A ). These repeats are also known as the “p53” repeats due to containing a highly-conserved p53-binding site, and at least two copies are needed for strong promoter activity ( 43 , 44 ). However, MDV strains with mild and attenuated pathotypes have been shown to sometimes lack any copies of the 60-bp repeating unit and/or contain deletions of variable lengths in the 5’ end of LAT ( Figure 4A ) ( 45 ). 
A total of 29 MDV variants with 5’LAT deletions, also known as “subtypes”, have been identified in past studies ( Supplementary Table 2 ) ( 14 , 46 ). As an alternative to the prior description by Stik et al., a new characterization of the 60-bp repeated motif revealed that these deletions impact the first 32-1,400 nucleotides immediately downstream from the point of overlap with LAT, starting with nucleotide +37 relative to the LAT transcription start site (TSS) proposed by Strassheim et al. ( Figure 4B ). Larger deletions (>683-bp) start to impact a cluster of miRNAs encoded by LAT, which have been shown to play a role in MDV pathogenesis ( 47 , 48 ). PacBio HiFi reads of HPRS-B14, Md5-HiFi, and 675A mapping to the 60-bp repeats located near the IRL/IRS junction (HPRS-B14 = 13 reads, Md5-HiFi = 5 reads, 675A = 5 reads; Supplementary Table 1 ) all showed a range of different 60-bp repeat copy numbers (HPRS-B14 = 6-58 copies, Md5-HiFi = 2-26 copies, 675A = 9-53 copies) ( Figure 4B ). Since no two reads were found to contain the same number of copies of the 60-bp repeating unit, the median copy number was used to determine the consensus sequence for each of these three strains (HPRS-B14 = 18 copies, Md5-HiFi = 17 copies, 675A = 22 copies) ( Figure 4C ). A Kruskal-Wallis H test showed no significant differences in 60-bp repeat copy numbers (H ( 2 ) = 0.4947, p = 0.7809). For CVI988-HiFi, a total of 201 reads were found mapping to the LAT promoter region, of which 172 (85.6%) completely lacked the 60-bp repeats ( Figure 4D, 4E ). Among reads lacking repeats, 83 (48.3%) exhibited a 124-bp deletion past the point of overlap, corresponding to the A1 subtype ( Figure 4F , Supplementary Table 2 ). We also found reads exhibiting the α subtype (5.2%), the β subtype (8.7%), the A subtype (6.4%), the A2 subtype (7%), and the B1 subtype (1.2%). 
A subset of reads (34.3%) showed deletions extending past the point of overlap that did not correspond with any known subtypes, including reads with a 203-bp deletion (11%), a 182-bp deletion (2.9%), a 139-bp deletion (1.7%), a 129-bp deletion (3.5%), a 118-bp deletion (2.3%), and an 82-bp deletion (1.7 %) ( Figure 4F , Supplementary Table 2 ). A small minority of CVI988-HiFi reads (14.4%) did contain copies of the 60-bp repeating unit ( Figure 4E ). For these reads, 60-bp repeat copy numbers ranged from 0-25 copies (median = 9). None of the more virulent MDV strains (HPRS-B14, Md5-HiFi, 675A) showed any signs of LAT deletions in the sequencing reads available. Download figure Open in new tab Figure 4: The absence of repeats in the LAT promoter region of MDV is associated with deletions of varying sizes in 5’ end of LAT A) The promoter region of the MDV LAT gene consists of 60-bp simple repeats (rose, top), with at least two copies being required for promoter activity. An earlier study by Strassheim et al. proposed that the LAT transcription start site (TSS) is found inside the last 60-bp repeat ( 48 ). However, published genomes of attenuated and mild strains of MDV can lack the 60-bp repeats and exhibit deletions of variable lengths in the 5’ end of LAT (i.e., 5’LAT-deleted “subtypes”, bottom). Larger deletions can also impact a cluster of miRNAs (miR-M8-M10, blue). Black arrows indicate the 3’ end of the 5’LAT-deleted subtypes found in the CVI988-HiFi sample. B) As part of this study, we propose a new characterization of the 60-bp repeated motif (top, rose). Under this model, 5’LAT deletions impact the first 32-1,400 nucleotides immediately downstream from the region of overlap (black font, bold), starting at nucleotide +37 relative to the TSS proposed by Strassheim et al. In a prior description by Stik et al. 
(bottom, violet), the first 4 nucleotides impacted by 5’LAT deletions were proposed to belong to the non-repetitive region immediately upstream from the 60-bp repeats. C) More virulent strains did not show any reads with deletions in 5’LAT, instead harboring long tracts of 60-bp repeats with variable copy numbers, as shown in these box plots (HPRS-B14 = 13 reads, Md5-HiFi = 5 reads, 675A = 5 reads). No significant differences in 60-bp repeat copy numbers were found based on a Kruskal-Wallis test. D) Median 60-bp motif copy numbers were used to determine the consensus sequence for all three non-attenuated strains (HPRS-B14 = 18 copies, Md5-HiFi = 17 copies, 675A = 22 copies). The CVI988-HiFi majority variant lacked any copies of the 60-bp repeats and showed a 124-bp deletion in 5’LAT corresponding to the A1 molecular subtype. E) CVI988-HiFi reads that still contained the 60-bp repeats (∼14%, cerise) had a median copy number of 9 repeats. F) A total of twelve 5’LAT-deleted molecular subtypes were identified in the CVI988-HiFi sample, including 6 novel subtypes (indicated in bold). Only one of these subtypes (810-bp deletion, blue) impacted the cluster of miRNAs further downstream, with a total of three miRNAs (miR-M8, miR-M13, miR-M6) being impacted. The length and composition of the multiple telomeric repeats (mTMR) region of the MDV a -like sequence is highly variable across strains The MDV a -like sequence occurs at the genomic termini and the IRL/IRS junction, where it is reported to play a central role in replication, pathogenesis and tumor formation ( 49 ). The name “ a -like” is derived from early comparisons to the HSV-1 a sequence, which exhibits a similar structure and is located in the same genomic sites in the HSV-1 genome ( 50 ). However, a distinct feature of the MDV a -like sequence is the presence of two sets of 6-bp “telomeric” repeats. 
The sequence of these repeats (TTAGGG) is identical to the chicken host telomeric sequences, enabling the virus to integrate into the host telomeres via homologous recombination during latency ( 51 , 52 ). The first set of repeats, known as the short telomeric repeats (sTMR), has been observed to always contain exactly six copies of the 6-bp repeating unit ( 53 ). In contrast, the second set has been shown to be highly variable in copy number, and is known as the multiple telomeric repeats (mTMR) ( 54 ). The telomeric repeats occurring in the mTMR are also interrupted at varying intervals by a recurring 13-bp motif (GGGTTCAGGCCTA). Previous studies have described this repetitive pattern as “stretches” of 6-bp telomeric repeats interspersed with 13-bp “islands” ( 14 , 55 ). In addition to the sTMR and the mTMR, the MDV a-like sequence also contains several non-repeating motifs. These include the packaging signals pac -1 and pac -2, as well as two direct repeats (DR), which mediate cleavage of MDV genomes from concatemeric intermediates generated during rolling-circle replication ( 56 ). For each strain, reads corresponding to the majority 5’LAT molecular subtype (CVI988-HiFi = 83 reads, HPRS-B14 = 13 reads, Md5-HiFi = 5 reads, 675A = 5 reads; Supplementary Table 1 ) were manually re-aligned to optimize the mTMR alignment and visually inspected. In the sTMR, reads of CVI988-HiFi, HPRS-B14, Md5-HiFi and 675A all showed exactly six telomeric repeat copies, suggesting that this locus does not exhibit intrapopulation diversity. In the mTMR, reads of all four strains showed extensive differences in the total number of 13-bp islands and in telomeric repeat copy numbers at each stretch ( Figure 5B ). The consensus mTMR of each of the four strains was determined by first calculating the median number of islands (CVI988-HiFi = 22 islands, HPRS-B14 = 4 islands, Md5-HiFi = 19 islands, 675A = 2 islands) ( Figure 5B-E ). 
Pairwise comparisons of island copy numbers using Dunn’s test showed a significant difference between CVI988-HiFi and non-attenuated strains HPRS-B14 and 675A ( Figure 5B-E , Supplementary Table 5 ). The total number of stretches in the consensus for each strain was always one more than the number of islands, with every island flanked by two stretches. We then graphed the median copy number of telomeric repeats at each stretch ( Figure 5B-E ). In addition to copy number variation in the mTMR, a subset of reads from each of the four strains exhibited a -like sequences with atypical structures ( Supplementary Figure 3A ). These included: reads with reiterations of the pac-1 and pac-2 motifs accompanied by additional instances of the mTMR region, reads with extra copies of the entire a -like sequence, and reads with combinations of both. Reads with 3 copies of the entire a -like sequence were only found for HPRS-B14, while reads showing duplications of the pac-2 motif were only found for CVI988-HiFi ( Supplementary Figure 3B ). Download figure Open in new tab Figure 5: The mTMR region of the MDV a -like sequence shows extensive intrapopulation diversity in all four strains sequenced using PacBio HiFi A) The a -like sequence of MDV, which occurs in the IRL/IRS junction and the genomic termini, begins and ends with a 12-bp motif (GGCCGCGAGAGG) known as direct repeat 1 (DR1) (violet). The initial DR1 is followed by six copies of a 6-bp motif (TTAGGG), collectively known as the short telomeric repeats (sTMR) (yellow). Near the center of the a -like sequence lies another region harboring 6-bp telomeric repeats, which varies in length and is therefore known as the multiple telomeric repeats (mTMR) (dark yellow). Telomeric repeats in the mTMR are interrupted at irregular intervals by “islands” consisting of a 13-bp motif (TTCAGGCCTAGGG; blue). 
The remainder of the MDV a -like sequence consists of the pac -1 (dark green) and pac -2 (green) motifs, in addition to three small segments containing quasi-unique sequences (Ua, Ub and Uc) (grey). B-E) To resolve the mTMR length and repeat composition, we calculated the median number of copies of telomeric repeats at each repetitive stretch (bar plots, dark yellow) and the median number of interrupting islands (box plots, blue). Error bars represent the standard deviation for the copy number of telomeric repeats in each stretch (yellow bars) or the total number of islands (blue boxes). Letters above island box plots represent statistically significant differences across strains based on pairwise comparisons using Dunn’s test ( Supplementary Table 5 ). A graphical representation of the resulting consensus sequence is shown for each stretch and for the consensus-level mTMR of each strain as a whole, followed by the length of the consensus sequence. Download figure Open in new tab Figure 6: Manual repeat decompositions reveal three distinct repeating units in the proline-rich region of Meq A) The Meq oncoprotein is comprised of an N-terminal proline/glutamine (Pro/Gln) rich domain, a basic region (BR), a leucine zipper (ZIP) domain and a transactivation domain (TAD). The TAD contains a proline-rich region (PRR, aqua) associated with a major repetitive element that is typically described at the amino-acid level. B) The prior “isoform” model of variation in Meq can alternatively be represented as differences in the number of copies of three newly-identified repeating units. C ) The newly identified repeating motifs which make up the Meq-PRR include: eight alternative versions of a 27-AA repeat, two alternative versions of a 19-AA repeat, and one 14-AA repeat. All three repeating units bear sections of sequence commonality (gray vs. black letters, gray and blue shading). D) Consensus sequence of the Meq-PRR for the four strains sequenced using PacBio HiFi reads. 
Meq-PRRs are depicted with color-coding to indicate the identity and copy number of each repeating unit, with the corresponding isoform-based description indicated to the right (grey text). E) Relative frequencies of CVI988-HiFi variants identified through visual inspection and manual re-alignment of reads mapping to MDV076/Meq. F) Comparison of the Meq-PRR across 43 MDV strains, including the present 4 determined by PacBio HiFi (bold) and 39 previously published Meq-PRRs (see Supplementary Table 3-4 for accessions). The proline-rich region of the Meq oncoprotein involves three distinct amino-acid repeating units The meq oncogene (MDV005/MDV076) encodes the major MDV oncoprotein Meq, which has been shown to be indispensable for MDV-induced transformation of T lymphocytes ( 57 , 58 ). Variation in Meq has been historically described in terms of “isoforms”, with the standard “Meq” being 339-AA in length ( 59 ). “Long Meq” (L-Meq) has been described as a longer version of Meq containing a 59-60-AA insertion in the transactivation domain (TAD) ( 60 ). “Short Meq” (S-Meq) has been described as a shorter version of Meq exhibiting a 41-AA deletion in the TAD ( 61 ). Alternatively, “Very Long Meq” (VL-Meq) and “Very Short Meq” (VS-Meq) isoforms have only been described recently, with the first examples showing a 79-AA insertion and 74-AA deletion in the TAD, respectively ( 62 , 63 ). Similar to MDV049/UL36, the C-terminus of Meq contains a region that is known to harbor proline-rich repeats (Meq-PRR) ( Figure 6A ). Past descriptions of these repeats by Jones et al., Lee et al., and Chang et al. have suggested that they start at or around AA residue 149 ( 60 , 64 – 67 ). However, these early characterizations disagree on the length and the number of repeats associated with each Meq “isoform”. To account for these inconsistencies, we performed manual repeat decompositions of the Meq-PRR based on a multiple-sequence alignment of PacBio HiFi consensus genomes. 
Three distinct repeating units were identified, which were 27-AA, 14-AA, and 19-AA in length, respectively, and started at AA residue 169 ( Figure 6A-B ). Additionally, more than one version of the 27-AA repeating unit was identified ( Figure 6C ). PacBio HiFi reads of non-attenuated strains mapping to MDV076 (HPRS-B14 = 15 reads, Md5-HiFi = 2 reads, 675A = 36 reads; Supplementary Table 1 ) were not found to contain indels in the TAD ( Figure 6D ). Visual inspection of CVI988-HiFi reads mapping to this locus (n = 734) revealed a variant with a 120-AA insertion in the TAD ( Figure 6E ). This variant did not match any of the known isoforms and was provisionally labeled as “Very, Very Long Meq” (VVL-Meq) ( Figure 6B, 6E ). To test the ability of the newly identified repeating units to reliably account for structural variation in the Meq-PRR, we extended our analyses to the Meq sequences of the 39 MDV strains used in our earlier genetic distance comparisons, as well as to 3 additional strains representing the VL-Meq (Italy/Ck/507/15), VS-Meq (Ck/IR/99-35/2021), and S-Meq (Iraq3A) isoforms ( Supplementary Table 4 ). A multiple-sequence alignment of these Meq-PRRs confirmed that the three repeating units could reliably account for structural differences across all known Meq isoforms ( Figure 6B ). In this model, standard Meq contains two 27-AA repeats, interrupted by a 14-AA motif. Longer forms such as L-Meq, VL-Meq, and VVL-Meq extend this pattern via a new 19-AA repeat, plus additional copies of the 14-AA and 27-AA repeats. In contrast, shorter forms such as S-Meq and VS-Meq involve sequential loss of these repeats, with total repeat loss in VS-Meq also accompanied by deletions in the non-repetitive region immediately downstream ( Figure 6B ). 
A total of eight different versions of the 27-AA motif (27A, 27A-P3, 27B, 27C, 27D, 27E, 27F, 27G), a single version of the 14-AA motif (14A), and two different versions of the 19-AA motif (19A, 19B) were identified across the 44 Meq-PRRs available for analysis ( Figure 6C, 6F ). At the consensus level, the repetitive patterns of CVI988-HiFi and HPRS-B14 were found to match each other in length and composition ( Figure 6D ). These two strains were also found to not share any versions of the 27-AA repeating unit with Md5-HiFi and 675A, which both contained 27F repeats. Out of the four strains, only CVI988-HiFi was observed to exhibit intrapopulation diversity at this locus ( Figure 6E ). Detection of CVI988-HiFi variants with a 4.3-kb deletion in the Unique Short region A subset of 79 (20%) CVI988-HiFi reads mapping to the IRS/US junction were found to exhibit an atypical US region ( Supplementary Figure 4 ). The first 186 nucleotides of this atypical US region corresponded to the reverse complement of the last 186 nucleotides of a typical US region ( Supplementary Figure 4A ). The next 31 nucleotides were not found to map anywhere else in the MDV genome. These were immediately followed by a 4.3 kb deletion that removed SORF2, US1, US10, SORF3, and approximately half of US2. PacBio HiFi reads mapping to this locus showed a variety of start and end points, suggesting that these variants were part of otherwise intact genomes ( Supplementary Figure 4B ). Discussion Identifying genomic regions that contribute to observed phenotypic differences across Marek’s disease virus (MDV) strains remains of high importance to the field. However, the limitations of DNA sequencing technologies have made it difficult to account for genomic diversity in MDV loci harboring long stretches of tandem repeats ( 68 , 69 ). 
To address this gap in knowledge, we sequenced four MDV strains using a PacBio high-fidelity (HiFi) long read platform: CVI988-HiFi (attenuated), HPRS-B14 (virulent), Md5-HiFi (very virulent), and 675A (very virulent plus). In addition to showcasing the ability of PacBio HiFi reads to resolve complex tandem repeat patterns in the MDV genome, our results suggest that all of these loci are highly polymorphic and exhibit patterns of variation that could be contributing to phenotypic differences across strains. These findings challenge long-standing assumptions in the field regarding the inherent stability of herpesviruses, and suggest that long-read approaches may reveal similar levels of tandem repeat diversity in other members of the Herpesviridae virus family. The decision to perform multiple steps of our analysis manually as opposed to relying on widely-used bioinformatic tools stems from the lack of benchmarking studies performed in the context of large DNA viruses like MDV ( 70 – 72 ). Here, we demonstrate that herpesviruses can exhibit high levels of intrapopulation diversity in tandem repeats, a phenomenon that uniquely combines the two biggest challenges faced by modern bioinformatic algorithms ( 73 ). We also show that MDV tandem repeats can exhibit complex repetitive patterns involving multiple repeating units and/or alternative versions of a repeating unit. Since most widely-used bioinformatic tools have not been designed with these challenges in mind, our approach, although limited in scalability, offers a more reliable outcome and provides a ground truth for future benchmarking studies ( 74 ). Consistent with prior observations, we found that the heavily passaged and attenuated vaccine strain CVI988-HiFi exhibits extensive intrapopulation diversity in the 132-bp tandem repeats overlapping the MDV006.5/MDV075.2 transcripts. 
This diversity manifested in the wide range of 132-bp repeat copy numbers (2-37 copies) contained by PacBio HiFi reads, and in the fact that no single copy number was supported by more than 12% of the reads mapping to either locus. Likewise, we found that the only strain containing any copies of the alternative 132-bp motif with a C>T base change in nucleotide 67 (132B) was MDV strain 675A (vv+). We also confirmed that all of the viral genomes in the 675A sample contained at least one copy of the 132B repeat, and none of the other strains harbored any reads with 132B repeats. These observations provide additional support for a potential link between the presence of the 132B repeat and the vv+ pathotype ( 14 ). Notably, since both versions of the 132-bp repeat share the same amino-acid sequence, any phenotypic effects resulting from the C>T base change are likely stemming from impacts at the nucleic acid level (e.g., secondary structures and/or changes in translation efficiency). The proline-rich region of MDV049/UL36 (UL36-PRR) has been proposed as the single most divergent locus between CVI988/Rispens and virulent strains ( 13 ). PRRs are known to have special signaling and protein-protein interaction functions, and the presence of a PRR in the C-terminus of UL36 is conserved across all herpesviruses ( 75 – 77 ). Nevertheless, functional studies of MDV049/UL36 have primarily focused on the N-terminus, which contains the viral deubiquitinase catalytic domain ( 78 , 79 ). As such, little is known about the role of the MDV UL36-PRR in virulence and attenuation. In this study, we identified several repetitive motifs occurring within this locus, including six versions of a 6-AA tandem repeat and five versions of a 10-AA interrupting motif. While a subset of these elements has been identified in prior studies, these findings provide the most complete picture of MDV UL36-PRR diversity to date ( 14 , 80 ). 
Notably, we found that the patterns of variation at this locus segregate with the geographic origin of MDV strains. Furthermore, we found that CVI988/Rispens exhibits a characteristic pattern in the UL36-PRR that is unique to this strain, along with a set of 6 Chinese isolates that have been previously described as natural vaccine recombinants ( 81 – 83 ). In parallel, we found that most of the intrapopulation diversity associated with the UL36-PRR is limited to specific repeat segments, with the rest of the UL36-PRR appearing highly conserved within each sample. These findings suggest that the repeating motifs of the UL36-PRR could be used as reliable biomarkers, and even offer alternative qPCR targets to the commonly used “SNP #320” of pp38 for differential quantification of CVI988/Rispens and virulent strains in field samples ( 29 ). In addition to our findings for the UL36-PRR, we also investigated the repetitive patterns associated with the proline-rich region of Meq (Meq-PRR), the major MDV oncoprotein. Contrary to past descriptions of this locus, we have identified three distinct repeating units in the Meq-PRR. These repeating units are 27-AA, 14-AA, and 19-AA in length, respectively, and were found to mostly reiterate in an alternating pattern. Although accurate decomposition of complex tandem repeats remains a significant challenge, the three proposed repeating units can reliably account for all of the structural differences observed across known Meq “isoforms”. We also identified eight different versions of the 27-AA repeated motif and two versions of the 19-AA motif across the 46 Meq-PRRs analyzed as part of this study. Notably, our new model suggests that both of the proline-rich regions found in the MDV genome (i.e., UL36-PRR, Meq-PRR) exhibit complex repetitive patterns that are highly variable across MDV strains. 
Since past functional studies of Meq-PRR repeats have largely been based on the original model proposed by Jones et al., this new characterization has the potential to greatly advance our understanding of Meq-PRR variation and its impact on overall Meq function ( 65 , 84 , 59 , 80 ). In contrast to the complex repetitive patterns found in proline-rich regions, we found no evidence of alternative motifs or additional repeating units in PacBio HiFi reads of all four strains mapping to the 60-bp repeats that make up the MDV LAT promoter. However, the majority of CVI988-HiFi reads completely lacked the 60-bp repeats and also exhibited deletions of varying sizes starting at nucleotide +37 relative to the LAT TSS ( 48 ). Importantly, this coordinate is based on an alternative characterization of the 60-bp repeating unit proposed as part of this study. A series of experiments by Labaille et al. identified 29 molecular “subtypes” of 5’LAT deletions across six commercial batches of the CVI988/Rispens vaccine ( 46 ). In addition to confirming the presence of six of these subtypes in the CVI988-HiFi sample (A1, α, β, A, A2 and B1), we identified six novel subtypes with deletion sizes ranging from 82 to 203 bp. While these findings provide the most complete picture of 5’LAT diversity to date, the molecular mechanisms giving rise to 5’LAT-deleted subtypes are still largely unknown. Interestingly, a similar pattern of variation was observed in the Meq-PRR for MDV strain Ck/IR/99-35/2021 (VS-Meq). In this strain, the absence of proline-rich repeats was also accompanied by an 18-bp deletion in the non-repetitive region immediately downstream. Future studies should continue to explore the relationship between total repeat loss and deletions in adjacent non-repetitive regions, and determine whether Meq-PRR deletions are driven by the same molecular mechanisms giving rise to 5’LAT-deleted subtypes. 
Additional work is also needed to understand the impact of 5’LAT deletions on attenuation and tumor formation. Similar to the MDV006.5/MDV075.2 transcripts, the mTMR region of the MDV a -like sequence has been shown to significantly expand during serial passage in cell culture ( 85 ). In contrast, the nearby sTMR region has been described as always consisting of exactly six copies of a 6-bp telomeric repeat (TTAGGG) ( 53 ). In line with these observations, we found that PacBio HiFi reads of all four strains showed extensive sequence diversity in the mTMR region, while the sTMR region was highly conserved ( 50 , 85 ). While the presence of interrupting motifs (e.g., the 13-bp islands) is often associated with reduced repeat expansion and increased stability, our findings suggest that the MDV mTMR region is highly polymorphic and unstable ( 86 ). Further studies are needed to assess the functional impact of different mTMR repeat compositions, and to determine whether similar levels of intrapopulation diversity can be found in vivo . In addition to copy number variation, we also found structural variants of MDV with 1-2 additional copies of the entire a -like sequence, as well as variants with partial duplications that resulted in additional copies of the pac -1 and pac -2 packaging motifs and additional mTMR arrays ( Supplementary Figure 3 ). Interestingly, we noticed that the relative frequency of a -like sequence variants with additional copies of pac -1 was higher in strains with more virulent pathotypes (e.g., 675A), and that variants with additional copies of pac -2 were only present in the CVI988-HiFi sample. Nevertheless, the relatively low sequencing coverages for strains HPRS-B14, Md5-HiFi, and 675A limit our ability to confirm the absence of these rare variants. To the best of our knowledge, there is no precedent in the literature for a single a -like sequence containing multiple copies of pac -1, pac -2, or even additional mTMR arrays. 
Future experiments will be necessary to determine the impact of these variants on viral integration and virulence. In addition to our assessments of genomic diversity in repetitive loci, as part of this study we detected variants of CVI988-HiFi with a 4.3-kb deletion in the US region. The fact that these variants represent a significant portion (∼20%) of the CVI988-HiFi sample despite lacking several ORFs suggests that these could be defective viral genomes (DVGs) of MDV ( 87 ). Past reports of DVGs in MDV are limited; however, they are known to occur in herpesviruses during high-multiplicity serial passage, and their presence in other live-attenuated viral vaccines is well-documented ( 88 – 93 ). Notably, the Bartha vaccine strain of pseudorabies virus has been shown to contain a similarly-sized deletion (3.5-kb) in the US region, which contributes to its attenuation ( 94 , 95 ). In recent years, DVGs and defective interfering particles (DIPs) have become the focus of several studies seeking to assess their potential as broad-spectrum antivirals and vaccine adjuvants ( 96 , 97 ). The ability of high-fidelity long-read sequencing to detect potential DVGs in MDV samples offers a powerful new strategy to explore this phenomenon in MDV and other herpesviruses. While the above findings significantly advance our understanding of MDV tandem repeats, several other factors need to be addressed before these types of analyses can be widely implemented for comparative genomics or MDV field samples. First, the relatively low sequencing coverages obtained for HPRS-B14, Md5-HiFi, and 675A limited our ability to comprehensively assess the genomic diversity of these samples. Virulent strains of MDV are notoriously difficult to grow to high titers in vitro , and long-read sequencing platforms require higher quantities of input DNA than Illumina platforms ( 98 ). 
As target enrichment protocols are adapted for long-read sequencing platforms, it will become easier to overcome this obstacle. Second, the levels of intrapopulation diversity associated with MDV tandem repeats can lead to situations where no single genotype predominates. In such cases, we can use the median genotype in place of the consensus; however, the concept of a “consensus” genome requires careful consideration in these cases. Finally, public databases like GenBank do not currently allow for the inclusion of tandem repeat variants when depositing consensus genomes ( 99 ). While this is also true for single-nucleotide variants, tandem repeat variants exhibit much higher levels of intrapopulation diversity. Text files in tabular formats (e.g., TSV) containing information on copy number variation are already routinely submitted as part of data entries for repositories like the Genomic Data Commons (GDC) ( 100 ). Allowing researchers to append such files when submitting viral consensus genomes to GenBank would allow for more accurate representations of viral samples and facilitate large-scale computational analyses of structural variation. Supplementary information Document S1. Figures S1-S4 and Tables S1-S5. Acknowledgements The authors thank members of the Szpara, Kennedy, and Nair labs for helpful feedback and discussion. The authors would like to acknowledge the support of Craig Paul, Daniel Hannon and other personnel of the Huck Institutes’ Genomic Core Facility (RRID:SCR_023645), as well as their PacBio Sequel II platform and other instrumentation. This work was supported by NSF-NIH EEID award 1 R01 GM140459 (DK, MS, VN), and the Biotechnology and Biological Sciences Research Council (BBSRC) grants BBS/E/I/00007039, BBS/E/PI/23NB0003, BB/CCG2250/1 and BB/V017748/1 (VN, YY). 
Footnotes CRediT (Contributor Roles Taxonomy; https://credit.niso.org/contributor-roles-defined/ ) Conceptualization – AOV, MLS, DAK Data curation – AOV, DWR Formal analysis – AOV Funding acquisition – DAK, VN, MLS Investigation – AOV, CDB, SB, YZ Methodology – AOV, CDB, SB, YZ Project administration – MLS, DAK Resources – DWR, SB, YY, VN, MLS Software – DWR, AOV Supervision – MLS, DAK, YY, VN Validation – AOV, CDB, DWR Visualization – AOV, MLS, DAK Writing – AOV Writing – review & editing – AOV, MLS, DAK References 1. ↵ Osterrieder N , Kamil JP , Schumacher D , Tischer BK , Trapp S . Marek’s disease virus: from miasma to model . Nat Rev Microbiol . 2006 Apr ; 4 ( 4 ): 283 – 94 . OpenUrl CrossRef PubMed Web of Science 2. ↵ Marek J . Multiple Nervenentzündung (Polyneuritis) bei Hühnern . Dtsch Tierärztl Wochenschr . 1907 ; 15 : 417 – 21 . OpenUrl 3. Schat KA , Calnek BW , Fabricant J . Characterisation of two highly oncogenic strains of Marek’s disease virus . Avian Pathol J WVPA . 1982 ; 11 ( 4 ): 593 – 605 . OpenUrl CrossRef 4. ↵ Witter RL . Increased virulence of Marek’s disease virus field isolates . Avian Dis . 1997 Mar ; 41 ( 1 ): 149 – 63 . OpenUrl CrossRef PubMed Web of Science 5. ↵ Witter RL , Calnek BW , Buscaglia C , Gimeno IM , Schat KA . Classification of Marek’s disease viruses according to pathotype: philosophy and methodology . Avian Pathol J WVPA . 2005 Apr ; 34 ( 2 ): 75 – 90 . OpenUrl CrossRef 6. ↵ Rispens BH , van Vloten H , Mastenbroek N , Maas HJ , Schat KA . Control of Marek’s disease in the Netherlands. I. Isolation of an avirulent Marek’s disease virus (strain CVI 988) and its use in laboratory vaccination trials . Avian Dis . 1972 Apr ; 16 ( 1 ): 108 – 25 . OpenUrl CrossRef PubMed Web of Science 7. ↵ Reddy SM , Izumiya Y , Lupiani B . Marek’s disease vaccines: Current status, and strategies for improvement and development of vector vaccines . Vet Microbiol . 2017 Jul ; 206 : 113 – 20 . OpenUrl CrossRef PubMed 8. 
↵ Zhang Y ping , Li Z jie , Bao K yan , Lv H chao , Gao Y long , Gao H lei , et al. Pathogenic characteristics of Marek’s disease virus field strains prevalent in China and the effectiveness of existing vaccines against them . Vet Microbiol . 2015 May 15 ; 177 ( 1–2 ): 62 – 8 . OpenUrl CrossRef PubMed 9. Sun GR , Zhang YP , Lv HC , Zhou LY , Cui HY , Gao YL , et al. A Chinese Variant Marek’s Disease Virus Strain with Divergence between Virulence and Vaccine Resistance . Viruses . 2017 Apr 3 ; 9 ( 4 ): E71 . OpenUrl CrossRef 10. ↵ Liu JL , Teng M , Zheng LP , Zhu FX , Ma SX , Li LY , et al. Emerging Hypervirulent Marek’s Disease Virus Variants Significantly Overcome Protection Conferred by Commercial Vaccines . Viruses . 2023 Jun 25 ; 15 ( 7 ): 1434 . OpenUrl CrossRef PubMed 11. ↵ Gimeno IM . Marek’s disease vaccines: a solution for today but a worry for tomorrow? Vaccine . 2008 Jul 18 ; 26 Suppl 3 : C31 – 41 . OpenUrl CrossRef PubMed Web of Science 12. ↵ Baigent SJ , Smith LP , Nair VK , Currie RJW . Vaccinal control of Marek’s disease: current challenges, and future strategies to maximize protection . Vet Immunol Immunopathol . 2006 Jul 15 ; 112 ( 1–2 ): 78 – 86 . OpenUrl CrossRef PubMed Web of Science 13. ↵ Spatz SJ , Petherbridge L , Zhao Y , Nair V . Comparative full-length sequence analysis of oncogenic and vaccine (Rispens) strains of Marek’s disease virus . J Gen Virol . 2007 Apr ; 88 (Pt 4 ): 1080 – 96 . OpenUrl CrossRef PubMed 14. ↵ Spatz SJ , Silva RF . Sequence determination of variable regions within the genomes of gallid herpesvirus-2 pathotypes . Arch Virol . 2007 ; 152 ( 9 ): 1665 – 78 . OpenUrl CrossRef PubMed 15. ↵ Tarailo-Graovac M , Chen N . Using RepeatMasker to identify repetitive elements in genomic sequences . Curr Protoc Bioinforma . 2009 Mar ;Chapter 4:4.10.1-4.10.14. 16. Pop M , Salzberg SL . Bioinformatics challenges of new sequencing technology . Trends Genet TIG . 2008 Mar ; 24 ( 3 ): 142 – 9 . OpenUrl CrossRef PubMed 17. 
Tørresen OK , Star B , Mier P , Andrade-Navarro MA , Bateman A , Jarnot P , et al. Tandem repeats lead to sequence assembly errors and impose multi-level challenges for genome and protein databases . Nucleic Acids Res . 2019 Dec 2 ; 47 ( 21 ): 10994 – 1006 . OpenUrl CrossRef PubMed 18. ↵ Treangen TJ , Salzberg SL . Repetitive DNA and next-generation sequencing: computational challenges and solutions . Nat Rev Genet . 2011 Nov 29 ; 13 ( 1 ): 36 – 46 . OpenUrl CrossRef PubMed 19. ↵ De Bustos A , Cuadrado A , Jouve N . Sequencing of long stretches of repetitive DNA . Sci Rep . 2016 Nov 7 ; 6 : 36665 . 20. ↵ Tandem repeats in the long-read sequencing era . Nat Rev Genet . 2024 Jul ; 25 ( 7 ): 449 . OpenUrl CrossRef PubMed 21. ↵ Melters DP , Bradnam KR , Young HA , Telis N , May MR , Ruby JG , et al. Comparative analysis of tandem repeats from hundreds of species reveals unique insights into centromere evolution . Genome Biol . 2013 Jan 30 ; 14 ( 1 ): R10 . OpenUrl CrossRef PubMed 22. ↵ Rhoads A , Au KF . PacBio Sequencing and Its Applications . Genomics Proteomics Bioinformatics . 2015 Oct ; 13 ( 5 ): 278 – 89 . OpenUrl CrossRef PubMed 23. ↵ Wang Y , Zhao Y , Bollas A , Wang Y , Au KF . Nanopore sequencing technology, bioinformatics and applications . Nat Biotechnol . 2021 Nov ; 39 ( 11 ): 1348 – 65 . OpenUrl CrossRef PubMed 24. ↵ Zhang H , Jain C , Aluru S . A comprehensive evaluation of long read error correction methods . BMC Genomics . 2020 Dec 21 ; 21 ( Suppl 6 ): 889 . OpenUrl CrossRef PubMed 25. ↵ Wenger AM , Peluso P , Rowell WJ , Chang PC , Hall RJ , Concepcion GT , et al. Accurate circular consensus long-read sequencing improves variant detection and assembly of a human genome . Nat Biotechnol . 2019 Oct ; 37 ( 10 ): 1155 – 62 . OpenUrl CrossRef PubMed 26. ↵ Bell AS , Kennedy DA , Jones MJ , Cairns CL , Pandey U , Dunn PA , et al. Molecular epidemiology of Marek’s disease virus in central Pennsylvania, USA . Virus Evol . 2019 Jan ; 5 ( 1 ): vey042 . 
OpenUrl CrossRef PubMed 27. Pandey U , Bell AS , Renner DW , Kennedy DA , Shreve JT , Cairns CL , et al. DNA from dust: Comparative genomics of large DNA viruses in field surveillance samples . Smith GA , editor. mSphere . 2016 Oct 26 ; 1 ( 5 ): e00132 – 16 . OpenUrl PubMed 28. ↵ Bowen CD , Renner DW , Shreve JT , Tafuri Y , Payne KM , Dix RD , et al. Viral forensic genomics reveals the relatedness of classic herpes simplex virus strains KOS, KOS63, and KOS79 . Virology . 2016 Mar 4 ; 492 : 179 – 86 . OpenUrl CrossRef PubMed 29. ↵ Baigent SJ , Nair VK , Le Galludec H . Real-time PCR for differential quantification of CVI988 vaccine virus and virulent strains of Marek’s disease virus . J Virol Methods . 2016 Jul ; 233 : 23 – 36 . OpenUrl CrossRef PubMed 30. ↵ Trimpert J , Groenke N , Jenckel M , He S , Kunec D , Szpara ML , et al. A phylogenomic analysis of Marek’s disease virus reveals independent paths to virulence in Eurasia and North America . Evol Appl . 2017 Dec ; 10 ( 10 ): 1091 – 101 . OpenUrl CrossRef PubMed 31. ↵ Wood DE , Lu J , Langmead B . Improved metagenomic analysis with Kraken 2 . Genome Biol . 2019 Nov 28 ; 20 ( 1 ): 257 . OpenUrl CrossRef PubMed 32. ↵ Parsons LR , Tafuri YR , Shreve JT , Bowen CD , Shipley MM , Enquist LW , et al. Rapid genome assembly and comparison decode intrastrain variation in human alphaherpesviruses . mBio . 2015 Mar 31 ; 6 ( 2 ): e02213 – 14 . OpenUrl PubMed 33. ↵ Nurk S , Meleshko D , Korobeynikov A , Pevzner PA . metaSPAdes: a new versatile metagenomic assembler . Genome Res . 2017 May ; 27 ( 5 ): 824 – 34 . OpenUrl Abstract / FREE Full Text 34. ↵ Li H . Minimap2: pairwise alignment for nucleotide sequences . Bioinforma Oxf Engl . 2018 Sep 15 ; 34 ( 18 ): 3094 – 100 . OpenUrl 35. ↵ Robinson JT , Thorvaldsdóttir H , Winckler W , Guttman M , Lander ES , Getz G , et al. Integrative genomics viewer . Nat Biotechnol . 2011 Jan ; 29 ( 1 ): 24 – 6 . OpenUrl CrossRef PubMed Web of Science 36. 
↵ Kearse M , Moir R , Wilson A , Stones-Havas S , Cheung M , Sturrock S , et al. Geneious Basic: An integrated and extendable desktop software platform for the organization and analysis of sequence data . Bioinformatics . 2012 Jun 15 ; 28 ( 12 ): 1647 – 9 . OpenUrl CrossRef PubMed Web of Science 37. ↵ Katoh K , Misawa K , Kuma K ichi , Miyata T. MAFFT: a novel method for rapid multiple sequence alignment based on fast Fourier transform . Nucleic Acids Res . 2002 Jul 15 ; 30 ( 14 ): 3059 – 66 . OpenUrl CrossRef PubMed Web of Science 38. ↵ Niikura M , Dodgson JB , Cheng HH . Stability of Marek’s disease virus 132-bp repeats during serial in vitro passages . Arch Virol . 2006 Jul ; 151 ( 7 ): 1431 – 8 . OpenUrl CrossRef PubMed 39. ↵ Volkening JD , Spatz SJ , Ponnuraj N , Akbar H , Arrington JV , Vega-Rodriguez W , et al. Viral proteogenomic and expression profiling during productive replication of a skin-tropic herpesvirus in the natural host . PLoS Pathog . 2023 Jun ; 19 ( 6 ): e1011204 . OpenUrl CrossRef PubMed 40. ↵ Luo H , Nijveen H . Understanding and identifying amino acid repeats . Brief Bioinform . 2014 Jul ; 15 ( 4 ): 582 – 91 . OpenUrl CrossRef PubMed 41. ↵ Cantello JL , Parcells MS , Anderson AS , Morgan RW . Marek’s disease virus latency-associated transcripts belong to a family of spliced RNAs that are antisense to the ICP4 homolog gene . J Virol . 1997 Feb 1 ; 71 ( 2 ): 1353 – 61 . OpenUrl Abstract / FREE Full Text 42. ↵ Burnside J , Bernberg E , Anderson A , Lu C , Meyers BC , Green PJ , et al. Marek’s disease virus encodes MicroRNAs that map to meq and the latency-associated transcript . J Virol . 2006 Sep ; 80 ( 17 ): 8778 – 86 . OpenUrl Abstract / FREE Full Text 43. ↵ Stik G , Laurent S , Coupeau D , Coutaud B , Dambrine G , Rasschaert D , et al. A p53-dependent promoter associated with polymorphic tandem repeats controls the expression of a viral transcript encoding clustered microRNAs . RNA N Y N . 2010 Nov ; 16 ( 11 ): 2263 – 76 . 
OpenUrl CrossRef 44. ↵ Chasseur AS , Trozzi G , Istasse C , Petit A , Rasschaert P , Denesvre C , et al. Marek’s Disease Virus Virulence Genes Encode Circular RNAs . J Virol . 2022 May 11 ; 96 ( 9 ): e0032122 . OpenUrl CrossRef PubMed 45. ↵ Dambrine G , Labaille J , Boissel É , Dupuy C , Rasschaert D . Marek’s disease herpesvirus: a model of vaccine-dependent adaptation . Virol Montrouge Fr . 2016 Oct 1 ; 20 ( 5 ): 273 – 86 . OpenUrl 46. ↵ Labaille J , Lion A , Boissel E , Trapp S , Nair V , Rasschaert D , et al. Vaccine and oncogenic strains of gallid herpesvirus 2 contain specific subtype variations in the 5’ region of the latency-associated transcript that evolve in vitro and in vivo . Arch Virol . 2015 Jan ; 160 ( 1 ): 161 – 71 . OpenUrl CrossRef PubMed 47. ↵ Bai Y , Liao Y , Yang S , Jin J , Lu W , Teng M , et al. Deletion of miR-M8 and miR-M13 eliminates the bursa atrophy induced by Marek’s disease virus infection . Vet Microbiol . 2022 May ; 268 : 109409 . OpenUrl CrossRef PubMed 48. ↵ Strassheim S , Stik G , Rasschaert D , Laurent S . mdv1-miR-M7-5p, located in the newly identified first intron of the latency-associated transcript of Marek’s disease virus, targets the immediate-early genes ICP4 and ICP27 . J Gen Virol . 2012 Aug ; 93 (Pt 8 ): 1731 – 42 . OpenUrl CrossRef PubMed 49. ↵ Volkening JD , Spatz SJ . Identification and characterization of the genomic termini and cleavage/packaging signals of gallid herpesvirus type 2 . Avian Dis . 2013 Jun ; 57 ( 2 Suppl): 401 – 8 . OpenUrl CrossRef PubMed 50. ↵ Kishi M , Bradley G , Jessip J , Tanaka A , Nonoyama M . Inverted repeat regions of Marek’s disease virus DNA possess a structure similar to that of the a sequence of herpes simplex virus DNA and contain host cell telomere sequences . J Virol . 1991 Jun ; 65 ( 6 ): 2791 – 7 . OpenUrl Abstract / FREE Full Text 51. ↵ Osterrieder N , Wallaschek N , Kaufer BB . Herpesvirus Genome Integration into Telomeric Repeats of Host Cell Chromosomes . Annu Rev Virol . 
2014 Nov ; 1 ( 1 ): 215 – 35 . OpenUrl CrossRef PubMed 52. ↵ Wood ML , Neumann R , Roy P , Nair V , Royle NJ . Characterization of integrated Marek’s disease virus genomes supports a model of integration by homology-directed recombination and telomere-loop-driven excision . J Virol . 2023 Oct 31 ; 97 ( 10 ): e0071623 . OpenUrl CrossRef PubMed 53. ↵ Greco A , Fester N , Engel AT , Kaufer BB . Role of the short telomeric repeat region in Marek’s disease virus replication, genomic integration, and lymphomagenesis . J Virol . 2014 Dec ; 88 ( 24 ): 14138 – 47 . OpenUrl Abstract / FREE Full Text 54. ↵ Kaufer BB , Jarosinski KW , Osterrieder N . Herpesvirus telomeric repeats facilitate genomic integration into host telomeres and mobilization of viral DNA during reactivation . J Exp Med . 2011 Mar 14 ; 208 ( 3 ): 605 – 15 . OpenUrl Abstract / FREE Full Text 55. ↵ Spatz SJ , Silva RF . Polymorphisms in the repeat long regions of oncogenic and attenuated pathotypes of Marek’s disease virus 1 . Virus Genes . 2007 May 12 ; 35 ( 1 ): 41 – 53 . OpenUrl CrossRef PubMed 56. ↵ McPherson MC , Delany ME . Virus and host genomic, molecular, and cellular interactions during Marek’s disease pathogenesis and oncogenesis . Poult Sci . 2016 Feb ; 95 ( 2 ): 412 – 29 . OpenUrl CrossRef PubMed 57. ↵ Levy AM , Gilad O , Xia L , Izumiya Y , Choi J , Tsalenko A , et al. Marek’s disease virus Meq transforms chicken cells via the v-Jun transcriptional cascade: a converging transforming pathway for avian oncoviruses . Proc Natl Acad Sci U S A . 2005 Oct 11 ; 102 ( 41 ): 14831 – 6 . OpenUrl Abstract / FREE Full Text 58. ↵ Lupiani B , Lee LF , Cui X , Gimeno I , Anderson A , Morgan RW , et al. Marek’s disease virus-encoded Meq gene is involved in transformation of lymphocytes but is dispensable for replication . Proc Natl Acad Sci U S A . 2004 Aug 10 ; 101 ( 32 ): 11815 – 20 . OpenUrl Abstract / FREE Full Text 59. ↵ Conradie AM , Bertzbach LD , Trimpert J , Patria JN , Murata S , Parcells MS , et al. 
Distinct polymorphisms in a single herpesvirus gene are capable of enhancing virulence and mediating vaccinal resistance . Szpara ML , editor. PLOS Pathog . 2020 Dec 11 ; 16 ( 12 ): e1009104 . OpenUrl CrossRef PubMed 60. ↵ Lee SI , Takagi M , Ohashi K , Sugimoto C , Onuma M . Difference in the meq gene between oncogenic and attenuated strains of Marek’s disease virus serotype 1 . J Vet Med Sci . 2000 Mar ; 62 ( 3 ): 287 – 92 . OpenUrl CrossRef PubMed Web of Science 61. ↵ Murata S , Yamamoto E , Sakashita N , Maekawa N , Okagawa T , Konnai S , et al. Research Note: Characterization of S-Meq containing the deletion in Meq protein’s transactivation domain in a Marek’s disease virus strain in Japan . Poult Sci . 2021 Nov ; 100 ( 11 ): 101461 . OpenUrl CrossRef PubMed 62. ↵ Mescolini G , Lupini C , Felice V , Guerrini A , Silveira F , Cecchinato M , et al. Molecular characterization of the meq gene of Marek’s disease viruses detected in unvaccinated backyard chickens reveals the circulation of low- and high-virulence strains . Poult Sci . 2019 Aug 1 ; 98 ( 8 ): 3130 – 7 . OpenUrl CrossRef PubMed 63. ↵ Molouki A , Ghalyanchilangeroudi A , Abdoshah M , Shoushtari A , Abtin A , Eshtartabadi F , et al. Report of a new meq gene size: The first study on genetic characterisation of Marek’s disease viruses circulating in Iranian commercial layer and backyard chicken . Br Poult Sci . 2022 Apr ; 63 ( 2 ): 142 – 9 . OpenUrl CrossRef PubMed 64. ↵ Jones D , Lee L , Liu JL , Kung HJ , Tillotson JK . Marek disease virus encodes a basic-leucine zipper gene resembling the fos/jun oncogenes that is highly expressed in lymphoblastoid tumors . Proc Natl Acad Sci U S A . 1992 May 1 ; 89 ( 9 ): 4042 – 6 . OpenUrl Abstract / FREE Full Text 65. ↵ Qian Z , Brunovskis P , Rauscher F , Lee L , Kung HJ . Transactivation activity of Meq, a Marek’s disease herpesvirus bZIP protein persistently expressed in latently infected transformed T cells . J Virol . 1995 Jul ; 69 ( 7 ): 4037 – 44 . 
OpenUrl Abstract / FREE Full Text 66. Chang KS , Lee SI , Ohashi K , Ibrahim A , Onuma M . The detection of the meq gene in chicken infected with Marek’s disease virus serotype 1 . J Vet Med Sci . 2002 May ; 64 ( 5 ): 413 – 7 . OpenUrl CrossRef PubMed Web of Science 67. ↵ Chang KS , Ohashi K , Onuma M . Diversity (polymorphism) of the meq gene in the attenuated Marek’s disease virus (MDV) serotype 1 and MDV-transformed cell lines . J Vet Med Sci . 2002 Dec ; 64 ( 12 ): 1097 – 101 . OpenUrl CrossRef PubMed Web of Science 68. ↵ Gemmell NJ . Repetitive DNA: genomic dark matter matters . Nat Rev Genet . 2021 Jun ; 22 ( 6 ): 342 . OpenUrl 69. ↵ Ortigas-Vasquez A , Pandey U , Renner DW , Bowen CD , Baigent SJ , Dunn J , et al. Comparative analysis of multiple consensus genomes of the same strain of Marek’s disease virus reveals intrastrain variation . Virus Evol . 2024 ; 10 ( 1 ): veae047 . OpenUrl CrossRef PubMed 70. ↵ Howe K , Chow W , Collins J , Pelan S , Pointon DL , Sims Y , et al. Significantly improving the quality of genome assemblies through curation . GigaScience . 2021 Jan 9 ; 10 ( 1 ): giaa153 . OpenUrl CrossRef PubMed 71. Goubert C , Craig RJ , Bilat AF , Peona V , Vogan AA , Protasio AV . A beginner’s guide to manual curation of transposable elements . Mob DNA . 2022 Mar 30 ; 13 ( 1 ): 7 . OpenUrl CrossRef PubMed 72. ↵ Gavrielatos M , Kyriakidis K , Spandidos DA , Michalopoulos I . Benchmarking of next and third generation sequencing technologies and their associated algorithms for de novo genome assembly . Mol Med Rep . 2021 Apr ; 23 ( 4 ): 251 . OpenUrl CrossRef PubMed 73. ↵ Rhie A , McCarthy SA , Fedrigo O , Damas J , Formenti G , Koren S , et al. Towards complete and error-free genome assemblies of all vertebrate species . Nature . 2021 Apr ; 592 ( 7856 ): 737 – 46 . OpenUrl CrossRef PubMed 74. ↵ Garrity GM . Ground truth . Stand Genomic Sci . 2009 Sep 24 ; 1 ( 2 ): 91 – 2 . OpenUrl CrossRef PubMed 75. ↵ Williamson MP . 
The structure and function of proline-rich regions in proteins . Biochem J . 1994 Jan 15 ; 297 (Pt 2)(Pt 2): 249 – 60 . OpenUrl FREE Full Text 76. Loncoman CA , Vaz PK , Coppo MJ , Hartley CA , Morera FJ , Browning GF , et al. Natural recombination in alphaherpesviruses: Insights into viral evolution through full genome sequencing and sequence analysis . Infect Genet Evol J Mol Epidemiol Evol Genet Infect Dis . 2017 Apr ; 49 : 174 – 85 . OpenUrl 77. ↵ Renner DW , Szpara ML . Impacts of Genome-Wide Analyses on Our Understanding of Human Herpesvirus Diversity and Evolution . J Virol . 2018 Jan 1 ; 92 ( 1 ): e00908 – 17 . OpenUrl CrossRef PubMed 78. ↵ Veiga IB , Jarosinski KW , Kaufer BB , Osterrieder N . Marek’s disease virus (MDV) ubiquitin-specific protease (USP) performs critical functions beyond its enzymatic activity during virus replication . Virology . 2013 Mar 15 ; 437 ( 2 ): 110 – 7 . OpenUrl CrossRef PubMed 79. ↵ Lin J , Ai Y , Zhou H , Lv Y , Wang M , Xu J , et al. UL36 Encoded by Marek’s Disease Virus Exhibits Linkage-Specific Deubiquitinase Activity . Int J Mol Sci . 2020 Mar 5 ; 21 ( 5 ): 1783 . OpenUrl CrossRef PubMed 80. ↵ Murata S , Machida Y , Isezaki M , Maekawa N , Okagawa T , Konnai S , et al. Genetic characterization of a Marek’s disease virus strain isolated in Japan . Virol J . 2020 Nov 23 ; 17 ( 1 ): 186 . OpenUrl CrossRef PubMed 81. ↵ Zhang Y , Lan X , Wang Y , Lin Y , Yu Z , Guo R , et al. Emerging natural recombinant Marek’s disease virus between vaccine and virulence strains and their pathogenicity . Transbound Emerg Dis . 2022 Sep ; 69 ( 5 ): e1702 – 9 . OpenUrl PubMed 82. He L , Li J , Peng P , Nie J , Luo J , Cao Y , et al. Genomic analysis of a Chinese MDV strain derived from vaccine strain CVI988 through recombination . Infect Genet Evol J Mol Epidemiol Evol Genet Infect Dis . 2020 Mar ; 78 : 104045 . 83. ↵ He L , Li J , Zhang Y , Luo J , Cao Y , Xue C . 
Phylogenetic and molecular epidemiological studies reveal evidence of recombination among Marek’s disease viruses . Virology . 2018 Mar ; 516 : 202 – 9 . OpenUrl CrossRef PubMed 84. ↵ Conradie AM , Bertzbach LD , Bhandari N , Parcells M , Kaufer BB . A Common Live-Attenuated Avian Herpesvirus Vaccine Expresses a Very Potent Oncogene . mSphere . 2019 Oct 9 ; 4 ( 5 ): e00658 – 19 . OpenUrl PubMed 85. ↵ Hildebrandt E , Dunn JR , Perumbakkam S , Niikura M , Cheng HH . Characterizing the molecular basis of attenuation of Marek’s disease virus via in vitro serial passage identifies de novo mutations in the helicase-primase subunit gene UL5 and other candidates associated with reduced virulence . J Virol . 2014 Mar 19 ; 88 ( 11 ): 6232 – 42 . OpenUrl Abstract / FREE Full Text 86. ↵ Völker J , Breslauer KJ . How sequence alterations enhance the stability and delay expansion of DNA triplet repeat domains . QRB Discov . 2023 ; 4 : e8 . OpenUrl CrossRef 87. ↵ Brennan JW , Sun Y . Defective viral genomes: advances in understanding their generation, function, and impact on infection outcomes . mBio . 2024 May 8 ; 15 ( 5 ): e0069224 . OpenUrl CrossRef PubMed 88. ↵ McLaren LC , Holland JJ . Defective interfering particles from poliovirus vaccine and vaccine reference strains . Virology . 1974 Aug ; 60 ( 2 ): 579 – 83 . OpenUrl CrossRef PubMed 89. Hennes-Stegmann B , Schröder CH . Low infectivity of HSV-1 DNA caused by defective-interfering genomes . J Gen Virol . 1982 Dec ; 63 ( 2 ): 307 – 14 . OpenUrl CrossRef PubMed 90. Wu CA , Harper L , Ben-Porat T . Molecular basis for interference of defective interfering particles of pseudorabies virus with replication of standard virus . J Virol . 1986 Aug ; 59 ( 2 ): 308 – 17 . OpenUrl Abstract / FREE Full Text 91. Calain P , Roux L . Generation of measles virus defective interfering particles and their presence in a preparation of attenuated live-virus vaccine . J Virol . 1988 Aug ; 62 ( 8 ): 2859 – 66 . 
OpenUrl Abstract / FREE Full Text 92. Bellocq C , Mottet G , Roux L . Wide occurrence of measles virus subgenomic RNAs in attenuated live-virus vaccines . Biol J Int Assoc Biol Stand . 1990 Oct ; 18 ( 4 ): 337 – 43 . OpenUrl 93. ↵ Gould PS , Easton AJ , Dimmock NJ . Live Attenuated Influenza Vaccine contains Substantial and Unexpected Amounts of Defective Viral Genomic RNA . Viruses . 2017 Sep 21 ; 9 ( 10 ): 269 . OpenUrl CrossRef PubMed 94. ↵ Lomniczi B , Blankenship ML , Ben-Porat T . Deletions in the genomes of pseudorabies virus vaccine strains and existence of four isomers of the genomes . J Virol . 1984 Mar ; 49 ( 3 ): 970 – 9 . OpenUrl Abstract / FREE Full Text 95. ↵ Szpara ML , Tafuri YR , Parsons L , Shamim SR , Verstrepen KJ , Legendre M , et al. A wide extent of inter-strain diversity in virulent and vaccine strains of alphaherpesviruses . PLoS Pathog . 2011 Oct ; 7 ( 10 ): 1 – 23 . OpenUrl CrossRef 96. ↵ Vignuzzi M , López CB . Defective viral genomes are key drivers of the virus-host interaction . Nat Microbiol . 2019 Jul ; 4 ( 7 ): 1075 – 87 . OpenUrl CrossRef PubMed 97. ↵ Manzoni TB , López CB . Defective (interfering) viral genomes re-explored: impact on antiviral immunity and virus persistence . Future Virol . 2018 Jul ; 13 ( 7 ): 493 – 503 . OpenUrl CrossRef PubMed 98. ↵ Wen L , Zhang A , Li Y , Lai H , Li H , Luo Q , et al. Suspension culture of Marek’s disease virus and evaluation of its immunological effects . Avian Pathol J WVPA . 2019 Jun ; 48 ( 3 ): 183 – 90 . OpenUrl CrossRef 99. ↵ Sayers EW , Cavanaugh M , Clark K , Ostell J , Pruitt KD , Karsch-Mizrachi I . GenBank . Nucleic Acids Res . 2019 Jan 8 ; 47 ( D1 ): D94 – 9 . OpenUrl CrossRef PubMed 100. ↵ Jensen MA , Ferretti V , Grossman RL , Staudt LM . The NCI Genomic Data Commons as an engine for precision medicine . Blood . 2017 Jul 27 ; 130 ( 4 ): 453 – 9 . OpenUrl Abstract / FREE Full Text View the discussion thread. Back to top Previous Next Posted February 11, 2025. 
Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following High-Fidelity Long-Read Sequencing of an Avian Herpesvirus Reveals Extensive Intrapopulation Diversity in Tandem Repeat Regions Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share High-Fidelity Long-Read Sequencing of an Avian Herpesvirus Reveals Extensive Intrapopulation Diversity in Tandem Repeat Regions Alejandro Ortigas-Vasquez , Christopher D. Bowen , Daniel W. Renner , Susan J. Baigent , Yaoyao Zhang , Yongxiu Yao , Venugopal Nair , David A. Kennedy , Moriah L. Szpara bioRxiv 2025.02.10.637388; doi: https://doi.org/10.1101/2025.02.10.637388 Share This Article: Copy Citation Tools High-Fidelity Long-Read Sequencing of an Avian Herpesvirus Reveals Extensive Intrapopulation Diversity in Tandem Repeat Regions Alejandro Ortigas-Vasquez , Christopher D. Bowen , Daniel W. Renner , Susan J. Baigent , Yaoyao Zhang , Yongxiu Yao , Venugopal Nair , David A. Kennedy , Moriah L. 
Szpara bioRxiv 2025.02.10.637388; doi: https://doi.org/10.1101/2025.02.10.637388 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Microbiology Subject Areas All Articles Animal Behavior and Cognition (7626) Biochemistry (17654) Bioengineering (13878) Bioinformatics (41895) Biophysics (21430) Cancer Biology (18569) Cell Biology (25471) Clinical Trials (138) Developmental Biology (13366) Ecology (19876) Epidemiology (2067) Evolutionary Biology (24294) Genetics (15593) Genomics (22480) Immunology (17719) Microbiology (40331) Molecular Biology (17155) Neuroscience (88500) Paleontology (666) Pathology (2829) Pharmacology and Toxicology (4818) Physiology (7635) Plant Biology (15116) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9817) Zoology (2269)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.