Genome graphs reveal the importance of structural variation in Mycobacterium tuberculosis evolution and drug resistance

doi:10.1101/2025.05.07.652570

Genome graphs reveal the importance of structural variation in Mycobacterium tuberculosis evolution and drug resistance

2025 · doi:10.1101/2025.05.07.652570

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 92,289 characters · extracted from preprint-html · click to expand

Genome graphs reveal the importance of structural variation in Mycobacterium tuberculosis evolution and drug resistance | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Genome graphs reveal the importance of structural variation in Mycobacterium tuberculosis evolution and drug resistance View ORCID Profile Aleix Canalda-Baltrons , Matthew Silcocks , Michael B. Hall , Derrick Theys , Xuling Chang , Linda T. Viberg , Norelle L. Sherry , Lachlan Coin , Sarah J. Dunstan doi: https://doi.org/10.1101/2025.05.07.652570 Aleix Canalda-Baltrons 1 Department of Infectious Diseases, University of Melbourne, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Aleix Canalda-Baltrons Matthew Silcocks 1 Department of Infectious Diseases, University of Melbourne, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . Find this author on Google Scholar Find this author on PubMed Search for this author on this site Michael B. Hall 2 Department of Microbiology and Immunology, University of Melbourne, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . Find this author on Google Scholar Find this author on PubMed Search for this author on this site Derrick Theys 1 Department of Infectious Diseases, University of Melbourne, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . Find this author on Google Scholar Find this author on PubMed Search for this author on this site Xuling Chang 1 Department of Infectious Diseases, University of Melbourne, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . 3 Department of Paediatrics, Yong Loo Lin School of Medicine, National University of Singapore , 119228, Singapore , Singapore . 4 Khoo Teck Puat – National University Children’s Medical Institute, National University of Singapore , 119228, Singapore , Singapore . Find this author on Google Scholar Find this author on PubMed Search for this author on this site Linda T. Viberg 5 Victorian Infectious Diseases Reference Laboratory , Melbourne Health, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . Find this author on Google Scholar Find this author on PubMed Search for this author on this site Norelle L. Sherry 6 Microbiological Diagnostic Unit Public Health Laboratory (MDU PHL), University of Melbourne, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . 7 Department of Infectious Diseases and Immunology , Austin Health, 3186, Victoria, Australia . Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lachlan Coin 2 Department of Microbiology and Immunology, University of Melbourne, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sarah J. Dunstan 1 Department of Infectious Diseases, University of Melbourne, at the Peter Doherty Institute for Infection and Immunity , 3000, Victoria, Australia . Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: sarah.dunstan{at}unimelb.edu.au Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Structural variants (SVs) are increasingly recognized as key drivers of bacterial evolution, yet their role has not been explored thoroughly. This is due to limitations in traditional short-read sequencing and linear reference-based analyses, which can miss complex structural changes. Tuberculosis (TB), a disease caused by Mycobacterium tuberculosis ( Mtb ), remains a major global health concern. In this study, we harness long-read sequencing technologies and genome graph tools to construct a Mtb pangenome reference graph (PRG) from 859 high-quality, diverse, long-read assemblies. To enable accurate genotyping of SVs leveraging the PRG, we developed miniwalk , a tool that outperforms a traditional linear genome-based approach in precision for SV detection. We characterize patterns of structural variation genome-wide, revealing a virulence-associated ESX-5 deletion to be recurrent across the phylogeny, and fixed in a sub-lineage of L4. Systematic screens for additional genes that are recurrently affected by SVs implicated those related to metal homeostasis, including a copper exporter fixed in the widely distributed L1.2.1 sub-lineage. Lastly, we genotyped 41,134 isolates and found SVs putatively associated with resistance to various first and second-line drugs. These findings underscore the broader role of SVs in shaping Mtb diversity, highlighting their importance in both understanding evolution and designing strategies to combat drug-resistant TB. 1 Introduction Mycobacterium tuberculosis ( Mtb ) was the cause of 10.8 million new cases of tubercuosis (TB) and 1.25 million deaths in 2023 alone[ 1 ]. Despite the availability of effective antibiotics, drug-resistant TB is widespread with 400,000 new drug-resistant cases recorded in 2023, leading to poorer patient outcomes[ 1 ]. Next Generation Sequencing (NGS) technologies have proven useful in predicting drug resistance from genomic sequences of Mtb and are being implemented in public health laboratories in some jurisdictions[ 2 , 3 ]. Genetic variability within the Mtb genome can reveal aspects of the pathogen’s biology, and single nucleotide polymorphisms (SNPs) have been investigated for this purpose. SNPs in Mtb have been well characterized, are used to define lineages[ 4 ] and constitute the majority of mutations listed by the World Health Organization (WHO) responsible for drug resistance (DR), alongside small insertions and deletions (indels; genomic variants smaller than 50bp)[ 5 ]. Structural variants (SVs), that is genomic variants larger than 49bp, have been less well studied. SVs have played an important role in various aspects of TB pathogenesis. The TbD1 deletion has been reported to be involved in the evolution and differentiation of the modern from the ancestral Mtb lineages, with modern lineages gaining enhanced resistance to hypoxic conditions[ 6 ]. Various deletions have also been associated with resistance to anti-TB drugs, for example the deletion of katG to isoniazid resistance or the deletion of ethA to ethionamide resistance [ 7 ]. Despite their potential significance, SVs in Mtb have been less comprehensively studied than SNPs and are often dismissed in studies[ 8 , 9 ]. This knowledge gap is partly due to the limitations of short-read sequencing technologies (Illumina), which can miss or mischaracterize SVs—particularly in repetitive or GC-rich regions—as well as technical and bioinformatics limitations in variant detection and interpretation[ 10 ]. Long-read sequencing technologies (Oxford Nanopore Technologies (ONT) or Pacific Biosciences (PB)) have provided an opportunity for accurate identification of SVs, enabling the exploration of their role in DR and Mtb pathogenesis[ 11 – 14 ]. However, Illumina remains the most used technology, with a total of °57,000 new NCBI entries n 2024 against °2,250 for ONT and PB. Pangenome reference graphs (PRGs) are data structures that represent both shared and variable genomic sequences across multiple individuals or isolates[ 15 ]. Unlike traditional linear reference genomes, which provide a single consensus sequence, PRGs can capture a broader spectrum of genomic diversity. This allows for improved detection of structural variants (SVs), especially when analyzing short-read sequencing data[ 16 ]. Although genome graph tools have been scarcely exploited in Mtb genomics[ 14 , 17 ], they hold the potential to unveil a wider scope of SVs across thousands of isolates that only have Illumina data available. In this study, we leverage long-read sequencing technologies and genome graph tools to create a Mtb -PRG. We also develop miniwalk, a tool that genotypes SVs from an assembly mapped to a PRG and show it is significantly more precise than genotyping using a linear genome. We also explore the evolution of SVs across the phylogeny, identifying recurrent mutations implicating the ESX loci, and a key copper exporter now deleted in a sub-lineage of L1. Lastly, we genotype SVs in 41,134 isolates to search for associations with DR across 22 drugs. Our findings underscore the diverse roles of SVs in shaping Mtb evolution and contributing to DR-TB. 2 Results 2.1 Construction of a M. tuberculosis pangenome reference graph We obtained 1,363 long-read datasets (ONT or PB) from public repositories and performed quality control for read length and decontamination of human DNA before assembling genomes with Flye[ 18 ]. After also incorporating 176 long-read assemblies from NCBI to enlarge the dataset, all assemblies were evaluated with QUAST[ 19 ] with 834 remaining after filtering (see Methods; Fig.S1). To ensure phylogenetic coverage, we sequenced 25 isolates from underrepresented lineages with ONT, achieving representation of lineages 1–9, including phenotypically drug-resistant isolates with-out known resistance mutations (Table S1). Sub-lineage 2.2.1 (Beijing) was the most frequent of the dataset (23.4%), while lineage 4 and its sub-lineages dominated over-all (48%; Fig.S2). The final cohort of 859 assemblies, representing both genetic and geographical diversity (Supplementary Data 1; Fig.S3), had a mean N50 of 2.13 Mb (range: 11.2 kb–4.45 Mb) and mean coverage depth of 125× (range: 7×–859×; full metrics in Supplementary Data 2). To construct the Mtb -PRG, we used minigraph[ 20 ], which efficiently incorporates sequences that are larger than 49 bp and missing from the reference genome while preserving reference coordinates. Starting with the H37Rv reference genome (accession: NC 000962.3) as the backbone, additional genomes were added to the Mtb -PRG in lexicographic order by sample name. Newly added sequences formed nodes, which represent distinct segments of genomic sequences that are connected by paths, resulting in 9,350 nodes in total. Nodes connected to each other but not part of the reference formed 848 bubbles, with the largest spanning 97,162 bp (H37Rv coordinates: 3,386,328–3,483,490; Fig.S4). 2.2 Genotyping SVs using the Mtb -PRG and miniwalk is more precise than using a traditional short-read SV caller PRG-involving approaches have been shown to call SVs more accurately from short-read sequencing data compared to traditional single-reference genome approaches[ 16 ]. Since most Mtb genomic data are derived from Illumina short-read sequencing, accurately identifying SVs using such data is valuable. However, no existing method genotypes SVs directly from graphs generated using minigraph. To address this, we developed miniwalk, a tool that identifies SVs by comparing the graph traversal paths of an assembly or long reads, to a reference genome mapped to a minigraph PRG (see Supplementary Methods 1). Miniwalk outputs insertions (INSs), deletions (DELs), duplications (DUPs) and inversions (INVs). We benchmarked miniwalk using 16 samples which each had a high-quality polished long-read assembly to act as truth. To avoid bias the 16 samples were removed from the PRG for the benchmarking[ 21 ]. Truth SVs were called using SVIM-ASM[ 22 ] on assemblies mapped to the H37Rv reference genome while miniwalk called/queried SVs on assemblies mapped to the Mtb -PRG. Precision and recall, calculated using the miniwalk bench module (see Supplementary Methods 2), averaged 0.85 and 0.82, respectively, showing comparable genotyping performance to SVIM-ASM ( Fig.1a ). Download figure Open in new tab Fig. 1 Benchmark of SV genotyping using the Mtb -PRG with miniwalk against the linear genome with SVIM-ASM and manta. a Results comparing Mtb -PRG and miniwalk SV genotyping against SVIM-ASM truth SV calls using the 16 long-read, polished assemblies. b Histogram with 50 bp bins showing the found SVs in a , with two segments with 50 bp bin from SVs of size 50-100 bp, 400 bp bin for SVs of size 100-500 bp, 500 bp bin for SVs of size 500-3,000 bp and SVs larger than 3,000 bp grouped together. The two segments show the precision and recall across different SV sizes. c Results for genotyping SVs from Illumina data mapped to H37Rv and calling with manta and mapping to the Mtb -PRG and calling with miniwalk against the SVIM-ASM truth SVs in a . d Average SV precision, recall and frequency of short-read data stratified by length from manta SVs against the SVIM-ASM truth SVs. Bin sizes as in b . e Same plot as d for genotyped short-read SVs using the Mtb -PRG and miniwalk against the SVIM-ASM truth SVs. f Results for genotyping SVs from Illumina data mapped to H37Rv and calling with manta and mapping to the Mtb -PRG and calling with miniwalk-long-read SVs as truth ( a ). g Same plot as d with miniwalk-long-read SVs as truth. h Same plot as e with miniwalk-long-read as truth SVs. i Precision values across different vcfdist partial credit thresholds for miniwalk-long-read (long-read assemblies mapped to the graph, genotyped with miniwalk), manta (short reads mapped to H37Rv, genotyped with manta) and miniwalk-short-read (short-read assemblies mapped to the graph, genotyped with miniwalk) SVs against the SVIM-ASM truth SVs. j Similar to i , for recall values. Nevertheless, precision varied across SV types, with DELs of size 500-1,000bp having the lowest precision (0.5; Fig.1b ), though fewer SVs were found in that size range. We also constructed an additional PRG excluding assemblies with N50 < 90,000bp and coverage < 25× to determine whether a more stringent approach should have been taken. The average precision and recall using the more stringent PRG was 0.84 and 0.83, respectively, indicating that our initial PRG had better precision. To assess short-read data performance, Illumina reads used for polishing the 16 assemblies were analyzed using manta[ 23 ], a leading short-read SV caller[ 24 ]. Reads were also assembled with shovill[ 25 ], mapped to the Mtb -PRG, and SVs genotyped with miniwalk. To assess precision and recall of SV genotyping using these two approaches (linear H37Rv/manta and Mtb -PRG/miniwalk), we compared both to SVIM-ASM truth SVs as in Fig.1a . We found miniwalk demonstrated significantly higher precision (0.7 vs. 0.46 for manta; two-sided t-test, p = 1e-6) at the cost of slightly lower recall (0.34 vs. 0.42 for manta; two-sided t-test, p = 0.007; Fig. 1c ). Breaking down by SV type, manta achieved a precision and recall of 0.42 and 0.63, respectively, for DELs and 0.42 and 0.36, respectively, for INSs ( Fig. 1d ). Meanwhile, miniwalk achieved a precision and recall of 0.64 and 0.54, respectively, for DELs, and 0.86 and 0.19, respectively, for INSs. DELs sized > 1,000 bp had precision exceeding 0.8 ( Fig. 1e ). Considering SVs genotyped with miniwalk from the long-read assemblies mapped to the Mtb -PRG as the truth (SVs from Fig.1a-b ; miniwalk-long-read truth SV), precision and recall decreased for Illumina data mapped to the H37Rv linear reference ( Fig. 1f,g ) but improved slightly for short-read assemblies mapped to the Mtb -PRG ( Fig. 1f,h ). These results highlight miniwalk’s precision advantage when using a PRG compared to a linear genome-based method. To further validate the use of miniwalk, we used partial credit from vcfdist[ 26 ] to evaluate the similarity between our variant calls and the SVIM-ASM truth set. Vcfdist assigns each pair of variants a partial credit score ranging from 0 to 1, based on edit distance (sequence content, position), where 1 means identical variants and 0 means no overlap. We applied growing partial credit (ct) thresholds starting from 0.001. Lower ct thresholds allow called SVs with partial similarity to the truth to be considered true positives, while higher ct thresholds impose stricter criteria, requiring close matches in SV type, position, and sequence content. This evaluation included the previous long-and short-read assemblies mapped to the Mtb -PRG as well as the variants called from manta. For miniwalk-long-read SVs, precision and recall showed a gradual decline as the threshold increased, reflecting differences in sequence content (e.g., long-read sequencing errors, SNPs) and positions between called and truth SVs. Nonetheless, precision (0.75) and recall (0.74) remained moderately high for long-read assemblies mapped to the Mtb -PRG at a ct threshold of 0.5, demonstrating the accuracy of SVs called using our graph-based approach. For short-read data, miniwalk outperformed manta across all ct thresholds up to 0.985 for both precision and recall ( Fig. 1i-j ). Further analysis revealed that SVs in complex regions were the ones driving the decrease in precision and recall in short-read data (see Supplementary Results 1) These results suggest using a genome graph as a superior alternative to a linear reference genome for genotyping SVs. With this in mind, we proceeded to learn about SVs in Mtb using the Mtb -PRG and miniwalk. 2.3 Landscape of structural variation across the M. tuberculosis genome and phylogeny We sought to improve the characterization of SVs across diverse Mtb lineages using high-quality long-read data. Assemblies used to construct the Mtb -PRG were mapped back to the PRG. The 38 most fragmented assemblies were excluded from this analysis as they could not be mapped using minigraph’s default parameters (see Methods). SVs in the remaining assemblies (821) were genotyped against the H37Rv reference genome using miniwalk. To further define the diversity of SVs in the Mtb genome, DELs and DUPs in close proximity were grouped as copy number variants (CNVs). SVs within ± 25bp proximity, ± 50bp size, and of the same type were considered identical across solates. A total of 3,077 unique SVs were identified relative to H37Rv, broken down into 1,127 INSs, 1,694 DELs, 59 DUPs, 58 INVs and 139 CNVs. The largest SV was a 66,582bp region (H37Rv coordinates: 2,901,855–2,968,437) that was inverted and translocated to a novel genomic position (H37Rv coordinates: 415,370) in lineage 9 (Fig.S5). The translocation implicated 70 genes and truncated Rv2577 and Rv0345 . This rearrangement highlights the complexity of SV patterns observed in Mtb , though most isolates presented small DELs, CNVs and numerous IS6110 elements (°1,358bp; Fig.2a ). A principal component analysis (PCA) performed with the SV genotypes revealed that SVs contribute to population structure, especially along PC2 which separates the ancient (L1, L5-9) from the modern (L2-4) lineages ( Fig.2b ). Notably, the “TbD1” SV at position 1,761,789 explains 2% of the variability in PC2 (Fig.S6). To understand the acquisition of SVs across lineages over time, SVs were geno-typed against an ancestrally reconstructed sequence (MTBC0[ 27 ]) and mapped to a phylogenetic tree from the genotyped isolates ( Fig.2c ). This showed that all isolates have acquired a similar amount of SVs across the phylogeny, the least being L4 with an average of 67 SVs per isolate and the most being L8 with 82 SVs (though 1 isoate was used). Interestingly, Beijing lineage isolates segregated into two branches, one with an average of 86 SVs per isolate and the other with an average of 68 SVs per solate, indicating distinct evolutionary trajectories of SV accumulation within this sub-lineage. When analyzing L4 sub-lineages, the globally spread generalists L4.1.2 and L4.3[ 28 ] had an average of 60 and 63 SVs per isolate, respectively, whereas the specialists L4.1.3 and L4.6.2[ 28 ] had an average of 74. From these results, generalist ineages have significantly less SVs than specialist lineages (two-sided wilcoxon test, p -value=0.03). SVs are generally deleterious in nature, so lineages with more SVs could have suffered more deleterious mutations than those with less SVs, hindering their geographical spread[ 28 ]. Alternately, the higher number of SVs in specialist subineages could reflect the process of host-population adaptation via reductive genomic evolution[ 29 ]. Download figure Open in new tab Fig. 2 Structural variation genotyped from 821 long-read Mtb assemblies covering L1-L9. a Distribution of SVs in isolates by sizes and type. The x axis is in log10 scale. b PCA based on the isolates’ SV genotypes. c Phylogenetic tree filtering out 75 isolates with disconcordant SNP genotypes and lineage calls due to lower quality long-read sequencing data. Barplots represent the number of SVs compared to the ancestral reference sequence MTBC0. Rings, inner to outer, represent 0, 50 and 100 SVs, respectively. d Distribution of SVs along the genome by their fold-enrichment and type across all isolates. Horizontal lines represent no fold change of a specific position from the rest of the genome. The panel on the right represents the zoomed-in region of 3,200,000-3,600,000 with its own fold-enrichment values and genes along the region. To identify SV hotspots, we scanned the genome using 50,000bp windows with 5,000bp steps and calculated fold-enrichment relative to the average SV density. Results indicated an uneven SV distribution, with a prominent hotspot between 3,200,000–3,600,000bp enriched in PE/PPE genes ( Fig. 2d ). We found that 75% of SV breakpoints fall in genes. This is lower than expected by chance (mean random break-points = 92%, p -value = 1e-10), likely due to the deleterious effect of SVs (Fig.S7). Moreover, SV breakpoints were significantly associated with GC-rich sequences compared to randomly sampled genome regions (two-sided wilcoxon test, p = 5e-8; Fig.S8). PE/PPE genes are known for their GC-rich repetitive regions, which serve as an SV hotspot. These findings highlight that SVs in Mtb are not randomly distributed but are enriched in specific genomic features and regions. 2.4 Pangenomic insights into structural variation in the ESX loci of M. tuberculosis The Mtb genome encodes for five type-7 secretion systems which are known to play an mportant role in pathogenesis and virulence, the most studied being ESX-1[ 30 , 31 ]. ESX-5 is important for full virulence[ 32 ] and has been involved with various functions, such as the secretion of PE/PPE proteins, the activation of the host cell inflammasome response or macrophage death induction[ 31 ]. Its paralogs’ (ESX-5a, ESX-5b and ESX-5c) functions, however, are less understood. To determine the presence of SVs in the ESX-5 loci and which are more conserved, we analyzed the haplotypes from the 821 ong-read assemblies mapped to the Mtb -PRG. All ESX-5 loci had one SV bubble spanning their locus ( Fig.3a ). Most isolates acked SVs across these loci with most non-reference haplotypes being present solely n 1-6 isolates. Nevertheless, the most frequent non-reference haplotype was a ppe25 - ppe27 deletion, present in 9% of all isolates ( Fig.3b ). The deletion was found in all sub-ineage 4.4 isolates and sporadically in other lineages (L4.1, L4.3, L4.5, L1, L3 and L8). Download figure Open in new tab Fig. 3 Visualization of the structural variation in the ESX-5 loci through the pangenome reference graph. a Locations of ESX-5, ESX-5a, ESX-5b and ESX-5c in the Mtb -PRG. Different colours in each loci represent different genes. b Structural haplotypes of each ESX-5 locus. The order of the loci is the same as in a , the graph representation is of the H37Rv reference on the left and of the non-reference most frequent haplotype on the right, for each locus. Nodes traversed through each haplotype are coloured in blue with arrows pointing towards the direction of the genome. Tables and gene representations show the frequency of each haplotype and its effect on the gene content. Visualizations done using Bandage[ 40 ]. The deletion can most likely be attributed to non-allelic homologous recombination as the 3’ sequences of ppe25 and ppe27 are highly homologous (Fig.S9). The deletion was found to provide higher persistence in mouse models and showed convergent evolution n M. canettii [ 33 ], which could explain its homoplasic effect across the Mtb phylogeny (Fig.S10). A previous study documenting this deletion hypothesized that the loss of immunomodulatory PE/PPE genes could help the bacteria escape the immune system[ 34 ]. We extended the analysis to assess the impact of SVs across all of the ESX genes. To understand the probability of observing SVs in a gene, we calculated the trimmed (10%; removal of outliers) mean number of SVs in essential (0) and non-essential (0.76) genes[ 35 , 36 ], with 33% of genes being overlapped by an SV. Genes prefixed with the etters ‘ecc’ and ‘mycP’ in the ESX loci encode proteins which form the secretion system responsible for translocating proteins involved in virulence[ 37 ]. Consistent with their essential role in virulence [ 38 ], we found 0 isolates from our collection of 821 to have SVs impacting structural components of the ESX-3 and ESX-5 loci, and only one with an SV impacting ESX-1 genes eccE1 and mycP1 . In contrast, the ESX-2 and ESX-4 loci, which are non-essential and have yet to be ascribed a function ([ 38 ]), had between 1 and 16 isolates with SVs overlapping their gene members[ 38 ] (Fig.S11a). Among all genes, espI from ESX-1 had the highest number of SVs, with 125 isolates exhibiting overlaps (Fig.S11b). EspI acts as a negative regulator of the ESX-1 secretion system; thus, the high frequency of SVs in this gene may suggest positive selection for maintaining constant expression of the system[ 39 ]. Genes within the ESX loci prefixed with the letters ‘esx’ encode proteins that are secreted through the ESX apparatus and thought to be implicated in host-pathogen interaction. Across our collection of 821 isolates, no esx genes in the ESX-1 and ESX-5 loci were found to have any SVs (Fig.S11c), however esx genes in the ESX-2 locus displayed SVs in 3 isolates. Of the ESX-5 paralogs, ESX-5a was the most strongly conserved (1 isolate with SVs), suggesting deletions of this locus are less well tolerated than deletions n ESX-5b and ESX-5c ( Fig.3b ,S11c-d). Interestingly, one instance of a full ESX-5c deletion was found in the geographically restricted African lineages (L6 and L9). 2.5 A genome and phylogeny-wide screen of SV positive selection in M. tuberculosis We performed a systematic screen for genes which evolved SVs repeatedly across the phylogeny, following the rationale that this may reveal genes targeted by positive selection[ 41 ]. The criteria for our screen stipulated that the gene had to be affected by a SV n at least 10 isolates, belonging to more than one Mtb lineage. We further excluded genes in or ± 3,000bp of low mappability regions[ 10 ], as those regions are more prone to structural rearrangements, which could be falsely perceived as positive selection. This analysis resulted in 59 homoplasic gene candidates. The gene with the strongest homoplasic effect was Rv3177 , with SVs in 158 isolates from L1.2.1, L1.1.1, L2.2.1 (82% of all isolates), L2.2.2, L3.1.2, L3.1.3, various L4 sub-lineages and L5 in 15 different breakpoints across the gene ( Fig.4 ). Rv3177 is a probable peroxidase involved n detoxification of reactive oxygen species (ROS). In a previous study, variants in Rv3177 were associated with isolates harboring the isoniazid resistance-conferring katG -S315T mutation, suggesting a possible compensatory role in drug resistance[ 42 ]. Another gene that acquired SVs in different branches of the phylogeny was ctpG with 6 different DEL breakpoints across the gene (Fig.S12) in L3, L4.3.4 and L5 ( Fig.4 ). Download figure Open in new tab Fig. 4 Homoplasic effect of SVs overlapping genes across the phylogeny. Same phylogenetic tree as in Fig.2c . The outer rings represent the presence (colour) or absence (white) of an SV in a specific gene or each isolate. The patchy distribution of some SVs within clades may be explained by inaccurate placement of isolates in the phylogeny, likely due to higher SNP error rates associated with long-read sequencing, especially in isolates generated using earlier versions of these technologies. CtpG is a zinc (Zn) exporter whose deletion has been associated with Zn accumulation[ 43 ]. During host infection, Mtb goes from Zn-rich to Zn-limited environments[ 44 ]. A non-functional Zn exporter could possibly provide bacteria in Zn-limited conditions a selective advantage. Other interesting genes were mmpS1 , with SVs in various L4 sub-lineages with a prominence in L4.1 as well as few isolates in L2 and L3, and glnA3 , with SVs in all L2.2.2 isolates and sporadically in L2.2.1, L3 and L4.1 ( Fig.4 ). We sought to validate these results within a larger cohort. To do so, we downloaded a list of 44,709 high-quality Illumina sequences grouped in a previous study[ 17 ]. After assembly we ended with 41,134 SV VCF files. We found few isolates with SVs in Rv3177 and mmpS1 , as Rv3177 is in a large bubble, likely hard to genotype using short-read data, and the mmpS1 SVs were all INSs, variants with low genotyping recall using short-read data. We were able to confirm the same distribution of SVs for ctpG and glnA3 (Fig.S13). A similar distribution was observed for SVs in ethA , a gene whose deletion is associated with ethionamide (ETO) resistance, with a main cluster ocated in the Beijing lineage. For pncA , a pyrazinamide (PZA) resistance-associated gene, we found less isolates with SVs and the distribution was not as clustered, though mainly isolates in L2 had SVs in that gene. These results showcase the added benefit of using a pangenome to identify key genes implicated in Mtb evolution, and highlight four novel candidates for prioritisation in future research. 2.6 Characterization of a M. tuberculosis lineage 1.2.1-specific copper exporter deletion We expanded the previous screen to detect genes with multiple lineage-specific SVs. This analysis revealed diverse deletions of the ctpV gene, unique to the L1.2.1 sublineage (excluding its deepest branch split; Fig.5a ; Fig.S14). These deletions increased in size further from the branch split ( Fig.5b ) and overlapped the gene’s start codon, indicating a loss-of-function mutation ( Fig.5c ). SNPs could not be reliably called for all the isolates as long-read data was used, nevertheless, we found no loss-of-function variants in this gene in other L1 sub-lineages. CtpV is a putative P-type ATPase copper (Cu) exporter protein, with the main Cu binding site located in the first 66 amino acids. A multiple sequence alignment of the protein sequence among diverse mycobacteria showed > 80% pairwise identity across the Mycobacterium genus (Fig.S15), aside from Mycobacterium leprae , highlighting the importance of this protein not only in Mtb . Download figure Open in new tab Fig. 5 Discovery of growing ctpV deletions across the L1.2.1 phylogeny and a transcriptomics analysis. a All genes across the genome with the number of isolates that have SVs overlapping them in the y axis. Red-coloured points represent those that passed the second positive selection screen; i.e. those genes that have different SVs in a same gene, unique to a lineage. b Phylogenetic tree of all ineage 1 isolates, showing those that present ctpV deletions and/or non-synonymous mutations. SV IDs are formulated as following “ { start position of SV } . { SV type } . { size of SV } ”. Tree visualized with TreeViewer[ 53 ]. c Read coverage across the csoR operon for each unique deletion spanning ctpV . Y axis shows the read depth at each genomic position. SV ID the same as b . Visualization done using pyGenomeTracks. d Volcano plot summarizing the differential expression results comparing L1.2.1 against L1.1.2 and L1.1.1 isolates. P-values are calculated using a two-sided t-test, with Benjamin-Hochberg multiple hypothesis correction. Log fold change on the x-axis. e Functional categories from Mycobrowser of the significant differentially expressed genes. f Dot plots showing expression patterns or 4 copper-associated genes in L1.2.1 against L1.1.2 and L1.1.1. In red are genes enriched and in blue are genes depleted in L1.2.1. L1.2.1 is predominant in Southeast Asia, especially the Philippines, the fourth country in the world with the highest burden of active tuberculosis[ 1 ], accounting for °65-80% of all lineages in the country[ 45 , 46 ]. Moreover, this sub-lineage shows high drug resistance rates, high transmissibility and is the most prevalent L1 sub-lineage globally[ 47 , 48 ]. Given these characteristics, we hypothesized that this deletion impacts the expression of copper-associated genes, potentially contributing to the unique adaptive traits of this sub-lineage. To determine this, we compared the transcriptional profile of an L1.2.1 isolate (297bp ctpV confirmed deletion) against L1.1.1 and L1.1.2 isolates under normal growth conditions[ 49 ] (Table S2; Fig.S16). This analysis revealed 147 differentially expressed genes ( Fig.5d ). Functional categorization revealed that the L1.2.1 isolate showed enrichment of genes involved in virulence, detoxification and adaptation ( Fig.5e ). ctpG and Rv0968 were among the differentially expressed genes ( Fig.5f ). Rv0968 is part of the cso operon, where ctpV lies. This operon is controlled by CsoR which inhibits its own expression in normal conditions. However, when Cu is present, it attaches to CsoR and inhibits its function, enabling Rv0968 and ctpV expression[ 50 , 51 ]. The higher expression of both Rv0968 and ctpV (non-significant; adjusted p -value=0.053) was indicative of Cu accumulation inside these isolates (Fig.S17). Moreover, the enrichment of ctpG , another P-type ATPase metal transport protein, has been previously associated with Cu accumulation in the cell, as well as the depletion of ctpB , a P-type ATPase Cu transport protein associated with Cu uptake in Cu-replete medium (nonsignificant; adjusted p -value=0.059)[ 51 , 52 ]. Nevertheless, genes associated with oxidative stress (i.e. sigE , furA , nuoB ) or protein stress (i.e. rpsR , rplI , rpsQ ) were not differentially expressed, indicating that Cu was not present at toxic levels (Supplementary Data 3)[ 51 ]. Altogether, these results are indicative of higher Cu accumulation in an L1.2.1 isolate compared to other L1 sub-lineages. 2.7 Discovery of DR-associated SVs in 41,134 M. tuberculosis isolates Using the Mtb -PRG and miniwalk, we aimed to discover DR-associated SVs across various first and second-line drugs for treating TB. The previous list of 41,134 isolates also contained phenotypic information for at least one drug, with first-line drugs being vastly more represented than second-line drugs ( Fig.6a ). To search for associations we used two parallel methods: Firth logistic regression and a supervised machine learning model (xgboost), testing each drug independently and accounting for population structure and TB-Profiler predictions[ 54 ]. Analyses were conducted at both the SV and gene levels (SV-gene burden test). Download figure Open in new tab Fig. 6 Resistance-conferring SVs across 41,134 isolates and 22 drugs. a Stacked barplot showing the percentage of isolates that are susceptible, resistant or have no information (NA) across the 41,134 solates, for each drug. Percentage labels are only shown when the value is ≥ 1%. b Machine learning importance scores (%) of the TB-Profiler mutations predictions. TAC is not illustrated as only resistant isolates were available. c Thorough description of the 14 associated SVs, ranked by their adjusted p-value (top - lower p-value). “Total Resistant” is the number of resistant isolates that presented the SV; the same applies to “Total Susceptible” for susceptible isolates. AMK: amikacin, BDQ: bedaquiline, CAP: capreomycin, CIP: ciprofloxacin, CFZ: clofazimine, CS: cycloserine, DLM: delamanid, EMB: ethambutol, ETO: ethionamide, GAT: gatifloxacin, INH: isoniazid, KAN: kanamycin, LEV: levofloxacin, LZD: linezolid, MFX: moxifloxacin, OFX: ofloxacin, PAS: para-aminosalicylic acid, PZA: pyrazinamide, RBT: rifabutin, RIF: rifampicin, STR: streptomycin, TAC: thioacetazone. Logistic regression revealed 14 non-canonical SVs (post-filtering) significantly associated (adj. p-value < 0.05) with 7 DR phenotypes (Supplementary Data 4). Associations in PE/PPE genes were excluded due to their SV hotspot nature (list of discovered associations: see Supplementary Data 5). Xgboost outputs the estimated importance of each SV and TB-Profiler prediction to the analysis (ranging between 0-100%), indicating the relevance of these factors in classifying DR phenotypes. This analysis highlighted the contribution of the current list of WHO mutations, as identified by TB-Profiler, in predicting DR, with first-line drugs as well as fluoroquinolones and aminoglycosides having high importance scores (importance > 50%). However, newer drugs such as bedaquiline, delamanid or clofazimine showed lower importance scores ( Fig.6b ). The SV-gene burden test revealed 27 non-canonical genes associated with drug resistance across 10 drugs (Supplementary Data 6). To increase confidence in our findings, we performed, for each associated gene and SV, a permutation analysis to estimate the likelihood of our observed p-values under a null distribution. This showed that under the null hypothesis, there’s < 5% chance of observing these associations, suggesting statistical significance for all variants. For isoniazid (INH), we found 2 SVs and 3 genes associated with DR ( Fig.6c ; Fig.S18-19). The most significant SV was a deletion overlapping Rv3434c ( Fig.6c ), a gene previously associated with kanamycin (KAN) and amikacin (AMK) DR[ 55 ]. An overwhelmingly 99% of isolates with a mutation in this gene presented an isoniazid resistance phenotype, the vast majority from L2. Nevertheless, only five isolates did not present TB-Profiler predictions for INH DR. Another deletion spanning Rv0738-Rv0741 was predominantly found in resistant isolates (97%), with an Rv0740 knock-out having previously been associated with INH resistance[ 56 ]. Five isolates with SVs in Rv0740 did not have TB-Profiler predictions for INH DR. These findings underscore the potential of SVs in identifying additional resistance mechanisms, particularly in a widely distributed and commonly used drug like INH, not captured by TB-Profiler. For rifampicin no non-canonical SVs or genes with SVs were detected in our analysis. For PZA, we found 4 SVs and 6 genes. The highest-scoring gene was pncA , with 2.6% of DR isolates having SVs overlapping that gene (Fig.S19). An operon with SVs and also highly associated with PZA DR was Rv1505c-Rv1507c , which plays a role in diacyltrehalose biosynthesis, a lipid that forms part of the cell wall[ 57 ]. The only gene with SVs associated with ethambutol (EMB) DR was accE5 , whose encoded protein is found in the cell wall[ 58 ] (Fig.S19). For second-line drugs, we found non-canonical SVs and genes with SVs for AMK, bedaquiline (BDQ), capreomycin (CAP), ciprofloxacin (CIP), clofazimine (CFZ), delamanid (DLM), ETO and streptomycin (STR) ( Fig.6c , Fig.S19; see Supplementary Results 2). Our results highlight the importance of different types of SVs in DR, such as an insertion associated with STR DR, as well as the role of non-coding regions, such as a deletion upstream Rv2258c associated with CAP DR. The low importance scores of second-line, recently introduced drugs (BDQ, CFZ, DLM) highlight the need to find more DR variants. 3 Discussion SVs have a higher potential to disrupt gene-coding and cis-regulatory regions than SNPs due to their larger sizes, yet they have been excluded and understudied in many species[ 9 , 59 ]. Their role in Mtb pathogenesis has been vaguely explored, mainly in uncovering their role in drug resistance[ 7 , 11 , 14 ] and evolution[ 6 , 60 ]. Nevertheless, previous studies have focused on specific genes, had small sample sizes lacking lineage diversity, were limited to first-line drugs or used short-read data, reducing potential to uncover the role of these variants. In this study, we leveraged both long-read sequencing data and genome graph methodologies across thousands of isolates to reveal how SVs affect diverse aspects of Mtb pathogenesis. To profile the diversity and effect of SVs across the Mtb genome, we constructed a Mtb -PRG from 859 long-read assemblies. The inclusion of at least one isolate across lineages 1-9 in the Mtb -PRG was important to capture genomic diversity. Mini-walk was developed to use the output from minigraph to genotype SVs against a reference genome. Benchmarking showed that using minigraph and miniwalk on long-read assemblies performs similarly to SVIM-ASM. Using minigraph and miniwalk on short-read assemblies provided a significantly higher precision than using a linear genome and manta. The precision increase is seen especially in insertions and large deletions. Recall is not significantly higher as minigraph provides an NA whenever it cannot correctly map the assembly through a bubble. As miniwalk complements minigraph’s output, it could be used with graphs from other species, allowing for wider applications, though this remains to be tested. One current limitation of mini-walk would be that it needs assemblies or long reads in order to genotype SVs. A future direction for miniwalk lies in genotyping SVs using unassembled short reads which could also improve the recall. Our findings illustrate the advantages of adopting pangenome-based approaches for SV detection, highlighting miniwalk as a useful resource. We provide a description of 3,077 SVs in the Mtb genome across 821 diverse isolates with long-read data. When investigating the structural variability of the ESX loci, we revealed that ESX-1, −3 and −5 are more conserved than ESX-2 and −4, and that ESX-5a is the most conserved out of the three ESX-5 paralogs. This suggests that the function of ESX-5a could be more essential for the bacterium. ESX-5c, on the other hand, was found to be completely deleted in all L6 and L9 isolates, showing its redundancy regarding bacterial viability. A frequent ppe25-ppe27 deletion was observed in all sublineage 4.4 isolates, a variant previously associated with greater bacterial persistence in a mouse model[ 33 ]. Interestingly, the gene with the most SVs in the ESX loci across all isolates was espI (15% of all isolates), from the ESX-1 locus. espI is located in a region of low-mappability, making it challenging to determine whether these SVs are under positive selection. Nevertheless, a previous study[ 39 ] found that, in the presence of BDQ, ESX-1 secretion is blocked due to ATP repletion, yet this is corrected with a loss of espI . These findings warrant further investigation of espI mutations in clinical DR isolates using long-read data. L1.2.1 is the most prevalent L1 sub-lineage globally, with L1.2.1.1.2 being geo-graphically unrestricted while L1.1.1.1 and L1.1.3 are geographically restricted[ 48 ]. L1.2.1 predominantly affects individuals in the Philippines and Indonesia, the two countries with the highest rise in TB incidence from 2020 to 2023[ 1 ]. Understanding the higher transmissibility of this sub-lineage could reveal Mtb mechanisms enhancing bacterial survival or host infection efficiency, which could have a direct implication for TB surveillance in these regions. Moreover, knowledge of transmissibility can inform better designs on sub-lineage-specific vaccine candidates[ 61 ]. A lineage-specific SV screen revealed L1.2.1-specific ctpV (a copper exporter transmembrane protein) deletions. The transcriptional profile of an L1.2.1 isolate with the deletion against other L1 sub-lineages revealed enriched genes associated with copper accumulation, though not toxicity. Among other effects, copper can inhibit PdtaS, an important transcriptional repressor, which could explain the enrichment of genes associated with virulence and detoxification in this sub-lineage[ 62 ]. However, more evidence is still needed to directly link the ctpV deletion to Cu accumulation. Additionally, Mtb has other copper exporter proteins[ 63 ], which would explain the viability of these lineages despite having the ctpV deletion. It remains to be seen whether other variants occur on the branch housing the ctpV deletion, and whether they contribute to the virulence of the sub-lineage. Future studies in animal models that support granulomatous lesions could be performed, as Cu is induced in macrophages in hypoxic conditions[ 63 ]. A limtation of the supporting RNA-Seq data is that it was generated in only 1 isolate for each sub-lineage, which may not represent the transcriptional profile of all isolates in those sub-lineages. Regardless, our study identifies a novel gene that may contribute to the higher transmissibility of L1.2.1. Other positive selection signatures across lineages revealed deletions in another metal exporter, ctpG , highlighting the importance of metal homeostasis in Mtb adaptation to different host environments. By accounting for WHOv2-listed mutations, we were able to detect non-canonical SVs, and genes with SVs, associated to DR across 11 drugs, including important, recently introduced second-line drugs. Consistent with previous studies[ 55 , 64 ], the dentified SVs affect both coding and intergenic regions, emphasizing the importance of the non-coding regions of the genome. Our graph approach facilitates the discovery of these SVs, which could be especially beneficial in those settings where whole genome sequencing is routinely done to diagnose DR-TB[ 2 , 3 ]. The high frequency of SVs in ETO- and PZA-DR isolates showcases the need to improve SV detection in clinical strains. Since our approach relies on SVs found in 859 assemblies, future studies that focus on long-read sequencing isolates with phenotypic DR might add undiscovered DR-associated SVs to the Mtb -PRG. As recent drugs were underrepresented in our DR Illumina dataset, the lack of power might not have allowed us to detect important SVs associated with DR; as more Illumina data becomes available for these drugs, the power to detect associations will improve. As shown in recent studies[ 65 , 66 ], using minimum inhibitory concentration (MIC) instead of binary phenotypes might enhance ow-level DR variants detection. Finally, external validation of the associations should be done to further confirm our findings. The Mtb -PRG and miniwalk tools developed here, which are now openly accessible to the TB research community, open new opportunities to investigate the contribution of SVs to drug resistance, evolution and Mtb pathogenesis. 4 Methods 4.1 Public Mtb long-read sequence and assembly datasets We gathered available long-read (Pacific Biosciences, Oxford Nanopore Technology) sequence data and assemblies from public repositories using fastq-dl (v2.0.4)[ 67 ] (relevant ENA/SRA/GCA IDs to access all data can be found in Supplementary Data 1). 4.2 Mtb culture and DNA extraction Genomic DNA was extracted from Mtb isolates grown on solid culture utilising the FastPrep 24 homogeniser followed by purification and concentration of the crude cell ysate using the QIAamp 96 DNA QIAcube HT Kit. 4.3 Long-read sequencing of underrepresented Mtb isolates Sequencing was done in two batches for African (L5, L6 and L9) and non-African ineages separately. For African lineages, DNA was quantified using the Quant-iT 1X dsDNA high sensitivity assay kit (Q33232) (Thermo Fisher Scientific) as per the manufacturer’s instructions. Library preparation was performed using the Rapid Barcoding Kit (SQK-RBK004) (Oxford Nanopore Technologies), as per the manufacturer’s instructions. Briefly, DNA (variable input ranging from 60–180 ng) was fragmented enzymatically and barcode adapters ligated to each sample. The barcoded libraries were then pooled and sequencing adaptors added prior to loading onto an R9.4.1 flow cell. Libraries were sequenced on the GridION Mk1 platform (Oxford Nanopore Technologies). Basecalling was done using Guppy (v7.2.13) and the “dna r9.4.1 e8 [email protected] ” model For non-African lineages, sequencing was performed using the Ligation Sequencing Kit (SQK-LSK109) and the Native Barcoding Kit 1D (EXP-NBD104 and EXP-NBD114) according to the manufacturer’s instructions on the GridION platform with R10.4.1 flow cells. Basecalling was done using dorado (v0.5.3) and the “dna r10.4.1 e8.2 400bps [email protected] ”. 4.4 Sequencing data quality control, assembling, NCBI assemblies and assembly quality control Nanopore raw fastq files were pre-processed using Porechop (v0.2.4)[ 68 ] to remove all adapters in the sequences and nanoq (v0.10.0)[ 69 ] to filter out reads that were under 500bp in length and with a quality lower than 7. PacBio-CLR fastq files were preprocessed with seqkit (v2.3.1)[ 70 ] by removing reads shorter than 500bp and duplicate reads. Pbclip[ 71 ] was used to correct reads in case the PacBio toolchains handled the raw to fastq file conversion improperly. For Pacbio-Hifi reads, HifiAdapterFilt (v2.0.1)[ 72 ] was applied to detect and remove possible adapters that were left in the samples and seqkit was applied to remove reads shorter than 500bp. After read preprocessing, reads from each technology were decontaminated as described in Hall & Coin [ 73 ] which, in short, is a 2-step process that uses kraken2 (v2.1.2)[ 74 ] to firstly extract human reads from a sample and after that selects solely those reads classified as belonging to Mtb . Flye (v2.9.2)[ 18 ] was used to create the assemblies. Assemblies were finally polished with their own long reads to correct for assembling mistakes using Pilon (v1.23)[ 75 ]. Long-read-sequenced NCBI assemblies were also downloaded to obtain a wider dataset and, jointly with the long-read assemblies, were run through QUAST (v5.2.0)[ 19 ]. A threshold of 95% was used for completeness to filter incomplete assemblies. Contiguity was assessed by applying an N50 filter; the largest repetitive sequence in the Mtb genome (10,000bp). Correctness was assessed by applying a number of misassemblies filter of maximum 100. As PB-CLR data showed completely distinct distributions of data and few assemblies were passing the filters, this data was completely removed from the dataset. Contigs with a length shorter than 1000bp were removed using sequence-stats (v1.1)[ 76 ]. Removing assemblies according to their length was not necessary as the remaining ones were all inside the reference genome length range by 10% (4,852,685.2 - 3,970,378.8). Moreover, filtering by the number of contigs was not applied as the maximum number of contigs an assembly had was 589, far from the filter applied by Blackwell et al. [ 77 ] (2000). 4.5 Lineage assignment Long reads used to create the assemblies were sent through TB-Profiler (v5.0.1)[ 54 ] which assigned the lineages to each sample from the fastq data using Coll et al. [ 4 ]’s lineage nomenclature. For the NCBI assemblies, as they weren’t in fastq format, they were mapped to the reference genome first using minimap2’s -x asm5 flag (v2.24)[ 78 ] as well as its paftools.js script to transform the alignment into a VCF file with the detected SNPs. The file was then sent to fast-lineage-caller (v0.3.2)[ 48 ] to determine the lineage under the same scheme as the long-read assemblies. Isolates that were assigned more than one lineage, a Mycobacterium bovis lineage or an N/A were filtered out. 4.6 Pangenome reference graph construction To create the PRG, minigraph (v0.20-r559)[ 20 ] was used with −l 10,000 -d 5,000 parameters. The backbone of the PRG was the reference genome H37Rv and the rest of 859 samples were added to the PRG. After running minigraph with the -cxggs flag, the PRG was created. Nodes containing long mononucleotide sequences were removed from the dataset, as these are likely artifacts resulting from sequencing errors in long-read technologies. Such errors commonly occur in homopolymer regions, where the sequencer struggles to accurately determine the number of repeated bases. 4.7 Genotyping SVs using miniwalk When mapping assemblies to minigraph using default parameters, highly fragmented assemblies are filtered out, however, this can be overcome by applying the −l 10000 -d 5000 flags. Nevertheless, to find robust SV signals across less fragmented assemblies, we used default parameters and filtered out 38 assemblies. SVs of 821 assemblies were genotyped using our tool, miniwalk (v0.1). Genotyping is done in 6 steps. First we create an assembly, next we map it to the PRG using minigraph. Following that, our assembly’s path is merged with a reference path (in this case H37Rv’s path through the PRG) to create a VCF. The VCF is then refined using miniwalk in the last 3 steps: using mod to output specific SVs and their positions, ref to order the SVs and cluster them together where necessary and ins2dup to change assigned INSs to DUPs where necessary. More details in Supplementary Methods 2. 4.8 Creating an Mtb SV gold-standard As there exists no Mtb SV gold-standard to benchmark against, we used a set of high-quality Mtb assemblies[ 21 ] and mapped them to the reference genome H37Rv using minimap2 with the -x asm5 option and SVs were called using SVIM-ASM (v1.0.3)[ 22 ] n haploid mode. The resulting VCF would be used as the gold-standard. 4.9 Calling SVs on short-read Illumina data In this work we called SVs on Illumina data using 2 different methods. We did quality control on the Illumina reads using fastp (v0.23.2)[ 79 ] and filtered out human reads using kraken2, mapped them to H37Rv using bwa mem (v0.7.17)[ 80 ] and called SVs using manta (v1.6.0)[ 23 ]. SVs in the resulting VCF file that were marked as “BND” were filtered out. For the second method we used the filtered reads to create an assembly using shovill (v1.1.0)[ 25 ], mapped the assembly to the Mtb -PRG using minigraph and called SVs using miniwalk. 4.10 Benchmarking SVs called mapping to H37Rv against Mtb -PRG and miniwalk To evaluate the SVs called using minigraph and miniwalk we did 3 separate bench-marks calculating the precision and recall. The first benchmark consisted on comparing the SV calls of polished long-read assemblies with minigraph + miniwalk against H37Rv + SVIM-ASM as the truth. The second consisted on comparing the SV calls of short-read Illumina data with minigraph + miniwalk or H37Rv + manta against H37Rv + SVIM-ASM as the truth. The third benchmark consisted on comparing the SV calls of short-read Illumina data with minigraph + miniwalk or H37Rv + manta against minigraph + miniwalk (polished long-read assembly SVs) as the truth. We used miniwalk’s bench mode to calculate precision and recall. Precision was defined as the rate of true positives (SVs correctly called when comparing to a truth set) over the total sum of true positives and false positives (SVs called but not in the truth set) (TP/(TP+FP)) while recall was defined as the rate of true positives over the total sum of true positives and false negatives (SVs that were not called but were in the truth set) (TP/(TP+FN)). A true positive was determined as such in the case where the called SV overlapped ≥ 25% of a truth SV[ 81 ], matched the sequence at least 50% with a standard SV, in case both were in the same tandem duplication[ 82 ] or spanned 2 standard SVs that were < 200bp apart or vice versa (detailed explanation in Supplementary Methods 2). Miniwalk’s bench code was tested by using synthetic VCF files with expected precision and recall values across different SV representations (detailed explanation in Supplementary Methods 2). The first and second benchmarks were repeated using an external tool, vcfdist (v2.3.2)[ 26 ], to validate the results found using miniwalk bench. The VCF file resulting from minigraph + miniwalk was slightly modified to be accepted by vcfdist, such as replacing the asterisk where there is no sequence in the REF or ALT columns for the corresponding base in that position. Vcfdist was then run with -ct 0.001 to determine the ct score for all SVs. 4.11 PRG assemblies SV PCA SVs that were ± 25bp of proximity, ± 50bp in size and the same type were considered as the same SV across different isolates. A genotype matrix was then created with isolates as rows and all found and clustered SVs as columns and a PCA done using the factoextra package (v1.0.7) in R (v4.1.2). 4.12 PRG Mtb phylogenetic tree To construct a phylogenetic tree, the SNPs of all isolates used to create the PRG was needed. For NCBI assemblies, SNPs were obtained as mentioned in “Lineage assignment”. For long-read data, reads were mapped to H37Rv using minimap2, variants called using Pilon, non-SNP variants were excluded and SNPs of QUAL > 100 were kept. VCF files were merged and consensus fasta files were created using a custom script. The phylogenetic tree was inferred using RAxML (v8.2)[ 83 ] using a GTR model of nucleotide substitution and the L8 isolate as the outgroup. Isolates with disconcordant lineage call and phylogenetic position were filtered out of the tree. 4.13 Pairwise identity calculation Pairwise sequence identities were calculated using a custom Python script implemented with Biopython and NumPy. A multiple sequence alignment of the CtpV protein amino acid sequences across various Mycobacterium species was first loaded from a Clustal-formatted file. For each pair of sequences in the alignment, a pairwise identity score was computed by comparing aligned positions and calculating the proportion of identical residues. 4.14 L1.2.1 transcriptome analysis Public Illumina RNA-Seq data was downloaded using fastq-dl. All L1 isolate data from Chiner-Oms et al. [ 49 ] was downloaded; RNA-Seq data from L1.2.1, L1.1.1 and L1.1.2 in normal growth conditions, with 2 replicates per isolate. Reads were pre-processed using fastp and mapped to the reference genome H37Rv with bwa mem. To identify the strand specificity of the RNA-Seq libraries, the mapped reads were visualized in Integrative Genomics Viewer (IGV)[ 84 ]. Read duplicates were removed using samtools (v1.16.1). Counts of reads mapped to each gene were obtained using featurecounts (v2.0.8)[ 85 ]. To quantify the transcriptional expression, we followed the pre-processing steps of Bei et al. [ 86 ]. We removed small genes ( ≤ 150bp), non-coding transcripts (tRNA, rRNA, and annotated non-coding RNAs in the Mtb genome), non-expressing genes (read counts in all samples were zero) as well as genes that had full genome deletions. As the isolates used for this analysis were the same isolates used for the benchmark, deletions in those isolates were known. Read counts were subsequently normalized using the trimmed mean of M-values (TMM) factor[ 87 ], and the TMM normalized Reads Per kilobase million (RPKM) were calculated using the edgeR package (v3.36.0)[ 88 ]. After TMM normalization, log2(RPKM+1) was calculated and defined as transcriptional expression levels. We used limma (v3.50.3)[ 89 ] for the subsequent steps: we created a multidimensional scaling plot of distances between gene expression profiles to visualize the transcription profiles (Fig.S16). Expression values were fitted to a linear model for estimation of gene expression levels specific to each condition. To stabilize variance estimates and improve the detection of differentially expressed genes, empirical Bayes moderation was applied to the standard errors. Finally, signifcant differentially expressed genes were identified by extracting genes with adjusted p-values < 0.05 (Benjamin-Hochberg correction) with an absolute log2 fold change ≥ 1. 4.15 SV genotyping of 44,709 Illumina-sequenced isolates 44,709 high-quality Illumina fastq files were downloaded from public repositories using fastq-dl. SVs were genotyped as described above using shovill, minigraph with the −l 10,000 and -d 5,000 flags to account for more fragmented assemblies and miniwalk (in this case using the -na flag). SVs were merged as done for the long-read data. 4.16 Phylogeny of 41,134 isolates SNPs for each sample were called using snippy (v4.6.0)[ 90 ]. A pairwise SNP matrix was done using psdm (v0.3.0)[ 91 ] and a phylogenetic tree built using VeryFastTree (v4.0.4)[ 92 ]. 4.17 SV association analysis with DR To account for population structure in the association analyses, SNPs for each sample were called using snippy, non-SNP variants and SNPs falling in the masking sites detailed in Marin et al [ 10 ] were filtered out of each VCF file. All VCF files were merged nto one using bcftools. SNPs with a minimum allele frequency < 0.01 were filtered out and a principal component analysis (PCA) was done using hail (v0.2.13)[ 93 ]. The first 4 PCs accounted for 90.9% of the variance, so those were used in the association analyses. Each sample was also sent through TB-Profiler to account for those samples where DR was explained by SNVs or already-annotated SVs by the WHO[ 94 ]. Two models were done to find SVs associated with DR. First, a Firth logistic regression was done with the drug as the outcome and the SVs as the predictors with the 4 PCs and the TB-Profiler predictions as covariates. Firth was chosen over regular logistic regression to account for the class imbalance in our dataset (less resistant isolates than susceptible ones). Adding TB-Profiler DR predictions as a covariate ensured that associated SVs were independent from already-known WHOv2-listed or TB-Profiler resistance mutations (including already-listed SVs such as the katG deletion). P-values were adjusted after each drug using the Benjamin-Hochberg correction. Associated SVs were filtered out if their adjusted p-values were > 0.05, their frequency in resistant solates was less than double that of susceptible isolates, they were small deletions ( ≤ 100bp) in complex regions[ 10 ], presented a minimum allele frequency (MAF) < 0.01 n the resistant isolates or were phylogenetically restricted. The second approach was xgboost (v1.7.8.1)[ 95 ] as it was able to handle sparse data. As we were only interested n finding the features associated with each drug and not in creating a predictive model, we did not separate between testing and training datasets, allowing us to detect less frequent SVs associated with DR. As for the logistic regression model, the drug phenotype was used as the outcome and the SVs as predictors with the same covariates as the logistic regression model. Feature importance was then extracted from the resulting model. A third analysis was done for those SVs that were associated to various drugs, to determine which drug association was strongest as there were various cases of MDR isolates. In those cases, the SV was used as the dependent variable while the drugs were the independent variables with the same covariates as before. To further confirm the role of the SVs as the sole mutations associated with the resistant phenotype, we assessed whether the isolates presenting those SVs, with no TB-Profiler resistance prediction, had SVs in known, resistance-conferring genes[ 5 ], to filter out SVs associated with DR-causing genes but not with DR (for example large deletions spanning the resistance-causing gene and surrounding genes). For the SV-gene burden approach, if an SV overlapped a gene, regardless of the position and length, those genes would be annotated as affected for those isolates with the SVs. The same pipeline as for the SV-based test was applied. 4.18 Permutation analysis to assess statistical significance To evaluate whether the associations observed between structural variants (SVs) and DR phenotypes could occur by chance, we performed a permutation analysis. For each variant/gene, we randomly shuffled the DR labels among the samples, breaking the true associations while preserving the dataset structure. We repeated this process 200 times, running the same association tests for each permutation to generate a null distribution of p-values for each SV. The observed p-values from the actual dataset were then compared to this null distribution to calculate empirical p-values, representing the proportion of permutations with p-values as significant as or more significant than those observed. This analysis allowed us to assess the likelihood of finding these associations by chance. 4.19 Statistics and visualisations Unless stated otherwise, all statistics and visualizations were done in R. Supplementary information Supplementary files attached. Declarations Funding A.C.B. is supported by a University of Melbourne Research Scholarship. The authors acknowledge support from the National Health and Medical Research Council, Australia APP1172853 (S.J.D), the US National Institutes of Health U19AI162583 (S.J.D). Conflict of interest/Competing interests None to declare. Ethics approval and consent to participate Not applicable. Data availability ONT data generated in this study for African lineages (L5, L6 and L9) is available in BioProject PRJNA857537 while the other newly sequenced lineages can be found in Bioproject PRJNA119321. All public long-read data is available in Supplementary Data 1. All RNA-Seq accession IDs are in Supplementary Table 2 and 3. All public Illumina data used for this project can be found in https://doi.org/10.5281/zenodo.7819984 . The Mtb-PRG and phylogenetic trees are available at https://doi.org/10.5281/zenodo.14842102 . Materials availability Not applicable. Code availability Miniwalk can be accessed through https://github.com/aleixcanalda/miniwalk and the code used for this manuscript can be found in https://github.com/aleixcanalda/Mtb PRG Paper. Author contribution A.C.B., M.S., M.H., L.C. and S.J.D. conceived this project. ONT sequencing was carried out by D.T., L.V. and N.S. All bioinformatics analyses and experiments were done by A.C.B. M.H. computed the 44k isolates’ phylogenetic tree. The work was supervised by M.S., M.H., X.C., L.C. and S.D. A.C.B created the first draft which was modified and revised by all authors. Editorial Policies for: Springer journals and proceedings: https://www.springer.com/gp/editorial-policies Nature Portfolio journals: https://www.nature.com/nature-research/editorial-policies Acknowledgements We thank Jose Antonio Barraza Lopez for assistance with figure design and image preparation. Funder Information Declared University of Melbourne, https://ror.org/01ej9dk98 National Institutes of Health, https://ror.org/01cwqze88 National Health and Medical Research Council, Australia Footnotes https://github.com/aleixcanalda/miniwalk https://github.com/aleixcanalda/Mtb_PRG_Paper https://doi.org/10.5281/zenodo.14842102 https://doi.org/10.5281/zenodo.7819984 References 1. ↵ WHO . Global Tuberculosis Report 2024 2. ↵ Lim , A. Y. H. et al. Implementation of national whole-genome sequencing of Mycobacterium tuberculosis, National Public Health Laboratory, Singapore, 2019—2022. Publisher: Microbiology Society, 001139 . Microbial Genomics , 911 , 2023 . 3. ↵ Dale , K. et al. Whole genome sequencing for tuberculosis in Victoria, Australia: A genomic implementation study from 2017 to 2020. Publisher: Elsevier . The Lancet Regional Health – Western Pacific , 28 , 2022 . 4. ↵ Coll , F. et al. A robust SNP barcode for typing Mycobacterium tuberculosis complex strains. Number: 1 Publisher: Nature Publishing Group, 4812 . Nature Communications , 51 , 2014 . 5. ↵ Walker , T. M. et al. The 2021 WHO catalogue of Mycobacterium tuberculosis complex mutations associated with drug resistance: a genotypic analysis, e265–e273 . The Lancet. Microbe , 34 , 2022 . 6. ↵ Bottai , D. et al. TbD1 deletion as a driver of the evolutionary success of modern epidemic Mycobacterium tuberculosis lineages. Publisher: Nature Publishing Group, 684 . Nature Communications , 111 , 2020 . 7. ↵ Gomes , L. C. , Campino , S. , Marinho , C. R. F. , Clark , T. G. & Phelan , J. E . Whole genome sequencing reveals large deletions and other loss of function mutations in Mycobacterium tuberculosis drug resistance genes. Publisher: Microbiology Society, 000724 . Microbial Genomics , 712 , 2021 . 8. ↵ Goig , G. A. et al. Ecology, global diversity and evolutionary mechanisms in the Mycobacterium tuberculosis complex. Publisher: Nature Publishing Group, 1–13 . Nature Reviews Microbiology , 2025 . 9. ↵ West , P. T. , Chanin , R. B. & Bhatt , A. S . From genome structure to function: insights into structural variation in microbiology, 102192 . Current Opinion in Microbiology , 69 , 2022 . 10. ↵ Marin , M. et al. Benchmarking the empirical accuracy of short-read sequencing across the M. tuberculosis genome, 1781–1787 . Bioinformatics , 387 , 2022 . 11. ↵ Worakitchanon , W. et al. Comprehensive analysis of Mycobacterium tuberculosis genomes reveals genetic variations in bacterial virulence, 1972–1987.e6 . Cell Host & Microbe , 3211 , 2024 . 12. Stritt , C. et al. Gene conversion and duplication contribute to genetic variation in an outbreak of Mycobacterium tuberculosis. Publisher: Microbiology Society, 001396 . Microbial Genomics , 115 , 2025 . 13. Thorpe , J. et al. Multi-platform whole genome sequencing for tuberculosis clinical and surveillance applications. Publisher: Nature Publishing Group, 5201 . Scientific Reports , 141 , 2024 . 14. ↵ Marin , M. G. , et al. Analysis of the limited M. tuberculosis accessory genome reveals potential pitfalls of pan-genome analysis approaches Pages: 2024.03.21.586149 Section: New Results . Mar . 25 , 2024 . https://www.biorxiv.org/content/10.1101/2024.03.21.586149v1 ( 2024 ). 15. ↵ Siren , J. et al. Pangenomics enables genotyping of known structural variants in 5202 diverse genomes . Publisher: American Association for the Advancement of Science , abg8871 . Science , 3746574 , 2021 . OpenUrl 16. ↵ Liao , W.-W. et al. A draft human pangenome reference. Number: 7960 Publisher: Nature Publishing Group, 312–324 . Nature , 6177960 , 2023 . 17. ↵ Hall , M. B. , Lima , L. , Coin , L. J. M. & Iqbal , Z . Drug resistance prediction for Mycobacterium tuberculosis with reference graphs. Publisher: Microbiology Society, 001081 . Microbial Genomics , 98 , 2023 . 18. ↵ Kolmogorov , M. , Yuan , J. , Lin , Y. & Pevzner , P. A . Assembly of long, error-prone reads using repeat graphs. Number: 5 Publisher: Nature Publishing Group, 540–546 . Nature Biotechnology , 375 , 2019 . 19. ↵ Gurevich , A. , Saveliev , V. , Vyahhi , N. & Tesler, G. QUAST: quality assessment tool for genome assemblies, 1072–1075 . Bioinformatics , 298 , 2013 . 20. ↵ Li , H. , Feng , X. & Chu , C . The design and construction of reference pangenome graphs with minigraph, 265 . Genome Biology , 211 , 2020 . 21. ↵ Hunt , M. et al. Minos: variant adjudication and joint genotyping of cohorts of bacterial genomes, 147 . Genome Biology , 231 , 2022 . 22. ↵ Heller , D. & Vingron , M. SVIM-asm: structural variant detection from haploid and diploid genome assemblies, 5519–5521 . Bioinformatics , 3622 , 2021 . 23. ↵ Chen , X . et al. Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications, 1220–1222 . Bioinformatics , 328 , 2016 . 24. ↵ Cameron , D. L. , Di Stefano , L. & Papenfuss , A. T . Comprehensive evaluation and characterisation of short read general-purpose structural variant calling software. Number: 1 Publisher: Nature Publishing Group, 3240 . Nature Communications , 101 , 2019 . 25. ↵ Seemann , T. Shovill original-date: 2016-09-05T23:50:20Z . Dec . 1 , 2023 . https://github.com/tseemann/shovill ( 2023 ). 26. ↵ Dunn , T. & Narayanasamy , S. vcfdist: accurately benchmarking phased small variant calls in human genomes. Publisher: Nature Publishing Group, 8149 . Nature Communications , 141 , 2023 . 27. ↵ Harrison , L. B. , Kapur , V. & Behr , M. A . An imputed ancestral reference genome for the Mycobacterium tuberculosis complex better captures structural genomic diversity for reference-based alignment workflows. Publisher: Microbiology Society, 001165 . Microbial Genomics , 101 , 2024 . 28. ↵ Stucki , D. et al. Mycobacterium tuberculosis Lineage 4 comprises globally distributed and geographically restricted sublineages, 1535–1543 . Nature genetics , 4812 , 2016 . 29. ↵ Nimmo , C. et al. Evolution of Mycobacterium tuberculosis drug resistance in the genomic era . Frontiers in Cellular and Infection Microbiology , 12 , 2022 . 30. ↵ Tufariello , J. M. et al. Separable roles for Mycobacterium tuberculosis ESX-3 effectors in iron acquisition and virulence. Publisher: Proceedings of the National Academy of Sciences, E348–E357 . Proceedings of the National Academy of Sciences , 1133 , 2016 . 31. ↵ Shah , S. & Briken , V . Modular Organization of the ESX-5 Secretion System in Mycobacterium tuberculosis, 49 . Frontiers in Cellular and Infection Microbiology , 6 , 2016 . 32. ↵ Bottai , D. et al. Disruption of the ESX-5 system of Mycobacterium tuberculosis causes loss of PPE protein secretion, reduction of cell wall integrity and strong attenuation, 1195–1209 . Molecular Microbiology , 836 , 2012 . 33. ↵ Allen , A. C. et al. Parallel in vivo experimental evolution reveals that increased stress resistance was key for the emergence of persistent tuberculosis bacilli. Number: 8 Publisher: Nature Publishing Group, 1082–1093 . Nature Microbiology , 68 , 2021 . 34. ↵ Karboul , A. et al. Frequent Homologous Recombination Events in Mycobacterium tuberculosis PE/PPE Multigene Families: Potential Role in Antigenic Variability. Publisher: American Society for Microbiology. 7838–7846 . Journal of Bacteriology , 19023 , 2008 . 35. ↵ Comas , I. et al. Human T cell epitopes of Mycobacterium tuberculosis are evolutionarily hyperconserved. Publisher: Nature Publishing Group, 498–503 . Nature Genetics , 426 , 2010 . 36. ↵ DeJesus , M. A. et al. Comprehensive Essentiality Analysis of the Mycobacterium tuberculosis Genome via Saturating Transposon Mutagenesis. Publisher: American Society for Microbiology , doi: 10.1128/mbio.02133-16 . mBio , 81 , 2017 . OpenUrl CrossRef 37. ↵ Gröschel , M. I. , Sayes , F. , Simeone , R. , Majlessi , L. & Brosch , R. ESX secretion systems: mycobacterial evolution to counter host immunity. Publisher: Nature Publishing Group, 677–691 . Nature Reviews Microbiology , 1411 , 2016 . 38. ↵ Pal , R. , Bisht , M. K. & Mukhopadhyay , S. Secretory proteins of Mycobacterium tuberculosis and their roles in modulation of host immune responses: focus on therapeutic targets . eprint : https://onlinelibrary.wiley.com/doi/pdf/10.1111/febs.16369 , 4146–4171 . The FEBS Journal , 28914 , 2022 . 39. ↵ Zhang , M. et al. EspI regulates the ESX-1 secretion system in response to ATP levels in Mycobacterium tuberculosis, 1057–1065 . Molecular microbiology , 935 , 2014 . 40. ↵ Wick , R. R. , Schultz , M. B. , Zobel , J. & Holt , K. E. Bandage: interactive visualization of de novo genome assemblies, 3350–3352 . Bioinformatics , 3120 , 2015 . 41. ↵ Holt , K. E. et al. Frequent transmission of the Mycobacterium tuberculosis Beijing lineage and positive selection for the EsxW Beijing variant in Vietnam. Number: 6 Publisher: Nature Publishing Group, 849–856 . Nature Genetics , 506 , 2018 . 42. ↵ Hang , N. T. L. et al. Whole genome sequencing, analyses of drug resistance-conferring mutations, and correlation with transmission of Mycobacterium tuberculosis carrying katG-S315T in Hanoi, Vietnam, 15354 . Scientific Reports , 9 , 2019 . 43. ↵ Chen , L. , Li , X. , Xu , P. & He , Z.-G . A Novel Zinc Exporter CtpG Enhances Resistance to Zinc Toxicity and Survival in Mycobacterium bovis. Publisher: American Society for Microbiology, e01456–21 . Microbiology Spectrum , 102 , 2022 . 44. ↵ Dow , A. et al. Zinc limitation triggers anticipatory adaptations in Mycobacterium tuberculosis. Publisher: Public Library of Science, e1009570 . PLOS Pathogens , 175 , 2021 . 45. ↵ Phelan , J. E. et al. Mycobacterium tuberculosis whole genome sequencing provides insights into the Manila strain and drug-resistance mutations in the Philippines. Publisher: Nature Publishing Group, 9305 . Scientific Reports , 91 , 2019 . 46. ↵ Wang , L. et al. Whole genome sequencing analysis of Mycobacterium tuberculosis reveals circulating strain types and drug-resistance mutations in the Philippines. Publisher: Nature Publishing Group, 19602 . Scientific Reports , 141 , 2024 . 47. ↵ Netikul , T. , Palittapongarnpim , P. , Thawornwattana , Y. & Plitphonganphim , S . Estimation of the global burden of Mycobacterium tuberculosis lineage 1, 104802 . Infection, Genetics and Evolution , 91 , 2021 . 48. ↵ Freschi , L. et al. Population structure, biogeography and transmissibility of Mycobacterium tuberculosis. Number: 1 Publisher: Nature Publishing Group, 6099 . Nature Communications , 121 , 2021 . 49. ↵ Chiner-Oms , A . et al. Genome-wide mutational biases fuel transcriptional diversity in the Mycobacterium tuberculosis complex. Publisher: Nature Publishing Group, 3994 . Nature Communications , 101 , 2019 . 50. ↵ Marcus , S. A. , Sidiropoulos , S. W. , Steinberg , H. & Talaat , A. M . CsoR Is Essential for Maintaining Copper Homeostasis in Mycobacterium tuberculosis. Publisher: Public Library of Science, e0151816 . PLOS ONE , 113 , 2016 . 51. ↵ Ward , S. K. , Abomoelak , B. , Hoye , E. A. , Steinberg , H. & Talaat , A. M. CtpV: a putative copper exporter required for full virulence of Mycobacterium tuberculosis. eprint : https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1365-2958.2010.07273.x , 1096 – 1110 . Molecular Microbiology , 775 , 2010 . OpenUrl 52. ↵ Shey-Njila , O. et al. CtpB Facilitates Mycobacterium tuberculosis Growth in Copper-Limited Niches, 5713 . International Journal of Molecular Sciences , 2310 , 2022 . 53. ↵ Bianchini , G. & Śanchez-Baracaldo , P. TreeViewer: Flexible, modular software to visualise and manipulate phylogenetic trees , e10873 . Ecology and Evolution , 142 , 2024 . OpenUrl 54. ↵ Phelan , J. E. et al. Integrating informatics tools and portable sequencing technology for rapid detection of resistance to anti-tuberculous drugs . Genome Medicine , 111 , 2019 . 55. ↵ Farhat , M. R. et al. GWAS for quantitative resistance phenotypes in Mycobacterium tuberculosis reveals resistance genes and regulatory regions. Number: 1 Publisher: Nature Publishing Group, 2128 . Nature Communications , 101 , 2019 . 56. ↵ Furío , V. et al. An evolutionary functional genomics approach identifies novel candidate regions involved in isoniazid resistance in Mycobacterium tuberculosis. Publisher: Nature Publishing Group, 1–12 . Communications Biology , 41 , 2021 . 57. ↵ Brodin , P. et al. High Content Phenotypic Cell-Based Visual Screen Identifies Mycobacterium tuberculosis Acyltrehalose-Containing Glycolipids Involved in Phagosome Remodeling. Publisher: Public Library of Science, e1001100 . PLOS Pathogens , 69 , 2010 . 58. ↵ Wolfe , L. M. , Mahaffey , S. B. , Kruh , N. A. & Dobos , K. M . Proteomic Definition of the Cell Wall of Mycobacterium tuberculosis. Publisher: American Chemical Society, 5816–5826 . Journal of Proteome Research , 911 , 2010 . 59. ↵ Waters , E. V. , Cameron , S. K. , Langridge , G. C. & Preston , A . Bacterial genome structural variation: prevalence, mechanisms, and consequences . English. Publisher: Elsevier. Trends in Microbiology , 00 , 2025 . 60. ↵ Campos-Pardos , E. , Uranga , S. , Pićo , A. , Gomez , A. B. & Gonzalo-Asensio , J. Dependency on host vitamin B12 has shaped Mycobacterium tuberculosis Complex evolution. Publisher: Nature Publishing Group, 2161 . Nature Communications , 151 , 2024 . 61. ↵ Tientcheu , L. D. et al. Immunological consequences of strain variation within the Mycobacterium tuberculosis complex , 432 – 445 . European Journal of Immunology , 473 , 2017 . OpenUrl 62. ↵ Buglino , J. A. , Sankhe, G. D., Lazar, N., Bean, J. M. & Glickman, M. S . Integrated sensing of host stresses by inhibition of a cytoplasmic two-component system controls M. tuberculosis acute lung infection (eds Kana, B. D., Voskuil, M. & Cox, J. S.) Publisher: eLife Sciences Publications, Ltd, e65351 . eLife , 10 , 2021 . 63. ↵ Wolschendorf , F. et al. Copper resistance is essential for virulence of Mycobacterium tuberculosis, 1621–1626 . Proceedings of the National Academy of Sciences of the United States of America , 1084 , 2011 . 64. ↵ Lempens , P. et al. Isoniazid resistance levels of Mycobacterium tuberculosis can largely be predicted by high-confidence resistance-conferring mutations. Publisher: Nature Publishing Group, 3246 . Scientific Reports , 81 , 2018 . 65. ↵ Roberts , L. W. et al. MmpR5 protein truncation and bedaquiline resistance in Mycobacterium tuberculosis isolates from South Africa: a genomic analysis. Publisher: Elsevier . The Lancet Microbe , 58 , 2024 . 66. ↵ Barilar , I. et al. Quantitative measurement of antibiotic resistance in Mycobacterium tuberculosis reveals genetic determinants of resistance and susceptibility in a target gene approach. Publisher: Nature Publishing Group, 488 . Nature Communications , 151 , 2024 . 67. ↵ Petit III , R. A. , Hall , M. B. , Tonkin-Hill , G. , Zhu , J. & Read , T. D . fastq-dl: efficiently download FASTQ files from SRA or ENA repositories version 2.0.2 . original-date: 2019-10-10T13:02:30Z. Feb. 10, 2025 . https://github.com/rpetit3/fastq-dl ( 2025 ). 68. ↵ Wick , R. Porechop original-date: 2017-02-13T04:20:00Z. Oct. 8 , 2023 . https://github.com/rrwick/Porechop ( 2023 ). 69. ↵ Steinig , E. & Coin , L . Nanoq: ultra-fast quality control for nanopore reads, 2991 . Journal of Open Source Software , 769 , 2022 . 70. ↵ Shen , W. , Le , S. , Li , Y. & Hu , F . SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. Publisher: Public Library of Science, e0163962 . PLOS ONE , 1110 , 2016 . 71. ↵ Kolmogorov , M. et al. metaFlye: scalable long-read metagenome assembly using repeat graphs. Number: 11 Publisher: Nature Publishing Group, 1103–1110 . Nature Methods , 1711 , 2020 . 72. ↵ Sim , S. B. , Corpuz , R. L. , Simmonds , T. J. & Geib , S. M . HiFiAdapterFilt, a memory efficient read processing pipeline, prevents occurrence of adapter sequence in PacBio HiFi reads and their negative impacts on genome assembly, 157 . BMC Genomics , 231 , 2022 . 73. ↵ Hall , M. B. & Coin , L. J. M . Comparison of computational methods for host removal and classification of mycobacteria from clinical metagenomic data Pages: 2023.09.18.558339 Section: New Results . Sept. 19 , 2023 . https://www.biorxiv.org/content/10.1101/2023.09.18.558339v1 ( 2023 ). 74. ↵ Wood , D. E. , Lu , J. & Langmead , B . Improved metagenomic analysis with Kraken 2, 257 . Genome Biology , 201 , 2019 . 75. ↵ Walker , B. J. et al. Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement. Publisher: Public Library of Science, e112963 . PLOS ONE , 911 , 2014 . 76. ↵ Kiu , R. sequence-stats original-date: 2020-08-25T10:52:58Z . Oct. 4, 2023 . https://github.com/raymondkiu/sequence-stats ( 2023 ). 77. ↵ Blackwell , G. A. et al. Exploring bacterial diversity via a curated and searchable snapshot of archived DNA sequences. Publisher: Public Library of Science, e3001421 . PLOS Biology , 1911 , 2021 . 78. ↵ Li , H. Minimap2: pairwise alignment for nucleotide sequences, 3094–3100 . Bioinformatics , 3418 , 2018 . 79. ↵ Chen , S. , Zhou , Y. , Chen , Y. & Gu , J . fastp: an ultra-fast all-in-one FASTQ preprocessor, i884–i890 . Bioinformatics , 3417 , 2018 . 80. ↵ Li , H. & Durbin , R. Fast and accurate short read alignment with Burrows–Wheeler transform, 1754–1760 . Bioinformatics , 2514 , 2009 . 81. ↵ Jakubosky , D. et al. Discovery and quality analysis of a comprehensive set of structural variants and short tandem repeats. Publisher: Nature Publishing Group, 2928 . Nature Communications , 111 , 2020 . 82. ↵ Zook , J. M. et al. A robust benchmark for detection of germline large deletions and insertions. Publisher: Nature Publishing Group, 1347–1355 . Nature Biotechnology , 3811 , 2020 . 83. ↵ Stamatakis , A . RAxML version 8: a tool for phylogenetic analysis and post-analysis of large phylogenies, 1312–1313 . Bioinformatics , 309 , 2014 . 84. ↵ Thorvaldsdottir , H. , Robinson , J. T. & Mesirov , J. P . Integrative Genomics Viewer (IGV): high-performance genomics data visualization and exploration, 178–192 . Briefings in Bioinformatics , 142 , 2013 . 85. ↵ Liao , Y. , Smyth , G. K. & Shi , W . featureCounts: an efficient general purpose program for assigning sequence reads to genomic features , 923 – 930 . Bioinformatics (Oxford, England) , 307 , 2014 . OpenUrl CrossRef 86. ↵ Bei , C. et al. Genetically encoded transcriptional plasticity underlies stress adaptation in Mycobacterium tuberculosis. Publisher: Nature Publishing Group, 3088 . Nature Communications , 151 , 2024 . 87. ↵ Robinson , M. D. & Oshlack , A . A scaling normalization method for differential expression analysis of RNA-seq data. eng, R25 . Genome Biology , 113 , 2010 . 88. ↵ Robinson , M. D. , McCarthy , D. J. & Smyth , G. K . edgeR: a Bioconductor package for differential expression analysis of digital gene expression data , 139 – 140 . Bioinformatics (Oxford, England) , 261 , 2010 . OpenUrl 89. ↵ Ritchie , M. E. et al. limma powers differential expression analyses for RNA-sequencing and microarray studies , e47 . Nucleic Acids Research , 437 , 2015 . OpenUrl CrossRef 90. ↵ Seemann , T. tseemann/snippy original-date: 2014-05-15T01:56:12Z . Nov . 13 , 2024 . https://github.com/tseemann/snippy ( 2024 ). 91. ↵ Hall , M. B. psdm: Compute a pairwise SNP distance matrix from one or two alignment(s) version 0 . 3 . 0 . Zenodo, Nov. 1 , 2021 . doi: 10.5281/zenodo.5706785 . OpenUrl CrossRef 92. ↵ Piñeiro , C. , Abúın , J. M. & Pichel , J. C. Very Fast Tree: speeding up the estimation of phylogenies for large alignments through parallelization and vectorization strategies, 4658–4659 . Bioinformatics , 3617 , 2020 . 93. ↵ Hail Team , H. T . Hail original-date: 2015-10-27T20:55:42Z . Nov . 15 , 2024 . https://github.com/hail-is/hail ( 2024 ). 94. ↵ WHO . Catalogue of mutations in Mycobacterium tuberculosis complex and their association with drug resistance https://www.who.int/publications-detail-redirect/9789240028173 ( 2023 ). 95. ↵ Chen , T. & Guestrin , C. XGBoost: A Scalable Tree Boosting System in Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (Aug. 13, 2016), 785–794. arXiv: 1603.02754[cs] . http://arxiv.org/abs/1603.02754 ( 2025 ). View the discussion thread. Back to top Previous Next Posted May 07, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Genome graphs reveal the importance of structural variation in Mycobacterium tuberculosis evolution and drug resistance Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Genome graphs reveal the importance of structural variation in Mycobacterium tuberculosis evolution and drug resistance Aleix Canalda-Baltrons , Matthew Silcocks , Michael B. Hall , Derrick Theys , Xuling Chang , Linda T. Viberg , Norelle L. Sherry , Lachlan Coin , Sarah J. Dunstan bioRxiv 2025.05.07.652570; doi: https://doi.org/10.1101/2025.05.07.652570 Share This Article: Copy Citation Tools Genome graphs reveal the importance of structural variation in Mycobacterium tuberculosis evolution and drug resistance Aleix Canalda-Baltrons , Matthew Silcocks , Michael B. Hall , Derrick Theys , Xuling Chang , Linda T. Viberg , Norelle L. Sherry , Lachlan Coin , Sarah J. Dunstan bioRxiv 2025.05.07.652570; doi: https://doi.org/10.1101/2025.05.07.652570 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genomics Subject Areas All Articles Animal Behavior and Cognition (7624) Biochemistry (17650) Bioengineering (13871) Bioinformatics (41882) Biophysics (21424) Cancer Biology (18566) Cell Biology (25461) Clinical Trials (138) Developmental Biology (13365) Ecology (19867) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15590) Genomics (22476) Immunology (17713) Microbiology (40331) Molecular Biology (17148) Neuroscience (88477) Paleontology (666) Pathology (2828) Pharmacology and Toxicology (4816) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00