Full text
41,324 characters
· extracted from
preprint-html
· click to expand
An explainable artificial intelligence framework reveals mutations associated with drug resistance in Mycobacterium tuberculosis | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results An explainable artificial intelligence framework reveals mutations associated with drug resistance in Mycobacterium tuberculosis Hui Cen , Peng Zhang , Yunchao Ling , Guoping Zhao , Guoqing Zhang doi: https://doi.org/10.1101/2024.12.18.629160 Hui Cen 1 Bio-Med Big Data Center, CAS Key Laboratory of Computational Biology, Shanghai Institute of Nutrition and Health, University of Chinese Academy of Sciences, Chinese Academy of Sciences , Shanghai 200031, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Peng Zhang 1 Bio-Med Big Data Center, CAS Key Laboratory of Computational Biology, Shanghai Institute of Nutrition and Health, University of Chinese Academy of Sciences, Chinese Academy of Sciences , Shanghai 200031, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yunchao Ling 1 Bio-Med Big Data Center, CAS Key Laboratory of Computational Biology, Shanghai Institute of Nutrition and Health, University of Chinese Academy of Sciences, Chinese Academy of Sciences , Shanghai 200031, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Guoping Zhao 1 Bio-Med Big Data Center, CAS Key Laboratory of Computational Biology, Shanghai Institute of Nutrition and Health, University of Chinese Academy of Sciences, Chinese Academy of Sciences , Shanghai 200031, China 2 School of Life Science, Hangzhou Institute for Advanced Study, University of Chinese Academy of Sciences , Hangzhou 310024, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: gqzhang{at}sinh.ac.cn gpzhao{at}sibs.ac.cn Guoqing Zhang 1 Bio-Med Big Data Center, CAS Key Laboratory of Computational Biology, Shanghai Institute of Nutrition and Health, University of Chinese Academy of Sciences, Chinese Academy of Sciences , Shanghai 200031, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: gqzhang{at}sinh.ac.cn gpzhao{at}sibs.ac.cn Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Understanding the mechanisms of drug resistance in Mycobacterium tuberculosis (MTB) is essential for the rapid detection of resistance and for guiding effective treatment, ultimately contributing to reducing the global burden of tuberculosis (TB). Under anti-TB drugs pressure, MTB continues to accumulate resistance loci. The current repertoire of known resistance-associated mutations requires further refinement, necessitating efficient methods for the timely identification of potential resistance sites. Here, we introduce xAI-MTBDR, an explainable artificial intelligence framework designed to identify new resistance-associated mutations and predict drug resistance in MTB. It outperforms state-of-the-art methods in predicting drug resistance for all first-line drugs, and scoring each mutation’s contribution to resistance. By leveraging public whole-genome sequencing data from nearly 40,000 MTB isolates, the framework identified 788 candidate resistance-related mutations and revealed 30 potential resistance markers, several of which are structurally located closer to their respective drugs compared to known resistance mutations. Furthermore, these scores enabled the framework to efficiently subgroup isolates with different resistance mechanisms and reflect varying levels of resistance. The framework serves as a valuable tool for accurate detection of drug-resistant MTB and offers new insights into its underlying mechanisms. Introduction Tuberculosis (TB), caused by Mycobacterium tuberculosis (MTB), is the world’s leading bacterial infectious disease 1 , 2 . In 2022, approximately 10.6 million people developed TB, resulting in 1.3 million deaths 3 . The rise of drug-resistant TB has worsened this crisis, with treatment success rates for drug-resistant TB being 25% lower than for drug-susceptible TB 3 , 4 . This highlights the urgent need to understand resistance mechanisms for rapid detection of drug-resistant MTB, guiding effective treatment and reducing the global TB burden 5 . The majority of drug resistance in MTB isolates is attributed to chromosomal mutations 6 . Genome-wide association studies (GWAS) have been successfully applied to investigate novel resistance mechanisms in MTB by identifying nucleotide variants associated with drug resistance 7 . This approach has not only detected known resistance loci but also uncovered new resistance-related loci, such as specific sites in the ald gene correlated with increased resistance to D-cycloserine, novel codons in the ethA gene associated with ethionamide resistance, and variations in the promoter region of thyX related to para-aminosalicylic acid resistance 8 - 10 . The World Health Organization (WHO)-endorsed catalogue of MTB resistance mutations, based on whole-genome sequencing (WGS) data, categorizes mutations according to their association with resistance and serves as a high-quality reference 11 , 12 . Pre-determined mutation catalogues, which are essential for detecting drug resistance, form the foundation upon which popular prediction tools such as TB-profiler 13 and Mykrobe 14 infer drug resistance. With the rapid accumulation of WGS data, these catalogues need further refinement 4 , 7 , particularly for low-frequency mutations in natural populations 7 . Deep learning models have enriched mutation catalogues by identifying complex patterns in genomic data 15 , as demonstrated by Green et al.’s convolutional neural network, which identified 18 sites not previously associated with resistance 16 . However, this study did not explore deeper individual-level interpretations, which are essential for understanding how specific mutations contribute to resistance in individual cases. The need for models that provide both global (population-level) and local (individual-level) explanations is critical. Global explanations help identify broad mutation contributions, while local explanations are essential for understanding individual mutations that drive resistance in specific isolates. The SHapley Additive exPlanations (SHAP) method is well-suited for this purpose, offering both global and local insights by evaluating the contribution of each mutation to drug resistance predictions 17 . SHAP’s dual capability enhances mutation catalogue completeness and improves the understanding of resistance mechanisms, making it a valuable tool for research on pathogenic microorganisms. In this study, we present xAI-MTBDR, an explainable artificial intelligence framework that integrates machine learning with SHAP to identify new resistance-associated mutations and predict drug resistance in MTB. Leveraging public whole-genome sequencing data from nearly 40,000 MTB isolates, xAI-MTBDR outperforms state-of-the-art methods in predicting resistance for all first-line anti-TB drugs. By providing both population-wide and individual-level insights, xAI-MTBDR enables revelation of potential biomarkers, grouping of drug-resistant isolates, and reflection of varying resistance levels. It is helpful for advancing the understanding of TB resistance mechanisms. Result The framework and datasets characteristics The framework is consisted of four main stages: dataset collection, feature extraction, model construction and explanation ( Fig.1 , detailed in Methods). In stage 1, we compiled 39,145 high-quality MTB isolates from three datasets, each providing WGS and DST data for 11 anti-TB drugs 11 , 18 , 19 . In stage 2, features were derived from diverse mutations, including insertions, deletions, and single nucleotide polymorphisms (SNPs), based on 49 preselected antimicrobial resistance (AMR) genes and promoter regions 20 . Stage 3 involved training and testing for each drug by the leave-one-out (LOO) strategy. In the final stage, we applied SHAP to provide interpretable insights, identifying key mutations driving resistance and exploring resistance mechanisms at both population and individual levels. Download figure Open in new tab Fig.1 Workflow of the xAI-MTBDR framework. The process includes four stages: (1) Data collection from three datasets; (2) Variation calling and feature extraction from genes and promoter regions; (3) Construction and testing of machine learning models; and (4) Application of SHAP for model explanation. Proportions of drug-resistant isolates across different drugs are summarized in Table 1 . Each drug is represented by at least 250 isolates per dataset 16 . Rifampicin and isoniazid have the highest numbers, with 36,552 and 36,240 isolates. In contrast, capreomycin has the fewest DST entries, with only 11,246 isolates. The proportion of resistant isolates also varies, ranging from 8% for amikacin to 34% for isoniazid. View this table: View inline View popup Download powerpoint Table 1 Proportions of drug-resistant isolates among all isolates. xAI-MTBDR outperforms state-of-the-art methods in predicting resistance Based on the xAI-MTBDR framework, we trained individual models for each drug to predict MTB resistance and compared it with four state-of-the-art methods, including machine learning models (random forest and logistic regression), deep learning models (WDNN), and catalog-based methods (TB-profiler) 13 , 20 , 21 . As shown in Table 2 , our models improved area under the curve (AUC) by 0.271-1.439% for first-line drugs, with an average improvement of 0.564%, and by 0.045-3.060% for second-line drugs, with an average improvement of 1.135%. The lower performance for amikacin and capreomycin may be due to the limited number of isolates. These results demonstrate that our framework outperforms existing methods in predicting drug resistance. View this table: View inline View popup Download powerpoint Table 2 The prediction performance (AUC) of methods for different drugs Population-level explanations of SHAP values aligned with WHO mutation categories The WHO MTB resistance mutation catalogue classifies mutations into five categories based on their confidence level of association with resistance 12 . In the xAI-MTBDR framework, SHAP analysis revealed a strong correlation between mutation importance (Supplementary Fig.1a) and these categories. For rifampicin, category 1 mutations (associated with resistance) had the highest SHAP values, ranging from 4.62 to 6.29, while category 5 mutations (not associated with resistance) had the lowest (from -0.03 to 0.15) ( Fig.2a ). Similar trends were observed for isoniazid and ethambutol ( Fig.2b,c ). In category 3 (mutations with uncertain resistance associations), some mutations overlapped with category 1, particularly for isoniazid (InhA I21V, InhA S94A, KatG S315G) and ethambutol (EmbB D1024N), whose SHAP values were above the median of category 3, aligning more closely with category 1, consistent with their reported resistance associations 22 - 24 . These findings suggest SHAP values may help reclassify uncertain category 3 mutations ( Fig.2b , c ). For pyrazinamide, where the WHO mutation catalogue adopted a “relaxed” grading criteria, we observed some overlap in SHAP values across categories ( Fig.2d ), but category 1 mutations still showed relatively higher SHAP values. Overall, SHAP values aligned well with the WHO mutation catalogue, effectively distinguishing resistance-associated mutations at the population level. Download figure Open in new tab Fig.2 Population-level explanations for first-line drugs. a - d , Violin plot depicts the SHAP values of mutations in relation to their WHO classifications. Category 1: associated with resistance; Category 2: associated with resistance – interim; Category 3: uncertain significance; Category 4: not associated with resistance – interim; Category 5: not associated with resistance. Some mutations from the WHO mutation catalogue are not present in our model’s feature list and thus do not have corresponding SHAP values. Significance was determined using Mann-Whitney test. *** P < 0.001; **** P < 0.0001. Individual-level explanations identify potential resistance markers The individual-level explanations revealed variations in mutation contributions among isolates (Supplementary Fig.1b), enabling the identification of 98 important mutations for the four first-line drugs, with 30 were not included in the WHO category 1 or 2 (Supplementary Table 1). For isoniazid and ethambutol, we identified potential resistance markers, excluding rifampicin due to the absence of novel markers in RpoB, and pyrazinamide as its markers overlapped with known mutations in PncA. The potential markers I194 (isoniazid) and D1024 (ethambutol) were located 3.7 Å and 6.1 Å from their respective drugs, compared to 4 Å and 8.6 Å for the known markers S94 25 and M306 ( Fig.3a , b , Supplementary Table 2). The median distances of known and potential markers to ethambutol were 12.6 Å and 9.1 Å, respectively, with no significant difference ( Fig.3c ), suggesting these mutations may influence the interaction between EmbB and ethambutol. Download figure Open in new tab Fig.3 Structural analysis of resistance mutation. a-b , Resistance markers in the structure of InhA (PDB: 2NV6) and EmbB (PDB: 7BVF) protein. c , Minimum atom distance between known and potential resistance mutations to ethambutol. Mann-Whitney test was used to compare differences between the known mutations group and potential mutations group. n.s.: P > 0.05, not statistically significant. Individual-level explanations subgroup MTB drug-resistant isolates We performed a principal component analysis (PCA) on the SHAP values of the isolates. For rifampicin, the susceptible and resistant isolates were clearly separated, and the resistant isolates were clustered into three distinct groups ( Fig.4b , Supplementary Fig.2 and 3). In contrast, the PCA of the original features failed to achieve this separation ( Fig.4a ). Similar patterns were observed for other first-line drugs (Supplementary Fig.4). We further visualized the top 10 rifampicin resistance-associated mutations across these three groups of resistant isolates, which may reflect the underlying mechanisms driving their differentiation ( Fig.4c ). Group 1 is primarily driven by the RpoB D435V mutation within the 81-bp rifampicin resistance-determining region (RRDR, corresponding to codons 426–452 in MTB). Group 2, which contains the most isolates, is predominantly associated with the RpoB S450L mutation within the RRDR. In contrast, group 3 displays a more diverse mutation pattern, including non-RRDR RpoB V170F and I491F. These mutations are positioned near or directly interact with rifampicin ( Fig.4d ), potentially altering the structure of the active center or the interactions between RNA polymerase (RNAP) subunits, leading to resistance, consistent with previous research 26 . These results suggest that SHAP values not only capture key characteristics of drug-resistant isolates, but also reveal complex resistance mechanisms, emphasizing the importance of non-RRDR mutations in resistance detection. Download figure Open in new tab Fig. 4 PCA and structural analysis of rifampicin resistance isolates. a , PCA of the original features for rifampicin resistance isolates. Marginal density plots are included to illustrate the distribution of the isolates along the first two principal components. b , PCA of SHAP values, with resistant isolates color-coded by hierarchical clustering. Marginal density plots are also included, as in panel a. c , Heatmap of the top 10 mutations in each resistant group. d , Crystal structure of RNA polymerase (PDB: 5UHB) showing the V170F and I491F mutations. The framework reflects drug resistance level of isolates When low-level resistance to first-line drugs occurs, increasing the dosage may help overcome this resistance 27 . We analyzed the relationships between SHAP and MIC values for three first-line drugs using the CRyPTIC dataset (excluding pyrazinamide due to missing data). The isolates were classified into three categories based on MIC value: susceptible, low-level resistant, and high-level resistant following previous studies 26 , 28 , 29 . A significant positive correlation between SHAP values of isolates and resistance levels is observed, with the coefficient of determination (R 2 ) ranging from 0.44 to 0.69 ( Fig.5a-c ). Notably, the explainable results from binary classification models appear to contain information related to resistance levels. To further evaluate this, we constructed a ternary classifier using the original features and the SHAP values separately. The classifier trained with SHAP values outperformed the one using original features, showing over 1% improvement ( Fig.5d ), indicating that SHAP values provide a more effective representation of resistance levels. Download figure Open in new tab Fig.5 Correlations between the SHAP values and resistance levels of isolates. a-c , Violin plots of SHAP values for different resistance levels across the three first-line drugs. Significance was determined using Mann-Whitney test. ** P < 0.01; **** P < 0.0001. d , Performance of the ternary classifier using the original features and SHAP values across different drugs. Discussion xAI-MTBDR predicts MTB drug resistance and identifies resistance-associated mutations at both population and individual levels. To our knowledge, this is the largest study utilizing the MTB WGS dataset (39,145 isolates) to assess the performance and applicability of similar models for drug resistance prediction. The models achieved state-of-the-art performance in predicting resistance to 11 anti-TB drugs, while also identifying new resistance biomarkers, subgrouping drug-resistant isolates, and capturing resistance levels. These insights contribute to a deeper understanding of resistance mechanisms and provide clinical guidance. Rapid and accurate detection of drug-resistant TB is critical for effective treatment. Our analysis identified potential resistance markers that could expand the current mutation catalogue and provide important clues for further in vivo and in vitro validation to confirm their roles in MTB resistance. A key strength of xAI-MTBDR is its ability to detect resistance mutations at the individual level, enabling the clustering of isolates with similar resistance mechanisms. For instance, rifampicin-resistant isolates were grouped by distinct mutation patterns, offering a more detailed understanding of resistance. This individual-level insight sets the foundation for more precise prediction models tailored to specific mutation profiles. Although xAI-MTBDR currently focuses on MTB, it is adaptable to other pathogens by substituting resistance-associated genes and reference genomes. This flexibility makes it applicable to a broad range of pathogens, including Mycobacterium abscessus 30 , Staphylococcus aureus 14 , Escherichia coli , and Klebsiella pneumoniae 31 , broadening its scope in antimicrobial resistance research. In summary, we developed an explainable and high-accuracy drug resistance prediction framework designed to facilitate the discovery of new resistance-associated mutations, offering a new perspective on resistance mechanisms at both population and individual levels. As WGS and resistance phenotype data continue to improve, integrating explainable AI methods will enhance prediction accuracy, deepen our understanding of resistance mechanisms, and optimize treatment strategies. This approach has the potential to reduce morbidity and mortality from drug-resistant tuberculosis and other pathogens. Methods Sequence and DST data The raw WGS data and corresponding phenotypic DST data for MTB isolates were obtained from three studies: CRyPTIC 18 , Walker et al. 11 , and GenTB 19 . The processing workflow for these datasets is shown in Supplementary Fig.5. The original datasets contained 12,288, 38,223, and 20,379 isolates, respectively. After manual curation to remove unavailable and duplicate isolates, a total of 41,790 isolates were retained. To balance the scale of the datasets, isolates were randomly selected with proportional distributions close to 1:1:1, resulting in 12,285 isolates from CRyPTIC, 14,324 from Walker et al., and 15,181 from GenTB. Drugs with fewer than 250 isolates in any dataset were excluded 16 . Given the potential overlaps and inconsistencies among the datasets, the DST data were validated and reconciled based on the following principles: (1) consistent DST results from multiple sources were retained; (2) phenotypes from one source were adopted when another lacked data; and (3) inconsistent results were marked as “missing”. The DST categories included susceptible, resistant, and missing. Notably, 12,006 isolates from the CRyPTIC dataset included MIC data, offering a more detailed resistance level beyond the binary resistance phenotype. Table 1 summarizes the proportions of drug-resistant isolates across all datasets. Variant calling and feature extraction All raw WGS data were processed using Clockwork (v0.11.3, https://github.com/iqbal-lab-org/clockwork ), a WHO-recommended bioinformatics pipeline for variant calling in Illumina paired-end sequencing data. Quality control filters were applied, retaining only isolates with an average sequencing depth greater than 15, aligned to the H37Rv reference genome (NC000962.3), and with less than 5% of reads mapped to nontuberculous mycobacteria genomes 12 . After filtering, 12,006 isolates from CRyPTIC, 13,750 from Walker et al., and 13,389 from GenTB remained (Supplementary Fig.5). To ensure high-quality mutation data, additional quality control criteria were applied: (1) a minimum sequencing depth of 5x, (2) a fraction of supporting reads of 0.4 or higher, and (3) a genotype confidence percentile (GCP) of 0.5 or greater 18 . Mutation annotation was performed using SnpEff (v5.1d) 32 . We categorized the mutations identified within 49 genes or promoter regions associated with drug resistance (Supplementary Table 3) 13 , 14 , 33 , 34 into common (>30 isolates) and rare (≤30 isolates) mutations, following a previous study 20 . While including all rare mutations provides valuable information, their abundance could lead to model overfitting. To address this, rare mutations were classified based on their gene locus (coding, intergenic, and putative promoter regions) and grouped as “derived predictors.” Only common mutations and derived predictors present in at least 30 isolates were retained for the final feature dataset. In total, 788 features were extracted, including 717 common mutations and 71 derived predictors, from 39,145 isolates. Model construction and explanation We combined the strengths of eXtreme Gradient Boosting (XGBoost), random forest, and logistic regression 20 , 21 , 35 to construct a soft voting ensemble model using Python’s Scikit-Learn library (v0.24.2), with voting weights set to [2, 1.5, 1]. The Leave-One-Out (LOO) strategy was applied to train and test the ensemble model for each drug, where one dataset was iteratively selected as the test set while the other two were used for training. Hyperparameters were tuned using RandomizedSearchCV with five-fold cross-validation, followed by manual optimization. Our framework employs the SHapley Additive exPlanation (SHAP) method, a post-hoc interpretability tool based on game theory, to evaluate each feature’s contribution to the prediction 17 . SHAP is implemented via the TreeExplainer for the XGBoost model, which plays the largest role in predicting drug resistance within the ensemble model. SHAP values were calculated for the test dataset of each drug, providing local explanations by assigning SHAP values to each feature for every sample, thus identifying mutations that influence drug resistance at the individual level. Global explanations were derived by averaging the absolute SHAP values of each feature across all samples, highlighting mutations broadly associated with resistance at the population level. Identifying potential resistance markers To accurately identify potential resistance markers, we analyzed the SHAP values of mutations for each correctly predicted isolate, focusing on the most important mutation in each isolate as indicated by the SHAP values from three datasets. Only mutations classified with a final confidence grading of category 1 (associated with resistance) or category 2 (associated with resistance – interim) were considered known resistance-conferring mutations from the WHO mutation catalogue 12 , 16 . Mutations that are not listed in the WHO mutation catalogue, specifically those not classified in category 1, category 2, category 4 (not associated with resistance – interim), or category 5 (not associated with resistance), but classified in category 3 (uncertain significance) or as unknown mutations, were considered as potential resistance markers. We used Mann-Whitney test to compare differences between the known mutations group and potential mutations group. Subgroup the MTB drug-resistant isolates To investigate the role of SHAP values, we performed principal component analysis (PCA) using the PCA method from sklearn.decomposition for data visualization and compared it with PCA using the original features (where 0 indicates missing mutations and 1 indicates presence). To subgroup the drug-resistant isolates based on SHAP values, we determined the optimal number of clusters using the elbow method (Supplementary Fig.3) and performed clustering using the AgglomerativeClustering method from sklearn.cluster. To identify the key driving mutations within different groups, we applied the Kruskal-Wallis H test, and visualized the top 10 statistically significant mutations associated with resistance ( P < 0.0001) in a heatmap. Assessing relationships between SHAP values and resistance levels To assess the relationship between SHAP values and drug resistance levels, we calculated the sum of SHAP values for each isolate and classified them into three groups: susceptible, low-level resistant, and high-level resistant, based on established criteria 26 , 28 , 29 . For rifampicin, MIC ≤ 0.5 μg/ml indicated susceptibility, MIC (0.5, 2] μg/ml indicated low-level resistance, and MIC > 16 μg/ml indicated high-level resistance. For isoniazid, MIC ≤ 0.1 μg/ml indicated susceptibility, MIC (0.1, 3.2] μg/ml indicated low-level resistance, and MIC ≥ 6.4 μg/ml indicated high-level resistance. For ethambutol, MIC < 4 μg/ml indicated susceptibility, MIC [4, 16] μg/ml indicated low-level resistance, and MIC ≥ 32 μg/ml indicated high-level resistance. We used the ggplot2 R package to create violin plots of SHAP values across different resistance levels and employed Spearman analysis to investigate correlations. The Mann-Whitney test was used to compare differences among the three groups. We developed a ternary XGBoost model (sensitive, low-level resistant, and high-level resistant) using MIC data and trained it separately with the original features and SHAP values from the CRyPTIC dataset. The data was split into training and testing sets with an 8:2 ratio to evaluate model performance. Author contributions Hui Cen was responsible for data collection, analysis, and interpretation, and wrote the manuscript. Peng Zhang assisted with experimental design, particularly in model selection and interpretation, and contributed to the in-depth analysis and interpretation of data. Yunchao Ling participated in result discussions and provided valuable suggestions on data processing and analytical methods. The project was designed by Guoping Zhao. Guoqing Zhang conceived the project, supervised its execution, and edited the manuscript. All authors read and approved the final manuscript. Competing interests The authors declare no competing interests. References 1. ↵ Lange , C. et al. Drug-resistant tuberculosis: an update on disease burden, diagnosis and treatment . Respirology 23 , 656 – 673 ( 2018 ). OpenUrl CrossRef PubMed 2. ↵ Pai , M. et al. Tuberculosis . Nat Rev Dis Primers 2 , 16076 ( 2016 ). OpenUrl CrossRef PubMed 3. ↵ Global tuberculosis report 2023 (WHO, 2023); https://iris.who.int/bitstream/handle/10665/373828/9789240083851-eng.pdf?sequence=1 . 4. ↵ Farhat , M. et al. Drug-resistant tuberculosis: a persistent global health concern . Nat Rev Microbiol 22 , 617 – 635 ( 2024 ). OpenUrl CrossRef PubMed 5. ↵ Yang , H. et al. A rapid, accurate, and low-cost method for detecting Mycobacterium tuberculosis and its drug-resistant genes in pulmonary tuberculosis: Applications of MassARRAY DNA mass spectrometry . Front Microbiol 14 , 1093745 ( 2023 ). OpenUrl CrossRef PubMed 6. ↵ Miotto , P. , Zhang , Y. , Cirillo , D. M. & Yam , W. C. Drug resistance mechanisms and drug susceptibility testing for tuberculosis . Respirology 23 , 1098 – 1113 ( 2018 ). OpenUrl CrossRef PubMed 7. ↵ Cohen , K. A. , Manson , A. L. , Desjardins , C. A. , Abeel , T. & Earl , A. M. Deciphering drug resistance in Mycobacterium tuberculosis using whole-genome sequencing: progress, promise, and challenges . Genome Med 11 , 45 ( 2019 ). OpenUrl CrossRef PubMed 8. ↵ Zhang , H. et al. Genome sequencing of 161 Mycobacterium tuberculosis isolates from China identifies genes and intergenic regions associated with drug resistance . Nat Genet 45 , 1255 – 1260 ( 2013 ). OpenUrl CrossRef PubMed 9. Desjardins , C. A. et al. Genomic and functional analyses of Mycobacterium tuberculosis strains implicate ald in D-cycloserine resistance . Nat Genet 48 , 544 – 551 ( 2016 ). OpenUrl CrossRef PubMed 10. ↵ Coll , F. et al. Genome-wide analysis of multi- and extensively drug-resistant Mycobacterium tuberculosis . Nat Genet 50 , 307 – 316 ( 2018 ). OpenUrl CrossRef PubMed 11. ↵ Walker , T. M. et al. The 2021 WHO catalogue of Mycobacterium tuberculosis complex mutations associated with drug resistance: a genotypic analysis . Lancet Microbe 3 , e265 – e273 ( 2022 ). OpenUrl CrossRef 12. ↵ Catalogue of mutations in Mycobacterium tuberculosis complex and their association with drug resistance (WHO , 2021 ); https://iris.who.int/bitstream/handle/10665/341981/9789240028173-eng.pdf?sequence=1 . 13. ↵ Phelan , J. E. et al. Integrating informatics tools and portable sequencing technology for rapid detection of resistance to anti-tuberculous drugs . Genome Med 11 , 41 ( 2019 ). OpenUrl CrossRef PubMed 14. ↵ Bradley , P. et al. Rapid antibiotic-resistance predictions from genome sequence data for Staphylococcus aureus and Mycobacterium tuberculosis . Nat Commun 6 , 10063 ( 2015 ). OpenUrl CrossRef PubMed 15. ↵ Ali , T. , Ahmed , S. & Aslam , M. Artificial intelligence for antimicrobial resistance prediction: challenges and opportunities towards practical implementation . Antibiotics (Basel) 12 ( 2023 ). 16. ↵ Green , A. G. et al. A convolutional neural network highlights mutations relevant to antimicrobial resistance in Mycobacterium tuberculosis . Nat Commun 13 , 3817 ( 2022 ). OpenUrl CrossRef PubMed 17. ↵ Lundberg , S. M. et al. From local explanations to global understanding with explainable AI for trees . Nat Mach Intell 2 , 56 – 67 ( 2020 ). OpenUrl CrossRef PubMed 18. ↵ Consortium, T. C . A data compendium associating the genomes of 12,289 Mycobacterium tuberculosis isolates with quantitative resistance phenotypes to 13 antibiotics . PLoS Biol 20 , e3001721 ( 2022 ). OpenUrl CrossRef PubMed 19. ↵ Gröschel , M. I. et al. GenTB: A user-friendly genome-based predictor for tuberculosis resistance powered by machine learning . Genome Med 13 , 138 ( 2021 ). OpenUrl CrossRef PubMed 20. ↵ Chen , M. L. et al. Beyond multidrug resistance: leveraging rare variants with machine and statistical learning models in Mycobacterium tuberculosis resistance prediction . EBioMedicine 43 , 356 – 369 ( 2019 ). OpenUrl CrossRef PubMed 21. ↵ Farhat , M. R. et al. Genetic determinants of drug resistance in Mycobacterium tuberculosis and their diagnostic value . Am J Respir Crit Care Med 194 , 621 – 630 ( 2016 ). OpenUrl CrossRef PubMed 22. ↵ Maladan , Y. et al. The whole-genome sequencing in predicting Mycobacterium tuberculosis drug susceptibility and resistance in Papua, Indonesia . BMC Genomics 22 , 844 ( 2021 ). OpenUrl CrossRef PubMed 23. Kumar , V. & Sobhia , M. E. Molecular dynamics assisted mechanistic study of isoniazid-resistance against Mycobacterium tuberculosis InhA . PLoS One 10 , e0144635 ( 2015 ). OpenUrl CrossRef PubMed 24. ↵ Cade , C. E. , Dlouhy , A. C. , Medzihradszky , K. F. , Salas-Castillo , S. P. & Ghiladi , R. A. Isoniazid-resistance conferring mutations in Mycobacterium tuberculosis KatG: catalase, peroxidase, and INH-NADH adduct formation activities . Protein Sci 19 , 458 – 474 ( 2010 ). OpenUrl CrossRef PubMed Web of Science 25. ↵ Vilchèze , C. et al. Transfer of a point mutation in Mycobacterium tuberculosis inhA resolves the target of isoniazid . Nat Med 12 , 1027 – 1029 ( 2006 ). OpenUrl CrossRef PubMed Web of Science 26. ↵ Shea , J. et al. Low-level rifampin resistance and rpoB mutations in Mycobacterium tuberculosis: an analysis of whole-genome sequencing and drug susceptibility test data in New York . J Clin Microbiol 59 ( 2021 ). 27. ↵ Domínguez , J. et al. Clinical implications of molecular drug resistance testing for Mycobacterium tuberculosis: a 2023 TBnet/RESIST-TB consensus statement . Lancet Infect Dis 23 , e122 – e137 ( 2023 ). OpenUrl CrossRef PubMed 28. ↵ Liu , L. et al. The impact of combined gene mutations in inhA and ahpC genes on high levels of isoniazid resistance amongst katG non-315 in multidrug-resistant tuberculosis isolates from China . Emerg Microbes Infect 7 , 183 ( 2018 ). OpenUrl PubMed 29. ↵ Safi , H. et al. Evolution of high-level ethambutol-resistant tuberculosis through interacting mutations in decaprenylphosphoryl-β-D-arabinose biosynthetic and utilization pathway genes . Nat Genet 45 , 1190 – 1197 ( 2013 ). OpenUrl CrossRef PubMed 30. ↵ Solanki , P. , Lipman , M. , McHugh , T. D. & Satta , G. Whole genome sequencing and prediction of antimicrobial susceptibilities in non-tuberculous mycobacteria . Front Microbiol 13 , 1044515 ( 2022 ). OpenUrl CrossRef PubMed 31. ↵ Stoesser , N. et al. Predicting antimicrobial susceptibilities for Escherichia coli and Klebsiella pneumoniae isolates using whole genomic sequence data . J Antimicrob Chemother 68 , 2234 – 2244 ( 2013 ). OpenUrl CrossRef PubMed Web of Science 32. ↵ Cingolani , P. et al. A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3 . Fly (Austin) 6 , 80 – 92 ( 2012 ). OpenUrl PubMed 33. ↵ Steiner , A. , Stucki , D. , Coscolla , M. et al. KvarQ: targeted and direct variant calling from fastq reads of bacterial genomes . BMC Genomics , 15 ( 2014 ). 34. ↵ Feuerriegel , S. et al. PhyResSE: a web tool delineating Mycobacterium tuberculosis antibiotic resistance and lineage from whole-genome sequencing data . J Clin Microbiol 53 , 1908 – 1914 ( 2015 ). OpenUrl Abstract / FREE Full Text 35. ↵ Chen , T. & Guestrin , C. XGBoost: a scalable tree boosting system . In Proc. 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining 785–794 ( ACM , 2016 ); doi: 10.1145/2939672.2939785 . OpenUrl CrossRef 36. WHO consolidated guidelines on tuberculosis: Module 4: treatment - drug-susceptible tuberculosis treatment (WHO , 2022 ); https://iris.who.int/bitstream/handle/10665/353829/9789240048126-eng.pdf?sequence=1 . View the discussion thread. Back to top Previous Next Posted January 03, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following An explainable artificial intelligence framework reveals mutations associated with drug resistance in Mycobacterium tuberculosis Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share An explainable artificial intelligence framework reveals mutations associated with drug resistance in Mycobacterium tuberculosis Hui Cen , Peng Zhang , Yunchao Ling , Guoping Zhao , Guoqing Zhang bioRxiv 2024.12.18.629160; doi: https://doi.org/10.1101/2024.12.18.629160 Share This Article: Copy Citation Tools An explainable artificial intelligence framework reveals mutations associated with drug resistance in Mycobacterium tuberculosis Hui Cen , Peng Zhang , Yunchao Ling , Guoping Zhao , Guoqing Zhang bioRxiv 2024.12.18.629160; doi: https://doi.org/10.1101/2024.12.18.629160 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Microbiology Subject Areas All Articles Animal Behavior and Cognition (7622) Biochemistry (17650) Bioengineering (13871) Bioinformatics (41881) Biophysics (21424) Cancer Biology (18566) Cell Biology (25461) Clinical Trials (138) Developmental Biology (13365) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15590) Genomics (22476) Immunology (17713) Microbiology (40331) Molecular Biology (17148) Neuroscience (88473) Paleontology (666) Pathology (2827) Pharmacology and Toxicology (4816) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.