Full text
45,251 characters
· extracted from
preprint-html
· click to expand
Network Analysis of Pairwise Relative Tuberculosis Transmission Probabilities in Lima, Peru | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Network Analysis of Pairwise Relative Tuberculosis Transmission Probabilities in Lima, Peru View ORCID Profile Anne N. Shapiro , Meredith B. Brooks , Chuan-Chin Huang , Megan B. Murray , Laura F. White , Helen E. Jenkins doi: https://doi.org/10.1101/2025.11.18.25340467 Anne N. 
Shapiro 1 Department of Biostatistics, Boston University School of Public Health , Boston, Massachusetts Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Anne N. Shapiro For correspondence: anshap{at}bu.edu Meredith B. Brooks 2 Department of Global Health, Boston University , Boston, Massachusetts Find this author on Google Scholar Find this author on PubMed Search for this author on this site Chuan-Chin Huang 3 Department of Global Health and Social Medicine, Harvard Medical School , Boston, Massachusetts 4 Division of Global Health Equity, Brigham and Women’s Hospital , Boston, Massachusetts 5 Center for Communicable Disease Dynamics, Department of Epidemiology, Harvard T.H. Chan School of Public Health , Boston, Massachusetts Find this author on Google Scholar Find this author on PubMed Search for this author on this site Megan B. Murray 3 Department of Global Health and Social Medicine, Harvard Medical School , Boston, Massachusetts 4 Division of Global Health Equity, Brigham and Women’s Hospital , Boston, Massachusetts 5 Center for Communicable Disease Dynamics, Department of Epidemiology, Harvard T.H. Chan School of Public Health , Boston, Massachusetts Find this author on Google Scholar Find this author on PubMed Search for this author on this site Laura F. White 1 Department of Biostatistics, Boston University School of Public Health , Boston, Massachusetts Find this author on Google Scholar Find this author on PubMed Search for this author on this site Helen E. Jenkins 1 Department of Biostatistics, Boston University School of Public Health , Boston, Massachusetts Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF ABSTRACT Background Identifying transmission events is important in understanding infectious disease dynamics. 
Such events are typically unobservable, particularly in diseases with long serial intervals such as tuberculosis (TB). We apply network techniques to identify transmission clusters and features shared within clusters. Methods We estimate directed pairwise transmission probabilities via an existing iterative algorithm that employs a modified Naïve Bayes classifier to incorporate demographic, clinical, and genetic data and use these probabilities to create a network. We explore noise reduction techniques to trim low probability edges. We apply clustering algorithms to group together individuals with TB based on edges informed by transmission probabilities. We apply our framework to simulated data and assess how the clustering algorithms captured the simulated clusters. We then apply this approach to data from a cohort study in Lima, Peru and examine the homogeneity of the clusters using a binary entropy measure. Results We find cluster performance to be consistent across all edge trimming scenarios and clustering methods. We find high levels of entropy for age, sex, socioeconomic status, and individuals who work outside the house and use public transit, indicating these variables are heterogenous across clusters. Conclusions We demonstrate approaches to analyze estimated directed pairwise transmission probabilities with network techniques. The approach is consistent across network construction and clustering methods. This method can be applied to any disease outbreak to understand its dynamics. 1. INTRODUCTION Understanding transmission trends is critical to informing targeted interventions to interrupt infectious disease spread. However, transmission is often unobservable and difficult to trace, particularly for diseases with long serial intervals such as tuberculosis (TB). 1 Despite TB being a leading cause of death globally 2 , its epidemiological parameters remain poorly informed, largely due to these tracing challenges. 
3 , 4 Common methods to quantify transmission trends use genetic data, often single nucleotide polymorphisms (SNP) differences obtained from pathogen whole genome sequencing (WGS), to form transmission clusters. Isolates are included in a cluster if they are within a predetermined SNP distance from at least one other isolate in the cluster. 5 , 6 , 7 , 8 By construction, two individuals in a cluster can be separated by more than the specified cutoff. This method has notable concerns: SNP distances alone cannot definitively determine transmission 9 and results can vary depending on the selected SNP distance cutoff. 5 In our previous work, henceforth called “mlTransEpi,” we estimated pairwise transmission probabilities using demographic and clinical data combined with SNP differences. 10 mlTransEpi is a data driven method that does not assume that genetic differences alone can inform transmission events. Instead, it uses an iterative machine learning algorithm to estimate pairwise transmission probabilities. We seek to use these transmission probabilities to identify epidemiologically informed transmission clusters via a network graph, with each node being an individual with TB and each edge weighted by the directed transmission probability. We hypothesize that the clusters identified from the transmission network improve on clusters created solely based on SNP distances. As stated earlier, SNP distances may not accurately represent transmission. Additionally, SNP clustering methods will add an individual to a cluster if they are within the cutoff of at least one other individual in the cluster; thus, not all individuals in the cluster are necessarily similar and/or share transmission events. In contrast, network-based clustering methods consider the relationship between all individuals within a cluster. Here, we first examine the accuracy of these clustering methods via simulation. 
We then apply them to pairwise transmission probabilities estimated with published cohort data from Lima, Peru. 11 We compare clusters created using network methods with mlTransEpi probabilities to those generated with traditional SNP cutoff methods. Finally, we analyze clusters to determine whether the distributions of demographic and clinical features of individuals vary between clusters. 2. METHODS 2.1 Graph construction We estimate directed pairwise transmission probabilities using mlTransEpi, described further elsewhere 10 , 12 , 13 and shown in Figure 1 . Download figure Open in new tab Figure 1. Diagram of the iterative process to generate transmission probabilities, create the network, and assign clusters. We create a network graph where nodes are individuals with TB and each estimated transmission probability is a weighted, directed edge (where the weight is the estimated transmission probability). Our iterative algorithm estimates all pairwise probabilities; all pairwise edges between individuals will be nonzero, but many will be very small, reflecting the extreme improbability that they are transmission links. To reduce noise, we consider three methods to remove these low probability edges: a probability cutoff, hierarchical clustering 14 , and kernel density estimation (see Supplementary materials S1 and Figure 1 , panel 2). 15 We examine two clustering methods: the Infomap algorithm 16 and the Leiden algorithm. 17 The calculation of modularity in the Leiden algorithm requires specification of a resolution parameter; higher resolution parameters result in more clusters and lower resolution parameters lead to fewer clusters. There is no optimal value for the resolution parameter in the Leiden algorithm; rather, it is selected based on the desired granularity of clusters. For this reason, we consider multiple values (see Figure 1 , panel 3). 17 2.2 Simulation study We simulate 100 TB-like outbreaks following a procedure described in Supplementary Materials Section S2.
We apply the iterative algorithm to each of the simulated outbreaks with 10 iterations and 10 cross-validation folds. We consider pairs with fewer than 4 SNP differences to be probable links and pairs with more than 12 SNP differences to be probable non-links. All pairs with 4-12 SNP differences are considered indeterminate and used only in the prediction dataset. We vary SNP difference boundaries in sensitivity analyses. We include all pairwise combinations in which the infector was observed before the infectee or up to one year after. We assess edge trimming performance via multiple metrics. We calculate the number of true edges remaining, defined as the number of edges corresponding to true transmission events that are not removed by our edge trimming method. We also calculate sensitivity (Se), specificity (Sp), and positive predictive value (PPV), with the truth being an edge that corresponds to a true transmission instance. We consider two types of true clusters to assess clustering performance. First, we allow each transmission chain to be a true cluster (henceforth “full outbreak”). Each simulation generates a random number of transmission chains. Second, we remove the first 12 years of each simulation and consider each separate subgraph (groups of nodes connected by transmission events) to be a true cluster (henceforth “trimmed outbreak”). In this scenario, we exclude nodes that become singletons (i.e. are not connected to any other node). This scenario is more similar to the data available from TB outbreaks, which are collected over a shorter period with the beginning of the epidemic unobserved. Figure 2 illustrates these two true cluster constructions. Download figure Open in new tab Figure 2. Single simulation example of true cluster assignments. In image (a), each of the transmission chains (differentiated by color) is considered a true cluster. Images (b) and (c) depict the process of creating true clusters after removing the first 12 years of each simulation.
In image (b), all people diagnosed outside of the 8-year period of interest are colored white. Note that edge lengths and node positions do not scale to time. Image (c) depicts true clusters after removing people diagnosed during the first 12 years of our outbreak (with colors corresponding to their original outbreak chain); we consider each separate subgraph to be an outbreak. We cluster using the Infomap algorithm and the Leiden algorithm with resolution parameters of 2, 3, and 4; note resolution is user-determined for the analytic goals of the network analysis. 18 We assess cluster performance using the mutual information score (MI), 19 pairwise F-score (F), and Fowlkes-Mallows index (FM). 20 MI has a lower bound of 0 and no upper bound; F and FM are bounded between 0 and 1. A higher value of all three metrics indicates better clustering performance. See Supplementary Materials Section S2 for further details. We perform a sensitivity analysis to assess clusters’ robustness to various sampling coverage scenarios. We examine four different scenarios: randomly sampling 50% and 80% of individuals (nodes), sampling only 80% of individuals in one class of the binary variable $X_1$ (to simulate differential sampling by sex 21 ), and of individuals in one class of the categorical variable $X_4$ (to simulate lack of access to healthcare in a specific neighborhood 22 ). For each of these scenarios, we simulate an outbreak and remove individuals per the sampling scheme prior to calculating transmission probabilities. We also vary the time allowed between diagnosis of infectee and infector to assess the method’s robustness to diagnosis date misspecification. 2.3 Data application We estimate pairwise transmission probabilities on data from a cohort study from Lima, Peru previously described by Trevisi et al. 11 We consider all pairs in which a potential infector was diagnosed before, or up to one year after, a potential infectee and use the same SNP distance thresholds as in simulations.
We use the pairwise variables as described in Trevisi et al. 11 in our Naïve Bayes model. Individual-level and pairwise variables and their frequencies are described in Supplemental Materials Tables S2a and S2b. We apply the iterative method with 60 iterations and 10 cross-validation folds. We use hierarchical clustering with a cutoff of 0.1 to trim edges and the Infomap algorithm to create clusters for results in the main text; results using additional cutoffs and clustering methods are shown in the Supplementary Materials Section S3. We calculate the binary entropy of each cluster and calculate a weighted average by cluster size to assess the homogeneity of clusters. Binary entropy is a measure of the randomness of a binary variable. A value of 0 indicates perfect homogeneity and increasing values indicate increasing heterogeneity, with the maximal value of 1 corresponding to equal probability of each feature of the binary variable within a cluster. We also calculate the proportion of perfectly homogeneous clusters. We exclude clusters with fewer than 4 individuals from both homogeneity measures. We also compare our methods to traditional SNP based clustering. For these clusters, we consider individuals to be in the same cluster if they were within the SNP distance cutoff of at least one other individual in the cluster. We generate traditional clusters using SNP cutoffs of <12 and <20, which are commonly used cutoffs for SNP distance clustering. 5 , 23 We compare these to clusters generated using the mlTransEpi algorithm with an upper bound of 12 and 20 SNPs. 3. RESULTS 3.1 Simulation study Edge trimming metrics are consistent for both hierarchical clustering cutoffs and kernel density estimation binwidths ( Figure 3 ). Metrics using probability cutoffs have similar means to hierarchical clustering and kernel density estimation but larger variations ( Figure 3 ).
Clustering performance is also consistent across edge trimming methods for trimmed outbreak simulations ( Figure 4 ). Clusters using the Leiden algorithm with a resolution parameter of 2 have slightly higher F and FM scores, indicating that this clustering algorithm best captures the true clusters for our simulated outbreaks. Download figure Open in new tab Figure 3. Edge trimming metrics from 100 simulated outbreaks, using the last 8 years of each outbreak. We consider three edge trimming methods: hierarchical clustering (HC), kernel density (KD) estimation, and probability cutoffs. Results are consistent across HC and KD edge trimming scenarios; probability cutoffs show more noise. Download figure Open in new tab Figure 4. Pairwise F (F), Fowlkes-Mallows (FM), and mutual information (MI) scores assessing clustering performance for 100 simulated outbreaks, using the last 8 years of each outbreak. Results are consistent across all edge trimming scenarios. These trends hold when using the full outbreak (Figures S1a and S1b). A SNP distance lower bound of 4 results in a higher percentage of true edges kept as well as higher F and FM scores, but trends are consistent across edge trimming and clustering methods for each set of bounds (Figures S2a and S2b). Similarly, positive predictive value increases as the generation interval distribution used to simulate the outbreak increases (Figures S3a and S3b), but trends also hold across edge trimming and clustering methods for each value. Results are consistent when varying the amount of time allowed between diagnosis of the potential infector and infectee (Supplemental Materials Figure S4; recall that main results allow for the potential infector to be observed up to one year after the potential infectee) and across all sampling coverage scenarios (Supplemental Materials Figure S5).
3.2 Data application Applying the Infomap clustering algorithm to a network trimmed with a hierarchical clustering cutoff of 0.1 results in 275 clusters. Of those, 163 clusters (59.2%) are of size 2 and 90% have 5 or fewer individuals. The largest cluster consists of 45 individuals. Network and cluster characteristics are similar for different hierarchical clustering cutoffs (used to trim low-probability edges; a higher cutoff will result in more edges being removed) and cluster methods (Table S3). Variables for public transit use, working outside the home, age, sex, and socioeconomic status (SES) all have high weighted average entropy and low proportions of purely homogenous clusters ( Figure 5 ). Recall that our entropy average is weighted by cluster size. HIV and drinking statuses have the lowest average entropy and highest proportion of homogeneity. Incarceration status also has a low average entropy. Supplemental materials Figure S6 shows individual binary entropy by variable for each cluster. Supplementary materials Figures S7 and S8 show the average entropy and proportion of homogeneity across multiple hierarchical clustering cutoffs and clustering methods. Download figure Open in new tab Figure 5. Weighted average binary entropy of clusters and proportion of homogeneous clusters using data from Lima, Peru. Values closer to 0 correspond to higher homogeneity. Features with a large proportion of homogenous clusters would be expected to have lower entropy. Definition of abbreviations: SES = socioeconomic status; PLHIV = people living with HIV. 3.2.1 Comparison to traditional SNP distance cut-off methods Using an upper bound of 12 and 20 SNPs in the mlTransEpi algorithm results in the same network clusters, whereas SNP distance-based clusters vary by SNP cutoff ( Table 1 ), indicating network-based clusters are more robust to SNP bounds than traditional methods. 
Our network-based clusters are smaller on average than those created using SNP cutoffs and have a lower average maximum SNP distance (5.0 for Infomap clusters, 6.9 for clusters with SNP cutoff of 12, and 10.8 for clusters with SNP cutoff of 20). Network-based clusters exclude more individuals than SNP based clusters. Individuals are excluded from network-based clusters if they do not have a high probability infector and are excluded from SNP distance-based clusters if they are not within 12 or 20 SNPs of anyone else in the cohort. View this table: View inline View popup Download powerpoint Table 1. Comparison of clusters using network methods and hierarchical clustering (HC) for edge trimming to traditional clustering methods with varying single nucleotide polymorphism (SNP) distance cutoffs. Unclustered individuals refer to those who were found to not cluster with any other individuals. Note that using an upper bound of 12 and 20 SNPs in the mlTransEpi algorithm results in the same clustered network. 4. DISCUSSION We use network techniques to organize and analyze estimated directed pairwise transmission probabilities generated using mlTransEpi. We compare the resulting network-based clusters to traditional SNP distance-based clusters and show that while our clusters remain invariant under different SNP distance cutoffs in the mlTransEpi algorithm, traditional methods are sensitive to the chosen SNP distance cutoff. We also find that TB transmission clusters in our Lima, Peru setting are heterogeneous across most individual level features, potentially indicating significant mixing between age, sex, and SES groups. These methods are useful to better understand transmission dynamics and can be applied to any infectious disease with WGS and/or contact tracing data. We find high levels of heterogeneity across clusters, particularly by age, sex, SES, public transit use, and working outside the home.
Recent research has begun to deemphasize the role of household TB transmission, particularly in high burden TB settings, and suggests that transmission may not always be between known close contacts. 24 , 25 We would expect this to result in lower rates of assortative mixing (i.e., more mixing between demographic groups) amongst those transmitting TB, as seen in our results. We see high homogeneity in HIV and smoking across clusters, although this may be due to very low prevalences of HIV positivity and smoking in our cohort (Table S2a). Incarceration history also has a low prevalence in our cohort; however, it has moderate entropy and a moderate proportion of purely homogeneous clusters. Prisons are known TB transmission hotspots, 26 , 27 particularly in Latin America. 28 , 29 Our results indicate that people with a history of incarceration are distributed amongst transmission clusters rather than being clustered together, indicating that they may be contributing to community spread of TB. We also see moderate entropy amongst people who drink; these individuals are contributing to transmission clusters that are both homogeneous and heterogeneous for drinking status. Alcohol use is a known risk factor for TB, with multiple outbreaks being traced to people who frequent bars. Our results suggest that these individuals contribute to general community spread, not just amongst those who also drink. A strength of using mlTransEpi-generated probabilities for cluster analysis is that the results do not depend solely on genetic data and/or diagnosis dates. Existing methodologies relying on these data 5 , 7 , 8 assume that pathogen genomes mutate rapidly enough to detect differences between infected individuals. 9 TB has a low mutation rate, 30 , 31 suggesting that methodologies using genetic information alone may not adequately capture transmission dynamics. Furthermore, existing methods often depend on an arbitrary SNP distance cutoff to determine clusters.
9 , 23 mlTransEpi is robust to varying SNP cutoffs, and it softens the assumption that SNP distances either do or do not indicate transmission by denoting case pairs as probable (rather than confirmed) transmission events and by estimating transmission probabilities for all pairs within its iterative algorithm. 10 We showed that clusters based on mlTransEpi probabilities are invariant to varying cutoffs, while traditional clusters are not. That said, clusters based on mlTransEpi probabilities also exclude more individuals than traditional SNP based clusters, due to lack of high probability infectors. This is both a strength and a weakness of these network clusters, as only individuals for whom we can identify high probability infectors will be included. Thus, while we cluster fewer individuals, we have a higher confidence in the relationships between individuals in a cluster. Alternative methods such as TransPhylo 32 detect clusters on reconstructed phylogenies and transmission trees that are generated via a Bayesian model that incorporates diagnosis dates and genetic sequences. These models are resource intensive and rely on strong evolutionary model assumptions, as the user must provide often unknown parameters to inform the evolutionary model. TransPhylo methods are also extremely sensitive to the proportion of cases sampled. 33 New work incorporates epidemiological data into the TransPhylo framework, which has improved performance, but the method remains computationally intensive. 34 Our methods do not rely on such assumptions, and we demonstrate that our clusters are invariant to multiple sampling scenarios (Supplemental Materials Figure S5). This may be due to mlTransEpi not assuming one true infector for each infectee but rather calculating the relative probabilities for all possible infectors, as well as excluding individuals for whom we do not have a high probability infector from clustering analyses.
Additionally, we also allow infectors to be diagnosed after infectees. This is important for a disease such as TB, which often has a long and variable diagnostic delay and latent and subclinical period, such that the infectee may develop disease and symptoms before the infector. 35 , 36 , 37 Our clusters are based on underlying transmission probabilities generated using naïve Bayes, a simple and well-studied machine learning classifier. 38 , 39 We chose naïve Bayes over more complex machine learning algorithms because of its transparency and ease of use, particularly regarding sparse and missing data. 10 Though it has many advantages, naïve Bayes assumes that covariates are independent when conditioning on the outcome. While this assumption may be unrealistic, numerous studies demonstrate that naïve Bayes is robust to violations of this assumption. 38 , 39 , 40 Multiple studies have introduced extensions of naïve Bayes for situations in which covariates are dependent. 41 , 42 Future work seeks to incorporate these extensions into mlTransEpi. While our results were independent of clustering algorithm specifications, the algorithms we used had limitations. The Leiden algorithm is a local community detection algorithm that optimizes modularity to determine clusters. 17 It builds upon the Louvain method, which was shown to produce poorly connected communities and merge smaller communities into larger ones, by employing a refinement phase that allows communities to be split in order to ensure all communities are well-connected. 17 That said, it can miss small clusters in certain cases, and relies on specification of a resolution parameter for which there is no one optimal value. 17 While the Infomap algorithm does not require specification of a resolution parameter, it too has biases. 
It sometimes combines multiple clusters into a single community if edges between them have enough weight to guide the random walk or if nodes exist with high edge weights to multiple other clusters. 16 , 43 These two algorithms were selected because they are well researched and operate on weighted, directed networks. Future work seeks to examine and apply other network clustering algorithms for transmission cluster detection. We demonstrate a method to organize pairwise directed transmission probabilities into transmission clusters and analyze these clusters to determine which demographics tend to cluster together. Both the underlying transmission probabilities and clustering algorithms rely on minimal assumptions, making this a data-driven method that can be applied to any infectious disease outbreak dataset with demographic data, WGS, and/or contact tracing data. An understanding of disease transmission clusters can be used to inform disease control and targeted interventions to interrupt transmission. DECLARATIONS Ethics approval The Harvard School of Public Health institutional review board and Peru’s Research Ethics Committee of the National Institutes of Health gave ethical approval for this work. All study participants provided voluntary written informed consent prior to study participation. Data Availability Data cannot be shared publicly to protect study participant privacy. Author contributions ANS, HEJ, and LFW designed the study methodology. ANS implemented the analysis and drafted the manuscript. MBB, MBM and CCH advised on study implementation. All authors aided in manuscript revision.
Supplemental materials Supplementary materials are available at IJE online. Conflict of interest None declared Funding The authors disclosed receipt of the following financial support for the research, authorship, and/or publication of this article: ANS is funded by the National Institute of Allergy and Infectious Diseases, National Institutes of Health (grant number 1F31AI183782-01A1). MBB is funded by the National Institutes of Health and the National Institute of Allergy and Infectious Diseases (grant number K01AI151083). MBM is funded by the National Institutes of Health and the National Institute of Allergy and Infectious Diseases (grant numbers U01AI057786, U19AI076217, U19AI109755, U19AI111224, and U19AI142793). LFW is funded by the National Institutes of Health (grant number R35GM141821). The content of the article is solely the responsibility of the authors and does not necessarily represent the views of the funding agencies. The funders had no role in the decision to publish this manuscript. Data availability Data cannot be shared publicly to protect study participant privacy. Code is available upon request. Use of Artificial intelligence (AI) tools No AI tools were used in this analysis or drafting of this manuscript. Acknowledgements The authors thank Leonid Lecca, Roger I. Calderon, Carmen C. Contreras, Judith Jimenez, and others at Socios en Salud in Lima, Peru. We also thank the patients, their families, and the healthcare personnel at the participating health centers in Lima, Peru. Without these people, this study would not have been possible. References 1. ↵ Kendall EA , Shrestha S , Dowdy DW . The Epidemiological Importance of Subclinical Tuberculosis. A Critical Reappraisal . Am J Respir Crit Care Med . 2021 ; 203 ( 2 ): 168 – 74 . OpenUrl CrossRef PubMed 2. ↵ World Health Organization . Global tuberculosis report 2024 . Geneva : World Health Organization ; 2024 . 3. ↵ Vink MA , Bootsma MCJ , Wallinga J .
Serial intervals of respiratory infectious diseases: a systematic review and analysis . American journal of epidemiology . 2014 ; 180 ( 9 ): 865 – 75 . OpenUrl CrossRef PubMed Web of Science 4. ↵ Ma Y , Horsburgh CR , White LF , Jenkins HE . Quantifying TB transmission: a systematic review of reproduction number and serial interval estimates for tuberculosis . Epidemiol Infect . 2018 ; 146 ( 12 ): 1478 – 94 . OpenUrl CrossRef PubMed 5. ↵ Ryckman TS , Hopkins L , Tang L , Biche P , Mohlamonyane M , Morolo M , et al. Molecular epidemiology of Mycobacterium tuberculosis across three distinct geographic sites in South Africa . The Journal of Infectious Diseases . 2025 :jiaf326. 6. ↵ Warren JL , Chitwood MH , Sobkowiak B , Colijn C , Cohen T . Spatial modeling of Mycobacterium tuberculosis transmission with dyadic genetic relatedness data . Biometrics . 2023 ; 79 ( 4 ): 3650 – 63 . OpenUrl PubMed 7. ↵ Gardy JL , Johnston JC , Sui SJH , Cook VJ , Shah L , Brodkin E , et al. Whole-genome sequencing and social-network analysis of a tuberculosis outbreak . New England Journal of Medicine . 2011 ; 364 ( 8 ): 730 – 9 . OpenUrl CrossRef PubMed Web of Science 8. ↵ Huang C-C , Trevisi L , Becerra MC , Calderón RI , Contreras CC , Jimenez J , et al. Spatial scale of tuberculosis transmission in Lima, Peru . Proceedings of the National Academy of Sciences . 2022 ; 119 ( 45 ): e2207022119 . OpenUrl CrossRef PubMed 9. ↵ Campbell F , Strang C , Ferguson N , Cori A , Jombart T . When are pathogen genome sequences informative of transmission events? PLoS Pathog . 2018 ; 14 ( 2 ): e1006885 . OpenUrl CrossRef PubMed 10. ↵ Leavitt SV , Lee RS , Sebastiani P , Horsburgh CR , Jenkins HE , White LF . Estimating the relative probability of direct transmission between infectious disease patients . Int J Epidemiol . 2020 ; 49 ( 3 ): 764 – 75 . OpenUrl PubMed 11. ↵ Trevisi L , Brooks MB , Becerra MC , Calderon RI , Contreras CC , Galea JT , et al. 
Who Transmits Tuberculosis to Whom: A Cross-Sectional Analysis of a Cohort Study in Lima, Peru . Am J Respir Crit Care Med . 2024 ; 210 ( 2 ): 222 – 33 . OpenUrl PubMed 12. ↵ Leavitt SV , Horsburgh CR , Jr. , Lee RS , Tibbs AM , White LF , Jenkins HE . What Can Genetic Relatedness Tell Us About Risk Factors for Tuberculosis Transmission? Epidemiology . 2022 ; 33 ( 1 ): 55 – 64 . OpenUrl PubMed 13. ↵ Leavitt SV , Jenkins HE , Sebastiani P , Lee RS , Horsburgh Jr CR , Tibbs AM , et al. Estimation of the generation interval using pairwise relative transmission probabilities . Biostatistics . 2022 ; 23 ( 3 ): 807 – 24 . OpenUrl PubMed 14. ↵ Murtagh F , Contreras P . Algorithms for hierarchical clustering: an overview . Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery . 2012 ; 2 ( 1 ): 86 – 97 . OpenUrl CrossRef 15. ↵ Davis RA , Lii K-S , Politis DN . Remarks on some nonparametric estimates of a density function . Selected Works of Murray Rosenblatt: Springer ; 2011 . p. 95 – 100 . 16. ↵ Rosvall M , Axelsson D , Bergstrom CT . The map equation . The European Physical Journal Special Topics . 2009 ; 178 ( 1 ): 13 – 23 . OpenUrl CrossRef 17. ↵ Traag VA , Waltman L , Van Eck NJ . From Louvain to Leiden: guaranteeing well-connected communities . Scientific reports . 2019 ; 9 ( 1 ): 1 – 12 . OpenUrl PubMed 18. ↵ Traag V , Waltman L , Van Eck N . From Louvain to Leiden: guaranteeing well-connected communities . Sci. Rep . 9 , 5233 . 2019 . OpenUrl CrossRef PubMed 19. ↵ Shannon CE . A mathematical theory of communication . The Bell system technical journal . 1948 ; 27 ( 3 ): 379 – 423 . OpenUrl CrossRef Web of Science 20. ↵ Fowlkes EB , Mallows CL . A method for comparing two hierarchical clusterings . Journal of the American statistical association . 1983 ; 78 ( 383 ): 553 – 69 . OpenUrl CrossRef Web of Science 21. ↵ Horton KC , MacPherson P , Houben RM , White RG , Corbett EL . 
Sex Differences in Tuberculosis Burden and Notifications in Low- and Middle-Income Countries: A Systematic Review and Meta-analysis . PLoS Med . 2016 ; 13 ( 9 ): e1002119 . OpenUrl CrossRef PubMed 22. ↵ Zumla A , Sahu S , Ditiu L , Singh U , Park YJ , Yeboah-Manu D , et al. Inequities underlie the alarming resurgence of Tuberculosis as the world’s top cause of death from an Infectious Disease - Breaking the silence and addressing the underlying root causes . IJID Reg . 2025 ; 14 ( Suppl 2 ): 100587 . OpenUrl PubMed 23. ↵ Walker TM , Ip CL , Harrell RH , Evans JT , Kapatai G , Dedicoat MJ , et al. Whole-genome sequencing to delineate Mycobacterium tuberculosis outbreaks: a retrospective observational study . Lancet Infect Dis . 2013 ; 13 ( 2 ): 137 – 46 . OpenUrl CrossRef PubMed Web of Science 24. ↵ Mathema B , Andrews JR , Cohen T , Borgdorff MW , Behr M , Glynn JR , et al. Drivers of tuberculosis transmission . The Journal of infectious diseases . 2017 ; 216 ( suppl_6 ): S644 – S53 . OpenUrl PubMed 25. ↵ Kasaie P , Andrews JR , Kelton WD , Dowdy DW . Timing of tuberculosis transmission and the impact of household contact tracing. An agent-based simulation model . American journal of respiratory and critical care medicine . 2014 ; 189 ( 7 ): 845 – 52 . OpenUrl CrossRef PubMed 26. ↵ Haeusler IL , Torres-Ortiz A , Grandjean L . A systematic review of tuberculosis detection and prevention studies in prisons . Global Public Health . 2022 ; 17 ( 2 ): 194 – 209 . OpenUrl PubMed 27. ↵ Miyahara R , Piboonsiri P , Chiyasirinroje B , Imsanguan W , Nedsuwan S , Yanai H , et al. Risk for prison-to-community tuberculosis transmission, Thailand, 2017–2020 . Emerging Infectious Diseases . 2023 ; 29 ( 3 ): 477 . OpenUrl PubMed 28. ↵ Liu YE , Mabene Y , Camelo S , Rueda ZV , Pelissari DM , Johansen FDC , et al. Mass incarceration as a driver of the tuberculosis epidemic in Latin America and projected effects of policy alternatives: a mathematical modelling study . 
The Lancet Public Health . 2024 ; 9 ( 11 ): e841 – e51 . OpenUrl PubMed 29. ↵ Soto A . Prisons as boosters of tuberculosis and drug resistance tuberculosis transmission in Latin America . The Lancet Regional Health–Americas . 2024 ; 31 . 30. ↵ Ford CB , Shah RR , Maeda MK , Gagneux S , Murray MB , Cohen T , et al. Mycobacterium tuberculosis mutation rate estimates from different lineages predict substantial differences in the emergence of drug-resistant tuberculosis . Nature genetics . 2013 ; 45 ( 7 ): 784 – 90 . OpenUrl CrossRef PubMed 31. ↵ David HL . Probability distribution of drug-resistant mutants in unselected populations of Mycobacterium tuberculosis . Applied microbiology . 1970 ; 20 ( 5 ): 810 – 4 . OpenUrl CrossRef PubMed Web of Science 32. ↵ Didelot X , Fraser C , Gardy J , Colijn C . Genomic Infectious Disease Epidemiology in Partially Sampled and Ongoing Outbreaks . Mol Biol Evol . 2017 ; 34 ( 4 ): 997 – 1007 . OpenUrl PubMed 33. ↵ Goldstein IH , Bayer D , Barilar I , Kizito B , Matsiri O , Modongo C , et al. Using genetic data to identify transmission risk factors: Statistical assessment and application to tuberculosis transmission . PLoS Comput Biol . 2022 ; 18 ( 12 ): e1010696 . OpenUrl PubMed 34. ↵ Carson J , Keeling M , Ribeca P , Didelot X . Incorporating Epidemiological Data into the Genomic Analysis of Partially Sampled Infectious Disease Outbreaks . Mol Biol Evol . 2025 ; 42 ( 4 ). 35. ↵ Ku CC , MacPherson P , Khundi M , Nzawa Soko RH , Feasey HRA , Nliwasa M , et al. Durations of asymptomatic, symptomatic, and care-seeking phases of tuberculosis disease with a Bayesian analysis of prevalence survey and notification data . BMC Med . 2021 ; 19 ( 1 ): 298 . OpenUrl PubMed 36. ↵ Ehsanul Huq K , Moriyama M , Zaman K , Chisti MJ , Long J , Islam A , et al. Health seeking behaviour and delayed management of tuberculosis patients in rural Bangladesh . BMC Infect Dis . 2018 ; 18 ( 1 ): 515 . OpenUrl PubMed 37. 
↵ Karim F , Islam MA , Chowdhury AM , Johansson E , Diwan VK . Gender differences in delays in diagnosis and treatment of tuberculosis . Health Policy Plan . 2007 ; 22 ( 5 ): 329 – 34 . OpenUrl CrossRef PubMed Web of Science 38. ↵ Kuncheva LI . On the optimality of Naïve Bayes with dependent binary features . Pattern Recognition Letters . 2006 ; 27 ( 7 ): 830 – 7 . OpenUrl 39. ↵ Rish I , editor An empirical study of the naive Bayes classifier. IJCAI 2001 workshop on empirical methods in artificial intelligence ; 2001 : Seattle, USA . 40. ↵ Zhang H . The optimality of naive Bayes . Aa . 2004 ; 1 ( 2 ): 3 . OpenUrl 41. ↵ Jiang L , Wang D , Cai Z , Yan X , editors. Survey of improving naive bayes for classification. Advanced Data Mining and Applications: Third International Conference, ADMA 2007 Harbin, China, August 6-8, 2007 Proceedings 3 ; 2007 : Springer . 42. ↵ Jiang L , Zhang L , Yu L , Wang D . Class-specific attribute weighted naive Bayes . Pattern recognition . 2019 ; 88 : 321 – 30 . OpenUrl 43. ↵ Smiljanić J , Blöcker C , Edler D , Rosvall M . Mapping flows on weighted and directed networks with incomplete observations . Journal of Complex Networks . 2021 ; 9 ( 6 ):cnab044. View the discussion thread. Back to top Previous Next Posted November 19, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Network Analysis of Pairwise Relative Tuberculosis Transmission Probabilities in Lima, Peru Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Network Analysis of Pairwise Relative Tuberculosis Transmission Probabilities in Lima, Peru Anne N. Shapiro , Meredith B. Brooks , Chuan-Chin Huang , Megan B. Murray , Laura F. White , Helen E. Jenkins medRxiv 2025.11.18.25340467; doi: https://doi.org/10.1101/2025.11.18.25340467 Share This Article: Copy Citation Tools Network Analysis of Pairwise Relative Tuberculosis Transmission Probabilities in Lima, Peru Anne N. Shapiro , Meredith B. Brooks , Chuan-Chin Huang , Megan B. Murray , Laura F. White , Helen E. Jenkins medRxiv 2025.11.18.25340467; doi: https://doi.org/10.1101/2025.11.18.25340467 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Infectious Diseases (except HIV/AIDS) Subject Areas All Articles Addiction Medicine (569) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1511) Epidemiology (15230) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6610) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15923) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6607) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3337) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (664) Pediatrics (1693) Pharmacology and 
Therapeutics (692) Primary Care Research (712) Psychiatry and Clinical Psychology (5448) Public and Global Health (9238) Radiology and Imaging (2202) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a01bba8a19623fe2',t:'MTc3OTc4NzQ4NQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.