Identification of transmission clusters with high influence on shared parameter estimates in Bayesian phylodynamics

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 95,071 characters · extracted from preprint-html · click to expand
Identification of transmission clusters with high influence on shared parameter estimates in Bayesian phylodynamics | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Identification of transmission clusters with high influence on shared parameter estimates in Bayesian phylodynamics View ORCID Profile Ariane Weber , View ORCID Profile Ruth Boersma , View ORCID Profile Sebastian Duchene doi: https://doi.org/10.1101/2025.11.18.25340546 Ariane Weber 1 Department of Archaeogenetics, Max Planck Institute for Evolutionary Anthropology , Leipzig, Germany 2 Transmission, Infection, Diversification & Evolution Group (tide), Max Planck Institute of Geoanthropology , Jena, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ariane Weber For correspondence: ariane_weber{at}eva.mpg.de Ruth Boersma 1 Department of Archaeogenetics, Max Planck Institute for Evolutionary Anthropology , Leipzig, Germany 2 Transmission, Infection, Diversification & Evolution Group (tide), Max Planck Institute of Geoanthropology , Jena, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ruth Boersma For correspondence: ariane_weber{at}eva.mpg.de Sebastian Duchene 3 Université Paris Cité , Paris, France 4 ED-ID unit, Dept of Computational Biology, Institut Pasteur , Paris, France 5 Peter Doherty Institute for Infection and Immunity, Dept of Microbiology and Immunology, University of Melbourne , Melbourne, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sebastian Duchene Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Estimating a shared rate of transmission from multiple clusters of pathogen genome se-quences has become increasingly common. Such inferences are appealing, as they allow the overall characterisation of a polyphyletic population and the inclusion of clusters that could not easily be analysed by themselves, for example due to a very small number of sequence samples. Differences in sample size, however, can result in substantial variations in the information content each cluster carries about the rate of transmission. When the clusters additionally differ in their respective transmission rates, the shared estimate would not represent all clusters equally well. It is therefore integral to assess the influence of each cluster on the estimate. Here, we focus on Bayesian phylodynamic inference which combines an evolutionary with an epidemiological model to infer transmission rates from clusters of genetic sequences. We build on results about case influence in Bayesian models to suggest a computationally inexpensive measure that quantifies influence in the context of inference from multiple conditionally independent clusters, regardless of the exact epidemiological model. We further analyse clusters simulated under various birth-death-sampling models to evaluate the performance of the method and explore which properties can generally be drivers of high influence. We find the influence to be a complex interplay of size, persistence and rate relative to all included clusters. We further highlight that the shared estimate from clusters of different rates does not correspond to a population mean or median, but rather to a certainty-weighted average. Finally, we demonstrate the practical insights that can be gained through influence assessment in the analysis of SARS-CoV-2 genomes, sampled in Germany before the implementation of the first nation-wide lockdown in early 2020. Our results illustrate how variable the influence of individual clusters can be and how quantifying it can guide further studies, for example into transmission rate differences. 1 Introduction Bayesian inference refers to the estimation of parameters-of-interest in terms of probability distributions from observed data. Given the data, these distributions directly quantify the uncertainty inherent in the estimation. To construct them, a probability model that connects the observed data with the parameters-of-interest and prior information on them is needed. As hierarchical dependencies can be incorporated into this model, Bayesian inference is a flexible method that allows complex model structures ( Gelman et al., 2013 ). In phylogenetics, it has thus proven to be a valuable addition to other inference methods, like maximum likelihood and maximum par-simony ( Felsenstein, 2004 ; Yang & Rannala, 2012 ). Bayesian phylogenetics aims to reconstruct phylogenetic trees and evolutionary parameters from genetic sequences. It has provided insights into many diversification histories on different time scales, from reconstructed evolutionary rela-tionships of mammals ( Álvarez-Carretero et al., 2021 ) reaching back millions of years or of human pathogens like HBV ( Kocher et al., 2021 ) over the last ten thousand years, to the emergence of human adapted SARS-CoV-2 on the scale of years and months (Pekar, Lytras, et al., 2025). Bayesian phylodynamics makes use of the hierarchical model structures that can be incorpo-rated into the analyses and constructs the prior probability distribution for the phylogenetic tree from a population dynamic process. Through this approach it does not only provide a mechanistic description of the generative process of the phylogenetic tree, but in turn also connects it with an underlying population (E. M. Volz et al., 2013 ). Thus, in addition to the tree, population dynamic parameters can be inferred from models based on coalescent theory ( Kingman, 1982 ) or birth-death processes ( Kendall, 1948 ). These parameters include population sizes through time or epidemiological quantities, for example the growth rate or the reproductive number, R , which represents the average number of infections caused by one case. In recent years, and especially during the COVID-19 pandemic, the construction and application of models for disease transmis-sion has grown into a focal point of Bayesian phylodynamic inference. These models can serve as powerful additions to other data-driven methods, like incidence time series analyses. Flexible coa-lescent models that can be adjusted to the specific epidemiological contexts are increasingly used and have, for example, revealed transmission rate heterogeneity between clusters of SARS-CoV-2 (E. Volz et al., 2021 ; E. M. Volz & Siveroni, 2018 ; Wilinski et al., 2024 ). Similarly, birth-death models provide an adjustable framework such that they can either be combined with or used to infer prevalence and incidence ( Kühnert et al., 2014 ; T. G. Vaughan & Stadler, 2025 ; Zarebski et al., 2025 ). However, infectious disease outbreaks often cannot be connected by a single phylogenetic tree to estimate the parameters-of-interest: multiple jumps of a pathogen into the host population, multiple introductions of it into the area of interest or the re-emergence of a genetic variant that impacts transmissibility result in outbreaks with complex substructure. Although substructure-aware models exist, their substantially higher computational demand restricts their practical use. Simpler models have therefore been applied more extensively. If, for example, the subpopulations form monophyletic groups, they can be joined in one analysis by assuming them to be indepen-dent realisations of the same process. Thus multiple trees that share the same parameters but are assumed to evolve independently are estimated, one for each subpopulation. This framework has been applied extensively to characterise the dispersal of several pathogens in different epidemic settings: the joint analysis of epidemiological clusters of HIV-1 infections sampled in Botswana enabled the estimation of individual reproductive numbers for each cluster by sharing the dura-tion of infectiousness and the evolutionary rate ( Novitsky et al., 2015 ). Inferences from genetic sequences of influenza virus in Basel, Switzerland, pointed to a correlation between infectivity and temperature ( Müller et al., 2020 ). Through the inference of a shared growth rate from mpox virus clusters in New York City, the epidemic has been shown to follow similar transmission patterns as other sexually transmitted diseases (Pekar, Wang, et al., 2025). By combining multiple inde-pendent introductions of SARS-CoV-2 into Switzerland in 2020 the impact of non-pharmaceutical interventions on its spread has been evaluated ( Nadeau et al., 2023 ). Several other studies have focused on the transmission of SARS-CoV-2 or HIV in specific regions, for example Washington State, Russia, Portugal and Germany ( Kühnert et al., 2018 ; Müller et al., 2021 ; Safina et al., 2022 ; M. R. Smith et al., 2021 ; Vasylyeva et al., 2019 ; Weber et al., 2023 ), as well as other pathogens ( Walas et al., 2024 ). Besides facilitating the inference of a shared parameter that is informed by multiple genetic clusters, this approach also comes with the advantage that subpopulations of smaller sizes can be included in the analysis. It has therefore not only been applied in a Bayesian setting, but also in maximum likelihood inference ( Zhukova et al., 2023 ). There are two main limitations for the multi-cluster framework described above: first, the indi-vidual subpopulations have to be known prior to the Bayesian inference. As this information often does not exist independently, the data has to be preprocessed to identify the subpopulations, for example through phylogeographic approaches or genetic cluster thresholds ( Nadeau et al., 2023 ; Novitsky et al., 2015 ; Weber et al., 2023 ). Second, the combined clusters differ, often substantially, in the characteristics that inform their tree-generating distribution and therefore also the inferred parameter. Such characteristics include the number of genetic sequences or the persistence time of the subpopulation. To assess the robustness of the shared estimate, analyses on reduced data sets or with different prior assumptions have been used (Översti et al., 2025). However, repeating the inference multiple times comes with high computational costs, particularly for data sets comprising many hundreds to thousands of genome sequences. In addition, a small number of sensitivity analyses does not provide a more general understanding of the way in which specific characteristics influence the shared estimate. This becomes particularly relevant when clusters that are characterised by different rates are combined into one analysis: without a comprehensive assessment of the influence of each cluster, the shared parameter estimate cannot be considered representative of all individual clusters. From a theoretical perspective, case influence in Bayesian inference has been studied widely ( Bradlow & Zaslavsky, 1997 ; Cook, 1986 ; Danilevicz & Ehlers, 2020 ; McCulloch, 1989 ; Millar & Stewart, 2007 ; Peruggia, 1997 ; Thomas et al., 2018 ). Several procedures have been established based on different approaches and formal definitions of influence. Fundamentally, it is also closely related to the problem of quantifying predictive accuracy of Bayesian models (see chapter 6 in McElreath, 2020 and Vehtari et al., 2016). Leave-one-out strategies or case deletion analyses, for example, study the behaviour of the shared estimate after removal of individual or groups of cases from the observed data set ( Peruggia, 1997 ; Weiss & Cook, 1992 ). Influence is then understood as the change induced in the posterior of the shared estimate when the information provided by that case is completely neglected. Practical implementations can differ in how this change is quantified and in how the case-deleted posterior distributions are calculated. Besides analytical solutions, the posterior distributions can be determined via independent approximations through Markov chain Monte Carlo (MCMC) sampling or by reweighting of the posterior sample of the full data set. While running MCMC chains is computationally intensive, reweighting approaches like importance sampling can lack in accuracy, particularly when the influence of a case is high ( Peruggia, 1997 ). To quantify difference in the sampled posterior distributions, point estimates like the first moment are, for example, compared in their absolute value or distances between the full posterior distributions are calculated, e.g. the Wasserstein distance. Alternatively to case-deletion approaches, the impact of slight perturbations in the likelihood function of the respective case can be considered: local influence techniques evaluate the distance between the full posterior distribution and the weighted version of it in the area around equality ( McCulloch, 1989 ). To quantify the distance, basic divergence measures like the Kullback-Leibler or Bregman divergence have been used, but others can equally be applied ( Danilevicz & Ehlers, 2020 ; Millar & Stewart, 2007 ). To assess the local influence of genetic clusters on a shared parameter in a Bayesian phylo-dynamic analysis, we focus in this study on the approach based on a geometric weighting of the likelihood function, introduced by Millar and Stewart, 2007 and revisited by Thomas et al., 2018 . We will first summarise the theoretical background and set out how it extends to Bayesian phylo-dynamic analysis. From this, we will suggest a simple measure that can be used to evaluate the influence each cluster has on the joint analysis, independent of the exact form of the probability dis-tributions for the data and phylogenetic tree. We will then turn our focus to birth-death-sampling models as tree-generating processes and the inference of one shared transmission rate jointly from multiple genetic clusters using the software BEAST2 ( Bouckaert et al., 2014 , 2019 ). Through the analysis of simulated data sets, we explore how cluster size, persistence and rate can practically determine the influence of a cluster. To demonstrate their complex interplay we end with pre-senting the analysis of SARS-CoV-2 genome sequences, which were collected in Germany before the implementation of the first nation-wide large-scale non-pharmaceutical intervention during the COVID-19 pandemic. 2 Results 2.1 Local influence in Bayesian phylodynamics Theoretical framework We address the problem of inferring a shared parameter ̂θ from a collection of n multiple sequence alignments (MSAs) = { D i | i = 1 , . . ., n }. Each alignment D i represents a genetic cluster composed of a set of genetic sequences. The sequences in D i correspond to the tips of a bifurcating tree T i ∈ = { T i | i = 1 , . . ., n }. In a Bayesian inference framework we are interested in the posterior distribution: where is the likelihood of defined by an evolutionary model on , parameterised by θ S , and is the probability of the trees calculated from a population dynamic model with parameters θ E . Assuming conditional independence of the genetic clusters in their evolutionary as well as population process, meaning that the elements of both and are independent identically distributed (i.i.d.), their likelihoods and tree prior distributions can be written as products over all clusters, i.e. In the following sections, we assume p i ( D i | T i , θ S ) to be the phylogenetic likelihood of tree T i , as described by Felsenstein ( Felsenstein, 1981 ), and p i ( T i | θ E ) to be derived from a birth-death-sampling model generating the phylogenetic tree T i , formulated by Stadler et al., 2012 . However, other formulations, like more complex birth-death models ( MacPherson et al., 2021 ) or coalescent-based probability distributions for the tree ( Kingman, 1982 ), can be used analogically. Depending on the specific research question, the parameter of interest ̂θ can correspond to any parameter that is shared across the processes underlying the observed data. Here, we focus on the transmission rate due to its epidemiological relevance and high prevalence in the existing relevant literature ( Müller et al., 2021 ; Nadeau et al., 2023 ; Vasylyeva et al., 2019 ; Weber et al., 2023 ). Thus, we assume ̂θ to be the transmission rate (also: lambda), which is one of the parameters of the population dynamic model, and denote all remaining model parameters by θ for brevity. We refer to the contribution of cluster i to the full posterior distribution as cluster likelihood , such that To quantify the influence of each cluster on the joint estimation of θ^, we study the extent to which small changes in its information content affect the posterior distribution. We vary the information content by applying a geometric weighting to the cluster likelihoods, thus defining the quasi-posterior distribution with ω = ( ω 1 , …, ω n ) T . Note that the integral is finite in an open neighbourhood of ω = ω eq = (1 , . . ., 1) T (Appendix B in Thomas et al., 2018 ). We then quantify the effect of the weighting using the Kullback-Leibler divergence between the original and quasi-posterior: In this approach we follow Millar and Stewart, 2007 , who suggested it for studying the influence of individual observations, and by Thomas et al., 2018 , who extended it to all observations simultane-ously. We will therefore only shortly summarise their further argument. To quantify the influence each cluster has on the joint posterior, we consider the behaviour of KL ( p ω (· | ·) , p (· | ·)) in a small neighbourhood around ω eq , i.e. the local influence . Through a local approximation of the Kullback-Leibler divergence in terms of a second-order Taylor series expansion and evaluation of the behaviour of the resulting Hessian matrix around ω eq , we find the curvature of the Kullback-Leibler divergence to be well approximated by the variance-covariance matrix of the logarithmic cluster likelihoods: with i, j ∈ 1 , . . ., n . The variance-covariance matrix of the cluster likelihoods can easily be calcu-lated using the sampled values in an MCMC analysis of the full data set. The variances, i.e. the diagonal elements of the matrix, quantify the local influence of each individual cluster on the joint analysis, while the off-diagonal elements, the covariances, reveal synergistic and directional effects, for example if two clusters pull the estimate in the same direction. In Supplementary Figure 1 and 2 we show how the variance of the cluster likelihoods, quantifying the local influence, relates to em-pirical differences between posterior distributions of cluster deletion analyses. In this comparison, we contrast both a different measure of influence (local perturbation vs. complete deletion) and a different measure of divergence between probability distributions (Kullback-Leibler divergence vs. Wasserstein distance). As such we do not necessarily expect these to yield the same values or trends; we rather illustrate how and in which cases they differ: When considering inferences from i.i.d. clusters, we find little agreement regarding the influence on the full posterior, whereas the agreement increases when considering the influence on the transmission rate only. However, we find the relationship between local influence and case-deletion to be well approximated by a linear function for analyses of non-identically distributed clusters, both for the full posterior as well as the posterior sample of the transmission rate only. In this case, the scaled variances are also substantially higher than for the i.i.d. clusters. Download figure Open in new tab Figure 1: Drivers of cluster influence given independent identically distributed clusters: Scaled variance of the 20 logarithmic cluster likelihoods for each of the 20 simulation replicates against the respective cluster height or size. Simulation replicates are indicated by colour (legend ’sim’); the black line within the grey area illustrates the smoothed trend of the data and the associated error. Plots in the left panels correspond to analyses of the simulated data when tree topology, transmission rate and evolutionary rate are estimated, plots in the right panel to those estimating only the transmission rate. In the top row only trees with 20 tips are simulated, in the second row trees with a height of 3.0 y and in the third row trees with processes running for up to 6.0 y . Download figure Open in new tab Figure 2: Drivers of cluster influence given a single non-identically distributed cluster: Scaled variance of the 20 logarithmic cluster likelihoods against the simulation replicates. The 19 i.i.d. clusters are represented by the violin plots, the cluster simulated under a higher transmission rate by the black dot. Plots in the left panel correspond to analyses of the simulated data when tree topology, transmission rate and evolutionary rate are estimated, plots in the right panel to estimating only the transmission rate. Scaling according to model complexity To make the simulation replicates in Section 2.2.1 comparable between each other, we follow Millar and Stewart, 2007 and work with the scaled variance As the magnitude of the variance depends on the model complexity, using the scaled variance also allows comparison between models of different complexity, for example when a different number of parameters is estimated. Combined influence of multiple clusters Often we are not only interested in the influence each cluster has by itself, but also in the joined influence of multiple clusters. In this case the combined influence can be summarised by the variances of the respective summed up logarithmic cluster likelihoods. Inference of multiple parameters Similarly, instead of being interested in the influence of one cluster on the entire analysis, we might mainly be concerned with its influence on one specific parameter. As assessing the influence of only one of multiple estimated parameters would require integration over all other parameters, Thomas et al., 2018 propose to reevaluate the cluster like-lihoods for the posterior sample of the parameter-of-interest with all other previously estimated parameters fixed to their estimation mean or median. In Supplementary Figure 3 we provide a comparison of the approximation to the inference of only the respective parameter. As the infer-ence of the other parameters also impacts the influence on the parameter-of-interest, we do not expect these two to be equal, but only to show similar trends, which we observe to be the case. In addition, we relate the single-parameter influence to the empirical difference induced in the poste-rior distribution of the single parameter under a case-deletion setting in Supplementary Figures 1 and 2. These, too, suggest that the approximation works well in the phylodynamic setting. 2.2 Drivers of cluster influence Utilising the above approach, we now turn to the phylodynamic inference framework implemented in the software package BEAST2 ( Bouckaert et al., 2014 ) to study local influence in a joint analysis of multiple genetic clusters. As we are interested in the inference of epidemiological parameters, particularly the transmission rate, we focus on the joint analysis of transmission clusters using birth-death-sampling models. This corresponds to the basic inference problem 1 with the probability of the sampled tree being calculated from the birth-death-sampling density introduced by Stadler et al., 2012 . We first run basic phylodynamic inferences of the transmission rate based on MCMC sampling from the posterior distribution, to then calculate the covariance matrix of all sampled logarithmic cluster likelihoods. For the influence of single clusters, we focus on the diagonal elements, i.e. the variances, while turning to the sign of the covariance matrix to understand in which direction the respective clusters pull the estimate relative to each other. In the above implementation, three main factors can influence the variance of the birth-death-sampling probability density: the number of branches, their lengths and the three rate parameters, i.e. those that describe birth/transmission, death and sampling intensity, individually. To study how these act on the variance individually and combined, we empirically explore the local influence as a function of tree height, as a proxy for the branch lengths, tree size, i.e. the number of samples or tips, which is a linear function of the number of internal nodes, and the transmission rate. 2.2.1 Simulations under identical distribution We consider 20 simulation replicates, in each we simulate 20 bifurcating trees from a birth-death-sampling process, controlling for either tree height, size or neither, and use nucleotide sequence alignments simulated along these trees to infer either the tree, evolutionary and transmission rate or only the transmission rate, respectively (estimates of the transmission rate are given in Supplementary table 1). In these settings, all 20 trees combined in one analysis are independent realisations of the same birth-death-sampling process, as assumed in the analytical formulation of the inference problem. Impact of tree height (setting I) In the first row of Figure 1 we show the scaled variance of the logarithmic cluster likelihoods ( equation 5 ) plotted against the respective tree heights. Both for the inference of tree and transmission rate (left column) and of only the transmission rate (right column), we do not detect a strong association between the two quantities. Impact of tree size (setting II) The association between tree size and scaled variance is illustrated in the second row Figure 1 . When estimating both tree and transmission rate (left column), the influence depends approximately linearly on the size of the cluster. This, however, is not the case when only the transmission rate is estimated: although smaller clusters tend to have less influence, bigger clusters do not necessarily have more. Combined impact of tree height and size (setting III) When both the tree size and the height are allowed to vary between clusters, we find a similar pattern. While inferences of both tree and transmission rate show a linear relationship between influence and size, we only observe a tendency for larger trees to have higher influence when inferring the transmission rate alone. As bigger trees are generally also higher (insets), we also see a tendency for higher influence of higher trees. Comparing the influence on the transmission rate between the individual effect of the tree size and the combined effect with height (second and third row, right column) suggests that, in these simulation settings, size-dependent effects on the transmission rate only become strong above sizes of 50 samples. In summary, we find that clusters with more samples are associated with more peaked cluster likelihoods in log space and thus higher variances. However, this effect is only strongly present when the tree itself is part of the inference, indicating that the uncertainty of the cluster likelihood is to a large part mediated by uncertainty in the tree. Equally, when we consider the influence on only the rate in an analysis in which it is estimated together with the tree, larger trees tend to be associated with higher influence, although the relationship is not definitive (Supplementary Figure 3). The tree height, however, we do not find to be strongly associated with higher influence. We also note that the scaled variances observed on identically distributed trees are generally relatively small. 2.2.2 Simulations under non-identical distribution As the analysis of real-world data often combines clusters that are, other than analytically assumed, not realisations of the same birth-death-sampling process ( Weber et al., 2023 ), we combine trees that were simulated under different transmission rates into one analysis to better understand practical applications (estimates of the transmission rate are listed in Supplementary tables 2 and 3). Impact of rate outlier (setting IV) We first consider a scenario in which one of 20 clusters grows with a higher rate. Figure 2 presents the resulting influence metric of all simulated clusters, grouped by transmission rate, against the respective simulation replicate. For the inference of both tree and transmission rate, the different transmission rate does not have a distinct effect on the variance of the posterior distribution in log space. However, on the transmission rate itself we find the cluster with higher rate to have a higher influence than the other 19 i.i.d. clusters. (see also Supplementary Figure 3). Combined impact of rate and height (setting V) Second, we evaluate how the combination of differences in rate and tree height drive the influence of individual clusters. The results are shown in the first two rows of Figure 3 . When we combine all 20 clusters with rates varying between 1.6 y − 1 and 4.4 y − 1 in one inference, the posterior distribution of the rate is not centred around the mean or median of the rates that were used for simulation, rather pulled downwards. Download figure Open in new tab Figure 3: Drivers of cluster influence given independent non-identically distributed clusters: Scaled variance of the 20 cluster likelihoods for each of the 20 simulation replicates against the transmission rate used to simulate the respective cluster (size and height shown as insets). Replicates are indicated by colour (legend ’sim’); the dashed vertical line corresponds to the mean of the rate values used for simulation, the grey area to the range in which the means of the posterior distributions fall. The first two rows show results from simulation setting V, the last two rows from setting VI; rows one and three correspond to inferences from all 20 clusters jointly (left column inferring tree, evolutionary and transmission rate and right column inferring only the latter), rows two and four to inferences of the transmission rate based only on a subset of ten clusters (left column smallest ten rates, right column highest ten rates). This is because clusters simulated under lower rates tend to have a higher influence on the shared rate. The effect is more pronounced for the influence on the transmission rate only (right column and Supplementary Figure 3), but also present for the influence on the tree and rate (left column). Clusters with high influence tend to have higher trees, suggesting that higher trees generate a more peaked log likelihood and are therefore more informative for the transmission rate than shorter trees (insets of Figure 3 ). In Supplementary Figure 4 we show that long branches are generally more informative against high rates than short branches are against low rates. In the second row of Figure 3 we shorten the range of the transmission rates used for simulation to [1.6 y − 1 , 2.5 y − 1 ] and [3.5 y − 1 , 4.4 y − 1 ], respectively, by combining only the ten low-rate (left column) and ten high-rate (right column) clusters in one analysis. In both cases, we find the posterior centre to be closer to the simulation mean, however, still slightly pulled downwards for most of the replicates. The insets of both plots illustrate how, as before, the higher influence correlates with higher trees, therefore lower simulation transmission rates. However, more pronounced than before, we see how the influence on the transmission rate is strongly driven by the distance of the simulation rate of the respective cluster to the shared estimate. Clusters that agree well with the shared rate have low individual influence on the estimate, while clusters that are less likely under the shared estimate are more influential. To illustrate how the local influence of the low- and high-rate clusters in the full analyses relates to the subset analyses in Figure 3 (second row), we calculated the grouped influence, described in Section 4.1 and plotted in Supplementary Figure 5. Despite the deletion of the low-rate group ( Figure 3 right column) leading to a bigger absolute difference in mean posterior estimates to the full analysis, the grouped influence is indistinguishable from the high-rate clusters (Supplementary Figure 5 left column). When we divide the data into four groups, we see the lowest- or highest-rate group, respectively, to generally have the highest influence. The discrepancy observed for two groups can be the result of the different likelihood reweighting approach (local influence vs. case deletion) or of the different distance measure (Kullback-Leibler divergence vs. absolute difference of mean). Considering the latter, however, it would not be surprising: the Kullback-Leibler divergence takes into account the full probability distribution, not just the behaviour around the centre, and quantifies the expectation under the reweighted posterior of the logarithmic difference between the full and reweighted posterior ( Kullback, 1959 ). In addition to the variance, we can consider the covariance of the cluster likelihoods to indicate the direction in which each cluster pulls the estimate. In Supplementary Figure 6 we colour the variances of Figure 3 by their respective covariance sign, i.e. the pull direction. In both the first and second row, the pull direction perfectly aligns with the position of the simulation rate of each cluster relative to the shared estimate. Combined impact of rate, size and height (setting VI) Finally, to include the effect of variable cluster sizes, we simulate birth-death-sampling trees with rates as above, allowed to grow until a maximum of 2.75 y . Similarly as for the i.i.d. trees, the size has a strong impact on the influence of each cluster ( Figure 3 row three and four). As expected, trees that were simulated under higher transmission rates tend to generate bigger and higher trees (insets). These also show the highest influence values, particularly when estimating both the transmission rate and the tree. As before, when only the influence on the transmission rate is considered (right column), we find big clusters with low influence. This can, for example, be caused by the impact of the true transmission rate, as clusters that agree well with the shared estimate have lower influence. Nevertheless, the posterior centre of the transmission rate is shifted to much higher values than in the previous simulation settings, with values ranging around 4.0 y − 1 . To reduce the variability of the joined clusters, we again run the same analyses with only the 10 low-rate and 10 high-rate clusters, respectively ( Figure 3 row four). For the low-rate group (left column), the strongest impact on the cluster influence seems to be mediated by the height of the tree (insets). However, for some of the simulation replicates the transmission rate is overestimated, as the centre of the posterior distribution is higher than the highest transmission rate used for simulation. As the cluster size in this setting ranges between 2 and 7 sequences, the overestimation of the rate can be the result of combining many likelihoods with very small log variance. For the high-rate group, we find a similar pattern as for the full data, with the shared estimate being pulled to values slightly higher than the simulation-mean by the high-rate trees. Nevertheless, when considering the influence by group in the full analysis, we cannot detect strong differences between the low-and high-rate clusters; only when separating the clusters into four groups (Supplementary Figure 5). In contrast to the previous simulations, the pull direction, coloured in Supplementary Figure 6, does not perfectly align with the relative position of the respective cluster to the shared estimate. However, this is particularly pronounced for the analysis of the low-rate group, suggesting that the small clusters mediate this effect. We, for example, find multiple clusters simulated under smaller rates to pull the shared estimate upward – which in turn can explain the overestimation. Taken together, if clusters that were simulated under different transmission rates, are joined in one analysis, the less the cluster agrees with the shared estimate, the higher the influence on it is. This influence is additionally impacted by the size and persistence of the cluster, with bigger and longer persistent clusters often mediating higher influence. For the shared estimate this means that a complex interplay of the properties of the combined clusters relative to each other determines its value. This value then represents the probabilistically most plausible description of the combined clusters. 2.3 Transmission dynamics of SARS-CoV-2 clusters To illustrate which insights can be practically gained through the assessment of cluster influence, we further inferred a shared reproductive number from pathogen genome sequences. More precisely, we analysed clusters of identical SARS-CoV-2 genomes sampled in Germany before the first nation-wide strict non-pharmaceutical interventions were implemented. We chose SARS-CoV-2, as most studies presenting shared Bayesian estimates so far focus on this virus. We restricted our analyses to sequences sampled in Germany before the first lockdown measures to obtain a data set of sensible size that allows the inference of one time-constant reproductive number. We focus on clusters of identical sequences, as these present the highest resolution of transmission chains that genome sequences can directly provide. Practically, this also means that the data for the phylogenetic likelihood is little informative and the analysis should be mainly driven by the sampling times through the birth-death-sampling likelihood. As commonly described for fast spreading pathogens ( Tsui et al., 2025 ), we find a power law-like distribution of cluster sizes, with many clusters consisting of less than five sequences and few of more than 30. Joining all clusters from which a phylogenetic tree can be constructed, that is those that contain at least two sequences, into one analysis, we inferred one shared, time-constant reproductive number. We estimate a ragged posterior density with a mean of 2.80 and a highest posterior density interval (HPDI) of [2.60, 2.91] ( Figure 4 row 1 column 1). From the previous simulations, we expect this inference to be mainly driven by the biggest cluster, containing 290 sequences. Indeed, we find it to have the highest local influence ( Figure 4 row 1 column 2). Other than expected, we also discover that the smallest clusters included in the analysis, containing only two sequences, have a high influence on the shared reproductive number. When grouped together, their influence even substantially exceeds the influence of the biggest cluster (Supplementary Figure 7). However, not all two-sequence clusters are associated with higher influence; the high-influence subset exclusively consists of clusters for which both sequences were sampled on the same day, such that the corresponding phylogenetic trees are ultrametric ( Figure 4 row 1 column 2). Inspecting the MCMC chain of the parameters associated with these clusters, we see that progressively small tree heights are sampled, as they increase the birth-death-sampling likelihood. For the smallest heights, though, the likelihood becomes ragged and non-smooth, such that it jumps between three likelihood modes for all sampled values of the reproductive number (Supplementary Figure 8). For the two ultrametric clusters that are not associated with increased influence, one with two and one with three sequences, the sampled tree heights do not extend to such small values and the sampled likelihood behaves smoothly. As this is indicative of numerical instability due to the extremely small tree heights, the high influence is likely a numerical artefact potentially leading to biases in the shared estimate. We note, however, that we condition the birth-death-sampling probability density on the time of the first transmission event, that is the most recent common ancestor in the phylogenetic tree, rather than on the beginning of the process. In analyses using the latter conditioning this artefact might not be present for ultrametric two-tip trees. In addition, we did not observe similar behaviour in our simulations; this is unsurprising as under the simulation conditions the temporal resolution is higher such that sampled tips are very unlikely to fall on the exact same time point and clusters with truly ultrametric trees are therefore not included. Indeed, when excluding all ultrametric trees, the estimated posterior distribution of the reproductive number is unimodal, with a clear peak around the mean value 2.73 and the HPDI covering the interval [2.58, 2.89] ( Figure 4 row 2 column 1). The assessment of the local influence for all clusters reveals that now the small clusters have little influence on the estimate, individually as well as grouped together (Supplementary Figure 7). Instead, the two biggest clusters have the highest influence, pulling the estimate in different directions ( Figure 4 row 2 column 2). To determine in which absolute direction the clusters pull the estimate, we ran a third analysis excluding the biggest cluster ( Figure 4 row 3). We find the resulting posterior distribution of the reproductive number to be broader and pulled downwards, with a median of 2.54 and HPDI of [2.31, 2.77]. This suggests that the biggest cluster with 290 sequences, as most other clusters, pulls the estimate to higher values while the second biggest with 49 sequences is more likely under lower reproductive numbers. This agrees well with previous phylodynamic analyses of the latter cluster, which support that after an initial superspreading event in the district of Heinsberg in mid-February 2020 the reproductive number fell to values below 1 during the time directly after the event until the end of April ( Översti et al., 2025 ), when strict lockdown measures were already implemented regionally. Taken together, the assessment of the local influence of all joined clusters does not only reveal to what degree the clusters agree about the shared estimate, but can also highlight potentially biasing model and software artefacts. Download figure Open in new tab Figure 4: Shared estimate of the reproductive number from SARS-CoV-2 clusters and their individual influence: The left column depicts the sampled posterior distribution of the reproductive number. In the right column the variance of the logarithmic cluster likelihood marginalised for the transmission rate, i.e. the local influence on that rate only, is plotted against the size of the clusters. The colouring indicates the direction in which the cluster pulls the estimate, i.e. the sign of the covariance. Points that are shaped as triangles correspond to ultrametric clusters, that is clusters consisting only of sequences sampled on the same day, while circles to all others. The first row shows results from the analysis of all 59 clusters, the second of all 41 non-ultrametric trees and the third with all ultrametric trees as well as the biggest one excluded. 3 Discussion The previous years have seen many applications of Bayesian phylodynamic models to the problem of estimating one shared parameter from multiple transmission clusters, assuming them to be conditionally independent realisations of the same process ( Kühnert et al., 2018 ; Müller et al., 2020 , 2021 ; Nadeau et al., 2023 ; Safina et al., 2022 ; M. R. Smith et al., 2021 ; Vasylyeva et al., 2019 ; Weber et al., 2023 ). As clusters of varying information content are joined together, it is important to understand to which extent the shared estimate is representative of them. In addition, Bayesian phylodynamic models are very complex structures that have been shown to be sensitive to various violations of model assumptions ( Louca et al., 2021 ; Möoller et al., 2018; Översti et al., 2023 ; E. M. Volz & Frost, 2014 ). It is therefore particularly relevant to question and validate the inference results. To this end, we have presented a mathematical framework that can be used to easily quantify the information content that each cluster provides for the shared estimate, through the local influence of its likelihood functions. We have applied this framework to synthetic as well as real data to evaluate the formal drivers of influence, as considered here, and illustrated the practical insights that this framework allows. Focusing on birth-death-sampling models as tree generating processes, we find the shared estimate to be an average over all clusters that is relatively weighted by the certainty that each cluster provides about the parameter. Bigger sampling sizes and longer persistence of the cluster are generally associated with higher certainty and therefore higher influence on the joint analysis. The difference of the cluster-defining parameter to the shared estimate additionally mediates higher influence. All three characteristics strongly interact in determining the peakedness of its logarithmic cluster likelihood. The tree size, for example, can itself be informative about the birth-death-sampling parameters ( MacPherson & Pennell, 2024 ). As this relative interplay is not trivial to predict, it is important to assess it on a case-by-case basis, for example through the local influence of each clusters presented herein. Our results have important implications for the interpretation of shared estimates of the trans-mission rate in epidemiological contexts. Contrary to what is often assumed, the shared estimate does not correspond to a value that can be directly interpreted on the population-level but rather to an abstract value that is generated through the mathematical integration of uncertainties. Sim-ulation setting II, which isolates the impact of the tree size, for example, demonstrates how the shared estimate is not equal to the cluster mean or median, as clusters of bigger sizes pull the esti-mate towards values that are more probable under their likelihoods. The shared estimate neither corresponds to the population mean or median, as illustrated by simulation setting V, in which clusters with longer persistence times have higher influence on the inference, given the same restrictions on the population size, purely due to them being stochastically more informative against specific value ranges. Instead, it will depend on the mathematical formulation of the inference problem, the specific setting and the relative interactions of all clusters included in the analysis, which epidemiological population-level entity, if any, the shared estimate best matches. Instead of estimating a shared rate, especially in the likely presence of rate heterogeneity between clusters, it is therefore often more sensible to employ models that can account for differences in the respective rates: hierarchical models can, for example, be used to constrain the rate of each cluster by a governing distribution ( Edo-Matas et al., 2010 ; Taouk et al., 2024 ). Alternatively, individual rates can be inferred for each cluster and the resulting distributions merged in another post-inference Bayesian model. Comparisons of shared estimates of the reproductive number with quantifications of it from other models therefore have to be undertaken with care, as a direct comparison usually is not possible. As for the reproductive number, in particular, multiple definitions exist with slight differences in the population-level interpretation, the inter-comparability problem has already been discussed for other methods ( Brockhaus et al., 2023 ). Influence assessment can serve as a first step in better understanding in which way which parts of the analysed data set inform the inference, particularly because it is computationally undemanding to calculate for any MCMC-based sample from a posterior distribution. Thus, we recommend using it as a default diagnostic tool in joint Bayesian phylodynamic analyses. For other applications, like multi-level mediation models or regression analysis similar frameworks have allowed the identification of influential observations in specific model layers or data subtypes ( Thomas et al., 2018 ; Věcěrováa et al., 2020). Our analyses of SARS-CoV-2 data demonstrate how employing it as a diagnostic tool does not only provide information about the epidemiological relevance of the clusters but can also hint towards computational artefacts. It is, nevertheless, important to keep in mind what exactly our framework assesses: Influence is defined by the difference that a geometric reweighting of the cluster likelihoods causes on the posterior distribution, relative to the unweighted posterior. This local weighting of the cluster likelihood in a phylodynamic posterior distribution is not equivalent to a its complete deletion. Indeed, cluster-deleted analyses are therefore not directly comparable to local influence assessment, as we illustrate in Figure 4 . In addition, there is no perfect way to measure the difference between two probability distributions. In our approach we follow Millar and Stewart, 2007 and Thomas et al., 2018 by calculating it as the Kullback-Leibler divergence, which can be understood as the relative lack of information provided by one distribution about the other. Importantly, it is not symmetric, meaning that the divergence between distribution A and B does not match the divergence between B and A (see Kullback, 1959 for a general derivation of the divergence score and McElreath, 2020 for a discussion of its use in Bayesian predictive accuracy). Symmetric measures of distance between distributions, as the Wasserstein distance, could alternatively be considered. In Supplementary Figures 1 and 2, we exemplify how both decisions, influence definition and difference measure, can impact the quantification of influence. As for the empirical results that we provide using birth-death-sampling models as tree gener-ating processes in Bayesian phylodynamics, these have to be qualified by their specific contexts. Influence assessment is by definition highly relative and as such our decisions about the simulation settings will have a strong impact on our conclusions. In our simulations we opted to analyse a set of 20 clusters that varied in size between two and multiple hundred sequences and were generated by a birth rate between 2 y − 1 and 4 y − 1 , a death rate of 1 y − 1 and a sampling rate of 0.01 y − 1 . We chose these values for computational convenience, as multiple simulation replicates on a signifi-cantly bigger data set would take much more compute power, and due to their comparability to similar studies ( Boskova et al., 2014 ). Different settings, however, might lead to different conclu-sions, although we expect the same statistical principle described above. Additionally, we only analysed trees generated by time-constant rates, while practical applications often involve much more complex models with the transmission rate, for example, changing tens to hundreds of times. As especially the influence of a single of multiple estimated parameters is based only on an approx-imation, it is likely to perform less good the more parameters are estimated. Nevertheless, in this study we opted for the simpler case to better isolate individual effects, like the numerical issues associated with ultrametric two-tip trees shown in Figure 4 . As the basic influence measure applies for different models, we do expect similar patterns when other models, like coalescent-based prior distributions on the tree, are used: the influence of each observation will be driven by its degree of disagreement with the other observations as well as general factors that increase informativeness like sample size. In our case, sample size could easily be measured by tree size, i.e. tips in the tree, as only one underlying population is modelled. For multitype models, which combine several pop-ulations in one tree, sample size will be determined by a combination of the number of branches and nodes assigned to the respective types. In addition, if number and timing of type change events in the tree are not fixed but inferred, the sample size is not a constant property anymore but will change during the MCMC analysis. For inference problems, in which observations are not conditionally independent, further research is needed. In summary, the presented method can serve as a convenient tool to evaluate inferences of shared parameters from multiple clusters. In this study, we focused on the inference of popula-tion dynamic parameters from a birth-death-sampling model, but the analytical framework can straightforwardly be extended to other parameters, like the evolutionary rate of the phylogenetic likelihood. Similarly, other inference settings can also be analysed: the main assumption of the method is conditional independence of the observations, which is also fulfilled by individual sites in an alignment under Felsenstein’s likelihood ( Felsenstein, 1981 ). Extensions of influence assessment on groups of genomic sites could, for example, highlight those that drive our inference of evolu-tionary rates or those that might be in disagreement with the signal from other genomic positions, therefore pointing towards important evolutionary patterns. 4 Methods 4.1 Local influence in Bayesian phylodynamics Local influence We calculated the variance-covariance matrix of posterior samples of the log-arithmic cluster likelihoods in the software environment R 4.4.2 ( R Core Team, 2024 ) using the graphical user interface RStudio 2024.12.0+467. To evaluate the quality of the posterior samples we applied functions from the packages boa 1.1.8-2 (B. J. Smith, 2007 ) and coda v0.19-4.1 (Plum-mer et al., 2006), for parsing of simulated trees packages ape 5.8-1 ( Paradis & Schliep, 2019 ) and phytools v2.4-4 ( Revell, 2024 ) and for plotting of the results packages from the tidyverse collection v2.0.0 ( Wickham et al., 2019 ). Inference of multiple parameters To approximate the influence of a single parameter in a BEAST2.7.7 ( Bouckaert et al., 2019 ) analysis when inferring multiple parameters, as discussed in Section 2.1 , we recalculated the log cluster likelihoods for all posterior samples of the parameter-of-interest while fixing all other inferred parameters to their median estimate. The posterior sample of phylogenetic trees we summarised by the maximum clade credibility (MCC) tree with median heights. The MCC trees were reconstructed with treeannotator, which is part of the BEAST2.7.7 ( Bouckaert et al., 2019 ) distribution, while the cluster likelihoods were recalculated with the Log-FileIterator class, included in the package feast v10.4.0 (T. Vaughan, 2025 ). 4.2 Drivers of cluster influence 4.2.1 Simulations of identically distributed clusters Setting I: Impact of tree height We simulated bifurcating trees distributed according to a birth-death-sampling process with birth rate λ = 2 per year ( y − 1 ), death rate µ = 1 y − 1 and sampling rate ψ = 0.01 y − 1 in the software BEAST2.7.7 ( Bouckaert et al., 2019 ) with functionalities included in the package ReMaster v2.7.2 (T. G. Vaughan, 2024 ). To study the impact of the tree height on local influence, we controlled for the tree size by setting as simulation end condition 20 sampled tips. We generated a set of 20 trees for each of 20 simulation replicates. Along all trees we simulated sequences of 5000 nucleotides in length under the Jukes-Cantor-69 (JC69)( Jukes & Cantor, 1969 ) model with a rate of 0.002 substitutions per site per year ( ss − 1 y − 1 ). Using these nucleotide sequence alignments we set up two Bayesian inferences in BEAST 2.7.7 with the package BDSKY v1.5.1 ( Stadler et al., 2012 ) for each replicate: i) estimating the tree, the transmission rate as well as the evolutionary rate and ii) estimating only the transmission rate. For both inferences, we fixed the remaining parameters to the true value, conditioned the birth-death-sampling process on the root node and set LN (0.0, 1.25 2 ) as prior distribution on the transmission rate and U (0, 1) on the evolutionary rate, when applicable. We approximated the posterior distribution of the parameters-of-interest through MCMC-based sampling from the respective distributions, running the chain for 1.5 ∗ 10 7 and 5 ∗ 10 5 steps, sampling every 10000th and 1000th iteration, respectively. We further approximated the influence only on the transmission rate in inference i by recalculating the birth-death-sampling likelihood for each posterior sample of the transmission rate with the trees fixed to the respective MCC trees with median heights, as described in Section 2.1 . Setting II: Impact of tree size To evaluate the isolated influence of the tree size on local influence, we controlled for the tree height while allowing for variation in the number of sampled tips. We thus simulated, in the same software as above, 20 replicates of 20 trees generated by a birth-death-sampling process with birth rate 2 y − 1 , death rate 1 y − 1 and sampling rate 1.01 y − 1 . To keep the tree height comparable between trees and replicates, we enforced a transmission event to happen at time 0.0 y and 3.0 y while setting a maximum simulation time of 3.0 y . Again, we simulated 5000 nucleotide long sequences along each tree under the JC69 model with a rate of 0.002 ss − 1 y − 1 and used these sequences as input data in two Bayesian inference settings: i) estimating the tree, transmission rate as well as evolutionary rate and ii) estimating only the transmission rate. Setting III: Combined impact of tree height and size We studied the combined local influence of tree height and size by simulations of birth-death-processes with birth rate 2 y − 1 , death rate 1 y − 1 and sampling rate 0.01 y − 1 and a maximum simulation time of 6.0 y , conditioned on a minimum of two samples. We set up the simulation replicates and Bayesian analyses analogously to both previous sections. 4.2.2 Simulations of non-identically distributed clusters Setting IV: Impact of transmission rate We evaluated the impact of differences in the tree generating process between jointly analysed clusters by simulating trees generated under different transmission rates. We note that in the following simulation settings the tree height is also not controlled for. First, we focused on the impact of a single transmission rate outlier: We simulated 19 trees from a birth-death-sampling process with birth rate 2 y − 1 , death rate 1 y − 1 and sampling rate 0.01 y − 1 and one cluster with identical death and sampling rate but with a birth rate of 3 y − 1 . We kept the size of all clusters constant by stopping the simulation when 20 samples were generated. The nucleotide sequences simulation and Bayesian inference were again set up as described above, notably with two inference settings: estimating i) the tree, evolutionary rate and transmission rate and ii) only the transmission rate. Setting V: Combined impact of transmission rate and tree height For the 20 replicates of 20 tree simulations, we set the death rate to 1 y − 1 , the sampling rate to 0.01 y − 1 and the transmission rate to a value between 1.6 y − 1 and 2.5 y − 1 , in steps of 0.1, for 10 trees as well as between 3.5 y − 1 and 4.4 y − 1 , again in steps of 0.1 for further 10 trees for each replicate; such that each tree in one simulation replicate is characterised by a unique transmission rate. Keeping the tree size constant, we used as simulation end condition the generation of 20 samples. As for the previous four settings, we simulated nucleotide sequences along these trees, which we used as input in a Bayesian inference of i) the tree, transmission and evolutionary rate from all 20 clusters, ii) the transmission rate from all 20 clusters and iii) the transmission rate only from the the ten clusters simulated under lowest and highest rate, respectively. Setting VI: Combined impact of transmission rate, tree height and size As previously, we generated 20 trees for 20 replicates. Each tree was simulated from a process with death rate 1 y − 1 , sampling rate 0.01 y − 1 and individual transmission rate between 1.6 y − 1 and 2.5 y − 1 and between 3.5 y − 1 and 4.4 y − 1 . To also allow for variation in tree size, the simulations were stopped after 2.75 y or upon depletion of the infected population. Sequence alignment simulation and Bayesian inferences were also set up as in simulation setting V. 4.3 Transmission dynamics of SARS-CoV-2 clusters We collected all SARS-CoV-2 genomes from GISAID, the global data science initiative ( Shu & McCauley, 2017 ), on January 19th 2025 that were sampled in Germany until March 22nd 2020, when the first nation-wide strict non-pharmaceutical interventions (’lockdown’) were started (Sup-plementary table 4). We restricted the search to include only complete, high coverage genomes with complete collection date information. This resulted in a data set of 826 SARS-CoV-2 genome sequences, which we aligned to the reference genome with GenBank-ID MN908947.3 ( Wu et al., 2020 ) in mafft v7.490 ( Katoh & Standley, 2013 ) with the alignment method chosen automatically and the alignment length kept constant. Sites that have been suggested not to contain phylogenetic signal (W, 2025), we replaced with an uninformative ’N’ in all sequences. We identified clusters as groups of identical genome sequences with minimal size of two in R 4.4.2 ( R Core Team, 2024 ) with functionality from the packages ape ( Paradis & Schliep, 2019 ) and tidyverse ( Wickham et al., 2019 ), more precisely by calculating pairwise distances using the function dist.dna without speci-fication of a substition model and pairwise deletion set to false to avoid sequences being added to multiple clusters. This left us with 582 sequences in 59 clusters of a minimal cardinality of two, the biggest of which consisted of 290 sequences while most others contained less than five. We next set up a Bayesian inference in the software BEAST2.7.7 ( Bouckaert et al., 2019 ) of one shared effective reproductive number R e from all clusters together with their individual phylodynamic trees. For this, we defined independent tree likelihoods for all clusters, calculated under a strict clock model with fixed rate 8 ∗ 10 − 4 ss − 1 y − 1 and the Hasegawa-Kishino-Yano-85 (HKY85) ( Hasegawa et al., 1985 ) substitution model with estimated base frequencies and transition/transversion ratio. The prior distribution for each tree we specified to be the birth-death-skyline model from the package BDSKY v1.5.1 ( Stadler et al., 2012 ) with the rate to become non-infectious δ fixed to 36.5 y − 1 while the reproductive number R e and sampling proportion s were estimated and shared between all trees. We used the epidemiological, in contrast to the rate parameterisation in the simulation analyses, as its quantities can be directly interpreted while arising through the simple reparameterisation: , with λ, µ, ψ being the birth/transmission, death and sampling rate, respectively. We finalised the Bayesian model by defining prior probability distributions for all hyperparameters: the reproductive number was constrained by a LN (0.0, 1.25 2 ) distribution, the sampling proportion by U (0.0, 0.1), the base frequencies by a standard uniform distribution and the transition/transversion ratio by LN (1.0, 1.25 2 ). We sampled from the posterior distribution via a MCMC algorithm with 5 ∗ 10 8 steps, storing every 10 4 th step for all parameters except the phylodynamic trees, which were stored every 7.5 ∗ 10 4 th step. We assessed convergence of the chain in R with the package coda ( Plummer et al., 2006 ), running chains for which any estimated pa-rameter did not yet have an effective sample size above 150 for another 2.5 ∗ 10 8 steps. In addition, we ran two independent chains with different seeds of the random number generator and validated that both converged to similar values. To approximate the influence on a single parameter, we proceeded as described above. We further include two analyses with slightly different data sets. First, with all clusters that contain only sequences sampled on the same day removed, second with the biggest cluster additionally excluded. For both data sets we set up the Bayesian inference as described for the full data set and proceeded with the influence assessment analogously. Data Availability The analyses of SARS-CoV-2 data are based on 826 sequences and their associated metadata available on GISAID up to January 19th 2025, via gisaid.org/EPI_SET_250119mc and under doi.org/10.55876/gis8.250119mc. Scripts, input files and the raw data shown in figures of the main text are published as a data set under https://doi.org/10.17617/3.DNLGXJ . https://doi.org/10.17617/3.DNLGXJ https://gisaid.org/EPI_SET_250119mc 5 Data availability The analyses of SARS-CoV-2 data presented in Section 2.3 are based on 826 sequences and their associated metadata available on GISAID up to January 19th 2025, via gisaid.org/EPLSET250119mc and under doi.org/10.55876/gis8.250119mc. Scripts, input files and the raw data shown in figures of the main text are published as a data set under https://doi.org/10.17617/3.DNLGXJ . 6 Author contributions Conceptualization: AW, RB, SD Data curation: AW Formal analysis: AW Investigation: AW Methodology: AW, RB, SD Software: AW Supervision: AW, SD Validation: AW, RB, SD Visualization: AW, RB Writing – original draft: AW, RB Writing – review & editing: AW, RB, SD 7 Conflicts of interest The authors declare no conflicts of interest. 8 Funding Financial support of this work came from the Inception program (SD: Investissement d’Avenir grant ANR-16-CONV-0005433), the Australian National Health and Medical Research Council (SD: grant number 2017284) and the Max Planck Society (RB). Acknowledgements We gratefully acknowledge all data contributors, i.e., the Authors and their Originating laborato-ries responsible for obtaining the specimens, and their Submitting laboratories for generating the genetic sequence and metadata and sharing via the GISAID Initiative, on which this research is based. References 1. ↵ Álvarez-Carretero , S. , Tamuri , A. U. , Battini , M. , Nascimento , F. F. , Carlisle , E. , Asher , R. J. , Yang , Z. , Donoghue , P. C. J. , & dos Reis , M. ( 2021 ). A species-level timeline of mammal evolution integrating phylogenomic data . Nature , 602 ( 7896 ), 263 – 267 . doi: 10.1038/s41586-021-04341-1 (cit. on p. 2). OpenUrl CrossRef PubMed 2. ↵ Boskova , V. , Bonhoeffer , S. , & Stadler , T . ( 2014 ). Inference of epidemiological dynamics based on simulated phylogenies using birth-death and coalescent models (K. Koelle, Ed.) . PLoS Computational Biology , 10 ( 11 ), e1003913 . doi: 10.1371/journal.pcbi.1003913 (cit. on p. 14). OpenUrl CrossRef PubMed 3. ↵ Bouckaert , R. , Heled , J. , Kühnert , D. , Vaughan , T. , Wu , C.-H. , Xie , D. , Suchard , M. A. , Rambaut , A. , & Drummond , A. J. ( 2014 ). Beast 2: A software platform for bayesian evolutionary analysis (A. Prlic , Ed.). PLoS Computational Biology , 10 ( 4 ), e1003537 . doi: 10.1371/journal.pcbi.1003537 (cit. on pp. 4, 7). OpenUrl CrossRef 4. ↵ Bouckaert , R. , Vaughan , T. G. , Barido-Sottani , J. , Duchêne , S. , Fourment , M. , Gavryushkina , A. , Heled , J. , Jones , G. , Kühnert , D. , De Maio , N. , Matschiner , M. , Mendes , F. K. , Müller , N. F. , Ogilvie , H. A. , du Plessis , L. , Popinga , A. , Rambaut , A. , Rasmussen , D. , Siveroni , I. , . . . Drummond , A. J. ( 2019 ). Beast 2.5: An advanced software platform for bayesian evolutionary analysis (M. Pertea, Ed.) . PLOS Computational Biology , 15 ( 4 ), e1006650 . doi: 10.1371/journal.pcbi.1006650 (cit. on pp. 4, 15, 17). OpenUrl CrossRef PubMed 5. ↵ Bradlow , E. T. , & Zaslavsky , A. M . ( 1997 ). Case influence analysis in bayesian inference . Journal of Computational and Graphical Statistics , 6 ( 3 ), 314 – 331 . doi: 10.1080/10618600.1997.10474745 (cit. on p. 3). OpenUrl CrossRef 6. ↵ Brockhaus , E. K. , Wolffram , D. , Stadler , T. , Osthege , M. , Mitra , T. , Littek , J. M. , Krymova , E. , Klesen , A. J. , Huisman , J. S. , Heyder , S. , Helleckes , L. M. , an der Heiden , M. , Funk , S. , Abbott , S. , & Bracher , J. ( 2023 ). Why are different estimates of the effective reproductive number so different? a case study on covid-19 in germany (E. H. Lau , Ed.). PLOS Compu-tational Biology , 19 ( 11 ), e1011653 . doi: 10.1371/journal.pcbi.1011653 (cit. on p. 13). OpenUrl CrossRef 7. ↵ Cook , R. D . ( 1986 ). Assessment of local influence . Journal of the Royal Statistical Society Series B: Statistical Methodology , 48 ( 2 ), 133 – 155 . doi: 10.1111/j.2517-6161.1986.tb01398.x (cit. on p. 3). OpenUrl CrossRef 8. ↵ Danilevicz , I. M. , & Ehlers , R. S . ( 2020 ). Bayesian influence diagnostics using normalized functional bregman divergence . Communications in Statistics - Theory and Methods , 51 ( 6 ), 1637 – 1652 . doi: 10.1080/03610926.2020.1764583 (cit. on pp. 3, 4). OpenUrl CrossRef 9. ↵ Edo-Matas , D. , Lemey , P. , Tom , J. A. , Serna-Bolea , C. , van den Blink , A. E. , van ’t Wout , A. B. , Schuitemaker , H. , & Suchard , M. A. ( 2010 ). Impact of ccr5delta32 host genetic background and disease progression on hiv-1 intrahost evolutionary processes: Efficient hypothesis testing through hierarchical phylogenetic models . Molecular Biology and Evolution , 28 ( 5 ), 1605 – 1616 . doi: 10.1093/molbev/msq326 (cit. on p. 13). OpenUrl CrossRef PubMed Web of Science 10. ↵ Felsenstein , J . ( 1981 ). Evolutionary trees from dna sequences: A maximum likelihood approach . Journal of Molecular Evolution , 17 ( 6 ), 368 – 376 . doi: 10.1007/bf01734359 (cit. on pp. 5, 14). OpenUrl CrossRef PubMed Web of Science 11. ↵ Felsenstein , J. ( 2004 ). Inferring phylogenies [Literaturverz. S. 595 - 644]. Sinauer Assoc. (Cit. on p. 2). 12. ↵ Gelman , A. , Carlin , J. B. , Stern , H. S. , Dunson , D. B. , Vehtari , A. , & Rubin , D. B . ( 2013 , Novem-ber) . Bayesian data analysis. Chapman; Hall/CRC . doi: 10.1201/b16018 (cit. on p. 2). OpenUrl CrossRef 13. ↵ Hasegawa , M. , Kishino , H. , & Yano , T.-a. ( 1985 ). Dating of the human-ape splitting by a molecular clock of mitochondrial dna . Journal of Molecular Evolution , 22 ( 2 ), 160 – 174 . doi: 10.1007/bf02101694 (cit. on p. 17). OpenUrl CrossRef PubMed Web of Science 14. ↵ Jukes , T. H. , & Cantor , C. R. ( 1969 ). Evolution of protein molecules . In Mammalian protein metabolism (pp. 21 – 132 ). Elsevier . doi: 10.1016/b978-1-4832-3211-9.50009-7 (cit. on p. 15). OpenUrl CrossRef 15. ↵ Katoh , K. , & Standley , D. M . ( 2013 ). Mafft multiple sequence alignment software version 7: Im-provements in performance and usability . Molecular Biology and Evolution , 30 ( 4 ), 772 – 780 . doi: 10.1093/molbev/mst010 (cit. on p. 17). OpenUrl CrossRef PubMed Web of Science 16. ↵ Kendall , D. G . ( 1948 ). On the generalized “birth-and-death” process . The Annals of Mathematical Statistics , 19 ( 1 ), 1 – 15 . doi: 10.1214/aoms/1177730285 (cit. on p. 2). OpenUrl CrossRef 17. ↵ Kingman , J . ( 1982 ). The coalescent . Stochastic Processes and their Applications , 13 ( 3 ), 235 – 248 . doi: 10.1016/0304-4149(82)90011-4 (cit. on pp. 2, 5). OpenUrl CrossRef 18. ↵ Kocher , A. , Papac , L. , Barquera , R. , Key , F. M. , Spyrou , M. A. , Hübler , R. , Rohrlach , A. B. , Aron , F. , Stahl , R. , Wissgott , A. , van Bömmel , F. , Pfefferkorn , M. , Mittnik , A. , Villalba-Mouco , V. , Neumann , G. U. , Rivollat , M. , van de Loosdrecht , M. S. , Majander , K. , Tukhba-tova , R. I. , . . . Kühnert , D. ( 2021 ). Ten millennia of hepatitis b virus evolution . Science , 374 ( 6564 ), 182 – 188 . doi: 10.1126/science.abi5658 (cit. on p. 2). OpenUrl CrossRef PubMed 19. ↵ Kühnert , D. , Kouyos , R. , Shirreff , G. , Pečerska , J. , Scherrer , A. U. , Böni , J. , Yerly , S. , Klimkait , T. , Aubert , V. , Günthard , H. F. , Stadler , T. , & Bonhoeffer , S. ( 2018 ). Quantifying the fitness cost of hiv-1 drug resistance mutations through phylodynamics (A. Stern, Ed.) . PLOS Pathogens , 14 ( 2 ), e1006895 . doi: 10.1371/journal.ppat.1006895 (cit. on pp. 3, 12). OpenUrl CrossRef PubMed 20. ↵ Kühnert , D. , Stadler , T. , Vaughan , T. G. , & Drummond , A. J. ( 2014 ). Simultaneous reconstruc-tion of evolutionary history and epidemiological dynamics from viral sequences with the birth–death sir model . Journal of The Royal Society Interface , 11 ( 94 ), 20131106 . doi: 10.1098/rsif.2013.1106 (cit. on p. 2). OpenUrl CrossRef PubMed 21. ↵ Kullback , S . ( 1959 ). Information theory and statistics [Includes bibliography]. Wiley [u.a.] (Cit. on pp. 9, 13). 22. ↵ Louca , S. , McLaughlin , A. , MacPherson , A. , Joy , J. B. , & Pennell , M. W . ( 2021 ). Fundamental identifiability limits in molecular epidemiology (K. Crandall , Ed.). Molecular Biology and Evolution , 38 ( 9 ), 4010 – 4024 . doi: 10.1093/molbev/msab149 (cit. on p. 12). OpenUrl CrossRef PubMed 23. ↵ MacPherson , A. , Louca , S. , McLaughlin , A. , Joy , J. B. , & Pennell , M. W . ( 2021 ). Unifying phylogenetic birth–death models in epidemiology and macroevolution (J. Albert, Ed.) . Systematic Biology , 71 ( 1 ), 172 – 189 . doi: 10.1093/sysbio/syab049 (cit. on p. 5). OpenUrl CrossRef PubMed 24. ↵ MacPherson , A. , & Pennell , M . ( 2024 ). The untapped potential of tree size in reconstructing evolutionary and epidemiological dynamics . doi: 10.1101/2024.06.07.597929 (cit. on p. 12). OpenUrl Abstract / FREE Full Text 25. ↵ McCulloch , R. E . ( 1989 ). Local model influence . Journal of the American Statistical Association , 84 ( 406 ), 473 – 478 . doi: 10.1080/01621459.1989.10478793 (cit. on pp. 3, 4). OpenUrl CrossRef Web of Science 26. ↵ McElreath , R . ( 2020 ). Statistical rethinking: A bayesian course with examples in r and stan ( Second edition ). Chapman & Hall/CRC , (cit. on pp. 3, 13). 27. ↵ Millar , R. B. , & Stewart , W. S . ( 2007 ). Assessment of locally influential observations in bayesian models . Bayesian Analysis , 2 ( 2 ). doi: 10.1214/07-ba216 (cit. on pp. 3–6, 13). OpenUrl CrossRef 28. Möller , S. , du Plessis , L. , & Stadler , T. ( 2018 ). Impact of the tree prior on estimating clock rates during epidemic outbreaks . Proceedings of the National Academy of Sciences , 115 ( 16 ), 4200 – 4205 . doi: 10.1073/pnas.1713314115 (cit. on p. 12). OpenUrl Abstract / FREE Full Text 29. ↵ Müller , N. F. , Wagner , C. , Frazar , C. D. , Roychoudhury , P. , Lee , J. , Moncla , L. H. , Pelle , B. , Richardson , M. , Ryke , E. , Xie , H. , Shrestha , L. , Addetia , A. , Rachleff , V. M. , Lieberman , N. A. P. , Huang , M.-L. , Gautom , R. , Melly , G. , Hiatt , B. , Dykema , P. , . . . Bedford , T. ( 2021 ). Viral genomes reveal patterns of the sars-cov-2 outbreak in washington state . Science Translational Medicine , 13 ( 595 ). doi: 10.1126/scitranslmed.abf0202 (cit. on pp. 3, 5, 12). OpenUrl Abstract / FREE Full Text 30. ↵ Müller , N. F. , Wüthrich , D. , Goldman , N. , Sailer , N. , Saalfrank , C. , Brunner , M. , Augustin , N. , Seth-Smith , H. M. , Hollenstein , Y. , Syedbasha , M. , Lang , D. , Neher , R. A. , Dubuis , O. , Naegele , M. , Buser , A. , Nickel , C. H. , Ritz , N. , Zeller , A. , Lang , B. M. , . . . Stadler , T. ( 2020 ). Characterising the epidemic spread of influenza a/h3n2 within a city through phylogenetics (A. S. Lauring , Ed.). PLOS Pathogens , 16 ( 11 ), e1008984 . doi: 10.1371/journal.ppat.1008984 (cit. on pp. 3, 12). OpenUrl CrossRef PubMed 31. ↵ Nadeau , S. A. , Vaughan , T. G. , Beckmann , C. , Topolsky , I. , Chen , C. , Hodcroft , E. , Schär , T. , Nissen , I. , Santacroce , N. , Burcklen , E. , Ferreira , P. , Jablonski , K. P. , Posada-Céspedes , S. , Capece , V. , Seidel , S. , Santamaria de Souza , N. , Martinez-Gomez , J. M. , Cheng , P. , Bosshard , P. P. , . . . Stadler , T. ( 2023 ). Swiss public health measures associated with re-duced sars-cov-2 transmission using genome data . Science Translational Medicine , 15 ( 680 ). doi: 10.1126/scitranslmed.abn7979 (cit. on pp. 3, 5, 12). OpenUrl CrossRef 32. ↵ Novitsky , V. , Kühnert , D. , Moyo , S. , Widenfelt , E. , Okui , L. , & Essex , M. ( 2015 ). Phylodynamic analysis of hiv sub-epidemics in mochudi, botswana . Epidemics , 13 , 44 – 55 . doi: 10.1016/j.epidem.2015.07.002 (cit. on p. 3). OpenUrl CrossRef PubMed 33. ↵ Översti , S. , Gaul , E. , Jensen , B.-E. O. , & Kühnert , D. ( 2023 ). Phylogenetic meta-analysis of chronic sars-cov-2 infections in immunocompromised patients shows no evidence of elevated evo-lutionary rates . doi: 10.1101/2023.11.01.565087 (cit. on p. 12). OpenUrl Abstract / FREE Full Text 34. ↵ Översti , S. , Weber , A. , Baran , V. , Kieninger , B. , Dilthey , A. , Houwaart , T. , Walker , A. , Schneider-Brachert , W. , & Kühnert , D. ( 2025 ). Evolutionary and epidemic dynamics of covid-19 in germany exemplified by three bayesian phylodynamic case studies . Bioinformatics and Biology Insights , 19 . doi: 10.1177/11779322251321065 (cit. on pp. 3, 12). OpenUrl CrossRef 35. ↵ Paradis , E. , & Schliep , K . ( 2019 ). Ape 5.0: An environment for modern phylogenetics and evolution-ary analyses in R . Bioinformatics , 35 , 526 – 528 . doi: 10.1093/bioinformatics/bty633 (cit. on pp. 15, 17). OpenUrl CrossRef PubMed 36. Pekar , J. E. , Lytras , S. , Ghafari , M. , Magee , A. F. , Parker , E. , Wang , Y. , Ji , X. , Havens , J. L. , Katzourakis , A. , Vasylyeva , T. I. , Suchard , M. A. , Hughes , A. C. , Hughes , J. , Rambaut , A. , Robertson , D. L. , Dellicour , S. , Worobey , M. , Wertheim , J. O. , & Lemey , P . ( 2025 ). The recency and geographical origins of the bat viruses ancestral to sars-cov and sars-cov-2 . Cell , 188 ( 12 ), 3167 – 3183 .e18. doi: 10.1016/j.cell.2025.03.035 (cit. on p. 2). OpenUrl CrossRef 37. Pekar , J. E. , Wang , Y. , Wang , J. C. , Shao , Y. , Taki , F. , Forgione , L. A. , Amin , H. , Clabby , T. , Johnson , K. , Torian , L. V. , Braunstein , S. L. , Pathela , P. , Omoregie , E. , Hughes , S. , Suchard , M. A. , Vasylyeva , T. I. , Lemey , P. , & Wertheim , J. O . ( 2025 ). Transmission dynamics of the 2022 mpox epidemic in new york city . Nature Medicine , 31 ( 5 ), 1464 – 1473 . doi: 10.1038/s41591-025-03526-9 (cit. on p. 3). OpenUrl CrossRef 38. ↵ Peruggia , M . ( 1997 ). On the variability of case-deletion importance sampling weights in the bayesian linear model . Journal of the American Statistical Association , 92 ( 437 ), 199 – 207 . doi: 10.1080/01621459.1997.10473617 (cit. on pp. 3, 4). OpenUrl CrossRef Web of Science 39. ↵ Plummer , M. , Best , N. , Cowles , K. , & Vines , K . ( 2006 ). Coda: Convergence diagnosis and output analysis for mcmc . R News , 6 ( 1 ), 7 – 11 . https://journal.r-project.org/archive/ (cit. on pp. 15, 17). OpenUrl CrossRef 40. ↵ R Core Team . ( 2024 ). R: A language and environment for statistical computing . R Foundation for Statistical Computing . Vienna, Austria . https://www.R-project.org/ (cit. on pp. 14, 17). 41. ↵ Revell , L. J . ( 2024 ). phytools 2.0: An updated R ecosystem for phylogenetic comparative methods (and other things) . PeerJ , 12 , e16505 . doi: 10.7717/peerj.16505 (cit. on p. 15). OpenUrl CrossRef PubMed 42. ↵ Safina , K. R. , Sidorina , Y. , Efendieva , N. , Belonosova , E. , Saleeva , D. , Kirichenko , A. , Kireev , D. , Pokrovsky , V. , & Bazykin , G. A . ( 2022 ). Molecular epidemiology of hiv-1 in oryol oblast, russia . Virus Evolution , 8 ( 1 ). doi: 10.1093/ve/veac044 (cit. on pp. 3, 12). OpenUrl CrossRef 43. ↵ Shu , Y. , & McCauley , J . ( 2017 ). Gisaid: Global initiative on sharing all influenza data – from vision to reality . Eurosurveillance , 22 ( 13 ). doi: 10.2807/1560-7917.es.2017.22.13.30494 (cit. on p. 16). OpenUrl CrossRef 44. ↵ Smith , B. J . ( 2007 ). Boa: An r package for mcmc output convergence assessment and posterior inference . Journal of Statistical Software , 21 ( 11 ), 1 – 37 (cit. on p. 15). OpenUrl 45. ↵ Smith , M. R. , Trofimova , M. , Weber , A. , Duport , Y. , Kühnert , D. , & von Kleist , M. ( 2021 ). Rapid incidence estimation from sars-cov-2 genomes reveals decreased case detection in europe during summer 2020 . Nature Communications , 12 ( 1 ). doi: 10.1038/s41467-021-26267-y (cit. on pp. 3, 12). OpenUrl CrossRef PubMed 46. ↵ Stadler , T. , Kühnert , D. , Bonhoeffer , S. , & Drummond , A. J. ( 2012 ). Birth–death skyline plot reveals temporal changes of epidemic spread in hiv and hepatitis c virus (hcv) . Proceedings of the National Academy of Sciences , 110 ( 1 ), 228 – 233 . doi: 10.1073/pnas.1207965110 (cit. on pp. 5, 7, 15, 17). OpenUrl Abstract / FREE Full Text 47. ↵ Taouk , M. L. , Taiaroa , G. , Duchene , S. , Low , S. J. , Higgs , C. K. , Lee , D. Y. J. , Pasricha , S. , Higgins , N. , Ingle , D. J. , Howden , B. P. , Chen , M. Y. , Fairley , C. K. , Chow , E. P. F. , & Williamson , D. A . ( 2024 ). Longitudinal genomic analysis of neisseria gonorrhoeae transmission dynam-ics in australia . Nature Communications , 15 ( 1 ). doi: 10.1038/s41467-024-52343-0 (cit. on p. 13). OpenUrl CrossRef PubMed 48. ↵ Thomas , Z. M. , MacEachern , S. N. , & Peruggia , M . ( 2018 ). Reconciling curvature and importance sampling based procedures for summarizing case influence in bayesian models . Journal of the American Statistical Association , 113 ( 524 ), 1669 – 1683 . doi: 10.1080/01621459.2017.1360777 (cit. on pp. 3–6, 13). OpenUrl CrossRef 49. ↵ Tsui , J. L.-H. , Sambaturu , P. , Pena , R. E. , Too , L. , Gutierrez , B. , Inward , R. , Kraemer , M. U. G. , du Plessis , L. , & Pybus , O. G. ( 2025 ). Transmission lineage dynamics and the detection of viral importation in emerging epidemics . doi: 10.1101/2025.03.05.25323408 (cit. on p. 11). OpenUrl Abstract / FREE Full Text 50. ↵ Vasylyeva , T. I. , du Plessis , L. , Pineda-Peña , A. C. , Kühnert , D. , Lemey , P. , Vandamme , A.-M. , Gomes , P. , Camacho , R. J. , Pybus , O. G. , Abecasis , A. B. , & Faria , N. R. ( 2019 ). Tracing the impact of public health interventions on hiv-1 transmission in portugal using molecular epidemiology . The Journal of Infectious Diseases , 220 ( 2 ), 233 – 243 . doi: 10.1093/infdis/jiz085 (cit. on pp. 3, 5, 12). OpenUrl CrossRef PubMed 51. ↵ Vaughan , T. ( 2025 , April 2). Tgvaughan/feast [original-date: 2014-03-22T09:05:29Z]. Retrieved April 26, 2025, from https://github.com/tgvaughan/feast (cit. on p. 15). 52. ↵ Vaughan , T. G . ( 2024 ). Remaster: Improved phylodynamic simulation for beast 2.7 (R. Schwartz , Ed.). Bioinformatics , 40 ( 1 ). doi: 10.1093/bioinformatics/btae015 (cit. on p. 15). OpenUrl CrossRef 53. ↵ Vaughan , T. G. , & Stadler , T . ( 2025 ). Bayesian phylodynamic inference of multitype population trajectories using genomic data (J. Parsch, Ed.) . Molecular Biology and Evolution , 42 ( 6 ). doi: 10.1093/molbev/msaf130 (cit. on p. 2). OpenUrl CrossRef 54. Včceřová , Š. , Komárek , A. , Bruyneel , L. , & Lesaffre , E. ( 2020 ). Identifying influential observations in a bayesian multi-level mediation model . Journal of Applied Statistics , 48 ( 5 ), 943 – 960 . doi: 10.1080/02664763.2020.1748179 (cit. on p. 13). OpenUrl CrossRef PubMed 55. Vehtari , A. , Gelman , A. , & Gabry , J . ( 2016 ). Practical bayesian model evaluation using leave-one-out cross-validation and waic . Statistics and Computing , 27 ( 5 ), 1413 – 1432 . doi: 10.1007/s11222-016-9696-4 (cit. on p. 3). OpenUrl CrossRef 56. ↵ Volz , E. , Hill , V. , McCrone , J. T. , Price , A. , Jorgensen , D. , O’Toole , Á. , Southgate , J. , Johnson , R. , Jackson , B. , Nascimento , F. F. , Rey , S. M. , Nicholls , S. M. , Colquhoun , R. M. , da Silva Filipe , A. , Shepherd , J. , Pascall , D. J. , Shah , R. , Jesudason , N. , Li , K. , . . . Neaverson , A. S. ( 2021 ). Evaluating the effects of sars-cov-2 spike mutation d614g on transmissibility and pathogenicity . Cell , 184 ( 1 ), 64 – 75 .e11. doi: 10.1016/j.cell.2020.11.020 (cit. on p. 2). OpenUrl CrossRef PubMed 57. ↵ Volz , E. M. , & Frost , S. D. W . ( 2014 ). Sampling through time and phylodynamic inference with coa-lescent and birth–death models . Journal of The Royal Society Interface , 11 ( 101 ), 20140945 . doi: 10.1098/rsif.2014.0945 (cit. on p. 12). OpenUrl CrossRef PubMed 58. ↵ Volz , E. M. , Koelle , K. , & Bedford , T . ( 2013 ). Viral phylodynamics (S. Wodak, Ed.) . PLoS Com-putational Biology , 9 ( 3 ), e1002947 . doi: 10.1371/journal.pcbi.1002947 (cit. on p. 2). OpenUrl CrossRef PubMed 59. ↵ Volz , E. M. , & Siveroni , I . ( 2018 ). Bayesian phylodynamic inference with complex models (A. E. Darling , Ed.). PLOS Computational Biology , 14 ( 11 ), e1006546 . doi: 10.1371/journal.pcbi.1006546 (cit. on p. 2). OpenUrl CrossRef 60. W, L. ( 2025 , April 16). W-l/ProblematicSites sars-CoV2 [original-date: 2020-05-05T14:57:50Z]. Retrieved June 11, 2025, from https://github.com/W-L/ProblematicSites_SARS-CoV2 (cit. on p. 17). 61. ↵ Walas , N. , Müller , N. F. , Parker , E. , Henderson , A. , Capone , D. , Brown , J. , Barker , T. , & Graham , J. P. ( 2024 ). Application of phylodynamics to identify spread of antimicrobial-resistant escherichia coli between humans and canines in an urban environment . Science of The Total Environment , 916 , 170139 . doi: 10.1016/j.scitotenv.2024.170139 (cit. on p. 3). OpenUrl CrossRef PubMed 62. ↵ Weber , A. , Översti , S. , & Kühnert , D. ( 2023 ). Reconstructing relative transmission rates in bayesian phylodynamics: Two-fold transmission advantage of omicron in berlin, germany during december 2021 . Virus Evolution , 9 ( 2 ). doi: 10.1093/ve/vead070 (cit. on pp. 3, 5, 8, 12). OpenUrl CrossRef 63. ↵ Weiss , R. E. , & Cook , R. D . ( 1992 ). A graphical case statistic for assessing posterior influence . Biometrika , 79 ( 1 ), 51 . doi: 10.2307/2337146 (cit. on p. 4). OpenUrl CrossRef 64. ↵ Wickham , H. , Averick , M. , Bryan , J. , Chang , W. , McGowan , L. D. , François , R. , Grolemund , G. , Hayes , A. , Henry , L. , Hester , J. , Kuhn , M. , Pedersen , T. L. , Miller , E. , Bache , S. M. , Müller , K. , Ooms , J. , Robinson , D. , Seidel , D. P. , Spinu , V. , . . . Yutani , H. ( 2019 ). Welcome to the tidyverse . Journal of Open Source Software , 4 ( 43 ), 1686 . doi: 10.21105/joss.01686 (cit. on pp. 15, 17). OpenUrl CrossRef PubMed 65. ↵ Wilinski , M. , Castro , L. , Keithley , J. , Manore , C. , Campos , J. , Romero-Severson , E. , Domman , D. , & Lokhov , A. Y . ( 2024 ). Congruity of genomic and epidemiological data in modelling of local cholera outbreaks . Proceedings of the Royal Society B: Biological Sciences , 291 (2019). doi: 10.1098/rspb.2023.2805 (cit. on p. 2). OpenUrl CrossRef 66. ↵ Wu , F. , Zhao , S. , Yu , B. , Chen , Y.-M. , Wang , W. , Song , Z.-G. , Hu , Y. , Tao , Z.-W. , Tian , J.-H. , Pei , Y.-Y. , Yuan , M.-L. , Zhang , Y.-L. , Dai , F.-H. , Liu , Y. , Wang , Q.-M. , Zheng , J.-J. , Xu , L. , Holmes , E. C. , & Zhang , Y.-Z . ( 2020 ). A new coronavirus associated with human respiratory disease in china . Nature , 579 ( 7798 ), 265 – 269 . doi: 10.1038/s41586-020-2008-3 (cit. on p. 17). OpenUrl CrossRef PubMed 67. ↵ Yang , Z. , & Rannala , B . ( 2012 ). Molecular phylogenetics: Principles and practice . Nature Reviews Genetics , 13 ( 5 ), 303 – 314 . doi: 10.1038/nrg3186 (cit. on p. 2). OpenUrl CrossRef PubMed 68. ↵ Zarebski , A. E. , Zwaans , A. , Gutierrez , B. , du Plessis , L. , & Pybus , O. G. ( 2025 ). Estimating epidemic dynamics with genomic and time series data . Journal of The Royal Society Interface , 22 ( 227 ). doi: 10.1098/rsif.2024.0632 (cit. on p. 2). OpenUrl CrossRef 69. ↵ Zhukova , A. , Hecht , F. , Maday , Y. , & Gascuel , O . ( 2023 ). Fast and accurate maximum-likelihood estimation of multi-type birth–death epidemiological models from phylogenetic trees . Systematic Biology , 72 ( 6 ), 1387 – 1402 . https : //doi . org /10 . 1093 /sysbio /syad059 (cit. on p. 3). OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted November 19, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Identification of transmission clusters with high influence on shared parameter estimates in Bayesian phylodynamics Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Identification of transmission clusters with high influence on shared parameter estimates in Bayesian phylodynamics Ariane Weber , Ruth Boersma , Sebastian Duchene medRxiv 2025.11.18.25340546; doi: https://doi.org/10.1101/2025.11.18.25340546 Share This Article: Copy Citation Tools Identification of transmission clusters with high influence on shared parameter estimates in Bayesian phylodynamics Ariane Weber , Ruth Boersma , Sebastian Duchene medRxiv 2025.11.18.25340546; doi: https://doi.org/10.1101/2025.11.18.25340546 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Epidemiology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9219) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffb21a1ed58f047',t:'MTc3OTQ0NTY3OA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00