The distribution of the number of mutations in the genealogy of a sample from a single population

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 47,478 characters · extracted from preprint-html · click to expand
The distribution of the number of mutations in the genealogy of a sample from a single population | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results The distribution of the number of mutations in the genealogy of a sample from a single population View ORCID Profile Yun-Xin Fu doi: https://doi.org/10.1101/2025.02.04.636538 Yun-Xin Fu a Department of Biostatistics and Data Science, School of Public Health University of Texas Health Science Center at Houston , 1200 Herman Pressler, Houston, Texas 77030 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Yun-Xin Fu For correspondence: yunxin.fu{at}uth.tmc.edu Abstract Full Text Info/History Metrics Preview PDF Abstract The number K of mutations in the genealogy of a sample of n sequences from a single population is one essential summary statistics in molecular population genetics and is equal to the number of segregating sites in the sample under the infinitesites model. Although its expectation and variance are the most widely utilized properties, its sampling formula (i.e., probability distribution) is the foundation for all explorations related to K . Recently, it has been established that K is subject to the Central Limit Theorem, and thus has asymptotic normality. However, due to its slow convergence to normality, the finite-sample distribution remains indispensable. Although an analytic sampling formula exists, its numerical application is limited due to susceptibility to error propagation. This paper presents a new sampling formula for K in a random sample of DNA sequences from a neutral locus without recombination, taken from a single population evolving according to the Wright-Fisher model with a constant effective population size, or the constant-in-state model, which allows the effective population size to vary across different coalescent states. The new sampling formula is expressed as the sum of the probabilities of the various ways mutations can manifest in the sample genealogy and achieves simplicity by partitioning mutations into hypothetical atomic clusters that cannot be further divided. Under the Wright-Fisher model with a constant effective population size, the new sampling formula is closely analogous to the celebrated Ewens’ sampling formula for the number of distinct alleles in a sample. Numerical computation using the new sampling formula is accurate and is limited only by the burden of enumerating a large number of partitions of a large K . However, significant improvement in efficiency can be achieved by prioritizing the enumeration of partitions with a large number of parts. The number K of mutations in the genealogy of a sample of n sequences from a population is one of the most important summary statistics in molecular population genetics. It is also ubiquitous in the analysis of DNA sequence samples, as it can be identified as the number of segregating sites in a sample under the infinite-sites model ( Ewens (2004) , and also see Fu (2025) for a concise list of theoretical and methodological studies related to K ). Therefore, it is essential to characterize its sampling formula (i.e., probability distribution), which is Watterson (1975) established that K = X 2 X n , where X i represents the number of mutations in the time period of observing i ancestral sequences in a forward process. In the terminology of the Kingman coalescent ( Kingman, 1982a ,b), X i is the number of mutations during state i of the coalescent process of the sample, and follows a geometric distribution with parameter 1 − φ i , where where θ = 4 Nµ with N as the effective population size and µ as the mutation rate per sequence per generation. Since X i is independent X j ( i ≠ j ), the probability generating function of K is thus the product of those for different X i , which led Watterson (1975) to derive the expectation and variance of K , and also pointed out that which, as well as for ℙ n ( m ) for small m , can also be obtained from direct convolution. Using partial expansion of the probability generating function of K , Tavaré (1984) derived the following sampling formula Recently, Fu (2025) established that central limit theory holds for K , meaning that K asymptotically follows a normal distribution. Furthermore, the finite-sample skewness and kurtosis of K are now available, which enhances our understanding of its finite-sample properties. The characterization of K appears to be nearly complete, at least under the Wright-Fisher model with a constant effective population size. However, since K under the infinite-sites model is often considered a counterpart to k , the number of distinct alleles in the sample under the infinite-alleles model, the lack of resemblance of Tavaré’s formula (elegant in its own right) to Ewens’ sampling formula leaves much to be desired. For a sample configuration of n sequences, where i j is the number of distinct alleles with j copies in the sample, it is known ( Ewens, 1972 ; Karlin and McGregor, 1972 ) that where Summing up all sample configurations that have the same number of distinct alleles leads to where The fact that K arises in different configurations, and that its probability is the sum of the probabilities of these configurations, is both intuitive and intriguing. This prompts one to wonder whether a similar analogy exists for K . Of course, since K is only asymptotically sufficient for θ ( Fu and Li, 1993 ), any such sampling formulas, if they exist, are expected to be more complex than the Ewens’ sampling formula. It is also unfortunate that Tavaré’s sampling formula (4) has a limited scope of applicability in the numerical calculation of ℙ n ( m ), due to error accumulation in the summation of large absolute values with alternating signs. This issue becomes progressively more severe with increasing sample size, often making the formula unusable even for modest sample sizes. Furthermore, it is highly desirable to be able to compute ℙ n ( m ) for a non-constant population. The primary purpose of this paper is to present and prove a new sampling formula for K , which not only provides a direct analogy to the Ewens’ sampling formula, but also serves as a reliable method for computing ℙ n ( m ) under a more general model than the Wright-Fisher model with a constant effective population size. The main results The Kingman coalescent ( Kingman, 1982a ,b) is best known for describing a sample of size n taken from a population that evolves according to the Wright-Fisher model ( Ewens, 2004 ), although the results are robust under several alternative models, including Moran’s model. For this reason, we assume the population evolves according to the constant-in-state model ( Fu, 2025 ), which is a variation of the Wright-Fisher model. In this model, the effective population size during coalescent state i is constant ( N i ), while the assumption of independence among coalescent times remains valid. The constant-in-state model extends the Wright-Fisher model by allowing for populations of non-constant sizes, while still maintaining mathematical tractability. As a result, it has been applied in various contexts ( Pybus et al., 2000 ; Liu and Fu, 2015 ). Throughout the paper, we restrict our attention to a sample of DNA sequences from a single locus without recombination, where all mutations are selectively neutral. Integer partitions are essential in this paper and thus warrant a brief introduction. A partition of an integer m refers to expressing m as a sum of positive integers, where the order of the summands does not matter ( Riordan, 1958 ; Abramowitz and Stegun, 1972 ). For example, 5 = 1 + 2 + 2 = 4 + 1, so the tuples (1, 2, 2) and (4, 1) are both partitions of 5. The order of parts in a partition is irrelevant, and by convention, the parts are listed in descending order. We will use bold letters to represent integer partitions in this paper and adopt the convention that s ⊢ m indicates that s is a partition of m . Thus, s ⊢ m represents a tuple of positive integer(s) such that k 1 ⩾ k 2 ⩾ … and k 1 = m . Equivalent but more convenient notation for an integer partition is cycle type (multiplicity form) where s k is the multiplicity (number of occurrences) of integer k in the partition tuple. Therefore, the sample configuration in Eq.(5) is an integer partition of sample size n . For example, s = (5, 2, 2, 1)) 10 has cycle type 1 1 2 2 3 0 4 0 5 1 . It is customary to omit the component in the cycle type when s k = 0, so the above cycle type becomes more concisely s = 1 1 2 2 5 1 . It is also conventional to consider 1 0 2 0 … as the sole partition of integer 0. Summation (∑) and multiplication (∏) appear frequently in this paper, often in the context of enumerating integer partitions and counting permutations of cycle type (which will be elaborated on later). We adopt the convention that when there is no summand, ∑ is equal to 0, and when there is no factor, ∏ is equal to 1. The main result of this paper is as follows. Theorem. Let ℙ n ( m ) denote the probability of having exactly m mutations in the genealogy of a random sample of n sequences from a population evolving according to the constant-in-state model. Then , where the summation is taken over all integer partitions and where where θ i = 4 N i µ, with N i being the effective population size at state i of the coalescent process and µ being the mutation rate per sequence per generation . Before proceeding with the proof of the Theorem, we introduce some notations for brevity and clarity. Additionally, we present a corollary for the specific case where the population evolves according to the Wright-Fisher model with a constant effective population size. Define for Then Eq.(11) can be written as It should be noted that c ( s ) defined above is the number of permutations of m different objects conforming to cycle type s ( Riordan, 1958 ; Abramowitz and Stegun, 1972 ). Its precise meaning in the context of mutations will be elaborated later when we discuss combinatorial interpretation of the Theorem. As far as c ( s ) is concerned, there is no need to specify explicitly the integer to which a partition s belongs, because the integer is equal to Furthermore it is well known( Abramowitz and Stegun, 1972 ) that where is the Stirling number of the first kind. It follows that defined by (8) is equal to , which is known as unsigned Stirling number of the first kind. Let Then enumerating over all integer partitions for a given m , it can be verified easily that which can be used to compute ℙ n ( m ) quickly for m ⩽5. Under the Wright-Fisher model with constant effective size, we have that θ = θ 2 = … = θ n . It follows from (3) that Furthermore all the summands of ω n ( j ) has a common factor θ j and for a partition s ) m , which leads to the following corollary Corollary. For a random sample of n sequences from a population evolving according to the Wright-Fisher model with constant effective size N, the probability of having exactly m mutations in the sample genealogy is where S n ( θ ) is defined by (6) and where the summation is taken over all partitions of m, c ( s ) is defined by (15) and θ = 4 Nµ with µ being the mutation rate per sequence per generation . We thus have a closely analogous formula to Ewens’ sampling formula (7), with two key differences. The first is that k can take value between 1 and n , while K can take values between 0 and 8. The second is that, although both and are of the form is a summation over partitions of sample size n with m parts where f ( s ) = 1, resulting in the unsigned Stirling number of the second kind (19). In comparison is a summation over all partitions of mutation number m with f ( s ) as a function of θ , partition s and sample size n . This second difference reflects the fact that k is a sufficient summary statistic for θ , so that the detailed configurations leading to k is independent of θ , while K is not a sufficient summary statistic for θ . Proof of the Theorem For clarity, let K n = K for sample size n . Consider first the case of n = 2. Since K 2 = X 2 , which follows a geometric distribution with parameter 1 − φ 2 , it follows that Since ,we have for any partition that Due to Eq.(20) and p 2 (0) = (1 − φ 2 ), one arrives at Therefore Eq.(17) is true for n = 2. Suppose it is true for all the sample sizes up to n − 1. Then since K n = K n − 1 + X n , the following recurrence relationship holds Since X n ∼ Geom (1 − φ 2 ) it follows that We shall show that this is in fact an alternative expression of Eq.(11) . From Eq.(22) , we have the recurrence relationship which by a standard expansion gives This immediately leads to For two partitions and ,define the relationship s ⩽ s ‵ being true if and only if , When s ⩽ s 1 , define their difference partition as Consequently ,Since , it follows that where the summation is taken over all partitions s ⩽ s ‵ , which include the ∅. Grouping s according to its ‖ s ‖ results in Therefore directly from (17) we have that Since s ⊢ ( m − k ) and s ′ m , we have For a given s ⊢ ( m − i ), enumerating over all s ′ ⊢ m with s ′ ⩾ s is equivalent to enumerating over all ( s ′ − s ) ⊢ i . Due to Eq.(20) , it follows that Therefore Eq.(29) is the same as Eq.(28) , which indicates that Eq.(11) is also true for sample size n . By induction the Theorem is true for all sample sizes. || Combinatorial interpretation Mutations under the infinite-alleles model lead to partitions of the sample, resulting in the probability of a sample partition as given in Eq. (5) , and the probability of having k distinct alleles as shown in Eq. (7) . These probabilities have since been re-derived multiple times and subjected to various combinatorial interpretations ( Hoppe, 1984 ; Donnelly, 1986 ; Donnelly and Tavaé, 1986 ; Griffiths and Lessard, 2005 ). Under the infinite-alleles model, each distinct allele is unequivocally identified in the sample, but the number of mutations responsible for the observed allelic configuration is ambiguous—only the minimum number of mutations is known. In contrast, under the infinite-sites model, each mutation is unequivocally identified as a segregating site, but the number of mutations alone does not provide sufficient information on how the sequences in a sample are partitioned. This leads to an ambiguous sample configuration, unless more detailed information about each mutation is considered. Therefore, ℙ n ( m ) can only be partitioned according to the configurations of mutations. The theorem suggests that mutations can be partitioned into clusters corresponding to integer partitions, though the meaning of this is not entirely clear. Insight can be gained by examining in detail the cases of two and three mutations. Consider the case of two mutations first. As for the states at which they may occur, there are only two possibilities. The first is that both mutations occur at the same state, and the second is that they occur at two different states. The probability that there are two mutations in the sample genealogy such that both occur at state i is Therefore the probability that both mutations occur at the same state is ℙ n (0) ∑ φ i . Similarly the probability that there are two mutations in the sample genealogy such that one occurs at state i and another at state j is Enumerating over all i < j (since j < i and i < j represent the same event thus should not be counted twice), leads to that the probability two mutations occur at different state equal to .It follows that which is Eq.(11) when m = 2. The split of in the second equality into two halves, with the second half combined into the probability of two mutations at different states, is key to arriving at the concise expression in Eq. (30) . One interpretation of this partition and reassembly is that the event of observing two mutations at the same state corresponds to the same outcome from two different underlying scenarios: one is that the two mutations arise in a tight cluster within a state, and the other is that the two mutations come from two separated clusters (each containing one mutation) at the same state. Since the sum of the probabilities of these two scenarios is ,it is essential to understand how this probability is apportioned between the two scenarios. The derivation of Eq. (30) suggests an even split, which is equivalent to assigning the probability based on the proportion of the number c (2 1 ) of permutations of cycle type 2 1 among all permutations of two mutations, which is 2!. Therefore, if the two scenarios are separated, the scenario corresponding to a single cluster of two mutations has probability On the other hand, when the scenario of two separated clusters of one mutation each is combined with ℙ n (0) ∑ i < j φ i φ j , it leads to a probability of ,which represents the scenario where two mutations arise in two clusters, each containing one mutation. These clusters could either be at the same state or at different states. The above analysis suggests that, although hypothetical and invisible, the concept of a cluster of mutations can help simplify the interpretation of mutations. The cluster of mutations should be the final assignment, as it will not be further divided or reassigned; otherwise, every mutation will eventually end up in its own cluster. Furthermore, each cluster resides within a single state. We can therefore refer to such clusters as atomic (indivisible) clusters. The rules for apportioning the probability of an event to different partitions of mutations into atomic clusters can be further clarified by examining the case of three mutations. There are three different observable events for three mutations. The first is that all three occur at a single state, with probability ,the second is that two mutations occur at one state and another at a different state, with probability ,and the third is that three mutations occur at three different states, with probability ℙ n (0) ∑ i < j< k φ i φ j φ k . The assembly of the probabilities leads to The first event is now split into three scenarios: one atomic cluster of three mutations, one atomic cluster of two mutations and one atomic cluster of one mutation, and three atomic clusters of one mutation each. The rule for appor-the total probability to the different scenarios is clearly proportional to the number of permutations of cycle types. Similarly, the second event is split into two scenarios: one atomic cluster of two mutations in one state and one cluster in a different state, and two atomic clusters of one mutation each in the same state and one cluster in a different state. One can continue the analysis for more mutations, but the above analysis suffices the definition of atomic clusters as a list of mutations with the following three properties: An atomic cluster cannot be further divided, An atomic cluster resides within a single state, The order (permutation) of mutations within an atomic cluster is cyclic. A permutation is said to be cyclic if the elements are arranged in circular form so that it only matter what element follows a given element and there is no beginning nor end element. Therefore, a partition of m mutations into a group of atomic clusters corresponds to an integer partition .The number of permutations of m mutations to confirm with cycle type s can be intuitively derived as follows. First of all, the number of arrangements of m mutations into s i of groups (atomic clusters) of size i , ( i = 1, …, m ) is given by the standard multinomial coefficient which assumes each group is labeled (identifiable). If groups can be identified only up to their sizes, then the total number of permutations is reduced to If permutation of mutations within each atomic cluster is identifiable, there will be more arrangements. The assumption of permutation within each atomic cluster being cyclic means that the number of permutations for atomic cluster of size i is ( i − 1)!. Therefore the total number of arrangements of m mutations into cycle type s is We thus arrive at a proposition that can be regarded as a corollary of the main theorem, based on the construct of hypothetical and invisible atomic clusters of mutations Proposition. Mutations in the genealogy of a sample of n sequences emerge in atomic clusters. The specification of a list of atomic clusters coincides with an integer partition of the number of mutations m. The probability of mutations emerging in a partition s is given by and ℙ n ( m ) is the summation of probabilities of all different partitions of mutations, where c ( s ), ℙ n (0) and λ n ( s ) are defined, respectively, by (15), (12) and (16) . The construct of atomic clusters of mutations and Eq. (31) provide an analogy to Ewens’ sampling formula (5). Without the use of hypothetical and invisible boundaries to partition mutations into atomic clusters, attempting to express ℙ n ( m ) as the sum of probabilities for different scenarios of mutation realizations would lead to complicated, multi-layered summations, quickly losing simplicity. Numerical computation When numerically evaluating ℙ n ( m ), it is important to verify its accuracy by comparing the results with known properties. In addition to the usual checks for a valid probability distribution, one can compare ℙ n ( m ) against its moments, including Expectation and variance ( Watterson, 1975 ; Fu, 2025 ): Skewness and kurtonsis ( Fu, 2025 ): where With these safeguards in place, we evaluated ℙ n ( m ) for various combinations of n , m and θ . In all cases, we encountered no issues with numerical accuracy when using Eq.(11) . The impact of sample size n on the computation of ℙ n ( m ) is minor, as it only affects ω n ( i ), which needs to be evaluated once for each different m . However, the computational burden does increase with m , as will be discussed later. We present two contrasting examples of ℙ n ( m ) here: one corresponding to the Wright-Fisher model with constant effective size, and the other to the constant-in-state model. Figure 1 shows the distribution of ℙ n ( m ) with θ = 3 for three different sample sizes: 50, 500, and 5000. These cases were chosen in part for comparison with the simulated data and the approximating normal distributions from Fu (2025) ( Fig. 3 , panel b). Download figure Open in new tab Fig. 1. Probability distribution ℙ n ( m ) with θ = 3 for three samples sizes. For each sample size, solid curve is ℙ n ( m ) computed from (11), dot plot is from coalescent simulation and dashed curve represents the normal approximation. The latter two distributions are from panel (b) of Fig 3 in Fu (2025) From Fig. 1 , it is clear that the exact distribution ℙ n ( m ) for each sample size aligns well with the empirical distribution based on coalescent simulations of 50,000 replicates. This is expected, but without the exact distribution, one might wonder how many replicates are necessary to obtain sufficiently accurate results. With the exact distributions at hand, we can revisit the conclusions of Fu (2025) regarding the asymptotic normality of ℙ n ( m ). One conclusion in Fu (2025) is that the improvement towards normality from n = 50 to n = 500 is quite noticeable, while the further improvement from n = 500 to n = 5000 is more subtle. The patterns in Fig. 1 reinforce this conclusion with greater confidence. This is not surprising, given that the speed of c onver gence of ℙ n ( m ) to normality is approximately . From to 500, the change is ,while from 500 to 5000 the change is only 0.426. Therefore, after an initial rapid improvement towards normality, further increases in sample size have progressively smaller effects. For a population evolved according to the constant-in-state model, we considered three scenarios of population dynamics for n = 100. The first scenario represents a constant population size, with θ i = 0.407 for i = 2, …, 15 and 2 for i = 16, …, 100. The second scenario corresponds to a recent population expansion,with θ i = 0.407 for i = 2, …, 15 and 2 for i = 16, …, 100. The third scenario is a bottleneck followed by expansion, specified by θ i = 0.65 for i = 2, 3, 0.1 for i = 4, 5, 6 and 1.425 for i > 6. All three scenarios have the same expected number of mutations E ( K ) = 5.177 The mean values of θ for these three scenarios are plotted in Fig. 2 against time into the past, scaled so that 1 unit corresponds to 4 N generations, where N is the effective population size for the constant population. It is clear that, for the constant population size scenario, the average time to the most recent common ancestor (MRCA) is 1 unit (4 N generations). In contrast, for scenarios 2 and 3, the time to the MRCA is shorter due to their smaller effective sizes near the MRCA. Download figure Open in new tab Fig. 2. The mean dynamics of θ i for a sample of size 100 for three scenarios. Scenario 1: constant model with θ = 1, Scenario 2: θ i = 0.407, i = 2, …, 15 and θ i = 2, i = 16, …, 100, and Scenario 3: θ i = 0.65, i = 2, 3, θ i = 0.1, i = 4, 5, 6 and θ i = 1.425, i = 7, …, 100. All three scenarios lead to E ( K ) = 5.177. Fig. 3 shows the distributions of ℙ n ( m ) for the three population dynamics scenarios described above. An immediate observation is that these distributions do not differ substantially, despite the seemingly dramatic differences in population size dynamics shown in Fig. 2 . While there are noticeable differences around the peak of these distributions, the variations in the tails are minimal. This pattern serves as a reminder that K represents only one aspect of the genetic polymorphism within a sample. Behind K , the mutations constituting the sample can vary significantly in both type and frequency – a factor that has been leveraged in various data analysis approaches. Download figure Open in new tab Fig. 3. Probability distribution ℙ n ( m ) computed from (11) for n=100 and three scenarios in Fig. 3 Impact of integer partitions A key technical issue affecting the computation of ℙ n ( m ) is the number and patterns of integer partitions. Let p ( m ) denote the number of different partitions of the integer m . The value of p ( m ) is a critical factor in determining the computational complexity of ℙ n ( m ) using Equation (11), as the computation requires enumerating all possible partitions of m . The function p ( m ) is a classic problem in combinatorics and number theory, and while a closed-form formula for p ( m ) is not available, it can be easily computed through enumeration or recurrence relations. For example, the only partition for m = 0 is the empty set ∅, p (1) = 1 corresponds to the partition (1), and p (2) = 2 corresponds to the partitions (2) and (1, 1). While p ( m ) increases slowly for small values of m , it grows rapidly as m increases. The thresholds of 10 3 , 10 6 and 10 9 partitions are reached at m = 22, 61, and 115, respectively. Table 1 provides values of p ( m ) for selected values of m up to 150 ( Sloane, 2024 ; Wolfram Research, 2014). View this table: View inline View popup Download powerpoint Table 1. The number p ( m ) of integer partitions for selected m ⩽150 Since every partition contributes positively to the over-all probability ℙ n ( m ), there is generally no issue with error propagation when using double-precision computation in modern programming languages and tools. However, for m > 50, p ( m ) increases rapidly, presenting a significant computational challenge. For instance, when m = 60, the number of partitions is nearly 1 million, and by the time m reaches 100, the number swells to 190 million. To make matters even more daunting, for m = 150, there are over 40 billion partitions! While large numbers may offer elegant solutions in certain contexts, such as in the application of the central limit theorem, they become a major hurdle in the numerical computation of ℙ n ( m ). Therefore, investigating the roles of different types of partitions in the computation of ℙ n ( m ) is important for understanding and improving the efficiency of these calculations. Two important characteristics of integer partitions are partition length, which refers to the number of parts in the partition, and maximal size, which denotes the largest value of the parts. In the context of mutation partitions, these correspond to the number of atomic clusters and the maximum number of mutations in a single atomic cluster, respectively. Let p i ( m ) represent the number of partitions of length i , and denote the number of partitions with a maximal size of i . It is known that ( m ) ( Slomson, 1991 ), and thus Then p i ( m )/ p ( m ) is the percentage of partitions of length i among a total p ( m ) partitions. Similarly is the percentage of partitions with maximal size i among p ( m ). For a partition ,define #( s ) as its number of parts and # ‵ ( s ) as its maximal size of parts. Then Define ℙ n ( m, i ) be the contribution to ℙ n ( m ) by partitions of length i , and similarly P 1 ( m, i ) be the contribution to ℙ n ( m ) by partitions s with maximal size i . Then and Plots of the p i ( m )/ p ( m ) against i / m for three different m with n = 500 and θ = 10 are given in Fig. 4 . It is clear that partitions of length in the range 0.2 m −0.3 m are most abundant. Take the case of m = 100 for example (green dotted plot), the peak of p i ( m )/ p ( m ) is around 0.2 for i / m , which means the highest concentration of partitions are those of length 20. Additionally, it is apparent that as m increases, the peak concentration of partitions becomes smaller. Fig. 4 also plots ℙ n ( m, i )/ ℙ n ( m ) against i / m . The contribution towards ℙ n ( m ) is concentrated around 0.55 in the case of m = 100 (green curve), indicating that partitions of length around 55 contribute most. It is also clear that the peak of partitions contributing to ℙ n ( m ) gradually shifts to the left with increase of m . Furthermore the density p i ( n )/ p ( m ) and the density ℙ n ( m, i )/ ℙ n ( m ) for a given m overlap only slightly, meaning that in general a small fraction of integer partitions contribute to the over-whelming portion of ℙ n ( m ). In other words, ℙ n ( m ) can typically be computed with sufficient accuracy by evaluating a small portions of partitions. For example, for m = 100, 99.99% of the ℙ 500 (100) comes from the partitions of length 30 or larger, which account for only 13.4% of the total 190 millions partitions of 100. This feature is quite beneficial, as it makes the challenge of dealing with a large number of partitions a manageable issue. Download figure Open in new tab Fig. 4. Percentage of partitions of different lengths (dots) and their contributions(lines) to ℙ n ( m ), for n = 500 and θ = 10. Three numbers of mutations considered are m = 50, 80 and 100. To test the limits of evaluating ℙ n ( m ) using Eq. (11) , we computed ℙ 500 (150), which required enumerating over 40 billion partitions of 150. This case was used to examine both the contribution of partitions of different maximal sizes to ℙ n ( m ) and the mean contribution of partitions of various lengths. The results are presented in Fig. 5 . From the figure, it is evident that the contribution by partitions of different maximal sizes (green curve) closely overlaps with the percentage of partitions of various maximal sizes (orange curve). This suggests that prioritizing the enumeration of partitions based on their maximal size does not lead to a significant reduction in computational effort. Download figure Open in new tab Fig. 5. Contributions of partitions to ℙ 500 (150) = 0.01044 with θ = 20. (a) Contribution of partitions of different lengths, (b) percentage of p i ( m ) in p ( m ), (c) Contribution by partitions of different maximal sizes and (d) Contribution per partition of different length. Fig. 5 also shows the per-partition contribution to ℙ n ( m ) for different lengths (red curve). Interestingly, this curve does not coincide with the overall contribution of different lengths (blue curve). In other words, the partition lengths that contribute most to ℙ n ( m ) are not the same as those that contribute the most per partition. However, it is clear that, whether considered collectively or individually, partitions with lengths equal to or larger than m /2 contribute most to ℙ n ( m ). Conditions for the failure of Tavaré formula It may seem straightforward to compute ℙ n ( m ) using Eq.(4) under the Wright-Fisher model with constant effective size, which is the sum of n − 1 summands. However, once sample size n is sufficiently large, the binomial coefficient in the ( i − 1)-th summand can be a very large number, particularly when i is around n/2. As a result, Tavaré formula can often be a summation of some huge summands with alternating sign. To make the matter worse, adjacent summands of opposite signs are typically not close enough in value to cancel each other quickly. When ℙ n ( m ) is about a magnitude 10 −15 smaller than the absolute values of some summands, error accumulation/propagation is inevitable. This is because modern computers generally carry out floating point calculation in 64-bits format, which leads to 15-17 significant digits, regardless of program languages/tools. There have been improvements in various programming languages, but they are mostly on integer operations and data format. Once floating point operations are performed, the notorious error issue persists. For illustration, consider ℙ 100 (5) with θ = 1. The 4 summands of Eq.(4) for i=48 to 51 are, respectively, − 8.31062×10 17 , 7.66893×10 17 , −6.81003×10 17 , 5.81884× 10 17 . When adding summands in ascending order of i , the result is 7.089 ×10 4 in standard double precision Java programming, which leads to ℙ n (5) = 7.01 × 10 6 , while the correct value is ℙ 100 (5) = 0.1553 according to Eq. (11) . One can explore different orders of adding summands and the final result does change slightly, but all results in qualitatively wrong answer. In our attempt to improve the applicability of Eq.(4) , we explored three different implementations, which were standard programming in Java (version 22) using double precision, Java programming using BigDecimal class which has unlimited precision for integer computation and improved precision for decimal numbers, and symbolic programming by Mathematica V10 (Wolfram Research, 2014), which is supposed to have unlimited precision in applicable numerical computations. We observed that significant error usually started with smaller m , as expected, because the power factor with large m can help to lower the absolute value of a summand, thus reducing the error accumulation. Also expected is that for a given combination of m and θ , errors become more pronounced with increasing sample size. This prompted us to exam the threshold sample size that leads to significant error in ℙ n ( m ). Table 2 gives the these sample sizes for several values of θ in ℙ n (1) and ℙ n (5). We started with m = 1 instead of m = 0 since ℙ n (0) can be computed directly by Eq.(3) without using Eq.(4) . We define failure of Eq.(4) as a situation in which ℙ n ( m ) has at least 10% error when compared with ℙ n ( m ) from Eq.(11) . View this table: View inline View popup Download powerpoint Table 2. Failure conditions for computing ℙ n ( m ) using Eq.(4) by three different programming methods It is clear from Table 2 that for a wide range of θ , Eq.(4) starts to fail for ℙ n (1) for sample size around 30 − 60, and starts to fail for ℙ n (5) for sample sizes around 40 − 85. Higher precision programmings push the threshold sample sizes only slightly higher, which validates the early statement that internal computation in modern computers is mostly 64-bits format, regardless of programming languages/environments. Consequently Tavaré formula is not recommended in general for numerical computation when n > 50, particularly when the entire range of m need to be considered. Summary A new formula is derived in this paper for the probability of the number K of mutations in the genealogy of a random sample of DNA sequences from a single locus without recombination, drawn from a population evolved according to the constant-in-state model. In the case of constant effective population size, this formula closely resembles Ewens’ sampling formula for the number of distinct alleles in the sample. The new sampling formula is based on the probabilities of different partitions of mutations into coalescent states and achieves significant simplification by recognizing and partitioning mutations into atomic clusters – which are indivisible, bounded within coalescent states, and where permutation of mutations is cyclic. Without the concept of atomic clusters, expressing ℙ n ( m ) as the sum of probabilities for different realizations of mutations in the sample genealogy would lead to complex, multi-layered summations whose complexity grows rapidly with the number of mutations. The new formula can be used effectively for numerical computation with minimal concern for accuracy, since every summand contributes positively. While the efficiency of the computation remains largely unaffected by sample size, it is impacted by the number of mutations due to the need to enumerate all possible partitions of the given number of mutations. In typical applications, the formula should handle computations efficiently with little or no optimization of the algorithm. Moreover, partitions with a larger number of atomic clusters contribute more to the overall probability, so computation efficiency can be improved by prioritizing the evaluation of partitions based on the number of atomic clusters. In contrast, Tavaré’s formula has a limited scope of applicability, primarily due to the error-prone summation of large summands with alternating signs. As such, it is not recommended for general numerical computation of the distribution of K , especially when the sample size is 50 or greater. However, it may still be useful for computing ℙ n ( m ) for large m with smaller sample sizes, potentially in conjunction with the new formula. A Java package implementing the new formula for ℙ n ( m ) is available from the author upon request. The relevance of permutations of cycle type and integer partitions to population genetics was first highlighted through Ewens’ sampling formulas under the infinite-alleles model ( Ewens, 1972 ; Karlin and McGregor, 1972 ), but has since had limited application to other summary statistics. The newly derived sampling formula also successfully connects to permutations of cycle type and integer partitions. The links between theoretical population genetics and the fields of combinatorics and number theory may be deeper than previously recognized. Consequently more summary statistics related to K can be expected to possess such characteristics. Footnotes ✩ Preprint formatted with Elsevier’s template. A few typos in the first two sections were corrected. Although they do not alter the flow of the manuscript in meaningful way, the correction should help to improve readability of the manuscript. References ↵ Abramowitz , M. , Stegun , I.A. , 1972 . Handbook of Mathematical Functions with Formulas, Graphs, and Mathematical Tables. 9 ed ., Dover Publications, Inc ., New York . ↵ Donnelly , P. , 1986 . Partition structures, polya urns, the Ewens’ sampling formula and the age of alleles . Theor. Popul. Biol . 30 , 271 – 288 . OpenUrl CrossRef PubMed ↵ Donnelly , P. , Tavaé , S. , 1986 . The ages of alleles and a coalescent . Adv. Appl. Probab . 18 , 1 – 19 . OpenUrl CrossRef ↵ Ewens , W.J. , 1972 . The sampling theory of selectively neutral alleles . Theor. Popul. Biol . 3 , 87 – 112 . OpenUrl CrossRef PubMed Web of Science ↵ Ewens , W.J. , 2004 . Mathematical Population Genetics . Springer-Verlag , New York . ↵ Fu , Y.X. , 2025 . The central limit theorem for the number of mutations in the genealogy of a sample from a large population . Theor. Popul. Biol., (in press) (bioRxiv doi: 10.1101/2025.01.23.634620 ). OpenUrl Abstract / FREE Full Text ↵ Fu , Y.X. , Li , W.H. , 1993 . Maximum likelihood estimation of population parameters . Genetics 134 , 1261 – 1270 . OpenUrl Abstract / FREE Full Text ↵ Griffiths , R.C. , Lessard , S. , 2005 . Ewens’ sampling formula and related formulae: combinatorial proofs, extensions to variable population size and applications to the ages of alleles . Theor. Popul. Biol . 68 , 167 – 177s . OpenUrl CrossRef PubMed ↵ Hoppe , F.M. , 1984 . Polya-like urns and the Ewens’ sampling formula . J. Math. Biol . 20 , 91 – 94 . OpenUrl CrossRef Web of Science ↵ Karlin , S. , McGregor , J.L. , 1972 . Addendum to a paper of W. Ewens . Theor. Popul. Biol . 3 , 113 – 116 . OpenUrl CrossRef PubMed Web of Science ↵ Kingman , J.F.C. , 1982a . The coalescent. Stochastic Processes and Their Applications 13 , 235 – 248 . OpenUrl CrossRef Kingman , J.F.C. , 1982b . On the genealogy of large populations . J. Appl. Probab . 19A , 27 – 43 . OpenUrl CrossRef PubMed ↵ Liu , X. , Fu , Y.X. , 2015 . Exploring population size changes using snp frequency spectra . Nat Genet 47 , 555 – 559 . OpenUrl CrossRef PubMed ↵ Pybus , O. , Rambaut , A. , Harvey , P. , 2000 . An integrated framework for the inference of viral population history from reconstructed genealogies . Genetics 155 , 1429 ––1437. OpenUrl Abstract / FREE Full Text ↵ Riordan , J. , 1958 . Introduction to combinatorial analysis . John Wiley , New York . ↵ Sloane , N.J.A. , 2024 . Seauence A000041: The on-line encyclopedia of integer sequences : https://oeis.org/ . ↵ Slomson , A. , 1991 . An Introduction to Combinatrics . Chapman and Hall Mathematics , London . ↵ Tavaré , S. , 1984 . Line-of-descent and genealogical processes, and their applications in population genetics models . Theor. Popul. Biol . 26 , 119 – 164 . OpenUrl CrossRef PubMed Web of Science ↵ Watterson , G.A. , 1975 . On the number of segregation sites . Theor. Popul. Biol . 7 , 256 – 276 . OpenUrl CrossRef PubMed Web of Science Wolfram Research, I ., 2014 . Mathematica V10 . Wolfram Research, Inc ., Champaign, Illinois . View the discussion thread. Back to top Previous Next Posted February 05, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following The distribution of the number of mutations in the genealogy of a sample from a single population Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share The distribution of the number of mutations in the genealogy of a sample from a single population Yun-Xin Fu bioRxiv 2025.02.04.636538; doi: https://doi.org/10.1101/2025.02.04.636538 Share This Article: Copy Citation Tools The distribution of the number of mutations in the genealogy of a sample from a single population Yun-Xin Fu bioRxiv 2025.02.04.636538; doi: https://doi.org/10.1101/2025.02.04.636538 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetics Subject Areas All Articles Animal Behavior and Cognition (7622) Biochemistry (17648) Bioengineering (13871) Bioinformatics (41880) Biophysics (21423) Cancer Biology (18558) Cell Biology (25460) Clinical Trials (138) Developmental Biology (13364) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15589) Genomics (22475) Immunology (17711) Microbiology (40327) Molecular Biology (17145) Neuroscience (88473) Paleontology (666) Pathology (2827) Pharmacology and Toxicology (4816) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00