μ Seq: Universal mutation rate quantification via deep sequencing of a single clonal expansion

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 138,632 characters · extracted from preprint-html · click to expand
μSeq: Universal mutation rate quantification via deep sequencing of a single clonal expansion | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results μ Seq: Universal mutation rate quantification via deep sequencing of a single clonal expansion View ORCID Profile Simone Pompei , Alberto Geroldi , Pietro Rivetti , View ORCID Profile Elena Grassi , View ORCID Profile Valentina Vurchio , Giorgio Tallarico , View ORCID Profile Giorgio Corti , Lorenzo Tattini , Gianni Liti , Andrea Bertotti , View ORCID Profile Marco Cosentino Lagomarsino doi: https://doi.org/10.1101/2025.04.17.649315 Simone Pompei 1 IFOM ETS, the AIRC Institute of Molecular Oncology , Milan, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Simone Pompei Alberto Geroldi 2 Dipartimento di Fisica, Università degli Studi di Milano , via Celoria 16, Milan, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Pietro Rivetti 2 Dipartimento di Fisica, Università degli Studi di Milano , via Celoria 16, Milan, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Elena Grassi 4 Department of Oncology, University of Torino , 10060 Candiolo, Torino, Italy 5 Candiolo Cancer Institute – FPO IRCCS , 10060 Candiolo, Torino, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Elena Grassi Valentina Vurchio 4 Department of Oncology, University of Torino , 10060 Candiolo, Torino, Italy 5 Candiolo Cancer Institute – FPO IRCCS , 10060 Candiolo, Torino, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Valentina Vurchio Giorgio Tallarico 1 IFOM ETS, the AIRC Institute of Molecular Oncology , Milan, Italy 2 Dipartimento di Fisica, Università degli Studi di Milano , via Celoria 16, Milan, Italy 7 Pázmany Péter Catholic University, Faculty of Information Technology and Bionics , 1083 Budapest, Hungary Find this author on Google Scholar Find this author on PubMed Search for this author on this site Giorgio Corti 4 Department of Oncology, University of Torino , 10060 Candiolo, Torino, Italy 5 Candiolo Cancer Institute – FPO IRCCS , 10060 Candiolo, Torino, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Giorgio Corti Lorenzo Tattini 6 Université Côte d’Azur, CNRS, INSERM, IRCAN , Nice, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gianni Liti 6 Université Côte d’Azur, CNRS, INSERM, IRCAN , Nice, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Andrea Bertotti 4 Department of Oncology, University of Torino , 10060 Candiolo, Torino, Italy 5 Candiolo Cancer Institute – FPO IRCCS , 10060 Candiolo, Torino, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Marco Cosentino Lagomarsino 1 IFOM ETS, the AIRC Institute of Molecular Oncology , Milan, Italy 2 Dipartimento di Fisica, Università degli Studi di Milano , via Celoria 16, Milan, Italy 3 I.N.F.N, Milano , via Celoria 16, Milan, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Marco Cosentino Lagomarsino For correspondence: marco.cosentino-lagomarsino{at}ifom.eu Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Quantifying mutational processes is crucial for understanding evolution, especially in clinical and experimental settings, but remains challenging. Current methods are labor-intensive and often lack robustness, resulting in inconsistent or biased outcomes, particularly in mammalian cells. We use patient-derived colorectal cancer organoids to introduce µ Seq, a universal framework for inferring mutation rates in diverse biological systems. Our approach extracts mutation rates from deep sequencing of single clonal expansions, with a time gain of tenfold or more compared to a mutation-accumulation line, at the cost of three billion read whole-genome sequencing. µ Seq relies on four critical components: (i) a controlled experimental setup enabling validation (ii) a quantitative estimate inspired by the classic Luria–Delbrück spectrum for subclonal mutations derived from population dynamics, (iii) robust statistical models accounting for sampling noise and sequencing errors, and (iv) a data-analysis pipeline for subclonal mutation detection that compares endpoint populations to a closely related ancestor. Our models establish precise requirements for sequencing depth, genome size, and mutation frequency detection necessary for accurate mutation rate estimates. Crucially, we show that failing to meet these criteria can lead to errors spanning several orders of magnitude. We validate our approach using parallel mutation-accumulation experiments in colorectal cancer organoids, finding mutation rate estimates consistent with previous Mutation Accumulation studies. In particular, our results confirm the expected differences between MSI and MSS tumors. Finally, to demonstrate the adaptability of µ Seq, we apply it to yeast, leveraging multiple independent replicates to compensate for its much smaller genome. The robustness and broad applicability of µ Seq establish it as a powerful and universal tool for mutation rate quantification, spanning applications from cancer research to microbial evolution. Short Abstract 150 words Quantifying mutational processes is crucial for understanding evolution, disease progression, and drug resistance. Current methods are labor-intensive and less robust for mammalian cells and clinical settings. Using patient-derived colorectal cancer organoids as a model system, we developed µ Seq, a novel framework to estimate mutation rates from deep sequencing of a single clonal expansion with a close reference, re-envisioning the Luria-Delbrück spectrum. µ Seq integrates (i) a controlled experimental setup, (ii) robust statistical and population-dynamics models, and (iii) a data-analysis pipeline for subclonal mutation detection. We establish precise requirements for sequencing depth, genome size, and mutation frequency to ensure accuracy despite sequencing errors. Critically, µ Seq enables accurate estimation of mutation rates as low as 10 − 9 mutations per base pair per generation, using only subclonal mutations from a single expansion. We validate our framework through mutation-accumulation experiments in colorectal cancer organoids and yeast, demonstrating its robustness and broad applicability. Introduction In the context of Darwinian evolution, the point mutation rate — defined as the probability per unit time at which genomes acquire base-pair changes during lineage divisions — is a fundamental evolutionary force driving adaptation [ 1 ]. Quantifying this parameter not only provides critical insights into the accumulation of genetic variation over time, but is also crucial for understanding evolutionary dynamics. First, detecting selective forces in a population requires a neutral model that describes mutation accumulation without selection, enabling identification of deviations indicating selective pressure. Second, the mutation rate itself is an evolving, selectable trait [ 2 , 3 ], influencing the modes of adaptation. This trait is subject to different trade-offs. For instance, a low mutation rate is necessary for keeping a low level of deleterious mutations, particularly in large genomes [ 4 ]. However, a sufficiently high rate is crucial for a genome to hit the beneficial mutations that drive adaptation and the development of novel phenotypes. Furthermore, when too many beneficial mutations emerge synchronously by different individuals within the same population, clonal interference can hinder fixation [ 5 – 8 ]. Since changes in the mutation rate can drive vastly different evolutionary outcomes, accurately quantifying this rate is fundamental for any predictive evolutionary analysis [ 9 , 10 ]. The accumulation of somatic mutations is also a fundamental process in aging and a critical driver of carcinogenesis, metastasis and resistance to chemotherapy. In the context of cancer, quantifying the rate of pre- and post-cancerous somatic mutation accumulation is considered essential to understand tumor evolution, adaptation, and the emergence of therapy resistance. It also plays a crucial role in devising personalized treatment strategies [ 11 – 18 ]. This need can be highlighted by an analogy with laboratory evolution experiments in microbes, which show that mutator phenotypes often emerge in challenging environments [ 18 – 20 ]. This parallel has suggested that the demanding conditions within tumors may similarly favor the emergence of mutator phenotypes in cancer [ 21 – 23 ]. Prior to considering any effects of natural selection, it is essential to characterize and quantify the mutation rate itself, evaluating the underlying biological mechanisms that lead to the observed mutation [ 24 ]. Although methodologies to quantify mutation rates in microorganisms are well-established [ 25 – 28 ], they are labor intensive, and adapting similar approaches for mammalian cells, especially within clinically relevant cancer models, remains challenging [ 23 ]. Different approaches have been employed to estimate mutation rates in cancer, each with limitations. Comparative genomics, while leveraging vast datasets, is con-founded by selection and evolutionary processes, providing only average mutation rates over long timescales [ 29 ]. Lineage sequencing [ 30 ], while potentially offering high resolution, can be hampered by tree-reconstruction uncertainties and selection biases. Studies focusing on mutagenic agents under controlled exposure [ 31 ] face challenges in controlling selection pressures and accurately estimating effective generations. Other studies have used subclonal mutation spectra from sequencing data from cancer cells to estimate mutation rates [ 32 , 33 ]. However, these quantitative approaches often yield mutation rate estimates that vary by one to two orders of magnitude, indicating significant inconsistencies. This variability partly arises from assumptions of neutral evolution, which may not hold in patient-derived data where selective effects and biases can skew inferences [ 34 ]. Finally, comparisons between germline and somatic mutation rates [ 35 ] are hindered by difficulties in accurately estimating the number of mitoses in somatic cells. These limitations, as well as the discrepancies between the quantitative estimates coming from different sources, underscore the need for robust and accurate methods for quantifying mutation rates in mammalian systems. An innovative approach has leveraged CRISPR-modified organoid models [ 36 ], using expansion for 7-14 days and sequencing to quantify accumulated clonal mutations. However, this simplification of population dynamics may affect the estimated mutation rates derived from these data. Specifically, unless strong bottlenecks are implemented, selection pressures can influence the detected mutations [ 27 , 37 ]. Additionally, alterations in population dynamics within the knockouts can modify the effective number of generations over a fixed timeframe, introducing bias into mutation rate estimates. To address the challenges associated with uncontrolled population dynamics in this and similar approaches, we recently introduced a sequencing-based mutation accumulation line experiment in patient-derived colorectal cancer organoids [ 23 ]. This method minimizes the influence of selection by employing alternating cycles of cellular expansion and bottlenecks, providing a clear quantitative benchmark comparable to standards achieved with microorganisms [ 28 ]. However, it is experimentally demanding, requiring a minimum of six months of organoid expansion-bottleneck cycles to generate a sufficient sample of clonal mutations. We introduce µ Seq, a method that estimates mutation rates using a single cellular expansion and a genomic reference from a close ancestor, reducing the time needed for mammalian cells by tenfold or more, to a few weeks only (and from weeks to tens of hours for microbes). µ Seq re-envisions the classic Luria-Delbrück fluctuation test by combining an experimental assay with a stochastic mutation model [ 38 – 44 ]. To overcome limitations of similar methods based on subclonal mutation spectra and deep sequencing [ 32 , 33 ], µ Seq offers key innovations: it employs a controlled setup with a nearly-neutral expansion compared to a related ancestor, integrates models for population dynamics and statistical errors, and includes a validated data-analysis pipeline. This streamlined approach efficiently measures mutation rates in patient-derived organoids and microbes, reducing time and resources, and advancing mutation-rate quantification for both scientific and clinical use. Results A theoretical framework of mutation frequency spectrum predicts feasible estimates We start by detailing our theoretical work on estimators of the mutation rate from the subclonal mutation frequency [ 22 , 32 , 41 , 45 ]. µ Seq is a conceptual extension of the classic Luria-Delbrück fluctuation-test experiment [ 38 , 39 , 46 ], which quantifies the numbers of resistant mutants in parallel, neutrally expanding cultures. Our method applies to a single neutrally expanding population, treating each genomic site as an independent unit (analogous to the parallel populations in the original experiment) where subclonal mutations can develop during population expansion ( Fig. 1 ). Thus, our focus is on the so-called “site frequency spectrum”, i.e., the number of bases with a given frequency of subclonal mutations, typically visualized as a histogram of allele frequencies. It is important to note that the site frequency spectrum is not exactly a probability distribution because it excludes the no-mutation class. As a result, the total counts are proportional to the mutation rate multiplied by the number of scored bases (see below) [ 38 , 39 ]. We derive this quantity using population-dynamics mathematical model of an exponentially growing cell population. We assume that cells have a division rate b , a death rate d (per unit time), and a point mutation rate µ (per cell, per division, per nucleotide). This model belongs to a class of models describing populations evolving through a branching process with neutral mutations, under the infinite-sites assumption commonly employed in population genetics [ 47 ]. Our analysis has leveraged significant progress made over the past decade in deriving analytical results for the site frequency spectrum in models of this class. These theoretical advancements have been achieved both in the deterministic limit [ 32 ] and by incorporating stochastic effects [ 45 , 47 – 49 ]. Download figure Open in new tab Fig. 1: Illustration of the expected site frequency spectrum model and its components from population dynamics and errors. (A) We consider the mutation frequencies within a population of cells expanded neutrally from a single clone. The net growth is assumed exponential up to a population size N . During the expansion, cells can divide, die, and accumulate subclonal mutations. (B) Performing a sequencing experiment on the final population implies downsampling the population to an effective size set by the sequencing coverage ( M ). Mutations with an intra-population frequency x 1 /M , originating at the beginning of the experiment, are retained with essentially unchanged frequency. (C) Sequencing errors introduce additional mutations in the observed site frequency spectrum, modifying the sampled frequencies of existing subclonal ones and introducing new mutations at otherwise monomorphic sites. (D) The expected spectrum follows a 1 /x 2 power law, where x is the true intra-population frequency. (E) Sampling effects do not modify the expected shape, which remains a 1 /f 2 power law, where f is the sampled frequency, but the trend remains visible only for f > 1 /M . Mutations with 1 /N < f < 1 /M are typically lost. (F) Sequencing errors introduce another error component, primarily localized at small frequencies, which dominate the spectrum up to a frequency f min . Frequencies above f min generally feel a small bias from sequencing error, and thus retain the original 1 /f 2 law. In the limit of a large population size, i.e., when considering clonal expansions with a final population size N ≫ 1, the site frequency spectrum can be approximated by a continuous function that quantifies the expected number of subclonal mutations present in the population at the end of the experiment, with a frequency x , and reads Here, µ is the point mutation rate per individual per nucleotide per cell division, L is the length of the genome in base pairs (or in general the number of sampled sites), and is the viability (probability of establishment of a colony from a single cell), which quantifies the cell turnover by birth-death events. The spectrum P ( x ) describes the standard 1 /x 2 law of the Luria-Delbrück experiment ( Fig.1A-B ) [ 43 ]. While deviations from this law at low frequencies have addressed [ 43 , 49 , 50 ], in our experiment this region is dominated by sampling and sequencing errors, discussed below. In order to derive an estimator, we accounted for several statistical and population-dynamics effects. First, we included the effect of the sequencing process on the observed subclonal frequencies. This process can be mathematically modeled as a binomial sampling process of N cells, with an effective population size M representing the sequencing coverage. We assume each mutation is sampled in sequenced read with a probability equal to its intra-population frequency x (see Methods). Consistent with previous analytical derivations [ 47 , 48 ], we find that the expected site frequency spectrum of a sampled population, i.e., the expected number of mutations observed with a sampled frequency of f sequencing reads, can be expressed as Hence, the expected site frequency spectrum after sampling retains the Luria-Delbrück 1 /f 2 power law. In the sampling process, mutations with an intra-population frequency x < 1 /M are very unlikely to be extracted. However, if they are, their empirical fraction is biased towards higher frequencies due to the small sample size. This sampling bias occurs because if we can observe M reads, the lowest observable frequency is 1 /M . This introduces a shift in the apparent frequency f of observed mutations with overall frequency x 1 /M , originating at the beginning of the experiment, are retained and their apparent frequency remains essentially unaltered ( Fig. 1C-D ). As a result of these sampling biases, the effective site frequency spectrum maintains the same overall shape as the original site frequency spectrum but includes significantly fewer mutations. Assuming that all L sites have been properly sequenced, the fraction of mutations that are lost is approximately 1 − M/N , calculated as . In a typical bulk sequencing experiment, with the coverage M ranging from 10 to 100 and N around 10 6 , more than 99.99% of subclonal mutations cannot be detected due to sampling limitations. In a second step, we modeled the effect of sequencing errors affecting the sampling induced by the sequencing process. We assumed that errors can occur in each read in the sequencing with a probability ϵ (per nucleotide per read). In a normal short-read sequencing experiment, this quantity is between 10 − 3 and 10 − 2 errors per nucleotide per read. Because of sequencing errors, the observed frequency will be reduced due to false negative reads, i.e., reads that should have been detected but are missed by the sequencing machine, and due to false positive errors, mutations will be detected even in sampled reads corresponding to cells that did not harbor the mutation. We derived an approximated analytical solution for the expected site frequency spectrum of mutations after sampling and sequencing errors (see Methods), which reads with a pre-factor A ⪅ L . The site frequency spectrum is dominated by sequencing errors, described by an exponential function of , the Kullback-Leibler divergence between two Bernoulli processes, sampling and sequencing errors, with success probability f and ϵ resepctively. For f > f min , the site frequency spectrum is described by the Luria-Delbrück spectrum (see Methods for derivation). The value of f min can be computed numerically and corresponds to the frequency where the expected number of mutations from the sequencing errors contribution is equal to 1 ( Fig. 1E-F ). A conservative lower bound of this critical frequency is given by (see Methods) with Mϵ ≥ 1. The above equation also allowed us to consider a third effect, that of sampled sites L . Intuitively, we can think that high-frequency mutations, those that are formed in the first generations of the experiments, are exponentially less likely to be observed than mutations with low frequency, which were formed in later generations of the experiment, where more cells were around to potentially generate them. Analytically, we can use this fact to define a threshold f max given by the frequency for which the expected number of mutations of the LD specturm (net of sequencing errors) equals 1, where we recall that µ is the mutation rate, L is the number of sampled sites, M is the coverage and β = 1 − b/d . Mutations with frequencies above f max will be underrepresented or even absent in the data, leading to a failure in the accuracy of the estimate. Equally, for genomes or numbers of sampled sites L that are increasing small, f max will decrease, negatively affecting the estimate for lower-frequency mutations. In summary, we accounted for three main factors affecting the estimate, the sequencing depth, the number of sampled sites and sequencing errors. Sequencing errors, which typically occur at a roughly constant per-base rate, fix a threshold frequency below which most of the detected mutations is likely due to sequencing errors. Consequently, it will be useless to increase the sequencing depth above the inverse of the error rate, since most of the newly detected mutations would be false positives due to sequencing errors. The inverse coverage 1 /M sets a practical lower limit of mutation frequency detection. Hence, we can expect that a reasonable strategy is to perform experiments where the mean coverage is roughly equal to the inverse of the sequencing-error rate (i.e. typically between 100 and 500), and not much higher, since deeper sequencing will mostly lead to the detection of subclonal frequencies that are biased by sequencing errors, and therefore include many false-positive subclonal mutations. Finally, the number of sampled sites determines the detectability of mutations with large frequencies (which are rarer because they occur in the first generations, when fewer cells are around), and a large number of sampled loci is needed to ensure accurate estimates at the higher end of the frequency spectrum. This establishes both upper and lower bounds for detectable mutation frequencies: only when the bounds are sufficiently separated will the estimate be reliable. By equating Eq. 4 and Eq. 5 , we can estimate the bare minimum number of sites that need to be sampled to ensure a region of the spectrum falls within the Luria-Delbrück regime. This minimum number is given by (see Methods): As an example, techniques limiting the error rate like duplex sequencing[ 51 , 52 ] allow to probe lower frequencies and reduce the lower frequency threshold f min . However, if they are deployed on a small subset of the genomic sites, the upper frequency threshold f max will also decrease, due to the smaller sample size, possibly compromising the estimate. The mutation-rate estimate depends on experimental and sequencing parameters Beyond these qualitative considerations, our theoretical results, summarized by Eq. 3 also provide a principled and immediate method to estimate the mutation rate from the observed site frequency spectrum of a high-depth experiment conducted on a population of cells expanding under conditions as close to neutral as possible. Specifically, for observed frequencies f > f min , where the expected site frequency spectrum aligns with the theory in the Luria-Delbrück (LD) limit of large population, the expected site frequency spectrum depends explicitly on the point mutation rate [ 32 , 43 – 45 ]. By equating the observed frequency spectrum to the theoretical expectation P ( f ) in the LD-dominated region of the spectrum, we can derive an estimator for the mutation rate. This involves inverting the equation and expressing the result in terms of where we consider all counts observed in the range [ f − δ/ 2, f + δ/ 2], with f being the focal frequency, δ the bin size, and L the total number of sites considered. Using frequency bins of size δ (where δ ≪ 1 but should be kept sufficeintly large for efficient sampling) makes the estimate “local” ( Fig. 2AB ), providing two main advantages. First, it allows testing the behaviour of the estimate across different spectral regions, both within and beyond the expected Luria-Delbrück fequency range [ f min , f max ]. Second, unlike a global maximum-likelihood approach, this method is more robust, as discrepancies (such as non-uniform error rates or spurious high/low-frequency peaks), can bias global estimates, even if they are found outside the expected Luria-Delbrück fequency range. These issues become evident when comparing the behaviour of the local estimator applied to data across frequency bins with model predictions. In brief, our model identifies the optimal frequency range for estimating the mutation rate, within which reliability can be visually confirmed, even if the model fails outside this range. Indeed, as demonstrated with data below, other local estimators perform similarly well when applied within the robust frequency range predicted by our model. Download figure Open in new tab Fig. 2: Simulated data validates mutation rate estimation within specific frequency intervals. (A) The site frequency spectrum of a simulated experiment including population dynamics and experimental noise follows the model predictions. Parameters: N ≃ 10 5 cells, µ = 5 10 − 9 /(generation site individual), viability β = 0.47, sequencing coverage M = 100, and sequencing error E = 10 − 4 (see Methods). We identify two threshold frequency values: (i) f min , defined as the frequency at which the expected number of sites affected by sequencing error falls below one and (ii) f max , the first frequency for which the expected number of sites showing the Luria-Delbrück trend falls below one. For f f > f min , it follows the expected f − 2 law. For f > f max the samples sites become insufficient to show the “jackpot” mutations originated in the first generations. (B) Applying the estimator to the same simulated data reveals a plateau in the region f min < f < f max , corresponding to the input value of µ/β≃ 1.06 10 − 8 . For f f max the mutation rate is overestimated in all the bins with at least one count. (C) Increasing sequencing coverage at constant sequencing error, both f min and f max shift toward lower frequencies, and the plateau expands, covering a larger frequency interval. (D) With constant sequencing coverage but increasing sequencing error, f min shifts to the right, and the plateau decreases in size. (E) and (F) The average value of the plateau accurately reflects the ground-truth mutation rate. Box plots (black line=average, boxes=1st and 3rd quartiles, symbols are outliers) show the distribution of effective mutation rates in a set of 100 simulations for increasing sequencing coverage M in (E) and increasing sequencing error ϵ in (F). To test the validity of our estimator, we performed simulations of a discrete-time branching process to model exponential expansions. We also applied a sampling process combined with an error model to reproduce sequencing errors (see Methods). Our simulations consistently validated the results of Eq. 3 when averaging the model output over 100 independent realizations ( Extended Data Fig. 1 ). We then focused on a single realization of the simulation to evaluate the potential efficacy of our estimator in real data. Consistent with our theoretical results, the observed site frequency spectrum in a single realization can be decomposed into two components: one dominated by sequencing errors for f f min up to a frequency f max . At this frequency, the expected number of mutations according to the LD component equals 1, given by f max (see Eq. 5 ). For f > f max , the single realization shows deviations from the expected number. As we discussed above, these deviations are attributed entirely to statistical fluctuations: when the expected number is less than 1, there may be an underrepresentation if no mutations are observed, or an overrepresentation if a single mutation is detected ( Fig. 2A ). Fig. 2B shows that when applying the estimator to simulated data, we observe that it exhibits a plateau consistent with the input parameters within the frequency range [ f min , f max ]. In agreement with our theoretical model, simulated sequencing errors cause deviations for f f max . The values of the critical frequencies define the reliable region of the estimator and depend on the sequencing parameters. Specifically, for a fixed sequencing error rate E , increasing the sequencing coverage results in both f min and f max shifting to lower frequencies (see Fig. 2C ), with the plateau covering a larger frequency interval, in accordance to Eq.s (4,5). With constant sequencing coverage but increasing sequencing error, f min shifts to higher frequencies, and the plateau decreases in size ( Fig. 2D ). Additionally, as we discussed previously f min is a monotonically decreasing function of the sequencing error rate E , while f max is independent of it. Once the plateau region is identified by our theoretical considerations, we verified that the average value of the estimator applied to simulated data aligns very well with the ground truth value ( Fig. 2E-F ), while including counts outside the plateau region typically leads to a systematic overestimation of the effective mutation rate ( Extended Data Fig. 2 ). A controlled organoids experiment enables comparison with mutation-accumulation estimates In order to experimentally validate our method in a clinically relevant setting, we took advantage of sequencing data generated in the context of a mutation accumulation line experiment [ 23 ] conducted on patient-derived organoids [ 53 ] from colorectal cancer, extending the original investigation with additional experiments with a twofold objective. First, since those data were obtained from clonal cultures growing in neutral conditions, they perfectly fit the requirements of our model. Second, the mutation accumulation line experiment performed jointly on the same cells provided an experimental estimate of mutation rates [ 23 ] that could be exploited to benchmark our predictions. Typical mutation-accumulation line experiments are performed in lower organisms with a clonally expanding population undergoing repeated bottlenecks to minimize natural selection and promote the accumulation of neutral mutations through random genetic drift [ 26 – 28 , 54 ]. In our previous work [ 23 ] we adapted this design to higher eukaryotic cells, using colorectal cancer organoids derived from different patients. Figure 3A illustrates the experimental protocol (see Methods). Briefly, the organoids were cloned as single cells in multiple replicas, and passaged repeatedly to allow for the accumulation of de novo mutations. Specifically, to ensure neutral growing conditions, every 18 days the expanding population was subjected to bottlenecks in which the organoids were single-cell dissociated and approximately 100 random individual cells were replated (see Methods for details). After 6 months (221-531 estimated generations, see Methods and Extended data Table S1), the population was cloned again as single cells. High-coverage sequencing (100–150X) was conducted at two stages: first, on an “ancestor” population expanded over ∼ 50 days from the initial clone, and second, on clonal endpoint populations. The Luria-Delbrück estimate quantified the subclonal mutations, while the mutation-accumulation estimate was based on the clonal mutations, which are the differential mutations between the ancestor and the final clone [ 23 ]. Since the underlying population dynamics in the experiment were exactly the same for both estimates, this experimental setup provides an elegant cross-validation benchmark. Download figure Open in new tab Fig. 3: µ Seq yields spectra that agree with theoretical predictions with colorectal cancer patient-derived organoids, showing both the expected sequencing errors and Luria-Delbrück components of the spectrum. (A) Schematic representation of the experimental design and data-analysis pipeline. Organoids were cloned as single cells and expanded to define the ancestor. After six months (221-531 estimated generations, see Extended data Table S1) of a mutation-accumulation line experiment (see ref. [ 23 ] and Fig. 4), single-cell clones were derived again, expanded, and sequenced alongside the ancestor clones from the first expansion. High-coverage whole-genome sequencing data from the ancestral population and endpoint population were compared to identify clonal and subclonal mutations from a global pool of bases. Our model-derived computational estimation pipeline was then applied to this data. (B-D) The site frequency spectra of three organoids derived from different patients follow the model expectation. The blue solid line represents the Luria-Delbrück component (proportional to f − 2 ), while the orange thick solid line indicates the sequencing error component. (E-G) Value of the estimator when applied to the data from panels B-D. The estimated mutation rate reaches a stable plateau (red dash-dotted line) in the region f min < f < f max as predicted by the model. Since the organoid lines are aneuploid, we could not assume that all sequenced sites along the genome were present with the same copy number in cells. Therefore, we used the software Sequenza [ 55 ] (version 3.0.0) to infer the copy number of chromosomal regions. To identify subclonal mutations, we developed a custom pipeline performing a detailed comparison of the endpoint and ancestor sequencing data, described in the Methods section. This pipeline incorporates standard mutation-calling filters as well as specific considerations aimed at increasing the confidence in identifying sub-clonal mutations, reducing calling errors and biases. The workflow consists of three main steps: mapping both ancestor and endpoint sequences to a human genome reference, calculating copy number variations, and performing a pairwise comparison of the mapped data. The pileup files, generated during the mapping process, are then converted into a more compact format, retaining only the common sites to both ancestor and endpoint sequences that are identified as clonal in the ancestor, applying coverage thresholds (and thresholds to account for balanced numbers of forward and reverse reads), identifying potential subclonal mutations, and estimating their frequency. When sequencing reads at a single genomic locus showed evidence of more than one alternative allele (e.g., 90 reads with A, 8 with T, and 2 with C and an ancestral allele with A), we designated only the highest-frequency variant (in this case, T at 8%) as the representative subclonal mutation at that locus. The end product of this procedure is a list of detected subclonal mutations considered reliable by our set of criteria and a definition of the total pool of bases used for constructing the site frequency spectrum. Since we could not distinguish the homologous chromosome origin of mutations, the intra-population mutation frequency, f , was inferred as follows. Genomic regions with different ploidy (Π) exhibit distinct sequencing coverage values. While local coverage M i is proportional to ploidy, for mutations, where all Π copies are sequenced, the effective chromosome coverage reduces to M i / Π, assuming binomially distributed reads across homologous chromosomes. Hence, for each mutation i , given the number of reads k i , local coverage M i , and ploidy Π i , the intra-population frequency was estimated as This correction accounts for the expected single-chromosome reads, M i / Π i . This approximation ensures that estimated frequencies fluctuate around the true intra-population frequency while incorporating ploidy effects. The site frequency spectrum of the sequencing data was constructed by summing over histograms of different ploidy classes (Π = 2, 3). Effects related to ploidy variations were incorporated into our model and properly accounted for in the estimator. As previously noted, copy number variations (CNVs) can introduce biases in model-guided data analysis if not correctly considered [ 34 ]. First, while local coverage M i and sequencing error rate ϵ fluctuate across the genome, such variations are not explicitly included in the expected site frequency spectrum of sequencing errors ( Eq. 3 ). To address this, we approximated the sequencing error component with an exponential function — a standard approach for extreme values — since our focus was on the tail of the distribution ( f min corresponds to the highest frequency associated with sequencing errors). Additionally, in the Luria-Delbrück component of the spectrum, we accounted for copy-number-specific mutation rates by incorporating different values of µ across genomic regions ( µ → µ · Π, as there are Π sites per locus that can mutate). This led to an “overall” site frequency spectrum that includes sites with varying ploidy The model parameters inferred from the data are { A Π , B Π } , the fitting parameters of the exponential function for the sequencing error components, and µ/β for the Luria-Delbrück components. The latter was inferred using the mean value of the plateau observed in the estimator within the frequency range [ f min , f max ]. The value of f min was obtained by extrapolating the sum of the exponential functions using the best-fit values of A Π and B Π , while f max was determined iteratively using a prediction-correction scheme (see Methods). Other parameters ( L and ) were directly estimated from the sequencing data after filtering. Note that the only modification compared to Eq. 3 is the presence of the multiplicative factor , which accounts for ploidy variations across the genome. Similarly, the estimator for the effective mutation rate is given by where all counts observed within the range [ f − δ/ 2, f + δ/ 2] are considered. Here, f is the focal frequency, δ is the bin size, and L is the total number of sites considered ( L = Σ Π L Π ). Additionally, Eqs. 9 and 10 are directly applicable to subcloal mutation data of a specific ploidy. In such cases, one would have for diploid regions and for trisomic regions. Fig. 3BCD shows the experimental site frequency spectrum for three CRC organoids derived from different patients, together with the fit of our model. Two organoids were derived from microsatellite-stable (MSS) tumors and one from a microsatellite-unstable (MSI) tumor. For all three datasets, the data clearly show the two main contributions predicted by our model: (i) the component dominated by sequencing errors for f < f min and (ii) the Luria-Delbrück (LD) spectrum, proportional to 1 /f 2 , for f min < f < f max . The existence of a region in the site frequency spectrum dominated by the expected Luria-Delbrück power-law is further supported by the presence of a plateau identified by our estimator ( Fig. 3EFG ). Importantly, the plateau is consistently observed in the region f min < f < f max across all three datasets. We quantified the effective mutation rate for the three clones by taking the average value of the estimator within this region. To convert this estimate into the mutation rate , we used the estimated values of cell proliferation from ref. [ 23 ] (see Extended Data Table S1) to evaluate . Specifically, in [ 23 ], b − d was determined using cell counts at the end of each bottleneck, assuming from empirical observations that approximately 100 cells were present at the beginning of each bottleneck. The fraction of replicating cells at each stage was quantified using EdU incorporation to estimate b (see Methods). Finally, central values of β were obtained by calculating the ratio of b − d to b (see Extended Data Table S1). As discussed above, the error rate plays a role in setting f min . In our analysis, the effective error rate (10 − 2 ), reflected by the critical frequency threshold ( f min ≃ 0.1 0.2), was higher than the nominal sequencing error rate (10 − 3 ). This higher effective rate accounts for sequencing noise, alignment errors, coverage fluctuations, and site-specific variability (reported e.g. in ref. [ 56 , 57 ]). Lowering this effective error rate would reduce the required number of loci L (affecting f max ). Additionally, in all three spectra, we observed an extra component at high frequencies, with peaks around f ≈ 0.5 and f ≈ 1. The peak at f ≈ 1 is attributed to clonal mutations accumulated during the mutation-accumulation experiment. Such mutations, being clonal, are expected to appear at an observed intra-population frequency of f ≈ 1. In the MSI population, an additional peak at f ≈ 0.5 was observed, possibly due to heterozygous clonal mutations not recognized by our computational pipeline, or errors in the detection of the local ploidy and its changes during the mutation accumulation experiment. As done in standard mutation-accumulation line experiment, we used the counts of the number of sites within the “clonal” peak at frequency close to one to infer the mutation rate. While both experiments involved the same cell population, the mutation rate measurements derived from the mutation-accumulation line and our Luria-Delbrück method rely on orthogonal observational frameworks: clonal mutations accumulated over ∼ 6 months of serial passaging versus subclonal mutations arising during the final expansion phase only. However, the shared origin of the cell population ensures direct comparability between the two approaches. Thus, to validate the mutation rate ( µ ) estimates from our Luria-Delbrück method, we compared them to clonal mutation-derived values and observed strong agreement ( Fig. 4 ). These results also align closely with mutation-accumulation line estimates reported in [ 23 ], despite the different bioinformatics pipelines employed for identifying mutations ( Extended Data Fig. 6 ). In two datasets (1307LM and 0282PR), the predicted ( f max ) values showed partial overlap with frequency regions containing peaks. To mitigate this clear bias, we restricted our estimation to frequencies outside these small overlapping ranges. Download figure Open in new tab Fig. 4: The inferred values of the mutation rates of the three organoids are in agreement with the estimate obtained from mutation-accumulation lines. (A) Schematic representation of the joint mutation-accumulation line and Luria-Delbrück setups. During the six months between the ancestor and endpoint expansions described in Fig.3A , we performed a mutation-accumulation line experiment with several bottlenecks, described in ref. [ 23 ] (see (Extended data Table S1), this provided a cross-validation of two orthogonal ways to measure the mutation rates with cells from the same population. (B) Scatter plot for the inferred values of the mutation rate of the three organoids considered in this study obtained using our estimator (y-axis) of the Luria-Delbrück estimates from the subclonal spectra shown in Fig.3 compared to estimates obtained from the corresponding mutation-accumulation lines (x-axis). These data were analyzed using consistent criteria with the ones used to identify subclonal mutations (see Methods). Bars mark 99% percentile intervals. The estimated value of µ/β (mean value of the plateau) was converted into mutation rate by multiplying by the central value of β , obtained by direct measurements of b − d (growth curves in the exponential phase) and b (using Edu incorporation data) see ref. [ 23 ]. Central values of the dynamical rates are reported in Extended Data Table S1. Interestingly, the Luria-Delbrück estimate was systematically higher than the one obtained from clonal mutations. A possible reason for this discrepancy could be an overestimation of the number of generations in the MAL experiment, due experimental errors as well as to cells saturating growth and thus deviating from exponential growth dynamics between bottlenecks. Some mutational processes, such as homologous recombination, can generate clusters of mutations at nearby genomic sites. To test whether the assumption of independent mutations—where each site is treated as an independent Luria-Delbrück well—holds, we analyzed the distances between neighboring mutations within the frequency range [ f min , f max ], where mostly true positives are expected. If mutations occur independently, their nearest-neighbor distances should follow an exponential distribution. We compared the observed distribution to an exponential expectation, obtained by fitting an exponential model to the data, and found that the distributions largely follow this expectation, with small deviations at short distances ( Extended Data Fig. 3A-F ). These deviations, which affect about 10% of mutations in organoids, suggest a minor degree of clustering. To assess whether such clustered mutations contribute to the slight overestimation of the mutation rate observed in our analysis, we repeated the inference after removing mutations responsible for these deviations. The inferred mutation rate remained very similar, confirming that clustered mutations do not impact the accuracy of µ Seq estimates ( Extended Data Fig. 3G ). Finally, we assessed the consistency of mutation-rate estimates when calculated separately for regions with different detected ploidy. To do this, we compared the results of our method applied to the diploid and triploid parts of the genomes ( Extended Data Fig. 5 ). In the two MSS lines, the triploid regions were largest, while in MSI line, the diploid regions were more prevalent. Despite the reduced sampling of loci within each ploidy subgroup, the results showed that the estimate (Eq.10) in regions with different ploidy agrees with the effective mutation rate estimated using both regions combined, further supporting the robustness of our method. The µ Seq mutation-rate estimate is robust All results, including simulation validations, data-model consistency, estimates across different ploidy regions, and the agreement with mutation-accumulation line estimates, confirm the reliability and internal consistency of µ Seq. To further test robustness, we compared our approach with different analyses in the literature [ 22 , 28 , 32 , 54 , 58 , 59 ]. We identified two key features of µ Seq: (i) correctly identifying the frequency interval for estimation and (ii) a stringent comparison with a closely sequenced ancestor. We then tested the stability of our results against these features. First, with our criteria applied, other mutation rate estimators should perform similarly. Indeed, the cumulative P ( f ) approach proposed by Williams and coworkers [ 32 ] aligns with µ Seq, when applied within the [ f min , f max ] range defined by our method, but deviates otherwise ( Extended Data Fig. 4 , Methods). Second, a crucial element for the correct estimate of the mutation rate is that the identification of subclonal mutations should be done when comparing sequencing of the endpoint to that of the ancestral population (prior to the clonal expansion). To underline this point, we showed that using a standard reference genome as reference instead of the first expansion in our data would lead to incorrect estimates, by several orders of magnitude ( Fig. 5 and Extended Data Fig. 7 ). Importantly, the slope of the site frequency spectrum in this analysis is remarkably close to 1 /f 2 for a large range of frequencies (also beyond our bounds), although with a much larger prefactor. This empirical effect is remarkable as the slope could be a potential pitfall, if mistaken for a validation of the Luria-Delbrück scenario. Download figure Open in new tab Fig. 5: The inferred mutation rate using the frequency spectrum varies significantly depending on the reference genome used to identify clonal mutations. The top panel shows an illustrative sketch of the phylogenetic relationships between different genomes used as ancestral reference. Phylogenetic distances are purely for illustrative purpose, and the y axis of the plot shows the mutation rates quantified by our method. The closest genome to the endpoint, in terms of phylogenetic distance, is the ancestor used as reference in this study, corresponding to the first expansion of the cloned line ( Fig.4A ). Next, we considered the germline genome, which is phylogenetically separated from the previous genome due to somatic and driver mutations. Finally, we used the human reference genome, representing the common ancestor of the human population. Our results show that mutation rate inference can strongly depend on the phylogenetic distance of the reference genome used for calling the subclonal mutations. Crucially, values for shorter distances are in better agreement validation value set by the mutation-accumulatio line (dashed red line). The choice of reference genome isolates a distinct set of genomic regions on the endpoint. Hence, for a distant reference, the spectrum no longer refers to the sites that were actually clonal at the beginning of the mutation-accumulation line experiment. This effect can compromise the accuracy of the inference leading to gross overestimates. The shown data are relative to the MSS clone CRC1502LM. These analyses clearly show that if subclonal mutations are scored from an ancestor that is too distant the biases in the measured mutation rates can become enormous. While we have used a very close ancestor to score (sub) clonal mutations, it is also important to establish how distant an ancestor could be in order to obtain a reliable estimate. While a systematic investigation is beyond the scope of this study scope, we have performed a useful step in this direction, using germline sequencing instead of the close ancestor from an early expansion as genomic reference to call subclonal mutations in one of our clones. This analysis (shown in Extended Data Fig. 8 ) shows that using the germline as genomic reference still provides fairly accurate estimates for . The impact of different reference genomes on the inferred mutation rate is summarized in Fig. 5 . To gain further insight into the sources of bias involved in choosing a distant reference, we explored the statistics of the surplus (false-positive) subclonal mutations identified when using a reference genome other than the close ancestor from the early expansion. Although the subclonal mutations identified in each pairwise comparison (endpoint versus early expansion or versus a more distant reference) had identical intra-population frequencies across references ( Extended Data Fig. 9A-B ), the surplus subclonal mutations detected using either the germline or the human reference genome (hg38) as a genomic reference corresponded exclusively to sites that were not monomorphic in the close ancestral genome. Notably, most of these sites were associated with regions that underwent chromosomal gains (Fig. 9C-D-E-F). Based on these analyses, we surmise that using alternative reference genomes can reveal false-positive subclonal mutations mainly in regions with chromosomal alterations, and additionally in presence of large frequency changes and heterozygous sites. µ Seq is widely applicable Our theoretical predictions on frequency bounds are general and provide valuable insights into the feasibility of estimating parameters based on the effective mutation rate ( µ/β ) and sequencing error ( E ). Specifically, our approximate estimate for the minimum genome length required for reliable inference is given by the expression in Eq. 6 . This formula establishes a critical condition for a minimum genome length, or number of sampled loci L min needed for the inference to be statistically robust. To illustrate this point, consider the case of Saccharomyces cerevisiae (budding yeast), characterized by a mutation rate µ ≃ 3 ∗ 10 − 10 per site per generation [ 25 ] and a negligible death rate ( β ≃ 1). Assuming a sequencing error rate of ϵ ∼ 10 − 3 , the minimum required genome length is approximately L min ∼ 10 8 sites. This is 10 times larger than the actual genome length of budding yeast ( ∼ 10 7 sites), underscoring the challenge of applying the method to organisms with relatively short genomes, especially considering that many sites are filtered out by µ Seq, reducing the effective sample size. If we estimate these sites to be 10% of the genome, one would need to sequence 100 genomes for a reliable estimate. Besides serving to understand the limit of small genome size, we figured that deploying µ Seq in yeast or other microbes could be a very stringent test of this approach within an area with solid and well established estimates of mutation rates[ 26 – 28 , 38 , 54 , 60 ]. Thus, to overcome the limitation on sampled genomic sites, we extended our method to aggregate subclonal mutation data from multiple experimental realizations. This approach was made possible by the data from Liu and Zhang [ 61 ], which includes results from 93 independent mutation accumulation lines of the hypermutator knockout mutant msh2 Δ. Since these data included deep-sequencing of the last expansions, we could quantify both the subclonal mutations form the last expansion and the accumulated clonal mutations, so that the same cross-validation approach that we used for our organoids was possible. The msh2 Δ strain has an expected mutation rate higher than that of wild-type S. cerevisiae [ 28 ], and consequently, fewer sites (order 10 genomes according to our estimate) are needed for reliable inference, making the task more feasible. Our pipeline extracted the site frequency spectrum for each line, allowing us to compute two independent estimates of the mutation rate. As for the organoid data, the mutation-accumulation line estimate of the mutation rate relied on mutations observed at a sampled frequency greater than 0.85 to compute the mutation rate from clonal mutations ( µ clonal ). The µ Seq estimate used the aggregated site frequency spectrum in conjunction with our Luria-Delbrück-based approach. Quantitative data from Liu and Zhang [ 61 ], including growth rates measured at the beginning and end of the experiment, were used to determine the total number of generations. Notably, six datasets showed a peak around a frequency of 0.5 (possibly due to copy-number alterations during the experiment or errors in ploidy calling, as in the MSI organoid line discussed above) and were excluded. However, their inclusion yielded consistent estimates of the Luria-Delbrück mutation rate when frequencies up to 0.5 were retained (see Extended Data Figure 10 ) Importantly, we could not assume a negligible death rate for the msh2 Δ mutant, as the accumulation of deleterious mutations due to the elevated mutation rate results in a reduced viability [ 62 ]. More specifically, the strains were assumed to have an initial viability β 0 < 1. Additionally, in ref. [ 61 ] authors observed a further reduction in the proliferation rate during the MAL experiment by comparing the doubling times at the beginning and at the end of the experiment, noting a decrease of approximately 60%. We conservatively interpreted this observation as an increase in the death rate, likely caused by the further accumulation of deleterious mutations in this hypermutator strain. Hence, the effective viability used to infer the mutation rate was computed as β = 0.6 β 0 (see Methods for derivation), where β 0 , the initial viability of the strain (not available in the data set), was treated as a free parameter assumed set to β 0 = 0.52, according to the values reported in ref. [ 62 ]. Similarly, we re-evaluated the total number of generations by estimating the division rate starting from the proliferation rate and accounting for the death rate, expressed as ,where r ini (see Methods for derivation) is the initial proliferation rate, measured in [ 61 ]. Fig. 6 presents our results. First, our model accurately describes the aggregated data ( Fig. 6A ), displaying the two main components (dominated by sequencing errors and the Luria-Delbrück spectrum). Additionally, the estimator applied to this data reveals a plateau in the [ f min , f max ] interval ( Fig. 6B ), where f max > 0.85 due to the abundant sampling of loci. Download figure Open in new tab Fig. 6: Our method provides reliable mutation rate estimates using multiplexing of Saccharomyces cerevisiae msh2 Δ Strains (A) Aggregated site frequency spectrum from 87 Saccharomyces cerevisiae msh2 Δ strains, re-analyzed from ref. [ 61 ]. The data are shown alongside the expected sequencing error component (orange line), the Luria-Delbrück (LD) component (blue line) and the component associated to clonal mutations (cyan line) computed from our model. (B) Performance of the estimator when applied to the data in panel (A), showing the presence of a plateau in the range of frequencies predicted by the model (where mutation rate estimates are reliable).(C) Boxplot comparing mutation rates estimated with µ Seq and those reported for Saccharomyces cerevisiae msh2 Δ strains in the literature. The mutation rate µ was derived from the plateau value of the estimator in panel B, which measures µ/β . We assumed that the initial viability, β 0 , changes according to the experimentally observed growth rate variation [ 62 ] (see Methods). In the plot, the box represents the mean and interquartile range, while the whiskers extend to the maximum and minimum values. Dashed lines indicate the estimated mutation rates from Ollodart et al. [ 63 ], Serero et al. [ 64 ], and Lang et al. [ 65 ] Assuming an initial viability of β 0 = 0.52 for the msh2 Δ strain (the best estimate consistent with viability values reported in Ref. [ 62 ]), we find good agreement between the mutation rate estimated from the Luria–Delbrück spectrum and independent estimates from the literature [ 63 – 65 ]. Similarly to the case of the CRC organoid data, we tested the assumption of independent sites by the presence of clusters of subclonal mutations at nearby genomic sites ( Extended Data Fig. 11 A-B ). These deviations affect about 30% of the observed mutations, and contribute to the slight overestimation of the mutation rate observed in our analysis ( Extended Data Fig. 11C ). While the use of a mutator strain was instrumental, the robust high-range plateau visible in Fig. 6B shows that the same estimate could be obtained with lower multiplexing. We performed a subsampling of the 87 MALs to estimate, a posteriori , the number of MALs needed for this strain in order for the inference to be feasible. As shown in Extended Data Fig. 12 , we find that the estimate for this strain is possible with about 20 MALs. Given that the mutation rate of the non-mutator yeast strain is about 20 to 30 times smaller than the value we inferred for msh2 Δ [ 66 , 67 ], we anticipate that an estimate of the non-mutator yeast strain would be feasible with µ Seq using approximately 3 − 500 independent expansions, achievable with growth in multi-well plates. These results suggest that the method could be widely applicable to cells from species with small genomes (even bacteria) by multiplexing on the number of experimental realizations in order to obtain sufficient sampling of the subclonal mutations across loci. Discussion µ Seq combines a controlled experimental setup with a mathematical model that accounts for population dynamics and sequencing errors, to estimate mutation rates from the site frequency spectrum of a clonal, neutrally expanding population. Validated using mutation-accumulation line experiments, the method is applicable to mammalian cells and other organisms. Its development relies on theoretical modeling, numerical simulations, and a controlled experimental design. µ Seq stands on two pillars. On one hand, theoretical and computational techniques used to derive and test closed-form analytical expressions for the estimator and the frequency interval where the Luria-Delbrück spectrum can be observed, as a function of key parameters (number of sites, sequencing coverage and sequencing error rate). On the other, the controlled experimental setup with organoids ensures near-neutrality, and fixes experimental parameters (including cell birth-death rates and population size), as well as allowing to use mutation-accumulation lines as a benchmark. Interestingly, µ Seq consistently inferred higher mutation rates than those estimated from clonal mutations. In colorectal cancer organoids, the difference was approximately fourfold. Birth rates measured via nucleoside incorporation (see Methods) suggested 221–531 generations over six months (up to three doublings per day), which may over-estimate true division rates [ 23 ]. In yeast, the µ Seq estimate was approximately nine times higher ( Extended Data Fig. 11C ), potentially also due to inaccuracies in division rate inference (see Methods). Importantly, these discrepancies may also reflect the action of purifying selection during long-term mutation accumulation experiments [ 63 , 68 , 69 ]. Supporting this hypothesis, Liu and Zhang [ 61 ] reported a reduction in growth rate toward the end of their MALS experiments, consistent with the accumulation of deleterious mutations. Similarly, Grassi et al. [ 23 ] observed evidence of negative selection, with dN/dS < 1 in two of the three organoid lines analyzed here (a subset of those reported in the original study). In brief, the observed discrepancy might not be due to a problem of the µ Seq method. Our theoretical framework explains why mutation rate inference—whether using µ Seq or other methods based on subclonal mutation spectra [ 32 , 58 , 70 ]—fails outside a specific frequency range defined by read coverage and site sampling bounds. At low frequencies, sequencing errors dominate; at high frequencies, statistical undersampling renders inference unreliable. In this upper range, discrete peaks in the site frequency spectrum due to clonal mutations can also lead to systematic overestimation. These artifacts may inflate mutation rate estimates by one to two orders of magnitude. An even more pronounced overestimate occurs when the human reference genome is used for mutation calling. While the site frequency spectrum still follows the characteristic Luria–Delbrück 1 /f 2 form, the prefactor becomes substantially larger, resulting in mutation rate overestimates of up to three orders of magnitude. This could account for some of the inflated values reported in previous studies [ 70 ]. This phenomenology warrants further investigation. It may arise from technical biases that are not yet fully characterized or, more intriguingly, from unexplored aspects of population dynamics. Our analyses ( Extended Data Fig. 8 and Fig. 5 ) suggest that using a distant genomic reference alters the detection of subclonal mutations, particularly in excluded regions. These regions often appear monomorphic when compared to a distant reference but are in fact polymorphic in a nearby genomic background. In our data, this was primarily due to chromosomal alterations, but similar biases could arise from heterozygous mutations or frequency shifts that occurred between the reference genomes. On a broader level, µ Seq could support evolutionary studies across diverse species, including microbes. It enables pooling data from experimental replicates (or multiplexing by barcoded clones in a single expansion) to boost the number of sampled loci, which is essential for organisms with small genomes or low mutation rates. Despite the higher sequencing cost (approximately three billion reads are required), the method is faster and more scalable than traditional mutation-accumulation lines [26– 28, 60]. Future improvements may involve shallow sequencing of the ancestor and/or integration with lineage-tracking technologies [ 29 , 30 ]. In mammalian systems, where no standard exists for quantifying somatic mutation rates, µ Seq offers a unique opportunity to investigate the links between somatic mutation, cancer risk, and adaptation [ 13 , 24 , 71 ]. From a cancer evolution perspective, µ Seq enables the detection of inter-patient variability in mutation rates [ 21 ], even among colorectal tumors classified as microsatellite stable [ 23 ], based on differences in their mutation spectra. Clinically, µ Seq can be readily applied to patient-derived organoids within a few weeks, offering a speed advantage over traditional approaches—provided that mutational processes are not substantially altered in culture. This assumption can be tested by comparing mutational signatures and by applying the method across large patient cohorts. However, direct application to tumor biopsies presents additional challenges. Accurate mutation calling requires deep sequencing and a closely related genomic reference (e.g., patient-matched germline or an earlier tumor sample), conditions that are often unmet in existing datasets. Moreover, the method assumes neutral expansion and minimal spatial constraints during tumor growth. These assumptions can be explicitly tested; and even when they are violated, µ Seq still provides a useful null model for identifying deviations due to selection [ 41 , 72 ]. The mutation rate influences both the accumulation of random genetic errors and the emergence of mutations that can drive uncontrolled cell growth or therapy resistance. From a clinical perspective, elevated mutation rates can have adverse effects due to the increased likelihood of beneficial (from the tumor’s viewpoint) mutations. However, excessively high mutation rates may lead to “mutational meltdown” through the accumulation of deleterious mutations, and may also increase tumor visibility to the immune system. These opposing effects suggest that tumor mutation rates may be shaped by evolutionary trade-offs. Accordingly, tumors undergoing different mutational processes may require distinct treatment strategies informed by evolutionary reasoning. For example, microsatellite instability (MSI) colorectal cancers often harbor mutations in mismatch repair (MMR) genes—typically MLH1 or PMS2, and less frequently MSH2 or MSH6—resulting in elevated mutation rates. Despite this, MSI tumors generally have better prognosis than microsatellite stable (MSS) tumors and show strong responses to immunotherapy, likely due to increased neoantigen production [ 73 , 74 ]. However, some highly mutated MSI tumors also exhibit immune evasion through specific driver mutations [ 75 ]. In contrast, the behavior of MSS tumors with fast vs. slow mutation rates remains less well understood, though there is evidence that higher mutational burden in MSS tumors may correlate with better responses to immunotherapy [ 76 ]. MSH2 is a key MMR gene, and mutations in this gene have been leveraged in yeast-based genetic screens [ 77 ]. In humans, germline variants in MSH2 dramatically increase mutation rates and cancer risk, as observed in hereditary nonpolyposis colorectal cancer (HNPCC or Lynch syndrome) [ 78 ]. µ Seq offers a promising avenue to assess the functional impact of variants of uncertain significance by directly measuring their effect on mutation rate in patient-derived cells. This approach provides a bridge between yeast-based functional screens and patient-level data. Crucially, it enables mutation rate measurements without requiring time-consuming mutation accumulation experiments, potentially accelerating our understanding of genetic cancer risk and informing personalized medicine. Methods Theoretical modeling Site Frequency Spectrum (SFS) of neutral mutations in an exponentially growing population of cells We focused on the site frequency spectrum of an exponentially growing population of cells in continuous time. In this population, cells divide with rate b , die at rate d and accumulate mutations with a rate µ (per cell, per division, per nucleotide). Different analytical results for the SFS in this class of models, specifically for any population evolving according to a branching process with neutral mutations under the infinite-sites assumption of population genetics, have been derived in the last decade, both in the deterministic limit [ 32 ] and including stochastic effects [ 45 , 47 – 49 ]. Here, we considered the exact solution presented in [ 49 ], focusing on the fixed time limit, defined as the average time until the population reaches a final size N . In this limit, and for N ≫ 1, the expected number of mutations observed in j cells reads where is the probability of establishment of a cell (“viability” in experimental terms), i.e., the probability that a single cell (or a newly generated cell within a population) will establish, i.e. give rise to a viable colony (progeny) [ 47 , 48 , 79 ]. In the limit N → ∞ , Eq. 11 can be replaced by the continuous limit for the expected intra-population frequency , which reads [ 47 , 48 ] Eqs. (11, 12) are a valid estimate for j < N in the fixed- N limit, which involves considering an exponentially growing population precisely up to N cells, with an adjustment for j = N . Conversely, Eqs. (11, 12) were shown to be inaccurate in the limit as β → 0 and for . However, these frequency intervals, where Eqs. (11, 12) fail, are not relevant to our experimental situation because (i) the experiments were not performed with a fixed N endpoint, and (ii) for the range of experimentally relevant β , this transition occurs in the region of the spectrum dominated by sequencing errors (see below). SFS of a sampled population with sequencing errors We modeled the expected site frequency spectrum of mutations observed in a sequencing experiment. The sequencing process involves sampling the genome of a population of N cells, which we modeled as a binomial sampling process, with an effective population size M representing the sequencing coverage. In our model, each mutation is selected for sequencing with a probability equal to its intra-population frequency x . Hence, the expected spectrum of a sampled population, i.e., the expected number of mutations observed in k sequencing reads can be expressed as [ 47 , 48 ] where ℬ k ( M, x ) is the standard binomial distribution (for M number of attempts with a success probability x and k number of successes) and P ( x ) is the continuous site frequency spectrum estimated by Eq. 12 . Because of sequencing errors, the observed frequency of a mutation will differ from the intra-population frequency. Denoting with ϵ the probability per basepair of a sequencing error, the observed frequency will be reduced because of false negative, i.e., mutated sites that should have been detected, but are missed by the sequencer due to sequencing errors: Conversely, because of false positive errors, mutations will be detected even in sampled sequences that did not harbour the mutation These two effects combined together can be captured by considering an effective sampling probability which can then be used to compute the expected spectrum accounting for sampling and sequencing errors, as follows We have validated this expression with with simulated data (see Extended Data Fig. 1 , continous lines). Here, the second term accounts for all the sites that do not harbor subclonal mutations with a frequency greater than (which are in total ), for which only false-positive mutations appearing with probability equal to the sequencing error E can be detected. Since the sequencing error probability is expected to be small ( ϵ < 10 − 2 )[ 80 , 81 ], the expected site frequency spectrum P ( k ) is dominated by sequencing errors for small values of k , while it maintains the expected trend described by Eq. 13 in the right tail. This effect becomes even more explicit when considering the following approximate solution to Eq. 14 : with k min ≥ 2, which is in very good agreement with simulated data (see Extended Data Fig. 1 , dashed lines). The crossover between the two regimes (dominated by sequencing errors and by neutral mutations, respectively) occurs when the expected number of sequencing errors is close to zero. The value k min is then implicitly defined by This equation can be solved numerically. When the sequencing coverage is sufficiently large, one can consider the continuous limit of Eq. 15 , which expresses the histogram of the observed frequencies and use the approximated expression where is the Kullback-Leibler divergence between two Bernoulli processes, one with success probability f and the other with success probability ϵ . In the limit ϵM ≥ 1, Eq. 18 can be approximated further by noting that the binomial component of P ( f ) peaks around f = ϵ , yielding Thus, Critical frequency values (f min , f max ) and minimal number of sites required for the inference (L min ) A key aspect of our inference method is to define the frequency boundaries of the site frequency spectrum, where the Luria-Delbrück spectrum dominates over sequencing error components. The lower boundary, f min , represents the upper-bound frequency above which mutations attributable to sequencing errors become rare. Mathematically, this corresponds to the frequency where the sequencing error component of the site frequency spectrum is equal to one, because above this value we expect to find less than one site in our sample where purely false-positive mutations due to sequencing errors realized such a frequency. Equally, the upper boundary, f max , can be defined as the frequency at which the expected value of the Luria-Delbrück spectrum equals one, because above this frequency value we expect to have less than one site in our sample where the population dynamics gives rise to such high-frequency mutations. The high-frequency mutations are exponentially more rare in an exponentially expanding population, because they are due to “jackpot” events, where the mutation occurred in the first generations of the expansion [ 82 ]. Thus, in order to observe these jackpot events, one needs to sample many sites. Consequently, the value of f max critically depends on the number of sampled sites ( L ), because frequencies where the expected number of Luria-Delbrück sites is less than one are dominated by sampling (of sites) noise, leading to biased estimates (see Fig. 2 ). Both of these frequency boundaries can be approximated analytically using our model. In the limit ( ϵM ≥ 1), we obtained an approximate analytic expression for f min by setting Eq. 20 equal to 1, yielding Similarly, we define f max as the frequency at which the expected number of mutations in the Luria-Delbrück spectrum (excluding sequencing errors) equals one, Here, µ is the mutation rate, L is the number of sampled sites, M is the sequencing coverage, and β = 1 − b/d . Mutations with frequencies above f max are expected to be underrepresented or even absent in the data, leading to inaccuracies in estimation. Importantly, for genomes or datasets with small number of sampled sites L, f max decreases, further affecting the estimation of lower-frequency mutations. The frequency range where a reliable inference is possible is therefore defined by f min < f < f max . The equality f max = f min determines the threshold where limited sampling of sites and sequencing errors make the inference impossible. This sets a minimum number of sequenced sites ( L min ) required for inference. Using the analytical approximation Eq. 21 and recalling that , we derive the following condition: which, in the limit L min ≫ 1, is approximately solved by It is interesting to note that this threshold does not depend on the coverage M , which has the same effect on both boundaries of the reliable inference interval. Connection with existing estimation methods The site frequency spectrum of the intra-population frequency ( Eq. 1 ) can also be derived using standard methods commonly employed for deriving Luria-Delbrück statistics in the deterministic limit. More specifically, assuming the population grows exponentially as N ( t ) = e ( b−d ) t , the probability for a mutation to be observed at any frequency ≥ x can be computed [ 32 , 45 , 82 , 83 ] as where t x = − log[ x ] / ( b − d ). Hence, the expected cumulative number of mutations observed at intra-population frequency ≥ x is given by The expected number of mutations observed in the interval [ x min , x max ] reads: which is mathematically equivalent to where P ( x ) is given by Eq. 1 . This mathematical identity, however, holds only when considering the intra-population frequency x . Since the sampled population frequency f differs from the true frequency due to sampling effects, the application of this derivation to empirical data and its interpretation require an extension. However, since the site frequency spectrum of the sampled frequency retains the same mathematical form , above the sampling error threshold one can use the equivalent mathematical expression of Eq. 29 , replacing x → f , to relate the cumulative counts of mutations to the mutation rate As explained in the main text, keeping into account the additional error sources, the Luria-Delbrück spectrum is expected to be observed in the range [ f min , f max ]. Thus, the estimator in Eq. 30 can be applied by performing a linear fit of the total mutation counts observed in [1 /f, f max ] with f in [ f min , f max ] (computed according to our definitions) to obtain an estimate of the mutation rate equivalent to our proposed method. This estimation method yields consistent estimates to ours (see Extended Data Fig. 4 ). Numerical simulations Simulations were performed using a discrete-time algorithm for a birth-death branching process. Each step corresponds to a new generation of cells that are produced synchronously from the cells in the previous generation, with cell death. At each generation, each of the two daughter cells can die with probability 0 ≤ p 0 < 0.5 (where p 0 is an input parameter of the simulation). Therefore, after a division, the simulation yields 2 live cells with probability (1 − p 0 ) 2 , 1 live cell with probability 2 p 0 (1 − p 0 ), 0 live cells with probability . Starting from an initial “root” clone, the model generates a tree structure. Each node of the tree represents an individual in the population, each layer of the tree represents a generation, and comprises the set of individuals that make up the cell population for that generation. Once the population has reached a fixed size N (set as an external parameter), cell replication is stopped, and a process of random mutation is simulated on the tree for each base. In this process, the expected number of live cells at generation t grows exponentially as where β = log[2(1 − p 0 )]. Hence, the results obtained for a continuous process where N ( t ) = exp( βt ) are in excellent agreement with our simulated results using β = log[2(1 − p 0 )] (see Extended Data Fig. 1 ). The mutation rate µ is a key parameter in this process, representing the probability of a mutation occurring per base per cell division. During the simulation, we assigned a random number of mutations to each internal edge of the tree, drawn from a Poisson distribution with an expected value of µL , where L ) is the length of the in-silico genome of the simulated cells and serves as an input parameter. Each mutation occurring on an internal edge is transmitted deterministically to all descendant cells. At the end of the simulation, we focus on the subtree that includes all internal nodes with at least one surviving descendant in the final generation, excluding extinguished lineages that do not contribute to the observed mutations. For each mutation in this subtree, we assign an intra-population frequency of , where 𝒩 is the number of surviving cells in the final generation within the subtree associated with the edge where the mutation occurred. This ensures that each mutation’s frequency reflects its presence in the surviving cell population. Simulation of sequencing errors and sampling effect due to sequencing The output of the discrete-time branching process simulation consists of a list of intra-population frequencies, each corresponding to a mutation that occurred during the simulation. The length of this list, L mut , represents the total number of sites that experienced a mutation during the expansion process simulated by the branching-process model. To account for effects related to sequencing in our simulated data, for each frequency x , we extracted a random number from a binomial distribution with a total number of attempts equal to M (sequencing coverage) and with probability x + ϵ −2 xϵ . Here, E is an input parameter representing the sequencing error probability. The additive terms in the modified probability x s→ x + ϵ− 2 ϵ account for both false positives and false negatives in the sequencing process (see the Methods subsection on theoretical modeling). We also modeled sequencing errors for sites that did not acquire mutations in the simulation process, totaling L − L mut sites. For these sites, sequencing errors are expected to produce a number of false-positive reads that follow a binomial distribution with parameters M (number of attempts) and ϵ (extraction probability). Since L is typically large (around 10 9 ) while L mut is relatively small (on the order of 10 3 to 10 4 ), generating L − L mut binomial random numbers could be computationally intensive. Therefore, we generated a list of L − L mut random numbers from a multinomial distribution with L − L mut trials and probabilities given by B k ( M, ϵ ). By summing up these two contributions, we simulated a site frequency spectrum of mutations that includes the model of experimental errors due to sequencing errors and sampling of both reads and genomic sites. This data structure consists of a histogram of the number of mutations with k reads, for k = 0 to M . Experimental data and analysis of MALS in CRC organoids Experimental setup The experimental protocol for the mutation-accumulation line experiment was previously described in Grassi et al. ([ 23 ]); here, we summarize the key steps. Organoids were derived from MSS (ID 1307, 1502) and MSI (ID 0282) colorectal cancer samples via patient-derived xenografts (PDXs) in NOD-SCID mice to improve the establishment rate from 60% to 80% [ 53 ]. Single-cell clones were isolated by seeding tumoroids in 96-well plates, followed by microscopic selection of wells containing single cells. After ∼ 6 weeks, cells were archived or subjected to whole-genome sequencing (WGS) at T0. This first expansion was used as “ancestor” for our Luria-Delbrück experiment. Mutation accumulation lines were expanded through periodic bottlenecks ( 100 cells replated every 18 days) to promote neutral evolution. After 5–6 bottlenecks, a final ∼ 6-week expansion was performed before sequencing at T1. This final expansion was used as endpoint for our Luria-Delbrück experiment. The entire experiment spanned 180–190 days, varying slightly between clones. Whole-genome sequencing and data processing Genomic DNA from ancestor and endpoint populations was sequenced on Illumina NovaSeq or HiSeq with 150 bp paired-end reads and a mean coverage depth of 120x. Pileup files for ancestor and endpoint sequences were generated from BAM files using samtools [ 84 ] ( https://www.htslib.org/doc/samtools-mpileup.html ). Each clone’s data was further split based on copy number (CN), producing four final files per clone: two ancestor files (CN = 2, 3) and two endpoint files (CN = 2, 3). Reads were aligned to the hg38 reference human genome using bwa [ 85 ] (v0.7.17-r1188) [ 23 ], followed by duplicate marking with picard [ 86 ] (v2.18.15). Pileups were obtained from BAM files using samtools (v1.9, parameters ‘-q 1 -B’ to exclude multi-mapping reads and disable base alignment quality estimates). Copy number variation analysis was performed with sequenza [ 55 ] (v3.0.0), using tumor-normal matched samples and marked duplicate alignments as input (see [ 23 ] for details). Quanfitication of dynamical rates Cell division rates ( b ) were estimated using EdU incorporation assays, where the fraction of EdU-positive cells after a 3-hour pulse was used to infer the daily DNA replication rate. The total number of replications during the mutation accumulation (MA) phase was then obtained by multiplying this rate by the experiment duration. The growth rate ( b − d ) was estimated from cell counts before each 100-cell bottle-neck, fitting an exponential growth model. The average population doublings per day were computed as log 2 ( c/ 100) divided by the time between bottlenecks. As expected, this estimate was lower than EdU-based division rates due to cell death. These values were obtained from ref.[ 23 ], and the numerical values for the three organoids analyzed in this paper are provided in Extended Data Table S1. Mutation filtering procedure For both ancestor and endpoint samples, genomic regions were analyzed based on their ploidy. Specifically, for each clone, two pileup files were generated, corresponding to regions with Π = 2, 3. Since sequencing was performed at both the ancestor and endpoint stages, this resulted in a total of four pileup files. To identify clonal and sub-clonal mutations, we compared the ancestor and endpoint sequences within genomic regions of the same ploidy. The filtering process was applied as follows: In the ancestor sequencing, only sites where coverage fell within [Mode − st.dev, Mode+st.dev] were retained, where mode and standard deviation were computed separately for each Π in the pileup file of the ancestor. By enforcing this criterion, we also reduce the probability of including sites affected by ploidy changes, as the strict coverage filter helps exclude regions with abnormal copy numbers that may not be flagged by standard annotations. Among these, only monomorphic sites—where all reads carried the same allele—were selected, defining the set of ancestral monomorphic sites, which could develop mutations over time. In the endpoint sequencing, each site in the ancestral monomorphic sites was compared to its corresponding state in the endpoint sequence, retaining only those where coverage remained within [Mode − st.dev, Mode + st.dev], with mode and standard deviation computed separately for each Π in the pileup file of the endpoint. Again, this strict coverage filter also reduces the likelihood of including sites with local ploidy changes that may not be flagged by standard annotations. If a site was polymorphic at the endpoint, the variant allele was identified as the one with the highest read support (i.e., highest empirical frequency). For these polymorphic sites, the balance of forward and reverse reads was evaluated using the following inequalities: where N forward and N reverse represent the number of reads of the variant allele mapped in the forward and reverse orientations, respectively. This criterion is based on a binomial expectation: if an allele is sampled in a balanced manner, the statistics of forward and reverse read counts should follow a binomial distribution with an expected fraction of 1 / 2. By imposing this condition, we retained all sites whose observed frequency was within one standard deviation of 1 / 2. This filter is classically applied in mutation callers [ 87 ] to remove amplification errors from sequencing, which are proxied by an imbalance between the number of reads corresponding to the forward strand of DNA versus the reverse strand. Sites that passed this criterion were classified as genuine mutations and retained for further analysis, while those that failed were excluded from the ancestral monomorphic sites count, as their mutation status remained uncertain. Additionally, reference skips or deletions of the reference base reported in the pileup files were ignored. Hence, if an indel was present in either the ancestor or endpoint, the site was excluded from the pool of possible sites exhibiting subclonal mutations. Construction and analysis of the SFS of organoids sequencing data Computation of mutation frequency and construction of the site frequency spectrum For each detected mutation i , we considered the number of reads k i ≥ 0, the local coverage M i , and the local ploidy Π i . The intra-population frequency was estimated as This correction accounts for the fact that the expected number of reads for a single chromosome is M i / Π i . To construct the site frequency spectrum, we first grouped mutations according to their associated genomic regions, ensuring that mutations from regions with the same ploidy (Π) were binned together. The resulting histogram, denoted as , was computed separately for each ploidy class and comprised L Π sites, where L Π represents the total number of sampled sites considered after filtering (see previous section). Finally, the complete site frequency spectrum, , was obtained by summing the histograms over all ploidy classes using the same binning scheme. This procedure ensured that the contributions from different genomic regions were correctly aggregated while preserving the frequency distribution structure. We also checked that the different genomic regions lead to consistent estimates when considered indepently. Sequencing error component of the spectrum In sequencing data, the value of the local coverage M i fluctuates, and the error rate per site ϵ could also fluctuate. These fluctuations are not included in the expected SFS of sequencing errors given in Eq. 3 , which assumes constant values. To model this aspect, we used an exponential function, as we were primarily interested in the tail of the distribution, making a standard extreme-value distribution appropriate for the estimate. To identify the sequencing error component of the site frequency spectrum, we fitted the spectrum up to a frequency ≃ 0.15 using an exponential function for each ploidy separately. The total sequencing error component was then The value of f min was determined by best-fit values, imposing the condition . Luria-Delbrück component of the spectrum and estimator of the mutation rate Equations 15 and 16 are central results of our theoretical modeling to infer the mutation rate from sequencing data. To apply these equations, adjustments for genome ploidy were necessary. Specifically, in a genomic region with ploidy Π, the mutation rate increases by a factor of Π compared to the monosomic case, as all Π chromosomes can mutate ( µ → µ Π). Accounting for these modifications, the expected number of neutral mutations in a genomic region with ploidy Π is where we recall that L Π is the number of sites sampled in a genomic region with ploidy Π. Extending this to the whole genome, we get where L = ∑ Π L Π and is the average genome ploidy. The derivation of the estimator involves inverting the equation and expressing the result in terms of the observed frequency spectrum , obtaining where we considered all counts observed in the range [ f − δ/ 2, f + δ/ 2], with f being the focal frequency, δ the bin size, and L the total number of sites considered. Analogously, the value of f max was determined by adjusting Eq. 22 to incorporate the effect of ploidy variation where is the average of M i / Π i (coverage per ploidy) over all the L sampled sites, and reflects the average number of reads of a single chromosome across all the genome. Since f max depends explicitly on the value of the inferred effective mutation rate we used an iterative prediction-correction procedure to infer its value. We first estimated with f max ≈ 1.5 f min , then computed f max using Eq. 38 and subsequently iterated until convergence. Additionally, we constrained f max to not exceed , the frequency at which the expected spectrum component from clonal mutations equals 1 (see next paragraph). Inference of the mutation rate from clonal mutations produced over the 6-months mutation-accumulation line experiment To infer the mutation rate from mutations accumulated during the several expansions and bottlenecks of the mutation-accumulation line experiment (see Fig. 4 in the main text), we considered two methods: (i) using previously inferred values [ 23 ], and (ii) a self-consistent estimate based on mutations identified with our pipeline. Key to this inference is the identification of clonal mutations distinct from the ancestor, as these represent putative mutations accumulated during the experiment (which can only be clonal due to the bottlenecks). While these mutations have an intra-population frequency of 1, they originate from only one chromosome in a genomic region with ploidy Π. Hence, a site sampled by M i reads in genomic region with local ploidy Π i , that has accumulated a clonal mutation, has an expected fraction number of reads . Assuming that the total number of reads associated to the focal chromosome be binomial distributed, i.e. we obtain the following condition for the expected value of a mutation with agiven empirical frequency to be effectively clonal, where consistently with our way of computing the frequencies of the mutations (see Eq. 32 ). Building on this assumption, we modeled the clonal mutation contribution to the site frequency spectrum in a genomic region with ploidy Π as where 𝒩 ( f |1, σ ( f clonal ) is a normal distribution with mean = 1 and standard deviation , where we recall that is the average of M i / Π i over all the L sampled sites, and reflects the average number of reads of a single chromosome across all the genome. To estimate the total number of mutations likely associated with this component, we (i) isolated the counts of the site frequency spectrum constructed from the data in the range | f − 1| ≤ σ ( f clonal ) and (ii) performed a fit of a normal distribution of the form above to the extracted data, with a prefactor K Π . The total number of clonal mutations was then defined as where δ is the bin size used for the frequency histogram. From this number, the clonal mutation rate was computed as where L Π is the total number of sites sampled in the genomic region with ploidy Π and N generations was computed according to the population dynamics parameters of the organoid line, measured in ref. [ 23 ] as bT , where b is the division rate and T is the total duration of the experiment (see Extended Data Table S1). We estimated µ separately for different genomic regions and then computed a weighted average of these values, with weights L Π . The inferred mutation rates are reported in Extended Data Table S2. We found very good agreement (i) between the mutation rates estimated in the two genomic regions (Π = 2 and Π = 3) across all clones, as well as (ii) between our self-consistent estimate and the one presented in ref. [ 23 ] (see Extended Data Fig. 6B ). This procedure was applied separately for each ploidy. The component associated with clonal mutations was obtained by summing the two normal distributions for each ploidy, with their prefactors inferred from the fit The value of was determined by best-fit values, imposing the condition . Experimental data and analysis of MALS in the S. cerevisiae msh2 Δ strain Description of the datasets from Saccharomyces cerevisiae sequencing for the model application We retrieved sequencing data from ref. [ 61 ], a dataset including sequencing results from 93 independent mutation accumulation lines of msh2 Δ haploid strains. All mutation-accumulation lines had their own ancestor clone used to verify the contribution of new mutations in the final endpoint sequencings. Mutation-accumulation lines were expanded for approximately 1000 generations with approximately 60 bottlenecks. The sequencing of mutation-accumulation lines was downloaded from NCBI’s Sequence Read Archive (SRA) using Fasterq-dump (version 3.1.1, https://github.com/ncbi/sra-tools ), a tool that extracts data in FASTQ or FASTA format from SRA accessions (e.g., fasterq-dump –progress -e 8 –split-files SRR14744009). Indexing and mapping of the fastq files with the Saccharomyces cerevisiae reference genome (paired-end reads were aligned on the R64-1-1 S288C Saccharomyces reference sequence) were performed using scripts from a more complex pipeline called MuLoYDH, implemented by Tattini et al . in ref. [ 54 ]. Bash scripts used bwa (version 0.7.17-r1188) for indexing and mapping. Once the BAM files were obtained, we used samtools (version 1.20) with options samtools mpileup –min-MQ 5 –ignore-RG to create pileup files for each of the mutation-accumulation lines. For the yeast files, the division based on ploidy was done before we downloaded the fastq files, selecting the sequencing data with a confirmed ploidy value (CN = 1). All other steps, including the construction of pseudo-pileup files, the creation of common files, the application of filters, and the generation of the mutation list, were carried out as explained above for the organoid data. Quantification of Dynamical Rates To convert clonal mutation counts into mutation rates, we relied on quantitative measurements of birth and death rates from ref. [ 61 ]. In their study, net growth rates r = b − d were measured at the beginning ( r ini ) and end ( r end ) of the experiment, showing a reduction of approximately 60% ( r end = 0.6 r ini ). The total number of generations was computed using the log ratio of the number of cells at the beginning and at the end of each bottleneck, leading to an estimated total of ∼ 1511 generations over the duration of the mutation-accumulation line experiments ( T ), which consisted of 80 bottlenecks, each with a 48-hour expansion period. We reasoned as follows to quantify parameters used in our inference. The observed reduction in growth rate was assumed to result from an increased death rate due to the accumulation of deleterious mutations, which was expected given the hypermutator nature of the strain. Modelling the viability at the initial time point as we derived the following relationships: and Imposing the condition we estimated the viability at the end of the mutation-accumulation line experiment as This value was used to infer the mutation rate from the Luria-Delbrück component of the spectrum, explicitly depending on the initial viability ( β 0 ). The division rate b was expressed as Using these relationships, we corrected the total number of generations, now obtained as Since the number of generations was estimated from the log ratio of cells at the beginning and at the end of each expansion, assuming exponential growth. Since the growth rate was found to have declined at the end of the experiment, one can not assume uniform growth rate in each expansion. To best estimate for the average growth rate can then be obtained by averaging the first and last measured, leading to the expression: Thus, correcting for viability, the total number of generations is given by: Finally, for the inference of the mutation rate from the clonal mutations, we used the expression which explicitly depends on the value of β 0 . Analysis of subclonal mutation frequencies from the last expansion in the yeast experiments In order to construct the site frequency spectrum with a sufficient number of sampled sites L , we aggregated sequencing data from ≃ 90 independent experiments, subsequently applying the same methods described for organoids. The yeast strains analyzed were strictly haploid, meaning that each genomic site had a single chromosome copy, so we set Π = 1 for all sites. Although ploidy changes might have occurred during evolution, our strict coverage filtering likely excluded genomic regions that could have experienced duplications, reducing the risk of misinterpreting local copy number variations. We applied the same procedure as for the organoid data to infer the model parameters of the sequencing error and Luria-Delbrück components. However, for the clonal mutation component, we used a slightly different functional form. Since the strains were haploid, all sequencing counts corresponded to a single chromosome, eliminating the need to account for variability in read counts per chromosome. In this case, fluctuations in the expected clonal frequency (which remains 1) were primarily driven by (i) sequencing errors and (ii) local coverage variations. To model these fluctuations, we fitted an exponential function to the empirical spectrum in the range 0.85 ≤ f ≤ 1. This function was then used to determine the threshold frequency by identifying the frequency at which the fitted function equals 1. The number of sites with observed frequency provided our estimate of the total number of accumulated clonal mutations. The mutation rate estimate for the mutation-accumulation line from clonal mutations was then obtained by dividing the number of clonal mutations by the total number of generations (estimated as described above) and by L = ∑ MAL L MAL , the total number of sampled sites across all mutation-accumulation line experimental realizations. Download figure Open in new tab Extended Data Figure 1: Comparison of site frequency spectra from simulations and analytical predictions. The plots compare the average SFS over 100 simulated datasets for various sequencing coverages ( M , columns), per-cell establishment probability ( β , rows) and per-base sequencing error probabilities ( E , color coded according to the legend) with the expected value evaluated numerically ( Eq. 3 and 14) and the analytically approximated expected value ( Eq. 15 ). Other model parameters (see Methods): N ≃ 10 5 cells, µ = 5 10 − 9 /(division site individual). Download figure Open in new tab Extended Data Figure 2: The inclusion of mutation counts outside the frequency region predicted by the model can dramatically increase the estimated mutation rate. The figure shows box plots for the inferred mutation rates using different frequency regions of the site frequency spectrum, and for different values of the coverage and the sequencing error rate. The first row presents the box plots for the inferred values within the optimal region (visible in Fig. 2 of the main text). The second row shows the box plots when including frequencies less than f min . The third row includes frequencies greater than f max . Including frequencies outside the optimal region ( f f max ) leads to a systematic overestimation of the mutation rate. In all the plots, box plots show average as black line, boxes for 1st and 3rd quartiles, and circles for outliers. The distribution of astimated mutation rates was evaluated in a set of 100 simulated realizations for increasing sequencing coverage M in the first column and increasing sequencing error E in the second column. Other model parameters; N ≃ 10 5 cells, µ = 5 10 − 9 /(division site individual), viability β = 0.47. Download figure Open in new tab Extended Data Figure 3: Most subclonal mutations in the range [ f min , f max ] are not clustered, so removing clustered mutations does not affect the mutation rate ( µ ) estimate. Some mutational processes generate clusters of nearby mutations, violating µ Seq’s assumption of independence of mutations across genomic sites. To assess mutation distribution along the genome, we evaluated ther nearest-neighbor distances ( d NN ). If mutations are independent, d NN should follow an exponential distribution. Panels (A-C) display the cumulative d NN distribution for three organoid datasets, with the best-fit exponential distribution. Panels (D-F) highlight short-distance deviations using logarithmically spaced bins and log-scaled mutation counts. Deviations emerge for d NN < 10 4 , affecting 13% of the mutations in CRC1307LM, 9% in CRC1502LM, and 6% in CRC0282PR. Panel (G) shows a scatter plot comparing inferred mutation rates: all mutations (x-axis) versus excluding d NN < 10 4 (y-axis). Download figure Open in new tab Extended Data Figure 4: The estimators used by Williams and coworkers yield consistent estimates of the effective mutation rate when evaluated in the frequency interval f min , f max provided by our model. (A-C) Cumulative number of subclonal mutations as a function of the inverse frequency in the interval f min , f max for the three CRC clones considered in our study. This estimator was proposed by Williams et al . [ 32 ]. The dashed lines show the best linear regression of the data (see methods for definition), while continuous lines show the 99th percentile estimates (max and min). (D) Scatter plot between the estimated effective mutation rate measured with our method (mean value of the plateau) and the value obtained by linear regression. Each square represents the best estimate, while bars indicate the 99th percentile values (max and min). The red dashed line marks the bisector (perfect match). Download figure Open in new tab Extended Data Figure 5: The estimated effective mutation rate agrees with the plateau observed in the estimator for the diploid (Π = 2) and triploid (Π = 3) genomic regions of the three patient-derived organoids. We present the LF estimator values for these regions (first and second rows, respectively) across the three clones (indicated above each column). The red dashed line represents the effective mutation rate obtained from a joint fit of the Π = 2 and Π = 3 data. Download figure Open in new tab Extended Data Figure 6: The inferred values of the mutation rates of the three patient-derived organoids are in agreement with the estimate obtained from mutation accumulation lines from ref. [ 23 ]. (A) Same as plot as Fig. 4 of the main text, where we have now considered mutation rate estimates as computed in ref. [ 23 ], by taking mean values over different replicates for the three clones, and 99% percentile for bars. (B) Scatter plot of clonal mutation rate inferred with our self-consistent method vs clonal mutation rate inferred in ref. [ 23 ]. Squares marks central values while bars mark the 99% percentile of the inferred values. Download figure Open in new tab Extended Data Figure 7: The inference of the mutation rate is biased by orders of magnitude when mutations are called against an external reference. (A-C) Site frequency spectrum from the same three patient-derived organoids, where subclonal mutations are identified using as the reference ancestor the external reference genome hg38, instead of a close ancestor (see Fig. 5 of the main text). The blue solid line shows that the resulting spectrum surpringly has a Luria-Delbrück-like component (proportional to f − 2 ), while the orange thick solid line indicates the sequencing error component. (D-F) The plots show the value of the estimator when applied to the data from panels A-C. The estimated mutation rate, calculated using the Luria-Delbrück estimator, reaches a stable plateau (red dash-dotted line) over a wide range of values, but the estimated rates are three orders of magnitude larger than the ones obtained by using the first organoid expansion as ancestral reference. G: Scatter plot for the estimated effective mutation rate where the y-axis shows the estimates obtained using an external reference genome, while the x-axis shows the estimates obtained using ancestral sequencing data. The inferred mutation rate using the external reference genome is approximately 1000 times higher than the values validated by mutation-accumulation line experiments, and does not reflect the relative rank of the three clones. Download figure Open in new tab Extended Data Figure 8: The inferred mutation rates are approximately two times higher when mutations are called using high-coverage 60x germline sequencing as the reference ancestor. (A) Site frequency spectrum (SFS) of clonal mutations identified using germline sequencing to define the universe of mutations. (B) Estimated values based on (A). Both the mean value of the plateau and the mutation rate inferred from the peak at f = 1 are approximately respectively 2 and 4.1 times higher han the corresponding values inferred when the universe of sites was identified using the ancestral genome (see Fig. 5 ). Download figure Open in new tab Extended Data Figure 9: A more distant genomic reference yields false-positive surplus subclonal mutations corresponding to sites that are not monomorphic in the closeby ancestor. We compare the statistics of subclonal mutations in CRC clone 1307 (restricted to the frequency range [ f min , f max ]) obtained using either (i) the germline genome (left panels) or (ii) the human reference genome hg38 (right panels) as the genomic reference. (A–B) Scatter plots of the frequencies of mutations identified as subclonal using both the ancestral and (A) germline or (B) human-reference genomes. (C–D) Cumulative number of mutations as a function of inverse frequency, stratified by genomic reference: (C) germline and (D) human reference genome. All surplus mutations found when using germline or human-reference genome as a genomic reference were not monomorphic in the closeby ancestor. The majority of these correspond to sites with local ploidy 3, reflecting a chromosomal gainthat was already present in the ancestral genome but absent in the germline and human-reference genomes. (E–F) Proportions of three mutation classes: (1) monomorphic in both ancestral and alternative reference, (2) polymorphic in the ancestor but with the same ploidy, and (3) polymorphic in the ancestor and located in regions that underwent a chromosomal gain. Download figure Open in new tab Extended Data Figure 10: In yeast, a subclonal peak at frequency f ≃ 0.5 coming from a small subset of experimental replicates limits the frequency region of the inference but does not alter the inferred values. (AB) Same plots as Fig. 6 , but including for all the experimental replicates of ref. [ 61 ]. The six additional experiments excluded from the analysis reported in Fig. 5 carry an extra observed peak in the SFS at f = 0.5 (possibly due to erroneous ploidy calling or ploidy changes during the mutation-accumulation line experiments). However, when restricting the frequency region used for the inference by reducing f max to avoid including counts within the peak, the mutation-rate inference is quantitatively very similar to the one reported in Fig. 6 . Download figure Open in new tab Extended Data Figure 11: In yeast, the removal of clustered mutations does not affect the estimate of µ , and the mutation-rate estimate from clonal mutations in a mutation-accumulation line is lower than the µ Seq estimate from subclonal mutations. Panels (A-B) show the same analysis as in Extended Data Figure 3 focusing on the distribution of the genomic distance between neighboring pairs of subclonal mutations, to detect likely correlated nearby events. (A) shows the cumulative distribution of d NN , along with the best fit of an exponential distribution. To highlight deviations occurring at short distances, panel (B) presents the same data as panel (A) but with (i) logarithmically spaced bins and (ii) mutation counts in logarithmic scale. Most of the deviations from the exponential distribution are observed for mutations with d NN < 10 3 (about 30%). The boxplot in panel (C) compares the of the mutation rates obtained when considering all mutations (x-axis) versus when removing mutations with d NN < 10 3 , showing no significant deviations. Instead, the mutation rate estimated with clonal mutations from a joint mutation-accumulation line experiment (see Methods) is lower (by approximately a factor of 9) than the µ Seq subclonal estimate. Download figure Open in new tab Extended Data Figure 12: In yeast, the inference of the mutation rate for msh2 Δ strains can be achieved by aggregating sequencing data from ∼ 20 expansion experiments. (A-F) Site Frequency Spectrum (top) and estimator values (bottom) of sequencing data for an increasing number of MALs (from 1 to 80, as indicated above each plot). In (G), for all the SFS that showed an appreciable plateau (i.e., ≥ 20 MALs), we show box plots (line for the mean value, box for 1st and 3rd quartile, whiskers for min and max value) for all the non-zero values of the estimator for the data points in the range [ f min , f max ]. The dashed line marks the mean value of the estimator in the plateau when aggregating the entire dataset (87 MALs). All the box plots are in very good agreement with this value, showing that reliable mutation rate estimates can be obtained when multiplexing at least 20 clonal expansions. Extended data Table S1 Numerical values of the population-dyamics rates (mean and standard devations) of the patient-derived colorectal cancer organoids used in this work. These values are derived from the same experiments reported in ref. [ 23 ]. Extended data Table S2 Numerical values of the inferred muatition rates of patient-derived colorectal cancer organoids obtained in this work, including also the values reported in ref. [ 23 ]. Acknowlegments We are grateful to Gilles Fischer, Marco Fumasoni, Jacopo Grilli, Andrea Sottoriva, Federico Bassetti, Núria López-Bigas, and Johannes Berg for useful discussions. We also thank Orso Maria Romano for early contributions to this study. This work was supported by AIRC - Associazione Italiana per la Ricerca sul Cancro IG 2019 grant ID. 23258 and IG 2024 grant ID. 30391 (MCL and SP). Funding A.I.R.C., , 23258 , 30391 References [1]. ↵ Dobzhansky , T. The raw materials of evolution . The Scientific Monthly 46 , 445 – 449 ( 1938 ). OpenUrl [2]. ↵ Zhang , G. The mutation rate as an evolving trait . Nature reviews. Genetics 24 , 3 ( 2023 ). OpenUrl PubMed [3]. ↵ Lynch , M. et al. Genetic drift, selection and the evolution of the mutation rate . Nature reviews. Genetics 17 , 704 – 714 ( 2016 ). OpenUrl CrossRef PubMed [4]. ↵ Eigen , M. Selforganization of matter and the evolution of biological macro-molecules . Die Naturwissenschaften 58 , 465 – 523 ( 1971 ). URL http://dx.doi.org/10.1007/bf00623322 . OpenUrl CrossRef PubMed Web of Science [5]. ↵ Gerrish , P. J. & Lenski , R. E. The fate of competing beneficial mutations in an asexual population . Genetica 102 , 127 – 144 ( 1998 ). OpenUrl CrossRef PubMed Web of Science [6]. Park , S.-C. & Krug , J. Clonal interference in large populations . Proceedings of the National Academy of Sciences 104 , 18135 – 18140 ( 2007 ). OpenUrl Abstract / FREE Full Text [7]. Blundell , J. R. & Levy , S. F. Beyond genome sequencing: lineage tracking with barcodes to study the dynamics of evolution, infection, and cancer . Genomics 104 , 417 – 430 ( 2014 ). OpenUrl CrossRef PubMed [8]. ↵ Fumasoni , M. & Murray , A. W. The evolutionary plasticity of chromosome metabolism allows adaptation to constitutive dna replication stress . Elife 9 , e51963 ( 2020 ). OpenUrl CrossRef PubMed [9]. ↵ Lässig , M. , Mustonen , V. & Nourmohammad , A. Steering and controlling evolution—from bioengineering to fighting pathogens . Nature Reviews Genetics 24 , 851 – 867 ( 2023 ). OpenUrl CrossRef PubMed [10]. ↵ Lässig , M. , Mustonen , V. & Walczak , A. M. Predicting evolution . Nature ecology & evolution 1 , 0077 ( 2017 ). OpenUrl CrossRef [11]. ↵ Amirouchene-Angelozzi , N. , Swanton , C. & Bardelli , A. Tumor evolution as a therapeutic target . Cancer Discov ( 2017 ). URL https://www.ncbi.nlm.nih.gov/pubmed/28729406 . [12]. Maley , C. C. et al. Classifying the evolutionary and ecological features of neoplasms . Nat Rev Cancer 17 , 605 – 619 ( 2017 ). URL https://www.ncbi.nlm.nih.gov/pubmed/28912577 . OpenUrl CrossRef PubMed [13]. ↵ Martincorena , I. et al. Universal patterns of selection in cancer and somatic tissues . Cell ( 2017 ). URL https://www.ncbi.nlm.nih.gov/pubmed/29056346 . [14]. Tomasetti , C. , Li , L. & Vogelstein , B. Stem cell divisions, somatic mutations, cancer etiology, and cancer prevention . Science 355 , 1330 – 1334 ( 2017 ). OpenUrl Abstract / FREE Full Text [15]. Cross , W. C. , Graham , T. A. & Wright , N. A. New paradigms in clonal evolution: punctuated equilibrium in cancer . J Pathol 240 , 126 – 36 ( 2016 ). URL https://www.ncbi.nlm.nih.gov/pubmed/27282810 . OpenUrl CrossRef PubMed [16]. Bertotti , A. et al. The genomic landscape of response to egfr blockade in colorectal cancer . Nature 526 , 263 – 7 ( 2015 ). URL http://www.ncbi.nlm.nih.gov/pubmed/26416732 . OpenUrl CrossRef PubMed [17]. Swanton , C. Cancer evolution: the final frontier of precision medicine? Ann Oncol 25 , 549 – 51 ( 2014 ). URL http://www.ncbi.nlm.nih.gov/pubmed/24567514 . OpenUrl CrossRef PubMed Web of Science [18]. ↵ Sprouffske , K. , Merlo , L. M. , Gerrish , P. J. , Maley , C. C. & Sniegowski , P. D. Cancer in light of experimental evolution . Curr Biol 22 , R762 – 71 ( 2012 ). URL http://www.ncbi.nlm.nih.gov/pubmed/22975007 . OpenUrl CrossRef PubMed [19]. Sniegowski , P. D. , Gerrish , P. J. & Lenski , R. E. Evolution of high mutation rates in experimental populations of e. coli . Nature 387 , 703 ?705 ( 1997 ). URL http://dx.doi.org/10.1038/42701 . OpenUrl CrossRef PubMed Web of Science [20]. ↵ Taddei , F. et al. Role of mutator alleles in adaptive evolution . Nature 387 , 700 – 2 ( 1997 ). URL https://www.ncbi.nlm.nih.gov/pubmed/9192893 . OpenUrl CrossRef PubMed Web of Science [21]. ↵ J. Prindle , M. J. Fox , E. & A. Loeb , L. The mutator phenotype in cancer: Molecular mechanisms and targeting strategies . Current Drug Targets 11 , 1296 – 1303 ( 2010 ). URL http://dx.doi.org/10.2174/1389450111007011296 . OpenUrl CrossRef PubMed Web of Science [22]. ↵ Loeb , L. A. et al. Extensive subclonal mutational diversity in human colorectal cancer and its significance . Proceedings of the National Academy of Sciences 116 , 26863 – 26872 ( 2019 ). URL http://dx.doi.org/10.1073/pnas.1910301116 . OpenUrl Abstract / FREE Full Text [23]. ↵ Grassi , E. et al. Heterogeneity and evolution of dna mutation rates in microsatellite stable colorectal cancer . bioRxiv 2024–02 ( 2024 ). [24]. ↵ Martincorena , I. & Campbell , P. J. Somatic mutation in cancer and normal cells . Science 349 , 1483 – 1489 ( 2015 ). URL http://dx.doi.org/10.1126/science.aab4082 . OpenUrl Abstract / FREE Full Text [25]. ↵ Lang , G. I. & Murray , A. W. Estimating the per-base-pair mutation rate in the yeast saccharomyces cerevisiae . Genetics 178 , 67 – 82 ( 2008 ). URL https://www.genetics.org/content/178/1/67 . OpenUrl Abstract / FREE Full Text [26]. ↵ Zhu , Y. O. , Siegal , M. L. , Hall , D. W. & Petrov , D. A. Precise estimates of mutation rate and spectrum in yeast . Proceedings of the National Academy of Sciences 111 , E2310 – E2318 ( 2014 ). URL https://www.pnas.org/content/111/22/E2310 . OpenUrl Abstract / FREE Full Text [27]. ↵ Lynch , M. et al. A genome-wide view of the spectrum of spontaneous mutations in yeast . Proceedings of the National Academy of Sciences 105 , 9272 – 9277 ( 2008 ). URL https://www.pnas.org/content/105/27/9272 . OpenUrl Abstract / FREE Full Text [28]. ↵ Loeillet , S. et al. Trajectory and uniqueness of mutational signatures in yeast mutators . Proceedings of the National Academy of Sciences 117 , 24947 – 24956 ( 2020 ). OpenUrl Abstract / FREE Full Text [29]. ↵ Shibata , D. Mutation and epigenetic molecular clocks in cancer . Carcinogenesis 32 , 123 – 128 ( 2011 ). OpenUrl CrossRef PubMed Web of Science [30]. ↵ Brody , Y. et al. Quantification of somatic mutation flow across individual cell division events by lineage sequencing . Genome research 28 , 1901 – 1918 ( 2018 ). OpenUrl Abstract / FREE Full Text [31]. ↵ Szikriszt , B. et al. A comprehensive survey of the mutagenic impact of common cancer cytotoxics . Genome biology 17 , 99 ( 2016 ). OpenUrl CrossRef PubMed [32]. ↵ Williams , M. J. , Werner , B. , Barnes , C. P. , Graham , T. A. & Sottoriva , A. Identification of neutral tumor evolution across cancer types . Nat Genet 48 , 238 – 244 ( 2016 ). URL https://www.ncbi.nlm.nih.gov/pubmed/26780609 . OpenUrl CrossRef PubMed [33]. ↵ Werner , B. et al. Measuring single cell divisions in human tissues from multiregion sequencing data . Nature communications 11 , 1035 ( 2020 ). OpenUrl CrossRef PubMed [34]. ↵ Tarabichi , M. et al. Neutral tumor evolution? Nature genetics 50 , 1630 – 1633 ( 2018 ). OpenUrl CrossRef PubMed [35]. ↵ Milholland , B. et al. Differences between germline and somatic mutation rates in humans and mice . Nature communications 8 , 15183 ( 2017 ). OpenUrl CrossRef PubMed [36]. ↵ Drost , J. et al. Use of crispr-modified human stem cell organoids to study the origin of mutational signatures in cancer . Science (New York, N.Y .) 358 , 234 – 238 ( 2017 ). OpenUrl CrossRef [37]. ↵ Mukai , T. The genetic structure of natural populations of drosophila melanogaster. i. spontaneous mutation rate of polygenes controlling viability . Genetics 50 , 1 ( 1964 ). OpenUrl FREE Full Text [38]. ↵ Lang , G. Measuring Mutation Rates Using the Luria-Delbrück Fluctuation Assay , Vol. 1672 , 21 – 31 ( 2018 ). OpenUrl [39]. ↵ Luria , S. E. & Delbrück , M. Mutations of bacteria from virus sensitivity to virus resistance . Genetics 28 , 491 – 511 ( 1943 ). URL https://www.ncbi.nlm.nih.gov/pubmed/17247100 . OpenUrl FREE Full Text [40]. Shaffer , S. M. et al. Memory sequencing reveals heritable single-cell gene expression programs associated with distinct cellular behaviors . Cell 182 , 947 – 959 ( 2020 ). OpenUrl CrossRef PubMed [41]. ↵ Fusco , D. , Gralka , M. , Kayser , J. , Anderson , A. & Hallatschek , O. Excess of mutational jackpot events in expanding populations revealed by spatial luria– delbrück experiments . Nature communications 7 , 1 – 9 ( 2016 ). OpenUrl [42]. Zheng , Q. Comparing mutation rates under the luria–delbrück protocol . Genetica 144 , 351 – 359 ( 2016 ). OpenUrl CrossRef PubMed [43]. ↵ Kessler , D. A. & Levine , H. Large population solution of the stochastic luriadelbruck evolution model . Proceedings of the National Academy of Sciences of the United States of America 110 , 11682 – 11687 ( 2013 ). OpenUrl Abstract / FREE Full Text [44]. ↵ Russo , M. et al. A modified fluctuation-test framework characterizes the population dynamics and mutation rate of colorectal cancer persister cells . Nature genetics 54 , 976 – 984 ( 2022 ). OpenUrl CrossRef PubMed [45]. ↵ Bozic , I. , Gerold , J. M. & Nowak , M. A. Quantifying clonal and subclonal passenger mutations in cancer evolution . PLoS computational biology 12 , e1004731 ( 2016 ). OpenUrl CrossRef [46]. ↵ Pan , D. , Lin , J. & Amir , A. Criticality in the luria-delbrück model with an arbitrary mutation rate . Physical Review Letters 134 , 038401 ( 2025 ). OpenUrl CrossRef PubMed [47]. ↵ Durrett , R. & Durrett , R. Branching process models of cancer ( Springer , 2015 ). [48]. ↵ Durrett , R. Population genetics of neutral mutations in exponentially growing cancer cell populations . The annals of applied probability: an official journal of the Institute of Mathematical Statistics 23 , 230 ( 2013 ). OpenUrl PubMed [49]. ↵ Gunnarsson , E. B. , Leder , K. & Foo , J. Exact site frequency spectra of neutrally evolving tumors: A transition between power laws reveals a signature of cell viability . Theoretical Population Biology 142 , 67 – 90 ( 2021 ). OpenUrl CrossRef PubMed [50]. ↵ Kessler , D. A. & Levine , H. Scaling solution in the large population limit of the general asymmetric stochastic luria–delbrück evolution process . Journal of Statistical Physics 158 , 783 – 805 ( 2014 ). URL http://dx.doi.org/10.1007/s10955-014-1143-3 . OpenUrl PubMed [51]. ↵ Kennedy , S. R. et al. Detecting ultralow-frequency mutations by duplex sequencing . Nat Protoc 9 , 2586 – 606 ( 2014 ). URL https://www.ncbi.nlm.nih.gov/pubmed/25299156 . OpenUrl CrossRef PubMed [52]. ↵ Lou , D. I. et al. High-throughput dna sequencing errors are reduced by orders of magnitude using circle sequencing . Proceedings of the National Academy of Sciences 110 , 19872 – 19877 ( 2013 ). OpenUrl Abstract / FREE Full Text [53]. ↵ Leto , S. et al. Xenturion is a population-level multidimensional resource of xenografts and tumoroids from metastatic colorectal cancer patients . Nature Communications 15 ( 2024 ). [54]. ↵ Tattini , L. et al. Accurate tracking of the mutational landscape of diploid hybrid genomes . Molecular biology and evolution 36 ( 2019 ). [55]. ↵ Favero , F. et al. Sequenza: Allele-specific copy number and mutation profiles from tumor sequencing data . Annals of oncology : official journal of the European Society for Medical Oncology /ESMO ( 2015 ). [56]. ↵ Stoler , N. & Nekrutenko , A. Sequencing error profiles of illumina sequencing instruments . NAR genomics and bioinformatics 3 , qab019 ( 2021 ). OpenUrl [57]. ↵ Pfeiffer , F. et al. Systematic evaluation of error rates and causes in short samples in next-generation sequencing . Scientific reports 8 , 10950 ( 2018 ). OpenUrl CrossRef PubMed [58]. ↵ Williams , M. J. , Sottoriva , A. & Graham , T. A. Measuring clonal evolution in cancer with genomics . Annual Review of Genomics and Human Genetics 20 , 309 – 329 ( 2019 ). URL https://doi.org/10.1146/annurev-genom-083117-021712 . PMID: 31059289 . OpenUrl CrossRef PubMed [59]. ↵ Cipponi , A. et al. Mtor signaling orchestrates stress-induced mutagenesis, facilitating adaptive evolution in cancer . Science (New York, N.Y .) 368 , 1127 – 1131 ( 2020 ). OpenUrl CrossRef [60]. ↵ Liu , H. & Zhang , J. Yeast spontaneous mutation rate and spectrum vary with environment . Current Biology 29 ( 2019 ). [61]. ↵ Liu , H. & Zhang , J. The rate and molecular spectrum of mutation are selectively maintained in yeast . Nature Communications 12 , 4044 ( 2021 ). OpenUrl CrossRef PubMed [62]. ↵ Manthey , G. M. , Naik , N. & Bailis , A. M. Msh2 blocks an alternative mechanism for non-homologous tail removal during single-strand annealing in saccharomyces cerevisiae . PLoS One 4 , e7488 ( 2009 ). OpenUrl CrossRef PubMed [63]. ↵ Ollodart , A. R. et al. Multiplexing mutation rate assessment: determining pathogenicity of msh2 variants in saccharomyces cerevisiae . Genetics 218 , iyab058 ( 2021 ). OpenUrl CrossRef PubMed [64]. ↵ Serero , A. , Jubin , C. , Loeillet , S. , Legoix-Né, P. & Nicolas , A. G. Mutational landscape of yeast mutator strains . Proceedings of the National Academy of Sciences 111 , 1897 – 1902 ( 2014 ). OpenUrl Abstract / FREE Full Text [65]. ↵ Lang , G. I. , Parsons , L. & Gammie , A. E. Mutation rates, spectra, and genomewide distribution of spontaneous mutations in mismatch repair deficient yeast . G3: Genes, Genomes, Genetics 3 , 1453 – 1465 ( 2013 ). OpenUrl CrossRef [66]. ↵ Zhu , Y. O. , Siegal , M. L. , Hall , D. W. & Petrov , D. A. Precise estimates of mutation rate and spectrum in yeast . Proceedings of the National Academy of Sciences 111 , E2310 – E2318 ( 2014 ). OpenUrl Abstract / FREE Full Text [67]. ↵ Lynch , M. et al. A genome-wide view of the spectrum of spontaneous mutations in yeast . Proceedings of the National Academy of Sciences 105 , 9272 – 9277 ( 2008 ). OpenUrl Abstract / FREE Full Text [68]. ↵ Drake , J. W. C ontrasting mutation rates from specific-locus and long-term mutation-accumulation procedures . G3: Genes— Genomes— Genetics 2 , 483 – 485 ( 2012 ). OpenUrl CrossRef [69]. ↵ Kibota , T. T. & Lynch , M. Estimate of the genomic mutation rate deleterious to overall fitness in e. coll . Nature 381 , 694 – 696 ( 1996 ). OpenUrl CrossRef PubMed Web of Science [70]. ↵ Caravagna , G. et al. Subclonal reconstruction of tumors by using machine learning and population genetics . Nature genetics 52 , 898 – 907 ( 2020 ). OpenUrl CrossRef PubMed [71]. ↵ Cagan , A. et al. Somatic mutation rates scale with lifespan across mammals . Nature 604 , 517 – 524 ( 2022 ). URL http://dx.doi.org/10.1038/s41586-022-04618-z . OpenUrl CrossRef PubMed [72]. ↵ Bozic , I. , Paterson , C. & Waclaw , B. On measuring selection in cancer from subclonal mutation frequencies . PLOS Computational Biology 15 , e1007368 ( 2019 ). URL http://dx.doi.org/10.1371/journal.pcbi.1007368 . OpenUrl CrossRef PubMed [73]. ↵ Xiao , Y. & Freeman , G. J. The microsatellite instable subset of colorectal cancer is a particularly good candidate for checkpoint blockade immunotherapy . Cancer Discovery 5 , 16 – 18 ( 2015 ). URL http://dx.doi.org/10.1158/2159-8290.CD-14-1397 . OpenUrl Abstract / FREE Full Text [74]. ↵ Llosa , N. J. et al. The vigorous immune microenvironment of microsatellite instable colon cancer is balanced by multiple counter-inhibitory checkpoints . Cancer Discov 5 , 43 – 51 ( 2015 ). URL https://www.ncbi.nlm.nih.gov/pubmed/25358689 . OpenUrl Abstract / FREE Full Text [75]. ↵ Grasso , C. S. et al. Genetic mechanisms of immune evasion in colorectal cancer . Cancer Discov ( 2018 ). URL https://www.ncbi.nlm.nih.gov/pubmed/29510987 . [76]. ↵ Goodman , A. M. , Sokol , E. S. , Frampton , G. M. , Lippman , S. M. & Kurzrock , R. Microsatellite-stable tumors with high mutational burden benefit from immunotherapy . Cancer Immunology Research 7 , 1570 – 1573 ( 2019 ). URL http://dx.doi.org/10.1158/2326-6066.CIR-19-0149 . OpenUrl Abstract / FREE Full Text [77]. ↵ Ollodart , A. R. et al. Multiplexing mutation rate assessment: determining pathogenicity of msh2 variants in saccharomyces cerevisiae . Genetics 218 ( 2021 ). URL http://dx.doi.org/10.1093/genetics/iyab058 . [78]. ↵ Lynch , H. T. , Snyder , C. L. , Shaw , T. G. , Heinen , C. D. & Hitchins , M. P. Milestones of lynch syndrome: 1895-2015 . Nat Rev Cancer 15 , 181 – 94 ( 2015 ). URL https://www.ncbi.nlm.nih.gov/pubmed/25673086 . OpenUrl CrossRef PubMed [79]. ↵ Stadler , T. et al. How well can the exponential-growth coalescent approximate constant-rate birth–death population dynamics? Proceedings of the Royal Society B: Biological Sciences 282 , 20150420 ( 2015 ). OpenUrl CrossRef PubMed [80]. ↵ Rimmer , A. et al. Integrating mapping-, assembly-and haplotype-based approaches for calling variants in clinical sequencing applications . Nature genetics 46 , 912 – 918 ( 2014 ). OpenUrl CrossRef PubMed [81]. ↵ Mitchell , K. et al. Benchmarking of computational error-correction methods for next-generation sequencing data . Genome Biology 21 ( 2020 ). [82]. ↵ Luria , S. E. & Delbrück , M. Mutations of bacteria from virus sensitivity to virus resistance . Genetics 28 , 491 ( 1943 ). OpenUrl FREE Full Text [83]. ↵ Russo , M. et al. A modified fluctuation-test framework characterizes the population dynamics and mutation rate of colorectal cancer persister cells . Nature Genetics 54 , 976 – 984 ( 2022 ). OpenUrl CrossRef PubMed [84]. ↵ Danecek , P. et al. Twelve years of samtools and bcftools . GigaScience 10 ( 2021 ). URL http://dx.doi.org/10.1093/gigascience/giab008 . [85]. ↵ Li , H. & Durbin , R. Fast and accurate long-read alignment with burrows-wheeler transform . Bioinformatics 26 , 589 – 95 ( 2010 ). URL http://www.ncbi.nlm.nih.gov/pubmed/20080505 . OpenUrl CrossRef PubMed Web of Science [86]. ↵ Picard toolkit . https://broadinstitute.github.io/picard/ )( 2019 . [87]. ↵ Guo , Y. et al. The effect of strand bias in illumina short-read sequencing data . BMC genomics 13 , 666 ( 2012 ). OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted April 22, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following μSeq: Universal mutation rate quantification via deep sequencing of a single clonal expansion Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share μ Seq: Universal mutation rate quantification via deep sequencing of a single clonal expansion Simone Pompei , Alberto Geroldi , Pietro Rivetti , Elena Grassi , Valentina Vurchio , Giorgio Tallarico , Giorgio Corti , Lorenzo Tattini , Gianni Liti , Andrea Bertotti , Marco Cosentino Lagomarsino bioRxiv 2025.04.17.649315; doi: https://doi.org/10.1101/2025.04.17.649315 Share This Article: Copy Citation Tools μ Seq: Universal mutation rate quantification via deep sequencing of a single clonal expansion Simone Pompei , Alberto Geroldi , Pietro Rivetti , Elena Grassi , Valentina Vurchio , Giorgio Tallarico , Giorgio Corti , Lorenzo Tattini , Gianni Liti , Andrea Bertotti , Marco Cosentino Lagomarsino bioRxiv 2025.04.17.649315; doi: https://doi.org/10.1101/2025.04.17.649315 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Evolutionary Biology Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41910) Biophysics (21436) Cancer Biology (18576) Cell Biology (25480) Clinical Trials (138) Developmental Biology (13368) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40360) Molecular Biology (17163) Neuroscience (88534) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00