Generative continuous time model reveals epistatic signatures in protein evolution

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 70,685 characters · extracted from preprint-html · click to expand
Generative continuous time model reveals epistatic signatures in protein evolution | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Generative continuous time model reveals epistatic signatures in protein evolution View ORCID Profile Andrea Pagnani , View ORCID Profile Pierre Barrat-Charlaix doi: https://doi.org/10.1101/2025.09.17.676821 Andrea Pagnani 1 DISAT, Politecnico di Torino, Corso Duca degli Abruzzi 24 , 10129, Torino, Italy 2 Italian Institute for Genomic Medicine, IRCCS Candiolo , SP-142, 10060, Candiolo, Italy 3 INFN, Sezione di Torino , Via Pietro Giuria 1, 10125 Torino, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Andrea Pagnani Pierre Barrat-Charlaix 4 DISAT, Politecnico di Torino, Corso Duca degli Abruzzi , 10129, Torino, Italy 5 Sorbonne Université, CNRS, Institut de Biologie Paris-Seine, Laboratoire de Biologie Computationnelle et Quantitative , Paris F-75005, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Pierre Barrat-Charlaix For correspondence: PBCpierre.barrat-charlaix{at}sorbonne-universite.fr Abstract Full Text Info/History Metrics Preview PDF Abstract Protein evolution is fundamentally shaped by epistasis, where the effect of a mutation depends on the sequence context. As standard phylogenetic methods assume independently evolving sites, there is a need for more complex models based on accurate estimations of the fitness landscape. Good candidates are modern generative models – such as the Potts model – which successfully capture epistatic effects. However, recent work on generative evolutionary models usually use discrete time, making them difficult to integrate with the standard frameworks in evolutionary biology. We introduce a continuous-time sequence evolution model using the Gillespie algorithm and parameterized by a generative Potts model. This approach enables us to simulate realistic, family-specific evolutionary trajectories and allows for direct comparison with independent-site models. Surprisingly, we find that while epistasis significantly slows down evolution, it does not change the average evolutionary rates at individual sites. This is explained by the rate heterogeneity caused by context-dependence: we show that the rate at some positions varies between null to high values depending on the context, while other positions are essentially independent from the context. Finally, we show that epistasis leads to a systematic underestimation bias in the inference of evolutionary distance between sequences. Overall, our work provides a new tool for simulating realistic protein evolution and offers novel insights into the complex interplay between epistasis and evolutionary dynamics. Significance statement Understanding how proteins evolve is central to molecular biology and phylogenetics. Traditional evolutionary models assume that mutations act independently at each position in a sequence. This neglects epistasis — the fact that the effect of a mutation depends on the rest of the sequence — which is known to be ubiquitous in proteins. By simulating protein evolution in continuous time using a generative model, our approach produces realistic sequences and reveals how epistasis shapes evolutionary dynamics. We find that epistasis slows down evolution and can mislead common methods for estimating evolutionary timescales. This work bridges modern generative models of proteins and phylogenetics, providing new tools to better understand molecular evolution. sequence evolution models generative models phylogenetic inference epistasis I. INTRODUCTION Proteins are essential molecules for all forms of life. They fill very specific functions that evolution has optimized during billions of years. For this reason, members of a protein family that share a common biological function can have sequences differing in up to 80% of their amino acids. On the other hand, a handful of ill-chosen mutations can render a protein completely non-functional. Sequence evolution models are theoretical tools that describe the evolutionary process by quantifying how protein sequences change with time. They are the central element of phylogenetic inference, which aims at reconstructing the evolutionary relations between genes and organisms. A large amount of work has been dedicated to making sequence evolution models more realistic. This includes modeling the heterogeneity of evolutionary rates at different sequence positions [ 13 , 37 ], the construction of matrices quantifying the transition rates between amino acids [ 14 , 15 ], or the development of more complex methods such as site-specific amino acid frequencies or heterotachous rates [ 4 , 31 ]. Virtually all evolutionary models make the simplifying assumption that different positions in the protein sequence evolve independently. However, there is little doubt that sites in a protein sequence are not independent and that this influences evolution [ 32 , 34 ]. This interdependence is called epistasis, and neglecting it is in fact a known weakness of evolutionary models [ 35 ]. The last decade has seen important progress in the development of generative protein sequence models. These models assign a probability to protein sequences, and are said to be generative if sampled artificial sequences are functional or at least statistically indistinguishable from natural ones [ 19 , 30 ]. This probability distribution is learnt by taking advantage of the fact that many homologous proteins share similar biological functions but can have very different sequences. Generative sequence models can take the form of restricted Boltzmann machines [ 36 ], variational auto-encoders [ 9 ], or deep neural networks [ 25 , 27 ]. A particularly interesting class of models is the Potts model, because of both its mathematical simplicity and the possibility to interpret its parameters biologically [ 3 ]. It must be stressed that the key ingredient in all these techniques is the accurate modeling of epistasis: the fact that the effect of a mutation in a protein sequence depends on the context, that is on the rest of the sequence. Integrating epistasis into sequence evolution models with the perspective of phylogenetic inference is a challenging problem and has been addressed in the past. In [ 8 , 20 ], authors design a method to identify pairs of epistatic positions and to reconstruct the phylogenetic tree, while [ 17 ] shows that the inclusion of epistasis typically degrades phylogenetic reconstruction done by a traditional site-independent evolutionary model. However, these works are based on the assumption of non-overlapping epistatic pairs, which can be considered a strong oversimplification given the state of the art on generative models. More general models of epistasis have also been used: in [ 28 , 29 ], protein fitness is assumed to be a sum of pairwise interactions corresponding to contacts in the tertiary structure, with values given by a biochemistry-based statistical potential [ 22 ]. Authors then show that it is possible to infer certain parameters of the evolutionary model given sequence data. Using the same idea, [ 23 ] shows that the inclusion of epistasis typically degrades phylogenetic reconstruction. While this approach is closer in spirit to modern sequence models, using a biochemistry based potential and restricting interactions to contacts makes it very unlikely that the resulting model is generative. Recently another line of research has built on modern generative models to understand the consequences of epistasis on evolutionary dynamics. In [ 5 ], authors show that many effects that need to be explicitly modeled by independent-sites evolutionary models, such as Gamma distributed rates or heterotachy, emerge naturally if an epistatic model is used. Epistasis has also been shown to introduce different timescales in evolution: fast mutational processes within a sequence context, and slow change of the context itself [ 7 ]. Generative evolutionary models can also be used to model the results of directed evolution experiments, or to evolve new functional sequences starting from a natural one [ 1 , 2 ]. These works are based on discrete Monte-Carlo sampling of a Potts model. This approach is very natural, but is at odds with traditional evolutionary models used in phylogenetics which are based on a continuous time approach. While this may seem like a technical point, it in fact limits the reach of these methods. First, it makes their results harder to relate to the rest of the literature on sequence evolution models. For instance, a unit of time in a continuous model generally corresponds to on average one substitution per sequence position, but there is no such scaling of time in discrete evolutionary models. A consequence is that the evolutionary distance inferred by phylogenetic reconstruction cannot be easily compared to the simulation time in a discrete model. Seconds, as we will show in the results, using continuous time models makes it possible to discuss the rates of substitution and therefore to gain more understanding on how the model is behaving. Again, this has no clear equivalent with discrete time models. In this article, we propose a sequence evolution model that is generative and also compares more naturally to the existing literature. We base it on a Potts model with a continuous time evolution procedure based on the Gillespie algorithm [ 12 ]. Since Potts models are generative, sequences obtained during an evolutionary trajectory will be statistically very similar to natural ones and likely functional. Potts models are learned using single families of homologous proteins and are thus family specific, but our simulation procedure is general: our model can simulate evolution for any protein family at the cost of inferring a Potts model. We propose this model as an open source package that can be used by the community. We then set on using our model to study the consequences of epistasis on protein evolution. In particular we observe that epistasis slows down evolution, and yet the average substitution rates at both the sequence and position level are very close to the ones of a site-independent model. This is explained by the fact that the epistatic model evolves at different rates in different contexts. We then derive a method to score sequence positions by how context-dependent their evolution is. Finally, we show that epistasis strongly biases the inference of the evolutionary time between two sequences, and that this is caused by the existence of positions whose evolution is strongly context-dependent. II. RESULTS A. Continuous time generative evolution model A Potts model is defined at the level of a protein family and associates a probability to each aligned amino-acid sequence a = { a 1 … a L } of fixed length L : where Z is a normalization constant and parameters h and J are called fields and couplings. The state a i at position i of a can take q = 21 values: the 20 natural amino acids and the gap symbol. Potts models have been shown to be generative in the sense that the distribution P eq closely matches that of natural sequences, and that sequences sampled from P eq are generally found to be functional [ 30 ]. Importantly, parameters h and J are calculated so as to fit the first two moments π i ( a ) and π ij ( a, b ) of P eq to the single-site and pairwise distribution of amino-acids found in the alignment of natural sequences. We want to construct a sequence evolution model with two properties. First, it should be parametrized using continuous time, similarly to most models used in phylogenetics. In practice, this means that the model is a continuous time Markov chain (CTMC) characterized by a sequence-to-sequence transition rate matrix 𝒬 of size q L × q L . While this matrix is of gigantic dimensions, we show below that it never needs to be fully computed. Secondly, we want P eq to be a fixed point of the dynamics: whatever the starting sequence, sampling the model after a long time t should be equivalent to sampling from the generative distribution. These two properties can be expressed mathematically as where P ( a, b , t ) is the transition probability from sequence a to b in evolutionary time t . Following a common practice in phylogenetics, we scale the time parameter t by a model-dependent factor Ω so that the average rate of substitutions per position is one. Ω can be computed using Monte-Carlo sampling of P eq (Material and Methods). To ensure that P eq is a fixed point of the dynamic, we require that the transition rate matrix verifies the so-called detailed balanced equations: for any pairs of sequences a, b we want P eq ( a ) 𝒬 ab = P eq ( b ) 𝒬 ba . In this work, we enforce this by using the Glauber dynamics defined by with E the energy function of the Potts model. Other dynamics are possible, such as Metropolis, Gibbs or square-root: in the Supplementary Material, we show that our main results do not qualitatively depend on this choice. Furthermore, we find Glauber dynamics to be very similar to Metropolis or square-root on many accounts (Supplementary Material). To further simplify the model, we only consider the case where 𝒬 ab is non-zero only if a and b differ by only one mutation. This amounts to say that only single amino-acid mutations have a non vanishing probability for very short evolutionary times. This simplification is essential to simulating dynamics using Gillespie’s algorithm [ 12 ], as in this case the rows of the 𝒬 matrix only have L ( q − 1) non-zero terms. If we note 𝒩 ( a ) the set of sequences that are one mutation away from a , we can easily compute the rate of substitution starting from sequence a as Computing this sum is numerically tractable as it only involves L ( q − 1) terms. This modeling strategy is well-suited for a Potts model but can in principle be applied to any distribution P eq . We can therefore use it to simulate evolution under a profile model, which considers sequence positions as independent but reproduces the frequencies of amino-acids in each column of the alignment. Formally, the equilibrium distribution for the profile model is the same as the one of Eq. 1 but with coupling parameters set to zero. The fields h are adjusted so that the frequency of amino-acids at each position remains the same. We show in the Supplementary Material that in this case the transition probabilities of Eq. 2 take a factorized form with a position-specific average substitution rate. The essential difference between a Potts and a profile model is therefore the absence of epistasis in the latter. In the following, we explore properties of our evolutionary model using a specific protein family (PF00072, response regulator). We use its alignment to learn a Potts model and a profile model, allowing us to explore the effect of epistasis on evolution. Additional results for two other families are shown in the Supplementary Material (PF00595, PDZ domain; PF13354, beta-lactamase). B. Sequence rates, epistasis and speed of evolution Our model assigns a rate of evolution R ( a ) to each sequence a , as defined in Eq. 4 . This sequence specific rate determines how fast evolution proceeds when starting from a : it is precisely equal to the inverse of the average waiting time to the next substitution (Material and Methods). In the top panels of Figure 1 , we show the distribution of R ( a ) /L for sequences coming from the alignment of the PF00072 family and for both the profile and the Potts models. Scaling by the sequence length L allows us to have values that are on average one for both models. Panel A shows that the distributions of R ( a ) have sensibly different width for the two models, with the Potts giving rise to more varied rates (standard deviation 0.29) than the profile (standard deviation 0.12). On the other hand, rates in the profile and Potts are quite correlated as is shown on panel B , with a linear correlation of about 0.8. Neglecting epistasis thus tends to uniformize the rates of evolution without affecting which sequences are fast and which are slow. It is interesting to note that since different sequences have different rates, our model will by design feature heterotachy [ 16 ], that is the change in evolutionary rates along lineages. Indeed, R ( a ) changes during the evolutionary trajectory as a itself changes. As the figure shows, this is not an intrinsic property of epistasis. However, epistasis makes the R ( a ) distribution wider, so we expect heterotachy to be more pronounced in this case, consistently with [ 5 ]. Download figure Open in new tab FIG. 1. Rate of evolution, for models fitted to the PF00072 family. A : Distribution of the scaled sequence rates R ( a ) over natural sequences for the profile and Potts models. Rates are scaled by the length of the sequence L , so that they are one on average. B : Comparison of rates in the profile and Potts model over natural sequences, showing a high correlation. C : Comparison of sequence rate and energy in the Potts model over natural sequences. Sequences with low energy, that is more probable, tend to evolve slower. D : Average sequence divergence versus time for evolution simulated with a Potts and profile models. The average is over a set of 100 starting sequences and 5 realizations for each starting sequence. Sequence divergence is computed relative to the initial sequence. The dashed horizontal line shows the average distance between natural sequences and the initial sequences used here. E : Average sequence divergence versus time segregated by the category of the starting sequence: slow ([0, 0.33] quantile), intermediate ([0.34, 0.66] quantile) and fast ([0.67, 1.] quantile). The scaled average rate R ( a ) of initial sequences in each category is indicated in the legend. Another effect of epistasis that has already been observed is to slow down evolution [ 5 , 7 ]. We confirm this by running evolutionary trajectories starting from a set of 100 natural sequences of the PF00072 family for a total time t = 20. For each starting sequence, we run 5 independent trajectories, and monitor the Hamming distance between the sequence at time t and the initial one. Panel D shows the average divergence as a function of time obtained in this process. We observe that evolution under the Potts model is on average slower than under the profile model in the sense that it takes a longer time to reach a given Hamming distance as well as for the distance to saturate. Since both models reproduce the distribution of amino-acids in each column of the alignment, they have the same average sequence divergence on the long term. It is important to note that because of our scaling, the meaning of time for the two models is the same: in time interval one, on average L amino-acid substitutions occur. The fact that evolution is slower under the epistatic Potts model indicates that substitutions occur at a more restricted set of sequence positions or involve more reverse mutations. A related question is what makes a particular sequence fast or slow to evolve. In panel C , we show that the sequence-specific rate is highly correlated with the energy E ( a ) of the sequence in the Potts model: dynamics are slower when starting from sequences that have low energy and are therefore more likely. This is straightforward to interpret under the assumption that the energy of Potts model is a good predictor of protein functionality [ 30 ]. Indeed, we expect highly optimized and functional proteins to be close to a fitness peak and therefore slow to evolve, while high energy sequences correspond to a fitness valley and can rapidly mutate. In fact, our sequence rate also correlates strongly with other measures of how fast a sequence evolves: we show for example in Supplementary Figure 8 that it is related to the context dependent entropy as defined in [ 7 ]. Rates for alternative dynamics are also very correlated to the ones for Glauber, as shown in Supplementary Figure 10 . An advantage of R ( a ) as we define it is that it is directly interpretable in terms of the evolutionary process, as it is the inverse of the average waiting time to the next substitution. However, it is also a local quantity, in the sense that the rate of the sequence coming after a in the evolutionary trajectory could in principle be completely different. Therefore, an important question is whether this rate matters for long-term evolution. To evaluate this, we start evolutionary trajectories from initial sequences with low, intermediate or high rate, respectively chosen from the quantiles ([0, 1 / 3], [1 / 3, 2 / 3], [2 / 3, 1]) of the distribution of panel A . As can be seen in Panel E , distance from the starting sequence increases more slowly for starting sequences with a low rate. This slower evolution does not only concern the initial slope of the distance versus time curve but impacts the trajectory on the long term: at t = 20, the trajectory starting from the “fast” sequence is fully equilibrated while the one starting from the “intermadiate” sequence has barely reached saturation and the one starting from the “slow” sequence is still far from saturation. Note that the final saturation value is different for these three trajectories, as the average distance of natural sequences to each particular starting sequence may differs. The profile trajectories (dashed lines) converge faster in all cases and indicate the value of saturation. Interestingly, the trajectory starting from the “fast” sequence evolves at the same speed in the profile and Potts model, indicating that epistasis only weakly influences evolution in this case. C. Average site-specific evolutionary rate For a given sequence a and a position i , the site-specific rate R i ( a ) is defined as the sum of the rates from a to any mutant at position i . Defining 𝒩 i ( a ) as the set of sequences that are mutants of a at position i , we have In order to study a quantity that does not depend on a particular sequence, we define the average site-specific rate ⟨ R i ⟩ as the average of R i over natural sequences. ⟨ R i ⟩ represents the average rate at which substitutions occur at site i during simulated evolution. A first question is whether epistasis influences the rates at individual sites i . The first panel of Figure 2 shows that this is not the case, as the values of ⟨ R i ⟩ between a Potts and a profile model are highly similar. Therefore, on average, epistasis does not slow down the substitution process at any site in particular. The central panel of Figure 2 reveals that average substitution rate at a site is heavily correlated to conservation: positions that are variable in the alignment tend to evolve fast in the model, while conserved positions have a low rate. This is not surprising, and analytical calculations in the case of a profile model confirm that the average rate can always be linked to conservation (Supplementary Material). The Potts and profile models both capture the conservation of columns in the alignment, explaining that sites evolve on average at the same rate in both cases. On the other hand, this does not explain what makes the epistatic dynamics different: we will show that it is necessary to consider rates in particular contexts for this purpose. Download figure Open in new tab FIG. 2. Left : average evolutionary rate ⟨ R i ⟩ at site i for the Potts and the profile models. The average is performed over natural sequences. Center : average rate ⟨ R i ⟩ against the entropy of position i in the alignment. More conserved positions evolve slower on average. Right : distribution of average site-specific rates over sites i . The dashed line shows a maximum-likelihood fit to a Gamma distribution (parameters α = β = 2.3). Another relevant question is the distribution of site-specific substitution rates. The importance of taking into account and correctly modeling the rate heterogeneity among sites is a well established principle in phylogenetics. Rates are usually modeled using categories that are based on a Gamma [ 13 , 37 ] or parameter-free distribution [ 33 ]. In our case, the rate heterogeneity comes from the characteristics of the equilibrium distribution P eq : it can be computed exactly for the profile model (A2 of the Supplementary Material) and estimated numerically for the Potts. The right panel of Figure 2 shows the distribution of ⟨ R i ⟩ for the Potts model. We observe that this distribution is not incompatible with a Gamma distribution of parameter α = 2.3 and mean one – by design, the mean of ⟨ R i ⟩ over i is one. However, this result should be taken with care as the quality of the fit appears lower for other dynamics, especially Gibbs dynamics, as can be seen in Supplementary Figure 10 . Note that given the results of the left panel of the same figure, this distribution would not change if we were to use the profile model. The differences in evolutionary dynamics caused by epistasis can therefore not be accounted for by simply modeling the average site rates. D. Context-dependent site evolutionary rate In order to better understand the role of epistasis in our dynamics, we want to quantify the dependence of the rate of evolution at each site on the sequence context. We define the context dependent rate at a given position i as where a\ i designates a set of amino-acids at positions other than i, i . e . a context, and p i ( a | a\ i ) the conditional probability at position i given this context according to the generative model. Note that this conditional probability can be easily computed if a Potts model is used. The sequence a in the inner sum is defined as having context a \i and state a at position i . We expect to be a marker of epistasis. Indeed, for a profile model with additive fitness, we always have since the conditional probability p i does not depend on the context. On the other hand, we can observe significant differences in rates for different contexts if we use the generative Potts model. Panel A of Figure 3 shows the scatter plot of for the contexts of two particular sequences, at quantiles 10% (slow) and 90% (fast) of the distribution of R ( a ) (see Figure 1 A ). Many positions are found to have a very small rate in the slow context while having a significantly higher rate in the fast context. We choose two particular sites to highlight, with a procedure described further below: site 15 has roughly the same rate of about 1.5 in the two contexts, while site 95 evolves at a rate higher than 2 in the fast context and close to 0 in the slow context. In fact, panel B shows the distribution of across all contexts found in the natural sequences and for the two highlighted positions i = 95 and i = 15. We find that the rates of site 15 are centered around an average value of 1.5 for all natural sequences, while the ones of site 95 are spread almost uniformly from 0 to 2.5. Similar analysis of other positions reveals that this is a common pattern: the evolutionary rate at some site depends weakly on the context, while others are very evolvable given certain contexts and “frozen” in others. The existence of such context-dependent sites is an effect of epistasis. Download figure Open in new tab FIG. 3. A : context-dependent rates at each site for a slow and a fast context (see panel E of Figure 1 ). B : Distribution of context-dependent rates across natural sequences at the two highlighted positions. C : Variance versus mean of across natural sequences, for each position i . The color code corresponds to the division of sites in three chunks with different importance of the context. D : Distribution of the average inside each chunk across natural sequences. Colors are the same as in panel C. E : Evolutionary trajectories showing the Hamming distance to the starting sequence as a function of time, for three starting sequences (slow, intermediate, fast contexts). Each plot focuses on one chunk, with Hamming distance being computed using only the corresponding positions. To explore this, we construct a ranking of sites based on how context-dependent their evolution is. We define as the average rate of evolution of sites in the context defined by sequence a . We then rank sites by their contribution to the variance of R C ( a ) over sequences: the top K sites of the ranking are chosen so that the variance of R C ( a ) restricted to these K sites is maximal (see Material and Methods). This procedure makes the ranking depend both on the variance of the context dependent rate over sequences and of the covariance of context-dependent rate between positions. Our ranking therefore selects sets of positions that have a variable but also tend to be variable or frozen in similar contexts. The effect of the ranking is visible on the heatmap of the covariance matrix ( Supplementary Figure 12 ). In panel A , we highlighted the site ranked first and the one ranked at position 2 L/ 3, with L the length of the sequences. Interestingly, we could find a structural interpretation to the top-ranked sites in the case of the PF00072 family, and also for the PF00595 family considered in the Supplementary Material. In both cases, the top L/ 5 sites – where L is the length of the sequence – were found to be structurally close and located at a binding site or an interface, as is visible on Supplementary Figure 13 . For the third family studied in the Supplementary Material, PF13354 beta-lactamase, we found the top L/ 5 sites split in three groups of structurally close residues, but were not able to give a convincing structural interpretation. Ideally, we would like to identify a set of position whose evolution is context-dependent, and another that is context-independent. However, our ranking procedure does not provide a clears threshold to group sites. We therefore decide to simply divide positions into three chunks of size L/ 3 based on the ranking; we call these chunks context-dependent, intermediate and context-independent sites. Panel C of Figure 3 shows the variance of across sequences against its mean, with a color corresponding to the chunk that the site belongs to. As expected, the context-dependent sites tend to have a higher variance and the context-independent sites a lower variance, although this correspondence is not perfect since our ranking also depends on the covariance. Interestingly, context-independent sites are found among either the very fast or very slow evolving positions. In panel D , we show the distribution over sequences of the average rate of evolution among each chunk, that is . As expected, we observe a wider distribution for the context-dependent sites and a more peaked one for the context-independent ones. Note that for a profile model, there is no difference between the context-dependent rate and the average rate: we would then observe perfectly peaked distributions for all chunks. This indicates the existence of contexts in which sites of the first chunk jointly evolve rapidly, and others in which they evolve slowly. Interestingly, the average of the distribution is roughly the same and equal to one for all chunks. Therefore, when we average over contexts, there is no significant difference in the speed of evolution of these different parts of the sequence. It is only when looking at a particular contexts that the effects of epistasis are visible. To explore the speed of evolution in the different chunks, we adopt the same strategy as in Figure 1 : we simulate evolutionary trajectories starting from slow, intermediate and fast backgrounds (the same as in panel E of Figure 1 ). We then compute the distance to the starting sequence as a function of time, restricted to the sites that belong to a specific chunk. Distance are normalized so that a value of one indicates that all amino-acids in the considered chunk are different. Results confirm that our ranking procedure gives relevant information regarding long term evolution: the sequence distance to the origin for the context-dependent sites changes at dramatically different speeds if evolution starts from slow, intermediate or fast contexts. On the other hand, not much difference can be seen for the two other chunks. Evolutionary speed is therefore strongly affected by epistasis and by the context at a restricted amount of protein sites, while it appears to be well described by a site-independent model at other positions. E. Inference of evolutionary time An important part of phylogenetic inference consists in inferring the evolutionary time separating two sequences. Here, we want to understand whether epistasis creates biases in this inference if it is done with a model that considers sites as independent. To do so, we design the following setup. First, we sample an ensemble of M = 10 000 sequences { a 1 … a M } from an evolutionary model (profile or Potts) as well as a set of M time values {Δ t 1 … Δ t M } from a uniform distribution U ([0.01, 2.0]). Then, from each sequence a i , we simulate evolution for time Δ t i to reach sequence b i . The ensemble { a i , b i , Δ t i } can then be used to assess the accuracy of time inference methods. In our case, we reconstruct the maximum likelihood time Δ t i from the pair ( a i , b i ) using the profile model (Material and Methods). The top-left panel of Figure 4 shows the reconstructed time as a function of the real time for both profile and Potts as the evolutionary model. On average, the inference with the profile model is very accurate when evolution was also simulated with the profile. However, we note that the time is systematically underestimated when using the Potts model. This is consistent with previous results, showing that evolution is slower when epistasis is present. The top-right panel shows the same data, but this time segregated by the rate of the starting sequence: sequences a i with an R ( a ) in the [0, 1 / 3] quantile form slow contexts, and so on (see Figure 1 ). For the profile model, this segregation makes no difference, which is not surprising since time inference is performed with the same model as was used to simulate. However, for the Potts model, we observe that the underestimation of time is stronger for slower contexts. It is important to stress that the profile model we use for inference has site-specific and sequence-specific rates that are very similar to the Potts, as show in Figures 1 & 2 . Its equilibrium distribution is also as close as it could be to the one of the Potts model, since it has the same site-specific frequencies of amino-acids. This systematic bias is therefore an irremediable effect of epistasis. Download figure Open in new tab FIG. 4. Top left : reconstructed versus real evolutionary time for all simulated pairs of sequences, for the profile and Potts models. One point represents a pair of sequences, and the smooth lines are obtained using a running average. Top right : reconstruction of time for different contexts: three groups of pairs of sequences are made, based on the evolutionary rate of the starting sequence. Lines show the average reconstructed versus real time for each group. The Potts model is shown as a solid line, the profile as a dashed line. Bottom : reconstruction of time when focusing on specific parts of the sequences, as defined in the previous section: the context-independent (left), intermediate (center) and context-dependent sites. We try to understand how this biases arises using the ranking of sites according to how context dependent they are. For each pair of sequences ( a i , b i ), we again inferred the most likely time with the profile model but restricting the sequences to the L/ 3 most context-dependent, intermediate and least context-dependent sites, that is to the chunks shown in Figure 3 . In the lower panel of Figure 4 , we again plot the inferred time against the real one for each of the three sequence chunks, segregating by starting context. It is important to note that the all three chunks contain variable sites and each have an average evolutionary rate around one: this guarantees that there are a good amount of substitutions in each chunk during evolution, making the inference of time possible. Two things can be noted from these plots. First, the underestimation of time is more pronounced for groups of sites that are more context-dependent. In the left-most panel showing the context-independent chunk, the inferred time is very close to the real one: this indicates that these sites evolve roughly as the profile model predicts, that is independently from each other. On the contrary, deviations are much stronger for the context-dependent chunk. Secondly, the dependence on the starting context is also stronger when looking at sites that are context dependent. In the figure, this is materialized by the difference between the three colored curves representing the three starting contexts. This is consistent with panel E of Figure 3 . These results show that (i) inferred evolutionary time is systematically underestimated if epistasis is neglected and (ii) underestimation is caused by a set of sites that are strongly context-dependent and is stronger when considering sequences that have a low evolutionary rate. III. DISCUSSION We have presented a novel generative protein sequence evolution model based on a Potts model. Its key innovation lies in its continuous-time formulation made possible by the use of the Gillespie [ 12 ] algorithm. This allows the model to align with the standards of the phylogenetic literature, while retaining the generative capacity to produce realistic, functional sequences. With the development of machine-learning based approach to phylogeny, there is a strong need for realistic sequence evolution models [ 24 , 35 ]. We believe that this work is an important step in this direction. Our approach relies on several modeling choices. First, the model assumes a single, fixed fitness landscape for an entire protein family, as defined by the Potts model parameters. This assumption implies that the complex web of epistatic interactions that the model captures is static over evolutionary time and across lineages. Changes in the sequence-function mapping – due for instance to a change in cellular environment or of activity – are not considered. Furthermore, we incorporate insertions-deletions by treating the gap symbol as an additional amino acid state. This choice has proven effective in the field of generative models, but could be discussed when dealing with evolutionary dynamics as indels and point mutations can have quite different dynamics [ 11 ]. This differs from standard phylogenetic techniques which generally treat gaps as missing data [ 21 ], and we have argued that it is an improvement in the context of ancestral sequence reconstruction [ 6 ]. Our results elucidate both the similarities and stark differences between epistatic and non-epistatic evolutionary models. Surprisingly, we found that the average substitution rates at the sequence and site levels were highly similar between the Potts and profile models, a finding that contrasts with some previous work [ 5 ]. However, the effects of epistasis become profoundly apparent when examining specific sequence contexts: the speed of evolution is highly dependent on the starting sequence, with low-energy, optimized sequences evolving significantly slower. We further demonstrated that this epistatic influence is not uniformly distributed across the protein sequence. Indeed, some positions are highly constrained by the rest of the sequence – with evolutionary rates ranging from vanishing to high depending on the context – while others evolve in a largely independent manner. Intriguingly, the long-term average evolutionary rates for these two groups of sites are relatively close, masking the critical role of epistasis when only average quantities are considered. This uncovers interesting features of evolution in a complex landscape while showing that site-independent models are unable to capture them. A central finding with important implications for phylogenetic inference is that ignoring epistasis leads to a systematic underestimation of evolutionary time. This bias is not uniform; it is significantly stronger (i) for slow-evolving sequences and (ii) when inference is based on highly context-dependent sites. Importantly, this bias arises even if the site-independent model that is used for reconstruction has correct average site-specific rates. Even if we do not explore it here, it is reasonable to think that these biases will impact full-scale phylogenetic reconstruction. For instance, epistasis could cause a uniform compression of all branch lengths or distort the tree topology by preferentially shortening branches leading to specific clades. Clades that have evolved in a context of high functional constraint (our “slow” sequences) might be disproportionately affected, making recent evolutionary radiations appear even more recent than they are and potentially obscuring their true chronological order. As it is now well established that epistasis is ubiquitous in protein evolution, understanding how it affects phylogenetics and how we can correct for it becomes an important research direction to which we hope this work will contribute. IV. MATERIAL AND METHODS A. Evolutionary model In general, the evolutionary model gives the rate of transition between any two sequences a and b . We assume that this rate is non-zero only between sequences separated by one amino-acid mutation, and define 𝒩 ( a ) as the set of sequences that are one mutation away from a . Consequently, a given row 𝒬 a : of the transition rate matrix has only ( L − 1) q non-zero values. The full model is therefore specified by where Δ E = E ( b ) − E ( a ), and where the expression for Q a , a ensures that rows of Q sum to 0 as should be a for a continuous time Markov chain. We use a normalization constant Ω to set the timescale of the process. Following common practice in modeling sequence evolution, we want the average rate of substitutions per position to be one, that is This is achieved by setting Ω to This average over sequences a can be calculated either using a sample from P eq obtained by standard MCMC, or using natural sequences to compute the average. Matrix 𝒬 are then scaled by a factor Ω −1 , so that transition probabilities take are (informally) of the form . Ω therefore represents the amount of “physical” time corresponding to one unit of evolutionary time. Supplementary Figure 7 shows the value of Ω for family PF00072 and different types of monte-carlo dynamics. We see that for each dynamic, the value of Ω is higher for the Potts model than for the profile, indicating that epistasis also has the effect of changing the scaling from evolutionary time to “physical” time. In the main text and in the rest of this section, this scaling is implicit and we lighten notation by omitting Ω. B. Simulating with Gillespie algorithm We use Gillespie’s algorithm to sample from our model and simulate evolution [ 12 ]. Given a starting sequence a , this is done in three steps: compute the total rate of substitution away from a : R ( a ) = ∑ b ∈𝒩 ( a ) 𝒬 ab ; sample the time t to the next substitution from an exponential distribution of parameter τ = 1 /R ( a ); choose the next substitution: position i and new amino-acid b are chosen with probability where b is equal to a at all positions but i , where it is equal to b . From Eq. 8 we get that the average rate R ( a ) over starting sequences is L , that is a rate 1 per sequence position on average. C. Ranking of positions based on context-dependent rates Our aim is to identify sets of positions that (i) have an evolutionary rate that strongly depends on the context and (ii) are fast or slow in similar contexts. In order to do this, we note A K an arbitrary set of K positions and define for any sequence a the partial sum of context rates as For each K , we now want to find the set of positions A K that maximizes the variance of across natural sequences. Note that this variance depends on both the variance and covariance of the context rate: We estimate the best set A K in a greedy way: starting from the set A K −1 , we add to it the position that marginally maximizes the variance. The order in which we add positions is the ranking discussed in Figure 3 . To evaluate how accurate the greedy approach is, we also try to optimize the set A K using a more presumably precise simulated annealing approach. Results using the two methods are close to identical. D. Inferring time with the profile model We show in the Supplementary Materialthat when using a profile model, the transition probability between two sequences in time Δ t can be written as where the Q i are 21 × 21 giving the transition rates between amino-acids at position i . Given two sequences a and b , we infer the evolutionary time between them by maximizing the log-likelihood which is simply a sum over L terms that each involve the states a i and b i . To speed-up the optimization, we preemptively diagonalize the matrices Q i that depend only on the model in order to quickly compute their exponential. We perform the optimization with the LBFGS algorithm along with automatic differentiation [ 26 , 38 ]. CODE AVAILABILITY The core code for our evolutionary model is available at https://github.com/PierreBarrat/PottsEvolver.jl . Scripts and data necessary to reproduce the results in this paper are available at https://github.com/PierreBarrat/ContinuousPottsData . Supplementary Material ACKNOWLEDGMENTS AP acknowledges financial support from “Explainable Models for Protein Design”, funded by the MIUR Progetti di Ricerca di Rilevante Interesse Nazionale (PRIN) Bando 2022 - grant 2022TE5B7X, “Centro Nazionale di Ricerca in High-Performance Computing, Big Data and Quantum Computing” (ICSC), and support from the European REA, Marie Sk-lodowska-Curie Actions, grant agreement no. 101131463 (SIMBAD). AP and PBC acknowledge “FAIR - Future Artificial Intelligence Research” project, and received funding from the European Union NextGenerationEU (Piano Nazionale di Ripresa e Resilienza (PNRR)–Missione 4 Componente 2, Investimento Grants No. 1.3–D.D. 1555 11/10/2022, and No. PE00000013). This paper reflects only the authors’ views and opinions, neither the European Union nor the European Commission can be considered responsible for them. Appendix A: Supplementary methods 1. Profile model We want to show that the transition rate matrix of Eq. 7 is compatible with non-epistatic evolution. To do so, we design a profile evolutionary model where each sequence position i follows its own continuous time Markov chain with rate µ i and a q × q transition rate matrix Q i , and show that its sequence to sequence transition rate matrix is equal to the Q of the main text. The transition probability between two sequences a and b in time t is Each matrix Q i implicitely defines a stationary probability π i ( a ) for amino-acids at site i , such that for any . We assume that the Q i and µ i are normalized: so that position i evolves on average at rate µ i and the average rate at each position is one. The corresponding sequence to sequence transition rate matrix 𝒬 is defined by with δ xy being the Kronecker delta. Therefore the transition rate vanishes for sequences that are more than one mutation away and we can write For this to match the matrix of the main text in Eq. 7 , we need that for any sequences a and b ∈ 𝒩 i ( a ), the transition rate 𝒬 ab depends only on the states a i and b i at position i . This is necessary as on the right hand side of the above equation, is independent of the context. Note that for the Glauber dynamics, but also for other dynamics described below, 𝒬 automatically has this property if the equilibrium distribution P eq is factorized, which is the case when we use a profile model. More precisely, models in Eq. 7 and Eq. A1 are the same if we have where a is any sequence having state a at position i , and b is equal to a except at position i where it is in state b . What we have shown here is that applying the model of the main text to a factorized distributin P eq results in a factorized dynamics ( Eq. A1 ) with inhomogeneous rates µ i . 2. Different dynamics In the main text, we use our model with Glauber dynamics ( Eq. 2 ). Other choices are possible: any transition rate matrix 𝒬 that satisfies the detailed balance conditions will be compatible with the generative distribution P eq . In the supplementary figures below, we show results for three alternative dynamics (defined here in the case b ∈ 𝒩 ( a )): Metropolis dynamis: Square-root rule: This rule has been used in the past in the context of evolutionary models [ 29 ]. We do not know of a proper name for it, and choose to call it this way because the transition rate is equal to the square-root of the ratio P eq ( b ) /P eq ( a ). Gibbs dynamics: The rate of transition to state b i at position i is equal to the conditional probability of b i given the rest of the sequence. Note that the Gibbs rule is the only one that does not depend on the initial state a i at position i . Appendix B: Supplementary figures 1. Main text figures for other families Download figure Open in new tab Figure S 1. Equivalent to Figure 1 of the main text for the PF00595 family. Download figure Open in new tab Figure S 2. Equivalent to Figure 2 of the main text for the PF00595 family. Download figure Open in new tab Figure S 3. Equivalent to Figure 3 of the main text for the PF00595 family. Download figure Open in new tab Figure S 4. Equivalent to Figure 1 of the main text for the PF13354 family. Download figure Open in new tab Figure S 5. Equivalent to Figure 2 of the main text for the PF13354 family. 2. Other supplementary figures Download figure Open in new tab Figure S 6. Equivalent to Figure 3 of the main text for the PF13354 family. Download figure Open in new tab Figure S 7. Value of the time normalization constant Ω for the PF00072 family and different evolutionary step types. Ω represents the amount of “physical” time that corresponds to one unit of evolutionary time. Download figure Open in new tab Figure S 8. Context dependent entropy (CDE) versus sequence specific rate of evolution in a Potts model for members of the PF00072 family. We define the CDE of a sequence as the sum of the context dependence entropies at each position i in the sequence, themselves defined as in [ 7 ]. Context dependent entropy has been shown to be good proxy for the velocity of evolution at each position of the sequence. Download figure Open in new tab Figure S 9. Sequence evolutionary rate R ( a ) for different dynamics (scaled by the sequence length). The horizontal axis shows Glauber dynamics used in the main text, and the vertical axes respectively show Metropolis, square-root and Gibbs dynamics. Rates are very correlated in every case. Download figure Open in new tab Figure S 10. Distribution of average site-specific rates for family PF00072 and different dynamics. This is equivalent to the right panel of Figure 2 of the main text. The dashed line indicates the maximum likelihood fit with a Gamma distribution. Download figure Open in new tab Figure S 11. Similar to panel E of Figure 1 of the main text, but for longer evolutionary trajectories. Families are left : PF00072 and right : PF13354. Distances eventually saturate to a maximum even when starting from a slow context. Download figure Open in new tab Figure S 12. Heatmap of the covariance matrix of context-dependent rates: row i and column j corresponds to value , with the averages taken over sequences. Left : natural order of the positions. Right : Positions are permuted using the ranking procedure described in the main text. The ranking groups together positions that have a high or low context-dependent rate in the same contexts, i . e . high covariance. Download figure Open in new tab Figure S 13. Structures of representents of the families PF00072 and PF00595. We highlighted in red the L/ 5 sites with a strong context-dependence according to the procedure described around Figure 3 , where L is the length of the sequence. Left : the CheY chemotaxis response-regulator domain (PF00072) of E. Coli (grey and red) in complex with its partner histidine kinease CheA (light blue) [ 18 ]. The top 22 context-dependent sites are structurally close and mostly located at the interface. PDB: 1EAY. Right : Third PDZ domain of the PSD-95 protein (grey and red) in complex with its ligand CRIPT (light blue) [ 10 ]. The top 16 sites are structurally close and mostly located at the interface with the ligand. PDB: 1BE9. Download figure Open in new tab Figure S 14. Ranking of positions based on their context-dependent rates for different dynamics. Each point represents a position, with its rank for the two dynamics displayed as x and y coordinates. The number indicates the Pearson correlation. Rankings are very similar for the Glauber, Metropolis and square-root dynamics, especially for the top ranked sites. Gibbs dynamics are visibly different, although the ranking is still strongly correlated. Funder Information Declared European Union, https://ror.org/019w4f821 , PNRR Missione 4 Componente 2, Investimento 1.3-D.D. 1555 11/10/2022, PE00000013 MIUR Progetti di Ricerca di Rilevante Interesse Nazionale , Bando 2022 - grant 4922022TE5B7X European REA Marie Sklodowska Curie Actions , 101131463 (SIMBAD) References [1]. ↵ Sophia Alvarez , Charisse M. Nartey , Nicholas Mercado , Jose Alberto de la Paz , Tea Huseinbegovic , and Faruck Morcos . In vivo functional phenotypes from a computational epistatic model of evolution . Proceedings of the National Academy of Sciences , 121 ( 6 ): e2308895121 , February 2024 . doi: 10.1073/pnas.2308895121 . OpenUrl CrossRef PubMed [2]. ↵ Matteo Bisardi , Juan Rodriguez-Rivas , Francesco Zamponi , and Martin Weigt . Modeling Sequence-Space Exploration and Emergence of Epistatic Signals in Protein Evolution . Molecular Biology and Evolution , page msab321, November 2021 . ISSN 1537-1719 . doi: 10.1093/molbev/msab321 . OpenUrl CrossRef PubMed [3]. ↵ Simona Cocco , Christoph Feinauer , Matteo Figliuzzi , Remi Monasson , and Martin Weigt . Inverse Statistical Physics of Protein Sequences: A Key Issues Review . Reports on Progress in Physics , 81 ( 3 ): 032601 , March 2018 . ISSN 0034-4885 , 1361-6633. doi: 10.1088/1361-6633/aa9965 . OpenUrl CrossRef PubMed [4]. ↵ Stephen M Crotty , Bui Quang Minh , Nigel G Bean , Barbara R Holland , Jonathan Tuke , Lars S Jermiin , and Arndt Von Haeseler . GHOST: Recovering Historical Signal from Heterotachously Evolved Sequence Alignments . Systematic Biology , 69 ( 2 ): 249 – 264 , March 2020 . ISSN 1063-5157 . doi: 10.1093/sysbio/syz051 . OpenUrl CrossRef PubMed [5]. ↵ Jose Alberto de la Paz , Charisse M. Nartey , Monisha Yuvaraj , and Faruck Morcos . Epistatic contributions promote the unification of incompatible models of neutral molecular evolution . Proceedings of the National Academy of Sciences , page 201913071, March 2020 . ISSN 0027-8424 , 1091-6490. doi: 10.1073/pnas.1913071117 . OpenUrl Abstract / FREE Full Text [6]. ↵ Matteo De Leonardis , Andrea Pagnani , and Pierre Barrat-Charlaix . Reconstruction of Ancestral Protein Sequences Using Autoregressive Generative Models . Molecular Biology and Evolution , 42 ( 4 ): msaf070 , April 2025 . ISSN 1537-1719 . doi: 10.1093/molbev/msaf070 . OpenUrl CrossRef PubMed [7]. ↵ Leonardo Di Bari , Matteo Bisardi , Sabrina Cotogno , Martin Weigt , and Francesco Zamponi . Emergent time scales of epistasis in protein evolution . Proceedings of the National Academy of Sciences , 121 ( 40 ): e2406807121 , October 2024 . doi: 10.1073/pnas.2406807121 . OpenUrl CrossRef PubMed [8]. ↵ Linda Dib , Daniele Silvestro , and Nicolas Salamin . Evolutionary footprint of coevolving positions in genes . Bioinformatics , 30 ( 9 ): 1241 – 1249 , May 2014 . ISSN 1367-4803 . doi: 10.1093/bioinformatics/btu012 . OpenUrl CrossRef PubMed Web of Science [9]. ↵ Xinqiang Ding , Zhengting Zou , and Charles L. Brooks Iii . Deciphering protein evolution and fitness landscapes with latent space models . Nature Communications , 10 ( 1 ): 5644 , December 2019 . ISSN 2041-1723 . doi: 10.1038/s41467-019-13633-0 . OpenUrl CrossRef PubMed [10]. ↵ Declan A. Doyle , Alice Lee , John Lewis , Eunjoon Kim , Morgan Sheng , and Roderick MacKinnon . Crystal Structures of a Complexed and Peptide-Free Membrane Protein–Binding Domain: Molecular Basis of Peptide Recognition by PDZ . Cell , 85 ( 7 ): 1067 – 1076 , June 1996 . ISSN 0092-8674 , 1097-4172. doi: 10.1016/S0092-8674(00)81307-0 . OpenUrl CrossRef PubMed Web of Science [11]. ↵ Christoph Feinauer , Marcin J. Skwark , Andrea Pagnani , and Erik Aurell . Improving Contact Prediction along Three Dimensions . PLOS Computational Biology , 10 ( 10 ): e1003847 , October 2014 . ISSN 1553-7358 . doi: 10.1371/journal.pcbi.1003847 . OpenUrl CrossRef PubMed [12]. ↵ Daniel T. Gillespie . Exact stochastic simulation of coupled chemical reactions . The Journal of Physical Chemistry , 81 ( 25 ): 2340 – 2361 , December 1977 . ISSN 0022-3654 . doi: 10.1021/j100540a008 . OpenUrl CrossRef PubMed Web of Science [13]. ↵ X Gu , Y X Fu , and W H Li . Maximum likelihood estimation of the heterogeneity of substitution rate among nucleotide sites . Molecular Biology and Evolution , 12 ( 4 ): 546 – 557 , July 1995 . ISSN 0737-4038 . doi: 10.1093/oxfordjournals.molbev.a040235 . OpenUrl CrossRef PubMed Web of Science [14]. ↵ David T. Jones , William R. Taylor , and Janet M. Thornton . The rapid generation of mutation data matrices from protein sequences . Bioinformatics , 8 ( 3 ): 275 – 282 , June 1992 . ISSN 1367-4803 . doi: 10.1093/bioinformatics/8.3.275 . OpenUrl CrossRef PubMed [15]. ↵ Si Quang Le and Olivier Gascuel . An Improved General Amino Acid Replacement Matrix . Molecular Biology and Evolution , 25 ( 7 ): 1307 – 1320 , July 2008 . ISSN 0737-4038 . doi: 10.1093/molbev/msn067 . OpenUrl CrossRef PubMed Web of Science [16]. ↵ P. Lopez , D. Casane , and H. Philippe . Heterotachy, an Important Process of Protein Evolution . Molecular Biology and Evolution , 19 ( 1 ): 1 – 7 , January 2002 . ISSN 0737-4038 . doi: 10.1093/oxfordjournals.molbev.a003973 . OpenUrl CrossRef PubMed Web of Science [17]. ↵ Andrew F Magee , Sarah K Hilton , and William S DeWitt . Robustness of Phylogenetic Inference to Model Misspecification Caused by Pairwise Epistasis . Molecular Biology and Evolution , 38 ( 10 ): 4603 – 4615 , October 2021 . ISSN 1537-1719 . doi: 10.1093/molbev/msab163 . OpenUrl CrossRef PubMed [18]. ↵ Megan M. McEvoy , Andrew C. Hausrath , Gannon B. Randolph , S. James Remington , and Frederick W. Dahlquist . Two binding modes reveal flexibility in kinase/response regulator interactions in the bacterial chemotaxis pathway . Proceedings of the National Academy of Sciences , 95 ( 13 ): 7333 – 7338 , June 1998 . doi: 10.1073/pnas.95.13.7333 . OpenUrl Abstract / FREE Full Text [19]. ↵ Francisco McGee , Sandro Hauri , Quentin Novinger , Slobodan Vucetic , Ronald M. Levy , Vincenzo Carnevale , and Allan Haldane . The generative capacity of probabilistic protein sequence models . Nature Communications , 12 ( 1 ): 6302 , November 2021 . ISSN 2041-1723 . doi: 10.1038/s41467-021-26529-9 . OpenUrl CrossRef PubMed [20]. ↵ Xavier Meyer , Linda Dib , Daniele Silvestro , and Nicolas Salamin . Simultaneous Bayesian inference of phylogeny and molecular coevolution . Proceedings of the National Academy of Sciences , 116 ( 11 ): 5027 – 5036 , March 2019 . doi: 10.1073/pnas.1813836116 . OpenUrl Abstract / FREE Full Text [21]. ↵ Bui Quang Minh , Heiko A Schmidt , Olga Chernomor , Dominik Schrempf , Michael D Woodhams , Arndt von Haeseler , and Robert Lanfear . IQ-TREE 2: New Models and Efficient Methods for Phylogenetic Inference in the Genomic Era . Molecular Biology and Evolution , 37 ( 5 ): 1530 – 1534 , May 2020 . ISSN 0737-4038 . doi: 10.1093/molbev/msaa015 . OpenUrl CrossRef PubMed [22]. ↵ Sanzo Miyazawa and Robert L. Jernigan . Estimation of effective interresidue contact energies from protein crystal structures: Quasi-chemical approximation . Macromolecules , 18 ( 3 ): 534 – 552 , March 1985 . ISSN 0024-9297 . doi: 10.1021/ma00145a039 . OpenUrl CrossRef Web of Science [23]. ↵ Chris A. Nasrallah , David H. Mathews , and John P. Huelsenbeck . Quantifying the Impact of Dependent Evolution among Sites in Phylogenetic Inference . Systematic Biology , 60 ( 1 ): 60 – 73 , January 2011 . ISSN 1063-5157 . doi: 10.1093/sysbio/syq074 . OpenUrl CrossRef PubMed [24]. ↵ Luca Nesterenko , Luc Blassel , Philippe Veber , Bastien Boussau , and Laurent Jacob . Phyloformer: Fast, Accurate, and Versatile Phylogenetic Reconstruction with Deep Neural Networks . Molecular Biology and Evolution , 42 ( 4 ): msaf051 , April 2025 . ISSN 1537-1719 . doi: 10.1093/molbev/msaf051 . OpenUrl CrossRef PubMed [25]. ↵ Roshan M. Rao , Jason Liu , Robert Verkuil , Joshua Meier , John Canny , Pieter Abbeel , Tom Sercu , and Alexander Rives . MSA Transformer . In Proceedings of the 38th International Conference on Machine Learning , pages 8844 – 8856 . PMLR , July 2021 . [26]. ↵ Jarrett Revels , Miles Lubin , and Theodore Papamarkou . Forward-Mode Automatic Differentiation in Julia , July 2016 . [27]. ↵ Alexander Rives , Joshua Meier , Tom Sercu , Siddharth Goyal , Zeming Lin , Jason Liu , Demi Guo , Myle Ott , C. Lawrence Zitnick , Jerry Ma , and Rob Fergus . Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences . Proceedings of the National Academy of Sciences , 118 ( 15 ): e2016239118 , April 2021 . doi: 10.1073/pnas.2016239118 . OpenUrl Abstract / FREE Full Text [28]. ↵ Douglas M. Robinson , David T. Jones , Hirohisa Kishino , Nick Goldman , and Jeffrey L. Thorne . Protein Evolution with Dependence Among Codons Due to Tertiary Structure . Molecular Biology and Evolution , 20 ( 10 ): 1692 – 1704 , October 2003 . ISSN 0737-4038 . doi: 10.1093/molbev/msg184 . OpenUrl CrossRef PubMed Web of Science [29]. ↵ Nicolas Rodrigue , Hervé Philippe , and Nicolas Lartillot . Assessing Site-Interdependent Phylogenetic Models of Sequence Evolution . Molecular Biology and Evolution , 23 ( 9 ): 1762 – 1775 , September 2006 . ISSN 0737-4038 . doi: 10.1093/molbev/msl041 . OpenUrl CrossRef PubMed Web of Science [30]. ↵ William P. Russ , Matteo Figliuzzi , Christian Stocker , Pierre Barrat-Charlaix , Michael Socolich , Peter Kast , Donald Hilvert , Remi Monasson , Simona Cocco , Martin Weigt , and Rama Ranganathan . An evolution-based model for designing chorismate mutase enzymes . Science , 369 ( 6502 ): 440 – 445 , July 2020 . ISSN 0036-8075 , 1095-9203. doi: 10.1126/science.aba3304 . OpenUrl Abstract / FREE Full Text [31]. ↵ Le Si Quang , Olivier Gascuel , and Nicolas Lartillot . Empirical profile mixture models for phylogenetic reconstruction . Bioinformatics , 24 ( 20 ): 2317 – 2323 , October 2008 . ISSN 1367-4803 . doi: 10.1093/bioinformatics/btn445 . OpenUrl CrossRef PubMed Web of Science [32]. ↵ Michael Socolich , Steve W. Lockless , William P. Russ , Heather Lee , Kevin H. Gardner , and Rama Ranganathan . Evolutionary information for specifying a protein fold . Nature , 437 ( 7058 ): 512 – 518 , September 2005 . ISSN 1476-4687 . doi: 10.1038/nature03991 . OpenUrl CrossRef PubMed Web of Science [33]. ↵ Julien Soubrier , Mike Steel , Michael S.Y. Lee , Clio Der Sarkissian , Stéphane Guindon , Simon Y.W. Ho , and Alan Cooper . The Influence of Rate Heterogeneity among Sites on the Time Dependence of Molecular Rates . Molecular Biology and Evolution , 29 ( 11 ): 3345 – 3358 , November 2012 . ISSN 0737-4038 . doi: 10.1093/molbev/mss140 . OpenUrl CrossRef PubMed [34]. ↵ Tyler N. Starr and Joseph W. Thornton . Epistasis in protein evolution . Protein Science , 25 ( 7 ): 1204 – 1218 , 2016 . ISSN 1469-896X . doi: 10.1002/pro.2897 . OpenUrl CrossRef PubMed [35]. ↵ Johanna Trost , Julia Haag , Dimitri Höhler , Laurent Jacob , Alexandros Stamatakis , and Bastien Boussau . Simulations of Sequence Evolution: How (Un)realistic They Are and Why . Molecular Biology and Evolution , 41 ( 1 ): msad277 , January 2024 . ISSN 1537-1719 . doi: 10.1093/molbev/msad277 . OpenUrl CrossRef PubMed [36]. ↵ Jérôme Tubiana , Simona Cocco , and Rémi Monasson . Learning protein constitutive motifs from sequence data . eLife , 8 : e39397 , March 2019 . ISSN 2050-084X . doi: 10.7554/eLife.39397 . OpenUrl CrossRef PubMed [37]. ↵ Ziheng Yang . Maximum likelihood phylogenetic estimation from DNA sequences with variable rates over sites: Approximate methods . Journal of Molecular Evolution , 39 ( 3 ): 306 – 314 , September 1994 . ISSN 1432-1432 . doi: 10.1007/BF00160154 . OpenUrl CrossRef PubMed Web of Science [38]. ↵ Ciyou Zhu , Richard H. Byrd , Peihuang Lu , and Jorge Nocedal . Algorithm 778: L-BFGS-B: Fortran subroutines for large-scale bound-constrained optimization . ACM Trans. Math. Softw ., 23 ( 4 ): 550 – 560 , December 1997 . ISSN 0098-3500 . doi: 10.1145/279232.279236 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted September 20, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Generative continuous time model reveals epistatic signatures in protein evolution Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Generative continuous time model reveals epistatic signatures in protein evolution Andrea Pagnani , Pierre Barrat-Charlaix bioRxiv 2025.09.17.676821; doi: https://doi.org/10.1101/2025.09.17.676821 Share This Article: Copy Citation Tools Generative continuous time model reveals epistatic signatures in protein evolution Andrea Pagnani , Pierre Barrat-Charlaix bioRxiv 2025.09.17.676821; doi: https://doi.org/10.1101/2025.09.17.676821 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7642) Biochemistry (17715) Bioengineering (13907) Bioinformatics (42005) Biophysics (21472) Cancer Biology (18624) Cell Biology (25534) Clinical Trials (138) Developmental Biology (13391) Ecology (19935) Epidemiology (2067) Evolutionary Biology (24356) Genetics (15617) Genomics (22529) Immunology (17753) Microbiology (40437) Molecular Biology (17200) Neuroscience (88697) Paleontology (667) Pathology (2840) Pharmacology and Toxicology (4829) Physiology (7653) Plant Biology (15171) Scientific Communication and Education (2046) Synthetic Biology (4304) Systems Biology (9827) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00