Ancestral Sequences Cannot be Accurately Reconstructed via Interpolation in a Variational Autoencoder’s Latent Space

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 72,461 characters · extracted from preprint-html · click to expand
Ancestral Sequences Cannot be Accurately Reconstructed via Interpolation in a Variational Autoencoder’s Latent Space | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Ancestral Sequences Cannot be Accurately Reconstructed via Interpolation in a Variational Autoencoder’s Latent Space View ORCID Profile Evan Gorstein , Mengze Tang , Hailey Bruzzone , Claudia Solís-Lemus doi: https://doi.org/10.1101/2025.11.19.689264 Evan Gorstein 1 Department of Statistics, University of Wisconsin-Madison , Madison, WI, United States of America 2 Wisconsin Institute of Discovery, University of Wisconsin-Madison , Madison, WI, United States of America Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Evan Gorstein Mengze Tang 2 Wisconsin Institute of Discovery, University of Wisconsin-Madison , Madison, WI, United States of America Find this author on Google Scholar Find this author on PubMed Search for this author on this site Hailey Bruzzone 1 Department of Statistics, University of Wisconsin-Madison , Madison, WI, United States of America 2 Wisconsin Institute of Discovery, University of Wisconsin-Madison , Madison, WI, United States of America Find this author on Google Scholar Find this author on PubMed Search for this author on this site Claudia Solís-Lemus 2 Wisconsin Institute of Discovery, University of Wisconsin-Madison , Madison, WI, United States of America 3 Department of Plant Pathology, University of Wisconsin-Madison , Madison, WI, United States of America Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: solislemus{at}wisc.edu Abstract Full Text Info/History Metrics Preview PDF Abstract Standard methods for ancestral sequence reconstruction (ASR) rely on substitution models for the residues in a biological sequence and assume independent evolution across these sites, ignoring the epistatic interactions that shape molecular evolution. In contrast, deep learning models like variational autoencoders (VAEs) can learn low-dimensional representations (“embeddings”) of sequences in a protein family that may implicitly handle these dependencies, raising the possibility of performing more accurate ASR by interpolating between extant sequence embeddings within the VAE’s latent space. In this study, we test this hypothesis by developing and evaluating a VAE-based ASR pipeline. Benchmarking this approach against established likelihood-based and parsimony methods using various simulations of protein evolution, including scenarios with and without epistasis, we find that the VAE-based approach is consistently and significantly outperformed by standard methods, even in epistatic regimes where it was hypothesized to have an advantage. We further show that this failure is not due to a lack of phylogenetic signal in the latent space, which does recapitulate evolutionary structure. Rather, the primary limitation is the information loss inherent to the autoencoding process: the VAE’s decoder cannot reconstruct sequences with sufficient fidelity for the precise demands of ASR. 1 Introduction Substitution models which characterize how individual sites in a biological sequence evolve stochastically over time have served as a workhorse in molecular evolutionary research, facilitating the study of the evolutionary history of a gene or protein family, including the estimation of the relative timing of residue substitutions, gene duplications, and speciation events ( Jukes et al. 1969 ; Dayhoff et al. 1978 ; Whelan and Goldman 2001 ; Le and Gascuel 2008 ). Application of these models consists in viewing the sequence as a collection of independently evolving residues, which in turn allows one to avoid the combinatorial explosion of sequence space when computing the likelihood of a given evolutionary scenario. The resulting computational tractability comes at the cost of neglecting the epistasis that often plays a role in molecular evolution, which can occur due to interactions between sites in determining structure and function of a protein ( Starr and Thornton 2016 ; Nasrallah et al. 2010 ). Outside of phylogenetics, researchers have increasingly leveraged methods from machine learning and natural language processing to study protein sequences. In particular, there has been much work in recent years towards the development of unsupervised or weakly supervised methods to learn useful representations of protein sequences as dense numerical vectors, known as “protein embeddings”, from the large collections of homologous protein sequences deposited and cataloged in online databases ( Yang et al. 2018 ; Rao et al. 2019 ; Detlefsen et al. 2022 ). Although these embeddings have more recently become associated with large protein language models trained on millions of sequences from thousands of diverse families ( Fan et al. 2025 ), prior to the proliferation of these universally trained models, researchers trained models such as variational autoencoders (VAEs) to learn embeddings for only the sequences in an individual protein family. The resulting embeddings have been used to visualize the space of sequences within the family, predict the impact of mutations, and even generate novel viable sequences ( Sinai et al. 2017 ; Riesselman et al. 2018 ; Hawkins-Hooker et al. 2021 ; Ziegler et al. 2023 ). Detlefsen et al. (2022) discuss the various choices involved in learning protein representations and illustrate empirically the implications of these choices for downstream predictive and interpretive goals. Notably, they show that when the goal is to represent the phylogenetic relationships within a given family, training on aligned sequences from that family alone is advantageous, although state-of-the-art protein language models trained universally do also learn within-family phylogenetics ( Tule et al. 2024 ). Exploring the shape of the manifold in latent space of the embeddings from a VAE trained on a single family of aligned sequences, both Detlefsen et al. (2022) and Ding et al. (2019) find that it captures the evolutionary structure of the data, namely, that sequences have evolved along a phylogenetic tree. In the latter article, the authors hierarchically cluster the embeddings based on Euclidean distance within the latent space and find that these clusters identify larger evolutionary clades more accurately than the clades in a tree estimated with FastTree ( Price et al. 2009 ); however, these embedding clusters are less accurate than the estimated tree at identifying smaller clades. In theory, the advantage of embedding models relative to the independent-sites models employed by phylogeneticists is their flexibility, model-agnosticism, and potential to capture epistatic effects by learning directly from the data. The results from Ding et al. (2019) suggest, however, that although the global organization of the embeddings recapitulates the overall phylogeny, finer-grained historical relationships may not be recoverable from them. In this paper, we investigate the limits of an embeddings-based approach to phylogenetics by studying whether we can perform ancestral sequence reconstruction (ASR) by interpolating within the latent space of a VAE. Ancestral sequence reconstruction is a fundamental task in phylogenetics in which the analyst attempts to infer, on the basis of extant homologous sequences, the amino acids of an ancestral sequence at one or multiple of the internal nodes of the phylogenetic tree that relates these homologs ( Pauling et al. 1963 ; Hochberg and Thornton 2017 ; Selberg et al. 2021 ). Classically, ancestral sequence reconstruction was done using a maximum parsimony principle ( Dayhoff et al. 1978 ), but likelihood-based approaches have emerged as the dominant methodology in more recent years. The models that these approaches assume and from which the likelihood is thus derived are the substitution models mentioned at the outset, along with the corresponding implicit assumption of independent evolution across sites. In this paper, we benchmark an approach to ASR based on the embeddings from a VAE, comparing it to this more standard approach based on the likelihood function from substitution models. In addition to learning a function for embedding sequences in a continuous space (the encoder), VAEs simultaneously learn a mapping back to the space of sequences (the decoder) that is trained to approximate the inverse of the encoder. Thus, interpolation within the embedding space of VAE can be used to generate novel, yet meaningful data by feeding the interpolates through the decoder. In the context of ASR, the novel, meaningful data we want our variational autoencoder to produce are the ancestral sequences that once existed in the past and which gave rise to the extant sequences on which the VAE is trained. Following a brief review of related work, we detail this approach, scalable to families with thousands of sequences, of embedding-based ancestral sequence reconstruction. We then describe the results of simulation studies conducted to benchmark this approach against existing methods for ASR. In the absence of recoverable ancient sequences, real protein families lack a definitive ground truth with which to evaluate proposed ancestral sequences and it is therefore common to benchmark ASR on computationally simulated evolution ( Williams et al. 2006 ; Moreta et al. 2022 ; De Leonardis et al. 2025 ). In this work, we study several different sequence evolution simulation schemes, including multiple that incorporate epistasis or co-evolution of sites, to see whether an embeddings-based approach to ASR might improve the reconstruction accuracy relative to existing methods that ignore epistasis. We find that in each tested scenario, embedding-based ASR is outperformed by standard likelihood-based methods, even when the assumption of independent site evolution made by these methods is violated. In order to do ASR with embeddings, one requires both 1) a latent space that is phylogenetically meaningful and 2) a mapping from sequence to embedding space that does not lose too much information about the sequence. In the context of the simulations with epistasis, we discuss the relative responsibility of each condition’s absence for the inferior performance of an embeddings-based approach to ASR. 2 Related work While there has been effort in the phylogenetic community to develop models of protein sequence evolution that incorporate epistatic effects, with the notable exception of the recent work of De Leonardis et al. (2025) , there are few methods for performing phylogenetic inference based on these models. Robinson (2003) and Rodrigue et al. (2006) both consider epistatic models of sequence evolution based on a structure-based fitness landscape, but these models have not been used for phylogenetic inference. Meyer et al. (2019) models co-evolving sites in sequences of nucleotides in a Bayesian model that facilitates joint inference of the phylogeny and the pairs of coupled sites. De La Paz et al. (2020) propose transforming a Potts model into a dynamic model of sequence evolution with MCMC sampling, which they call Sequence Evolution with Epistatic Contributions (SEEC). While forward simulation from an extension of this model has proven useful for exploring sequence space and generating functional phenotypes ( Bisardi et al. 2021 ; Alvarez et al. 2024 ), it has not been used for phylogenetic inference. Recently, De Leonardis et al. (2025) developed an analytically tractable model of sequence evolution that accounts for epistasis and which can be used to perform ancestral sequence reconstruction. In particular, these authors formulate a model according to which each site in a sequences evolves in the context of preceding sites (in some, pre-specified ordering) according to a Markov chain whose stationary distribution is given by the fitted conditional probabilities from a regression on the previous sites. Maximum likelihood ancestral sequence can be reconstructed under this model by following the same ordering, one site at a time, and accounting for the conditional probabilities in reconstructing each position; the resulting estimates are more accurate than reconstructions from IQTree ( Minh et al. 2020 ) in settings where the sequences have been simulated with epistasis. Finally, Moreta et al. (2022) propose performing ancestral sequence reconstruction with a VAE. While an important motivation for this work, their approach incorporates the known phylogenetic relationships between sequences into the training of the VAE; in particular, they impose a prior on the latent variables in the VAE based on a Gaussian process along the tree, thus tying together the different sequences in the training data. In contrast, we investigate whether neural network embedding models trained without any access to the phylogenetic tree still learn representations that can be used for accurate ancestral sequence reconstruction, a question that is of interest for several reasons. First, the phylogenetic prior in the latent space means that the VAE from Moreta et al. (2022) must be trained with more complex stochastic variational inference methods ( Bingham et al. 2019 ). Because we only incorporate the phylogenetic correlation structure at a later stage in our ASR pipeline (namely, with the Brownian motion model applied to pre-computed embeddings), we are able to train a standard VAE, a method that is scalable to larger families with thousands of sequences (the largest family analyzed in Moreta et al. (2022) has 800 sequences and takes over 3.5 hours to train). Secondly, protein language models as well as most other protein embedding models are trained in a completely unsupervised fashion without phylogenetic tree information available at training time. Excluding information on phylogenetic tree in the training of our VAE increases the chances that our findings generalize to other sequence embeddings. Finally, it has already been demonstrated in Ding et al. (2019) and Detlefsen et al. (2022) that the embeddings from a standard VAE trained on sequences from a large multiple sequence alignment recapitulate phylogeny, so it would not seem necessary to add a phylogenetic prior to the VAE. In section 4.2, we provide evidence that an absence of phylogenetic signal in the learned embeddings is not what is responsible for the disappointing performance of the embeddings-based approach to ASR that we encounter. 3 VAE-assisted ancestral sequence reconstruction The goal of ASR is to estimate the sequence identity of some determinate ancestor (RNA, DNA, or protein) on the basis of extant homologs from the family. In contrast to other methods for ASR, the embeddings-based approach requires a family of (at least) hundreds of extant sequences, as these comprise the data for training a variational autoencoder. Although in practice the alignment of such a large collection of potentially distantly related sequences and the phylogenetic tree that relates them are both subject to error, especially when estimated based on a potentially flawed model of independent site evolution, for the purpose of focusing solely on evaluating the task of ASR and in line with other ASR methods ( De Leonardis et al. 2025 ; Moreta et al. 2022 ), we assume that we are in possession of the ground-truth alignment and the ground truth phylogenetic tree, with branch lengths, possibly unrooted. Note as well that if the extant sequences in the alignment are not all orthologs, but also include paralogs, the tree will include some internal nodes representing duplication, rather than speciation, events. Most methods for performing ASR, including the approach presented here, are agnostic to the interpretation of the internal nodes of the tree. In the pipeline under study, represented graphically in Fig. 1 , we begin by training a variational autoencoder on the collection of fixed length extant sequences from the alignment to learn a latent variable model of the distribution of sequences in the family. The variational autoencoder framework includes an encoder, which accepts as input a sequence and outputs the parameters of a multivariate Gaussian distribution in a low-dimensional space (Methods and materials). Once the model has been trained, we view the location parameter of this distribution output by the encoder when fed a particular sequence as the embedding of the sequence in latent space. Viewing the embeddings of all the extant sequences in the alignment as a transformed version of the data, we introduce a model which assumes that they are generated by a multivariate Brownian motion along the phylogenetic tree within the low-dimensional latent space (Methods and materials). Note that this model is only introduced and applied to the embeddings obtained from the VAE after the latter has been trained in the standard fashion using stochastic gradient descent. The Brownian motion model applied to the embeddings of the extant sequences yields maximum likelihood estimates of “ancestral embeddings” at each internal node in the tree (Methods and materials). We then feed these estimated ancestral embeddings through the decoder of our trained VAE to obtain estimated ancestral sequences ( Fig. 1 , Methods and materials). Download figure Open in new tab Fig. 1: Graphical representation of method for ASR based on the embeddings from a VAE. The VAE is trained on extant sequences from a protein family to learn continuous representations of each extant member of the family. These “embeddings” are then used to fit a multivariate trait evolutionary model, yielding posterior estimates of “ancestral embeddings” for each internal node of the tree. Finally, the ancestral embeddings are fed back through the VAE decoder to arrive at reconstructions of the ancestral sequences. We benchmark the VAE-based approach to ancestral sequence reconstruction (ASR) on sequences simulated to evolve in several ways. First, we consider the traditional model in which sites evolve independently in order to understand how VAE-based ASR does in this well-understood setting. We expected, however, that the advantage of an embeddings-based approach to ASR would only emerge in settings where the independent sites assumption is violated, as the embeddings would then be able to capture coevolutionary effects which standard phylogenetic models do not. To test this, we consider two epistatic models of sequence evolution: the Sequence Evolution with Epistatic Contributions (SEEC) model from ( De La Paz et al. 2020 ), which amounts to Gibbs sampling from a Potts model, and the auto-regressive model of ( De Leonardis et al. 2025 ), in which each site is simulated to evolve according to a continuous time Markov Chain whose stationary distribution depends on the sites that were evolved before it. Finding that a standard likelihood-based approach based on a model that assumes independent evolution of sites outperforms an embeddings-based approach even when applied to sequences simulated to evolve with epistasis, we are able to discount the utility of the embeddings-based approach for most, if not all, datasets. In the following sections, we describe the simulations results, first for the independent sites datasets, then the epistatic datasets. 4 Results 4.1 Independent site evolution For our independent-sites simulations, we use the estimated general time reversible model of site-specific evolution from Le and Gascuel (2008) , known as the LG model. We consider three phylogenetic trees of various sizes on which we simulate evolution under this model. The first tree is a random tree with 10,000 tips simulated with random uniform branch lengths between 0 and 0.3 (see Methods and materials). On this tree, we simulate evolution of sequences with the LG model and with equal substitution rates across sites, reproducing the tree and evolutionary simulation from Ding et al. (2019) . Next, we consider more realistic settings. We choose tree topologies that have been estimated to members of the Cluster of Orthologous Genes (COG) database ( Tatusov 2001 ), taken from the benchmark set of FastTree ( Price et al. 2009 ). In particular, we consider one tree with 1250 tips, inferred to the COG28 family, and one tree with 5000 tips, inferred to the COG2814 family (see Methods and materials). On each tree, we again simulate independent-site evolution according to the LG model, but for these, we included rate heterogeneity across sites (see Methods and materials). In each case, the simulated sequences have 100 sites. Applying our pipeline to each dataset, we trained a variational autoencoder on the collection of evolved sequences at the tips of the tree, then fit a Brownian motion model to the embeddings at the tips to estimate ancestral embeddings, then finally fed the estimated ancestral embeddings back through the decoder of the VAE to obtain estimated ancestral sequences. In order to understand the method visually, we first used a VAE with a two dimensional latent space. We are able to reproduce the same star-shaped manifolds discussed in Ding et al. (2019) and moreover reproduce the finding that, at least in some cases, tracing the embeddings of a particular lineage of sequences in embedding spaces involves moving from the center of the latent space towards the periphery along the spikes in the star-shaped manifold ( Fig. 2A ). Moreover, the estimates of the ancestral embeddings from the Brownian motion model remain within the star-shaped manifold, and the ancestral sequence reconstruction accuracy resulting from decoding the estimated embeddings is higher for those embeddings closer to the tips of the spikes, corresponding to ancestors that are evolutionarily closer to extant sequence, than for those that are closer to the center, which represent sequences that are deeper in the tree ( Fig. 2B ). However, Fig. 2C shows that our estimates of ancestral embeddings based on the Brownian motion model applied to the leaf embeddings are often closer to the origin than the embeddings of the ground-truth ancestral sequences. If we look more closely at the dots in the background of Fig. 2A , we can see why that is: in the spikes of the latent space, the embeddings of the ground truth ancestral sequences are further from the origin than the embeddings of the leaf sequences that the model is exposed to during training, resulting in bias when we use the leaf embeddings to infer the location of the ancestral embeddings with a Brownian motion model. This mismatch between embeddings of leaves and embeddings of ancestral sequences challenges the interpretation of the latent space as an accurate representation of the sequences’ evolutionary history, despite the results from Ding et al. (2019) . Download figure Open in new tab Fig. 2: Visualizations of the latent space for independent sites simulations. Row A : The embeddings of all leaf (tiny blue dots) and ground-truth ancestral (tiny red dots) sequences from the trained VAE. The embeddings of four leaves at the outskirts of the embedding space are singled out with red crosses. The embeddings of the ancestral lineage of each singled-out leaf is connected by a line, and these ancestral lineages coalesce at the root in the center of embedding space, as in Ding et al. (2019) . The colors of the embeddings of the ancestors of the singled-out leaves are based on their distance to the root. Row B : The embeddings of all leaf sequences (unfilled dots) and the reconstructed ancestral embeddings based on the Brownian motion model (colorful, filled dots), with edge connections based on the phylogenetic tree. Colors of reconstructed ancestral embeddings are based on the Hamming error resulting from feeding the reconstructed embedding through the VAE decoder and comparing with the ground truth ancestral sequence. Row C : Visualization of the error in estimating ancestral embeddings using the Brownian motion model. Each arrow starts at an estimated embeddings based on Brownian motion model and ends at the embedding of the actual sequence. That is, each arrow begins at a dot from the plot in row B and ends at a dot from the plot in row A . Column 1250 tips : Family simulated with rate heterogeneity on COG28 tree. Column 5000 tips : Family simulated with rate heterogeneity on COG2814 tree. Column 10000 tips : Family simulated without rate heterogeneity on random, balanced tree from ( Ding et al. 2019 ). Ultimately, we are interested in the accuracy of the estimated ancestral sequences obtained from decoding the estimated ancestral embeddings, not the accuracy of the estimated ancestral embeddings themselves. Two-dimensional latent spaces have the virtue of being plottable, but may impose too severe of an information bottleneck for accurate reconstruction, so we additionally trained variational autoencoders to each dataset with a 20-dimensional latent space. Fig. 3 shows the ancestral reconstruction accuracy of the VAE-assisted approach for the two different latent space dimensions, 2 and 20, along with the performance of several different existing methods: maximum likelihood estimation under a model of independent evolution using the IQTree software ( Minh et al. 2020 ), maximum-likelihood estimation under the autoregressive model of evolution from De Leonardis et al. (2025) , maximum parsimony estimation, and a baseline obtained by using the consensus sequence from the alignment of leaf sequences for each reconstructed ancestor (implementation details for each method in Methods and materials). For each method, a Locally Weighted Scatterplot Smooth (LOWESS) curve was fit to model the mean Hamming error of the method’s reconstructions of ancestral sequences as a function of the ancestors’ depth in the tree, where depth is defined as patristic distance to the nearest leaf (Methods and materials). Download figure Open in new tab Fig. 3: LOWESS curves fit to Hamming error versus depth-in-tree scatter plots for benchmark methods and for VAE-based approaches on each of the three protein families simulated with independent evolution. Benchmark methods include modal : the consensus sequence from the MSA of the leaf sequences is used to estimate each ancestral sequence; parsimony : the maximum parsimony reconstructions of ancestral sequences; iqtree : marginal maximum likelihood reconstructions under an independent evolutionary model using IQTree; ardca : marginal maximum likelihood reconstructions based on the auto-regressive evolutionary model. 1250 tips : family simulated with rate heterogeneity on COG28 tree; 5000 tips : Family simulated with rate heterogeneity on COG2814 tree; 10000 tips : family simulated without rate heterogeneity on random, balanced tree from Ding et al. (2019) . For each simulation setting, both of the likelihood-based reconstructions, IQTree and ArDCA, dominate the VAE-based reconstructions at every evolutionary depth. In fact, even maximum parsimony dominates the reconstructions based on the two-dimensional VAE for all simulations and even the reconstructions based on the 20-dimensional embeddings for the simulation without site rate heterogeneity. Using a VAE with an even higher-dimensional latent space, we would be able to achieve arbitrarily small reconstruction error for ancestral sequences at depth 0, corresponding to ancestors that are arbitrarily close to present-day sequences, as the higher-capacity VAE is able to memorize the present-day sequences that its trained on. The cost of using the higher-capacity VAE is that the error starts to increase at a much higher rate with respect to depth as we overfit the VAE more to the present-day sequences its trained on. Even for the 2- and 20-dimensional latent space VAEs that we trained, which are not overfit (see Methods and materials), for the families simulated with site rate heterogeneity, we are better off using the consensus sequence than decoding the estimated ancestral VAE embedding when predicting the ancestors deepest in the tree ( Fig. 3 ). 4.2 Epistatic evolution To simulate epistatic evolution, we considered two distinct models, both based on transforming a generative model of the distribution of sequences in a protein family into a dynamic model of evolution. In both cases, we first fit the generative model to the Pfam family of Staphylococcal nuclease homologs (PF00565) (full MSA downloaded from PFAM, but pre-processed, see Methods and materials). In the first case, we fit a Potts model to this MSA ( Rosset et al. 2025 ), and in the second case, we fit the auto-regressive model of Trinquier et al. (2021) (Methods and materials). Contact maps from these fitted models reveal several pairs of positions that show signs of epistatic coupling ( Fig A1 ). To transform these generative models, Potts and auto-regressive, to models of sequence evolution, we follow the approaches of De La Paz et al. (2020) and De Leonardis et al. (2025) , respectively. Specifically, for evolution with the Potts model, we performed Gibbs sampling from the Potts model along the phylogenetic tree, making the number of mutational steps on each branch proportional to its length. To simulate evolution with the auto-regressive model, we follow De Leonardis et al. (2025) , who allow the evolution of each site in a sequence to depend on preceding ones (in some pre-specified order) by setting the equilibrium probabilities in the site’s Markovian dynamics at each branch in the tree to equal the predicted probabilities of the fitted autoregressive model given the previously evolved sites (see Methods and materials). In both cases, the tree on which we simulated evolution was the COG28 tree with 1250 leaves from the previous independent sites simulations. Fig. 4 displays the ancestral sequence reconstruction accuracy for these two datasets as a function of ancestral depth in the tree. Although there is a switch from the previous simulations in the best-performing method from IQTree to ArDCA, corroborating the results of De Leonardis et al. (2025) , both of these methods, as well as maximum parsimony, continue to outperform the VAE-based approach at almost all depths and for both versions of the VAE with different latent space dimensions. The 20-dimensional latent space VAE performs similarly to maximum parsimony; however, even this optimal VAE model is outperformed by IQTree on these datasets, despite the fact that IQTree is maximizing the likelihood of a model with independent site evolution (the LG model, see Methods and materials). Training a less regularized VAE (either by decreasing the weight decay hyperparameter or increasing the dimensionality of the latent space) yields an accuracy-depth curve with a steeper slope, such that the reconstruction accuracy for ancestors deep in the tree is even poorer than that of the 20-dimensional VAE shown in Fig. 4 (not shown). Download figure Open in new tab Fig. 4: Hamming error versus depth in tree for benchmark methods and VAE-based approaches on each of the epistaticly simulated families. See caption to Fig. 3 for details on the benchmark methods included. As mentioned in the Introduction, satisfactory performance from VAE-assisted ASR requires two distinct properties: the latent space must be phylogenetically meaningful, and the embeddings must preserves sufficient information about the sequences. Without the first property, namely the phylogenetic interpretability of the latent space, it does not make sense to fit a phylogenetic Brownian motion model to the embeddings to estimate ancestral embeddings. Without the second, namely fidelity in the VAE’s self-reconstructions, even if the phylogenetic Brownian motion model in the latent space is appropriate, feeding the interpolates representing ancestral embeddings back through the decoder will produce inaccurate sequences. In order to assess the relative responsibility of the absence of each condition to the results from our embeddings-based ASR pipeline, we considered that, in this simulated setting where ancestral sequences are known, we can remove the issue of estimating ancestral embeddings with a Brownian motion model by simply using the embeddings of the ground truth ancestral sequences. By decoding these latter embeddings with our trained VAE and evaluating their accuracy, we isolate the VAE’s inherent self-reconstruction inaccuracy; we can then compare this inherent self-reconstruction inaccuracy to the overall ASR inaccuracy that results from first estimating ancestral embeddings with the Brownian motion model and then feeding these estimated ancestral embeddings through the VAE decoder. Fig. 5 displays the result of this comparison. In particular, the red curves represents the Hamming distance between ground truth ancestral sequences and the decoding of their embeddings, a measure of self-reconstruction accuracy of the trained VAE, while the purple curves are the Hamming error from decoding the estimated embeddings, the same as the purple curves in Fig. 4 . For the VAE with a 20-dimensional latent space, the accuracy of the self-reconstruction of the ancestors is only slightly better than the accuracy based on decoding estimated ancestral embeddings and notably still worse than the IQTree benchmark, which is based on a flawed independent sites model. Download figure Open in new tab Fig. 5: Comparison of Hamming distance between ground truth ancestral sequences and their self-reconstruction (decoding of their own embeddings) (red curves) with Hamming distance between ground truth ancestral sequences and the decoding of their estimated embeddings obtained from Brownian motion model applied to embeddings of leaf sequences (purple curves). Error of two benchmarked approaches (maximum parsimony reconstructions and IQTree reconstructions) also shown for reference. Interestingly, for the VAE with a 2-dimensional latent space, reconstructed ancestral sequences obtained by decoding the estimated ancestral embeddings are, for ancestors that are relatively shallow, actually more accurate than the self-reconstructions. To explain this, we can return to our observation that the embeddings of the ground truth ancestral sequences tend to be further from the origin than those of the leaf sequences, the sequences with which the VAE is trained ( Fig. 2A, C ). It is this bias that results in the counter-intutitive finding in Fig. 5 that in order to estimate an ancestral sequence that is closely related to some extant sequence (such that it has small depth), rather than decode its own embedding, we do better by decoding something that is closer to the embedding of this extant sequence that the model was trained with. 4.3 Beta-lactamase We additionally ran our pipeline on the Beta-lactamase family (PFAM family PF00144, Mistry et al. (2020) ). We trained the VAE model with latent dimensions of 2 and 20 on a pre-processed MSA that had 23,698 sequence with 229 sites (see Methods and materials). On the basis of this same MSA with 23,698 tips, we inferred a tree using IQTree. We then took the subtree induced by a subset of 134 representative sequences from Detlefsen et al. (2022) (see Methods and materials), computed their embeddings by applying the trained VAE encoder, reconstructed ancestral embeddings for the internal nodes of the subtree using Brownian motion, and decoded these estimated ancestral embeddings to obtain estimated ancestral sequences. Figure 6 shows the predicted distribution of amino acids in the first 20 positions of the ancestral sequence at the root according to different methods, including both VAE models, as a probability logo plot. Embeddings from the 2 dimensional VAE and reconstruction of a different ancestral sequence are shown in Figures B2 and B3 , respectively. Overall, we see that the uncertainty in the VAE-based reconstructions tracks the uncertainty from other methods; however, there are discrepancies between the VAE-based reconstructions and the other methods. For example, in position 6, parsimony, IQTree, and ArDCA are unanimous in reconstructing a glutamic acid, while both VAEs are quite uncertain with the low-dimensional VAE assigning the highest probability to glutamine and the high-dimensional VAE assigning the highest probability to alanine. Download figure Open in new tab Fig. 6: Probability logo plot of reconstructions of first 20 positions of root sequence of the Beta-lactamase family (PF00144) with different methods: maximum parsimony (Parsimony), maximum likelihood under an independent sites model (IQTree), maximum likelihood under the autoregressive evolutionary model (ArDCA), a VAE with a two-dimensional latent space (Low dimensional VAE), and a VAE with a twenty-dimensional latent space (High dimensional VAE). 5 Discussion In the VAE-based ancestral sequence reconstruction pipeline under study, we posit a Brownian motion model of evolution in the continuous latent space that is decoded by our learned VAE decoder to arrive at the evolution in sequence space. In this way, we avoid modeling evolution in the discrete sequence space which typically requires an assumed independence across sites to make the model computationally tractable. We hypothesized that we could more accurately capture the evolutionary traversal through sequence space in this way. In practice, however, we ran into difficulties. The proliferation of powerful protein language models in recent years has raised the prospect of a new set of potential tools for phylogenetics ( Tule et al. 2024 ). Our work highlights the importance of not uncritically assuming that deep-learning based embeddings can be used as the input data with which to perform common tasks in phylogenetics. Indeed, to the question of whether interpolation in the latent space of a VAE trained on extant sequence can match more standard approaches at the task of ancestral sequence reconstruction, we answer with a resounding no, even in settings in which sequences have evolved with epistasis. We additionally show that the problem is not that the latent space of the VAE is not phylogenetically meaningful. In fact, phylogenetic distances are preserved in the latent space, and the spikes in the latent manifold, as first observed in ( Ding et al. 2019 ), do often correspond to clades in the phylogeny ( Figure 2A ). Rather, what is responsible for the poor performance is simply the fact that the VAE encoder loses too much information about the sequences in performing the compression to the latent space and therefore the embeddings cannot be decoded back to sequences with high enough fidelity. The information bottleneck becomes less severe with an increase in the dimension of the latent space, but even when the ground truth ancestral sequences are passed through a trained VAE with a 20-dimensional latent space, these self-reconstructions are less accurate than the ancestral reconstruction estimates from IQTree ( Figure 5 ). Ancestral sequence reconstruction requires attending precisely to the sequences local to the targeted ancestor, and it is just this sort of precise, local information that is blurred over in the trained VAE. We stress that VAE embeddings can still be useful for visualizing, clustering, and exploring protein families despite falling short for ancestral sequence reconstruction, as we have documented here. More generally, embeddings from protein language model are extremely rich representations and have been demonstrated to be extremely effective for structural and functional prediction. One tantalizing if unrealized prospect is the incorporation of structural information into ancestral sequence reconstruction pipelines, and the mulitmodal protein language models which incorporate training on structural data provide an effective means to produce a shared embedding space for both sequence and structure ( Fan et al. 2025 ; Heinzinger et al. 2024 ; Blaabjerg et al. 2024 ). Even with these exciting advances in developing more rich protein embeddings that results from both scaling up the training data and incorporating structural information, our study serves as a warning against uncritically employing embeddings as input data for tasks in which the goal is the fine-grained reconstruction of the evolutionary past. 6 Methods and materials 6.1 Trees The random tree with uniform branch lengths between 0 and 0.3 was generated by calling the populate() method of the Tree class in python package ete3 (( Huerta-Cepas et al. 2016 ), v3.1.1). The Cluster of Orthologous Genes trees from the FastTree study can be downloaded from http://www.microbesonline.org/fasttree/downloads/aa5K_new.tar.gz (trees with 5000 leaves) and http://www.microbesonline.org/fasttree/downloads/aa1250.tar.gz (trees with 1250 leaves). Price et al. (2009) describe how these unrooted trees were inferred to alignments of protein families from the Clusters of Orthologous Groups (COG), using either PhyML (for the 1250 leaf trees) or FastTree (for the 5000 leaf trees). Our simulations use specifically the COG28 tree (1250 leaves) and COG2814 tree (5000 leaves). Note that the FastTree algorithm results in some negative branch lengths in the 5000 leaf trees. Moreover, even the tree with 1250 leaves has several extremely short pendant edges, which would lead to duplicate evolved sequences. Therefore, we pre-processed the trees by (1) replacing all negative branch lengths with 0 and (2) subsequently trimming any pendant branch of length below 0.001. In order to simulate directed evolution along these trees, we rooted them at their midpoint. 6.2 Evolution simulations 6.2.1 Independent sites To simulate evolution under the LG model without site rate heterogeneity, we use code from Ding et al. (2019) . We use IQTree with the --alisim flag to simulate MSAs under the LG model with rate heterogeneity across sites. We allowed rates to vary according to a discrete (4 categories) Gamma distribution with shape parameter 0.5, -m LG+G { 0.5 }, representing reasonably sharp rate heterogeneity. 6.2.2 Potts Our first simulator of epistatic evolution was performed by Gibbs sampling from a Potts model along a phylogeny. Evolution proceeds by randomly choosing a site in the sequence to mutate and sampling from the conditional distribution of this site given all the other sites according to the Potts model (note that this may result in no change if the sampled amino acid is the same as the current one). The number of times we do this on a given branch equals the branch length times the length of the sequence. We rely on the implementation in the Julia package PottsEvolver.jl ( Pagnani and Barrat-Charlaix 2025 ). The Potts model under which evolution is simulated was trained with Boltzmann machine learning to reproduce the statistics of an MSA for the family of Staphylococcal nuclease homologues (PF00565) using the adabmDCA software ( Rosset et al. 2025 ). 6.2.3 Autoregressive The autoregressive model of evolution is formulated in De Leonardis et al. (2025) . We fit an autoregressive generative model to the family of Staphylococcal nuclease homologues (PF00565) using the ardca() function in the Julia package ArDCA.jl ( Trinquier et al. 2021 ). To simulate evolution using the parameters from this fitted generative model, we rely on the function ASR.Simulate.evolve() from AncestralSequenceReconstruction.jl ( De Leonardis et al. 2025 ). 6.3 Real protein families 6.3.1 Beta lactamase (PF00144) We downloaded the MSA for PFAM family PF00144 from the Github repository released with Detlefsen et al. (2022) , accessible at https://github.com/MachineLearningLifeScience/meaningful-protein-representations/blob/master/tape/PF00144_full.txt . We pre-processed this MSA by following the steps in Ding et al. (2019) . In particular, we specified a “query” sequence (A0A010Q9K6 9PEZI/15-292) and performed the following steps in order: We remove all positions in the alignment at which the query sequence has a gap. We remove any sequences that at this point are more than 20% gaps. We remove any positions that are at this point more than 20% gaps. This reduced the length of the original MSA from 2592 to 229 and reduced the number of sequences from 36,328 to 23,698. The subset of 134 representative sequences are the intersection of those 200 sequences used for ancestral reconstruction in Detlefsen et al. (2022) that were retained in our pre-processed MSA. 6.3.2 Staphylococcal nuclease (PF00565) The “full” MSA for PF00565 was downloaded directly from Interpro at https://www.ebi.ac.uk/interpro/entry/pfam/PF00565/entry_alignments/ and subsequently filtered to only include sequences from Eukaryotes. We then performed the same query-based filtering steps with query sequence SND1_HUMAN/552-660. 6.3.3 Tree inference for PF00144 A phylogenetic tree for PF00144 was inferred from the pre-processed MSAs described above using IQTree 2 ( Minh et al. 2020 ). Because no prior information was available regarding the appropriate evolutionary model, we enabled automatic model selection using the -m MFP option, which evaluates a range of candidate substitution models and selects the best-fitting model based on the lowest Bayesian Information Criterion (BIC). 6.4 Variational Autoencoder A variational autoencoders (VAE) is a neural network with an autoencoder architecture that is trained to learn a probabilistic, latent variable model of the data it encodes ( Kingma and Welling 2013 ). Autoencoder neural networks generally consist of two subnetworks: an encoder and a decoder. The encoder network learns a mapping from the high-dimensional space of the data, in our case, protein sequences, to a lower-dimensional embedding space. The decoder, in turn, learns to map the lower dimensional space back to the original space in such a way that it approximates the inverse of the encoder. Autoencoders thus learn lower-dimensional representations of the data they are trained on. In a variational autoencoder, in order to avoid overfitting and produce a smoother embedding space, a prior is placed on the embeddings and the autoencoder is trained to learn a probabilistic model of the data. Formally, for each sequence s in our alignment, it is assumed that a continuous, low-dimensional latent variable z first gets generated according to a simple distribution p ( z ); then z gets mapped in a complex fashion to a distribution p θ (· | z ) over sequences, where the mapping is parameterized by unknowns θ ; finally, s is sampled from this distribution. The decoder, DEC, of the autoencoder is used to execute this complicated mapping from z to p θ (· | z ). If we were to attempt to learn the weights and biases, θ , of the decoder by directly maximizing the marginal likelihood of the data, we would encounters the intractable integrals in where n is the total number of sequences in the alignment s = {s (1) , s (2) , …, s ( n ) } . To facilitate the learning of this model, we introduce the encoder neural network, ENC, parameterized by ϕ , which should map each sequence s to a distribution that approximates the posterior of the latent variable given the sequence, ENC( s ) = q ϕ (· | s ) ≈ p θ (· | s ). To simultaneously train the encoder and decoder, we consider, instead of log p θ ( s ), the evidence lower bound (ELBO) objective function where KL(· ∥ ·) denotes the Kullback-Leibler (KL) divergence and the inequality is due to the non-negativity of the KL divergence. Maximizing (1) jointly with respect to both θ , the parameters of the decoder, and ϕ , the parameters of the encoder, simultaneously targets large log p θ ( s ), which corresponds to the original goal of finding a good generative model, and small ∑ i KL( q ϕ ( z | s ( i ) ) ∥ p θ ( z | s ( i ) )), which corresponds to accurate inference of the latent variables for the sequences, given our generative model. However, if the only way to make the KL( q ϕ ( z | s ( i ) ) ∥ p θ ( z | s ( i ) )) small involves choosing a θ for which log p θ ( s ) is small, there can be a tradeoff between the two competing objectives. In order to avoid this situation, we use a sufficiently flexible neural network for the encoder, so that s ↦ q ϕ (· | s ) is flexible enough to approximately match s ↦ p θ ∗ (· | s ), where θ ∗ = arg max θ log p θ ( s ). Unlike the marginal log-likelihood, the ELBO objective can be tractably estimated, since we have We can calculate the expression appearing inside the expected value for any z and thereby obtain a stable Monte Carlo estimate of our new objective evaluated at a single training sequence s by sampling z from q ϕ (· | s ) and plugging into the expression. Estimates of the gradients of the negative ELBO with respect to θ and ϕ can then be obtained with autodifferentiation in tandem with the reparameterization trick, and we can use these estimated gradients to optimize L θ , ϕ ( s ) with mini-batch stochastic gradient descent. In our implementation of a vanilla VAE for protein sequences of fixed-length m , the likelihood is a product of independent categorical distributions over amino acids for each position in the sequence, where DEC( z ) j is the probability distribution over amino acids at the j th position in the sequence computed by the decoder. Meanwhile, the prior on the latent variable is a standard normal p ( z ) = N ( z ; 0 d , I d ), where d is the dimension of the latent space, and the variational approximation to the posterior of the latent variable is Finally, following Ding et al. (2019) , our encoder and decoders are both given by multi-layer perceptrons, each with a single hidden layer: where k = 21 or 20 is the number of amino acid types for alignments with or without gaps, respectively. We use a hidden dimension of 500 (1000) in models fit to simulated (real) MSAs and use a latent dimension of 2 and 20 to explore the latter hyperparameter’s impact on the results. When training the VAE, we held out 10% of the extant sequences from the MSA to use as a validation set to monitor overfitting. We used weight decay to regularize our networks, and all of the models shown in the paper were fit with the weight decay parameter that achieved optimal importance weighted ELBO ( Burda et al. 2015 ) on the held out set ( Table 1 ), which plateaued for all models by 500 epochs of training. View this table: View inline View popup Download powerpoint Table 1: VAE performance across all datasets, measured by Importance Weighted (IW) ELBO per site in a held-out set of sequences. The IW ELBO is computed with 5000 samples. Note that the non-independent evolution simulators and real MSA included gap characters, but the independent model (LG) did not, so the IW ELBO values are not directly comparable. We explored adding additional hidden layers to the encoder and decoder, as well as changing to a ReLU activation function, but found that these changes, if they made a difference, led only to more overfitting and not to improvements in performance on the downstream task of ASR. 6.5 Maximum likelihood ancestral reconstruction under Brownian motion Once the VAE is trained, embeddings for each of the tips of the phylogenetic tree are obtained by passing each extant sequence through the encoder ENC µ , resulting in a matrix where n is the number of tips in the tree. For each internal node i in the tree, an “ancestral embedding” is estimated under an assumed model of Brownian motion as follows. Let C i be the phylogenetic covariance matrix under Brownian motion that would apply to the tips given a tree rooted at i , i.e. assuming trait evolution begins at i and is directed away from i . Formally, [ C i ] ab gives the distance from i to the most recent common ancestor of tips a and b in the tree rooted at i . Then the maximum likelihood estimate of the ancestral embedding z i is given by where 1 ∈ ℝ n is a column vector of ones. The quadratic forms with appearing in this equation can be computed efficiently (linearly in the number of taxa) with the algorithm from Tung Ho and Ané (2014) . We rely on the implementation in the R package RPhylopars ( Goolsby et al. 2016 ). 6.6 ASR Benchmark Methods 6.6.1 Maximum parsimony We apply Fitch’s algorithm ( Fitch 1971 ) to recover the assignment of amino acids to ancestral nodes that minimize the total number of changes along the tree. If there are multiple possible choices that achieve this minimal changes, we choose one arbitrarily to evaluate for Figures 3 to 5 . For the reconstruction of the root of the beta lactamase family in Figure 6 , we assign equal probabilities to each such solution. 6.6.2 IQTree We reconstruct likelihood-based estimates of ancestral sequences by running IQTree with the -asr flag. We impose the LG model with rate heterogeneity ( -LG+G ) even for the datasets that were simulated with epistatic evolution. We fix the tree topology and do not have IQTree optimize the branch lengths ( -blfix ). IQTree outputs the posterior distribution over amino acids at each site in each ancestral node in the tree. Note that IQTree does not include gap characters in its vocabulary, so no posterior probability is assigned to gaps. 6.6.3 Autoregressive To infer ancestral sequences under the autoregressive model from Bisardi et al. (2021) , we rely on the algorithm from De Leonardis et al. (2025) implemented in AncestralSequenceReconstruction.jl. We fit the autoregressive model that is required for the algorithm to the alignment of leaf sequences. We keep the tree topology and its branch lengths fixed when reconstructing ancestors with the infer_ancestral() function in AncestralSequenceReconstruction.jl. For Figures 3 to 5 , we choose the maximum a posteriori estimate to evaluate, but AncestralSequenceReconstruction.jl can also provide samples from the posterior, and we display the distribution of these posterior samples for the reconstruction of the root of the beta lactamase family shown in Figure 6 . 6.7 LOWESS curves For the set of reconstructed ancestral sequences from each method, a LOWESS curve was fit to the scatter plot of Hamming error versus ancestral depth (defined as distance to nearest leaf) using the lowess function from the statsmodels.nonparametric.smoothers_lowess module in Python, specifying as the smoothing parameter to use a fraction 0.3 of the data for the local fits. 7 Data Availability The raw and pre-processed multiple sequence alignments for PF00565 and PF00144 are made available at the Github repository for this paper: https://github.com/solislemuslab/asr-ae/ . This repository also contains the code for the simulation of MSAs under different models of evolution, the training of the VAEs, and the benchmarking of the embeddings-based approach to ASR against other standard methods. 8 Acknowledgements This work was supported by the National Science Foundation (DEB-2144367 to CSL). This material is based upon work support by the National Institute of Food and Agriculture, United States Department of Agriculture, Hatch project 7007384 (to CSL). CSL participated in the Institute for Computational and Experimental Research in Mathematics in Providence, RI “Theory, Methods, and Applications of Quantitative Phylogenomics” program and presented early versions of this work. Appendix A Contact maps from generative models Download figure Open in new tab Fig. A1: Contact maps from different generative models fit to multiple sequence alignment of Staphylococcal nuclease homologs (Pfam family PF00565) Left: Potts model; right: autoregressive model. Appendix B Further Beta-lactamase results Download figure Open in new tab Fig. B2: (a) Embeddings of sequences from the PF00144 MSA colored by phylum of species that sequence is sampled from. Left:embeddings of all sequences. Right: embeddings of representative sequences from Detlefsen et al. (2022) (b) Embedding of inferred phylogenetic tree in latent space. Positions of internal nodes in blue are estimated under assumed Brownian motion along the phylogeny. Root is highlighted, as well as a shallow ancestor, A35368, and its two leaf children, A0A1I1L7I8_9GAMM and A0A0A8E0Q6_9XANT. Download figure Open in new tab Fig. B3: First 20 positions of the sequences A0A1I1L7I8 9GAMM and A0A0A8E0Q6 9XANT and probability logo plot of VAE’s reconstructions of these positions in their parent A35368. Funder Information Declared U.S. National Science Foundation , DEB-2144367 National Institute of Food and Agriculture , Hatch Project 7007384 References ↵ Alvarez , S. , Nartey , C.M. , Mercado , N. , Paz , J.A. , Huseinbegovic , T. , Morcos , F. : In vivo functional phenotypes from a computational epistatic model of evolution . Proceedings of the National Academy of Sciences 121 ( 6 ), 2308895121 ( 2024 ) doi: 10.1073/pnas.2308895121 OpenUrl CrossRef ↵ Bingham , E. , Chen , J.P. , Jankowiak , M. , Obermeyer , F. , Pradhan , N. , Karaletsos , T. , Singh , R. , Szerlip , P. , Horsfall , P. , Goodman , N.D. : Pyro: Deep universal probabilistic programming . Journal of Machine Learning Research 20 ( 28 ), 1 – 6 ( 2019 ) OpenUrl ↵ Burda , Y. , Grosse , R. , Salakhutdinov , R. : Importance weighted autoencoders . arXiv preprint arXiv: 1509.00519 ( 2015 ) ↵ Blaabjerg , L.M. , Jonsson , N. , Boomsma , W. , Stein , A. , Lindorff-Larsen , K. : Ssemb: A joint embedding of protein sequence and structure enables robust variant effect predictions . Nature Communications 15 ( 1 ), 9646 ( 2024 ) doi: 10.1038/s41467-024-53982-z OpenUrl CrossRef PubMed ↵ Bisardi , M. , Rodriguez-Rivas , J. , Zamponi , F. , Weigt , M. : Modeling sequence-space exploration and emergence of epistatic signals in protein evolution . Molecular Biology and Evolution 39 ( 1 ) ( 2021 ) doi: 10.1093/molbev/msab321 OpenUrl CrossRef PubMed ↵ Detlefsen , N.S. , Hauberg , S. , Boomsma , W. : Learning meaningful representations of protein sequences . Nature Communications 13 ( 1 ) ( 2022 ) doi: 10.1038/s41467-022-29443-w OpenUrl CrossRef ↵ De Leonardis , M. , Pagnani , A. , Barrat-Charlaix , P. : Reconstruction of ancestral protein sequences using autoregressive generative models . Molecular Biology and Evolution 42 ( 4 ) ( 2025 ) doi: 10.1093/molbev/msaf070 OpenUrl CrossRef PubMed ↵ De La Paz , J.A. , Nartey , C.M. , Yuvaraj , M. , Morcos , F. : Epistatic contributions promote the unification of incompatible models of neutral molecular evolution . Proceedings of the National Academy of Sciences 117 ( 11 ), 5873 – 5882 ( 2020 ) doi: 10.1073/pnas.1913071117 OpenUrl Abstract / FREE Full Text ↵ Dayhoff , M.O. , Schwartz , R.M. , Orcutt , B.C. : A model of evolutionary change in proteins . In: Atlas of Protein Sequence and Structure vol. 5 . National Biomedical Research Foundation, Silver Spring, MD ( 1978 ) ↵ Ding , X. , Zou , Z. , Brooks III , C.L. : Deciphering protein evolution and fitness landscapes with latent space models . Nature Communications 10 ( 1 ) ( 2019 ) doi: 10.1038/s41467-019-13633-0 OpenUrl CrossRef PubMed ↵ Fitch , W.M. : Toward defining the course of evolution: Minimum change for a specific tree topology . Systematic Zoology 20 ( 4 ), 406 ( 1971 ) doi: 10.2307/2412116 OpenUrl CrossRef PubMed ↵ Fan , W. , Zhou , Y. , Wang , S. , Yan , Y. , Liu , H. , Zhao , Q. , Song , L. , Li , Q. : Computational protein science in the era of large language models (llms) . arXiv preprint arXiv: 2501.10282 ( 2025 ) ↵ Goolsby , E.W. , Bruggeman , J. , Ané , C. : Rphylopars: fast multivariate phylogenetic comparative methods for missing data and within-species variation . Methods in Ecology and Evolution 8 ( 1 ), 22 – 27 ( 2016 ) doi: 10.1111/2041-210x.12612 OpenUrl CrossRef ↵ Huerta-Cepas , J. , Serra , F. , Bork , P. : Ete 3: Reconstruction, analysis, and visualization of phylogenomic data . Molecular Biology and Evolution 33 ( 6 ), 1635 – 1638 ( 2016 ) doi: 10.1093/molbev/msw046 OpenUrl CrossRef PubMed ↵ Hawkins-Hooker , A. , Depardieu , F. , Baur , S. , Couairon , G. , Chen , A. , Bikard , D. : 36 Generating functional protein variants with variational autoencoders . PLOS Computational Biology 17 ( 2 ), 1008736 ( 2021 ) doi: 10.1371/journal.pcbi.1008736 OpenUrl CrossRef ↵ Hochberg , G.K.A. , Thornton , J.W. : Reconstructing ancient proteins to understand the causes of structure and function . Annual Review of Biophysics 46 ( 1 ), 247 – 269 ( 2017 ) doi: 10.1146/annurev-biophys-070816-033631 OpenUrl CrossRef PubMed ↵ Heinzinger , M. , Weissenow , K. , Sanchez , J.G. , Henkel , A. , Mirdita , M. , Steinegger , M. , Rost , B. : Bilingual language model for protein sequence and structure . NAR Genomics and Bioinformatics 6 ( 4 ) ( 2024 ) doi: 10.1093/nargab/lqae150 OpenUrl CrossRef ↵ Jukes , T.H. , Cantor , C.R. , et al : Evolution of protein molecules . Mammalian protein metabolism 3 ( 21 ), 132 ( 1969 ) OpenUrl ↵ Kingma , D.P. , Welling , M. : Auto-encoding variational bayes . arXiv preprint arXiv: 1312.6114 ( 2013 ) ↵ Le , S.Q. , Gascuel , O. : An improved general amino acid replacement matrix . Molecular Biology and Evolution 25 ( 7 ), 1307 – 1320 ( 2008 ) doi: 10.1093/molbev/msn067 OpenUrl CrossRef PubMed Web of Science ↵ Mistry , J. , Chuguransky , S. , Williams , L. , Qureshi , M. , Salazar , G.A. , Sonnhammer , E.L.L. , Tosatto , S.C.E. , Paladin , L. , Raj , S. , Richardson , L.J. , Finn , R.D. , Bateman , A. : Pfam: The protein families database in 2021 . Nucleic Acids Research 49 ( D1 ), 412 – 419 ( 2020 ) doi: 10.1093/nar/gkaa913 OpenUrl CrossRef PubMed ↵ Meyer , X. , Dib , L. , Silvestro , D. , Salamin , N. : Simultaneous bayesian inference of phylogeny and molecular coevolution . Proceedings of the National Academy of Sciences 116 ( 11 ), 5027 – 5036 ( 2019 ) doi: 10.1073/pnas.1813836116 OpenUrl Abstract / FREE Full Text ↵ Moreta , L.S. , Rønning , O. , Al-Sibahi , A.S. , Hein , J. , Theobald , D. , Hamelryck , T. : Ancestral protein sequence reconstruction using a tree-structured ornsteinuhlenbeck variational autoencoder . In: International Conference on Learning Representations ( 2022 ) ↵ Minh , B.Q. , Schmidt , H.A. , Chernomor , O. , Schrempf , D. , Woodhams , M.D. , Haeseler , A. , Lanfear , R. : Iq-tree 2: New models and efficient methods for phylogenetic inference in the genomic era . Molecular Biology and Evolution 37 ( 5 ), 1530 – 1534 ( 2020 ) doi: 10.1093/molbev/msaa015 OpenUrl CrossRef PubMed ↵ Nasrallah , C.A. , Mathews , D.H. , Huelsenbeck , J.P. : Quantifying the impact of dependent evolution among sites in phylogenetic inference . Systematic Biology 60 ( 1 ), 60 – 73 ( 2010 ) doi: 10.1093/sysbio/syq074 OpenUrl CrossRef PubMed ↵ Pagnani , A. , Barrat-Charlaix , P. : Generative continuous time model reveals epistatic signatures in protein evolution . bioRxiv ( 2025 ) doi: 10.1101/2025.09.17.676821 OpenUrl Abstract / FREE Full Text ↵ Price , M.N. , Dehal , P.S. , Arkin , A.P. : Fasttree: Computing large minimum evolution trees with profiles instead of a distance matrix . Molecular Biology and Evolution 26 ( 7 ), 1641 – 1650 ( 2009 ) doi: 10.1093/molbev/msp077 OpenUrl CrossRef PubMed Web of Science ↵ Pauling , L.C. , Zuckerkandl , E. , Henriksen , T. , Lövstad , R. : Chemical paleogenetics. molecular “restoration studies” of extinct forms of life . Acta Chemica Scandinavica , 9 – 16 ( 1963 ) doi: 10.3891/acta.chem.scand.17s-0009 OpenUrl CrossRef ↵ Rao , R. , Bhattacharya , N. , Thomas , N. , Duan , Y. , Chen , P. , Canny , J. , Abbeel , P. , Song , Y. : Evaluating protein transfer learning with tape . Advances in neural information processing systems 32 ( 2019 ) ↵ Riesselman , A.J. , Ingraham , J.B. , Marks , D.S. : Deep generative models of genetic variation capture the effects of mutations . Nature Methods 15 ( 10 ), 816 – 822 ( 2018 ) doi: 10.1038/s41592-018-0138-4 OpenUrl CrossRef PubMed ↵ Rosset , L. , Netti , R. , Muntoni , A.P. , Weigt , M. , Zamponi , F. : adabmdca 2.0– a flexible but easy-to-use package for direct coupling analysis . arXiv preprint arXiv: 2501.18456 ( 2025 ) ↵ Robinson , D.M. : Protein evolution with dependence among codons due to tertiary structure . Molecular Biology and Evolution 20 ( 10 ), 1692 – 1704 ( 2003 ) doi: 10.1093/molbev/msg184 OpenUrl CrossRef PubMed Web of Science ↵ Rodrigue , N. , Philippe , H. , Lartillot , N. : Assessing site-interdependent phylogenetic models of sequence evolution . Molecular Biology and Evolution 23 ( 9 ), 1762 – 1775 ( 2006 ) doi: 10.1093/molbev/msl041 OpenUrl CrossRef PubMed Web of Science ↵ Selberg , A.G.A. , Gaucher , E.A. , Liberles , D.A. : Ancestral sequence reconstruction: From chemical paleogenetics to maximum likelihood algorithms and beyond . Journal of Molecular Evolution 89 ( 3 ), 157 – 164 ( 2021 ) doi: 10.1007/s00239-021-09993-1 OpenUrl CrossRef PubMed ↵ Sinai , S. , Kelsic , E. , Church , G.M. , Nowak , M.A. : Variational auto-encoding of protein sequences . arXiv preprint arXiv: 1712.03346 ( 2017 ) ↵ Starr , T.N. , Thornton , J.W. : Epistasis in protein evolution . Protein Science 25 ( 7 ), 1204 – 1218 ( 2016 ) doi: 10.1002/pro.2897 OpenUrl CrossRef PubMed ↵ Tatusov , R.L. : The cog database: new developments in phylogenetic classification of proteins from complete genomes . Nucleic Acids Research 29 ( 1 ), 22 – 28 ( 2001 ) doi: 10.1093/nar/29.1.22 OpenUrl CrossRef PubMed Web of Science ↵ Tule , S. , Foley , G. , Bodén , M. : Do protein language models learn phylogeny? Briefings in Bioinformatics 26 ( 1 ) ( 2024 ) doi: 10.1093/bib/bbaf047 OpenUrl CrossRef ↵ Tung Ho , L.s. , Ané , C. : A linear-time algorithm for gaussian and non-gaussian trait evolution models . Systematic Biology 63 ( 3 ), 397 – 408 ( 2014 ) doi: 10.1093/sysbio/syu005 OpenUrl CrossRef PubMed ↵ Trinquier , J. , Uguzzoni , G. , Pagnani , A. , Zamponi , F. , Weigt , M. : Efficient generative modeling of protein sequences using simple autoregressive models . Nature Communications 12 ( 1 ) ( 2021 ) doi: 10.1038/s41467-021-25756-4 OpenUrl CrossRef PubMed ↵ Whelan , S. , Goldman , N. : A general empirical model of protein evolution derived from multiple protein families using a maximum-likelihood approach . Molecular Biology and Evolution 18 ( 5 ), 691 – 699 ( 2001 ) doi: 10.1093/oxfordjournals.molbev.a003851 OpenUrl CrossRef PubMed Web of Science ↵ Williams , P.D. , Pollock , D.D. , Blackburne , B.P. , Goldstein , R.A. : Assessing the accuracy of ancestral protein reconstruction methods . PLoS Computational Biology 2 ( 6 ), 69 ( 2006 ) doi: 10.1371/journal.pcbi.0020069 OpenUrl CrossRef ↵ Yang , K.K. , Wu , Z. , Bedbrook , C.N. , Arnold , F.H. : Learned protein embeddings for machine learning . Bioinformatics 34 ( 23 ), 4138 – 4138 ( 2018 ) doi: 10.1093/bioinformatics/bty455 OpenUrl CrossRef PubMed ↵ Ziegler , C. , Martin , J. , Sinner , C. , Morcos , F. : Latent generative landscapes as maps of functional diversity in protein sequence space . Nature Communications 14 ( 1 ) ( 2023 ) doi: 10.1038/s41467-023-37958-z OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted November 20, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Ancestral Sequences Cannot be Accurately Reconstructed via Interpolation in a Variational Autoencoder’s Latent Space Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Ancestral Sequences Cannot be Accurately Reconstructed via Interpolation in a Variational Autoencoder’s Latent Space Evan Gorstein , Mengze Tang , Hailey Bruzzone , Claudia Solís-Lemus bioRxiv 2025.11.19.689264; doi: https://doi.org/10.1101/2025.11.19.689264 Share This Article: Copy Citation Tools Ancestral Sequences Cannot be Accurately Reconstructed via Interpolation in a Variational Autoencoder’s Latent Space Evan Gorstein , Mengze Tang , Hailey Bruzzone , Claudia Solís-Lemus bioRxiv 2025.11.19.689264; doi: https://doi.org/10.1101/2025.11.19.689264 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Evolutionary Biology Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41910) Biophysics (21436) Cancer Biology (18576) Cell Biology (25480) Clinical Trials (138) Developmental Biology (13368) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40360) Molecular Biology (17163) Neuroscience (88534) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00