Evaluation of Machine Learning-Assisted Directed Evolution Across Diverse Combinatorial Landscapes

doi:10.1101/2024.10.24.619774

Evaluation of Machine Learning-Assisted Directed Evolution Across Diverse Combinatorial Landscapes

2024 · doi:10.1101/2024.10.24.619774

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 94,782 characters · extracted from preprint-html · click to expand

Evaluation of Machine Learning-Assisted Directed Evolution Across Diverse Combinatorial Landscapes | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Evaluation of Machine Learning-Assisted Directed Evolution Across Diverse Combinatorial Landscapes View ORCID Profile Francesca-Zhoufan Li , View ORCID Profile Jason Yang , View ORCID Profile Kadina E. Johnston , Emre Gürsoy , View ORCID Profile Yisong Yue , View ORCID Profile Frances H. 
Arnold doi: https://doi.org/10.1101/2024.10.24.619774 Francesca-Zhoufan Li 1 Division of Biology and Biological Engineering, California Institute of Technology , Pasadena, California 91125, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Francesca-Zhoufan Li Jason Yang 2 Division of Chemistry and Chemical Engineering, California Institute of Technology , Pasadena, California 91125, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jason Yang Kadina E. Johnston 1 Division of Biology and Biological Engineering, California Institute of Technology , Pasadena, California 91125, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Kadina E. Johnston Emre Gürsoy 2 Division of Chemistry and Chemical Engineering, California Institute of Technology , Pasadena, California 91125, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yisong Yue 3 Division of Engineering and Applied Sciences, California Institute of Technology , Pasadena, California 91125, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Yisong Yue For correspondence: frances{at}cheme.caltech.edu yyue{at}caltech.edu Frances H. Arnold 1 Division of Biology and Biological Engineering, California Institute of Technology , Pasadena, California 91125, United States 2 Division of Chemistry and Chemical Engineering, California Institute of Technology , Pasadena, California 91125, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Frances H. 
Arnold For correspondence: frances{at}cheme.caltech.edu yyue{at}caltech.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Summary Various machine learning-assisted directed evolution (MLDE) strategies have been shown to identify high-fitness protein variants more efficiently than typical wet-lab directed evolution approaches. However, limited understanding of the factors influencing MLDE performance across diverse proteins has hindered optimal strategy selection for wet-lab campaigns. To address this, we systematically analyzed multiple MLDE strategies, including active learning and focused training using six distinct zero-shot predictors, across 16 diverse protein fitness landscapes. By quantifying landscape navigability with six attributes, we found that MLDE offers a greater advantage on landscapes which are more challenging for directed evolution, especially when focused training is combined with active learning. Despite varying levels of advantage across landscapes, focused training with zero-shot predictors leveraging distinct evolutionary, structural, and stability knowledge sources consistently outperforms random sampling for both binding interactions and enzyme activities. Our findings provide practical guidelines for selecting MLDE strategies for protein engineering. Introduction Engineered proteins are indispensable across myriad applications, serving as effective therapeutics to combat diseases, non-toxic agents to enhance crops, and green biocatalysts to synthesize chemicals. 1 The development of such useful proteins often involves directed evolution (DE), a method for accumulating beneficial mutations using iterations of mutagenesis and functional assessment by selection or screening. 2 – 4 DE is an empirical, greedy hill climbing process on a high-dimensional fitness landscape that maps protein sequence to function ( Figure 1a ). 
5 , 6 Despite its widespread use, DE remains time-consuming and resource-intensive: screening is expensive, and multiple rounds of mutation and screening may be needed to generate the desired improvements. Download figure Open in new tab Figure 1. Summary of landscape attributes, simulations, and combinatorial landscapes. a) Landscape attributes include fitness statistics (percent of active variants, tailedness, location of Cauchy peak, and number of peaks for kernel density estimation) and ruggedness (number of local optima and percent of non-magnitude pairwise epistasis). b) Various in silico simulations include three types of DE (simple recomb, single-step, and top96 recomb), MLDE, multiple rounds (ALDE), and focused training for both MLDE (ftMLDE) and ALDE (ftALDE) (Methods). c) Combinatorial landscapes studied, categorized by the number of targeted sites (three and four) and function types (binding interactions and enzyme activities) on six protein systems (ParD-ParE toxin-antitoxin, GB1 immunoglobulin binding, dihydrofolate reductase, T7 RNA polymerase, TEV protease, and tryptophan synthase). 37 , 40 , 42 – 44 To keep the main text relevant to the majority of campaigns, we focused on libraries with at least 1% active variants, while the remaining landscapes are detailed in the supplemental information. Fitness landscapes are more rugged and difficult to traverse when rich in epistatic, or non-additive, effects of amino acid substitutions. 7 , 8 Epistasis is often observed between mutations in close structural proximity 9 and is enriched at binding surfaces or enzyme active sites, due to direct interactions between residues, substrates, and/or cofactors. 8 Protein engineers frequently target mutations to these interacting sites to enhance a function, often using simultaneous site-saturation mutagenesis (SSM) to make libraries in which the targeted amino acids are mutated to many or all 19 other possible amino acids. 
10 Combining the beneficial mutations found at these sites often reveals epistatic effects. For example, beneficial mutations in the context of the initial sequence may not be beneficial in combination with other mutations. Therefore, epistasis can present a significant challenge for DE. 3 Compared to DE, machine learning-assisted DE (MLDE) has shown promise for exploring a broader scope of sequence space and more effectively navigating epistatic landscapes. 11 – 13 MLDE utilizes ML models trained on sequence-fitness data to capture non-additive effects. The trained models can then be used to predict high-fitness variants across the entire landscape in a single round 11 , 12 , 14 or iteratively in an active learning (ALDE) fashion. 15 – 17 The choice of the training set can greatly influence the performance of the ML models. One can randomly sample the full combinatorial space for training the model (MLDE) or alternatively do focused training (ftMLDE) 12 by selectively sampling to avoid low-fitness variants. In the latter, the quality of the training set can be biased toward more-informative variants with zero-shot (ZS) predictors to reach high-fitness variants more effectively. ZS predictors estimate protein fitness without the need for experimental data: they are instead based on prior assumptions and leverage auxiliary information, such as protein stability calculations, evolutionary data, or structural information. 18 – 25 Although ML in protein engineering has been demonstrated in different case studies, 11 , 26 – 36 most MLDE 11 , 15 , 16 and ftMLDE 12 studies on epistatic landscapes have been benchmarked against a single dataset on the B1 domain of an immunoglobulin-binding protein G (GB1). 
37 Thus, two key issues persist: first, the effectiveness of different MLDE strategies on proteins with complex functions, such as enzymes, remains uncertain and, second, the principles that guide successful use of MLDE strategies across diverse protein properties are not understood. Furthermore, despite a growing array of ZS options, 25 there is no definitive guideline for selecting predictors for a given application. This is particularly true for combinatorial epistatic landscapes. 23 , 38 Recent experimental studies 39 – 44 provide a wealth of data on a broader array of protein fitness landscapes, enabling us to start establishing best practices and generalizable guidelines for practitioners working with various proteins. To contextualize the benefits of MLDE, ALDE, and focused training, we conducted a comprehensive study of 16 diverse combinatorial protein fitness landscapes. They span six protein systems and two function types (binding and enzyme activity). Consisting of variants that are simultaneously mutated at three or four residues, these landscapes vary in landscape attributes, such as statistical measures (including the number of active variants and fitness distribution properties) as well as ruggedness (a measure of the prevalence of fitness interactions among variants, 45 including pairwise epistasis and the number of local optima). Specifically, this study focuses on two questions: (1) When do MLDE, multiple rounds (such as in ALDE), and focused training offer a significant advantage compared to DE? (2) How can we best select and utilize the ZS predictor(s) for focused training? Results Overview of landscapes For this study, we selected 16 experimental combinatorial landscapes covering a range of binding interactions and enzyme activities ( Table 1 and S1). 
37 , 40 – 44 All landscapes feature mutations at binding interaction points, in active sites, or at positions previously shown to modulate fitness, all of which are often targeted for engineering tasks ( Figure 1c ). For binding, we examined two three-site bacterial toxin-antitoxin ParD-ParE landscapes 40 and the GB1 landscape for immunoglobulin binding. 37 For enzyme activity, we analyzed a three-site dihydrofolate reductase (DHFR) landscape, 41 a three-site T7 RNA polymerase landscape, 42 , 43 a four-site TEV protease landscape, 42 , 43 and ten three- or four-site landscapes of the thermostable β-subunit of tryptophan synthase (TrpB). 44 View this table: View inline View popup Download powerpoint Table 1. Combinatorial landscapes analyzed in the main text (for full details, see Table S1 ). 37 , 40 – 44 Due to the frequent misalignment between theoretical landscape modeling and experimental applications, we selected two groups of empirical and interpretable attributes to characterize the landscapes: fitness statistics (which do not incorporate sequence information) and landscape ruggedness (which involves mapping sequences to fitness) ( Figure 1a ; Methods). 46 – 53 We used the following statistics to indirectly infer the complexity of the fitness landscape: the percentage of active variants, the fitness value corresponding to the Cauchy peak location, the kurtosis of the fitness distribution (“tailedness”), and the number of kernel density estimation (KDE) peaks (Methods). The Cauchy distribution is known for its heavy tails. We sought to use the fitness corresponding to its peak location as a landscape attribute to capture the majority of the variant fitnesses. KDE is a non-parametric method for estimating the probability density function of the fitness distribution. KDE does not assume any specific underlying distribution for fitness and is useful for smoothing out noise. 
We reasoned that the number of KDE peaks, reflecting the distribution modalities of fitness, could serve as a proxy for the underlying landscape navigability, which impacts the outcome of DE. To quantify ruggedness, which poses navigation challenges for DE, 47 , 49 we included the number of local optima and the percentage of pairwise non-magnitude epistasis ( Figure 1a ). We defined a local optimum as a variant that possesses higher fitness than all its active neighbors differing by a single amino acid substitution (Methods). 41 , 44 , 46 , 47 , 49 Recent studies have also highlighted the impact of various types of epistasis on DE 3 , 44 and emphasized that the majority of epistasis is pairwise. 54 Thus, we also included the amount of non-magnitude pairwise epistasis (conditional or impossible for DE to navigate) as a relevant landscape attribute (Methods). All MLDE strategies consistently outperform DE, particularly as landscape navigability decreases We next assessed how landscape attributes influence the efficacy of protein engineering strategies. Specifically, we evaluated the outcomes of a protein engineering campaign using two metrics: (1) “average maximum fitness achieved,” which is the fitness of the final variant achieved by each method on average and (2) “fraction reaching the global optimum”, which measures how frequently the true maximum fitness is reached. We explored these measures across MLDE, ALDE, focused training, and three different DE strategies. 
The DE strategies are summarized as “recomb”, a recombination of the best SSM variant at each site (19 × n site + 2 samples, including the initial and final variant); “single-step”, an iterative process starting from any site with subsequent variants built on the best variant found (19 × n site + 1 samples, including the initial variant); and “top96 recomb”, where SSM is performed at each position, all substitution combinations are calculated based on additive recombination, and the top 96 variants are selected (19 × n site + 97 samples, including the initial and 96 variants, where 96 is the number of wells in plates commonly used for screening; Figure 1b ; Methods). 11 , 12 , 44 MLDE and ftMLDE trained an ensemble of models, employing either random or ZS predictor-guided training sample selection. The trained models were then used to predict fitness for all variants, where the top 96 predicted variants were then used for evaluation (Methods). 12 ALDE divided the total sample size into multiple rounds, with each subsequent round of sampling guided by the uncertainty quantified in its previous round ( Figure 1b ; Methods). 17 Similar to how ftMLDE improved upon MLDE, ftALDE used ZS predictors to select a more informative initial training set, instead of the random sampling used in ALDE. Considering the variability in throughput and expense of experimental screens, we explored a range of total number of variants screened (total sample size), from 120 to 2,016 samples ( Figure 2a ; Table S2 ). On average across landscapes, MLDE (dashed light blue line) required merely 48 training samples to outperform “recomb” DE and 96 to surpass “single-step” DE for both metrics. It took 96 training samples for MLDE to match the average maximum fitness and 384 to achieve a comparable fraction reaching the global optimum as the most competitive DE strategy, “top96 recomb”. 
By incorporating various ZS predictors, ftMLDE (solid dark blue line) consistently outperformed MLDE with random sampling (showing a 4–12% improvement in average maximum fitness achieved for up to 960 training samples and a 9–77% improvement in fraction reaching the global optimum across all training sample sizes, Table S3 ); ftMLDE achieved the same levels of average maximum fitness and global optimum fraction as MLDE but with fewer training samples required ( Figure 2a ). These results suggest that MLDE can identify high-fitness variants more effectively than DE, and focused training with ZS predictors can further improve performance compared to MLDE with random sampling. Download figure Open in new tab Figure 2. Performance and correlations of MLDE, ALDE, and focused training compared with DE and six landscape attributes. a) Comparison of DE, MLDE, ALDE, ftMLDE, and ftALDE performance, averaged across 12 landscapes with at least 1% active variants. Shading indicates standard deviation. Performance is shown for i) the maximum fitness achieved and ii) the fraction of campaigns reaching the global optimum, for different numbers of training samples. Three DE methods were included: simple recomb, single-step, and top96 recomb. The vertical line marks a total sample size of 480 (e.g. 384 sampled variants for training and top 96 predicted variants for testing, or four rounds of ALDE each with 120 variants) that the following results expand on. See Figure S1 for landscapes with fewer than 1% active variants. b) Single-step DE, MLDE, ALDE, and focused training results broken down by landscape, with a total sample size of 480 for both metrics. See Figure S2 for landscapes with fewer than 1% active variants. 
c) Spearman’s ranking correlation of ML strategy performance improvement (the average maximum fitness of the top 96 predicted variants by MLDE and focused training over single-step DE, y-axis) with six landscape attributes (x-axis): i) percentage of active variants, ii) fraction of local optima (normalized to the number of variants measured), iii) fraction of non-magnitude epistasis, iv) Cauchy peak location, v) kurtosis (tailedness), and vi) number of kernel density estimation (KDE) peaks. See Figure S3 for different rounds of ALDE and ftALDE. For each MLDE and ftMLDE experiment, boosting models were trained on 384 random samples from the entire or ZS-focused library using one-hot encoding, with five-fold cross-validation. For each ALDE and ftALDE experiment, boosting ensembles with greedy acquisition function were trained with 240, 160, or 120 samples per round for two, three, or four rounds in total, respectively. The top 96 predicted variants were evaluated. Each ML experiment was averaged across 50 replicates (Methods). ftMLDE and ftALDE performance were further averaged across six ZS predictors (details in the next section). DE simulations started from all active variants (Methods). Next, we compared ALDE (MLDE with multiple rounds of training and testing guided by uncertainty quantification) to MLDE (a single round of training and testing, equivalent to two rounds) with the same total number of variants screened. With two rounds, ALDE (dotted bright yellow line) began to outperform MLDE (dashed light blue line) after 480 total samples for average maximum fitness achieved and 288 total samples for fraction reaching the global optimum but did not outperform ftMLDE (solid dark blue line) until 1,056 samples for both metrics. With four rounds, ALDE (dotted light brown line) matched or exceeded ftMLDE performance. 
With focused training, ftALDE (solid orange line) matched or surpassed ftMLDE with the same number of rounds and showed further improvement with additional rounds ( Figure 2a ). However, for libraries with fewer than 1% active variants, even four rounds of ALDE (without focused training) consistently underperformed compared to ftMLDE ( Figure S1 ). Our observations underscore the utility of focused training using ZS predictors, enabling MLDE to match multi-round ALDE performance and offering further improvement to ALDE. Given the large standard deviations in the performance of different approaches across landscapes, we examined how each approach performed on individual landscapes and found that some landscapes exhibited more significant improvements than others ( Figure 2b and S2). We first quantified the improvements of ML strategies over single-step DE and found that ML strategies offered a greater advantage on landscapes which were more challenging for DE to navigate. To better understand when DE struggled, we then calculated six different attributes to provide insights into landscape navigability ( Figure 2c ; Methods). Specifically, the mean maximum fitness achieved by DE correlated positively with the fraction of active variants ( Figure 2c–i ) and the fitness distribution’s Cauchy peak location ( Figure 2c –iv), indicating improving navigability by DE. Consequently, the improvements resulting from all ML methods were anti-correlated with percent active ( Figure 2c–i ) and Cauchy peak location ( Figure 2c–iv ). The amount of kurtosis (tailedness) and number of KDE peaks of the fitness distribution hindered DE navigability ( Figure 2c–v and vi). ML methods thus improved performance most significantly for landscapes with high tailedness and more KDE peaks. Similarly, increased landscape ruggedness decreased DE navigability, yielding greater benefit of using ML methods over DE for such landscapes. 
DE navigability was anti-correlated with the fraction of local optima ( Figure 2c–ii ) and the fraction of non-magnitude epistasis ( Figure 2c–iii ), and thus the net improvement of ML methods over DE was positively correlated with both higher fractions of local optima ( Figure 2c–ii ) and higher fractions of non-magnitude epistasis ( Figure 2c– iii ). Indeed, ftMLDE demonstrated the most substantial performance improvements (3.5-fold) for one of the least navigable landscapes (TrpB3E; Table S4 ). The performance improvements from different rounds of ALDE and ftALDE were also correlated with landscape navigability defined by the six attributes ( Figure S3 ; Table S4 ). ZS predictors provide orthogonal priors on protein fitness that improve focused training performance Next, we sought to understand the effectiveness of different ZS predictors for fitness prediction and their ability to improve ftMLDE and ftALDE performance across landscapes. ZS predictors could be useful for (1) effectively ranking variants to sample the fittest mutants (measured by Spearman’s correlation, Methods) and (2) filtering out non-viable variants, especially in landscapes dominated by inactive variants (measured by ROC-AUC, Methods). To evaluate the effectiveness and limitations of various ZS predictors under different assumptions and priors, we selected six distinct predictors across two axes: calculation vs. learning-based and sequence vs. structure-based. These predictors include Hamming distances, EVmutation, ESM, ESM-IF, CoVES, and Triad ( Figure 3a ; Methods). 18 – 22 , 55 To differentiate our work from comprehensive ZS predictor benchmarks that are largely limited to measuring the effects of single amino acid substitutions, 25 we emphasized their utilities for focused training applications in epistatic landscapes. Download figure Open in new tab Figure 3. Summary of different ZS predictors and their impacts on focused training across landscapes. 
a) Six ZS predictors: i) Hamming distance, ii) EVmutation (coevolutionary conservation), 18 , 56 iii) ESM (mutant likelihood from pretrained protein-language model), 19 iv) ESM-IF (mutant likelihood from pretrained inverse folding models based on sequence and structure information), 20 and v) Triad (mutant stability ΔΔG). 12 , 55 b) ZS predictor performances in terms of variant fitness ranking correlation (Spearman’s ρ) and active/inactive classification (ROC-AUC) across 12 landscapes with at least 1% active variants. Error bars indicate standard deviation. Dotted gray line indicates random classification. c) Fitness values ranking Spearman’s correlation with MSA depth for each ZS predictor. Statistical significance (p-value <0.05) is indicated as * ( Table S5 ). Dotted gray line indicates no correlation. d) Pairwise Spearman’s correlation of six ZS predictors averaged across 12 landscapes. e) Performance of focused training with different ZS predictors including the best Hamming distance ensemble, averaged across 12 landscapes. Shading indicates standard deviation. It assesses i) the maximum fitness achieved and ii) the fraction reaching the global optimum in relation to the number of samples used by ftMLDE. For each MLDE and ftMLDE experiment, boosting models were trained on 384 random samples from the entire or ZS-focused library using one-hot encoding, with five-fold cross-validation. The top 96 predicted variants were evaluated. Each ML experiment was averaged across 50 replicates (Methods). The vertical line marks a total sample size of 480 (e.g., 384 sampled variants for training and top 96 predicted variants for testing). For ALDE and ftALDE results, see Figure S7 . For landscapes with fewer than 1% active variants, see Figures S8 , S9 , and S12 . As a baseline, we used the Hamming distance as a ZS predictor, which counts the number of amino acid substitutions from the parent, a variant already exhibiting some activity. 
By setting a Hamming distance threshold, we essentially confined the sampling space to the vicinity of the parent to enrich the training set with more viable variants on average, as most mutations are deleterious. 5 Indeed, we observed that the Hamming distance (indicated in blue) showed a weak correlation with fitness ranking ( Figure 3b–i ) and classified active/inactive variants better than random ( Figure 3b –ii) as a ZS predictor. Notably, Hamming distance relied on the parent defined by the authors of each landscape, rather than a randomly sampled active variant as in the DE simulations. Since these landscapes were designed to have variant activity levels both higher and lower than the parent, the parent and its neighboring variants were likely to be more fit and active than those around a randomly selected active variant ( Figure S4 ). Although the landscape parent was one of the most active, leading to strong Hamming distance performance, Hamming distance still outperformed random predictions in fitness ranking ( Figure S5 ) and active/inactive classification ( Figure S6 ) on average, when using different active variants as the parent. In the focused training setting, Hamming distance-guided training set sampling outperformed random selection, improving both the average maximum fitness achieved across all total sample sizes ( Figure 3e–i ) and fraction reaching the global optimum, up to total sample sizes of 1,056 for ftMLDE ( Figure 3e–ii ) and 480 for ftALDE ( Figure S7 ). Various ZS predictors can incorporate implicit evolutionary conservation based on the distribution of naturally occurring sequences. The EVmutation score predicts the fitness effect of a given set of substitutions based on conservation and evolutionary couplings through multiple-sequence alignment (MSA). 
18 , 56 We observed that EVmutation (indicated in green) outperformed the Hamming distance for both ranking fitness values ( Figure 3b–i ) and classifying active/inactive variants ( Figure 3b –ii). Moreover, it was one of the best ZS predictors for focused training across all sample sizes on both metrics ( Figure 3e ). Similarly, protein language models (PLMs) can capture these evolutionary conservations by learning to predict the original identity of masked or corrupted amino acids. 57 – 63 The likelihood of filling such amino acids can be thought of as a predictor for different amino acid substitutions given the sequence context. 19 The ESM score (Evolutionary Scale Modeling, indicated in purple) from one such state-of-the-art PLM performed similarly to EVmutation as a ZS predictor for both fitness ranking ( Figure 3b–i ) and active/inactive classification ( Figure 3b –ii). It also did not further improve upon EVmutation in the focused training setting ( Figure 3e ). Incorporating structural context can also be useful for ZS predictions. ESM-IF (ESM inverse-folding, indicated in yellow) is an inverse-folding model trained to predict a protein sequence from its backbone atom coordinates, where effects of substituting amino acids can be approximated with the likelihoods of each possible sequence for this reconstruction task. 20 We observed that ESM-IF was the best ZS predictor for fitness ranking ( Figure 3b–i ) and active/inactive classification ( Figure 3b-ii ), but with only slight improvements over other ZS predictors. In the focused training setting, the ESM-IF score did offer a consistent advantage over the sequence-only ESM, but only offered a slight advantage over EVmutation at either low or high numbers of samples (i.e., 120, 144, 1,056, and 2,016, Figure 3e ). 
CoVES (Combinatorial Variant Effects from Structure, indicated in brown) learns to predict a masked amino acid identity from its surrounding atomic-level structural microenvironments but does not account for epistasis. 21 Compared to ESM-IF, we observed that CoVES was a slightly less effective ZS predictor for fitness estimation ( Figure 3b ) and in the focused training setting ( Figure 3e ), but it was still one of the most effective predictors for improving over random sampling. An alternative local structure-based ZS score utilizes physics-informed stability calculations. Stability is an important prior for protein function, as an unfolded or misfolded protein will be less likely to be functional. 12 , 22 The Triad score estimates mutant stability by calculating the change in its free energy of folding relative to the parent (ΔΔG) using a Rosetta energy function. 12 , 55 While Triad (indicated in orange) was the weakest predictor for variant fitness ranking ( Figure 3b–i ), it classified active/inactive variants fairly well ( Figure 3b–ii ) as a ZS predictor. Triad-guided training set sampling outperformed random selection in the ftMLDE setting, up to a total sample size of 1,056 for both metrics ( Figure 3e ). In the ftALDE setting, it outperformed random selection up to a total sample size of 576 for average maximum fitness and 384 for the fraction reaching the global optimum ( Figure S7 ). The relative differences between ZS predictors in focused training remained consistent across different rounds of ftALDE. However, in libraries with fewer than 1% active variants, these differences were minimized, and all ZS-guided focused training approaches showed a significant advantage over random sampling ( Figure S8 and S9). To facilitate ZS predictor selection and ensembling, we first examined how the fitness ranking performance of ZS predictors correlated with the depth of multiple sequence alignments (MSAs) ( Figure 3c ; Table S5 ; Methods). 
We found that the performance of the physics-based Triad and the structure-only CoVES did not correlate with MSA depth, confirming their independence from evolutionary data. In contrast, the three sequence-based predictors, Hamming distance, EVmutation, and ESM did show correlation with MSA depth. Despite being a hybrid sequence-structure model, ESM-IF captured evolutionary information to a similar extent as EVmutation, likely because over 99% of its structures were predicted from similar sequence databases (the UniRef family). 20 We then investigated the relationship between different ZS predictors using pairwise correlations ( Figure 3d ; Methods). 64 , 65 Within each modality, sequence-based (Hamming distance, EVmutation, and ESM) or structure-based (CoVES and Triad), all ZS predictors exhibited at least a 0.5 Spearman’s correlation with each other. ESM and EVmutation showed the strongest correlation (Spearman’s ρ = 0.78), suggesting PLMs like ESM may capture similar coevolutionary information as MSAs used by EVmutation. 58 , 66 , 67 ESM-IF showed similar correlations with both the structure-only CoVES (Spearman’s ρ = 0.62) and the sequence-only ESM (Spearman’s ρ = 0.62) and EVmutation (Spearman’s ρ = 0.63). Despite distinct approaches, all four learning-based predictors captured related information. However, Triad had only moderate correlations with the other two structure-based predictors (Spearman’s ρ = 0.5) and weak correlations with the three sequence-based predictors (Spearman’s ρ < 0.4). Hamming distance showed moderate correlations with ESM and EVmutation (Spearman’s ρ = 0.6 and 0.5, respectively) but it was weakly correlated with the structurally inclined predictors (Spearman’s ρ < 0.4). This underscores the orthogonality between learning-based models, a naive protein engineering prior, and a physics-based approach. 
Thus, we hypothesized that ensembling orthogonal ZS predictors may further enhance focused training by synergizing complementary information sources. We evaluated if ensembling Hamming distance with other ZS predictors enhanced focused training performance compared to each predictor alone. Prefiltering with a Hamming distance (by restricting variants to those within two amino acid substitutions of the parent sequence) boosted focused training performance from each ZS predictor up to 192 total samples (left vertical gray line in Figure S10 ). This benefit extended to 480 total samples for ESM and ESM-IF (right vertical gray line in Figure S10 ) and continued to 1,056 for EVmutation for both average max fitness achieved ( Figure 3e–i ) and fraction reaching the global optimum ( Figure 3e–ii ). However, the benefits diminished beyond a total sample size of 480, due to this pre-constrained sampling space. In contrast, ZS ensembles with Triad did not show significant improvement ( Figure S11 and S12 ). This suggests that, despite its orthogonal information, the physics-based Triad predictor offered no further benefit to focused training performance. Similarly, naively ensembling the two top-performing predictors, ESM-IF and EVmutation, yielded no additional improvements ( Figure S11 and S12 ). Our results highlight the advantage of combining Hamming distance with other informative ZS predictors to further enhance focused training performance. Landscape and functional attributes affect ZS predictability We next examined how ZS predictability differed across landscapes, specifically comparing those measuring binding interactions vs. enzyme activities ( Figure 4a ). All six ZS predictors ranked fitness values substantially better for binding interactions than for enzyme activity ( Figure 4a–i ), with Triad showing a statistically significant difference (p-value = 0.001, Table S6 ). 
The structure-based predictors (CoVES and Triad) were better at classifying active/inactive variants for binding datasets than for enzymatic ones, while the sequence-based predictors (Hamming distance, EVmutation, and ESM) performed better for enzyme activity datasets. Hamming distance showed a statistically significant difference in the context of classification (p-value = 0.042, Figure 4a –ii; Table S6 ). However, the differences between binding interactions and enzyme activities were no longer statistically significant for any ZS predictors in the focused training setting across different ML strategies ( Figure 4b ; Table S7 and S8). Download figure Open in new tab Figure 4. Summary of different ZS scores and their impact on individual landscapes by types of function. a) Six ZS predictor performance for each individual landscape in terms of i) Spearman’s correlation of fitness values ranking and ii) ROC-AUC of active/inactive classification. Random predictions are indicated in horizontal gray dashed lines. Statistical significance (p-value <0.05) is indicated as *. b) A breakdown of the ftMLDE results with a total sample size of 480 from Figure 3e , categorized by six ZS predictors and two functions (binding interactions and enzyme activities) for each landscape. Focused training improvement over randomly sampled training set (MLDE) is quantified by i) average maximum fitness and ii) fraction reaching the global optimum. See supplemental information for landscapes with fewer than 1% active variants ( Figure S14 and S15), ftALDE with different rounds ( Figure S15 and S16 ) and ZS predictor impacts on focused training with 192 total samples ( Figure S14 , S17 , and S18 ). For 10 out of the 12 landscapes, all six ZS predictors successfully focused the training set to be more informative than random sampling, leading to improved ftMLDE performance for both metrics ( Figure 4b ). 
Harder-to-navigate landscapes and the libraries with fewer than 1% active variants benefited more from focused training, provided the ZS predictor for active/inactive variant classification was better than random (ROC-AUC > 0.5, Figure 4 , S13, and S14). TrpB3E (indicated in gray), one of the hardest-to-navigate landscapes ( Figure 2b ) with one of the lowest but still above-random active/inactive variant classification ROC-AUC ( Figure 4a–ii ), gained the most from all ZS predictors compared to randomly sampled MLDE training sets for both performance metrics ( Figure 4b ). Similar improvements in hard-to-navigate landscapes were consistently observed when comparing ftALDE with ALDE in the same round, and when considering different total sample sizes for each of the focused training approaches ( Figure S15 -S18). A falsely biased training set could negatively impact focused training performance. For DHFR (dihydrofolate reductase, indicated in blue), the structure-based predictions, Triad ΔΔG and CoVES, performed poorly, with worse-than-random active/inactive classification ( Figure 4a–ii , dashed gray line) and harmed ftMLDE performance for both metrics ( Figure 4b ). Discussion Our findings confirmed that all MLDE strategies exceeded or at least matched DE performance across 16 landscapes, with the advantages becoming more pronounced as landscape attributes posed greater obstacles for DE (e.g., fewer active variants and more local optima). ZS predictors, which leverage various prior knowledge, enriched training sets to enable ftMLDE to match multi-round ALDE performance and offered further improvement to ALDE. Overall, our study suggests that MLDE strategies are highly generalizable and can significantly reduce the experimental load of DE, and we present key considerations for the effective deployment of these approaches. We expect that these findings will encourage and facilitate the adoption of ML-assisted directed evolution for efficient protein engineering. 
As a general recommendation for the implementation of ML strategies to guide protein engineering objectives, we introduce a decision-making process for selecting a campaign strategy ( Figure 5 ). The first step is to assess whether the landscape is hard-to-navigate. This typically involves a low percentage of active variants and high percentage of pairwise non-magnitude epistasis, which can be inferred from prior knowledge (e.g., the percentage of active variants from single-site SSM experiments) or from the structural proximity of residues of interest to functional (binding or active) sites and to each other. We observed a weak negative correlation (Spearman’s ρ = -0.34) between the percentage of pairwise non-magnitude epistasis (where higher values indicate harder-to-navigate landscapes) and the average pairwise C-alpha distance of mutated residues (the smaller the distance, the closer the central carbon atoms of the two amino acids at the targeted sites, Methods). Next, determine whether there is a “good-enough” ZS prior (optionally ensembling orthogonal predictors), meaning a ZS predictor that can classify active/inactive variants better than random. For hard-to-navigate landscapes without prior information, using a Hamming distance threshold of two (i.e., constructing double-site libraries) can effectively enrich informed variant sampling for the training sets. Additionally, ZS predictor classification performance on single-site libraries can identify predictors that may fail on larger combinatorial libraries as well ( Figure S19 and S20; Table S9 ). For example, CoVES and Triad active/inactive variant classification for DHFR were worse than random for single substitutions ( Figure S19 ) and they both classified the full DHFR landscape worse than random ( Figure 4a–ii ), which ultimately ablated the benefit of focused training ( Figure 4b ). 
Finally, consider whether the search space is large (e.g., four-site libraries) and whether the screening budget allows for multiple rounds and/or an increased number of samples per round. A decision tree is provided to assist users in selecting the appropriate strategy ( Figure 5 ). Download figure Open in new tab Figure 5. Decision tree summarizing recommended ML strategies based on total number of variants screened experimentally, landscape navigability (e.g. active variant percentage, pairwise epistasis), the quality of ZS active/inactive variant classification (i.e. ROC-AUC > 0.5), and the number of available screening rounds (N). We focused this study on combinatorial landscapes typically generated using SSM, which are often enriched in epistasis and present challenges for DE. Our decision leveraged the observation that random mutations spread across a protein generally exhibit little sign epistasis, and thus beneficial mutations can often be combined to great success using laboratory methods such as staggered extension process (StEP) recombination to generate variants with higher fitness. 68 , 69 In this context, Hamming distance has been demonstrated to have a weak correlation with variant fitness. 23 The ZS predictor benchmark has also been performed predominantly on datasets with random mutation spread across a protein. 25 We also evaluated several additional design choices for ML strategies that had a more limited impact on MLDE performance. First, we explored more informative ways to represent protein sequences compared to a categorical encoding (one-hot, which has no learned information and treats all amino acids equally). Learned representations from PLMs (e.g., ESM2) showed minimal to no improvement over one-hot encoding for landscapes with at least 1% active variants ( Figure S21 ), including in the focused training setting ( Figure S22 ). However, they did exhibit improvements for landscapes with fewer than 1% active variants ( Figure S23 ). 
While not as beneficial as focused training ( Figure S8 ), learned representations may still enhance performance for particularly challenging landscapes when combined with focused training ( Figure S24 ). Additionally, we used boosting models to facilitate a direct comparison between MLDE and ALDE. Different model choices and ensembles, such as ridge regression for MLDE or deep neural network ensembles for ALDE, could offer further improvements ( Figure S25 and S26). We also provide a codebase, SSMuLA (Site-Saturation Mutagenesis Landscape Assistant), which includes options for these granular design choices. While we streamlined focused training design choices, we also identified areas for improvement. Based on testing individual ZS predictors across different thresholds of the original search space, we set the focused training library threshold to the top 12.5%, ranked by ZS scores, across all landscapes ( Figure S27 ). We then naively ensembled ZS predictors to demonstrate the benefits of combining orthogonal priors (Methods). The current Hamming distance ensembled focused training libraries inherently had a size cutoff (i.e., 12.5% of a double-site library on a four-site landscape is 300). More sophisticated approaches, such as MODIFY, 70 which balance ZS selection with training set diversity and manage the exploration-exploitation trade-off, may offer a more comprehensive and autonomous method for selecting and ensembling ZS predictors. There are also signs that ZS predictors are intrinsically limited for certain prediction tasks. For instance, the K227 substitution in TrpB, which enables high-fitness variants under engineering conditions but is nearly undetectable in natural sequences, might not be adequately captured by EVmutation. 44 Additionally, the performance of different ZS predictors varied even within the same protein, as observed in TrpB across different landscapes. 
This indicates that while natural evolutionary information can be predictive, it may fall short in capturing evolution in the laboratory. Furthermore, all the enzyme systems we studied primarily involved native or near-native functions, with a majority being TrpB landscapes. When applied to non-native functions, we expect that the usefulness of evolutionary-based ZS predictors will decrease, perhaps significantly so. In the case of the TEV and T7 landscapes, none of the ZS predictors consistently improved focused training performance from random sampling, suggesting room for the development of new ZS predictors. In summary, our study lays the groundwork for an ML-assisted protein engineering framework leveraging the strengths of multiple ML approaches, including MLDE, ftMLDE, ALDE, and, introduced here, ftALDE. With our growing ability to read 71 , 72 and write sequences, 73 along with improved tools for constructing libraries, 74 we believe our findings will demystify the application of ML-based DE strategies and encourage their broader adoption in protein engineering. Author contributions F.Z.L., conceptualization, data curation, formal analysis, investigation, methodology, software, visualization, writing – original draft, writing – review & editing J.Y., conceptualization, methodology, software, writing – review & editing K.E.J., conceptualization, methodology, software, writing – review & editing E. G., software Y. Y., funding acquisition, resources, supervision, writing – review & editing F.H.A., funding acquisition, resources, supervision, writing – review & editing Declaration of interests The authors declare no competing interests. Declaration of generative AI and AI-assisted technologies in the writing process During the preparation of this work the authors used ChatGPT in order to check spelling, grammar, and improve the readability and language of the manuscript. 
After using this tool, the authors reviewed and edited the content as needed and take full responsibility for the content of the published article. Resource availability Lead contact Further information and requests for resources should be directed to and will be fulfilled by the lead contact, Frances Arnold ( frances{at}cheme.caltech.edu ). Materials availability Not applicable to this study. Data and code availability All data and results that support this study are deposited at https://doi.org/10.5281/zenodo.13910506 . All code is available at https://github.com/fhalab/SSMuLA . Methods View this table: View inline View popup Download powerpoint Landscape preparation By choosing essentially complete datasets, we minimized the need for data imputation, thus avoiding potential biases and misrepresentations. We focused on libraries with at least 1% active variants in the main text to keep the main text relevant to the majority of campaigns, with results for all landscapes provided in the supplementary material. All fitness values were normalized so that the variant with the maximum fitness has a value of one. Landscape attributes We considered two groups of attributes for this analysis: 1) fitness statistics, which included percent of active variants and parameters derived from simple statistical modeling, and 2) ruggedness, which included pairwise epistasis and the number of local optima. We do not impute missing values. Definition of active variants For landscapes containing fitness data for variants with stop codons, “active” variants were defined as those 1.96 standard deviations above the mean fitness of all sequences containing stop codons, which are expected to be inactive. 44 For GB1, T7, and TEV we followed the cutoffs set by the authors, based on the detection limit of their fitness measurement system. 
37 , 42 , 43 Fitness statistics We used the “statistical functions” (’scipy.stats’) and signal (’scipy.signal’) modules from the SciPy Python package 75 to calculate kurtosis, estimate the Cauchy peak location, and determine the number of KDE peaks. Specifically, kurtosis was calculated using the ’kurtosis’ function with default settings from the ’stats’ module. Cauchy peak location was estimated using the ’fit’ method from the ’cauchy’ distribution object in the ’stats’ module. The number of KDE peaks was determined by estimating the probability density function with the ’gaussian_kde’ function from the ’stats’ module and then identifying local optima using the ’argrelextrema’ function from the ’signal’ module. Pairwise epistasis calculation We classified pairwise epistasis into three categories: magnitude, sign, and reciprocal sign. For each active variant, we assigned an epistasis type for each possible double substitution at chosen sites. 44 We then calculated the fraction of epistasis type for each starting variant in the landscape. To enhance relevance to DE navigability, we incorporated additive interactions into magnitude epistasis, and merged sign and reciprocal sign epistasis into non-magnitude epistasis. Missing values are omitted. Magnitude epistasis The combined effect of two mutations is larger than or equal to their additive effects in the same direction as each individual mutation. This is navigable through single-step or recombination-based DE methods. Sign epistasis The direction of the effect of one mutation changes in the presence of the other such that the substitution order impacts single-step DE navigability. Reciprocal sign The combined effect changes the direction of both mutations in the presence of each other. This is not accessible with single-step DE that is inherently a greedy uphill walk. 
Pairwise epistasis correlation with C-alpha distances The pairwise C-alpha distances of mutated residues were calculated based on each parent structure and then averaged for each landscape. The Spearman’s correlation was then calculated with the fraction of pairwise non-magnitude epistasis. Local optimum calculation A local optimum is a variant with higher fitness than all its neighboring active variants differing by one amino acid substitution. 41 , 44 , 46 , 47 , 49 ZS calculation We calculated six different ZS scores for each landscape. The ranking correlation was calculated using Spearman’s correlation and the active/inactive variant classification was quantified by ROC-AUC. All values can be found in the data deposit. Hamming distance Hamming distance counted the number of amino acid differences between a variant and the parent sequence. In the main text, the parent sequence was defined by the authors of each landscape. Simulations provided in the supplemental information explored all possible parent sequences, starting from any active variant. The Hamming distance from each given parent sequence was used for fitness ranking and active/inactive variant classification, with the final results averaged over all possible parent sequences. EVmutation score EVmutation score was based on conservation and evolutionary couplings with multiple sequence alignments (MSAs). 18 , 56 For each landscape, the parent sequence was uploaded to the EVcoupling web server with default parameters for MSA generation and their subsequent EVmutation model training. We chose the recommended EVmutation model if the alignment covers all mutation sites, and otherwise we prioritized the models covering all sites with a higher bitscore. In the case that not all the mutation sites are covered, the position filter was decreased from the default 70% to 50%. The EVmutation scores and ranking were then generated with the EVmutation model. 
ESM score ESM score was based on pretrained protein language model masked language modeling objective output probability for mutations given their surrounding context. 19 Contrary to EVmutation, ESM score does not entail explicit MSAs. For each landscape, ESM score for each position was calculated by using the log odds ratio comparing the mutated amino acid probability with the parent probability. Multi-mutant ESM scores are then summed from individual mutants. We found ESM-1v (esm1v_t33_650M_UR90S_1), ESM-1b (esm1b_t33_650M_UR50S), and ESM2 (esm2_t33_650M_UR50D) giving comparable results and decided to move forward with ESM2. ESM-IF score ESM-IF score was calculated using the ratio between likelihoods of the mutated and parent sequences according to the inverse folding model ESM-IF1 (esm_if1_gvp4_t16_142M_UR50), 20 given the experimentally determined parent structure from PDB. 76 It leverages a Geometric Vector Perceptron (GVP) module that maintains properties under transformations like rotations. 77 CoVES score The CoVES score was calculated using pretrained weights from Ding et al. (2024) following their methods, 21 applied to all parent structures from the PDB. Triad score Triad score reflected the change in free energy of folding upon mutation (ΔΔG) prediction of mutant stability. The calculation was based on Rosetta energy functions across mutation sites with a fixed backbone assumption. First, the parent protein crystal structure for each landscape was obtained from PDB. Then, for each landscape, we obtained the score for each variant with the Triad software suite following the method from Wittmann et al. (2021). 12 ZS ensembles For each landscape, the six different ZS scores were calculated accordingly. For Hamming distance-based ensemble, a Hamming distance of two was first applied (i.e., double-site libraries), then five other ZS scores were used to rank the variants. 
For other ensembles, equal weight was given to each individually calculated and ranked ZS score for each variant. ZS analysis ZS MSA depth correlation The MSA depth referred to the number of sequences resulting from EVmutation, where all mutation sites were covered. ZS pairwise correlation The pairwise correlation was performed for each landscape and then averaged across the 12 landscapes with at least 1% active variants. DE simulations For each landscape, all DE simulations started from an active variant, regardless of its background. The maximum fitness achieved by each starting variant was recorded. Single-step DE This is a greedy walk algorithm. The process begins with selecting one of the possible substitution sites, evaluating the fitness impact of all possible amino acid substitutions at this position. The substitution yielding the highest fitness is fixed, and the position is restricted from further exploration. In the next round, one of the remaining positions is selected, with all mutants evaluated, and the best substitution is fixed again. This process repeats iteratively until all positions have been evaluated yielding the fitness of the best variant identified in the last round. Consequently, each site is optimized once per simulation. For example, a four-site library requires four rounds of single-step DE to reach the optimal variant and there is a total of 24 (4!) possible orders of sampling. This is a deterministic approach to navigate the fitness landscape as the best variant is always selected. 11 , 12 , 44 Recombination SSM This is a naive recombination. This approach randomly samples the combinatorial space, independently optimizing each site within the context of the initial sequence and then combining the best substitutions from each site into a new variant. 11 , 44 Top96 recombination This is an alternative recombination approach. 
All substitutions are made at each of the sites independently in the background of the initial sequence, calculating fitness for all combinations from single substitution over the initial sequence. The sequences are then ranked based on their fitness, and the top 96 variants are tested in silico . The reported maximum fitness reflects the highest observed among the initial sequence, any single substitutions, and the best of the top 96. 44 MLDE, ALDE, and focused training experiments For each experiment on a given landscape, a range of total number of samples (sizes: 120, 144, 192, 288, 384, 480, 576, 672, 1056, and 2016) were split across training and testing for MLDE or multiple rounds of sampling for ALDE. All results were averaged across 50 replicates. The top 96 predicted variant fitness values were analyzed. Encoding strategies One-hot and learned representations from ESM2 (esm2_t33_650M_UR50D) were tested. One-hot encodings were flattened over the mutated sites. Learned representations from ESM2 (esm2_t33_650M_UR50D) were implemented in three ways, (1) flattened over the mutated sites, (2) mean pooled over the mutated sites, and (3) mean pooled over the full sequence. MLDE experiments For each MLDE experiment on a given landscape, XGBoost 78 and the Scikit-learn ridge regression 79 models were trained on different random samples (sizes: 24, 48, 96, 192, 288, 384, 480, 576, 960, and 1920) with five-fold cross-validation. An alpha value of 1 was used for ridge regression. The ’reg:tweedie’ objective was implemented with an ’early_stopping_rounds’ of 10 for the boosting models. The model ensembles were used to predict variant fitness across the entire library, and the top 96 predicted variant fitness values were analyzed. ALDE experiments For each ALDE experiment on a given landscape, models were trained on different random samples (total number: 120, 144, 192, 288, 384, 480, 576, 672, 1056, and 2016) split across different iterations (rounds: 2, 3, and 4). 
Boosting ensemble and deep neural network ensembles were tested for ALDE. The ’reg:tweedie’ objective was implemented with an ’early_stopping_rounds’ of 10 for the boosting models. The ’torch.optim.Adam’ optimizer with the ’torch.nn.MSELoss’ loss from PyTorch 80 was implemented for deep neural network ensembles with bootstrapping of five models where 90% of the total training data was randomly seen during training. Greedy acquisition functions were deployed for both boosting and deep neural network ensembles. Focused training experiments Different focused training sets (50%, 25%, and 12.5% of total mutants) were ranked by favorable ZS scores with the chosen ZS predictor for each given landscape. These focused sets were then used to sample training data for MLDE or the initial round of ALDE. 12.5% was chosen for all simulations other than examining the optimal focused training library size ( Figure S27 ). Feature correlation and importance analysis To analyze how each landscape attribute correlated with the simulation targets, a Spearman’s correlation was performed between the attribute and the performance of the model. Both the Spearman’s ρ and p-value were reported. To test the differences between binding and enzyme activities, t-tests were performed, where the t-statistic and p-values were reported. A p-value less than 0.05 was considered statistically significant. Supplemental information View this table: View inline View popup Download powerpoint Table S1. Combinatorial landscapes with additional details including landscapes with fewer than 1% active variants, related to Table 1 . 37 , 40 – 44 View this table: View inline View popup Download powerpoint Table S2. MLDE percent improvement from three types of DE, related to Figure 2a . View this table: View inline View popup Download powerpoint Table S3. ftMLDE percent improvement from MLDE, related to Figure 2a . View this table: View inline View popup Download powerpoint Table S4. 
MLDE, ALDE and focused training with 480 total sample size fold improvement from single-step DE, related to Figure 2c . Bold row indicates the landscape with the max improvement and italic rows indicate landscapes with fewer than 1% active variants. View this table: View inline View popup Download powerpoint Table S5. Protein function and MSA impact ZS predictor performances test significance, related to Figure 3c . Correlation between Spearman’s correlation of ZS predictor fitness ranking prediction with MSA depth, where the depth for the EVmutation calculation covering the full sequence is used. Bold font indicates statistically significant (p-value < 0.05). View this table: View inline View popup Download powerpoint Table S6. T-test for ZS predictor between binding and enzyme activities for landscapes with at least 1% active variants, related to Figure 4a . Bold font indicates statistically significant (p-value < 0.05). View this table: View inline View popup Download powerpoint Table S7. T-test for focused training MLDE (480 total sample size) between binding and enzyme activities for landscapes with at least 1% active variants, related to Figure 4b . View this table: View inline View popup Download powerpoint Table S8. T-test for focused training ALDE (480 total sample size split into four rounds) between binding and enzyme activities for landscapes with at least 1% active variants, related to Figure 4b . View this table: View inline View popup Download powerpoint Table S9. T-test for ZS predictor between binding and enzyme activities for landscapes with at least 1% active variants but with single substitution only, related to discussion. Bold font indicates statistically significant (p-value < 0.05). Download figure Open in new tab Figure S1. MLDE and ftMLDE performance averaged across four landscapes with fewer than 1% active variants, related to Figure 2a . Shading indicates standard deviation. Download figure Open in new tab Figure S2. 
Single-step DE, MLDE, ALDE, and focused training results broken down by four landscapes with fewer than 1% active variants. A total sample size of 480 was used for all ML strategies across both metrics, related to Figure 2b . Download figure Open in new tab Figure S3. Correlation of ALDE and ftALDE performance improvement (the average maximum fitness of the top 96 predicted variants by ALDE and ftALDE over single-step DE, y-axis) with six landscape attributes (x-axis), related to Figure 2c . Download figure Open in new tab Figure S4. Mean variant fitness of double-site library (Hamming distance of two) from active variant as the parent, related to Hamming distance in Figure 3 . Download figure Open in new tab Figure S5. Hamming distance fitness ranking using any active variant as the parent, related to Hamming distance in Figure 3 . The dotted line indicates random predictions. Download figure Open in new tab Figure S6. Hamming distance active/inactive variant classification using any active variant as the parent, related to Hamming distance in Figure 3 . The dotted line indicates random predictions. Download figure Open in new tab Figure S7. Multiple rounds of ftALDE averaged across 12 landscapes with more than 1% active variants, related to Figure 3e . Shading indicates standard deviation. Download figure Open in new tab Figure S8. ftMLDE with Hamming distance-ensembled ZS predictors averaged across four landscapes with fewer than 1% active variants, related to Figure 3e . Shading indicates standard deviation. Download figure Open in new tab Figure S9. Multiple rounds of ftALDE averaged across four landscapes with fewer than 1% active variants, related to Figure 3e . Shading indicates standard deviation. Download figure Open in new tab Figure S10. ftMLDE with Hamming distance-ensembled ZS predictors, averaged across 12 landscapes with more than 1% active variants, related to Figure 3e . Shading indicates standard deviation. 
Download figure Open in new tab Figure S11. ftMLDE with Triad-ensembled ZS predictors or ESM-IF and EVmutation ensemble, averaged across 12 landscapes with more than 1% active variants, related to Figure 3e . Shading indicates standard deviation. Download figure Open in new tab Figure S12. ftMLDE with Triad-ensembled ZS predictors or ESM-IF and EVmutation ensemble, averaged across four landscapes with fewer than 1% active variants, related to Figure 3e . Shading indicates standard deviation. Download figure Open in new tab Figure S13. ZS predictor fitness value ranking (left) and active/inactive variant classification (right) for four landscapes with fewer than 1% active variants, related to Figure 4a . Download figure Open in new tab Figure S14. Effects of focused training for ftMLDE with a total sample size of 480 (384 training and 96 testing, top) and 192 (96 training and 96 testing, bottom) for four landscapes with fewer than 1% active variants, related to Figure 4b . Download figure Open in new tab Figure S15. Effects of focused training for two (top) and four (bottom) rounds of ftALDE with a total sample size of 480 for four landscapes with fewer than 1% active variants, related to Figure 4b . Download figure Open in new tab Figure S16. Effects of focused training for two (top) and four (bottom) rounds of ftALDE with a total sample size of 480 for 12 landscapes with at least 1% active variants, related to Figure 4b . Download figure Open in new tab Figure S17. Effects of focused training for two (top) and four (bottom) rounds of ftALDE with a total sample size of 192 for 12 landscapes with at least 1% active variants, related to Figure 4b . Download figure Open in new tab Figure S18. Effects of focused training for ftMLDE with a total sample size of 192 (96 training and 96 testing) for 12 landscapes with at least 1% active variants, related to Figure 4b . Download figure Open in new tab Figure S19. 
ZS predictor for single substitution fitness value ranking (left) and active/inactive variant classification (right) for 12 landscapes with at least 1% active variants, related to discussion and Figure 4a. Statistical significance (p-value <0.05) is indicated as *. Download figure Open in new tab Figure S20. ZS predictor for single substitution fitness value ranking (left) and active/inactive variant classification (right) for landscapes with fewer than 1% active variants, related to discussion and Figure 4a. Statistical significance (p-value <0.05) is indicated as *. Download figure Open in new tab Figure S21. Encoding strategies for MLDE performance, averaged across 12 landscapes with at least 1% active variants. Comparison of learned embeddings from the protein language model ESM2 using different pooling methods vs. one-hot encoding flattened over the substitution sites, related to discussion. Shading indicates standard deviation. Download figure Open in new tab Figure S22. Encoding strategies for EVmutation-guided ftMLDE performance, averaged across landscapes with at least 1% active variants. Comparison of learned embeddings from the protein language model ESM2 using different pooling methods vs. one-hot encoding flattened over the substitution sites, related to discussion. Shading indicates standard deviation. Download figure Open in new tab Figure S23. Encoding strategies for MLDE performance, averaged across four landscapes with fewer than 1% active variants. Comparison of learned embeddings from the protein language model ESM2 using different pooling methods vs. one-hot encoding flattened over the substitution sites, related to discussion. Shading indicates standard deviation. Download figure Open in new tab Figure S24. Encoding strategies for EVmutation-guided ftMLDE performance, averaged across four landscapes with fewer than 1% active variants. 
Comparison of learned embeddings from the protein language model ESM2 using different pooling methods vs. one-hot encoding flattened over the substitution sites, related to discussion. Shading indicates standard deviation. Download figure Open in new tab Figure S25. MLDE and ALDE with different model types, averaged across 12 landscapes with at least 1% active variants. MLDE with boosting or ridge regression. ALDE different rounds with boosting or deep neural network ensembles. No focused training included, related to discussion. Shading indicates standard deviation. Download figure Open in new tab Figure S26. MLDE and ALDE with different model types, averaged across four landscapes with fewer than 1% active variants. MLDE with boosting or ridge regression. ALDE different rounds with boosting or deep neural network ensembles. No focused training included, related to discussion. Shading indicates standard deviation. Download figure Open in new tab Figure S27. The impact of reducing the size of the focused training library relative to the full library on ftMLDE performance averaged across 12 landscapes with at least 1% active variants split into three-site landscapes (top row) and four-site landscapes (bottom row). Related to discussion. Acknowledgments The authors thank Sabine Brinkmann-Chen, Tanvi Ganapathy, Ariane Mora, Chenghao Liu, Yueming Long, Julia Reisenbauer, Casey Ritts, Kathleen Sicinski, Bruce Wittmann, Kevin Yang, and the other members of the Arnold Lab for critical reading and discussion of the manuscript, Andrei Papkou, Vikram Sundar, and Boqiang Tu for dataset discussion, and Thomas Hopf and Aviv Spinner for assistance with EVmutation implementation. This work was supported by the NSF Division of Chemical, Bioengineering, Environmental and Transport Systems (CBET 1937902) and Amgen Chem-Bio-Engineering Award (CBEA AMGEN.ARNOLD22). F.Z.L. and J.Y. were partially supported by the National Science Foundation Graduate Research Fellowship and F.Z.L. 
was partially supported by the Amazon AI4Science Fellowship at Caltech. Footnotes ↵ 6 Lead contact https://github.com/fhalab/SSMuLA https://doi.org/10.5281/zenodo.13910506 References 1. ↵ Lutz , S. , and Iamurri , S.M . ( 2018 ). Protein Engineering: Past, Present, and Future. Methods Mol. Biol. Clifton NJ 1685 , 1 – 12 . doi: 10.1007/978-1-4939-7366-8_1 . OpenUrl CrossRef 2. ↵ Arnold , F.H. ( 2018 ). Directed Evolution: Bringing New Chemistry to Life . Angew. Chem. Int. Ed. 57 , 4143 – 4148 . doi: 10.1002/anie.201708408 . OpenUrl CrossRef PubMed 3. ↵ Packer , M.S. , and Liu , D.R . ( 2015 ). Methods for the directed evolution of proteins . Nat. Rev. Genet . 16 , 379 – 394 . doi: 10.1038/nrg3927 . OpenUrl CrossRef PubMed 4. ↵ Wang , Y. , Xue , P. , Cao , M. , Yu , T. , Lane , S.T. , and Zhao , H . ( 2021 ). Directed Evolution: Methodologies and Applications . Chem. Rev . 121 , 12384 – 12444 . doi: 10.1021/acs.chemrev.1c00260 . OpenUrl CrossRef 5. ↵ Romero , P.A. , and Arnold , F.H . ( 2009 ). Exploring protein fitness landscapes by directed evolution . Nat. Rev. Mol. Cell Biol . 10 , 866 – 876 . doi: 10.1038/nrm2805 . OpenUrl CrossRef PubMed Web of Science 6. ↵ Maynard Smith , J. ( 1970 ). Natural selection and the concept of a protein space . Nature 225 , 563 – 564 . doi: 10.1038/225563a0 . OpenUrl CrossRef PubMed Web of Science 7. ↵ Starr , T.N. , and Thornton , J.W . ( 2016 ). Epistasis in protein evolution . Protein Sci. Publ. Protein Soc . 25 , 1204 . doi: 10.1002/PRO.2897 . OpenUrl CrossRef 8. ↵ Miton , C.M. , Buda , K. , and Tokuriki , N . ( 2021 ). Epistasis and intramolecular networks in protein evolution . Curr. Opin. Struct. Biol . 69 , 160 – 168 . doi: 10.1016/j.sbi.2021.04.007 . OpenUrl CrossRef PubMed 9. ↵ Anishchenko , I. , Ovchinnikov , S. , Kamisetty , H. , and Baker , D . ( 2017 ). Origins of coevolution between residues distant in protein 3D structures . Proc. Natl. Acad. Sci . 114 , 9122 – 9127 . doi: 10.1073/pnas.1702664114 . 
OpenUrl Abstract / FREE Full Text 10. ↵ Bell , E.L. , Finnigan , W. , France , S.P. , Green , A.P. , Hayes , M.A. , Hepworth , L.J. , Lovelock , S.L. , Niikura , H. , Osuna , S. , Romero , E. , et al. ( 2021 ). Biocatalysis . Nat. Rev. Methods Primer 1 , 1 – 21 . doi: 10.1038/s43586-021-00044-z . OpenUrl CrossRef 11. ↵ Wu , Z. , Kan , S.B.J. , Lewis , R.D. , Wittmann , B.J. , and Arnold , F.H . ( 2019 ). Machine learning-assisted directed protein evolution with combinatorial libraries . Proc. Natl. Acad. Sci . 116 , 8852 – 8858 . doi: 10.1073/pnas.1901979116 . OpenUrl Abstract / FREE Full Text 12. ↵ Wittmann , B.J. , Yue , Y. , and Arnold , F.H . ( 2021 ). Informed training set design enables efficient machine learning-assisted directed protein evolution . Cell Syst . doi: 10.1016/j.cels.2021.07.008 . OpenUrl CrossRef 13. ↵ Yang , K.K. , Wu , Z. , and Arnold , F.H . ( 2019 ). Machine-learning-guided directed evolution for protein engineering . Nat. Methods 16 , 687 – 694 . doi: 10.1038/s41592-019-0496-6 . OpenUrl CrossRef PubMed 14. ↵ Johnston , K.E. , Fannjiang , C. , Wittmann , B.J. , Hie , B.L. , Yang , K.K. , and Wu , Z . ( 2023 ). Machine Learning for Protein Engineering. ArXiv , arXiv:2305.16634v1 . OpenUrl 15. ↵ Qiu , Y. , Hu , J. , and Wei , G.-W . ( 2021 ). Cluster learning-assisted directed evolution . Nat. Comput. Sci . 1 , 809 – 818 . doi: 10.1038/s43588-021-00168-y . OpenUrl CrossRef PubMed 16. ↵ Qiu , Y. , and Wei , G.-W . ( 2022 ). CLADE 2.0: Evolution-Driven Cluster Learning-Assisted Directed Evolution . J. Chem. Inf. Model . 62 , 4629 – 4641 . doi: 10.1021/acs.jcim.2c01046 . OpenUrl CrossRef PubMed 17. ↵ Yang , J. , Lal , R.G. , Bowden , J.C. , Astudillo , R. , Hameedi , M.A. , Kaur , S. , Hill , M. , Yue , Y. , and Arnold , F.H . ( 2024 ). Active Learning-Assisted Directed Evolution . Preprint at bioRxiv , doi: 10.1101/2024.07.27.605457 . OpenUrl Abstract / FREE Full Text 18. ↵ Hopf , T.A. , Ingraham , J.B. , Poelwijk , F.J. 
, Schärfe , C.P.I. , Springer , M. , Sander , C. , and Marks , D.S . ( 2017 ). Mutation effects predicted from sequence co-variation . Nat. Biotechnol . 35 , 128 – 135 . doi: 10.1038/nbt.3769 . OpenUrl CrossRef PubMed 19. ↵ Meier , J. , Rao , R. , Verkuil , R. , Liu , J. , Sercu , T. , and Rives , A . ( 2021 ). Language models enable zero-shot prediction of the effects of mutations on protein function . bioRxiv , 2021.07.09.450648. doi: 10.1101/2021.07.09.450648 . OpenUrl Abstract / FREE Full Text 20. ↵ Hsu , C. , Verkuil , R. , Liu , J. , Lin , Z. , Hie , B. , Sercu , T. , Lerer , A. , and Rives , A . ( 2022 ). Learning inverse folding from millions of predicted structures . Preprint , doi: 10.1101/2022.04.10.487779 . OpenUrl Abstract / FREE Full Text 21. ↵ Ding , D. , Shaw , A.Y. , Sinai , S. , Rollins , N. , Prywes , N. , Savage , D.F. , Laub , M.T. , and Marks , D.S . ( 2024 ). Protein design using structure-based residue preferences . Nat. Commun . 15 , 1639 . doi: 10.1038/s41467-024-45621-4 . OpenUrl CrossRef PubMed 22. ↵ Bloom , J.D. , Labthavikul , S.T. , Otey , C.R. , and Arnold , F.H . ( 2006 ). Protein stability promotes evolvability . Proc. Natl. Acad. Sci . 103 , 5869 – 5874 . doi: 10.1073/pnas.0510098103 . OpenUrl Abstract / FREE Full Text 23. ↵ Hsu , C. , Nisonoff , H. , Fannjiang , C. , and Listgarten , J . ( 2022 ). Learning protein fitness models from evolutionary and assay-labeled data . Nat. Biotechnol ., 1 – 9 . doi: 10.1038/s41587-021-01146-5 . OpenUrl CrossRef 24. Yang , K.K. , Zanichelli , N. , and Yeh , H . ( 2023 ). Masked inverse folding with sequence transfer for protein representation learning . Protein Eng. Des. Sel . 36 , gzad015 . doi: 10.1093/protein/gzad015 . OpenUrl CrossRef 25. ↵ Notin , P. , Kollasch , A.W. , Ritter , D. , Niekerk , L. van , Paul , S. , Spinner , H. , Rollins , N. , Shaw , A ., Weitzman , R. , Frazer , J. , et al. ( 2023 ). ProteinGym: Large-Scale Benchmarks for Protein Design and Fitness Prediction . 
Preprint at bioRxiv , doi: 10.1101/2023.12.07.570727 . OpenUrl Abstract / FREE Full Text 26. ↵ Bedbrook , C.N. , Yang , K.K. , Rice , A.J. , Gradinaru , V. , and Arnold , F.H . ( 2017 ). Machine learning to design integral membrane channelrhodopsins for efficient eukaryotic expression and plasma membrane localization . PLOS Comput. Biol . 13 , e1005786 . doi: 10.1371/journal.pcbi.1005786 . OpenUrl CrossRef 27. Bedbrook , C.N. , Yang , K.K. , Robinson , J.E. , Mackey , E.D. , Gradinaru , V. , and Arnold , F.H . ( 2019 ). Machine learning-guided channelrhodopsin engineering enables minimally invasive optogenetics . Nat. Methods 16 , 1176 – 1184 . doi: 10.1038/s41592-019-0583-8 . OpenUrl CrossRef 28. Thomas , N. , Belanger , D. , Xu , C. , Lee , H. , Hirano , K. , Iwai , K. , Polic , V. , Nyberg , K.D. , Hoff , K. , Frenz , L. , et al. ( 2024 ). Engineering highly active and diverse nuclease enzymes by combining machine learning and ultra-high-throughput screening . Preprint at bioRxiv , doi: 10.1101/2024.03.21.585615 . OpenUrl Abstract / FREE Full Text 29. Rapp , J.T. , Bremer , B.J. , and Romero , P.A . ( 2024 ). Self-driving laboratories to autonomously navigate the protein fitness landscape . Nat. Chem. Eng . 1 , 97 – 107 . doi: 10.1038/s44286-023-00002-4 . OpenUrl CrossRef 30. Romero , P.A. , Krause , A. , and Arnold , F.H . ( 2013 ). Navigating the protein fitness landscape with Gaussian processes . Proc. Natl. Acad. Sci . 110 , E193 – E201 . doi: 10.1073/pnas.1215251110 . OpenUrl Abstract / FREE Full Text 31. Bryant , D.H. , Bashir , A. , Sinai , S. , Jain , N.K. , Ogden , P.J. , Riley , P.F. , Church , G.M. , Colwell , L.J. , and Kelsic , E.D . ( 2021 ). Deep diversification of an AAV capsid protein by machine learning . Nat. Biotechnol . 39 , 691 – 696 . doi: 10.1038/s41587-020-00793-4 . OpenUrl CrossRef 32. Freschlin , C.R. , Fahlberg , S.A. , and Romero , P.A . ( 2022 ). Machine learning to navigate fitness landscapes for protein engineering . Curr. Opin. 
Biotechnol . 75 , 102713 . doi: 10.1016/j.copbio.2022.102713 . OpenUrl CrossRef 33. Yang , J. , Li , F.-Z. , and Arnold , F.H . ( 2024 ). Opportunities and Challenges for Machine Learning-Assisted Enzyme Engineering . ACS Cent. Sci . 10 , 226 – 241 . doi: 10.1021/acscentsci.3c01275 . OpenUrl CrossRef PubMed 34. Angermueller , C. , Mariet , Z. , Jester , B. , Engelhart , E. , Emerson , R. , Alipanahi , B. , Lin , C. , Shikany , C. , Guion , D. , Nelson , J. , et al. ( 2023 ). High-throughput ML-guided design of diverse single-domain antibodies against SARS-CoV-2 . Preprint at bioRxiv , doi: 10.1101/2023.12.01.569227 . OpenUrl Abstract / FREE Full Text 35. Guo , J. , Lin , L.F. , Oraskovich , S.V. , Jesús , J.A.R. de , Listgarten , J. , and Schaffer , D.V. ( 2024 ). Computationally guided AAV engineering for enhanced gene delivery . Trends Biochem. Sci . 49 , 457 – 469 . doi: 10.1016/j.tibs.2024.03.002 . OpenUrl CrossRef PubMed 36. ↵ Gelman , S. , Johnson , B. , Freschlin , C. , D’Costa , S. , Gitter , A. , and Romero , P.A . ( 2024 ). Biophysics-based protein language models for protein engineering . Preprint at bioRxiv , doi: 10.1101/2024.03.15.585128 . OpenUrl Abstract / FREE Full Text 37. ↵ Wu , N.C. , Dai , L. , Olson , C.A. , Lloyd-Smith , J.O. , and Sun , R . ( 2016 ). Adaptation in protein fitness landscapes is facilitated by indirect paths . eLife 5 , e16965 . doi: 10.7554/eLife.16965 . OpenUrl CrossRef PubMed 38. ↵ Riesselman , A.J. , Ingraham , J.B. , and Marks , D.S . ( 2018 ). Deep generative models of genetic variation capture the effects of mutations . Nat. Methods 15 , 816 – 822 . doi: 10.1038/s41592-018-0138-4 . OpenUrl CrossRef PubMed 39. ↵ Jalal , A.S.B. , Tran , N.T. , Stevenson , C.E. , Chan , E.W. , Lo , R. , Tan , X. , Noy , A. , Lawson , D.M. , and Le , T.B.K . ( 2020 ). Diversification of DNA-Binding Specificity by Permissive and Specificity-Switching Mutations in the ParB/Noc Protein Family . Cell Rep . 32 , 107928 . 
doi: 10.1016/j.celrep.2020.107928 . OpenUrl CrossRef PubMed 40. ↵ Lite , T.-L.V. , Grant , R.A. , Nocedal , I. , Littlehale , M.L. , Guo , M.S. , and Laub , M.T . ( 2020 ). Uncovering the basis of protein-protein interaction specificity with a combinatorially complete library . eLife 9 , e60924 . doi: 10.7554/eLife.60924 . OpenUrl CrossRef 41. ↵ Papkou , A. , Garcia-Pastor , L. , Escudero , J.A. , and Wagner , A . ( 2023 ). A rugged yet easily navigable fitness landscape . Science 382 , eadh3860 . doi: 10.1126/science.adh3860 . OpenUrl CrossRef PubMed 42. ↵ Tu , B. , Sundar , V. , and Esvelt , K.M . ( 2024 ). An ultra-high-throughput method for measuring biomolecular activities . Preprint at bioRxiv , doi: 10.1101/2022.03.09.483646 . OpenUrl Abstract / FREE Full Text 43. ↵ Sundar , V. , Tu , B. , Guan , L. , and Esvelt , K . ( 2024 ). FLIGHTED: Inferring Fitness Landscapes from Noisy High-Throughput Experimental Data . Preprint , doi: 10.1101/2024.03.26.586797 . OpenUrl Abstract / FREE Full Text 44. ↵ Johnston , K.E. , Almhjell , P.J. , Watkins-Dulaney , E.J. , Liu , G. , Porter , N.J. , Yang , J. , and Arnold , F.H . ( 2024 ). A combinatorially complete epistatic fitness landscape in an enzyme active site . Proc. Natl. Acad. Sci . 121 , e2400439121 . doi: 10.1073/pnas.2400439121 . OpenUrl CrossRef PubMed 45. ↵ Van Cleve , J. , and Weissman , D.B. ( 2015 ). Measuring ruggedness in fitness landscapes . Proc. Natl. Acad. Sci . 112 , 7345 – 7346 . doi: 10.1073/pnas.1507916112 . OpenUrl FREE Full Text 46. ↵ de Visser , J.A.G.M. , and Krug , J. ( 2014 ). Empirical fitness landscapes and the predictability of evolution . Nat. Rev. Genet . 15 , 480 – 490 . doi: 10.1038/nrg3744 . OpenUrl CrossRef PubMed 47. ↵ Sendero , I.G. , Schenk , M.F. , Franke , J. , Krug , J. , and Visser, J.A.G.M. de ( 2013 ). Quantitative analyses of empirical fitness landscapes . J. Stat. Mech. Theory Exp . 2013 , P01005 . doi: 10.1088/1742-5468/2013/01/P01005 . OpenUrl CrossRef 48. Aita , T. 
, and Husimi , Y . ( 1996 ). Fitness Spectrum Among Random Mutants on Mt. Fuji-Type Fitness Landscape . J. Theor. Biol . 182 , 469 – 485 . doi: 10.1006/jtbi.1996.0189 . OpenUrl CrossRef PubMed Web of Science 49. ↵ Kauffman , S. , and Levin , S . ( 1987 ). Towards a general theory of adaptive walks on rugged landscapes . J. Theor. Biol . 128 , 11 – 45 . doi: 10.1016/S0022-5193(87)80029-2 . OpenUrl CrossRef PubMed Web of Science 50. Kingman , J.F.C . ( 1978 ). A simple model for the balance between selection and mutation . J. Appl. Probab . 15 , 1 – 12 . doi: 10.2307/3213231 . OpenUrl CrossRef Web of Science 51. Crona , K. , Greene , D. , and Barlow , M . ( 2013 ). The peaks and geometry of fitness landscapes . J. Theor. Biol . 317 , 1 – 10 . doi: 10.1016/j.jtbi.2012.09.028 . OpenUrl CrossRef PubMed Web of Science 52. Kondrashov , D.A. , and Kondrashov , F.A . ( 2015 ). Topological features of rugged fitness landscapes in sequence space . Trends Genet . 31 , 24 – 33 . doi: 10.1016/j.tig.2014.09.009 . OpenUrl CrossRef PubMed 53. ↵ Thomas , N. , Agarwala , A. , Belanger , D. , Song , Y.S. , and Colwell , L . ( 2022 ). Tuned Fitness Landscapes for Benchmarking Model-Guided Protein Design . Preprint at bioRxiv , doi: 10.1101/2022.10.28.514293 . OpenUrl Abstract / FREE Full Text 54. ↵ Park , Y. , Metzger , B.P.H. , and Thornton , J.W . ( 2024 ). The simplicity of protein sequence-function relationships . Preprint at bioRxiv , doi: 10.1101/2023.09.02.556057 . OpenUrl Abstract / FREE Full Text 55. ↵ Triad https://triad.protabit.com/ . 56. ↵ Hopf , T.A. , Green , A.G. , Schubert , B. , Mersmann , S. , Schärfe , C.P.I. , Ingraham , J.B. , Toth-Petroczy , A. , Brock , K. , Riesselman , A.J. , Palmedo , P. , et al. ( 2019 ). The EVcouplings Python framework for coevolutionary sequence analysis . Bioinformatics 35 , 1582 – 1584 . doi: 10.1093/bioinformatics/bty862 . OpenUrl CrossRef PubMed 57. ↵ Rives , A. , Meier , J. , Sercu , T. , Goyal , S. , Lin , Z. , Liu , J. , Guo , D. 
, Ott , M. , Zitnick , C.L. , Ma , J. , et al. ( 2021 ). Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences . Proc. Natl. Acad. Sci . 118 . doi: 10.1073/pnas.2016239118 . OpenUrl Abstract / FREE Full Text 58. ↵ Lin , Z. , Akin , H. , Rao , R. , Hie , B. , Zhu , Z. , Lu , W. , Smetanin , N. , Verkuil , R. , Kabeli , O. , Shmueli , Y. , et al. ( 2023 ). Evolutionary-scale prediction of atomic-level protein structure with a language model . Science 379 , 1123 – 1130 . doi: 10.1126/science.ade2574 . OpenUrl CrossRef PubMed 59. Yang , K.K. , Fusi , N. , and Lu , A.X . ( 2024 ). Convolutions are competitive with transformers for protein sequence pretraining . Cell Syst. , S2405471224000292 . doi: 10.1016/j.cels.2024.01.008 . OpenUrl CrossRef 60. Elnaggar , A. , Heinzinger , M. , Dallago , C. , Rehawi , G. , Wang , Y. , Jones , L. , Gibbs , T. , Feher , T. , Angerer , C. , Steinegger , M. , et al. ( 2021 ). ProtTrans: Towards Cracking the Language of Life’s Code Through Self-Supervised Learning doi: 10.1101/2020.07.12.199554 . OpenUrl Abstract / FREE Full Text 61. Elnaggar , A. , Essam , H. , Salah-Eldin , W. , Moustafa , W. , Elkerdawy , M. , Rochereau , C. , and Rost , B . ( 2023 ). Ankh: Optimized Protein Language Model Unlocks General-Purpose Modelling . Preprint at arXiv , doi: 10.48550/arXiv.2301.06568 . OpenUrl CrossRef 62. Ofer , D. , Brandes , N. , and Linial , M . ( 2021 ). The language of proteins: NLP, machine learning & protein sequences . Comput. Struct. Biotechnol. J . 19 , 1750 – 1758 . doi: 10.1016/j.csbj.2021.03.022 . OpenUrl CrossRef PubMed 63. ↵ Bepler , T. , and Berger , B . ( 2021 ). Learning the protein language: Evolution, structure, and function . Cell Syst . 12 , 654 – 669 .e3. doi: 10.1016/j.cels.2021.05.017 . OpenUrl CrossRef PubMed 64. ↵ Johnson , S.R. , Fu , X. , Viknander , S. , Goldin , C. , Monaco , S. , Zelezniak , A. , and Yang , K.K . ( 2024 ). 
Computational scoring and experimental evaluation of enzymes generated by neural networks . Nat. Biotechnol ., 1–10. doi: 10.1038/s41587-024-02214-2 . OpenUrl CrossRef 65. ↵ Lu , W. , Zhang , J. , Rao , J. , Zhang , Z. , and Zheng , S . ( 2024 ). AlphaFold3, a secret sauce for predicting mutational effects on protein-protein interactions . Preprint at bioRxiv , doi: 10.1101/2024.05.25.595871 . OpenUrl Abstract / FREE Full Text 66. ↵ Li , F.-Z. , Amini , A.P. , Yue , Y. , Yang , K.K. , and Lu , A.X . ( 2024 ). Feature Reuse and Scaling: Understanding Transfer Learning with Protein Language Models . In Proceedings of the 41st International Conference on Machine Learning (PMLR) , pp. 27351 – 27375 . 67. ↵ Chowdhury , R. , Bouatta , N. , Biswas , S. , Floristean , C. , Kharkar , A. , Roy , K. , Rochereau , C. , Ahdritz , G. , Zhang , J. , Church , G.M. , et al. ( 2022 ). Single-sequence protein structure prediction using a language model and deep learning . Nat. Biotechnol . 40 , 1617 – 1623 . doi: 10.1038/s41587-022-01432-w . OpenUrl CrossRef PubMed 68. ↵ Zhao , H. , Giver , L. , Shao , Z. , Affholter , J.A. , and Arnold , F.H . ( 1998 ). Molecular evolution by staggered extension process (StEP) in vitro recombination . Nat. Biotechnol . 16 , 258 – 261 . doi: 10.1038/nbt0398-258 . OpenUrl CrossRef PubMed Web of Science 69. ↵ Almhjell , P.J. , Johnston , K.E. , Porter , N.J. , Kennemur , J.L. , Bhethanabotla , V.C. , Ducharme , J. , and Arnold , F.H . ( 2024 ). The β-subunit of tryptophan synthase is a latent tyrosine synthase . Nat. Chem. Biol . 20 , 1086 – 1093 . doi: 10.1038/s41589-024-01619-z . OpenUrl CrossRef PubMed 70. ↵ Ding , K. , Chin , M. , Zhao , Y. , Huang , W. , Mai , B.K. , Wang , H. , Liu , P. , Yang , Y. , and Luo , Y . ( 2024 ). Machine learning-guided co-optimization of fitness and diversity facilitates combinatorial library design in enzyme engineering . Nat. Commun . 15 , 6392 . doi: 10.1038/s41467-024-50698-y . OpenUrl CrossRef PubMed 71. 
↵ Wittmann , B.J. , Johnston , K.E. , Almhjell , P.J. , and Arnold , F.H . ( 2022 ). evSeq: Cost-Effective Amplicon Sequencing of Every Variant in a Protein Library . ACS Synth. Biol . doi: 10.1021/acssynbio.1c00592 . OpenUrl CrossRef 72. ↵ Long , Y. , Mora , A. , Gürsoy , E. , Johnston , K.E. , Zhoufan-Li , F. , and Arnold , F.H . ( 2024 ). LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning . Preprint at bioRxiv , doi: 10.1101/2024.09.04.611255 . OpenUrl Abstract / FREE Full Text 73. ↵ Hoose , A. , Vellacott , R. , Storch , M. , Freemont , P.S. , and Ryadnov , M.G . ( 2023 ). DNA synthesis technologies to close the gene writing gap . Nat. Rev. Chem . 7 , 144 – 161 . doi: 10.1038/s41570-022-00456-9 . OpenUrl CrossRef PubMed 74. ↵ Yang , J. , Ducharme , J. , Johnston , K.E. , Li , F.-Z. , Yue , Y. , and Arnold , F.H . ( 2023 ). DeCOIL: Optimization of Degenerate Codon Libraries for Machine Learning-Assisted Protein Engineering . ACS Synth. Biol . 12 , 2444 – 2454 . doi: 10.1021/acssynbio.3c00301 . OpenUrl CrossRef PubMed 75. ↵ Virtanen , P. , Gommers , R. , Oliphant , T.E. , Haberland , M. , Reddy , T. , Cournapeau , D. , Burovski , E. , Peterson , P. , Weckesser , W. , Bright , J. , et al. ( 2020 ). SciPy 1.0: fundamental algorithms for scientific computing in Python . Nat. Methods 17 , 261 – 272 . doi: 10.1038/s41592-019-0686-2 . OpenUrl CrossRef PubMed 76. ↵ Berman , H.M. , Westbrook , J. , Feng , Z. , Gilliland , G. , Bhat , T.N. , Weissig , H. , Shindyalov , I.N. , and Bourne , P.E . ( 2000 ). The Protein Data Bank . Nucleic Acids Res . 28 , 235 – 242 . doi: 10.1093/nar/28.1.235 . OpenUrl CrossRef PubMed Web of Science 77. ↵ Jing , B. , Eismann , S. , Suriana , P. , Townshend , R.J.L. , and Dror , R . ( 2021 ). Learning from Protein Structure with Geometric Vector Perceptrons . Preprint at arXiv , doi: 10.48550/arXiv.2009.01411 . OpenUrl CrossRef 78. ↵ Chen , T. , and Guestrin , C . ( 2016 ). 
XGBoost: A Scalable Tree Boosting System . In Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining KDD ’16 . ( Association for Computing Machinery ), pp. 785–794. doi: 10.1145/2939672.2939785 . OpenUrl CrossRef 79. ↵ Buitinck , L. , Louppe , G. , Blondel , M. , Pedregosa , F. , Mueller , A. , Grisel , O. , Niculae , V. , Prettenhofer , P. , Gramfort , A. , Grobler , J. , et al. ( 2013 ). API design for machine learning software: experiences from the scikit-learn project . Preprint at arXiv , doi: 10.48550/arXiv.1309.0238 . OpenUrl CrossRef 80. ↵ Paszke , A. , Gross , S. , Massa , F. , Lerer , A. , Bradbury , J. , Chanan , G. , Killeen , T. , Lin , Z. , Gimelshein , N. , Antiga , L. , et al. ( 2019 ). PyTorch: An Imperative Style , High-Performance Deep Learning Library. Preprint at arXiv , doi: 10.48550/arXiv.1912.01703 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted October 24, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Evaluation of Machine Learning-Assisted Directed Evolution Across Diverse Combinatorial Landscapes Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Evaluation of Machine Learning-Assisted Directed Evolution Across Diverse Combinatorial Landscapes Francesca-Zhoufan Li , Jason Yang , Kadina E. Johnston , Emre Gürsoy , Yisong Yue , Frances H. 
Arnold bioRxiv 2024.10.24.619774; doi: https://doi.org/10.1101/2024.10.24.619774 Share This Article: Copy Citation Tools Evaluation of Machine Learning-Assisted Directed Evolution Across Diverse Combinatorial Landscapes Francesca-Zhoufan Li , Jason Yang , Kadina E. Johnston , Emre Gürsoy , Yisong Yue , Frances H. Arnold bioRxiv 2024.10.24.619774; doi: https://doi.org/10.1101/2024.10.24.619774 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioengineering Subject Areas All Articles Animal Behavior and Cognition (7649) Biochemistry (17738) Bioengineering (13925) Bioinformatics (42059) Biophysics (21496) Cancer Biology (18643) Cell Biology (25577) Clinical Trials (138) Developmental Biology (13406) Ecology (19946) Epidemiology (2067) Evolutionary Biology (24370) Genetics (15627) Genomics (22551) Immunology (17772) Microbiology (40497) Molecular Biology (17212) Neuroscience (88786) Paleontology (667) Pathology (2845) Pharmacology and Toxicology (4835) Physiology (7663) Plant Biology (15177) Scientific Communication and Education (2047) Synthetic Biology (4304) Systems Biology (9838) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00