Negative epistasis limits current codon optimization approaches

doi:10.1101/2025.06.03.657573

Negative epistasis limits current codon optimization approaches

2025 · doi:10.1101/2025.06.03.657573

preprint OA: gold CC-BY-NC-ND-4.0

📄 Open PDF Full text JSON View at publisher

Full text 90,168 characters · extracted from preprint-html · click to expand

Negative epistasis limits current codon optimization approaches | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Negative epistasis limits current codon optimization approaches Marco Gees , View ORCID Profile Zrinka Raguz Nakic , View ORCID Profile Maria Anisimova , View ORCID Profile Victor Garcia , View ORCID Profile Christin Peters doi: https://doi.org/10.1101/2025.06.03.657573 Marco Gees 1 Institute of Applied Computational Life Sciences, Zurich University of Applied Sciences , Schloss 1, 8820 Wädenswil, Switzerland 2 Institute of Chemistry and Biotechnology, Zurich University of Applied Sciences , Einsiedlerstrasse 29, 8820 Wädenswil, Switzerland Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zrinka Raguz Nakic 2 Institute of Chemistry and Biotechnology, Zurich University of Applied Sciences , Einsiedlerstrasse 29, 8820 Wädenswil, Switzerland Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zrinka Raguz Nakic For correspondence: zrinka.raguznakic{at}stadtspital.ch Maria Anisimova 1 Institute of Applied Computational Life Sciences, Zurich University of Applied Sciences , Schloss 1, 8820 Wädenswil, Switzerland 3 Swiss Institute of Bioinformatics , Quartier Sorge Bâtiment Genopode, 1015 Lausanne, Switzerland Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Maria Anisimova Victor Garcia 1 Institute of Applied Computational Life Sciences, Zurich University of Applied Sciences , Schloss 1, 8820 Wädenswil, Switzerland 3 Swiss Institute of Bioinformatics , Quartier Sorge Bâtiment Genopode, 1015 Lausanne, Switzerland Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Victor Garcia For correspondence: victor.garcia_palencia{at}alumni.ethz.ch Christin Peters 2 Institute of Chemistry and Biotechnology, Zurich University of Applied Sciences , Einsiedlerstrasse 29, 8820 Wädenswil, Switzerland Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Christin Peters For correspondence: peters.christin{at}gmx.de Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Demand for high-yield protein production in biotechnological applications is driving efforts to maximize heterologous protein expression in scalable microorganisms such as E. coli . While codon optimization techniques employed by contemporary sequence providers promise high-expression products, expression levels are often unsatisfactory. Whether the causes for this performance unreliability are due to fundamental constraints on the predictability of protein yields, or whether they stem from differences in theoretical approaches, is unknown. Here, we performed a comparative analysis to address this question. We assessed the performance of twelve different optimization approaches at enhancing expression of a sequence encoding a cinnamyl alcohol dehydrogenase. Six approaches stemmed from commercial providers and six from freely available sources. Through analysis of their elongation time profiles and multidimensional scaling we assessed which algorithms follow a unique optimization approach. We found that codon-optimized sequences are, on average, capable to raise protein expression levels with respect to the nonoptimised source sequence. However, variation in the protein expression levels was large. Simple, non-proprietary optimization techniques were capable of achieving protein expression levels that fall within the top expression range amongst candidate sequences. Lastly, we found that negative epistasis influences a sequence’s protein expression level. Since therefore the protein expression landscape arising from synonymous sequence space must exhibit a non-negligible degree of ruggedness, standard approaches will be limited in their capacity to predict protein expression levels of sequences. Introduction The increasing demand for high-yield proteins in therapeutic [ 1 ], industrial, and research settings drives the need to harness their expression at significant scales [ 2 ]. A prominent strategy in this domain is heterologous expression, where a protein native to a particular ‘origin’ organism is produced in a ‘target’ organism that can more easily be scaled for mass production [ 3 , 4 ]. Heterologous expression approaches face significant challenges. Genomic sequences from one organism have evolved over large time spans, and have been tailored by evolution to match the unique genomic environment of their host [ 5 , 6 ]. Such sequences are replete with modifications that reflect the past environmental pressures faced by the source organism. Given the great energetic costs imposed on the organism by translation [ 7 ], sequences are expected to reflect, in particular, selection for cost efficient translation [ 5 , 8 , 9 ]. Translocating these sequences into a different host organism introduces them to a foreign genomic landscape, characterized by new and different translational features. Thus, selectively beneficial changes acquired in the source context might turn maladaptive in the new target environment, jeopardizing yield [ 3 , 4 , 10 , 11 ]. A common approach to mitigate these risks is codon optimization , a method that alters the source sequence composition via synonymous substitutions, thereby preserving the order of amino acids that it encodes [ 12 – 14 ]. Different codon optimization approaches and algorithms are currently made available –especially for Escherichia coli – by a variety of commercial providers of transgenic DNA products as well as open-source software [ 15 ]. However, the reliability if these algorithms is generally poor [ 16 ] and the source codes of many of the algorithms in use remain undisclosed, complicating a comprehensive assessment of current-level capacities to predict protein expression yield for a sequence. In this study, we investigated the performance of six current codon optimization techniques by commercial providers as well as six non-proprietary optimization algorithms for the A. thaliana gene CAD5 (AtCAD5), which we heterologously expressed in two strains of E. coli , BL21(DE3) and K12(DE3). Our study is divided in three parts. First, a characterization of the sequences themselves and their inter-relationships according to the optimization approach employed. Second, the description of the protein expression features of these sequences. And third, we combine the information of these two assessments to test for the presence of epistasic effects in protein expression. We conclude that the observed variability in the expression levels attained may partly be explained by the inherent ruggedness present in the protein expression profile of the synonymous sequence of the AtCAD5 sequence. This result implies that there exist fundamental limitations on the predictability of protein expression levels of sequences by current approaches. Results The landscape of codon-optimized sequences To assess the performance of currently used codon-optimization techniques, we heterologously expressed 22 distinct sequences of the AtCAD5 gene, originally described in Arabidopsis thaliana , in two common laboratory strains of E. coli , BL21(DE3) and K12(DE3). AtCAD5 encodes a catalytically active cinnamyl alcohol dehydrogenase, a protein class which is expected to express well in E. coli . In plants, it is involved in and acts upstream of the lignin biosynthetic process (see Table II for references and details), which has received increasing attention for its potential to sequester carbon and thus mitigate the effects of global warming [ 17 , 18 ]. Two splice variants are known, of which we chose AT4G34230.1, the longer variant, whose coding sequence contains 357 codons. View this table: View inline View popup Download powerpoint TABLE I View this table: View inline View popup Download powerpoint TABLE II: Summary of data sources used. These 22 synonomous sequences were optimized by either proprietary software, open-source algorithms or by algorithms developed at the Institute of Computational Life Sciences (ICLS) in Wädenswil. In the proprietary subset, we used optimization algorithms from Twist Bioscience , the GENEius algorithm by Eurofins Scientific SE, Genscript Biotech, NovoPro Bioscience, GeneWiz from Azenta Life Sciences and Integrated DNA Technologies (IDT) (see Materials and Methods for overview and Supporting Information for detailed sequence description). Three sequences were produced by opensource algorithms: One sequence was optimized using the package GeneGA [ 19 ] within the Bioconductor package [ 20 ], one with JCat (Java Codon Adaptation Tool) [ 21 ], and one with the use of the Ana-Coda package [ 22 , 23 ]. Another three sequences were optimized by algorithms developed at the Institute of Computational Life Sciences at ZHAW. The rationale behind the development of our own optimization strategy was based on the notion that indiscriminately pruning sequences from translationally inefficient codons can jeopardize the stability of the translational and folding processes [ 24 , 25 ]. Where these codons are preserved by purifying selection, they are likely supporting a key function for the organism, and should therefore be retained. To implement this idea, three approaches were developed: A mirrored approach, where codon efficiencies in the host sequence are matched to the codon efficiencies of the source sequence. In a second approach, termed mirrored and restricted the sequence is additionally pruned of a predefined set of restriction sites. In a third approachwe replaced the most inefficient codons from the mirrored and restricted sequences with the second-most inefficient codons, giving rise to the mirrored and capped sequence candidate. A summary of the set of sequences used and the references to their design algorithms is given in Table III . View this table: View inline View popup Download powerpoint TABLE III: Summary of the heterologous expression of the AtCAD5 gene sequences in E. coli. The table lists the sources or algorithms used for codon optimization, the number of sequences optimized by each source, whether the generating algorithm is open-source, references where the algorithm is describe in detail (where available), and the urls where the optimization algorithms were used or may be downloaded. All sequences were designed with a C-terminal tag that consists of 30 codons that encode for a linker and GFP11 (See Supplementary Information ). The tag is used for a Split-GFP assay where it complements to an intact GFP upon addition of the detector fragment GFP1-10, resulting in fluorescence that is proportional to the amount of the expressed protein, thereby allowing quantification of the expressed protein. The optimized sequences display variation in the number of synonymous changes made. Relative to the wildtype, the sequences from Twist Bioscience show the largest number of point substitutions, with about 35% of the original sequence altered. These substitutions translate to about 92-94% of codons changed. The rest of the optimization algorithms result in smaller modifications. About 22-26% of the sequence is altered by point substitutions by the rest of the optimizers, corresponding to 60-67% of codons changed. Thus, sequences optimized by Twist Bioscience are unique in the amount of modification they undergo. Prior to analyzing the expression features of these sequences, we first investigated how they differ in their efficiency profile, and second, how they are distributed in sequence space. Figure 1 shows the local elongation time profiles of all optimized sequences for three different sets of codon-specific elongation time estimates, all inferred via distinct theoretical approaches. The first set of elongation time estimates was inferred from genome and protein expression rate data with the AnaCoda package [ 22 , 23 ] ( Fig. 1A ). A second set of estimates was inferred from ribosome occupancy data by Rudorf et al. [ 26 ] ( Fig. 1B ). The third set was inferred from tRNA abundance data by Shah and Gilchrist [ 27 ] ( Fig. 1C ), see Materials and Methods for detailed descriptions). For all three sets of elongation time estimates, the vast majority of approaches for codon optimization decrease the average time of translation relative to the original wildtype sequence, or equivalently, raise the average translational rate per codon. Download figure Open in new tab FIG. 1: Translation efficiency profiles by optimization algorithm. Locally estimated scatterplot smoothing (LOESS) curves of the codon-specific relative elongation times (y-axis) for all three sets of elongation time estimates (panels A to C) by codon position (x-axis). A: LOESS Curves for the AnaCoda-derived relative translation inefficiencies - a proxy for elongation times-given relative to the optimal codon within a family. B: LOESS-curves of efficiency profiles based on the Rudorf et alḋata set [ 26 ]. C: LOESS curves for the efficiency profiles based on the Shah and Gilchrist codon-specific elongation time estimates [ 27 ]. For algorithms with multiple optimized sequences, the elongation time profiles were aggregated prior to running the LOESS regression due to low within-algorithm sequence variation. The efficiency profiles of the optimized sequences reflect the different strategies used by the algorithms. All open-source based strategies follow an all-optimal codons approach, but with differences in the identity of the optimal codons. The exception are the ICLS-based sequences, which duefully replicate local elongation time patterns of the wildtype. In contrast to the open-source algorithms, proprietary algorithms do not solely follow an alloptimal codons strategy, as evidenced by their deviation from the AnaCoda and JCat sequences, which in fact do follow such a strategy. Instead, maximum efficiency codons are frequently traded off for codons that optimize other, undisclosed features - most likely features linked to the probability of obtaining viable protein. The wildtype sequence has among the highest average elongation times of all sequences –in line with expectation– together with some instances of the ICLS-based sequences. We proceeded to utilize these efficiency profiles to characterize the inter-sequence relationships among the optimized sequences via Multidimensional Scaling (MDS, see Materials and Methods ). Figure 2 shows the MDS of all the optimized sequences computed with the elongation time (for AnaCoda-based data, Figure 2A ) and elongation rate (for Shah and Rudorf data, Figures 2B and C ) profiles of these sequences as derived from all three codon-specific elongation time data sets. Additionally, panel D shows the MDS derived from the Hamming distance as a similarity metric. The different codon-specific elongation time data sets give rise to qualitatively similar graphs. Sequences produced by the same algorithm tend to cluster together for all three data sets. The ICLS-based sequences, the Twist sequences and the wildtype produce isolated clusters, indicating idiosyncratic underlying design principles. The non-proprietary sequences are found at the periphery of the graphs, nearby a larger cluster of the rest of the sequences in which boundaries between algorithms are more fluid. Download figure Open in new tab FIG. 2: Multidimensional scaling for four different distance metrics between sequences. For A, the stress function after MDS optimization based on sequence distances of AnaCoda-derived elongation times was 16.8. For B and C, sequence distances were based on elongation rate profiles (Shah and Rudorf data, respectively), and the stress values were 34.8 and 23.4, respectively. D Hamming distances between sequences were used as similarity metric (stress: 35.8). The distinctions discernible among algorithms are much more pronounced when Hamming distances are taken as the similarity metric. In Figure 2D , most sequences are spread out along a line stretching from position (0,−100) to about (0,40). Single sequence clusters by the same algorithm remain discernible, but the clusters increasingly overlap for sequences close to position (0,−100). This suggests that the sequences’ positions relative to the wildtype in sequence space may be approximated as being spread out along a particular axis. Across all MDS outputs, no clear trend for increasing protein expression is identifiable along any direction. These results suggest that the commercial providers do deploy optimization algorithms that give rise to distinct sequence types, but that these differences diminish in importance if elongation time or rate similarity metrics are adopted. Protein expression levels of codon-optimized AtCAD5 sequences Next, we assessed by how much the optimized sequences were able to raise protein expression levels relative to wildtype. To this end, all optimized sequence variants were expressed in BL21(DE3), a standard E. coli strain for biotechnological protein expression. To investigate the transferability of the findings to other strains, a subset of sequences was also expressed in K12(DE3), another standard research strain (from here on we term the strains BL21 and K12, respectively). To assess expression levels, a Split-GFP assay was performed, and relative protein amounts were determined by fluorescence signal [ 28 ]. For each optimized sequence, a total of eight replicates (two quadruplicates) were generated to measure fluorescence. Additionally, activity was determined in enzymatic assays using cinnamyl aldehyde as substrate. The optimization methods reliably generate high-expression products (see Figure 3 ). For the BL21 strain, 95.5% of the optimized sequence replicates exceed the mean wildtype expression level. Only the mirrored and capped -strategy designed by the ICLS attains overall inferior expression levels than the wildtype. In the K12 strain, 91.3% of the replicates exceed the mean of the K12-wildtype. Here, it is one of the Twist sequences that yields lower average expression values than the wildtype. For the BL21(DE3) strain, the average fold increase over all optimized sequences was 1.61, whereas for K12 it is 1.96. Download figure Open in new tab FIG. 3: Empirical density distribution of the protein expression levels of all replicates of the 22 optimized sequences of BL21 and K12 strains. The red and green vertical lines represent the mean wildtype expression levels taken over all eight replicates for BL21 and K12 strains, respectively. Figure 4A shows that the maximum mean fold increase of 1.95 for the BL21 is attained by the optimized sequence NovoPro [ 1 ] , designed by the Novo-Pro Bioscience. For K12, the maximum expression is attained by the JCat sequence, with 1.85 mean fold increase. The mirrored and capped strategy for the BL21 strain displays exceptionally low protein expression levels. Download figure Open in new tab FIG. 4: Protein expression characteristics of codon optimized sequences of the AtCAD5 gene for BL21 and K12. (A) Protein expression levels (in arbitrary units (AU) of fluorescence levels) for all 22 optimized sequences expressed in the BL21, and for 13 sequences expressed in K12. (B) Protein expression levels of BL21 versus K12. Estimate uncertainties are given with two standard deviations (sd) –one sd for each direction pointing away from the mean– of eight replicate measurements taken per sequence and strain. Standard linear regression is shown as a blue line, while the grey area represents the 95% confidence range of possible regression line slopes under a Student t-distribution. (C) Protein expression levels versus enzymatic activity for expression products from both BL21 and K12. Uncertainty ranges are computed as in B). Figure 4A reveals that for the BL21 strain, the intra-algorithm variation –that is, the variation in yield among sequences generated by the same algorithm– is generally small compared to the over-all range of protein expression levels, as seen in the cases of NovoPro, IDT and Genscript, with the exception of the Twist-based sequences. For BL21, intra-algorithm variation is also larger in magnitude to intra-sequence variation –the variation in yield of eight replicates of the same sequence. Notably, all open-source algorithms reliably expressed very high protein levels, all close to the 33’000 AU from Novo-Pro [ 1 ], the most highly expressed sequence on average. Figure 4A also reveals significant stochasticity among the eight replicates assessed per sequence, especially for the K12 strain, mostly arising from inter-quadruplicate variation. To test for the presence of strain-specific effects on expression, we computed the correlation of BL21 and K12 expression levels, finding a correlation of 0.83. Linear regression between expression levels across the two strains yields an estimated slope of 0.7, where the null that the slope equals zero is rejected with p-value < 0.001. This indicates that qualitatively as well as quantitatively, the expression characteristics of the sequences are very similar among the two strains. The readouts from Split-GFP must be interpreted with care regarding functionality. Especially proteins at high expression levels are prone to improper folding and formation of inclusion bodies. To assess whether the expressed proteins are functional, we determined the enzymatic activity of the proteins generated by all sequences for BL21 and K12. To quantify the active enzyme, depletion of NADPH was monitored at A 340nm (absorbtion at 340 nm) upon consumption of the substrate cinnamal aldehyde. Figure 4C ) shows that protein activity and expression levels are highly correlated across both strains BL21 and K12, with 0.8 overall correlation. Linear regression between expression and activity levels confirms the presence of a significant association with p-value < 0.001. However, closer visual inspection suggests the presence of an activity plateau after about 20’000 AU. This indicates that above a specific expression level, protein folding might become impaired, resulting in a partially non-functional protein pool. In conclusion, we find that proprietary as well as open-source algorithms are able to consistently raise protein expression levels relative to the wildtype, with the exception of the ICLS-based mirrored and capped sequence in the BL21 strain and the Twist [ 3 ] sequence in the K12 strain. We find that the Split-GFP assay is well-suited to capture the protein expression landscape of the optimized sequences examined, with some limitations regarding assessment of protein functionality, and that this expression landscape retains its basic properties when expressed in a different E. coli strain. Our results however are also in line with previous observations that too high expression rates eventually result in non-functional protein presumably due to impaired protein folding, as shown in the enzymatic assays. Evidence for negative epistatic effects among synonymous codons on protein expression levels Next, we addressed the question of whether the contributions from individual loci to overall protein expression combine in an additive, non-interactive manner, or whether alternatively, codons contribute non-additively, that is, whether they interact epistatically. To this end, we combined relational information on sequences with information on their measured expression. The methods for testing epistasis in these combined data is taken from theory devised to study epistasis in the context of population genetics. For the purposes of protein expression maximization, one relevant insight from epistasis research is that the presence of epistatic effects in protein expression impedes the prediction of the protein expression level of a sequence – whereas if epistasis is low or absent, only relatively small numbers of sequences have to be measured to provide accurate predictions for unobserved sequences [ 29 ]. This is due to epistasis giving rise to a rugged sequence-to-expression landscape, while the lack of epistasis gives rise to a gradually rising landscape [ 29 ]. Several methods have been proposed to assess the presence of epistasis in population genetics contexts [ 29 , 30 ]. One method, in particular, can be adapted to the data set of this study to address the question of whether codons in a sequence interact to affect the resulting protein expression level. The method is based on first determining a reference sequence, which is set to have zero log-fitness. This sequence is assumed to occupy a state of high fitness relative to its genomic surroundings. Log-fitness is then measured for sequences with deleterious mutations via regression, where w is the organism fitness, n is the Hamming distance to the reference strain and a and b are regression coefficients [ 30 , 31 ]. A negative b coefficient is indicative of negative epistasis, while a positive value of b is interpreted as the presence of positive epistasis. Similar studies have been carried for relative short genomes in terms of log-fitness [ 32 ]. Since selection operates on the phenotype of an organism, the fitness of an organism is typically thought to receive contributions from a variety of traits of the organism. If such trait –such as protein expression level– is closely linked to fitness, it may be interpreted as an intermediate feature from which fitness is derived –in particular if all other traits are held constant in an experimental setting. Therefore, the methods and logic utilized to investigate epistatic effects on fitness are also applicable to the trait itself. Thus, to test for epistatic interactions in the generation of the trait of protein expression, we adapted this method to the specifics of our data set. We applied the regression against the data. Here, p is the protein expression level, p ref is the protein expression level of the reference sequence, and again, n is the number of deleterious mutations separating a reference sequence from any other sequence. The coefficients a and b are interpreted as above [ 30 , 31 ]. To test this hypothesis, we first identified the best-performing sequence in both strains. The JCat sequence stands out as the one sequence which produces very high expression levels in both BL21(DE3) and K12 (see Figure 4B ). As the JCat sequence is best performing sequence, the mutations that separate other sequences from it are assumed to exert a deleterious effect upon protein expression. We utilized the Hamming distances computed for the multidimensional scaling computations between JCat and all other sequences to fit the regression formula (2) to the data. Figure 5 shows that the regression curve adopts a convex shape in both strains, BL21 an K12. Download figure Open in new tab FIG. 5: Negative epistatic effects among synonymous mutations combine to affect protein expression in BL21. Upper panel: Hamming distance from reference strain –the JCat optimized sequence– versus protein expression levels in units of the mean protein expression of the JCat sequence, minus one. Data points are shown for all measured replicates per sequence. No outliers were omitted. The fit of model (2) is shown as a royal blue line. The grey area around the regression line indicates the 95% uncertainty in the estimates of the two evaluated coefficients. Lower panel: Analogous to upper panel, but for the data measured in the K12 strain. Fitting (2) on the combined data from BL21 and K12 yields a very strong statistical signal for a negative b value (see Table I , generated with the stargazer -package [ 33 ] in R [ 34 ]). This is indicative of negative epistatic interactions governing the generation of protein expression levels. Running the same regression model, (2), on the strain-specific data sets individually yields more varied results. For BL21, the results of the aggregated data set are largely replicated. The statistical signal for a negative b -coefficient is strong, estimated at b = −7 · 10 −6 with p < 0.001 for the null hypothesis H 0 : b = 0. The adjusted R-squared for the regression is 0.54 at a number of observations of n = 184 (here, each replicate accounts for a data point). While the estimate for the b coefficient in the K12 data is negative ( b = −2.7 · 10 −6 ), it fails to attain statistical significance, most likely due to a limited number of observations ( n = 112 versus n = 296 in the aggregated data set). Summarizing, our data is consistent with the notion of negative epistatic effects governing the protein expression levels of the CAD5 sequence. While individual analysis of the K12 data set fails to attain statistical significance for negative epistasis, all data sets that exceed K12’s number of observations do, raising confidence that the effect is present. Discussion To express heterologous proteins at high yields, many biotechnological approaches today rely on a handful of microbial host organisms such as E. coli . However, optimization of expression for high yields is still a challenge. To address this issue, it is currently standard procedure to rely upon codon optimization, the synonymous modification of the original sequence to match the genomic and environmental features of the host organism. Codon optimization is typically offered, free of charge, by most commercial sequence providers. The providers commonly claim that their optimization will –with high probability– result in functional protein at a high yield. However, comparative studies to examine these claims are scarce [ 11 , 16 ]. The expectation that a model may be able to predict which synonymous sequences will yield high expression levels with accuracy is hard to justify on the grounds of the current body of theory on the mechanisms of translation. How codon usage bias in particular influences expression is determined by how synonymous changes affect translation, which remains subject to intense research efforts. In fact, a plethora of factors related to synonymous sequence structure have been found to exert significant influences on protein expression [ 12 ]. Examples include the presence of codon ramps at the 5’-end of the sequence, mRNA secondary structure, tRNA abundances, microorganismal growth rates, GC-content, codon usage bias and so forth [ 11 , 12 ]. These developments cast further doubts on the promise of accurate protein yield prediction from sequence information alone. The predictive capability of any model aiming to derive protein expression levels from sequence structure relies on its capacity to capture the inherent features of a part of the genotype-phenotype map for that sequence [ 29 ]. The features of such a genotype-phenotype map are open to empirical study. These studies allow to ascertain whether the map displays features –for instance a pronounced inherent ruggedness– that preclude accurate prediction by means of standard approaches or not. This study contributes to addressing this issue on three fronts. First, we validate that a main property of currently employed optimization algorithms is the reduction of total elongation time, or conversely, the increase of translational speed. We further represent our set of optimized sequences in via multidimensional scaling (MDS), and find that sequences of the same algorithm tend to cluster, confirming their common origin and high similarity. When using Hamming distances, MDS yields more pronounced algorithm-oriented clustering, indicating the presence of unique design principles. Second, we use a recently developed protein expression quantification technology, Split-GFP, for the purposes of genotype-phenotype map characterization on a protein that is not GFP itself. The technology yields measurements of high validity and reliability, as confirmed by further tests on the association of protein expression exhibited by two different E. coli strains. Third, we find that the synonymous genotype-to-phenotype landscape for protein expression is influenced by negative epistasis, and thus is very likely exhibiting ruggedness. This implies that fundamental limitations are imposed on the accuracy of protein expression prediction via naïve approaches by the nature of intra-sequence codon interactions. Thus, our results suggest that the study of the origin of epistatic effects on protein expression may help improve the predictive accuracy of protein expression models. Our results come with a series of caveats. For instance, we have explored the expression landscape of a fraction of the possible synonymous sequences of a single heterologously expressed gene, AtCAD5. Whether the specific biochemical features of AtCAD5 or its protein product preclude the generalization of our results to other is hard to predict. However, most larger expression landscape studies of synonymous sequences focus on green fluorescent protein (GFP) only [ 35 , 36 ]. Our study contributes to the investigation of the robustness of the GFP-derived conclusions by the exploration of another protein that is not GFP. We also lack a firm explanation for the failure of a subset of sequences to attain similar expression levels as the rest of the sequences, notably the mirrored and capped sequence in BL21(DE3), and the Twist [ 3 ] sequence for K12. One possible reason is the formation of dense aggregates of inclusion bodies upon high expression, which do not only affect protein activity, but also mask the GFP tag. The formation of fluorescent GFP entities would be hindered in such cases, rendering the Split-GFP assay incapable to detect such aggregates. Furthermore, such unexpected declines of protein expression below the original sequence’s level, are consistent with a rugged synonymous genotype-phenotype map. Ruggedness implies the presence of “expression valleys”, that is, sequences or localized sequence clusters where expression suddenly falls off. The presence and location of such valleys is hard to pinpoint by means of codon usage bias theory alone. Furthermore, the data presented in Figure 4 suggest a saturation of the activity of the expressed proteins with increased fluorescence. This suggests that increased production of the desired target protein does not necessarily lead to a corresponding amount of functional protein. This highlights an indirect problem that is encountered in biotechnological applications for optimal protein expression: apart from the DNA sequence itself, an optimization of the entire expression system and conditions is required. A possible explanation for this plateauing behavior is that two of codons’ translationally relevant properties could be inversely correlated: more translationally fast codons could be less accurate. Here, accuracy refers to several translational errors: amino acid misincorporation (missense errors) or more severe processivity errors, such as ribosomal drop-off [ 9 , 37 ]. In such a scenario, the artificial increase of the efficacy of a sequence by means of codon optimization –that is, the enrichment of sequences with fast codons– would, as a side effect, imply the artificial accrual of inaccurate codons in the same sequence. This would ensure high numbers of protein, but the fraction of functional products would be diminished. One strategy to counteract this problem has previously been introduced by the usage of a dual reporter system, that in addition to protein expression also reports the presence of falsely folded inclusion bodies [ 38 ]. This framework could by deployed to address these questions in a more precise manner. A further concern relates to the sample of the sequences analyzed for the test for epistasis. The set of optimized sequences used was not randomly sampled around a well-known fitness optimum. Therefore, the tests performed for the detection of epistasis may be biased towards above-average expression, if the optimization algorithms were to possess the property that they are capable to raise protein expression with a minimum amount of synonymous changes. However, this does not appear to be a target of the optimization function, since often, the number of synonymous changes proposed by the algorithms range from a minimum of 200 synonymous changes to almost 350 changes that is, given a sequence with a codon length of 360, almost every codon is exchanged in some instances. Furthermore, the test for epistasis does not explicitly state that the sampled sequences around the optimum must be sampled randomly [ 31 , 39 ]. A last caveat concerns the identity of the optimal sequence for protein expression in the investigation of epistasis. The test for epistasis assumes that the reference strain occupies a local or global optimum, such that all other sequences are separated from the reference sequence by mutations that add a deleterious effect to protein expression. Thus, the misidentification of the best expressing sequence could lead to biases that may affect the test, because if the true best performer sequences are close to the reference strain, a regression curve will have to curve down-wards to capture this feature, leading to negative b coefficients. To address this issue, we ran several regressions on a subset of possible candidates for best performing sequences, always retrieving our original results. This indicates that the signal for the result stems from the expression characteristics of sequences that are at large Hamming distances from the reference sequences of in the candidate subset, and not the aforementioned possible statistical artefact. Our results should be interpreted in the context of the broad body of evidence linking codon usage to protein expression and its use for codon optimization. Codon usage has been shown to greatly impact protein expression [ 11 ]. However, the variation in obtained gains by codon optimization is large, typically ranging between five to fifteen fold gain in E. coli [ 3 , 12 ]. While several sequence features are accounted for during optimization, such as codon adaptation index, GC-content, avoidance of repeats as well as other motifs (see Table 2 in [ 3 ]), the exact nature of many optimization algorithms remains confidential [ 11 , 16 ]. A large fraction of optimization approaches rely on all-optimal codon strategies [ 40 ], but the research field as a whole is moving away from such naïve algorithms [ 12 ]. The large variations in protein expression gains as well as in optimization approaches underscores a conspicuous lack of understanding in the mechanisms that give rise to the protein expression level of a sequence [ 11 , 16 , 41 ]. In fact, a recent study achieving a correlation coefficient of 0.76 between measured and predicted expression levels relied on machine learning instead of a mechanistic theory to produce estimates [ 42 ]. Reviews report that about 40% of soluble protein targets fail to reach meaningful expression levels [ 4 ]. Current limitations to accurately predict protein expression are also reflected by a paucity of predictive theory [ 4 , 42 ]. The majority of theoretical work on codon usage bias relating to protein expression is aimed at understanding how evolution may shape sequences to maintain a certain protein expression level at minimal energetic expenditure [ 5 , 8 , 9 , 27 , 37 , 43 , 44 ], or alternatively, at leveraging statistical associations identified from large data sets to improve predictive quality [ 45 – 48 ]. One of the few instances where protein expression prediction is grounded in a mechanistic foundation, is given in a series of papers by Gilchrist and Shah. These authors have pioneered the approach of explaining codon sequence structure via evolutionary principles [ 5 , 8 , 9 , 27 , 37 , 43 , 44 , 49 ]. This line of research focuses on determining a function, , that appropriately represents the cost to benefit ratio of a sequence, . Natural selection works towards minimizing η , while being counteracted by mutation (with corresponding, organism-specific biases). The selection pressure exerted upon the sequence to-wards optimizing its cost-benefit ratio is assumed to be proportional to the target protein expression rate of that sequence. Thus, if the population is assumed to be in mutation-selection balance, the observation of a sequence with a given η holds information on its target protein expression rate. Landerer, Gilchrist and Shah have refined Bayesian methods to predict this target protein expression rate given a sequence [ 37 , 43 , 49 ]. However, it is important to note that this method is geared towards predicting target protein expression rates only, that is, the rate or protein expression that is optimal for the organism’s survival in the environment it adapted to. Our results do corroborate some of the findings of other, more extensive explorations of the synonymous sequence space of a sequence, but are also at odds with some of their findings. For instance, Kudla et al. expressed 154 synonymous sequences of the reporter protein GFP, and found no correlation between codon usage bias and fluorescence levels [ 35 ]. Intuitively, this would imply that the strategy underlying most of the codon optimiziation strategies studied here should not have been effective. However, we do observe that the most codon biased sequences –AnaCoda, JCat and GeneGA– do indeed result in the highest expression levels. Furthermore, the study reported very large increase in fluorescence (i.e., protein expression), of about 250 fold of the original sequence [ 35 ]. The two fold increases observed here are much smaller. It is possible that either other variables of the sequence, such as its length and distance from the possible optimum, affect the potential for gain. Perhaps the most extensive exploration of protein expression patterns in synonymous sequence space has been carried out by Cambray et al. [ 36 ]. The authors statistically analyzed the effect of eight factors conjectured to govern protein expression levels of 244’000 sequences in E. coli , and concluded that about 50% of variation remains currently unexplained [ 36 ]. Codon usage bias did explain around 26% of variation under conditions where translation initiation was facilitated. If such facilitation was absent, codon usage biases contribution was strongly diminished to negligible levels ( Fig 2a in [ 36 ]). Their analysis suggests that mRNA secondary structures located around the 5’-end exert the strongest influence upon expression levels, accounting for 83% of total effect, in accordance with what was found in later analyses of expression [ 42 , 50 , 51 ]. Corroborating our results, Cambray et al. found evidence for “roughness” of the sequence expression landscape [ 36 ], an expected consequence of epistatic effects. The authors concluded that this suggests the presence of unknown sequence descriptors containing important predictive information. We hypothesize that such roughness may arise from epistasis: Individual codon contributions may add up disproportionately, depending on codon position and genetic background [ 29 ]. A possible mechanism for such epistasis was identified by Cannarozzi et al. [ 52 ], which found that clusters of similar codons appear to enhance translation efficiency by facilitating the reusage of the same tRNA. Thus, the presence of the same codon upstream may enhance the contribution to expression of a codon beyond what would be expected if that first codon was missing, giving rise to epistasis. The mirroring techniques developed at ICLS are similar in spirit and implementation to the codon harmonization techniques pioneered by Angov et al. [ 53 ]. Similarly to the mirroring technique adopted here, the rationale behind codon harmonization is the conjecture that clusters of non-optimal codons serve the purpose of aiding the nascent protein to fold correctly [ 54 ]. To optimize the source sequence, they first established a set of codon usage frequencies for each amino acid residue in the source organism’s genome. An analogous list was then produced for the host organism. Original codons where replaced by hast codons that most closely matched usage frequency [ 53 ]. Link/end segments are treated by a special procedure. Unlike the mirroring technique used here, codon harmonization thus relies on usage frequencies instead of translational inefficiency estimates (inferred by the AnaCoda package) to inform the codon replacements. Using this technique, Angov et alṡuccessfully codon harmonized the sequence for MSP1 (FVO) from Plasmodium falciparum -a candidate antigen for a Malaria vaccine- and achieved a sixty-fold increase in protein yield when expressed in E. coli . Our results differ starkly the mirrored sequences were among the least expressed. However, the poor expression attained by the mirrored and capped sequence, where all of the most inefficient codons were replaced by the second most inefficient codons, suggests that the rationale underpinning this approach warrants more detailed investigation [ 55 ]. The quest for understanding protein production is ongoing [ 16 ] and the question whether there exist fundamental limitations to its predictability remains to be answered conclusively. But a more profound understanding of the underlying mechanisms of protein synthesis holds the promise to acquiring the capacity to engineer synthetic organisms, for instance via designed genetic codes [ 56 ]. Materials and Methods Data Plasmid collection and strain preparation All DNA sequences for expression were designed as fusion proteins, containing a C-terminal linker with the GFP11 fragment that allows detection of expression in a split-GFP assay. Cloning and ligation of the wild type sequence of AtCAD5 into pET28b was performed using standard restriction and ligation techniques. The wild type sequence of AtCAD5 was supplied as gene block from Integrated DNA Technologies (IDT). NdeI and XhoI sites were used for the restriction, followed by ligation by the T4 DNA ligase of the gel-purified DNA fragments. Correct insertion of the wild type sequence of AtCAD5 was confirmed by sequencing. Remaining DNA sequences for AtCAD5 were ordered at Twist Bioscience as clonal genes in pET28b, whereas a selection of constructs was generated by adapting specific codons to the desired codon sequences (Supplementary Table 1). To that end, a PCR using a primer pair that contains the desired nucleotide change was performed, followed by digestion of the template strand by DpnI (see Supplementary Information on codon optimized sequences). Correct plasmid sequences were confirmed by sequencing. Plasmids were transformed into chemically competent E. coli BL21(DE3) or electrocompetent E. coli K12(DE3) cells for expression. E. coli K12(DE3) corresponds to the K12 strain MG1655 ΔendA ΔrecA (DE3) and was a gift from Kristala Prather (Addgene plasmid # 37854)[ 57 ]. Preparation of Split-GFP assay detector fragment GFP1-10 The previously reported DNA sequence for the GFP1-10 detector fragment was obtained as gene block from IDT [ 58 ]. Amplification and elongation with flanking NdeI and HindIII restriction sites was performed by PCR. The amplified fragment and pET22b were digested, DNA fragments were gel purified and ligation using the T4 DNA ligase was performed. Correct insertion of the GFP1-10 fragment into pET22b was confirmed by sequencing. Expression was performed in E. coli BL21(DE3), with all media being supplemented with 100 μ g/ml ampicillin. A 10 L fermentation in a 15 L reactor was conducted in TB (terrific broth) medium. Inoculation was performed 1:20 using a TB overnight culture grown at 37 ° C, 180 rpm. Cultivation parameters were kept at 37 ° C, a pH range of 7.0 – 8.5 and p O2 > 35%. Expression of GFP1-10 was induced with 100 μ M IPTG at OD 600 5.5, continuing cultivation for 22 h. The biomass was pelleted by centrifugation at at 3428 x g, 4 ° C for 20 min and stored at −20 ° C until further usage. The purification of GFP1-10 from inclusion bodies was achieved based on previously established protocols [ 28 , 58 ]. Cell pellets were resuspended in 15 ml per 5 g wet biomass of TNG buffer (100mM Tris-HCl pH 7.4, 100 mM NaCl, and 10% glycerol) and incubated on ice for 5 min. Cell disruption was performed in three cycles by ultrasonication (2 min, amplitude 50%, pulse 1s/1s). The inclusion bodies were pelleted by 30 min centrifugation at 19000 x g and 4 ° C, followed by 6 cycles of resuspension in 10 ml TNG buffer and 10 min centrifugation at 3428 x g and 4 ° C. The final centrifugation step was performed at 16000 x g at room temperature, and the pellet was stored at −80 C until further usage. The inclusion bodies were resuspended in 1 M urea to a concentration of 75 mg/ml. Following 1 h agitation at 600 rpm at room temperature, the solution was centrifuged for 1 min at 16000 x g, before transferring the supernatant to 25 ml TNG buffer. Concentration of GFP1-10 was determined by a NanoDrop One UV-Vis Spectrophotometer (280 nm, 24.443 kDa, 17.545 l mol −1 cm −1 ) and adjusted to 0.4 mg/ml with TNG buffer. Aliquots were stored at −20 ° C until further usage. Protein expression for lysate preparation Growth of E. coli BL21(DE3) and E. coli K12(DE3) for expression of codon variants was performed in quadruplicates in 96-well deep well plates in two independent experiments, resulting in a total of eight replicates per strain. All media were supplemented with 50 μ g/ml kanamycin. Overnight precultures in 900 μ l LB medium were grown at 37 C, 300 rpm. Precultures were used to inoculate 1200 μ l fresh TB medium in a 1:1000 ratio. Induction with 100 μ M IPTG was performed in early-mid exponential phase at OD 600 1.21 ± 0.27 or 0.75 ± 0.08 for BL21(DE3) or K12(DE3) respectively, followed by expression at 20 ° C. After 20 ± 2 h, the well volume was adjusted to 900 μ l and the plate was centrifuged for 20 min at 3428 x g at 4 ° C. The supernatant was discarded, and cell pellets were stored at −20 ° C until further usage. For preparation of lysates, each pellet was resuspended in 180 μ l of BugBuster ® Master Mix supplemented with 3.23% (v/v) Protease Inhibitor Cocktail Set II. For strains with lower end OD - mostly affecting K12(DE3) strains that generally grow lower than BL21(DE3) - the resuspension volume was reduced to maintain comparable biomass concentrations of 0.083 ± 0.030 OD/ μ l in the lysate. Following incubation for 30 min at room temperature, the plate was centrifuged for 20 min at 3428 x g at 4 ° C. The supernatant was transferred to a fresh plate for analysis of protein expression and activity. Split-GFP assay In black 96-well microtiter plates with clear bottom, 12.5 μ l cell lysate from deep well plate expression were mixed with 187.5 μ l GFP1-10 detector fragment. The plate was sealed and incubated at 32 ° C and 100 rpm for 20 h, to allow stable formation of fluorescent GFP-complexes signal. Fluorescence was measured with a Tecan Infinite ® 200 PRO plate reader, using extinction at 485 nm and emission at 520 nm. Within each assay plate, the average of the background GFP signal using the empty pET28b vector was subtracted from all other strain signals. The average and the standard deviation of all replicates was determined. Replicates that deviated more than 1.5 x standard deviations from the mean were considered outliers and were subsequently excluded from calculations. Enzymatic detection of AtCAD5 activity Enzyme activity was determined using clear 96-well microtiter plates with clear bottom similarly as previously reported in [ 59 ]. The reaction mixture for the assessment of enzyme activity in the lysate consisted of a final concentration of 485 μ M NADPH and 2.4 mM cinnamal aldehyde in 0.1 M Sodium Potassium Phosphate buffer (pH 6.25). A fresh 30x dilution of cell lysate from deep well plate expression was prepared in 0.1 M Sodium Potassium Phosphate buffer (pH 6.25). To initiate the reaction, 10.0 μ l of the diluted lysate was added to the reaction mixture. NADPH consumption was followed by A 340nm measured with a Tecan Infinite ® 200 PRO plate reader over at least 2 min in 11.7 sec intervals. To assess AtCAD5 activity in the lysate, the inital slope of the reaction was determined. The average and the standard deviation of all replicates was determined. Replicates that deviate more than 1.5 x standard deviations from the mean were considered outliers and were subsequently excluded from calculations. Genomic data for A. thaliana and E. coli used for Codon Optimization In this study, we used two genome data sets – one for Arabidopsis thaliana ( A. thaliana ) and one for E. coli – as well as a protein expression data set – for E. coli – to calibrate our codon optimization model via the AnaCoda package. For A. thaliana , we downloaded the coding sequences from the Arabidopsis thaliana information resource (TAIR) [ 60 ]. These sequences contained in the Araport11 blastsets, file Araport11_cds_20210622.fasta. The wildtype sequence for the gene AtCAD5 was also downloaded from TAIR, reference At4g34230. For E. coli , we obtained the coding sequence data for the strain K12, substrain MG1655, from the National Center for Biotechnology Information (NCBI), accession number NC 000913.3. The data include 4319 protein coding sequences (see https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/511145/ [ 61 – 64 ]). Protein expression level data for E. coli Protein expression data for E. coli were generated in [ 7 ], and were converted into a useful format in the publication [ 49 ], and can be found on the Github project https://github.com/acope3/Signal_Peptide_Scripts/blob/master/Data/Empirical/ROC_Phi/Ecoli_K12_MG1655_main_phi.csv . Briefly, to convert the data from [ 7 ], the authors of [ 44 ] utilized the mRNA translational efficiency values in the Supplementary Table S4 in [ 7 ], and multiplied them with the corresponding mRNA levels of the same gene to obtain the protein expression level for that gene. These levels were then normalized in such manner that the average expression level is one. Codon-specific elongation rate data for E. coli In this study, we used three different sets of codon-specific elongation rate data for E. coli . The first set was estimated by Shah and Gilchrist [ 27 ] mostly on the basis of tRNA abundancies. They estimated cognate elongation rates of codons by combining gene copy numbers of cognate tRNA (as proxies for cognate tRNA abundances) and probabilities of elongation of near- and pseudo cognates. The table with the codon-specific elongation rate values (column R c ) was retrieved from Table S2 of the Supporting Information of [ 27 ]. The second set of codon-specific elongation rates was derived from elongation time estimates. Proxies of elongation times were estimated via de AnaCoda package’s ribosomal overhead cost (ROC) model, which assumes that the selection coefficient of a synonymous sequence variant is directly proportional to the overall elongation time of the sequence [ 22 , 43 ]. To obtain a table of relative codon-specific elongation times, we ran AnaCoda’s ROC model on the full E. coli genome with normalized protein expression level data. The values in the table of estimates corresponds to codon-specific overhead costs, which are assumed to be proportional to the elongation time estimates for each codon. All the overhead costs within a synonymous codon family are given relative to the least costly codon within the family, which by default has a cost of zero. A third data set by Rudorf and Lipowsky was estimated on the basis of ribosome occupancy data [ 26 ]. The authors employed a Markov model to infer codon-specific elongation rates under different sets specific bacterial growth rates, assuming a 2-1-2 pathway of tRNA release at the E site. Their estimates are found in the S2 Table of the Supporting Information of [ 26 ]. We used the estimates for a specific growth rate of 0.7 per hour, which is close (within 0.2 per hour) to the growth rate measured for E. coli in the experimental setup of this study for all replicates examined. Summary of external data sources Table II gives a summary of the data used from external sources, including coding sequences for the gene to be optimized, genomic data, and especially data on protein expression levels in E. coli and codon-specific translation information for E. coli as well. Codon Optimization of the Sequences Anacoda-based all-optimal codons strategy for sequence design For the design of the codon optimized sequence termed AnaCoda , we first ran the ribosome overhead cost (ROC) model [ 27 , 43 ] of the AnaCoda package [ 22 ] on the E. coli genome, including information on normalized protein expression levels measured during E. coli ‘s exponential growth phase. This regression yielded estimates for both, codon-specific mutational biases as well as codon-specific biases in the cost of translation as would by inferred if these were the main targets of selection for synonymous codon change. Under the ROC model, the cost of translation corresponds exactly to the sum of the elongation times of all codons of the sequence times a factor to convert these into selection coefficients. Thus, the codon-specific cost estimates from AnaCoda are directly proportional those codons’ elongation times [ 27 , 43 ]. As a caveat, the translational costs can only be computed relative to the least costly (that is, most time-effective or translationally fast) codon within a codon family for the same amino acid. With these preparations, we proceeded to replace each codon of the AtCAD5 wildtype sequence with the optimal codon of its family as estimated under the ROC model. That is, each codon at every position was replaced by a codon that under the ROC model has a translational cost of zero, and thus the smallest elongation time within its codon family. Notably, the optimized sequence differs from other sequences optimized by other, more classical codon optimization algorithms. This is because for the amino acid threonine (T), AnaCoda identifies the codon ACT as optimal, while JCat identified the codon ACC as optimal. Proprietary sequences of CAD5 To obtain codon optimized sequences from the commercial providers, we utilized the optimization algorithms freely available on the providers’ official websites. Table III gives a summary of the urls where these proprietary algorithms can be accessed. If the optimization algorithms worked by stochastic optimization principles, we repeated the optimization three times. All optimization algorithms allowed for the exclusion of the restriction sites BamHI [GGATCC], EcoRI [GAATTC], EcoRV [GATATC], NdeI [CATATG] and XhoI [CTCGAG], necessary for plasmid construction. Anacoda-derived mirroring optimization strategies To mirror the original elongation rate pattern in the new organism, we first had to assess codons’ translational efficiencies in their original organism’s genetic environment. We ran a regression of the ROC model of the AnaCoda package [ 22 , 23 ] on the A. thaliana genome, yielding tables of relative codon-specific elongation times for the A. thaliana genome. Subsequently, the same regression procedure was performed for E. coli (additionally taking protein expression levels as input), yielding an analogous, codon-specific translational efficiency table. The wildtype sequence was then optimized as follows: Each original codon was replaced by a codon that occupied the same relative efficiency rank within its synonymous codon family, thus replacing the most inefficient original codons with the most inefficient new codons, and so forth. In the publication, we term this the mirrored sequence. The code detailing this algorithm is available in the Supporting Information . In a second approach, the mirrored sequence was pruned of restriction sites (see section Proprietary sequences of CAD5 for a list of the restriction sites). When removing restriction sites, we first identified the codons with the most similar translational efficiency of a synonymous codon family member for each of the two codons on the site. We then replaced that codon with its most similar synonymous counterpart. In a third approach, we replaced all maximally inefficient codons with the second most inefficient codons of their respective synonymous codon family, thereby “capping” inefficiencies in the sequence. We term this approach mirrored and capped . Summary of the codon-optimized sequences Table III gives a summary of the optimized sequences that were heterologously expressed in this study. Table also gives reference to the codon optmization algorithms employed by commercial providers of the gene products, as well as the references, if found, of the scientific publications describing the optimization algorithms. Multidimensional Scaling Efficiency profiles may be interpreted as vectors in which the i th entry corresponds to the elongation time of the codon at the i th position of that sequence. This allows to define a similarity metric between sequences and subsequently to apply MDS, a technique that allows for the representation and visualization of multidimensional similarity matrices in a two-dimensional Cartesian space, while preserving inter-element relationships –as expressed via a cost function termed stress to be minimized– as much as possible [ 70 , 71 ] (see Materials and Methods). The distance metric used in Figure 2 is the Euclidian distance between relative elongation time or elongation rate profiles, given by where and are either the elongation time or elongation rate vectors of two sequences, and where x i and y i are the elongation time or rate of a codon at the i th position of the sequence, respectively. Additionally, L is the length of the sequence in codons. For Figure 2A ), the elongation time profiles were derived by using relative translation inefficiencies inferred under the ribosomal overhead cost (ROC) model of the AnaCoda package [ 22 , 23 , 43 ]. The distance metric used in Figure 2B ) was the Euclidian distance between the elongation rate profiles of the sequences derived from the Shah data set of codon-specific elongation rates [ 27 ], while for C) they were derived from the Rudorf data set [ 26 ]. For all MDS procedures, we used Kruskal’s non-metric multidimensional scaling [ 70 , 71 ]. Acknowledgments Victor Garcia was supported by SystemsX.ch – the Swiss Initiative for Systems Biology [grant number 51FSP0_163566], the Swiss National Science Foundation [grant number 31003A_182330] and the Digitalization Initiative of the Zurich Higher Education Institutions (DIZH fellowship grant). Marco Gees was supported by the Swiss National Science Foundation [grant number 31003A_182330]. Marco Gees, Zrinka Raguz Nakic, Victor Garcia and Christin Peters thank the Zurich University of Applied Sciences for the internal funding of the project “Die Exploration der Gen-Harmonisierung”. Funder Information Declared Swiss National Science Foundation, https://ror.org/00yjd3n13 , 182330 Swiss Initiative for Systems Biology , 51FSP0_163566 Digitalisierungsinitiative der Zürcher Hochschulen (DIZH) ZHAW Zurich University of Applied Sciences, Life Sciences and Facility Management References [1]. ↵ Mauro VP . Codon Optimization in the Production of Recombinant Biotherapeutics: Potential Risks and Considerations . BioDrugs . 2018 Feb ; 32 ( 1 ): 69 – 81 . Available from: http://link.springer.com/10.1007/s40259-018-0261-x . OpenUrl CrossRef PubMed [2]. ↵ Borkowski O , Bricio C , Murgiano M , Rothschild-Mancinelli B , Stan GB , Ellis T. Cell-free prediction of protein expression costs for growing cells . Nature Communications . 2018 Apr ; 9 ( 1 ): 1457 . Available from: https://www.nature.com/articles/s41467-018-03970-x . OpenUrl CrossRef PubMed [3]. ↵ Gustafsson C , Govindarajan S , Minshull J. Codon bias and heterologous protein expression . Trends in Biotechnology . 2004 Jul ; 22 ( 7 ): 346 – 53 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S0167779904001118 . OpenUrl CrossRef PubMed Web of Science [4]. ↵ Parret AH , Besir H , Meijers R. Critical reflections on synthetic gene design for recombinant protein expression . Current Opinion in Structural Biology . 2016 Jun ; 38 : 155 – 62 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S0959440X16300793 . OpenUrl CrossRef PubMed [5]. ↵ Gilchrist MA , Shah P , Zaretzki R. Measuring and Detecting Molecular Adaptation in Codon Usage Against Nonsense Errors During Protein Translation . Genetics . 2009 Dec ; 183 ( 4 ): 1493 – 505 . Available from: https://academic.oup.com/genetics/article/183/4/1493-1505/6063162 . OpenUrl Abstract / FREE Full Text [6]. ↵ Plotkin JB , Kudla G. Synonymous but not the same: the causes and consequences of codon bias . Nature Reviews Genetics . 2011 Jan ; 12 ( 1 ): 32 – 42 . Available from: https://www.nature.com/articles/nrg2899 . OpenUrl CrossRef PubMed Web of Science [7]. ↵ Li GW , Burkhardt D , Gross C , Weissman JS . Quantifying Absolute Protein Synthesis Rates Reveals Principles Underlying Allocation of Cellular Resources . Cell . 2014 ; 157 ( 3 ): 624 – 35 . Available from: https://www.sciencedirect.com/science/article/pii/S0092867414002323 . OpenUrl CrossRef PubMed Web of Science [8]. ↵ Gilchrist MA . Combining Models of Protein Translation and Population Genetics to Predict Protein Production Rates from Codon Usage Patterns . Molecular Biology and Evolution . 2007 Aug ; 24 ( 11 ): 2362 – 72 . Available from: https://academic.oup.com/mbe/article-lookup/doi/10.1093/molbev/msm169 . OpenUrl CrossRef PubMed Web of Science [9]. ↵ Shah P , Gilchrist MA . Explaining complex codon usage patterns with selection for translational efficiency, mutation bias, and genetic drift . Proceedings of the National Academy of Sciences . 2011 Jun ; 108 ( 25 ): 10231 – 6 . Available from: http://www.pnas.org/cgi/doi/10.1073/pnas.1016719108 . OpenUrl Abstract / FREE Full Text [10]. ↵ Buhr F , Jha S , Thommen M , Mittelstaet J , Kutz F , Schwalbe H , et al. Synonymous Codons Direct Co-translational Folding toward Different Protein Conformations . Molecular Cell . 2016 Feb ; 61 ( 3 ): 341 – 51 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S1097276516000095 . OpenUrl CrossRef PubMed [11]. ↵ Liu Y , Yang Q , Zhao F. Synonymous but Not Silent: The Codon Usage Code for Gene Expression and Protein Folding . Annual Review of Biochemistry . 2021 Jun ; 90 ( 1 ): 375 – 401 . Available from: https://www.annualreviews.org/doi/10.1146/annurev-biochem-071320-112701 . OpenUrl CrossRef PubMed [12]. ↵ Quax TEF , Claassens NJ , Söll D , van der Oost J. Codon Bias as a Means to Fine-Tune Gene Expression . Molecular Cell . 2015 Jul ; 59 ( 2 ): 149 – 61 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S1097276515004025 . OpenUrl CrossRef PubMed [13]. Gingold H , Pilpel Y. Determinants of translation efficiency and accuracy . Molecular Systems Biology . 2011 Jan ; 7 ( 1 ): 481 . Available from: https://onlinelibrary.wiley.com/doi/10.1038/msb.2011.14 . OpenUrl Abstract / FREE Full Text [14]. ↵ Novoa EM , Ribas De Pouplana L. Speeding with control: codon usage, tRNAs, and ribosomes . Trends in Genetics . 2012 Nov ; 28 ( 11 ): 574 – 81 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S0168952512001138 . OpenUrl CrossRef PubMed Web of Science [15]. ↵ Al-Hawash AB , Zhang X , Ma F. Strategies of codon optimization for high-level heterologous protein expression in microbial expression systems . Gene Reports . 2017 Dec ; 9 : 46 – 53 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S2452014417300614 . OpenUrl CrossRef [16]. ↵ Nieuwkoop T , Finger-Bou M , Van Der Oost J , Claassens NJ . The Ongoing Quest to Crack the Genetic Code for Protein Production . Molecular Cell . 2020 Oct ; 80 ( 2 ): 193 – 209 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S1097276520306444 . OpenUrl CrossRef PubMed [17]. ↵ Fuss S , Lamb WF , Callaghan MW , Hilaire J , Creutzig F , Amann T , et al. Negative emissions—Part 2: Costs, potentials and side effects . Environmental Research Letters . 2018 may ; 13 ( 6 ): 063002 . Available from : doi: 10.1088/1748-9326/aabf9f . OpenUrl CrossRef [18]. ↵ Harman-Ware AE , Sparks S , Addison B , Kalluri UC . Importance of suberin biopolymer in plant function, contributions to soil organic carbon and in the production of bio-derived energy and materials . Biotechnology for Biofuels . 2021 Mar ; 14 ( 1 ): 75 . Available from: https://biotechnologyforbiofuels.biomedcentral.com/articles/10.1186/s13068-021-01892-3 . OpenUrl CrossRef PubMed [19]. ↵ Li Z , Huang H. GeneGA: Design gene based on both mRNA secondary structure and codon usage bias using Genetic algorithm ; 2023 . Available from: https://bioconductor.org/packages/GeneGA . [20]. ↵ Huber W , Carey VJ , Gentleman R , Anders S , Carlson M , Carvalho BS , et al. Orchestrating high-throughput genomic analysis with Bio-conductor . Nature Methods . 2015 ; 12 ( 2 ): 115 – 21 . Available from: http://www.nature.com/nmeth/journal/v12/n2/full/nmeth.3252.html . OpenUrl CrossRef PubMed [21]. ↵ Grote A , Hiller K , Scheer M , Munch R , Nortemann B , Hempel DC , et al. JCat: a novel tool to adapt codon usage of a target gene to its potential expression host . Nucleic Acids Research . 2005 Jul ; 33 ( Web Server ): W526 – 31 . Available from: https://academic.oup.com/nar/article-lookup/doi/10.1093/nar/gki376 . OpenUrl CrossRef PubMed Web of Science [22]. ↵ Landerer C , Cope A , Zaretzki R , Gilchrist MA . AnaCoDa: analyzing codon data with Bayesian mixture models . Bioinformatics . 2018 Mar ; 34 ( 14 ): 2496 – 8 . eprint: https://academic.oup.com/bioinformatics/article-pdf/34/14/2496/48918107/bioinformatics_34_14_2496.pdf Available from : doi: 10.1093/bioinformatics/bty138 . OpenUrl CrossRef [23]. ↵ Landerer C , Cope AL . AnaCoDa: Analysis of Codon Data under Stationarity using a Bayesian Framework ; 2020 . Available from: https://CRAN.R-project.org/package=AnaCoDa . [24]. ↵ Chaney JL , Clark PL . Roles for Synonymous Codon Usage in Protein Biogenesis . Annual Review of Biophysics . 2015 Jun ; 44 ( 1 ): 143 – 66 . Available from: http://www.annualreviews.org/doi/10.1146/annurev-biophys-060414-034333 . OpenUrl CrossRef PubMed [25]. ↵ Yu CH , Dang Y , Zhou Z , Wu C , Zhao F , Sachs M , et al. Codon Usage Influences the Local Rate of Translation Elongation to Regulate Co-translational Protein Folding . Molecular Cell . 2015 Sep ; 59 ( 5 ): 744 – 54 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S1097276515005766 . OpenUrl CrossRef PubMed [26]. ↵ Rudorf S , Lipowsky R. Protein Synthesis in E. coli: Dependence of Codon-Specific Elongation on tRNA Concentration and Codon Usage . PLOS ONE . 2015 Aug ; 10 ( 8 ): e0134994 . Available from: https://dx.plos.org/10.1371/journal.pone.0134994 . OpenUrl CrossRef PubMed [27]. ↵ Shah P , Gilchrist MA . Effect of Correlated tRNA Abundances on Translation Errors and Evolution of Codon Usage Bias . PLoS Genetics . 2010 Sep ; 6 ( 9 ): e1001128 . Available from: https://dx.plos.org/10.1371/journal.pgen.1001128 . OpenUrl CrossRef [28]. ↵ Santos-Aberturas J , Dörr M , Waldo GS , Bornscheuer UT . In-Depth High-Throughput Screening of Protein Engineering Libraries by Split-GFP Direct Crude Cell Extract Data Normalization . Chemistry & Biology . 2015 Oct ; 22 ( 10 ): 1406 – 14 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S1074552115003385 . OpenUrl CrossRef PubMed [29]. ↵ Bank C. Epistasis and Adaptation on Fitness Landscapes . Annual Review of Ecology, Evolution, and Systematics . 2022 Nov ; 53 ( 1 ): 457 – 79 . Available from: https://www.annualreviews.org/doi/10.1146/annurev-ecolsys-102320-112153 . OpenUrl CrossRef [30]. ↵ Kouyos RD , Silander OK , Bonhoeffer S. Epistasis between deleterious mutations and the evolution of recombination . Trends in Ecology & Evolution . 2007 ; 22 ( 6 ): 308 – 15 . Available from: https://www.sciencedirect.com/science/article/pii/S0169534707000699 . OpenUrl CrossRef PubMed [31]. ↵ West SA , Peters AD , Barton NH . Testing for Epistasis Between Deleterious Mutations . Genetics . 1998 May ; 149 ( 1 ): 435 – 44 . Available from: https://academic.oup.com/genetics/article/149/1/435/6034229 . OpenUrl Abstract / FREE Full Text [32]. ↵ Bonhoeffer S , Chappey C , Parkin NT , Whitcomb JM , Petropoulos CJ . Evidence for Positive Epistasis in HIV-1 . Science . 2004 ; 306 ( 5701 ): 1547 – 50 . eprint: https://www.science.org/doi/pdf/10.1126/science.1101786 . Available from: https://www.science.org/doi/abs/10.1126/science.1101786 . OpenUrl Abstract / FREE Full Text [33]. ↵ Hlavac M. stargazer: Well-Formatted Regression and Summary Statistics Tables. Bratislava, Slovakia : Social Policy Institute ; 2022 . Available from: https://CRAN.R-project.org/package=stargazer . [34]. ↵ R Core Team . R: A Language and Environment for Statistical Computing . Vienna, Austria ; 2023 . Available from: https://www.R-project.org/ . [35]. ↵ Kudla G , Murray AW , Tollervey D , Plotkin JB . Coding-Sequence Determinants of Gene Expression in Escherichia coli . Science . 2009 Apr ; 324 ( 5924 ): 255 – 8 . Available from: https://www.science.org/doi/10.1126/science.1170160 . OpenUrl Abstract / FREE Full Text [36]. ↵ Cambray G , Guimaraes JC , Arkin AP . Evaluation of 244,000 synthetic sequences reveals design principles to optimize translation in Escherichia coli . Nature Biotechnology . 2018 Nov ; 36 ( 10 ): 1005 – 15 . Available from: http://www.nature.com/articles/nbt.4238 . OpenUrl CrossRef PubMed [37]. ↵ Landerer C , O’Meara BC , Zaretzki R , Gilchrist MA . Unlocking a signal of introgression from codons in Lachancea kluyveri using a mutation-selection model . BMC Evolutionary Biology . 2020 Dec ; 20 ( 1 ): 109 . Available from: https://bmcevolbiol.biomedcentral.com/articles/10.1186/s12862-020-01649-w . OpenUrl CrossRef PubMed [38]. ↵ Zutz A , Hamborg L , Pedersen LE , Kassem MM , Papaleo E , Koza A , et al. A dual-reporter system for investigating and optimizing protein translation and folding in E. coli . Nature Communications . 2021 Oct ; 12 ( 1 ): 6093 . Available from: https://www.nature.com/articles/s41467-021-26337-1 . OpenUrl CrossRef PubMed [39]. ↵ Kouyos RD , Leventhal GE , Hinkley T , Haddad M , Whitcomb JM , Petropoulos CJ , et al. Exploring the Complexity of the HIV-1 Fitness Landscape . PLoS Genetics . 2012 Mar ; 8 ( 3 ): e1002551 . Available from: https://dx.plos.org/10.1371/journal.pgen.1002551 . OpenUrl CrossRef PubMed [40]. ↵ Elena C , Ravasi P , Castelli ME , Peirú S , Menzella HG . Expression of codon optimized genes in microbial systems: current industrial applications and perspectives . Frontiers in Microbiology . 2014 ; 5 . Available from: http://journal.frontiersin.org/article/10.3389/fmicb.2014.00021/abstract . [41]. ↵ Mutalik VK , Guimaraes JC , Cambray G , Mai QA , Christoffersen MJ , Martin L , et al. Quantitative estimation of activity and quality for collections of functional genetic elements . Nature Methods . 2013 Apr ; 10 ( 4 ): 347 – 53 . Available from: https://www.nature.com/articles/nmeth.2403 . OpenUrl CrossRef PubMed [42]. ↵ Nieuwkoop T , Terlouw BR , Stevens KG , Scheltema R , de Ridder D , van der Oost J , et al. Reveal-ing determinants of translation efficiency via whole-gene codon randomization and machine learning . Nucleic Acids Research . 2023 Mar ; 51 ( 5 ): 2363 – 76 . Available from: https://academic.oup.com/nar/article/51/5/2363/7016452 . OpenUrl CrossRef PubMed [43]. ↵ Gilchrist MA , Chen WC , Shah P , Landerer CL , Zaretzki R. Estimating Gene Expression and Codon-Specific Translational Efficiencies, Mutation Biases, and Selection Coefficients from Genomic Data Alone ‡ . Genome Biology and Evolution . 2015 Jun ; 7 ( 6 ): 1559 – 79 . Available from: https://academic.oup.com/gbe/article-lookup/doi/10.1093/gbe/evv087 . OpenUrl CrossRef PubMed [44]. ↵ Cope AL , Hettich RL , Gilchrist MA . Quantifying codon usage in signal peptides: Gene expression and amino acid usage explain apparent selection for inefficient codons . Biochimica et Biophysica Acta (BBA) - Biomembranes . 2018 ; 1860 ( 12 ): 2479 – 85 . Available from: https://www.sciencedirect.com/science/article/pii/S0005273618302748 . OpenUrl CrossRef PubMed [45]. ↵ Welch M , Govindarajan S , Ness JE , Villalobos A , Gurney A , Minshull J , et al. Design Parameters to Control Synthetic Gene Expression in Es-cherichia coli . PLoS ONE . 2009 Sep ; 4 ( 9 ): e7002 . Available from: https://dx.plos.org/10.1371/journal.pone.0007002 . OpenUrl CrossRef PubMed [46]. Gustafsson C , Minshull J , Govindarajan S , Ness J , Villalobos A , Welch M. Engineering genes for predictable protein expression . Protein Expression and Purification . 2012 May ; 83 ( 1 ): 37 – 46 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S1046592812000629 . OpenUrl CrossRef PubMed [47]. Goulet DR , Yan Y , Agrawal P , Waight AB , Mak ANs , Zhu Y. Codon Optimization Using a Recurrent Neural Network . Journal of Computational Biology . 2023 Jan ; 30 ( 1 ): 70 – 81 . Available from: https://www.liebertpub.com/doi/10.1089/cmb.2021.0458 . OpenUrl CrossRef PubMed [48]. ↵ Welch M , Villalobos A , Gustafsson C , Minshull J. Designing Genes for Successful Protein Expression . In: Methods in Enzymology . vol. 498 . Elsevier ; 2011 . p. 43 – 66 . Available from: https://linkinghub.elsevier.com/retrieve/pii/B9780123851208000036 . OpenUrl CrossRef [49]. ↵ Cope AL , Gilchrist MA . Quantifying shifts in natural selection on codon usage between protein regions: a population genetics approach . BMC Genomics . 2022 Dec ; 23 ( 1 ): 408 . Available from: https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-022-08635-0 . OpenUrl CrossRef PubMed [50]. ↵ Saito Y , Kitagawa W , Kumagai T , Tajima N , Nishimiya Y , Tamano K , et al. Developing a codon optimization method for improved expression of recombinant proteins in actinobacteria . Scientific Reports . 2019 Jun ; 9 ( 1 ): 8338 . Available from: https://www.nature.com/articles/s41598-019-44500-z . OpenUrl CrossRef PubMed [51]. ↵ Verma M , Choi J , Cottrell KA , Lavagnino Z , Thomas EN , Pavlovic-Djuranovic S , et al. A short translational ramp determines the efficiency of protein synthesis . Nature Communications . 2019 Dec ; 10 ( 1 ): 5774 . Available from: https://www.nature.com/articles/s41467-019-13810-1 . OpenUrl CrossRef PubMed [52]. ↵ Cannarozzi G , Schraudolph NN , Faty M , Von Rohr P , Friberg MT , Roth AC , et al. A Role for Codon Order in Translation Dynamics . Cell . 2010 Apr ; 141 ( 2 ): 355 – 67 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S0092867410001893 . OpenUrl CrossRef PubMed Web of Science [53]. ↵ Angov E , Hillier CJ , Kincaid RL , Lyon JA . Heterologous Protein Expression Is Enhanced by Harmonizing the Codon Usage Frequencies of the Target Gene with those of the Expression Host . PLoS ONE . 2008 May ; 3 ( 5 ): e2189 . Available from: https://dx.plos.org/10.1371/journal.pone.0002189 . OpenUrl CrossRef PubMed [54]. ↵ Angov E. Codon usage: Nature’s roadmap to expression and folding of proteins . Biotechnology Journal . 2011 Jun ; 6 ( 6 ): 650 – 9 . Available from: https://onlinelibrary.wiley.com/doi/10.1002/biot.201000332 . OpenUrl CrossRef PubMed [55]. ↵ Punde N , Kooken J , Leary D , Legler PM , Angov E. Codon harmonization reduces amino acid misincorporation in bacterially expressed P. falciparum proteins and improves their immunogenicity . AMB Express . 2019 Dec ; 9 ( 1 ): 167 . Available from: https://amb-express.springeropen.com/articles/10.1186/s13568-019-0890-6 . OpenUrl CrossRef PubMed [56]. ↵ De La Torre D , Chin JW . Reprogramming the genetic code . Nature Reviews Genetics . 2021 Mar ; 22 ( 3 ): 169 – 84 . Available from: https://www.nature.com/articles/s41576-020-00307-7 . OpenUrl CrossRef PubMed [57]. ↵ Tseng HC , Harwell CL , Martin CH , Prather KL . Biosynthesis of chiral 3-hydroxyvalerate from single propionate-unrelated carbon sources in metabolically engineered E. coli . Microbial Cell Factories . 2010 Dec ; 9 ( 1 ): 96 . Available from: https://microbialcellfactories.biomedcentral.com/articles/10.1186/1475-2859-9-96 . OpenUrl CrossRef PubMed [58]. ↵ Cabantous S , Waldo GS . In vivo and in vitro protein solubility assays using split GFP . Nature Methods . 2006 Oct ; 3 ( 10 ): 845 – 54 . Available from: https://www.nature.com/articles/nmeth932 . OpenUrl CrossRef PubMed [59]. ↵ Sarni F , Grand C , Boudet AM . Purification and properties of cinnamoyl-CoA reductase and cinnamyl alcohol dehydrogenase from poplar stems (Populus X euramericana) . European Journal of Biochemistry . 1984 Mar ; 139 ( 2 ): 259 – 65 . Available from: https://onlinelibrary.wiley.com/doi/10.1111/j.1432-1033.1984.tb08002.x . OpenUrl CrossRef PubMed Web of Science [60]. ↵ Rhee SY , Beavis W , Berardini TZ , Chen G , Dixon D , Doyle A , et al. The Arabidopsis Information Resource (TAIR): a model organism database providing a centralized, curated gateway to Arabidopsis biology, research materials and community . Nucleic Acids Research . 2003 Jan ; 31 ( 1 ): 224 – 8 . eprint: https://academic.oup.com/nar/article-pdf/31/1/224/7127123/gkg076.pdf . Available from : doi: 10.1093/nar/gkg076 . OpenUrl CrossRef PubMed Web of Science [61]. ↵ Freddolino PL , Amini S , Tavazoie S. Newly Identified Genetic Variations in Common Escherichia coli MG1655 Stock Cultures . Journal of Bacteriology . 2012 ; 194 ( 2 ): 303 – 6 . eprint: https://journals.asm.org/doi/pdf/10.1128/JB.06087-11 . Available from: https://journals.asm.org/doi/abs/10.1128/JB.06087-11 . OpenUrl Abstract / FREE Full Text [62]. Riley M , Abe T , Arnaud MB , Berlyn MKB , Blattner FR , Chaudhuri RR , et al. Escherichia coli K-12: a cooperatively developed annotation snapshot—2005 . Nucleic Acids Research . 2006 Jan ; 34 ( 1 ): 1 – 9 . eprint: https://academic.oup.com/nar/article-pdf/34/1/1/7127982/gkj405.pdf . Available from : doi: 10.1093/nar/gkj405 . OpenUrl CrossRef PubMed Web of Science [63]. Hayashi K , Morooka N , Yamamoto Y , Fujita K , Isono K , Choi S , et al. Highly accurate genome sequences of Escherichia coli K-12 strains MG1655 and W3110 . Molecular Systems Biology . 2006 ; 2 ( 1 ):2006.0007. eprint: https://www.embopress.org/doi/pdf/10.1038/msb4100049 . Available from: https://www.embopress.org/doi/abs/10.1038/msb4100049 . [64]. ↵ Blattner FR , Plunkett G , Bloch CA , Perna NT , Burland V , Riley M , et al. The Complete Genome Sequence of Escherichia coli K-12 . Science . 1997 ; 277 ( 5331 ): 1453 – 62 . eprint: https://www.science.org/doi/pdf/10.1126/science.277.5331.1i4ts53 . Available from: https://www.science.org/doi/abs/10.1126/science.277.5331.1453 . OpenUrl Abstract / FREE Full Text [65]. Villalobos A , Ness JE , Gustafsson C , Minshull J , Govindarajan S. Gene Designer: a synthetic biology tool for constructing artificial DNA segments . BMC Bioinformatics . 2006 Dec ; 7 ( 1 ): 285 . Available from: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-7-285 . OpenUrl CrossRef PubMed [66]. Lithwick G. Hierarchy of Sequence-Dependent Features Associated With Prokaryotic Translation . Genome Research . 2003 Dec ; 13 ( 12 ): 2665 – 73 . Available from: http://www.genome.org/cgi/doi/10.1101/gr.1485203 . OpenUrl Abstract / FREE Full Text [67]. Laursen BS , Sørensen HP , Mortensen KK , Sperling-Petersen HU . Initiation of Protein Synthesis in Bacteria . Microbiology and Molecular Biology Reviews . 2005 Mar ; 69 ( 1 ): 101 – 23 . Available from: https://journals.asm.org/doi/10.1128/MMBR.69.1.101-123.2005 . OpenUrl Abstract / FREE Full Text [68]. Jin H , Zhao Q , Gonzalez De Valdivia EI , Ardell DH , Stenström M , Isaksson LA . Influences on gene expression in vivo by a Shine–Dalgarno sequence . Molecular Microbiology . 2006 Apr ; 60 ( 2 ): 480 – 92 . Available from: https://onlinelibrary.wiley.com/doi/10.1111/j.1365-2958.2006.05110.x . OpenUrl CrossRef PubMed Web of Science [69]. Stenström CM , Holmgren E , Isaksson LA . Cooperative effects by the initiation codon and flanking regions on translation initiation . Gene . 2001 Aug ; 273 ( 2 ): 259 – 65 . Available from: https://linkinghub.elsevier.com/retrieve/pii/S0378111901005844 . OpenUrl CrossRef PubMed Web of Science [70]. ↵ Cox MAA , Cox TF . Multidimensional Scaling. In: Handbook of Data Visualization . Berlin, Heidelberg : Springer Berlin Heidelberg ; 2008 . p. 315 – 47 . Available from : doi: 10.1007/978-3-540-33037-0_14 . OpenUrl CrossRef [71]. ↵ Venables WN , Ripley BD . Modern Applied Statistics with S . 4th ed. New York : Springer ; 2002 . Available from: https://www.stats.ox.ac.uk/pub/MASS4/ . View the discussion thread. Back to top Previous Next Posted June 06, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Negative epistasis limits current codon optimization approaches Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Negative epistasis limits current codon optimization approaches Marco Gees , Zrinka Raguz Nakic , Maria Anisimova , Victor Garcia , Christin Peters bioRxiv 2025.06.03.657573; doi: https://doi.org/10.1101/2025.06.03.657573 Share This Article: Copy Citation Tools Negative epistasis limits current codon optimization approaches Marco Gees , Zrinka Raguz Nakic , Maria Anisimova , Victor Garcia , Christin Peters bioRxiv 2025.06.03.657573; doi: https://doi.org/10.1101/2025.06.03.657573 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Molecular Biology Subject Areas All Articles Animal Behavior and Cognition (7616) Biochemistry (17625) Bioengineering (13852) Bioinformatics (41825) Biophysics (21397) Cancer Biology (18524) Cell Biology (25417) Clinical Trials (138) Developmental Biology (13350) Ecology (19858) Epidemiology (2067) Evolutionary Biology (24277) Genetics (15581) Genomics (22459) Immunology (17698) Microbiology (40278) Molecular Biology (17134) Neuroscience (88400) Paleontology (666) Pathology (2823) Pharmacology and Toxicology (4812) Physiology (7632) Plant Biology (15106) Scientific Communication and Education (2042) Synthetic Biology (4281) Systems Biology (9807) Zoology (2266)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-21T05:10:58.409756+00:00

License: CC-BY-NC-ND-4.0