Integrating experimental feedback improves generative models for biological sequences

doi:10.1101/2025.03.31.646327

Integrating experimental feedback improves generative models for biological sequences

2025 · doi:10.1101/2025.03.31.646327

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 52,407 characters · extracted from preprint-html · click to expand

Integrating experimental feedback improves generative models for biological sequences | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Integrating experimental feedback improves generative models for biological sequences View ORCID Profile Francesco Calvanese , Giovanni Peinetti , View ORCID Profile Polina Pavlinova , View ORCID Profile Philippe Nghe , View ORCID Profile Martin Weigt doi: https://doi.org/10.1101/2025.03.31.646327 Francesco Calvanese 1 Sorbonne Université, CNRS, Dept. of Computational, Quantitative and Synthetic Biology - CQSB , Paris, France 2 Laboratoire de Biophysique et Evolution, CNRS-ESPCI, PSL University , Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Francesco Calvanese Giovanni Peinetti 1 Sorbonne Université, CNRS, Dept. of Computational, Quantitative and Synthetic Biology - CQSB , Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Polina Pavlinova 2 Laboratoire de Biophysique et Evolution, CNRS-ESPCI, PSL University , Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Polina Pavlinova Philippe Nghe 2 Laboratoire de Biophysique et Evolution, CNRS-ESPCI, PSL University , Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Philippe Nghe Martin Weigt 1 Sorbonne Université, CNRS, Dept. of Computational, Quantitative and Synthetic Biology - CQSB , Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Martin Weigt For correspondence: martin.weigt{at}sorbonne-universite.fr Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Generative probabilistic models have shown promise in designing artificial RNA and protein sequences but often suffer from high rates of false positives, where sequences predicted as functional fail experimental validation. To address this critical limitation, we explore the impact of reintegrating experimental feedback into the model design process. We propose a likelihood-based reintegration scheme, which we test through extensive computational experiments on both RNA and protein datasets, as well as through wet-lab experiments on the self-splicing ribozyme from the group I intron RNA family where our approach demonstrates particular efficacy. We show that integrating recent experimental data enhances the model’s capacity of generating functional sequences (e.g. from 6.7% to 63.7% of active designs at 45 mutations). This feedback-driven approach thus provides a significant improvement in the design of biomolecular sequences by directly tackling the false-positive challenge. I. INTRODUCTION Generative probabilistic models for biological sequences, such as proteins and RNA, have recently emerged as promising tools for designing artificial biomolecules [ 1 – 4 ]. These models, particularly family-specific ones like those built using Direct-Coupling Analysis (DCA) [ 1 , 2 ], as well as more advanced architectures like restricted Boltzmann machines [ 4 ], variational autoencoders [ 2 , 5 , 6 ], and protein language models [ 3 ], have shown notable success in generating functional sequences. However, a persistent challenge remains: these models often produce a high rate of false positives – sequences generated as potentially functional by the model but failing in experimental tests. These models are trained on sets of homologous sequences, representing families of sequences with shared evolutionary ancestry. Such families are typically characterized by highly conserved structures and functions, though the sequences themselves may diverge significantly. Multiple-sequence alignments (MSA) [ 7 – 10 ], containing presumably functional sequences from different species, serve as the foundation for training. As a consequence, these models are trained in an unsupervised manner on unlabeled functional sequences, which limits their capacity to differentiate between functional and non-functional variants. A significant issue in these generative models is the high rate of false positives – sequences deemed functional by the model that fail experimental validation [ 1 , 2 ]. This limitation arises from the scarce sampling of the viable sequence space in the MSAs, leading to an underrepresentation of functional diversity, and to an intrinsic difficulty in accurately estimating the limitations of functional sequence space. In this study, we focus on DCA-based Potts models [ 1 , 11 , 12 ] and demonstrate that integrating experimental feedback including false-positive sequences into the training procedure can enhance model accuracy and reduce false-positive rates. By incorporating this feedback through an extension of the maximum-likelihood inference procedure [ 12 ], which makes explicit use of the experimental results, we show that false positives from the initial model play a critical role in refining the boundaries of the viable sequence space, thereby improving the model’s performance. An intriguing ingredient to this approach is that the underlying mathematical structure of the model remains unchanged, but the reintegration of experimental data significantly improves parameter learning. This highlights an important insight: the current limitations of generative models stem not necessarily from the limited expressivity of their architectures, but also from the insufficient information content in the original training data. These alignments, representing natural sequences that have diverged through evolution, offer a sparse and incomplete sampling of the functional sequence landscape. Enhancing this landscape with experimental feedback allows for a reliable model, generating a higher fraction of functional sequences (i.e. true positives). Augmenting data quantity and quality at unchanged model complexity turns out to be an efficient strategy. The paper is organized as follows. In the next Results section, we outline the main ideas of the reintegration approach and present extensive tests on diverse RNA and protein families, progressing from purely computational settings to experimental validation : artificial sequences sampled from our models and tested experimentally are the most rigorous possible validation of a generative approach to bio-molecules. We detail the application of our procedure to DCA models in the Materials and Methods section, where we also describe the datasets used to evaluate our approach. At the end of the article, we present our main conclusion, and show an outlook of possible extensions of our approach. II. RESULTS Here we propose a method to reintegrate experimental test results into a sequence generative model. Figure 1 illustrates the core idea. In the standard approach, the natural data MSA 𝒟 N is used to train an initial probabilistic generative model P ( a | θ 1 ) = P 1 ( a ) whose parameters θ 1 are obtained through Maximum Likelihood Estimation (MLE): where Download figure Open in new tab FIG. 1: Schematic representation of the reintegration procedure. The original model probability distribution P 1 ( a ) trained on a natural MSA is used to sample the P 1 -dataset. These sequences are experimentally tested and labeled, and this information is reintegrated into the training of a DCA model P 2 ( a ). The new P 2 -dataset exhibits enhanced functionality. Note that the mathematical form of P 1 ( a ) and P 2 ( a ) is identical, and that the improved generative performance results from refined parameter values learned on enriched data. Once the model is trained, an set 𝒟 T of artificial sequences can be sampled from P 1 ( a ) and tested experimentally. However, this approach often suffers from a high rate of false-positive sequences in 𝒟 T , i.e. sequences expected to be functional according to P 1 ( a ), yet failing experimental tests (indicated in red in Figure 1 ), cf. [ 1 , 2 ]. To address this issue, we propose reintegrating the experimental feedback contained in 𝒟 T into an updated model, P ( a | θ 2 ) = P 2 ( a ). This updated model maintains the same mathematical form and architecture as P 1 ( a ) but uses recalibrated parameters inferred leveraging the newly labeled data 𝒟 T . Consequently, P 2 ( a ) is expected to generate a higher proportion of functional (true-positive) sequences. To implement this, we update the model parameters optimizing a new objective function: where The first contribution to 𝒬 equals the standard loglikelihood for the natural data given in Eq. (2) . Maxmizing 𝒬 reduces thus to standard MLE if no experimental data is available, i.e., for 𝒟 T = ∅. The second contribution is the reintegration term, which acts on the probabilities of the tested sequences in dependence on the adjustment weight w , assigned to every sequence in the tested dataset b ∈ 𝒟 T in function of the experimental test result. We require w ( b ) to adhere to the following rules: Negative Adjustment: w ( b ) 0 for sequences passing the experimental functionality test, such that their probability P 2 ( b ) is increased when maximizing 𝒬. The specific values of the weights for individual sequences depend on the specific experimental setting, in the easiest case they can be taken to be all of equal absolute value (see below for some more complicated construction removing at least partially biases in the experimental data). The overall intensity of reintegration is controlled by the hyperparameter λ ; the higher is λ , the greater the relative importance assigned to the experimentally labelled dataset 𝒟 T compared to the natural sequences 𝒟 N . When λ = 0, the classical MLE is recovered. If we consider only the functional sequences with w ( b ) > 0 in 𝒟 T , this procedure is similar to adding them to 𝒟 N with a λ -dependent weight, and performing the standard MLE inference. The essential difference arises from the inclusion of non-functional sequences with w ( b ) < 0 in 𝒟 T , which indicate regions of the sequence space that our model should avoid, cf. Fig. 2 . Ideally, 𝒟 T would consist of P 1 ( a ) generated sequences, cf. Fig. 1 . Consider a sequence b that has been generated by P 1 ( a ): the sole fact that it was sampled implies that it was assigned a high probability by the P 1 ( a ) model. If this sequence fails the experimental test, it is assigned a negative w ( b ) < 0, and the reintegrated P 2 ( a ) model will subsequently assign it a lower probability. This procedure enables the model to correct itself based on the experimental feedback, and to better infer the limits of functional sequence space. Download figure Open in new tab FIG. 2: Stylized representation of the effect of the reintegration procedure. Sequences generated by the initial model P 1 (region inside orange line) that fail experimental tests (region outside grey area) are assigned negative adjustment weights (w 0). The reintegrated model P 2 is then trained using these adjusted weights, resulting in generated sequences that avoid regions associated with non-functional sequences and concentrate in regions associated with functional sequences (region inside blue line), thereby reducing the fraction of false positives among the generated sequences. The reintegration set 𝒟 T is, however, not limited to be comprised of sequences generated by P 1 ( a ) but can, in principle, be any functionally labelled dataset. It is, however, intuitively important that negative sequences are close to the functional sequence space. Random sequences, e.g., are almost surely non-functional, but they typically have already very low values of P 1 ( a ), and reintegrating them negatively will not improve the description of the positively functional sequence space. Our tests of this procedure have been carried out on DCA models, since DCA has proved capable of generating functional proteins and RNAs [ 1 , 2 ]. Another advantage is that we can naturally implement the new objective function 𝒬 in the DCA framework without altering its classical training procedures. A detailed discussion is provided in the Materials and Methods section with all analytical derivations detailed in the Supplementary Section S1 . Note that, in a more generic machine-learning context, the DCA model can be replaced by other generative model architectures, and MLE by the optimization of any loss function, which is additive in data-point specific losses. In the following, we present computational tests on RNA and protein data, as well as experimental validation performed on group I intron ribozymes. A. The effect of the reintegration strength λ in Rfam RNA families To understand the action of the proposed reintegration method, we need to study its performance systematically in function of the reintegration strength λ . For this aim, we use three RNA-family MSAs from the Rfam database (cf. Materials and Methods ). Before performing resource and time consuming experiments (cf. below), we first assess the role of lambda via a fully computational approach. As a proxy for the experimental fitness, we employ the negative free energy − F of folding a given sequence onto the family’s consensus secondary structure, computed from the Turner Model [ 13 ]) implemented in the RNAeval function of the ViennaRNA package [ 14 ]. Further details on the data and fitness proxy are provided in the Materials and Methods . For the statistical models, we use our recent time-efficient RNA-tailored Edge Activation DCA (eaDCA) [ 15 ]. It provides easy access to the inferred models’ Shannon entropy S [ 16 ] in function of λ , and thereby allows to quantify the potential diversity of the sequences generated by the model. To this end, for each of the three RNA families, we used the Rfam MSA as 𝒟 N and trained our initial model P 1 ( a ) via eaDCA. From this model, we sampled the P 1 ( a )-dataset comprising 2000 sequences, to be used as 𝒟 T in the reintegration procedure. We measured the RNAeval proxy fitness − F [ 14 ] of these sequences (cf. Materials and Methods ), and decided (somewhat arbitrarily) to consider all sequences with above-average − F as functional, and below-average − F as non-functional - the training objective for P 2 ( a ) thus being the generation of highly thermo-stable sequences. We thus define a simple adjustment weight w ( b ) for all b ∈ 𝒟 T : where F avg is the average RNAeval folding free energy evaluated for the 𝒟 T dataset. A reasonable range for the reintegration strength λ can be chosen using the following consideration: as already mentioned, the case λ = 0 is equivalent to training the standard DCA model P 1 ( a ) using MLE and the natural MSA 𝒟 N . For λ = 1, the two contributions to the objective function 𝒬 in Eq. (4) become equally important, and so do the two datasets 𝒟 N and 𝒟 T . It is therefore reasonable to explore λ s between zero and values slightly larger than one. Note that, for larger values, the learning algorithm starts to have convergence problems (see Materials and Methods ). To assess the effect of the reintegration procedure, we monitor the following quantities: Average Proxy Fitness: is the average proxy fitness of the sequences in the P 2 -dataset. This measures how well our reintegration is pushing the generation towards “functional” sequences according to Eq. (5) . True Positives : TP is defined as the fraction of sequences in the P 2 -dataset that exhibit a fitness score F ( b ) ≥ − F avg . In other words, these sequences would have been assigned a positive weight during the reintegration procedure, indicating predicted functionality. This metric quantifies the effectiveness of the approach in reducing false positives. Model entropy: S quantifies the diversity of sequences generated by the reintegrated model P 2 . A higher entropy suggests a more diverse sequence space. Average intra-dataset distance: is the average Hamming distance (number of mutations) between pairs of sequences in the P 2 -dataset. It quantifies the diversity between generated sequences. This value is to be compared with , i.e., the average distance in the non reintegrated P 1 ( a )-dataset. Average minimum distance to functional sequences in 𝒟 T : for each sequence in the P 2 -dataset, we calculate the Hamming distance to the closest functional sequence in the reintegration dataset 𝒟 T . The average of these is reported as . This metric ensures that our model is not merely replicating functional sequences from 𝒟 T . Average minimum distance to non-functional sequences in 𝒟 T : for each sequence in the P 2 -dataset, we calculate the Hamming distance to the closest non-functional sequence in the reintegration dataset 𝒟 T . The average of these is reported as . This metric allows us to test if generated sequence avoid the vicinity of non-functional sequences from 𝒟 T , as is to be expected by their negative contribution to the objective 𝒬. By tracking these quantities, we can evaluate whether the reintegration leads to a better model without overfitting the reintegrated data. The results of these analyses are presented in Table I for the RF00162 RNA family, similar results are observed also for the other families, cf. Supplementary Section S2 in Supplementary Tables S1, S2, S3 and Supplementary Figure S1 . View this table: View inline View popup Download powerpoint TABLE I: Effect of the reintegration strength for the RF00162 RNA family. Note that the values reported for λ = 0 correspond to the DCA model without reintegration, i.e. −F avg = 14.68 is the threshold value chosen for functional sequences, and the entropy S = 61.92 equals the one of P 1 . In general, we observe that h i gher values of λ yield a higher average proxy fitness , demonstrating that the reintegration procedure effectively enhances the fitness of the generated sequences. For example, for the RF00162 RNA family ( Table I ), the average proxy fitness increases from 22.4 ( λ = 0) to 34.9 ( λ = 1), and the TP fraction improves from 49.9% to 99.4%. However, this gain comes at the expense of model entropy S and overall sequence diversity: S decreases from 63.4 ( λ = 0) to 53.3 ( λ = 1), while the average intra-dataset distance drops from 46.0 ( λ = 0) to 39.7 ( λ = 1). Additionally, generated sequences become somewhat closer to the functional sequences in the reintegration dataset , as indicated by a reduction in the average minimum distance from 28.1 ( λ = 0.1) to 25.3 ( λ = 1). The average minimum distance to non-functional sequences slightly increases from 29.6 ( λ = 0.1) to 31.8 ( λ = 1). We were able to perform these tests because we can readily compute the proxy fitness also on the sequences generated from the reintegrated model. In more realistic scenarios, this is not possible without experiments. However, our observations guide the choice of λ before doing experiments: it seems reasonable to select a value before a significant loss in diversity occurs or before the training procedure fails to converge. The effect of reintegration on the proxy fitness distributions for the three Rfam families is shown in Fig. 3 . In all cases, the P 2 -dataset samples exhibit a significant shift to higher proxy fitness compared to the P 1 -dataset (indicated by λ = 0). We find in particular, that “non-functional” sequences with proxy fitnesses below − F avg become very rare. Download figure Open in new tab FIG. 3: A Consensus secondary structure of the RF00162 RNA family (SAM riboswitch). B Distribution of the RNAeval proxy fitness for the RF00162 DCA model P 1 ( a ) (orange, λ = 0) and the reintegrated P 2 ( a ) model (blue, λ = 1). C Consensus secondary structure of the RF00504 RNA family (Glycine riboswitch). D Distribution of the RNAeval proxy fitness for the RF00504 DCA model P 1 ( a ) (orange, λ = 0, N = 2000) and the reintegrated P 2 ( a ) model (blue, λ = 2, N = 2000). E Consensus secondary structure of the RF00005 RNA family (tRNA). F Distribution of the RNAeval proxy fitness for the RF00005 DCA model P 1 ( a ) (orange, λ = 0, N = 2000) and the reintegrated P 2 ( a ) model (blue, λ = 1, N = 2000). The secondary structure diagrams were generated using the Forna software [ 17 ]. In addition to generating sequences with improved proxy fitness, another effect of the reintegration is that the DCA model score becomes a more reliable predictor of fitness, an in-depth analysis of this phenomenon is provided in the Supplementary Section S2 and Supplementary Figure S2 B. Reintegrating experimental activity of a protein family Our case study for applying the reintegration procedure to proteins is the chorismate mutase (CM) enzyme, which plays an essential role in the biosynthesis of aromatic amino acids. This enzyme serves as an ideal setting to test our procedure because Russ et al. [ 1 ] have already trained a DCA model P 1 on an MSA 𝒟 N of natural CM homologs, and they have experimentally tested the natural sequences of 𝒟 N as well as a dataset 𝒟 T of P 1 -designed CM variants using an in vivo growth assay (see Materials and Methods ). As a result, we have access to experimentally labeled datasets indicating sequence functionality. Moreover, Russ et al. demonstrated that it is possible to train a simple Logistic Regression (LR) classifier on 𝒟 N to predict whether an artificial CM variant is functional or not. We can leverage these findings for our reintegration procedure. Our approach begins by training an LR classifier using the labeled natural CM variants in 𝒟 N to predict experimental functionality. This classifier achieves an accuracy of approximately 80% in predicting the functionality of artificially generated sequences (cf. Supplementary Section S3 ), which is consistent with the results reported [ 1 ]. Since experimentally testing our artificial sequences is not feasible within this study, we will use this classifier to evaluate the performance of our reintegration procedure. Note that the labels used for training the classifier are not used in our reintegration procedure, but are complementary information exploitable for posterior sequence evaluation. We first trained our initial DCA model P 1 ( a ) using the natural MSA 𝒟 N as training data, and the adabmDCA implementation of DCA [ 11 ]. From this model, we sampled the P 1 -dataset comprising 8000 artificial CM variants. To train our reintegrated DCA model P 2 ( a ), we used dataset 𝒟 T , which is experimentally labelled in [ 1 ], allowing us to avoid relying on proxy fitness measures. We chose again a binary adjustment function w ( b ) for all sequences in 𝒟 T : We set the reintegration strength parameter λ to 1, which is the highest value that converged within an acceptable time frame. The reintegration procedure is significantly slower for proteins compared to RNA, requiring 3 hours of runtime on an L4 GPU using the most advanced DCA GPU implementation available. Also here, we sampled from the resulting model a P 2 -dataset containing 8000 artificial CM sequences. Results for lower values of λ are provided in the Supplementary Table S4 . To assess the effectiveness of our reintegration procedure, we employed the LR classifier, and determined the percentage of variants predicted to be functional in both the P 1 - and the reintegrated P 2 -dataset. The percentage of predicted functional variants increases from 39% to 68%, indicating a favorable outcome of the procedure. This comes at a moderate cost of reduced average sequence diversity, of , and the designs still retain a good level of diversity (SI). values and the histogram of distances between the P dataset and the closest reintegrated 𝒟 T are provided in the SI. In the PCA plots shown in Fig. 4 , we project the P 1 - and P 2 -generated datasets (colored) onto the first two principal components of the natural MSA 𝒟 N (grey). Notably, the P 2 dataset avoids certain regions in PCA space that are occupied by sequences from both the natural dataset and the P 1 model. To better understand this behavior, Fig. 5 projects the reintegration dataset 𝒟 T onto the same PCA, with functional sequences shown in green and non-functional ones in red. We observe that the regions avoided by the P 2 dataset correspond to the non-functional areas in 𝒟 T . These findings suggest that the observed reduction in diversity of sequences generated after reintegration results from the exclusion of non-functional regions, effectively refining the functional sequence space in the P 2 model compared to the standard DCA model P 1 . Download figure Open in new tab FIG. 4: A Classifier predictions for protein functionality in the P 1 ( a )-dataset (N = 8000): True indicates predicted functional, False indicates predicted non-functional. B PCA projection of the P 1 ( a )-dataset (orange); natural PSA homologs are represented by the grey cloud. C Classifier predictions for protein functionality in the P 2 ( a )-dataset (N = 8000): True indicates predicted functional, False indicates predicted non-functional. D PCA projection of the P 2 ( a )-dataset (blue); natural PSA homologs are represented by the grey cloud. Download figure Open in new tab FIG. 5: PCA projection of the experimentally labeled dataset D T (N = 1003) of artificially generated chorismate mutase sequences. Functional sequences are shown in green, non-functional ones in red. The natural sequences in D N (N = 1130) are shown in grey (note that not all of them are functional in the experimental test performed in E. coli). C. Experimental Validation on Group I Intron Ribozymes Group I intron ribozymes are catalytic RNA molecules that possess the ability to self-splice, meaning they can excise themselves from precursor RNA transcripts without the assistance of proteins or additional enzymes. Our work leverages on that of Lambert et al. [ 2 ], who started with a reference wildtype, the Azoarcus group I intron ribozyme, and designed artificial mutations of this reference sequence using MSA-based generative models (DCA [ 15 ] and VAE [ 5 ]), structure based methods (Turner Model [ 13 ]) and combinations of the two. Through high-throughput assays, they tested these mutations for self-splicing-like activity and evaluated the percentage of active designs from each model at varying distances from the wild type. Here, we present experimental evidence supporting the presented reintegration method applied in this context. We start with their DCA model, utilizing it as our non-reintegrated P 1 model. This model was trained using as training set 𝒟 N an MSA of 817 group I introns aligned against the Azoarcus group I intron ribozyme ( L = 197). For the training of our reintegrated models, we employ as 𝒟 T a subset of 14099 from their experimentally labeled dataset, which consists of 24071 mutations of the reference (see Materials and Methods ). An important detail is that the 𝒟 T in this case is significantly different from all the other reintegration instances in this study; all the tested designs are mutations of a single reference sequence, so their distribution in the sequence space is localized around one single point. Additionally, as expected, designs with fewer mutated residues generally exhibited higher activity than those with a greater number of mutations. Thus, a significant amount of information about sequence functionality contained in 𝒟 T is related to the trivial distance from the reference. Unlike all the other instances presented, where 𝒟 T designs are exclusively derived from the non-reintegrated model P 1 , in this case, they originate from all the different models tested in [ 2 ]. Details about 𝒟 T are provided in the Supplementary Section S4 and Supplementary Figure S3 . We implemented two reintegration strategies: Standard Reintegration Procedure (REINT): All active 𝒟 T sequences were reintegrated with a weight , and all non-active 𝒟 T sequences were reintegrated with . Bin Sum Zero Reintegration (REINT BS0): 𝒟 T sequences were grouped into bins based on their mutational distance from the reference sequence, with each bin covering four mutational steps. For sequences in bin i , the weight w ( b ) was set to for active sequences, where is the number of active sequences in the bin, and for non-active sequences, where is the number of non-active sequences in the bin. This ensured that the sum of w ( b ) in each bin was zero, mitigating bias towards the Azoarcus reference sequence by balancing positive and negative signals at each distance. For further details and the choice of the parameter λ , refer to the Supplementary Section S4 . The two models were trained using the same GPU DCA implementation used for the CM case. From the two reintegrated models, we generated designs at various bins of mutational distance from the reference wildtype (Table 6). These designs were then experimentally tested for self-splicing activity using the same experimental assay that was used for the designs from the non-reintegrated P 1 model [ 2 ]. Details about the experimental procedures and its comparability across different experimental istances are provided in the Supplementary Section S4 and Supplementary Figure S4, S5 and Supplementary Table S5 . The results of the experimental assays are displayed in Figure 6 and detailed in the Supplementary Table S6 . Download figure Open in new tab FIG. 6: Active fraction of the generated designs as a function of the mutational distance from the reference sequence for the P 1 DCA model and the two reintegrated models. The number of tested designs at each bin of distance can be found in Table II . Both reintegrated models outperformed the non-reintegrated one. The REINT BS0 model maintained an active fraction of 23.6% at a mutational distance of 60 residues, compared to the non-reintegrated P 1 model, which had a similar active fraction of 19.3% at 40 mutations but dropped to just 2.0% at 60 mutations. The REINT model successfully generated functional sequences at 65 mutations, further away than all the P 1 ( a ) tested in [ 2 ], where the furthest active seqwuence was found at 60 mutations. This increase in performance came at the expense of generated sequence diversity, which was significantly more impacted than in the previous reintegration examples. This is most likely due to the highly localized nature of the reintegrated dataset 𝒟 T . The intra-sample and distance distributions for mutational distances of 45, 60 and 65 are shown in Table III , with the complete analysis provided in the Supplementary Table S6 and Supplementary Figure S6, S7 . View this table: View inline View popup Download powerpoint TABLE II: Number of tested designs of the two reintegrated models at various mutational distances. For the DCA P 1 model, this number is 150 for each bin [ 2 ]. View this table: View inline View popup Download powerpoint TABLE III: Active Fraction, Average Intra-Dataset Distance , and Average Minimum Distance from the Positively Reintegrated Dataset for mutational distances 60, 70, and 75 from the reference sequence. III. DISCUSSION Advances in high-throughput experimentation and machine learning are rapidly reshaping our ability to design functional biomolecular sequences. In this work, we have demonstrated that reintegrating experimental feedback into generative models markedly improves the reliability of predicted sequences. Our results show that even when the underlying mathematical model remains unchanged, the incorporation of well-characterized experimental data – including both functional and non-functional sequences – serves as a powerful corrective mechanism. Experimental tests therefore do not only validate predictions of models entirely trained on natural sequence data, but also guides the refinement of the modeled sequence space by penalizing non-functional regions and reinforcing areas associated with activity. A key insight from our study is that enhanced performance can be achieved by training on more informative data rather than solely by increasing model complexity. The same DCA framework, when trained with a balanced integration of natural sequence data and experimental outcomes, produces a model that yields a significantly higher fraction of functional sequences. This observation underscores that the limitations of conventional generative models are not necessarily due to the inadequacy of their architectures, but rather stem from the sparsity and incomplete sampling inherent in natural sequence databases. By reintegrating experimental feedback, our method compensates for this deficiency, thereby sharpening the model’s discriminative power. At the same time, our approach has inherent limitations. Because the reintegration procedure relies on experimental data that are necessarily sampled from regions already explored by nature, the model is not readily extended to completely novel areas of sequence space. In other words, while our method is highly effective at refining and navigating the known functional landscape, it remains dependent on the availability and quality of experimental measurements. This dependency emphasizes that the generation of truly novel sequences will continue to require a comprehensive experimental framework to guide and validate model predictions. Looking ahead, the iterative interplay between experimental feedback and model refinement presents a promising route toward more accurate and targeted design strategies. Future studies could explore how successive rounds of data integration may gradually expand the functional sequence space, while also addressing potential trade-offs between accuracy and diversity. Ultimately, our results suggest that a synergistic integration of experimental and computational approaches can overcome the false-positive limitations of current generative models and pave the way for more reliable biomolecular design. IV. MATERIALS AND METHODS In this section, we detail the methodological framework of our proposed reintegration method applied to the DCA generative model, as well as the data and methods employed for training and evaluating our models in a number of diverse RNA and protein families. A. Reintegration method for DCA models The presented reintegration procedure is particularly well-suited for application to DCA models, as it does not require modifying their standard inference methods (differently from previous reintegration attempts [ 18 ]). In its conventional implementation, DCA assumes that the natural data distribution is described by a probability P 1 ( a ) in the form of a Potts model, with a = ( a 1 , …, a L ) denoting, according to the problem under study, an aligned nucleotide or amino-acid sequence of length L . The optimal parameters { h 1 , J 1 } for P 1 ( a ) are inferred via Maximum Likelihood Estimation (MLE) [ 12 ], where 𝒟 N is an MSA of all sequenced homologs of the family under consideration. We omit here the standard sequence reweighting procedure [ 19 ] commonly used in DCA, to simplify notation, but it can be included straightforwardly. Our publicly available implementation does contain it. The reintegration data consist of a second alignment, 𝒟 T , of experimentally tested sequences, together with experimental outcomes encoded by the adjustment weight w ( b ). The reintegrated model P 2 ( a ) remains in the Potts-model form, but with updated parameters { h 2 , J 2 } chosen to maximize the new objective function In standard DCA inference, the MLE conditions require the one-point marginals and two-point marginals of P 1 ( a ) to match the empirical frequencies f i ( a ) and f ij ( a, b ) observed in 𝒟 N : where δ a,b is the Kronecker delta. By contrast, maximizing 𝒬 imposes choosing { h 2 , J 2 } so that the one-point and two-point marginals of P 2 ( a ) match the 𝒟 T -corrected frequencies (see Supplementary Section S1 ): where Hence, the maximization condition becomes for all i, j and all a, b . Thus, any standard DCA inference method may be used simply by replacing the original empirical frequencies f with the corrected frequencies from Eq. (12) . Consequently, implementing reintegration in a DCA pipeline requires only altering the frequency targets used during parameter inference. It is important to note, however, that negative values of w ( b ) (i.e. experimentally tested non-functional sequences) can break the convexity of the problem, removing guaranteed uniqueness or even existence of a solution. Additionally, the corrected frequencies may sometimes lie outside the interval [0, 1]. As a result, convergence to a consistent DCA model is not strictly guaranteed, especially for strong reintegration strengths λ . Nonetheless, in all of our applications, we found that moderate settings of λ do converge reliably. B. Data, RNA fitness proxy & Group I intron experimental activity 1. RFAM sequence data − The RNA datasets used in this paper are MSA of three RNA families. These families are the tRNA family RF00005 (number of sequences |𝒟 N | = 30000, number of residues L = 71), the SAM riboswitch family RF0162 (|𝒟 N | = 6113, L = 108), and the Glycine riboswitch family RF0504 (|𝒟 N | = 4600, L = 94). For all three RNA families, the corresponding consensus secondary structure is available from the Rfam database [ 7 ]. The Datasets are taken from [ 15 ]. 2. RNA fitness proxy − To study the reintegration procedure, we need a fast and reliable way to label our datasets. Performing experiments on RNA sequence functionality is both expensive and time-consuming, so we initially used computational tools as fitness proxies to label our datasets. Since we have access to the RNA families’ consensus secondary structures, we employed the RNAeval function provided by the Vienna Package [ 14 ]. RNAeval calculates the thermodynamic free energy F (using the Turner 2004 energy model [ 13 ]) of an RNA sequence folded onto a given secondary structure. Our underlying assumption is that well-generated sequences will, on average, exhibit low free energy F when folded onto the family’s consensus structure, indicating structural stability. In contrast, poorly generated sequences may lack structural stability or fold into alternative structures, and are expected to have a higher free energy. Therefore, we will use − F as a fitness proxy. This approach leverages the established structure/function relationship in RNA [ 20 , 21 ], and enables us to computationally label sequences for a given target secondary structure. 3. Group I Intron Ribozymes Data − The Group I intron datasets utilized in this paper are taken from the study by Lambert et al. [ 2 ]. These datasets consist of multiple sequence alignments (MSAs) of Group I intron ribozymes. The natural data D N consists of an MSA of N = 817 natural Group I introns aligned against the reference sequence, the Azoarcus Group I intron (numbeer of residues L = 197). The reintegration dataset D T consists of a subset of 14099 out of the original 24071 experimentally labeled sequences. Specifically, D T includes the sequences with mutational distance from 3 to 60, excluding those near the activity threshold. Sequences with mutational distances exceeding 60 were not reintegrated due to the unreliable detection of activity signals beyond this range (see Supplementary Section S4 ). 4. Group I Experimental Activity − To experimentally validate the reintegration methods, we conducted experiments on artificially designed Group I intron ribozymes. The experimental activity was determined using the same methodology as the tested ribozymes in Lambert et al. [ 2 ]. The experimental assay consists of a high-throughput screening of self-splicing catalytic activity, details are provided in the Supplementary Section S4 . 5. Protein data The protein datasets used in this paper are taken from the study by Russ et al. [ 1 ]. These datasets consist of MSAs of the chorismate mutase (CM) enzyme. 𝒟 N consists of the MSA of the natural CM homologs (number of sequences N = 1130, number of residues L = 96) and the reintegration dataset 𝒟 T is the alignment ( N = 1003, L = 96) of DCA-generated artificial CM variants. In [ 1 ], both datasets are labeled based on experimental testing: all protein sequences were expressed in genetically engineered E. coli strains, each modified to produce one of the CMs from 𝒟 N or 𝒟 T instead of their natural wildtype variants. These E. coli strains were then tested for growth under selective conditions; sequences enabling E. coli growth were labeled as functional, whereas those that did not were labeled as non-functional. It is noteworthy that many natural sequences in 𝒟 N were non-functional in E. coli under the experiment’s conditions: the natural CM homologs have undergone evolutionary adaptations specific to their native hosts, and may fail to provide growth in the E. coli environment. DATA AND CODE AVAILABILITY All data and code used in this study are publicly available at Zenodo: 10.5281/zenodo.15115193 . 6. Conflict of interest statement None declared. ACKNOWLEDGEMENTS We acknowledge helpful discussions with Camille Lambert, Roberto Netti, Vaitea Opuu, Andrea Pagnani, Lorenzo Rosset and Francesco Zamponi during the preparation of this work, and we thank particularly Lorenzo Rosset for providing his GPU-based adabmDCA implementation prior to publication, and Vaitea Opuu for his help with the initial analysis of the experimental data using the scripts of [ 2 ]. Footnotes https://zenodo.org/records/15115193 References [1]. ↵ W. P. Russ , M. Figliuzzi , C. Stocker , P. Barrat-Charlaix , M. Socolich , P. Kast , D. Hilvert , R. Monasson , S. Cocco , M. Weigt , et al. , “ An evolution-based model for designing chorismate mutase enzymes ,” Science , vol. 369 , no. 6502 , pp. 440 – 445 , 2020 . OpenUrl Abstract / FREE Full Text [2]. ↵ C. N. Lambert , V. Opuu , F. Calvanese , F. Zamponi , E. Hayden , M. Weigt , M. Smerlak , and P. Nghe , “Expanding the space of self-reproducing ribozymes using probabilistic generative models.” July 2024 . [3]. ↵ A. Madani , B. Krause , E. R. Greene , S. Subramanian , B. P. Mohr , J. M. Holton , J. L. Olmos , Jr, C. Xiong , Z. Z. Sun , R. Socher , J. S. Fraser , and N. Naik , “ Large language models generate functional protein sequences across diverse families ,” Nat. Biotechnol ., vol. 41 , pp. 1099 – 1106 , Aug . 2023 . OpenUrl CrossRef PubMed [4]. ↵ J. F. de Cossio-Diaz , P. Hardouin , F.-X. L. du Moutier , A. D. Gioacchino , B. Marchand , Y. Ponty , B. Sargueil , R. Monasson , and S. Cocco , “Designing molecular RNA switches with restricted boltzmann machines,” May 2023 . [5]. ↵ A. Hawkins-Hooker , F. Depardieu , S. Baur , G. Couairon , A. Chen , and D. Bikard , “ Generating functional protein variants with variational autoencoders ,” PLoS Comput. Biol ., vol. 17 , p. e1008736 , Feb . 2021 . OpenUrl CrossRef PubMed [6]. ↵ S. Lyu , S. Sowlati-Hashjin , and M. Garton , “ Variational autoencoder for design of synthetic viral vector serotypes ,” Nat. Mach. Intell ., vol. 6 , pp. 147 – 160 , Jan . 2024 . OpenUrl CrossRef [7]. ↵ I. Kalvari , E. P. Nawrocki , N. Ontiveros-Palacios , J. Argasinska , K. Lamkiewicz , M. Marz , S. Griffiths-Jones , C. Toffano-Nioche , D. Gautheret , Z. Weinberg , E. Rivas , S. R. Eddy , R. Finn , A. Bateman , and A. I. Petrov , “ Rfam 14: expanded coverage of metagenomic, viral and microRNA families ,” Nucleic Acids Research , vol. 49 , pp. D192 – D200 , 11 2020 . OpenUrl [8]. J. Mistry , S. Chuguransky , L. Williams , M. Qureshi , G. A. Salazar , E. L. L. Sonnhammer , S. C. E. Tosatto , L. Paladin , S. Raj , L. J. Richardson , R. D. Finn , and A. Bateman , “ Pfam: The protein families database in 2021 ,” Nucleic Acids Res ., vol. 49 , pp. D412 – D419 , Jan . 2021 . OpenUrl CrossRef PubMed [9]. E. P. Nawrocki and S. R. Eddy , “ Infernal 1.1: 100-fold faster RNA homology searches ,” Bioinformatics , vol. 29 , pp. 2933 – 2935 , 09 2013 . OpenUrl CrossRef PubMed Web of Science [10]. ↵ R. D. Finn , J. Clements , and S. R. Eddy , “ HMMER web server: interactive sequence similarity searching ,” Nucleic Acids Res ., vol. 39 , pp. W29 – 37 , July 2011 . OpenUrl CrossRef PubMed Web of Science [11]. ↵ A. P. Muntoni , A. Pagnani , M. Weigt , and F. Zamponi , “ adabmdca: adaptive boltzmann machine learning for biological sequences ,” BMC bioinformatics , vol. 22 , no. 1 , pp. 1 – 19 , 2021 . OpenUrl CrossRef PubMed [12]. ↵ F. Cuturello , G. Tiana , and G. Bussi , “ Assessing the accuracy of direct-coupling analysis for RNA contact prediction ,” RNA , vol. 26 , no. 5 , pp. 637 – 647 , 2020 . OpenUrl Abstract / FREE Full Text [13]. ↵ D. H. Mathews , M. D. Disney , J. L. Childs , S. J. Schroeder , M. Zuker , and D. H. Turner , “ Incorporating chemical modification constraints into a dynamic programming algorithm for prediction of RNA secondary structure ,” Proc. Natl. Acad. Sci. U. S. A ., vol. 101 , pp. 7287 – 7292 , May 2004 . OpenUrl Abstract / FREE Full Text [14]. ↵ R. Lorenz , S. H. Bernhart , C. H. zu Siederdissen , H. Tafer , C. Flamm , P. F. Stadler , and I. L. Hofacker , “ ViennaRNA package 2.0 ,” Algorithms for Molecular Biology , vol. 6 , Nov . 2011 . [15]. ↵ F. Calvanese , C. N. Lambert , P. Nghe , F. Zamponi , and M. Weigt , “ Towards parsimonious generative modeling of RNA families ,” Nucleic Acids Research , vol. 52 , pp. 5465 – 5477 , 04 2024 . OpenUrl CrossRef PubMed [16]. ↵ J. P. Barton , A. K. Chakraborty , S. Cocco , H. Jacquin , and R. Monasson , “ On the entropy of protein families ,” Journal of Statistical Physics , vol. 162 , pp. 1267 – 1293 , 2016 . OpenUrl CrossRef [17]. ↵ P. Kerpedjiev , S. Hammer , and I. L. Hofacker , “ Forna (force-directed RNA): Simple and effective online RNA secondary structure diagrams ,” Bioinformatics , vol. 31 , pp. 3377 – 3379 , Oct . 2015 . OpenUrl CrossRef PubMed [18]. ↵ P. Barrat-Charlaix , M. Figliuzzi , and M. Weigt , “ Improving landscape inference by integrating heterogeneous data in the inverse ising problem ,” Sci. Rep ., vol. 6 , Nov . 2016 . [19]. ↵ S. Cocco , C. Feinauer , M. Figliuzzi , R. Monasson , and M. Weigt , “ Inverse statistical physics of protein sequences: a key issues review ,” Reports on Progress in Physics , vol. 81 , no. 3 , p. 032601 , 2018 . OpenUrl CrossRef PubMed [20]. ↵ S. M. Assmann , H.-L. Chou , and P. C. Bevilacqua , “ Rock, scissors, paper: How RNA structure informs function ,” The Plant Cell , vol. 35 , pp. 1671 – 1707 , 02 2023 . OpenUrl CrossRef PubMed [21]. ↵ R. C. Spitale and D. Incarnato , “ Probing the dynamic RNA structurome and its functions ,” Nat. Rev. Genet ., vol. 24 , pp. 178 – 196 , Mar . 2023 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted April 05, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Integrating experimental feedback improves generative models for biological sequences Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Integrating experimental feedback improves generative models for biological sequences Francesco Calvanese , Giovanni Peinetti , Polina Pavlinova , Philippe Nghe , Martin Weigt bioRxiv 2025.03.31.646327; doi: https://doi.org/10.1101/2025.03.31.646327 Share This Article: Copy Citation Tools Integrating experimental feedback improves generative models for biological sequences Francesco Calvanese , Giovanni Peinetti , Polina Pavlinova , Philippe Nghe , Martin Weigt bioRxiv 2025.03.31.646327; doi: https://doi.org/10.1101/2025.03.31.646327 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7633) Biochemistry (17680) Bioengineering (13889) Bioinformatics (41927) Biophysics (21445) Cancer Biology (18585) Cell Biology (25491) Clinical Trials (138) Developmental Biology (13373) Ecology (19897) Epidemiology (2067) Evolutionary Biology (24308) Genetics (15606) Genomics (22494) Immunology (17736) Microbiology (40385) Molecular Biology (17175) Neuroscience (88583) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4822) Physiology (7641) Plant Biology (15149) Scientific Communication and Education (2045) Synthetic Biology (4293) Systems Biology (9822) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00