Full text
75,535 characters
· extracted from
preprint-html
· click to expand
Ambient Proteins: Training Diffusion Models on Low Quality Structures | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Ambient Proteins: Training Diffusion Models on Low Quality Structures Giannis Daras , View ORCID Profile Jeffrey Ouyang-Zhang , Krithika Ravishankar , William Daspit , Costis Daskalakis , Qiang Liu , Adam Klivans , View ORCID Profile Daniel J. 
Diaz doi: https://doi.org/10.1101/2025.07.03.663105 Giannis Daras 1 CSAIL, MIT Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jeffrey Ouyang-Zhang 2 Computer Science, UT Austin Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jeffrey Ouyang-Zhang Krithika Ravishankar 2 Computer Science, UT Austin Find this author on Google Scholar Find this author on PubMed Search for this author on this site William Daspit 2 Computer Science, UT Austin Find this author on Google Scholar Find this author on PubMed Search for this author on this site Costis Daskalakis 1 CSAIL, MIT Find this author on Google Scholar Find this author on PubMed Search for this author on this site Qiang Liu 2 Computer Science, UT Austin Find this author on Google Scholar Find this author on PubMed Search for this author on this site Adam Klivans 2 Computer Science, UT Austin Find this author on Google Scholar Find this author on PubMed Search for this author on this site Daniel J. Diaz 2 Computer Science, UT Austin Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Daniel J. Diaz For correspondence: danny.diaz{at}utexas.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract We present Ambient Protein Diffusion , a framework for training protein diffusion models that generates structures with unprecedented diversity and quality. State-of-the-art generative models are trained on computationally derived structures from AlphaFold2 (AF), as experimentally determined structures are relatively scarce. The resulting models are therefore limited by the quality of synthetic datasets. Since the accuracy of AF predictions degrades with increasing protein length and complexity, de novo generation of long, complex proteins remains challenging. 
Ambient Protein Diffusion overcomes this problem by treating low-confidence AF structures as corrupted data. Rather than simply filtering out low-quality AF structures, our method adjusts the diffusion objective for each structure based on its corruption level, allowing the model to learn from both high and low quality structures. Empirically, Ambient Protein Diffusion yields major improvements: on proteins with 700 residues, diversity increases from 45% to 86% from the previous state-of-the-art, and designability improves from 68% to 86%. We will make all of our code, models and datasets available under the following repository: https://github.com/jozhang97/ambient-proteins . 1 Introduction Proteins are the fundamental building blocks of life. They accelerate chemical reactions by many orders of magnitude, convert sunlight into food, and underpin the myriads of processes within cells and organisms with the level of accuracy and precision required to sustain life [ 6 , 31 ]. Unlike computational protein engineering—which focuses on improving the developability or function of existing proteins through computationally guided mutations for practical biotechnological applications [ 21 , 39 , 24 , 40 , 8 , 35 , 20 , 51 ]— de novo protein design aims to create entirely new proteins with specified structures and functions, ultimately seeking to discover folds and activities not found in nature [ 12 ]. Since protein function is largely determined by tertiary and quaternary structure, generative machine learning frameworks for protein design focus on learning the sparse, evolutionarily sampled landscape of protein structures, with the goal of generating novel, functional backbone scaffolds beyond those observed in nature [ 49 , 32 , 33 , 27 , 23 , 52 , 22 , 7 , 54 , 46 ]. Recent breakthroughs in machine learning–based structure prediction—most notably AlphaFold2 [ 28 ]—have made it possible to infer accurate protein structures directly from sequence [ 28 , 11 , 34 ]. 
This progress has enabled the creation of large-scale structural resources such as the AlphaFold Database (AFDB), which contains over 214M predicted structures from UniProtKB sequences [ 13 , 45 ]. In parallel, high-throughput tools for sequence and structure comparison, such as MMSeqs2 and FoldSeek, have facilitated the curation of large, diverse training datasets from AFDB [ 5 ]. Among them, the 2.3M AFDB cluster dataset has already been shown to improve the capabilities of generative models for protein structure design [ 33 , 23 ]. The quality of a generative model depends on the size and fidelity of its training data. While AlphaFold2 (AF) has enabled large-scale protein structure prediction, its outputs often contain biological or computational inaccuracies [ 50 ]. To estimate the reliability of a predicted structure, AlphaFold provides a per-residue confidence score, the predicted Local Distance Difference Test (pLDDT), which is a proxy for local structural accuracy. In practice, researchers frequently filter predicted structures based on average pLDDT scores, training only on high-confidence subsets (typically using a cutoff of pLDDT > 80). However, lower pLDDT scores are disproportionately associated with longer and more structurally complex proteins. As a result, filtering based on pLDDT introduces a bias toward smaller, simpler folds, reducing structural diversity in the training set and impairing the model’s ability to generalize to more complex regions of structure space—including longer proteins. Notably, many low-pLDDT structures still contain well-folded domains that are misoriented with respect to each other, as reflected by low predicted alignment error (pAE). These structures can still offer valuable domain-level and coarse-grained information about the structure distribution, which is discarded by overly aggressive filtering. To mitigate these issues, we depart from the standard paradigm of aggressive filtering of low-confidence structures. 
Instead, we introduce Ambient Protein Diffusion —a framework for training diffusion models that incorporates proteins with noisy or incomplete structures directly into the training process. Ambient Protein Diffusion builds on recent advances in learning generative models from corrupted data [ 14 , 16 , 1 , 15 , 2 , 41 , 4 , 48 , 17 , 36 , 42 ], which have explored controlled corruption settings such as additive Gaussian noise [ 14 , 16 , 1 , 17 ] and masking [ 15 , 2 ]. Our framework generalizes these techniques to arbitrary, unknown corruption processes, enabling the training of generative models in scientific domains where the corruption mechanism is complex and non-parametric. In our setting, the AlphaFold prediction errors represent such a corruption: they are structured, not explicitly modeled, and vary across protein size and topology. Yet, our method effectively leverages these imperfect samples to significantly advance the capabilities of generative protein models. For example, on proteins with 700 residues, our 16.7M parameter model improves diversity from 45% to 86% and increases designability from 68% to 86% compared to the previous state-of-the-art, Proteína [ 23 ], a 200M parameter model. Below, we summarize our key contributions: We generalize recent approaches for training generative models on corrupted data to handle arbitrary, non-parametric, and unknown corruption processes, enabling their application to scientific domains. We demonstrate that our framework, Ambient Protein Diffusion, effectively leverages low-pLDDT AlphaFold predictions, allowing the model to learn from all available samples without distorting the underlying structure distribution. We further construct a new training set from the AFDB cluster dataset optimized for geometric diversity irrespective of their evolutionary relationship, yielding a broader and more representative sampling of structural space for generative modeling. 
We achieve state-of-the-art results in both diversity and designability for protein generation, improve diversity by 45% and designability by 24% on long proteins (800 residues), and establish the Pareto frontier between these objectives on short proteins ( < 256 residues). Our models also achieve state-of-the-art novelty scores in both short and long protein generation, indicating much lower memorization of the training set. 2 Background and Related Work De novo Protein Generation Most de novo protein generation frameworks that operate in structure space follow a three-step pipeline: (1) a generative model samples a three-dimensional backbone structure; (2) an inverse folding model (e.g., ProteinMPNN [ 19 ]) proposes amino acid sequences likely to fold into the generated backbone; and (3) these sequences are evaluated by a structure prediction model (e.g., ESMFold [ 34 ]) to identify the ones that best recapitulate the target fold. Pioneering methods such as RFDiffusion [ 49 ] and Chroma [ 27 ] have established strong baselines for backbone generation. More recent advances include Genie [ 32 ], which introduces a denoising diffusion model with an SE(3)-equivariant network that generates proteins as point clouds of reference frames; Genie2 [ 33 ], which scales Genie using synthetic AlphaFold structures to improve training data diversity; and Proteína [ 23 ], which replaces diffusion with flow matching and scales both model size and dataset scale by orders of magnitude to improve performance on longer and more complex monomeric proteins. Ambient Protein Diffusion is built using the Genie architecture and makes use of ambient protein diffusion to achieve state-of-the-art results with substantially shorter training times, much fewer parameters (16.7M vs 200M), and significantly reduced computational requirements. 
Training Datasets Recent advances in structure prediction—most notably AlphaFold2 [ 28 ] and ESMFold [ 34 ]—have dramatically expanded the available structural data, enabling the prediction of ∼ 214M and ∼ 617M monomeric protein structures from UniProtKB (via the AlphaFold Protein Structure Database) [ 45 ] and metagenomic libraries (via the ESM Atlas) [ 34 ], respectively. While this explosion of computational structures presents unprecedented opportunities, it also poses significant challenges for downstream bioinformatic analysis and model training, particularly due to the scale, redundancy, and uneven quality of the predicted structures. To address this, prior work applied MMSeqs2 [ 26 ] and FoldSeek [ 44 ] to cluster the AlphaFold Database (AFDB), yielding ∼ 2.3M clusters shown to capture evolutionary relationships between predicted structures [ 5 ]. This AFDB cluster dataset has since served as the foundational dataset to train several generative protein structure models [ 23 , 33 ]. In this work, we apply a reparameterized FoldSeek to the AFDB cluster dataset to maximize geometric diversity rather than evolutionary insights. Our goal is to construct a dataset better suited for learning a generative model of protein structure space—one that emphasizes structural rather than evolutionary variation. Starting from the 2.3 million AFDB clusters, we use the cluster representatives with average pLDDT > 70 ( ∼ 1.29M structures) and apply our geometric clustering procedure. The resulting dataset comprises roughly 292K structurally diverse clusters. Diffusion Models The goal in diffusion modeling is to sample from an unknown density p 0 from which we have samples available. Formally, let $\mathcal{D} = \{x_0^{(i)}\}_{i=1}^{N}$ be a dataset of N independent samples, where $x_0^{(i)} \sim p_0$. The unknown distribution p 0 is potentially complex, high-dimensional and multimodal. 
To make the sampling problem more tractable, in diffusion modeling we target smoothened densities p t defined as the convolution with a Gaussian: $p_t = p_0 * \mathcal{N}(0, \sigma^2(t) I_d)$, where σ ( t ) is an increasing function of t , with σ (0) = 0. In particular, the object of interest in diffusion modeling is the score-function of the smoothened densities, defined as $\nabla_{x_t} \log p_t(x_t)$. The latter is connected to the optimal denoiser (in the l 2 sense) through Tweedie’s Formula: $\nabla_{x_t} \log p_t(x_t) = \frac{\mathbb{E}[X_0 \mid X_t = x_t] - x_t}{\sigma^2(t)}$. Given access to 𝔼 [ X 0 | X t = x t ] one can sample from the distribution p 0 of interest by running a discretized version of a reverse diffusion process [ 3 , 43 , 9 , 10 ]. Hence, the sampling problem becomes equivalent to the problem of approximating the set of functions $\{\mathbb{E}[X_0 \mid X_t = \cdot\,]\}_{t}$. Given a sufficiently rich family of functions { h θ : θ ∈ Θ}, the conditional expectation at a particular time t can be learned by minimizing the objective: $$J(\theta) = \mathbb{E}_{x_0 \sim p_0,\; x_t \sim \mathcal{N}(x_0,\, \sigma^2(t) I_d)} \left[ \left\| h_\theta(x_t, t) - x_0 \right\|^2 \right]. \quad (1)$$ In the context of protein diffusion models for backbones, X 0 captures the 3-D co-ordinates for each residue of the protein. Learning from noisy data Recent work has explored the problem of learning diffusion models from corrupted data. Typically, the corruption process is simple, e.g. it can be additive Gaussian noise as in [ 14 , 16 , 1 ], or masking as in [ 1 , 2 ]. Even in works where the corruption process is more general, the degradation needs to be known and multiple diffusion trainings are required until an Expectation-Maximization algorithm converges [ 41 , 4 ]. In this work, we deviate from this setting as the corruption process is unknown and complex, which may include AlphaFold learning and hallucination errors, and noise inherent to the structural biology technique used to solve the structure, etc. We also target a single diffusion training instead of performing multiple EM iterations. The method is detailed in Section 3 . Our work generalizes the techniques developed in [ 14 , 16 ] for the additive Gaussian noise case. 
Particularly, in [ 16 ], the authors consider learning from a dataset of samples noised with additive Gaussian noise of different variances $\{\sigma^2(t_i)\}_{i=1}^{N}$. Formally, let $X_{t_i} = X_0 + \sigma(t_i) Z$, where X 0 ∼ p 0 , Z ∼ 𝒩 (0, I d ). Each point contributes to the learning only for t ≥ t i , using the objective: $$J(\theta) = \sum_{i=1}^{N} \mathbb{E}_{t \ge t_i}\, \mathbb{E}_{x_t \mid x_{t_i}^{(i)}} \left[ \left\| \frac{\sigma^2(t) - \sigma^2(t_i)}{\sigma^2(t)}\, h_\theta(x_t, t) + \frac{\sigma^2(t_i)}{\sigma^2(t)}\, x_t - x_{t_i}^{(i)} \right\|^2 \right]. \quad (2)$$ As the number of samples grows to infinity, Equation 2 also recovers the conditional expectation 𝔼[ X 0 | X t = x t ], but it does so while being able to utilize noisy samples. This objective recovers the true minimizer because one can prove that the conditional expectation $\mathbb{E}[X_{t_i} \mid X_t = x_t]$ lies on the line that connects the current noisy point x t and the prediction of the clean image, 𝔼[ X 0 | X t = x t ]. Distribution merging under noise A key idea in our method will be that distribution distances contract as we increase the noise level added. In the context of diffusion models, this property has been leveraged in the SDEdit [ 38 ] paper to allow diffusion models to perform stroke-based editing at inference time without any finetuning. Most relevant to our work, Ambient Omni [ 18 ] uses the distribution contraction property (together with other innovations) to use blurry and out-of-distribution data during training. In this work, we focus on synthetic data from AlphaFold that arise from various corruption sources: biological ones (errors in the crystallography process), computational ones (errors in the solution of the inverse problem from the crystallography data to the modeled structure), and learning ones (AlphaFold mistakes due to the limited size of the training dataset and hallucinations). 3 Method 3.1 Building Intuition We are given access to samples from the AlphaFold distribution $\tilde{p}_0$ and aim to learn how to sample from the true distribution of experimentally solved structures, p 0 , without an explicit degradation model mapping $p_0$ to $\tilde{p}_0$. Our key insight is that, regardless of how $\tilde{p}_0$ deviates from p 0 , adding noise to both distributions causes them to contract toward one another. 
As the noise level increases, the noised AlphaFold distribution $\tilde{p}_t$ and the noised true distribution p t become progressively more aligned. This is because it is known that Gaussian noise contracts distribution distances (KL divergence) in the following sense: $D_{\mathrm{KL}}(p_t \,\|\, \tilde{p}_t) \le D_{\mathrm{KL}}(p_{t'} \,\|\, \tilde{p}_{t'})$ for all $t \ge t'$. In fact, as t→ ∞ , we have that: $D_{\mathrm{KL}}(p_t \,\|\, \tilde{p}_t) \to 0$, as both distributions converge to the same Gaussian. We now define the concept of merging of two distributions towards the same measure. Definition 3.1 ( ϵ -merged) We say that two distributions, p and $\tilde{p}$, are ϵ-merged if the KL distance between the two is upper-bounded by ϵ, i.e., if $D_{\mathrm{KL}}(p \,\|\, \tilde{p}) \le \epsilon$. Similarly, we define the merging time of two distributions as the minimal amount of noise we need to add such that the two distributions become ϵ -merged. Formally, Definition 3.2 ( ϵ -merging time) Let $p_0$ and $\tilde{p}_0$ be two distributions, with noised versions $p_t$ and $\tilde{p}_t$. We define their ϵ-merging time as follows: $t_\epsilon(p_0, \tilde{p}_0) = \inf\{\, t \ge 0 : D_{\mathrm{KL}}(p_t \,\|\, \tilde{p}_t) \le \epsilon \,\}$. Assuming we can estimate the ϵ -merging time $t_\epsilon$ between two distributions p and $\tilde{p}$, our key idea is to treat samples from $\tilde{p}_t$ as approximate samples from p t for all timesteps $t \ge t_\epsilon$. This idea is illustrated in Figure 2 . The intuition is that once the distributions have sufficiently merged under noise, the residual shift becomes negligible and samples from $\tilde{p}_t$ can be used for learning p t . This holds because: (i) the learning algorithm may not be sensitive to small distributional discrepancies at high noise levels, and (ii) even if some bias is introduced, the remaining diffusion trajectory for times $t < t_\epsilon$ is robust to small initial distributional mismatch due to its inherent stochasticity. Download figure Open in new tab Figure 1. Long protein generation performance. We train Ambient Protein Diffusion on proteins up to 768 residues and sample sequences ranging from 300 to 800 residues. Our 17M-parameter model significantly outperforms the previous state-of-the-art Proteína [ 23 ], which is a 200M-parameter model. Ambient Protein Diffusion generates both diverse and designable structures across all lengths. Download figure Open in new tab Figure 2. 
Overview of Ambient Protein Diffusion on AlphaFold structures. Rows 1-3 show the noising process (from left to the right) of three different AlphaFold proteins based on their average pLDDT (top: high, middle: medium, and bottom: low). These proteins are only used during training at the green diffusion times. At these noise levels, any initial AlphaFold prediction errors in low-pLDDT proteins have effectively been “erased” by the added noise, and the distributions of low- and high-pLDDT proteins have merged. Sample dependent noise levels At a high level, our objective is to determine the ϵ -merging time between the distribution of AlphaFold-predicted structures and that of experimentally resolved proteins. A key challenge arises from the fact that the AlphaFold distribution is highly heterogeneous in structural fidelity—that is, the accuracy with which AlphaFold predicts the true protein structure varies widely across samples. It is well established that short, structurally simple proteins are predicted with higher confidence, while longer and more complex proteins tend to yield lower-confidence predictions. This trend is illustrated in Figure 3B (Left). If we were to assign a single noise level across the entire AlphaFold dataset, we would need to select a relatively high noise level to accommodate the lowest-confidence predictions, particularly from long proteins. This would unnecessarily degrade the training signal for high-confidence structures—regardless of protein length—and limit the model’s ability to learn from clean supervision. To address this, we treat the AlphaFold dataset as a mixture of K sub-distributions, q 1 , q 2 , …, q K , each representing a distinct confidence regime. We then assign each sub-distribution an appropriate noise level, sufficient to bring it ϵ -close to the distribution of high-confidence structures under the same noise schedule. 
This formulation allows the model to effectively learn from high-confidence AlphaFold predictions and incorporate low-confidence structures in a controlled manner, mitigating the degradation typically caused by noisy training data. Download figure Open in new tab Figure 3. Reclustering the AFDB cluster dataset to improve generative protein modeling. (A) Starting from the 2.3M AFDB clustered dataset, we cluster the representatives with FoldSeek optimized for geometric similarity: alignment-type set to TM-Align, TM threshold set to 0.5, and coverage set to 0.75. This results in ∼ 300K clusters (pLDDT > 70) from which we keep the representatives for training. (B) pLDDT and protein length statistics for our new training set. (C) Overlay of two Ambient clusters: Top row: Representative (beige; UniProt A0A2W1EPG1) overlaid with members A0A820X4G2 (left, red) and A0A446YZW1 (right, red). Bottom row: Representative (cyan; UniProt A0A395XDB6) overlaid with members A0A3B0YRI6 (left, blue) and L0S475 (right, blue). In the original AFDB cluster dataset, each of these six proteins was designated as the representative of its own cluster despite their similarities in structural features. A natural way to decompose the AlphaFold distribution into a mixture of quality-specific sub-distributions is to leverage AlphaFold’s self-reported confidence metric—the average predicted Local Distance Difference Test (pLDDT) score—as a proxy for predicted structural fidelity. In particular, given a dataset $\mathcal{D} = \{(x^{(i)}, \mathrm{pLDDT}^{(i)})\}_{i=1}^{N}$, we consider K distributions (where K is a hyperparameter to be chosen) with empirical observations for the j -th distribution being all the samples $\{x^{(i)} : \tau_{j-1} \le \mathrm{pLDDT}^{(i)} < \tau_j\}$, for some hyperparameters $\tau_0 < \tau_1 < \dots < \tau_K$. 
Choice of sub-distribution boundaries In this work, we adopt a deliberately simple and conservative strategy by partitioning the AlphaFold dataset into three discrete quality regimes based on the average pLDDT score: high-quality proteins (pLDDT > 90), medium-quality proteins (pLDDT in [80, 90]) and low-quality proteins (pLDDT in [70, 80]). We acknowledge that this discretization is coarse and that more principled alternatives may yield further improvements—for instance, by optimizing the bin boundaries or learning a continuous mapping from pLDDT to diffusion time. Despite the simplicity of our choices, our experimental results demonstrate that even a naive quality-aware decomposition can lead to important gains in performance across both short and long proteins. It is important to emphasize that there are two sources of benefit over filtering methods: 1) low-quality data (previously discarded) increases diversity, and 2) the distinction we draw between medium-quality and high-quality data boosts designability. The population of proteins that have pLDDT lower than 70 has a very high merging time, and we did not see significant benefits from including them. To improve training efficiency and save computational resources, we decided to discard this subpopulation. We underline that training algorithms that are adaptive to local corruption (rather than global, i.e. average pLDDT) might be able to benefit from such proteins – we leave this direction for future work. 3.2 Ambient Protein Diffusion Algorithm Our algorithm takes as input a dataset of protein structures together with their average pLDDT score, $\{(x^{(i)}, \mathrm{pLDDT}^{(i)})\}_{i=1}^{N}$, a diffusion schedule, σ ( t ), and a mapping function f : [0, 100] → ℝ + that translates the average pLDDT value of a protein to its estimated ϵ -merging time. Annotation stage The first step of the algorithm replaces each protein in the dataset with a noisy version of itself, where the noise level is determined by mapping function f . 
The mapping function f is a hyperparameter for our algorithm – in our experiments, we opt for a rather simple choice for this function (see Appendix Table 8 for a full description of our training configuration), but in principle this can be an arbitrary function defined by the user based on their specific domain knowledge or experimental findings. After this transformation, each protein can be treated as a sample from the target distribution convolved with a Gaussian at its assigned noise level. This transformation step is only performed once during dataset preprocessing, i.e., we replace the low-quality protein with a noisy version of itself before we start the training. This is important because adding different noise realizations across epochs can lead to recovery of the original low-quality protein if the noise is averaged out. Loss function After the annotation stage, we need to solve a training problem where we have data corrupted at different noise levels with additive Gaussian noise, as in [ 14 , 16 ]. Hence, we can use the objective of Equation 2 . Instead of directly applying the loss, we first need to rescale each time t to account for the vanishing gradient effect that is due to the multiplicative factor $a(t) = \frac{\sigma^2(t) - \sigma^2(t_i)}{\sigma^2(t)}$ that scales the network output in Equation 2 and vanishes as t approaches the assigned noise level t i . Specifically, we need to rescale the loss at time t with: $\frac{1}{a^2(t)} = \left( \frac{\sigma^2(t)}{\sigma^2(t) - \sigma^2(t_i)} \right)^2$, such that we balance the different timesteps. We underline that this rescaling was not mentioned in the original papers of Daras et al. [ 16 , 14 ] on training with noisy data. Yet, we find this rescaling critical for the success of our method. We hypothesize that the authors of [ 14 , 16 ] did not encounter this issue because there were at most two noise levels considered, while in AF predicted protein structures there is a whole spectrum of assigned noise levels based on the predicted quality (measured by average pLDDT) of a protein structure. We provide further details about the loss implementation in the Appendix (Section C) and pseudocode in Algorithm 1. 
Uniform Protein Sampling in terms of diffusion times To perform a training update for a diffusion model, we typically sample a point from the training distribution and then we uniformly sample the noise level t . However, since in our case we are dealing with noisy data, not all times t are allowed for a given protein, i.e. a protein with pLDDT ( i ) is only used for times t ≥ f (pLDDT ( i ) ). To avoid spending most of the training updates on very noisy proteins, we opt for first sampling the diffusion time and then selecting from the eligible proteins that can be used at that diffusion time. This strategy ensures balanced coverage across the diffusion trajectory—from low to high noise—while still leveraging the diversity of low-confidence structures (pLDDT < 80) in our training dataset. Summarizing Our algorithm requires three simple changes to the regular diffusion training: 1) an annotation stage (before training) where each low-quality protein is replaced with a noisy version of itself, 2) a change in the way we fetch samples from the dataset so that we do not overallocate training updates to highly noisy proteins and 3) a change in the loss function to account for the fact that for some proteins we do not have access to an uncorrupted structure. 3.3 Reclustering AFDB clusters for generative modeling applications On top of our algorithmic contributions, we also reconsider the choices made for the training dataset. The AFDB cluster dataset [ 5 ] has been used to train several generative protein models [ 33 , 23 ]. However, the original intent behind the clustering was to study structure evolution across AFDB. Thus, the hyperparameters were chosen to obtain clusters of homologous structures, and the authors report that 97.4% of pairwise comparisons within clusters are conserved at the H-group (Homology) level of the ECOD hierarchical domain classification (median TM-score 0.71). 
While these FoldSeek hyperparameters are well-suited for evolutionary analysis of AFDB, we found that the AFDB cluster dataset has a significant degree of structural duplication and near-duplication between clusters that are more distantly evolutionarily related (see Figure 3C ). This structural redundancy leads to an imbalanced training set, where structural motifs from the larger protein superfamilies are overrepresented. Given this finding, we hypothesize that the datasets for generative modeling of protein structures—particularly for backbone-based models—benefit more from clusters defined purely by geometric similarity. To that end, we construct a new clustering dataset derived from the AFDB cluster representatives, with an exclusive focus on structural topology. Specifically, these are the changes we made to the FoldSeek hyperparameters: we switch the alignment-type from 3Di+AA to TM-Align to improve fidelity, we use a TM-score threshold of 0.5, and we relax the alignment coverage from 0.9 to 0.75. We did the latter to improve clustering of AlphaFolded proteins with extended, unfolded N- or C-terminal regions (i.e., noodle tails) ( Figure 3C ). This approach produced a more balanced dataset that samples structural folds more uniformly, independent of their evolutionary relationships. Ablations that disentangle the contribution of this reclustering from our ambient training approach are given in Figure 4 . Download figure Open in new tab Figure 4. Ablation to quantify the effect of the Ambient Protein Diffusion framework. We improve upon Genie2 by making the architecture bigger, reclustering the dataset as in Section 3.3 and by finetuning on longer proteins (up to 768 amino acids). The resulting improved Genie2, shown in orange, outperforms Genie2 but still lags behind our Ambient Protein Diffusion Model, shown in blue. 
The only difference between the two models is that Ambient Protein Diffusion uses low pLDDT AlphaFold structures as noisy data, as explained in Section 3.2 . Ambient Protein Diffusion consistently outperforms both the improved Genie2 baseline and Genie2, with increasingly significant improvements as sequence length grows. 4 Experimental Results We build on the Genie2 codebase [ 33 ]. Our model architecture follows the Genie2 architecture except that it is scaled larger, using 8 triangle layers as opposed to 5. We train Ambient Protein Diffusion in 3 stages with increasingly longer proteins, eventually reaching proteins up to length 768. For details on the training process, see Appendix Section C.3, and for details on metrics, see Appendix Section B. We underline that the computational cost of training our model is relatively low compared to the prior state-of-the-art Proteína model. This is due to the decreased size of our model ( < 17M vs 200M) and training set ( ∼ 290K vs ∼ 780K). We further note that our goal is to develop models that perform well across a range of tasks, including long-protein generation, motif scaffolding, and more. To this end, we train only two models for the purposes of this paper: one model optimized for long-protein generation ( Figure 1 ) and another optimized for short-protein generation ( Figure 5 ). Download figure Open in new tab Figure 5. Designability - diversity trade-off for short protein generation (up to 256 residues). Ambient completely dominates the Pareto frontier between designability and diversity, while using a 12.88 × smaller model. We further do so without using any higher-order sampler or (auto-) guidance method. Download figure Open in new tab Figure 6. Additional qualitative visualizations of unconditional generations. 
4.1 Comparisons on unconditional generation of longer proteins In Figure 1 , we compare Ambient Protein Diffusion performance on generating backbone for proteins with length ranging from 300 to 800 residues. To directly compare with Proteína on long-protein generation, we adopt its three-stage training and evaluation protocol. During training, the maximum sequence length is capped at 768 residues. For evaluation, we sample 100 protein backbones at each target length and evaluate them using the designability and diversity metrics. Since Ambient Protein Diffusion builds on Genie2, we use the same sampling procedure—running 1000 diffusion steps with a noise scale of γ = 0.6. This noise scale parameter controls the trade-off between the designability and diversity by reducing the amount of stochasticity added in the reverse process, as it is typically done in the protein generative modeling literature (see Appendix C.4.1 for details). Ambient Protein Diffusion achieves designability and diversity scores exceeding 90% for proteins between 300 and 500 residues, and maintains scores above 85% for lengths up to 700 residues. For 800-residue proteins, both metrics decline to 68%. Compared to Proteína, Ambient Protein Diffusion outperforms by 26% in designability and 91% in diversity at length 700, and by 24% and 45%, respectively, at length 800. At every protein length, Ambient Protein Diffusion’s diversity is equal to its designability, indicating that every designable protein is unique. This is not the case for Proteína, where diversity scores consistently fall below designability, regardless of protein length. Taken together, these results demonstrate the impact of ambient diffusion on backbone-based generative models and highlight the strength of Genie2’s equivariant architecture. Our 17M parameter model trained on approximately 290K AlphaFold structures significantly outperforms a 200M-parameter transformer model trained on roughly 780K proteins. 
Our results show that smaller, more efficient models can surpass larger transformer baselines in both structural diversity and designability. 4.2 Ablating the significance of Ambient Diffusion In comparison to Genie2, the starting point of our implementation, we made the following changes: 1) made the model bigger, 2) trained on longer proteins, 3) reclustered the dataset to optimize for geometric similarity rather than evolutionary similarity, and 4) used low pLDDT AlphaFold proteins as noisy data using our Ambient Protein Diffusion framework. To quantify how much of the improvement comes from the latter step, we train an Improved Genie2 without the Ambient Framework for training with corrupted data and we report results in Figure 4 . We find that while our two models perform similarly on proteins of 300 residues, the designability and diversity of a vanilla diffusion model diminishes on longer proteins. For proteins with 800 residues, the number of designable clusters drops from 68% to 25%. Ambient Protein Diffusion shows a marked improvement, maintaining a stable number of designable clusters. We underline that the difference comes solely from the training algorithm since the network architecture, model size, optimization hyperparameters, and inference algorithm stay the same. 4.3 Comparisons on unconditional generation of shorter proteins In this experiment, we evaluate the model on the unconditional generation of shorter proteins in Figure 5 . We provide training details for the model optimized for short protein generation in the Appendix Section C.3.2. Following the Genie2 protocol, we generate 5 structures for each sequence length from 50 to 256 residues, yielding a total of 1,035 structures. The generated structures are evaluated for both designability and diversity. In line with prior work, we sweep the noise scale γ to explore the tradeoff between designability and diversity. 
Ambient Protein Diffusion outperforms previous methods on both metrics, establishing a new Pareto frontier that achieves superior performance compared to all existing models, including Proteina. 4.4 Novelty scores The next step is to show that our method does not just memorize proteins, but instead it can generate novel designable structures that are distinct from the training set. Here, we compute the TM-novelty metric following the evaluation protocol of Geffner et al. [ 23 ]. However, we found two key issues when reproducing the literature TM-novelty scores, which we discuss in detail in Appendix Section B.3. First, since Spring 2025 FoldSeek resolved a bug in computing the alntmscore (see Github issue 312), which makes all previous reported values incorrect. Second, we found that Geffner et al. [ 23 ] inadvertently used the alnTM-Score from the row with the highest qTM-Score, rather than from the row with the highest alnTM-Score. To ensure accurate and comparable benchmarking with the literature, we recalculated TM-Novelty with the patched FoldSeek v10 for several literature models, explicitly selecting the max alnTM-Score for each query. We report the results in Table 2 . For backward compatibility, we also reproduced literature results using the unpatched FoldSeek v9 and the default qTM-Score sorting (see Appendix Table 7). Moving forward, we strongly recommend that the community adopt FoldSeek v10 and always use the max alnTM-Score value to determine the novelty per query when computing TM-Novelty. View this table: View inline View popup Download powerpoint Table 1: PDB and AFDB TM-Novelty for long protein generation. For each model, we sample 100 backbones for each length: 300, 400, 500, 600, 700, and 800. Both Ambient and Proteina are models fine-tuned on proteins of length up to 768 residues. A lower TM-Novelty score is better. View this table: View inline View popup Download powerpoint Table 2: PDB and AFDB TM-Novelty for short protein generation. 
For each model, we sample 5 structures for each sequence length from 50 to 256 residues, yielding a total of 1,035 structures. A lower TM-Novelty score is better. View this table: View inline View popup Download powerpoint Table 3: Long protein generation performance. Best values per residue length are highlighted in bold. Ambient Proteins results are shown for γ = 0.6. Found in main text in Figure 1 . Using both versions of FoldSeek, Ambient Protein Diffusion sets new state-of-the-art TM-novelty scores on both the PDB and AFDB (588K) benchmarks. In the short-evaluation regime ( ≤ 256 AAs) using FoldSeek v10, Ambient Protein Diffusion exceeds the next-best model (Genie2) by 2.0% on PDB and 1.6% on AFDB—despite Genie2’s restriction to proteins no longer than 256 AAs. Against Proteina, Ambient Protein Diffusion further boosts TM-novelty by 7.0% on PDB and 4.8% on AFDB. In the long-evaluation regime (300–800 AAs), we focus on comparison with Proteina. Here, Ambient Protein Diffusion achieves TM-novelty scores of 0.682 on PDB and 0.740 on AFDB, representing improvements of 18.4% and 16.2%, respectively. Together, these results demonstrate that Ambient Protein Diffusion, driven by the Ambient loss and cluster dataset, produces the most novel proteins across both short and long-sequence settings. 4.5 Motif Scaffolding As a final evaluation, we compare our method to prior work in motif scaffolding. The full results are shown in Appendix Figure 7 and in Appendix Tables 5 and 6. With γ = 0.45, Ambient Protein Diffusion generates 1,923 unique successful scaffolds for single-motif tasks, a significant improvement over Genie2’s 1,445 [ 33 ] and performs comparably to a Proteína model (2,094 [ 23 ]), which is much larger (200M parameters vs 17M parameters) and is optimized specifically for motif scaffolding. 
For multi-motif scaffolding, Ambient Protein Diffusion generates 89 unique successful structures across 5 of the 6 problems, outperforming Genie2, which produces 40 and solves 4. Download figure Open in new tab Figure 7. Performance on Motif Scaffolding Tasks. We compare Ambient Protein Diffusion to state-of-the-art models for motif scaffolding. The graphs show the number of unique successful scaffolds generated for each single- and multi-motif task. No model produced successful scaffolds for 4JHW and 3NTN. Only Ambient Protein Diffusion produced a valid solution for multi-motif scaffolding of 2B5I . 5 Limitations and Future Work This work represents a first step toward protein generative models that make better use of the synthetic structures from the AlphaFold Database. Nevertheless, there are several clear avenues for improvement. (i) Structure-quality metric. We rely on AlphaFold’s self-reported pLDDT score—a coarse, residue-averaged confidence measure that can itself be noisy or misleading. (ii) Data coverage. We only use one representative per AFDB cluster rather than incorporating all available cluster members, and we build on the existing AFDB clustering rather than reclustering the full 214M-structure dataset. (iii) pLDDT–merging-time mapping. Our choice of how to translate pLDDT values into merging thresholds was driven by empirical tuning rather than by a systematic ablation study or principled selection criterion. (iv) Experimental validation. Ultimately, the real test of any generative model is whether its predictions hold up in the laboratory. We have yet to confirm our structures experimentally. 6 Conclusion We introduced Ambient Protein Diffusion , a framework for protein structure generation that leverages low-confidence AlphaFold structures as a source of noisy training data. Ambient Protein Diffusion enables the generation of long protein structures with unprecedented levels of designability, diversity and novelty. 
Diversity increases as it can use low-confidence AlphaFold structures that are typically discarded and designability increases as we separate the pristine-quality protein structures from the medium-quality AlphaFold predictions. Ambient Protein Diffusion represents a foundational step toward robust de novo protein design at more natural, biologically relevant lengths. A Additional Results A.1 Motif Scaffolding Results We additionally compare our method to prior work on motif scaffolding in Figure 7 , with full results provided in the supplement. Our evaluation follows the Genie2 benchmark, which comprises 24 single-motif and 6 multi-motif design tasks [ 33 , 49 ]. For each task, we generate 1,000 scaffold samples using a noise scale of γ = 0.45. A design is considered successful if it (1) satisfies Genie2’s motif designability criteria and (2) preserves the motif with an RMSD below 1 Å. Among successful designs, a scaffold counts as unique if its TM-score is at most 0.6 when compared to any other successful scaffold. A task is considered solved if at least one successful scaffold is generated. With γ = 0.45, Ambient Protein Diffusion generates 1,923 unique successful scaffolds for single-motif tasks, a significant improvement over Genie2’s 1,445 [ 33 ] and performs comparably to a Proteína model (2,094 [ 23 ]) that is much larger (200M parameters vs 17M parameters) and is optimized specifically for motif scaffolding. Notably, all methods solve a similar number of motifs – RFDiffusion solves 22 of the 24 tasks, while Ambient Protein Diffusion, Genie2, and Proteína each solve the same 23 tasks. For multi-motif scaffolding, Ambient Protein Diffusion generates 89 unique successful structures across 5 of the 6 benchmark problems, outperforming Genie2, which produces 40 and solves 4. Ambient Protein Diffusion performs particularly well on the 1PRW_four motif (38 vs. 11 successful structures) in which a scaffold is generated surrounding a calcium binding motif [ 47 ]. 
Overall, Ambient Protein Diffusion outperforms existing methods such as Genie2 and RFDiffusion on single-motif tasks and matches the performance of a Proteína model optimized specifically for motif-scaffolding. A.2 Complete Tabular Results This section presents the full numerical tables corresponding to result figures shown in the text. Specifically, Table 3 enumerates the results in Figure 1 and Figure 4 . Table 4 enumerates the results in Figure 5 . Table 5 enumerates partial results in Figure 7 . Table 6 enumerates partial results in Figure 7 . View this table: View inline View popup Download powerpoint Table 4: Designability-diversity trade-off for short protein generation. Designability and diversity for short protein generation. Found in main text in Figure 5 . View this table: View inline View popup Download powerpoint Table 5: Performance on Single Motif Scaffolding Tasks Ambient Protein Diffusion achieves superior results to Genie2 and RFDiffusion and performs on par with Proteina. Crucially, our model achieves these results zero-shot, i.e., unlike Proteina, it is not optimized for motif scaffolding and still achieves comparable performance while being an order of magnitude smaller. Found in text in Figure 7 . View this table: View inline View popup Download powerpoint Table 6: Performance on Multi Motif Scaffolding Tasks Ambient Protein Diffusion achieves consistently superior results to the predecessor Genie2 model, despite using the same architecture, i.e. the benefit comes from better use of the data. The motif 2B5I is only solved by Ambient Protein Diffusion. Found in text in Figure 7 . View this table: View inline View popup Download powerpoint Table 7: PDB and AFDB TM-Novelty for short protein generation using FoldSeek-v9 and max qTM-score row. We recompute all values using Geffner et al. [ 23 ] method. Numbers reported by Geffner et al. [ 23 ] are shown in parenthesis. 
These numbers are reported for backwards comparisons only and we strongly encourage the community to use the corrected TM-Novelty scores reported in the main text (see Appendix B.3). View this table: View inline View popup Download powerpoint Table 8: Hyperparameters of the diffusion protein model. Dashes (-) indicate that the value is the same as the previous column. The Ambient walls correspond to the assigned diffusion times based on the protein’s pLDDT (times are from 1 to 1000). Proteins with pLDDT > 90 are used everywhere. Proteins with pLDDT > 80 are used for times in [600, 1000] and proteins with pLDDT > 70 are used for times in [900, 1000]. We underline that these hyperparameters were not particularly optimized, and even more benefits might be observed by properly tuning these values. B Evaluation Metrics Evaluation of a protein generative model is challenging and there have been a few metrics that have been proposed. In what follows, we explain standard metrics in the protein-generative modeling literature that we will use in our Experimental Results section. Our experiments report using Proteína’s definitions of the metrics when possible. B.1 Designability Designability (also referred to as refoldability) assesses the structural plausibility of generated proteins. Given a generated backbone, ProteinMPNN [ 19 ] generates eight plausible amino acid sequences for that backbone. ESMFold then folds each sequence and the resulting eight structures are compared to the original backbone. The self-consistency RMSD (scRMSD) is defined as the smallest root mean squared deviation between the generated backbone and each of the eight refolded structures. A backbone is considered designable if scRMSD < 2 Å and designability is defined as the percentage of generated backbones that meet this criterion. B.2 Diversity Diversity quantifies the structural variability among the generated proteins. 
Designable backbones are clustered using Foldseek with a TM-score threshold of 0.5. Diversity is then defined as: $$\text{Diversity} = \frac{\text{number of designable clusters}}{\text{number of designable samples}}.$$ This metric reflects the proportion of structurally distinct (i.e., non-redundant) designable backbones among all designable samples. B.3 Novelty Metric definition Novelty is a metric that assesses the uniqueness of the generated backbones in comparison to existing structures in a database. We compute the novelty score with respect to both AFDB and PDB datasets following Geffner et al. [ 23 ]. To compute novelty, we measure the structural similarity of each designable protein to those in the dataset using FoldSeek’s easy-search command used by Proteina: foldseek easy-search --alignment-type 1 --exhaustive-search --tmscore-threshold 0.0 --max-seqs 10000000000 --format-output query,target,alntmscore,lddt. For each designable backbone, we keep the max alntmscore value rather than the alntmscore value of the first row, which is the max qtmscore value. The novelty of the dataset is the average of these maximum alntmscore values, representing how distinct our generated structures are from the proteins in the reference database (i.e., we do df.groupby("query")["alntmscore"].max().mean()) . Perhaps counterintuitively, high novelty is not desired since it implies high similarity to the existing database. Bug in novelty computation Several of our evaluation metrics —TM-Diversity and TM-Novelty— depend on FoldSeek’s TM-score implementation. In Fall 2024, however, FoldSeek developers identified a bug in the alntmscore output (see GitHub issue 312 titled “alntmscore output is wrong” for details), which means that all previously reported TM-based metrics in the literature that did not use FoldSeek v10 (release 10-941cd33) are incorrect. Additionally, we found that Geffner et al. [ 23 ] mistakenly computed TM-novelty by taking the alnTM-Score from the row with the highest qTM-Score, rather than from the row with the highest alnTM-Score. 
This oversight arises because Foldseek’s easy-search command, by default, sorts its output in descending order by qTM-Score—irrespective of the requested output format. To ensure accurate and comparable benchmarking with the literature, we recalculated TM-Novelty with the patched FoldSeek v10 (release 10-941cd33), explicitly selecting the maximum alnTM-Score for each query. For backward compatibility, we also reproduced literature results using the unpatched FoldSeek v9 (release 9-427df8a) and the default max qTM-Score row. Moving forward, we strongly recommend that the community adopt FoldSeek v10 and always sort using the alnTM-Score output to determine the maximum TM-Score per query and correctly compute TM-Novelty. Using both versions of FoldSeek, Ambient Protein Diffusion sets new state-of-the-art TM-novelty scores on both the PDB and AFDB (588K) benchmarks. C Full Training Algorithm and Implementation Details C.1 Additional Implementation Details Loss buffer The loss rescaling introduced in the main paper ensures balanced weighting across noise levels. At the same time, it also introduces a potential instability: the loss explodes as σ ( t ) approaches σ ( t i ). To mitigate this instability, we define a buffer zone around each protein’s assigned noise level. Specifically, given a protein’s assigned noise level t i , it is only used during training at timesteps t ≥ t i + τ , where τ is a buffer hyperparameter that controls the exclusion margin. This constraint prevents the model from encountering degenerate gradient behavior near the rescaling boundaries and is only applied to medium and low confidence structures (pLDDT < 90). We underline that this is similar to how in normal diffusion there is a buffer time zone around t = 0 that is never sampled. Ambient in high-noise regime As explained in the main paper, each protein is only used for a subset of diffusion times according to its average pLDDT value. 
The proteins that have very high pLDDT ( > 90) are considered clean data and can be used with the normal training objective. However, as found in [ 42 ], using the Ambient training objective for high-noise might be useful even if clean data is available. Intuitively, this objective prevents memorization and promotes diversity in the outputs. We ablated this design choice, and we found a slight increase in diversity for the same designability by using this. Hence, we used this tool from [ 42 ] for all our Ambient Protein Diffusion trainings. C.2 Algorithm We provide the full algorithm in Algorithm 1. We commit to open-sourcing our code and models to facilitate the broader adoption of our method by the community. Algorithm 1 Ambient Protein Diffusion: Training Algorithm. Download figure Open in new tab C.3 Model and Training Hyperparameters C.3.1 Hyperparameters for model optimized for long generation We train Ambient Protein Diffusion in 3 stages with increasingly longer proteins. In the first stage, we train on proteins from 50 to 256 residues for 200 epochs on our ambient clusters dataset using the representatives ( ∼ 196,000 proteins). Since we increased the batch size to 384 items, we adopted a learning rate schedule to improve convergence [ 25 ]. We train with the AdamW optimizer with a maximal learning rate of 1.0 × 10 − 4 . During the second and third stage, we include additional cluster representatives of at most 512 and 712 residues, which scales our dataset to ∼ 269,000 and ∼ 291,000 proteins respectively. Training is performed on 48 GH200 GPUs and runs in 18, 48, and 48 hours for each stage respectively. We underline that the computational cost of training our model, while significant, is still relatively low compared to Proteína’s estimated 14 days training on 128 A100 GPUs. This is due to the decreased size of our model ( < 17M vs 200M) and training set ( ∼ 290K vs ∼ 780K). 
Table 8 includes a more thorough list of the hyperparameters used for our experiments. C.3.2 Hyperparameters for model optimized for short generation The Ambient Protein Diffusion model used in this experiment was trained on a dataset filtered with a TM-Align threshold of 0.4 (as opposed to 0.5), resulting in a training set of approximately 90K cluster representative proteins. While it is well known that protein pairs with TM-scores above 0.5 typically share the same fold, and those below 0.5 generally do not, we find that the trade-off between designability and diversity is sensitive to the underlying structural heterogeneity of the dataset. Notably, clustering with a TM-align threshold of 0.4, which corresponds to less than a 1% chance of shared global topology, slightly outperforms the 0.5 threshold, which reflects a ∼ 38% probability of topological similarity [ 53 ]. C.4 Sampling C.4.1 Noise scale In diffusion modeling, one designs a forward Itô corruption process: $$\mathrm{d}X_t = f(X_t, t)\,\mathrm{d}t + g(t)\,\mathrm{d}W_t,$$ defined by the drift function f ( ·, · ) and the noise coefficient g ( · ). This process gets initialized at a distribution p 0 and diffuses over time, defining smoother densities p t . Due to a remarkable result by Anderson [ 3 ], sampling from p 0 is achieved by running the reverse process: $$\mathrm{d}X_t = \left[f(X_t, t) - g^2(t)\,\nabla_{x}\log p_t(X_t)\right]\mathrm{d}t + g(t)\,\mathrm{d}\bar{W}_t, \tag{6}$$ initialized at $p_T$. However, in the context of protein generative models, it has been observed that sampling from a discretized version of the reverse process of Equation 6 does not lead to good performance as measured by the available metrics. Hence, it is common practice in the protein generative modeling literature to sample from a tilted measure using the process: $$\mathrm{d}X_t = \left[f(X_t, t) - g^2(t)\,\nabla_{x}\log p_t(X_t)\right]\mathrm{d}t + \gamma\, g(t)\,\mathrm{d}\bar{W}_t,$$ where the parameter γ controls the stochasticity added to the generation. Typically, this parameter is set to values γ < 1 leading to more designable proteins at the expense of reduced diversity in the generated samples. The goal is often to optimally control the trade-off between designability and diversity, i.e. 
to be able to produce a wide range of structurally and functionally diverse proteins. Unless stated otherwise, for the experiments in this paper, we use γ = 0.6 (as done in Genie2 [ 33 ]). C.4.2 Hyperparameters and sampling methods For sampling, we follow the exact same parameters as Genie2. In particular, we run 1000 sampling steps using a simple first-order discretization of Equation (6). We underline that results could be further enhanced by using more advanced sampling techniques such as autoguidance [ 30 ] (used in Proteina [ 23 ]), higher-order samplers [ 29 ] and test-time scaling [ 37 ] methods. 7 Acknowledgements This research has been supported by NSF Awards CCF-1901292, ONR grants N00014-25-1-2116, N00014-25-1-2296, a Simons Investigator Award, and the Simons Collaboration on the Theory of Algorithmic Fairness. The experiments were run on the Vista GPU Cluster through the Center for Generative AI (CGAI) and the Texas Advanced Computing Center (TACC) at UT Austin. The authors want to thank Bowen Jing and Hannes Stärk for useful discussions. Funder Information Declared National Science Foundation, https://ror.org/021nxhr62 , CCF-1901292 Office of Naval Research, https://ror.org/00rk2pe57 , N00014-25-1-2116 , N00014-25-1-2296 Simons Investigator Award Texas Advanced Computing Center Footnotes gdaras{at}mit.edu jozhang{at}utexas.edu krithravi{at}utexas.edu willdaspit{at}gmail.com costis{at}csail.mit.edu lqiang{at}cs.utexas.edu klivans{at}cs.utexas.edu danny.diaz{at}utexas.edu https://github.com/jozhang97/ambient-proteins ↵ 2 Alternative formulations of diffusion modeling, such as the Variance Preserving case, are equivalent to this case up to a simple reparametrization. For the ease of analysis, we focus our presentation on corruptions of the form X t = X 0 + σ t Z, Z ∼ 𝒩 (0, I d ). References [1]. ↵ Asad Aali , Marius Arvinte , Sidharth Kumar , and Jonathan I Tamir . Solving inverse problems with score-based generative priors learned from noisy data . 
arXiv preprint arxiv: 2305.01166 , 2023 . [2]. ↵ Asad Aali , Giannis Daras , Brett Levac , Sidharth Kumar , Alex Dimakis , and Jon Tamir . Ambient diffusion posterior sampling: Solving inverse problems with diffusion models trained on corrupted data . In The Thirteenth International Conference on Learning Representations , 2025 . URL https://openreview.net/forum?id=qeXcMutEZY . [3]. ↵ Brian D.O. Anderson . Reverse-time diffusion equation models . Stochastic Processes and their Applications , 12 ( 3 ): 313 – 326 , 1982 . OpenUrl CrossRef [4]. ↵ Weimin Bai , Yifei Wang , Wenzheng Chen , and He Sun . An expectation-maximization algorithm for training clean diffusion models from corrupted observations . arXiv preprint arxiv: 2407.01014 , 2024 . [5]. ↵ Inigo Barrio-Hernandez , Jingi Yeo , Jürgen Jänes , Milot Mirdita , Cameron LM Gilchrist , Tanita Wein , Mihaly Varadi , Sameer Velankar , Pedro Beltrao , and Martin Steinegger . Clustering predicted structures at the scale of the known protein universe . Nature , 622 ( 7983 ): 637 – 645 , 2023 . OpenUrl CrossRef PubMed [6]. ↵ Jeremy M Berg , John L Tymoczko , and Lubert Stryer . Biochemistry (loose-leaf). Macmillan , 2007 . [7]. ↵ Avishek Joey Bose , Tara Akhound-Sadegh , Guillaume Huguet , Kilian Fatras , Jarrid Rector-Brooks , Cheng-Hao Liu , Andrei Cristian Nica , Maksym Korablyov , Michael Bronstein , and Alexander Tong . Se (3)-stochastic flow matching for protein backbone generation . arXiv preprint arxiv: 2310.02391 , 2023 . [8]. ↵ Jose M Carceller , Bhumika Jayee , Claire G Page , Daniel G Oblinsky , Gustavo Mondragón-Solórzano, Nithin Chintala , Jingzhe Cao , Zayed Alassad , Zheyu Zhang , Nathaniel White , et al. Engineering a photoenzyme to use red light . Chem , 11 ( 2 ), 2025 . [9]. ↵ Sitan Chen , Sinho Chewi , Jerry Li , Yuanzhi Li , Adil Salim , and Anru R Zhang . Sampling is as easy as learning the score: theory for diffusion models with minimal data assumptions . arXiv preprint arxiv: 2209.11215 , 2022 . 
[10]. ↵ Sitan Chen , Giannis Daras , and Alex Dimakis . Restoration-degradation beyond linear diffusions: A non-asymptotic analysis for DDIM-type samplers . In Andreas Krause , Emma Brunskill , Kyunghyun Cho , Barbara Engelhardt , Sivan Sabato , and Jonathan Scarlett , editors, Proceedings of the 40th International Conference on Machine Learning, volume 202 of Proceedings of Machine Learning Research , pages 4462 – 4484 . PMLR, 23–29 Jul 2023 . URL https://proceedings.mlr.press/v202/chen23e.html . [11]. ↵ Xinshi Chen , Yuxuan Zhang , Chan Lu , Wenzhi Ma , Jiaqi Guan , Chengyue Gong , Jincai Yang , Hanyu Zhang , Ke Zhang , Shenghao Wu , Kuangqi Zhou , Yanping Yang , Zhenyu Liu , Lan Wang , Bo Shi , Shaochen Shi , and Wenzhi Xiao . Protenix - advancing structure prediction through a comprehensive alphafold3 reproduction . bioRxiv , 2025 . doi: 10.1101/2025.01.08.631967 . OpenUrl Abstract / FREE Full Text [12]. ↵ Alexander E Chu , Tianyu Lu , and Po-Ssu Huang . Sparks of function by de novo protein design . Nature biotechnology , 42 ( 2 ): 203 – 215 , 2024 . OpenUrl CrossRef PubMed [13]. ↵ UniProt Consortium . Uniprot: a hub for protein information . Nucleic acids research , 43 ( D1 ): D204 – D212 , 2015 . OpenUrl CrossRef PubMed [14]. ↵ Giannis Daras , Yuval Dagan , Alexandros G Dimakis , and Constantinos Daskalakis . Consistent diffusion models: Mitigating sampling drift by learning to be consistent . arXiv preprint arxiv: 2302.09057 , 2023 . [15]. ↵ Giannis Daras , Kulin Shah , Yuval Dagan , Aravind Gollakota , Alex Dimakis , and Adam Klivans . Ambient diffusion: Learning clean distributions from corrupted data . In Thirty-seventh Conference on Neural Information Processing Systems , 2023 . URL https://openreview.net/forum?id=wBJBLy9kBY . [16]. ↵ Giannis Daras , Alexandros G Dimakis , and Constantinos Daskalakis . 
Consistent diffusion meets tweedie: Training exact ambient diffusion models with noisy data . arXiv preprint arxiv: 2404.10177 , 2024 . [17]. ↵ Giannis Daras , Yeshwanth Cherapanamjeri , and Constantinos Costis Daskalakis . How much is a noisy image worth? data scaling laws for ambient diffusion . In The Thirteenth International Conference on Learning Representations , 2025 . URL https://openreview.net/forum?id=qZwtPEw2qN . [18]. ↵ Giannis Daras , Adrian Rodriguez-Munoz , Adam Klivans , Antonio Torralba , and Constantinos Daskalakis . Ambient diffusion omni: Training good models with bad data , 2025 . [19]. ↵ Justas Dauparas , Ivan Anishchenko , Nathaniel Bennett , Hua Bai , Robert J Ragotte , Lukas F Milles , Basile IM Wicky , Alexis Courbet , Rob J de Haas , Neville Bethel , et al. Robust deep learning–based protein sequence design using proteinmpnn . Science , 378 ( 6615 ): 49 – 56 , 2022 . OpenUrl CrossRef PubMed [20]. ↵ Daniel J Diaz , Anastasiya V Kulikova , Andrew D Ellington , and Claus O Wilke . Using machine learning to predict the effects and consequences of mutations in proteins . Current opinion in structural biology , 78 : 102518 , 2023 . OpenUrl CrossRef PubMed [21]. ↵ Daniel J Diaz , Chengyue Gong , Jeffrey Ouyang-Zhang , James M Loy , Jordan Wells , David Yang , Andrew D Ellington , Alexandros G Dimakis , and Adam R Klivans . Stability oracle: a structure-based graph-transformer framework for identifying stabilizing mutations . Nature Communications , 15 ( 1 ): 6170 , 2024 . OpenUrl CrossRef PubMed [22]. ↵ Cong Fu , Keqiang Yan , Limei Wang , Wing Yee Au , Michael Curtis McThrow , Tao Komikado , Koji Maruhashi , Kanji Uchino , Xiaoning Qian , and Shuiwang Ji . A latent diffusion model for protein structure generation . In Learning on Graphs Conference , pages 29 – 1 . PMLR , 2024 . [23]. 
↵ Tomas Geffner , Kieran Didi , Zuobai Zhang , Danny Reidenbach , Zhonglin Cao , Jason Yim , Mario Geiger , Christian Dallago , Emine Kucukbenli , Arash Vahdat , et al. Proteina: Scaling flow-based protein structure generative models . arXiv preprint arxiv: 2503.00710 , 2025 . [24]. ↵ Chengyue Gong , Adam Klivans , James Madigan Loy , Tianlong Chen , Daniel Jesus Diaz , et al. Evolution-inspired loss functions for protein representation learning . In Forty-first International Conference on Machine Learning , 2024 . [25]. ↵ Priya Goyal , Piotr Dollár , Ross Girshick , Pieter Noordhuis , Lukasz Wesolowski , Aapo Kyrola , Andrew Tulloch , Yangqing Jia , and Kaiming He . Accurate, large minibatch sgd: Training imagenet in 1 hour . arXiv preprint arxiv: 1706.02677 , 2017 . [26]. ↵ Maria Hauser , Martin Steinegger , and Johannes Söding . Mmseqs software suite for fast and deep clustering and searching of large protein sequence sets . Bioinformatics , 32 ( 9 ): 1323 – 1330 , 2016 . OpenUrl CrossRef PubMed [27]. ↵ John B Ingraham , Max Baranov , Zak Costello , Karl W Barber , Wujie Wang , Ahmed Ismail , Vincent Frappier , Dana M Lord , Christopher Ng-Thow-Hing , Erik R Van Vlack , et al. Illuminating protein space with a programmable generative model . Nature , 623 ( 7989 ): 1070 – 1078 , 2023 . OpenUrl CrossRef PubMed [28]. ↵ John Jumper , Richard Evans , Alexander Pritzel , Tim Green , Michael Figurnov , Olaf Ronneberger , Kathryn Tunyasuvunakool , Russ Bates , Augustin Žídek , Anna Potapenko , et al. Highly accurate protein structure prediction with alphafold . Nature , 596 ( 7873 ): 583 – 589 , 2021 . OpenUrl CrossRef PubMed [29]. ↵ Tero Karras , Miika Aittala , Timo Aila , and Samuli Laine . Elucidating the design space of diffusion-based generative models . arXiv preprint arxiv: 2206.00364 , 2022 . [30]. ↵ Tero Karras , Miika Aittala , Tuomas Kynkäänniemi , Jaakko Lehtinen , Timo Aila , and Samuli Laine . Guiding a diffusion model with a bad version of itself . 
Advances in Neural Information Processing Systems , 37 : 52996 – 53021 , 2024 . OpenUrl [31]. ↵ Albert L Lehninger , David L Nelson , and Michael M Cox . Lehninger principles of biochemistry. Macmillan , 2005 . [32]. ↵ Yeqing Lin and Mohammed AlQuraishi . Generating novel, designable, and diverse protein structures by equivariantly diffusing oriented residue clouds . arXiv preprint arxiv: 2301.12485 , 2023 . [33]. ↵ Yeqing Lin , Minji Lee , Zhao Zhang , and Mohammed AlQuraishi . Out of many, one: Designing and scaffolding proteins at the scale of the structural universe with genie 2 . arXiv preprint arxiv: 2405.15489 , 2024 . [34]. ↵ Zeming Lin , Halil Akin , Roshan Rao , Brian Hie , Zhongkai Zhu , Wenting Lu , Nikita Smetanin , Robert Verkuil , Ori Kabeli , Yaniv Shmueli , et al. Evolutionary-scale prediction of atomic-level protein structure with a language model . Science , 379 ( 6637 ): 1123 – 1130 , 2023 . OpenUrl CrossRef PubMed [35]. ↵ Yi Liu , Sophie G Bender , Damien Sorigue , Daniel J Diaz , Andrew D Ellington , Greg Mann , Simon Allmendinger , and Todd K Hyster . Asymmetric synthesis of α-chloroamides via photoenzymatic hydroalkylation of olefins . Journal of the American Chemical Society , 146 ( 11 ): 7191 – 7197 , 2024 . OpenUrl CrossRef PubMed [36]. ↵ Haoye Lu , Qifan Wu , and Yaoliang Yu . SFBD: A method for training diffusion models with noisy data . In Frontiers in Probabilistic Inference: Learning meets Sampling , 2025 . URL https://openreview.net/forum?id=6HN14zuHRb . [37]. ↵ Nanye Ma , Shangyuan Tong , Haolin Jia , Hexiang Hu , Yu-Chuan Su , Mingda Zhang , Xuan Yang , Yandong Li , Tommi Jaakkola , Xuhui Jia , et al. Inference-time scaling for diffusion models beyond scaling denoising steps . arXiv preprint arxiv: 2501.09732 , 2025 . [38]. ↵ Chenlin Meng , Yutong He , Yang Song , Jiaming Song , Jiajun Wu , Jun-Yan Zhu , and Stefano Ermon . Sdedit: Guided image synthesis and editing with stochastic differential equations . 
arXiv preprint arxiv: 2108.01073 , 2021 . [39]. ↵ Jeffrey Ouyang-Zhang , Daniel Diaz , Adam Klivans , and Philipp Krähenbühl . Predicting a protein’s stability under a million mutations . Advances in Neural Information Processing Systems , 36 : 76229 – 76247 , 2023 . OpenUrl [40]. ↵ Jeffrey Ouyang-Zhang , Chengyue Gong , Yue Zhao , Philipp Krähenbühl , Adam R Klivans , and Daniel J Diaz . Distilling structural representations into protein sequence models . bioRxiv , pages 2024 – 11 , 2024 . [41]. ↵ François Rozet , Gérôme Andry , François Lanusse , and Gilles Louppe . Learning diffusion priors from observations by expectation maximization . arXiv preprint arxiv: 2405.13712 , 2024 . [42]. ↵ Kulin Shah , Alkis Kalavasis , Adam R. Klivans , and Giannis Daras . Does generation require memorization? creative diffusion models using ambient diffusion , 2025 . [43]. ↵ Yang Song , Jascha Sohl-Dickstein , Diederik P Kingma , Abhishek Kumar , Stefano Ermon , and Ben Poole . Score-based generative modeling through stochastic differential equations . arXiv preprint arxiv: 2011.13456 , 2020 . [44]. ↵ Michel van Kempen , Stephanie S Kim , Charlotte Tumescheit , Milot Mirdita , Cameron LM Gilchrist , Johannes Söding , and Martin Steinegger . Foldseek: fast and accurate protein structure search . bioRxiv , pages 2022 – 02 , 2022 . [45]. ↵ Mihaly Varadi , Stephen Anyango , Mandar Deshpande , Sreenath Nair , Cindy Natassia , Galabina Yordanova , David Yuan , Oana Stroe , Gemma Wood , Agata Laydon , et al. Alphafold protein structure database: massively expanding the structural coverage of protein-sequence space with high-accuracy models . Nucleic acids research , 50 ( D1 ): D439 – D444 , 2022 . OpenUrl CrossRef PubMed [46]. ↵ Chentong Wang , Yannan Qu , Zhangzhi Peng , Yukai Wang , Hongli Zhu , Dachuan Chen , and Longxing Cao . Proteus: exploring protein structure generation for enhanced designability and efficiency . bioRxiv , pages 2024 – 02 , 2024 . [47]. 
↵ Jue Wang , Sidney Lisanza , David Juergens , Doug Tischer , Joseph L Watson , Karla M Castro , Robert Ragotte , Amijai Saragovi , Lukas F Milles , Minkyung Baek , et al. Scaffolding protein functional sites using deep learning . Science , 377 ( 6604 ): 387 – 394 , 2022 . OpenUrl CrossRef PubMed [48]. ↵ Yifei Wang , Weimin Bai , Weijian Luo , Wenzheng Chen , and He Sun . Integrating amortized inference with diffusion models for learning clean distribution from corrupted images . arXiv preprint arxiv: 2407.11162 , 2024 . [49]. ↵ Joseph L Watson , David Juergens , Nathaniel R Bennett , Brian L Trippe , Jason Yim , Helen E Eisenach , Woody Ahern , Andrew J Borst , Robert J Ragotte , Lukas F Milles , et al. De novo design of protein structure and function with rfdiffusion . Nature , 620 ( 7976 ): 1089 – 1100 , 2023 . OpenUrl CrossRef PubMed [50]. ↵ Carter J Wilson , Wing-Yiu Choy , and Mikko Karttunen . Alphafold2: a role for disordered protein/region prediction? International Journal of Molecular Sciences , 23 ( 9 ): 4591 , 2022 . OpenUrl CrossRef PubMed [51]. ↵ Seung-Gyun Woo , Danny J Diaz , Wantae Kim , Mason Galliver , and Andrew D Ellington . Machine learning-guided engineering of t7 rna polymerase and mrna capping enzymes for enhanced gene expression in eukaryotic systems . Chemical Engineering Journal , page 165191, 2025 . [52]. ↵ Kevin E Wu , Kevin K Yang , Rianne van den Berg , Sarah Alamdari , James Y Zou , Alex X Lu , and Ava P Amini . Protein structure generation via folding diffusion . Nature communications , 15 ( 1 ): 1059 , 2024 . OpenUrl CrossRef PubMed [53]. ↵ Jinrui Xu and Yang Zhang . How significant is a protein structure similarity with tm-score= 0.5? Bioinformatics , 26 ( 7 ): 889 – 895 , 2010 . OpenUrl CrossRef PubMed Web of Science [54]. ↵ Jason Yim , Andrew Campbell , Andrew YK Foong , Michael Gastegger , José Jiménez-Luna , Sarah Lewis , Victor Garcia Satorras , Bastiaan S Veeling , Regina Barzilay , Tommi Jaakkola , et al. 
Fast protein backbone generation with SE(3) flow matching . arXiv preprint arxiv: 2310.05297 , 2023 . View the discussion thread. Back to top Previous Next Posted July 05, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Ambient Proteins: Training Diffusion Models on Low Quality Structures Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Ambient Proteins: Training Diffusion Models on Low Quality Structures Giannis Daras , Jeffrey Ouyang-Zhang , Krithika Ravishankar , William Daspit , Costis Daskalakis , Qiang Liu , Adam Klivans , Daniel J. Diaz bioRxiv 2025.07.03.663105; doi: https://doi.org/10.1101/2025.07.03.663105 Share This Article: Copy Citation Tools Ambient Proteins: Training Diffusion Models on Low Quality Structures Giannis Daras , Jeffrey Ouyang-Zhang , Krithika Ravishankar , William Daspit , Costis Daskalakis , Qiang Liu , Adam Klivans , Daniel J. 
Diaz bioRxiv 2025.07.03.663105; doi: https://doi.org/10.1101/2025.07.03.663105 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioengineering Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41913) Biophysics (21436) Cancer Biology (18578) Cell Biology (25482) Clinical Trials (138) Developmental Biology (13372) Ecology (19889) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15599) Genomics (22483) Immunology (17728) Microbiology (40365) Molecular Biology (17163) Neuroscience (88540) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15136) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9818) Zoology (2269)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.