FlexRibbon: Joint Sequence and Structure Pretraining for Protein Modeling

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 83,423 characters · extracted from preprint-html · click to expand
FlexRibbon: Joint Sequence and Structure Pretraining for Protein Modeling | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results FlexRibbon: Joint Sequence and Structure Pretraining for Protein Modeling Jianwei Zhu , Yu Shi , Ran Bi , Peiran Jin , Chang Liu , Zhe Zhang , Haitao Huang , Zekun Guo , Pipi Hu , Fusong Ju , Lin Huang , Xinwei Tai , Chenao Li , Kaiyuan Gao , Xinran Wei , Huanhuan Xia , Jia Zhang , Yaosen Min , Zun Wang , Yusong Wang , Liang He , Haiguang Liu , Tao Qin doi: https://doi.org/10.1101/2025.10.08.681293 Jianwei Zhu 1 Zhongguancun Academy , Beijing, China Find this author on 
Google Scholar Find this author on PubMed Search for this author on this site Yu Shi 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ran Bi 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Peiran Jin 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: jinpeiran{at}bjzgca.edu.cn liuhaiguang{at}bjzgca.edu.cn qintao{at}bjzgca.edu.cn Chang Liu 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: jinpeiran{at}bjzgca.edu.cn liuhaiguang{at}bjzgca.edu.cn qintao{at}bjzgca.edu.cn Zhe Zhang 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Haitao Huang 1 Zhongguancun Academy , Beijing, China 2 Hunan University , Changsha, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zekun Guo 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Pipi Hu 3 Beijing Institute of Mathematical Sciences and Applications , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Fusong Ju 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lin Huang 4 Ubiquant , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Xinwei Tai 1 Zhongguancun Academy , Beijing, China 5 Huazhong University of Science and Technology , Wuhan, China Find this author on Google Scholar Find this author on PubMed 
Search for this author on this site Chenao Li 6 Institute of Biophysics, Chinese Academy of Sciences , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kaiyuan Gao 5 Huazhong University of Science and Technology , Wuhan, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Xinran Wei 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Huanhuan Xia 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jia Zhang 4 Ubiquant , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yaosen Min 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zun Wang 7 Shanghai Artificial Intelligence Laboratory , Shanghai, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yusong Wang 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Liang He 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Haiguang Liu 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: jinpeiran{at}bjzgca.edu.cn liuhaiguang{at}bjzgca.edu.cn qintao{at}bjzgca.edu.cn Tao Qin 1 Zhongguancun Academy , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: jinpeiran{at}bjzgca.edu.cn liuhaiguang{at}bjzgca.edu.cn qintao{at}bjzgca.edu.cn Abstract Full Text Info/History Metrics Preview PDF Abstract 
Protein foundation models have advanced rapidly, with most approaches falling into two dominant paradigms. Sequence-only language models (e.g., ESM-2) capture sequence semantics at scale but lack structural grounding. MSA-based predictors (e.g., AlphaFold 2/3) achieve accurate folding by exploiting evolutionary couplings, but their reliance on homologous sequences makes them less reliable in highly mutated or alignment-sparse regimes. We present FlexRibbon, a pretrained protein model that jointly learns from amino acid sequences and three-dimensional structures. Our pretraining strategy combines masked language modeling with diffusion-based denoising, enabling bidirectional sequence-structure learning without requiring MSAs. Trained on both experimentally resolved structures and AlphaFold 2 predictions, FlexRibbon captures global folds as well as flexible conformations critical for biological function. Evaluated across diverse tasks spanning interface design, intermolecular interaction prediction, and protein function prediction, FlexRibbon establishes new state-of-the-art performance on 12 different tasks, with particularly strong gains in mutation-rich settings where MSA-based methods often struggle. 1 Introduction Proteins are fundamental to nearly all biological processes, and modeling their sequences, structures, and functions underpins biomedical and biotechnological advances ranging from enzyme engineering to therapeutic antibody design. Recently, protein foundation models (PFMs) have emerged as a unifying framework that leverages large-scale data and deep learning to capture the principles of protein biology, offering new opportunities for both understanding and design. The development of PFMs has followed two main trajectories. One line of work builds on sequence-only language models (PLMs) such as ESM-2 [ 1 ] and ProtT5 [ 2 ], which leverage large corpora of protein sequences to learn universal embeddings. 
These models are broadly applicable and computationally efficient, but the lack of physical relevance, particularly information about three-dimensional geometry, limits their ability to capture the structural basis of protein function. Another line is represented by multiple sequence alignment (MSA) based structure predictors, exemplified by AlphaFold 2/3 [ 3 , 4 ], which exploit evolutionary couplings encoded in MSAs to achieve striking accuracy in structure prediction. Yet, this dependence on homologous sequences introduces sensitivity: when alignments are shallow, sparse, or disrupted by extensive mutation, the predictive signal degrades. As a result, critical scenarios such as antibody CDR loops, intrinsically disordered interfaces, and rapidly evolving pathogens remain inadequately addressed by either paradigm; in these settings, single-sequence models that bypass MSAs and directly model individual sequences provide a more faithful way to capture flexible and highly mutated regions where alignment signals are weak. We introduce FlexRibbon, a 3-billion-parameter pretrained protein model that learns directly from amino acid sequences and large-scale structural corpora, including experimentally resolved structures [ 5 ] and AlphaFold 2 predicted structures [ 6 ]. Unlike sequence-only models or predictors that impose a one-way sequence-to-structure mapping, FlexRibbon integrates sequence and structure signals from the outset: each residue is represented by a single embedding that combines sequence identity with structural context. The training strategy couples masked language modeling on sequences with diffusion-based denoising on structures, enabling the model to capture bidirectional sequence-structure dependencies and support full-atom structure generation. 
To address the variable confidence of predicted structures, we introduce an adaptive loss that selectively weights low-confidence regions, extracting useful signal while avoiding overfitting to unreliable geometry. Previous joint models were designed primarily for structure prediction, but the high memory cost of full-atom representations made the structural component difficult to scale, so most parameters ended up concentrated on the sequence side. FlexRibbon overcomes this limitation with a hierarchical modeling strategy that allocates scalable capacity across both sequence and structure, allowing efficient large-scale structural learning alongside sequence semantics. We systematically evaluate FlexRibbon across three broad task families: (i) flexible interface prediction and design, such as antibody/nanobody CDR modeling and peptide binding; (ii) intermolecular interaction prediction, including protein-ligand docking prediction, ligand-induced conformational change, and protein-ligand affinity prediction; and (iii) protein function prediction, such as gene ontology and enzyme activity. Across these categories, FlexRibbon achieves state-of-the-art performance, with especially strong improvements in mutation-rich settings where MSA-based methods often struggle. Beyond outperforming existing models, our results highlight the consistent advantages of joint sequence-structure pretraining. The key contributions are: Proposing a novel pretraining strategy for FlexRibbon that unifies protein structure prediction and design by combining masked language modeling with diffusion-based denoising, thereby learning a bidirectional sequence-structure mapping rather than a one-way sequence-to-structure mapping. Introducing a hierarchical modeling strategy that balances scalable capacity across sequence and structure representations, overcoming the memory bottlenecks of full-atom models and enabling structural representations to scale effectively. 
Showing that FlexRibbon enables co-design of protein sequence and structure, delivering substantial improvements on flexible and highly mutated regions such as antibody/nanobody CDR loops and peptide-binding interfaces, where MSA-based models struggle. Demonstrating consistent gains across 12 tasks spanning flexible interface modeling, intermolecular interactions, and protein function prediction, showing that sequence-structure pretraining transfers broadly beyond protein folding. 2 Related Works Protein foundation models PFMs learn transferable protein representations for diverse tasks. Early PFMs were sequence-only language models such as ESM-1b/ESM-2 [ 1 , 7 ] and ProtT5 [ 2 ], trained on large sequence corpora but limited by the absence of geometric priors. In contrast, structure-centric PFMs such as AlphaFold 2/3 [ 3 , 4 ] leverage MSAs and templates to achieve high-accuracy folding, yet degrade in highly mutated or low-homology regions. More recently, PFMs have moved toward multimodal, structure-aware pretraining. ESM-3 [ 8 ] unifies sequence, structure, and function in a frontier generative model. DPLM-2 [ 9 ] extends diffusion protein language models (PLMs) to jointly model both sequences and structures via structure tokenization. Antibody design Antibody design methods can be broadly categorized into sequence-based and structure-based approaches. On the sequence side, general-purpose PLMs such as ProtBert [ 10 ] provide strong baselines for paratope prediction, mutation recovery, and antibody library generation. More specialized pretraining frameworks such as SFM-Protein [ 11 ] introduce masked language modeling with pairwise and span-level objectives, showing improved performance on CDR-H3 benchmarks. In addition, graph neural network methods like ABGNN [ 12 ] and RefineGNN [ 13 ] attempt to couple sequence embeddings with local structural context, while knowledge-driven frameworks such as RosettaAntibodyDesign (RAbD) [ 14 ] remain widely used in practice. 
On the structure side, diffusion-based models such as DiffAb [ 15 ] generate CDR loops conditioned on antigen structures, enabling co-design of sequence and structure, while dyMEAN [ 16 ] and MEAN [ 17 ] extend this direction with E(3)-equivariant architectures for full-atom design. More recently, IgGM [ 18 ] expands design capabilities to antibodies and nanobodies by producing antigen-specific complexes. Together, these approaches demonstrate the promise of combining sequence information and structural priors for flexible and functional antibody design. 3 Methods 3.1 Diffusion Pretraining We employ diffusion modeling as a generative pretraining objective for protein structures. A structure $R \in \mathbb{R}^{3N}$ is represented by the 3D coordinates of all $N$ heavy atoms. Following Karras et al. [ 19 ] (also adopted by AlphaFold 3), we connect the data distribution $p(R)$ with Gaussian noise $p_{\mathrm{src}}$ through a variance-exploding process [ 20 ]: $$R_t = R + \sigma_t \epsilon, \qquad \epsilon \sim \mathcal{N}(0, I),$$ where $\sigma_t$ increases with time $t$. Sampling amounts to reversing this process, which requires learning the score function $\nabla \log p_t$. We approximate it with a neural network $s_\theta(R, t)$, equivalently parameterized as: $$s_\theta(R, t) = \frac{D_\theta(R, t) - R}{\sigma_t^2}, \tag{1}$$ where $D_\theta(R, t)$ denotes the model output. The effective learning objective amounts to a denoising loss: $$\mathcal{L}_{\mathrm{MSE}} = \mathbb{E}_{t \sim p(t)}\, \mathbb{E}_{R \sim p(R)}\, \mathbb{E}_{\epsilon \sim \mathcal{N}(0, I)} \left[ w_t \left\| D_\theta(R + \sigma_t \epsilon, t) - R \right\|^2 \right], \tag{2}$$ with a time-step sampler $p(t)$ and weight $w_t$. A key challenge is ensuring invariance to rigid-body transformations. We remove translational freedom by centering structures on the center of mass, and enforce rotational invariance by augmenting data with random SO(3) rotations, instead of relying on heavy SO(3)-equivariant architectures [ 21 ] that may also introduce undesired reflection symmetry. We also found that alignment-based objectives [ 4 , 22 ] did not improve training stability in our settings but added the risk of improper sampling [ 23 ]. Further details are provided in Supplementary 8.6. 
3.2 Architecture Our architecture is organized in three stages: a sequence module, a coarse-grained structure module, and an all-atom structure module ( Fig. 1 and Fig. 6 ). This design balances efficiency and expressivity: coarse-grained modeling captures global protein-ligand organization, while the all-atom stage ensures fine-grained structural accuracy. Table 4 shows the architectural hyperparameters. Download figure Open in new tab Figure 1: FlexRibbon framework. The model architecture consists of three modules: (1) a sequence module that encodes masked protein sequences and ligand topologies, (2) a coarse-grained structure module that encodes residue-level structural information, and (3) an all-atom structure module that refines these representations into chemically consistent coordinates. The framework combines diffusion-based denoising with sequence recovery, enabling joint alignment of sequence, residue, and atomic representations for protein-ligand modeling. Various downstream tasks, including antibody/nanobody design, modeling of intermolecular interactions, and protein function prediction, are supported. Sequence Module The sequence module jointly embeds protein residues and small-molecule atoms into a unified representation space. For protein residues, we apply a standard Transformer encoder (Supplementary 7) with rotary position embeddings [ 24 ], focusing purely on sequence-derived semantics. For small molecules, we incorporate 2D topology with a learnable attention bias derived from atom types and bond types to capture chemical identity and connectivity. The combined representations define a residue-atom graph, which is further refined by a pair-feature update module that models residue-residue and residue-atom interactions. Coarse-grained Structure Module The coarse-grained structure module employs a Diffusion Transformer (DiT) [ 25 ] to denoise coordinates at residue level for proteins and atom level for small molecules. 
Each residue is represented as a coarse structural anchor, while each ligand atom is represented by a position embedding derived from its noised coordinates. The module conditions on embeddings from the sequence module to guide denoising. All-atom Structure Module The all-atom structure module employs a DiT where each atom of proteins is represented explicitly. Noised 3D coordinates of all atoms are encoded into position features that serve as token inputs. The coarse-grained outputs are broadcast to all atoms of each residue, providing residue-level guidance as conditional input. To preserve chemical validity, learnable attention biases are added to atom pairs connected by covalent bonds, combining atom-type and bond-type embeddings as additive bias terms in the attention map. This refinement stage allows the model to reconcile global residue-level context with detailed atomic-level interactions, yielding chemically consistent and high-resolution structures. 3.3 Structure-Informed Masked Language Model (SIMLM) Masked language modeling (MLM) [ 1 , 26 ] has proven effective for predicting masked amino acids in protein sequences. In the spirit of unifying sequence and structure, the masked positions should be inferred from correlations within the surrounding sequence and reflect the structural context that these residues possess. To realize this principle, we extend MLM beyond sequence-only inputs by integrating diffusion-based noise into structural representations, yielding a structure-informed masked language model (SIMLM). This formulation couples sequence recovery with structural denoising, thereby reinforcing the mapping between amino acid identity and three-dimensional conformation. Concretely, we integrate MLM and diffusion through three complementary training modes. Mode 1 (Sequence-to-Structure): standard diffusion-based structure reconstruction, where clean sequences condition the generation of noisy structures. 
Mode 2 (Coupled Perturbation): for 15% of residues selected at random, mask the amino acid type and add diffusion noise to their local structures, while leaving all other tokens and structures unperturbed. Mode 3 (Sequence-Masked Global Perturbation): randomly select 15% of residues for type masking, while applying diffusion noise to the structures of all residues. Through these modes, the model alternates between one-way mapping, localized joint perturbation, and global perturbation, which together encourage robust learning of the bidirectional relationship between protein sequences and structures. These allow the model to capture not only sequence-level regularities but also the structural constraints and variability that underlie protein evolution and function. More details are provided in Supplementary 8.3. 3.4 Training and Sampling Loss function Our training objective integrates four complementary components to balance coordinate accuracy, sequence recovery, and structural plausibility. The overall loss is defined as $$\mathcal{L} = \mathcal{L}_{\mathrm{MSE}} + \lambda_{\mathrm{MLM}}\, \mathcal{L}_{\mathrm{MLM}} + \lambda_{\mathrm{Dist}}\, \mathcal{L}_{\mathrm{Dist}} + \lambda_{\mathrm{lDDT}}\, \mathcal{L}_{\text{smooth-lDDT}},$$ where the $\lambda$ coefficients are scalar weights. Here, $\mathcal{L}_{\mathrm{MSE}}$ denotes the diffusion pretraining loss function given in Eq. (2), $\mathcal{L}_{\mathrm{MLM}}$ improves sequence-level representation through masked residue prediction, $\mathcal{L}_{\mathrm{Dist}}$ regularizes predicted inter-residue distances to maintain realistic tertiary structure geometry, and $\mathcal{L}_{\text{smooth-lDDT}}$ aligns training with widely used structure quality metrics by emphasizing local geometric accuracy (Supplementary 8). Training Pretraining is organized into two progressive stages. Stage A optimizes all components except $\mathcal{L}_{\mathrm{MLM}}$, training on proteins with up to 384 residues. Deferring MLM at this stage avoids the instability that arises when it is introduced too early, while the residue cap improves efficiency and helps the model prioritize learning core structural regularities. Stage B expands the input length to 768 residues and incorporates $\mathcal{L}_{\mathrm{MLM}}$, enabling stable joint optimization of sequence and structure on larger scales. 
In both stages, we include a confidence-weighted diffusion loss that scales residue-level contributions by pLDDT-derived sigmoid weights, reducing noise from low-confidence regions while emphasizing reliable structural signals (Supplementary 8.5). Sampling The sampling procedure is the simulation of the reverse process. By leveraging the relation of the denoising model to the score model in Eq. (1), we have: $$\mathrm{d}R_t = \frac{2 \dot{\sigma}_t}{\sigma_t} \left( R_t - D_\theta(R_t, t) \right) \mathrm{d}t + \sqrt{2 \dot{\sigma}_t \sigma_t}\; \mathrm{d}\bar{W}_t,$$ where time runs backward from the noise level of $p_{\mathrm{src}}$ to zero and $\bar{W}_t$ is a reverse-time Wiener process. We adopt modifications similar to those used in AlphaFold 3 [ 4 , 19 ], but forgo applying the random rotation at each sampling step as orientation alignment is not used in the loss function. Hence, the model learns the correct output orientation relative to the input. The detailed sampling algorithm is presented in Supplementary 8.7. 4 Results We use entries from the AlphaFold Protein Structure Database (AFDB, CC-BY 4.0 License) [ 6 ] and the Protein Data Bank (PDB, CC0 1.0 License) [ 5 ] released on or before 2021-09-30 for pretraining (Supplementary 6). For downstream evaluation, we consider three major task families: (i) Flexible interface prediction and design , spanning five tasks involving antigen–antibody, antigen–nanobody, and protein–peptide complexes; (ii) Intermolecular interaction prediction , including three tasks centered on protein–ligand binding; and (iii) Protein function prediction , comprising four tasks focused on functional annotation. 4.1 Flexible interface prediction and design Biomolecules with flexible binding interfaces are difficult to model and design [ 27 ]. Antibodies, nanobodies, and peptides are key examples, as their functions depend on flexible binding [ 28 ]. This flexibility allows them to target diverse molecules, but also makes structure prediction challenging. To study this problem, we introduce tasks on flexible interface modeling, including antigen-antibody, antigen-nanobody, and protein-peptide complexes. Each task is defined as: given the sequence or structure of the components, predict the structure of the complex. 
To avoid overlap, protein chains in the test sets share at most 40% sequence identity with the training data. We focus on two tasks: interface structure prediction and interface design. Antibody and nanobody interface prediction is a cornerstone of structural immunology, as accurate modeling underpins antibody discovery and therapeutic engineering. Nanobodies can be regarded as single-domain antibodies derived from VHH fragments [ 29 ], allowing both classes to be modeled within a shared framework. For this task, we follow the evaluation procedure of IgGM [ 18 ], measuring performance by the success rate (SR) based on the DockQ [ 30 ] score, with a threshold of DockQ ≥ 0.23. Experiments are conducted on the SAb23H2 test set from IgGM, where we compare FlexRibbon against the structure prediction models AlphaFold 3 and tFold-Ag [ 31 ], as well as the antibody design methods dyMEAN [ 16 ] and IgGM. Following the IgGM protocol, we predict antigen-antibody (-nanobody) complex structures given the antigen sequence and antibody (nanobody) sequence. As shown in Fig. 2 , FlexRibbon achieves success rates of 61.3% for antigen-antibody and 51.1% for antigen-nanobody complexes, yielding absolute improvements of 14.6% and 7.1% over IgGM, respectively. These results demonstrate that FlexRibbon effectively models antigen-antibody and antigen-nanobody interactions ( Tables 6 and 7 ). Download figure Open in new tab Figure 2: Success rates of structure prediction for antibodies, nanobodies, and peptides. tFold-Ag and AlphaFold 3 take MSA information as input. IgGM, dyMEAN, and FlexRibbon leverage antigen structural information. All methods, except AlphaFold 3, additionally incorporate epitope information. Results except FlexRibbon are taken from Wang et al. [ 18 ]. Protein-peptide interface prediction is another important scenario, as peptides often act as recognition motifs or regulators for diverse cellular processes. We use FoldBench [ 32 ] as the benchmark. 
We follow the FoldBench evaluation protocol, also reporting the success rate based on DockQ. FlexRibbon is compared with the structure prediction model AlphaFold 3 and the peptide design method PepGLAD [ 33 ]. As shown in Fig. 2 , FlexRibbon achieves an SR of 91.4%, exceeding AlphaFold 3 and PepGLAD by 7.0% and 10.2%, respectively. These findings suggest that FlexRibbon generalizes well to flexible peptide-protein binding scenarios ( Table 8 ). Antibody and nanobody design is a key challenge in developing novel binders for therapeutic and diagnostic applications. In this task, the input is the antigen sequence and structure, and the objective is to design sequences for all antibody/nanobody CDR regions while jointly generating the full complex structure. We evaluate FlexRibbon on the SAb23H2 test set from IgGM, following the same protocol. Performance is measured by amino acid recovery (AAR), DockQ, and success rate (SR, defined as the proportion of samples which have DockQ ≥ 0.23) relative to wild-type complexes. Baselines include dyMEAN, DiffAb [ 15 ], and IgGM. As shown in Table 1 , FlexRibbon achieves 41.4% AAR and 46.0% SR for antibody design, surpassing IgGM and setting a new state-of-the-art. For nanobody design, FlexRibbon also slightly outperforms IgGM, with higher AAR (21.8%) and SR (43.7%). In addition, FlexRibbon supports user-specified CDR lengths, enabling flexible design. Figure 3 illustrates some representative designs: our framework generates up to six CDRs simultaneously, with AAR reported specifically for the highly flexible CDR-H3 region, which is also the most challenging to design. Detailed results are provided in Supplementary 9. Together, these results show that FlexRibbon can generate realistic CDR sequences while maintaining structural fidelity to wild-type complexes. View this table: View inline View popup Download powerpoint Table 1: Metrics for antibody and nanobody design. 
Both IgGM and FlexRibbon leverage antigen structure and epitope information. All baseline results are taken from [ 18 ]. Download figure Open in new tab Figure 3: Predicted structures for (left) SARS-CoV-2 RBD with the Re30H02 nanobody (or our design) and (right) SARS-CoV-2 Omicron BA.4 RBD along with its antibody (or our design). Native structures are shown in grey, while those for sequences generated by FlexRibbon are in colour. The insets show close-up views of the interaction region, with dashed lines indicating presumed hydrogen bonds. For SARS-CoV-2 RBD (left), the native and designed sequences are structurally similar, with our design manifesting one additional hydrogen bond, whereas for SARS-CoV-2 Omicron BA.4 RBD (right), the predicted structures differ significantly in the interaction region, with our design yielding a greater number of hydrogen bonds. These showcases demonstrate the ability of FlexRibbon to generate sequences that are structurally sound. 4.2 Intermolecular interaction prediction Protein-ligand interactions are fundamental in understanding protein conformational changes, binding affinities, and diverse biological functions. The accurate prediction of these interactions is thus crucial for elucidating molecular mechanisms and accelerating drug discovery. We evaluate FlexRibbon on three downstream tasks: protein-ligand docking prediction, ligand-induced conformational change prediction, and protein-ligand binding affinity prediction, which share a common foundation in modeling protein-ligand complexes while emphasizing different aspects of interaction. Protein-ligand docking prediction is a key task for modeling intermolecular interactions with broad implications for life sciences and drug discovery. 
We follow the AlphaFold 3 protocol on the PoseBusters V1 benchmark [ 37 ] of 428 protein-ligand complexes, comparing against MSA-based models (AlphaFold 3, Chai-1 [ 34 ], Protenix [ 38 ], Boltz-1 [ 23 ]) and single-sequence-based models (DiffDock [ 35 ], DiffDock-L [ 36 ]). Unlike previous single-sequence-based methods, which assume a fixed protein structure and only generate ligand poses, FlexRibbon jointly predicts protein-ligand structures directly from sequence like AlphaFold 3. As shown in Fig. 4 , FlexRibbon achieves 71.82% in the random-1 regime and 78.70% under oracle selection, surpassing all single-sequence baselines by a substantial margin and reaching parity with MSA-based approaches ( Table 11 ). Download figure Open in new tab Figure 4: Evaluation of protein-ligand docking on the PoseBusters V1 benchmark. The methods are separated into (left) MSA-based and (right) single-sequence groups. The success rate is defined as the percentage of predictions with pocket-aligned ligand RMSD < 2 Å. Apart from DiffDock and DiffDock-L, which predict the ligand pose with the protein structure given, all other methods jointly generate the structure of the protein-ligand complex. Results for AlphaFold 3 are taken from Jumper et al. [ 3 ], Chai-1 from Chai Discovery [ 34 ], DiffDock from Corso et al. [ 35 ], and DiffDock-L from Corso et al. [ 36 ]. Results for Protenix, Boltz-1 and FlexRibbon were generated locally with five generated samples. Other methods report top-1 ranked predictions, while our model does not use a confidence head; thus, we report random single-sample performance, with “oracle” denoting the best prediction among five samples selected against the ground truth structure. Ligand-induced conformational change prediction is key to understanding how proteins adapt upon ligand binding. 
We follow the ESMDiff protocol [ 39 ] and use the Apo/Holo dataset [ 40 ] to evaluate FlexRibbon with ensemble TM-score (TM-ens) and residue flexibility correlations (ResFlex r ) at both global and per-target levels. Baselines include AlphaFlow (MSA-based), ESMFlow [ 41 ] (sequence-based), and ESMDiff (structure–language). Unlike these methods, FlexRibbon can model protein-ligand complexes in two modes: (i) protein-only (5 samples) and (ii) mixed (3 apo + 2 holo samples) with ligand guidance. Using a zero-shot pretrained checkpoint, FlexRibbon achieves a TM-ens score of 0.889 (improving upon ESMDiff by 0.038) and stronger flexibility correlations. Adding ligands further improves TM-ens by 0.012, showing the importance of ligand context. Figure 5 and Table 12 summarize the results. Download figure Open in new tab Figure 5: Evaluation of ligand-induced conformational change prediction. The left panels show the per-target mean correlations (top) and mean ensemble TM-scores (bottom). In the right panel, an overlay of the predicted apo (blue, PDB 4AKE) and holo (yellow, PDB 2ECK) structures of adenylate kinase is presented, illustrating the conformational changes (highlighted by the dashed boxed regions) induced by the presence of AMP and ADP molecules (orange). FlexRibbon is able to accurately predict both states, with TM-scores of 0.985 and 0.984, respectively. Protein-ligand binding affinity prediction is a cornerstone of drug discovery, enabling efficient prioritization of candidate compounds for therapeutic targets. Traditional high-throughput screening is costly and limited in scope, motivating computational approaches that estimate binding affinities directly from protein-ligand structures. We evaluate FlexRibbon on the CASF-2016 benchmark [ 42 ], using the standard metrics of root mean square error (RMSE) and Pearson’s correlation coefficient ( R ). Comparisons are made against state-of-the-art baselines SIGN [ 43 ], GLANT [ 44 ], and SPIN [ 45 ]. 
As shown in Table 2 , FlexRibbon achieves the best performance on both criteria. These results highlight the value of pretrained embeddings derived from joint protein-ligand structures as a strong basis for accurate affinity prediction. View this table: View inline View popup Download powerpoint Table 2: Evaluation of protein-ligand binding affinity prediction on CASF-2016. 4.3 Protein function prediction Protein function prediction is central to characterizing novel proteins, understanding disease, and guiding therapeutic discovery. We evaluate this by finetuning FlexRibbon on Gene Ontology (GO) [ 46 ] and Enzyme Commission (EC) [ 47 ] annotation tasks using the DeepFRI [ 48 ] setup. Baselines include sequence-only models (ESM-2-3B, SFM-Protein-3B [ 11 ]) and a sequence–structure hybrid (ESM-GearNet [ 49 ]). Unlike these methods, FlexRibbon jointly predicts structures and embeddings without external structural input. Performance is measured by maximum F 1 score. EC number prediction provides a controlled benchmark for catalytic function annotation, formulated as a multi-label classification task. As shown in Table 3 , FlexRibbon achieves an F 1 score of 0.891, slightly exceeding ESM-GearNet and clearly outperforming sequence-only baselines (ESM-2 and SFM-Protein). These results highlight the importance of structural information in accurately capturing enzymatic function. View this table: View inline View popup Download powerpoint Table 3: F 1 for the Enzyme Commission (EC) and Gene Ontology (GO) tasks. The GO task is comprised of three independent sub-tasks, namely biological process (BP), molecular function (MF), and cellular component (CC). View this table: View inline View popup Download powerpoint Table 4: Key architectural hyperparameters of FlexRibbon. 
GO term prediction GO term prediction evaluates protein function across biological processes (BP), molecular functions (MF), and cellular components (CC), each framed as an independent multilabel classification task, consistent with the EC setup. As shown in Table 3 , FlexRibbon achieves F1 scores of 0.539 (BP), 0.694 (MF), and 0.560 (CC), outperforming ESM-GearNet by 0.051, 0.013, and 0.096, respectively. These gains indicate that FlexRibbon provides more informative representations for finetuning, enabling more accurate functional annotation across ontologies. 5 Conclusion We introduce FlexRibbon, a pretrained protein foundation model that integrates both sequence and structural information into a unified framework. Unlike prior approaches that impose a one-way sequence-to-structure mapping, FlexRibbon implements joint training via masked language modeling and diffusion-based denoising, enabling bidirectional sequence-structure representations that support both prediction and design. Extensive evaluations across a diverse set of downstream tasks spanning antibody/nanobody and peptide interface modeling, ligand-induced conformational change, protein-ligand binding affinity, and functional annotation demonstrate strong and consistent gains, with especially notable improvements on flexible and mutation-rich interfaces where existing methods struggle. These results highlight the effectiveness of joint sequence-structure pretraining and show that its benefits extend broadly beyond protein folding, establishing FlexRibbon as a general-purpose foundation model for protein science. 6 Data The pretraining dataset is constructed from two primary sources: AFDB and PDB . The AlphaFold Protein Structure Database (AFDB), released by Google DeepMind and EMBL-EBI, contains over 200 million predicted structures spanning nearly the entire UniProt 2021 04 release. 
To reduce redundancy, we cluster sequences at 90% identity and retain only entries with a global pLDDT score greater than 50, discarding low-confidence structures. This yields approximately 78 million AFDB samples. For experimentally resolved structures in the Protein Data Bank (PDB), we use PDB 20210930 , adopting the same cutoff date as AlphaFold 3. Following their filtering protocol, we exclude structures with more than 300 chains, resolution worse than 9 Å, or fewer than 4 residues. After filtering, we obtain roughly 181 thousand PDB samples. In total, our pretraining corpus comprises more than 78 million protein structures. The datasets used for each downstream task are detailed in Supplementary 9. 7 Architecture Figure 6 and Table 4 summarize the model. FlexRibbon adopts a three-stage design: Download figure Open in new tab Figure 6: Detailed Model Architecture of FlexRibbon. Download figure Open in new tab Figure 7: Overlay of the predicted structures of the (left) apo and (right) holo states of adenylate kinase, overlaid with the native structures (grey) from the PDB. The holo state includes an AMP and an ADP molecule (orange), whose presence induces the kinase to fold inwards to hold the molecules in place. Sequence module . Jointly embeds protein residues and small-molecule atoms into a unified space using a Transformer encoder. Protein residues receive RoPE for order sensitivity, while atom embeddings remain permutation-invariant. Learnable attention biases from atom categories and bond types provide chemical context, and a pair-feature update module refines residue–atom interactions. Coarse-grained structure module . Applies a Diffusion Transformer (DiT) to denoise residue and atom coordinates at reduced resolution. Conditioning on sequence embeddings integrates sequence semantics with structural noise while keeping attention maps small enough to allocate parameters to structural reasoning. All-atom structure module . 
Upsamples to full atomic resolution with a second DiT. Coarse outputs are broadcast to all atoms in each residue as conditional input. Atom- and bond-type biases are added to attention maps to preserve chemical validity, reconciling residue-level context with detailed atomic geometry. This design captures the global organization of diverse biomolecular complexes efficiently and refines them into chemically consistent high-resolution structures. Key hyperparameters appear in Table 4 . 8 Training 8.1 Confidence-weighted diffusion loss To incorporate structural reliability, we scale the diffusion MSE loss using residue-level pLDDT scores with a sigmoid weighting function. Specifically, residues with very low confidence (pLDDT ≲ 60) are down-weighted toward zero, while those with very high confidence (pLDDT ≳ 80) receive weights close to one. The transition between these regimes is smoothed using a sigmoid: $$w_i = \sigma\!\left(\beta \cdot \frac{\mathrm{pLDDT}_i - 70}{10}\right),$$ where σ (·) is the logistic sigmoid and β controls the steepness of the curve. In practice, we set β = 5 such that weights are near zero at pLDDT = 60 and near one at pLDDT = 80. This formulation avoids hard thresholds while ensuring that uncertain structural regions contribute less to the optimization, and high-confidence regions dominate the learning signal. 8.2 Inter-residue distance loss We apply ℒ Dist on top of the sequence module to regularize predicted inter-residue distances. Specifically, the sequence encoder outputs residue-level embeddings, which are combined through an outer product to form pairwise features. A lightweight MLP head then predicts the C α -C α distance for each residue pair. The loss penalizes deviations between predicted and ground-truth distances, encouraging the encoder to capture geometric constraints directly at the sequence-pair feature level. This design provides the model with explicit supervision on tertiary structure geometry while avoiding direct reliance on coordinate-level regression. 
8.3 Structure-informed Masked Language Modeling (SIMLM) loss We design a structure-informed masked language modeling loss to align sequence and structure representations. Only protein sequences (FASTA) are masked, following the BERT-style policy: 15% of residues are selected for corruption, with 80% replaced by [MASK], 10% replaced by a random amino acid, and 10% left unchanged. For each masked residue i with ground-truth identity y i , we compute hidden features from both the sequence encoder and the coarse-grained structure encoder (e.g., based on C α geometry), denoted $h_i^{\mathrm{seq}}$ and $h_i^{\mathrm{str}}$, respectively. Two independent prediction heads are applied: one maps $h_i^{\mathrm{seq}}$ to a distribution $p_{\mathrm{seq}}(\cdot \mid h_i^{\mathrm{seq}})$ and the other maps $h_i^{\mathrm{str}}$ to $p_{\mathrm{str}}(\cdot \mid h_i^{\mathrm{str}})$. The loss averages the negative log-likelihoods from both heads: $$\mathcal{L}_{\mathrm{SIMLM}} = -\frac{1}{2|\mathcal{M}|} \sum_{i \in \mathcal{M}} \left[ \log p_{\mathrm{seq}}(y_i \mid h_i^{\mathrm{seq}}) + \log p_{\mathrm{str}}(y_i \mid h_i^{\mathrm{str}}) \right],$$ where ℳ is the set of masked positions. This formulation encourages both the sequence and structure pathways to retain predictive signal for residue identity, thereby improving cross-modal consistency. In practice, we interleave the three perturbation modes (Mode 1, Mode 2, and Mode 3) during training with a ratio of 6:2:2, balancing standard sequence-to-structure generation with increasingly challenging coupled and global perturbations. 8.4 Smooth-lDDT loss Following AlphaFold2 [ 3 ], we compute the smooth local distance difference test (lDDT) loss to assess local structural accuracy. The smooth lDDT metric measures the agreement of predicted inter-residue distances with the ground truth in a differentiable manner. Specifically, for each residue i , we evaluate all neighboring residues j within a cutoff radius (typically 15 Å). For each pair ( i, j ), the absolute deviation of the predicted C α - C α distance from the reference is mapped to a soft score using a piecewise linear function with thresholds at 0.5, 1, 2, and 4 Å. The residue-wise scores are averaged across neighbors and then across residues to produce the overall smooth lDDT. In training, we only use C α atoms to compute this loss, consistent with AlphaFold2. 
The resulting value serves both as a differentiable accuracy proxy and as a regularizer encouraging the model to capture local geometric consistency. 8.5 Training Recipe We adopt a two-stage pretraining strategy (see Table 5 ). Stage A focuses on diffusion-based denoising with proteins of up to 384 residues. Training uses the Adam optimizer [ 50 ] in bfloat16 mixed precision with a batch size of 4,096 on 128 A100 GPUs for 200k steps and a learning rate of 1 × 10 −4 . This stage builds the core ability to reconstruct clean structures from noisy inputs while incorporating structural regularization via distance and smooth-lDDT losses. Stage B extends the maximum protein length to 768 residues and adds the masked language modeling (MLM) objective. Training uses a batch size of 2,048 on the same hardware for 100k steps with a learning rate of 6 × 10 −5 . This stage enables the model to handle larger proteins and integrate sequence-level supervision, while continuing to optimize diffusion, distance, and smooth-lDDT objectives. View this table: View inline View popup Download powerpoint Table 5: Two-stage pretraining configuration. 8.6 Diffusion training details We provide here the complete derivations and formulation details omitted from the main text. Forward process Diffusion-based generative modeling aims to approximate a target distribution p ( R ) by connecting it to a tractable source distribution p src . We represent a protein structure R ∈ ℝ 3 N by the 3D coordinates of all heavy atoms. The forward noising process is defined as $$R_t = R_0 + \sigma_t\,\epsilon, \qquad R_0 \sim p(R),\ \ \epsilon \sim \mathcal{N}(0, I),$$ with σ t monotone increasing in t . For sufficiently large σ T , R T approximates a Gaussian distribution $\mathcal{N}(0, \sigma_T^2 I)$. This corresponds to the SDE $$\mathrm{d}R_t = \sqrt{\frac{\mathrm{d}\sigma_t^2}{\mathrm{d}t}}\;\mathrm{d}w_t,$$ where w t is a Wiener process on ℝ 3 N . Reverse process By stochastic process theory [ 51 ], one can recover p ( R ) by simulating the reverse diffusion process. A deterministic equivalent is given by the probability-flow ODE [ 20 ]: $$\frac{\mathrm{d}\bar{R}_{\bar{t}}}{\mathrm{d}\bar{t}} = \dot{\sigma}_{t}\,\sigma_{t}\,\nabla \log p_{t}\big(\bar{R}_{\bar{t}}\big)\Big|_{t = T - \bar{t}},$$ where $\bar{t} := T - t$ denotes reversed time and $\bar{R}_{\bar{t}} := R_{T - \bar{t}}$ denotes the reversed sample trajectory. 
Score estimation The only unknown term is ∇ log p t , which we approximate with s θ ( R , t ). Minimizing the score-matching objective is equivalent to a denoising objective with conditional distribution $p(R_t \mid R_0) = \mathcal{N}(R_t;\, R_0,\, \sigma_t^2 I)$: $$\min_\theta\ \mathbb{E}_{t}\,\mathbb{E}_{R_0 \sim p(R)}\,\mathbb{E}_{R_t \sim p(R_t \mid R_0)} \left\| s_\theta(R_t, t) - \nabla_{R_t} \log p(R_t \mid R_0) \right\|^2,$$ where we use the parameterization $s_\theta(R_t, t) = \big(D_\theta(R_t, t) - R_t\big) / \sigma_t^2$ with $D_\theta$ denoting the network output. Intuitively, the network predicts the clean structure R 0 from its noisy version R t , hence the name “denoising model.” 1 Rigid-body invariances Protein structures are equivalent up to rigid-body transformations. Translations are removed by centering at the center of mass. Rotational invariance is harder: while SO(3)-equivariant networks [ 21 ] can guarantee invariance, they often require heavy operations and introduce reflection symmetry. We instead use a standard architecture and provide rotational invariance information via random SO(3) data augmentation. Some works apply explicit rotational alignment in the loss [ 4 , 22 ], but such alignment lacks a consistent orientation correspondence and complicates sampling [ 23 ]. In our experiments, the plain denoising objective already yielded stable and effective training, so we removed the alignment operation in the loss. Algorithm 1 Sampling procedure. Download figure Open in new tab 8.7 Sampling procedure From the reverse sampling formulation in Eq. (3) , what essentially controls the progression of the diffusion process is the discretization of σ t . A convenient choice is thus to let σ t = t [ 19 ]. The sampling process is then specified by a discretization of the reverse time, $t_0 > t_1 > \cdots > t_N$ (expressed in forward time), where N is the number of discretization steps. Following [ 19 ] (which is also adopted in AlphaFold 3 [ 4 ]), in each step, the update starts not directly from the current time step $t_i$. Instead, the clock is first recurred back to $\hat{t}_i = (1 + \gamma)\, t_i$ (which comes from increasing the forward time by a factor of (1 + γ ), i.e., with a more noisy state), which can be implemented by simulating the forward process from $R_{t_i}$: $\hat{R}_{\hat{t}_i} = R_{t_i} + \sqrt{\hat{t}_i^2 - t_i^2}\,\epsilon$, where ϵ ∼ 𝒩 ( 0, I ). The simulation then proceeds by an update from $\hat{t}_i$ to $t_{i+1}$ following Eq. (3) . 
In contrast to the sampling process by AlphaFold 3, we do not need a random rotation in each step as we do not use rotational alignment in training. The complete procedure is presented in Alg. 1. 9 Experiments Details Downstream tasks are application-specific benchmarks designed to evaluate how effectively a pretrained foundation model can be adapted to solve targeted scientific problems. While pretraining provides the model with general sequence-structure representations, downstream tasks assess its transferability to practical domains such as protein design, intermolecular interaction prediction, and functional annotation. These tasks typically involve fine-tuning the model on smaller, curated datasets and comparing its performance against established baselines. By systematically evaluating across diverse downstream tasks, we demonstrate not only the generality of the pretrained model, but also its ability to capture biologically meaningful features that enable real-world scientific discovery. Details of the fine-tuning procedure, encompassing dataset partitioning, optimization strategies, and evaluation metrics, are provided for each of the eight downstream tasks. 9.1 Antibody and nanobody interface prediction Datasets High-quality datasets are essential for evaluating antibody and nanobody interface modeling. We use SAbDab [ 52 ] as the training and validation dataset and adopt the same training, validation, and test splits as in IgGM [ 18 ] to ensure fair comparison. Moreover, we removed anti-ligand pattern from the dataset. In total, we constructed 10028 samples from 5146 unique PDB ids for training and validation, of which 2023 samples are nanobodies drawn from 1108 unique PDB ids, and we evaluate performance on 60 antigen-antibody docking structures (SAb-23H2-Ab) and 27 antigen-nanobody docking structures (SAb-23H2-Nano). Finetuning and inference Accurate antibody modeling requires leveraging both sequence and structural information. 
We incorporate epitope annotations, which have been shown to be critical for reliable antibody prediction [ 18 ], by labeling residues with at least one heavy atom within 10 Å of an antibody or nanobody chain. During fine-tuning, we adopt four complementary training modes to balance complex structure prediction and antibody design: (i) with 30% probability, the model receives full sequences and predicts the antibody-antigen complex structure; (ii) with 40% probability, the model is provided with the antibody backbone sequence, antigen sequence, and antigen structure, and is tasked with designing antibody CDR sequences and structures; (iii) with 15% probability, the model receives antibody and antigen sequences along with the antibody structure and predicts the antigen structure; and (iv) with 15% probability, the model receives antibody and antigen sequences along with the antigen structure and predicts the antibody structure. During inference, we follow the IgGM protocol for fair comparison, generating 5 samples per test instance. The input consists of the antigen sequence, antigen structure, epitope annotations, and antibody sequence, and the model predicts the final antigen-antibody or antigen-nanobody complex structure. Evaluation metrics Model performance is evaluated using DockQ, interface RMSD (iRMS), ligand RMSD (LRMS), and success rate (SR, defined as the proportion of samples which have DockQ ≥ 0.23), which are widely used in the antibody modeling community [ 18 , 30 , 53 ]. DockQ, iRMS, and LRMS are averaged across all generated samples, while the success rate is computed as the fraction of all generated samples with DockQ ≥ 0.23. As shown in Table 6 and Table 7 , FlexRibbon consistently achieves the best performance across all metrics, substantially outperforming existing baselines. View this table: View inline View popup Download powerpoint Table 6: Metrics for prediction of antigen-antibody docking structure. 
tFold-Ag and AlphaFold 3 use MSA information as input. dyMEAN, IgGM, and FlexRibbon use antigen structure information. All methods except AlphaFold 3 use epitope information. AlphaFold 3, dyMEAN, tFold-Ag, and IgGM results are taken from [ 18 ]. Methods marked with † use MSA as input. View this table: View inline View popup Download powerpoint Table 7: Metrics for structure prediction for nanobody. tFold-Ag and AlphaFold 3 use MSA information as input. IgGM, and FlexRibbon use antigen structure information. All methods except AlphaFold 3 use epitope information. AlphaFold 3, tFold-Ag, and IgGM results are taken from [ 18 ]. Methods marked with † use MSA as input. View this table: View inline View popup Download powerpoint Table 8: Metrics for peptide structure prediction. AlphaFold 3 and Boltz-1 use MSA information as input, while PepGLAD and FlexRibbon leverage protein structure and epitope information. Results for Boltz-1 and AlphaFold 3 are taken from [ 32 ], where DockQ scores are not reported. PepGLAD results are obtained by running the method on this benchmark. Methods marked with † use MSA as input. 9.2 Protein-peptide interface prediction Datasets High-quality, non-redundant datasets are essential for training accurate protein-peptide interface models. We constructed the training dataset from PepGLAD [ 33 ] and applied a temporal filter to exclude entries released after September 30, 2021, ensuring that the pretrained model had no prior exposure to test-like data. After filtering, the dataset contains 5,202 non-redundant protein-peptide complexes, reduced from the original 6,105 entries. For evaluation, we use FoldBench [ 32 ], which comprises 51 protein-peptide pairs, all sharing less than 40% sequence identity with the training and validation sets. Finetuning and inference Effective fine-tuning is crucial for adapting a foundation model to specific downstream tasks. 
We adopt the same hyperparameters as in the antibody and nanobody interface prediction task. During inference, we follow the AlphaFold 3 procedure, generating 5 samples per test instance. The input consists of the protein sequence, protein structure, epitope annotations, and the peptide sequence, and the model predicts the corresponding protein-peptide complex structure. Additional details are provided in Section 9.1. Evaluation metrics Standardized metrics are important for consistent assessment of interface prediction performance. We evaluate model performance using DockQ, interface RMSD (iRMS), ligand RMSD (LRMS), and success rate (SR, defined as the proportion of samples which have DockQ ≥ 0.23). DockQ, iRMS, and LRMS are averaged across all generated samples, while the success rate is computed as the fraction of samples with DockQ ≥ 0.23. When computing DockQ, the heavy and light chains of the antibody are merged into a single chain, with the antigen treated as a separate chain. As shown in Table 8 , FlexRibbon outperforms all other methods across these metrics, demonstrating more accurate modeling of protein-peptide interfaces. 9.3 Antibody and nanobody design Datasets High-quality and consistent datasets are essential for evaluating CDR design performance. We use the same training, validation, and test datasets as described in Section 9.1 to ensure comparability and reproducibility. Training and inference Effective CDR design requires careful integration of sequence and structural information. During training, we follow the procedure in Section 9.1. At inference, the CDR regions are masked to enable the model to design new sequences based on the antigen structure and epitope annotations. Users can also specify different CDR lengths to generate diverse designs. 
Following the IgGM evaluation protocol, we generate 5 samples per test case using the same CDR lengths as the wild-type sequences, jointly designing all six CDR regions for antibodies and all three CDR regions for nanobodies. Evaluation metrics Quantitative metrics are necessary to assess both sequence and structural fidelity in CDR design. We use amino acid recovery (AAR) [ 18 ] to measure sequence similarity to the wild-type, with higher values indicating closer resemblance. For antibodies, AAR is computed separately for each of the six CDR regions (three from the heavy chain and three from the light chain) and averaged across samples. Structural evaluation is conducted using DockQ, interface RMSD (iRMS), ligand RMSD (LRMS), and success rate (SR, defined as the proportion of samples which have DockQ ≥ 0.23), which compare the designed structures against wild-type complexes. When computing DockQ, the heavy and light chains of the antibody are merged into a single chain, with the antigen treated as a separate chain. As shown in Tables 9 and 10 , FlexRibbon achieves performance comparable to IgGM across all metrics, demonstrating that it produces realistic CDR sequences while maintaining structural integrity. View this table: View inline View popup Download powerpoint Table 9: Comparison of antibody modeling methods for antibody design, reporting CDR loop accuracy (AAR, RMSD) and docking performance. Higher values of AAR, DockQ, and SR indicate better performance, while lower values of RMSD, iRMS, and LRMS are preferable. DockQ scores are computed by comparing the designed structures against the corresponding wild-type complexes. View this table: View inline View popup Download powerpoint Table 10: Comparison of nanobody modeling methods for nanobody design, reporting CDR accuracy (AAR), RMSD, and docking performance. Higher values of AAR, DockQ, and SR indicate better performance, while lower RMSD, iRMS, and LRMS are preferable. 
DockQ scores are calculated by comparing the designed structures to the corresponding wild-type complexes. View this table: View inline View popup Download powerpoint Table 11: Success Rate (SR) comparison of different methods. View this table: View inline View popup Download powerpoint Table 12: Evaluation of ligand-induced conformation changes: (1) Pearson correlation ( r ) between sampled and ground-truth diversity as measured by the residue flexibility (ResFlex, absolute deviation after alignment), and (2) the ensemble TM-score (TM-ens). For residue flexibility, both global (gl.) correlations and mean/median per-target (pt.) correlations are reported; for TM-ens, mean/median correlations are reported. Higher values indicate better performance. Methods marked with † use MSA as input. 9.4 Protein-ligand docking prediction Datasets Benchmarking zero-shot performance is a key way to assess a model’s generalization ability without task-specific fine-tuning. For this task, we directly evaluate our model in the zero-shot setting. The test set is PoseBusters V1 [ 37 ], which contains 428 protein-ligand complexes deposited in the PDB between January 1, 2021 and May 30, 2023. For pretraining, we follow the same protocol as Boltz-1 and Protenix, using all PDB structures released before 2021-09-30. Since these three methods share the same data cutoff time, comparisons on the test set remain fair. Training and inference . Evaluating zero-shot inference provides insight into a model’s ability to directly predict complex structures from minimal inputs. Given protein sequences and ligand SMILES strings, we generate full protein-ligand complex structures in a manner similar to AlphaFold 3. For FlexRibbon, Protenix, and Boltz-1, we generate five samples per complex. Evaluation metrics . Standardized metrics are critical to ensure reliable comparison across methods. 
Following the AlphaFold 3 protocol, we report the success rate, defined as the percentage of predictions with a pocket-aligned RMSD < 2 Å. The pocket-alignment procedure is consistent with AlphaFold 3: the pocket is defined as all heavy atoms within 10 Å of any ligand heavy atom, restricted to the primary polymer chain or modified residue of the ligand, and further limited to protein backbone atoms. Baselines include MSA-based methods (AlphaFold 3, Chai-1, Protenix, Boltz-1) and single-sequence methods (DiffDock, DiffDock-L). For Protenix and Boltz-1, results are reported using the top-ranked sample out of five diffusion-generated predictions. For FlexRibbon, which does not include a confidence head, we report both the random-1 score (performance of a randomly chosen sample) and the oracle score (the best of five samples selected against the ground truth). Note that Boltz-1 failed on two targets (7M31 TDR and 7SUC COM) due to residue number restrictions. 9.5 Ligand-induced conformational change prediction Datasets Benchmarking zero-shot performance provides insights into a model’s ability to generalize without task-specific adaptation. For this task, we do not perform finetuning and directly evaluate the zero-shot capability of our model. The test set, originally derived from Saldaño et al. [ 40 ], consists of 90 apo-holo protein pairs. Training and inference Zero-shot inference allows us to assess the model’s structural prediction ability under different input conditions. Given protein sequences and ligand SMILES strings, we generate structural predictions without any fine-tuning. Specifically, we produce five predictions for each case without ligands (apo) and five predictions with ligands obtained from the original holo complexes in the PDB. For fair comparison, we report two evaluation settings: (1) all five apo samples, and (2) a mixed set of three apo samples and two holo samples. 
Evaluation metrics Rigorous evaluation metrics are essential to capture both accuracy and diversity in structural predictions. Following the protocol in AlphaFlow [ 41 ], we use two types of metrics. The first is the Pearson correlation ( r ) between sampled diversity and ground-truth diversity, measured by residue flexibility (ResFlex, absolute deviation after alignment), reported as global (gl.) mean and per-target (pt.) mean/median correlations. The second is the ensemble TM-score (TM-ens), reported as mean and median. Results are presented in Section 4.2. In addition, we conduct an ablation study with different ligand conditions, showing that FlexRibbon can generate structures in multiple conformational states, highlighting its potential to address the protein multi-state problem. 9.6 Protein-ligand binding affinity prediction Datasets Reliable benchmarking requires consistent training and evaluation protocols. Following the strategy of SPIN [ 45 ], we use the same training and test sets. The training data is drawn from PDBbind v2020 [ 54 ], comprising 19,443 protein-ligand complexes. For evaluation, we adopt the CASF-2016 [ 42 ] benchmark, which includes 285 samples. To prevent data leakage, any overlapping entries between CASF-2016 and the training set were removed. Training and inference Model training is formulated as a regression task to predict binding affinity values. The input is the three-dimensional structure of protein-ligand complexes, and the target output is a continuous affinity score. During inference, the model predicts one affinity score per sample, which is directly compared against the ground-truth value. Evaluation metrics Standard regression metrics are used to assess predictive accuracy and correlation with experimental data. Specifically, we report the Root Mean Square Error (RMSE) and Pearson’s correlation coefficient ( R ). Detailed results are presented in Section 4.2. 
9.7 EC number prediction Datasets Accurate enzyme function prediction requires high-quality annotation datasets. We follow the dataset setup used in DeepFRI [ 48 ], where enzyme annotations are derived from UniProtKB with experimentally validated Enzyme Commission (EC) numbers. The training, validation, and test sets contain 15551, 1729, and 1919 protein samples, respectively. Training and inference The task is formulated as a multi-label classification problem, where each protein sequence may be associated with one or more EC numbers. During inference, the model outputs probability scores over all possible EC labels, which are then used to compute the precision-recall curve. For both training and inference, the protein sequences are passed through our base model once for structure prediction, after which both sequence and structural information are used for model finetuning and evaluation. Evaluation metrics Model performance is evaluated using the maximum F-score ( F 1 ), which balances precision and recall. Specifically, F 1 is defined as the maximum F-score across all probability thresholds: $$F_{1} = \max_{t \in [0, 1]} \frac{2 \cdot \mathrm{precision}(t) \cdot \mathrm{recall}(t)}{\mathrm{precision}(t) + \mathrm{recall}(t)},$$ where t is the threshold applied to predicted probabilities. 9.8 GO term prediction Datasets Gene Ontology (GO) provides a comprehensive representation of protein function, covering three sub-ontologies: Molecular Function (MF), Biological Process (BP), and Cellular Component (CC). Following DeepFRI [ 48 ], we construct the training, validation, and test sets as shown in Table 13 . View this table: View inline View popup Download powerpoint Table 13: Size of data samples used for the Gene Ontology (GO) task. Training and inference GO term prediction is also framed as a multi-label classification problem. For each protein, the model outputs probability scores over GO terms independently for MF, BP, and CC. The training and inference procedures are identical to those used for EC number prediction, including structure prediction. 
Evaluation metrics Performance is measured by F 1 , defined as the maximum F1-score across thresholds. The metric captures the balance between precision and recall in predicting GO terms and is widely adopted in functional annotation benchmarks. Footnotes 1 Recovering the exact R 0 is impossible due to information loss; the model in fact predicts 𝔼 [ R 0 | R t ]. References [1]. ↵ Zeming Lin , Halil Akin , Roshan Rao , Brian Hie , Zhongkai Zhu , Wenting Lu , Nikita Smetanin , Robert Verkuil , Ori Kabeli , Yaniv Shmueli , et al. Evolutionary-scale prediction of atomic-level protein structure with a language model . Science , 379 ( 6637 ): 1123 – 1130 , 2023 . OpenUrl CrossRef PubMed [2]. ↵ Suresh Pokharel , Pawel Pratyush , Michael Heinzinger , Robert H Newman , and Dukka B Kc . Improving protein succinylation sites prediction using embeddings from protein language model . Scientific reports , 12 ( 1 ): 16933 , 2022 . OpenUrl PubMed [3]. ↵ John Jumper , Richard Evans , Alexander Pritzel , Tim Green , Michael Figurnov , Olaf Ronneberger , Kathryn Tunyasuvunakool , Russ Bates , Augustin Zídek , Anna Potapenko , et al. Highly accurate protein structure prediction with AlphaFold . nature , 596 ( 7873 ): 583 – 589 , 2021 . OpenUrl CrossRef PubMed [4]. ↵ Josh Abramson , Jonas Adler , Jack Dunger , Richard Evans , Tim Green , Alexander Pritzel , Olaf Ronneberger , Lindsay Willmore , Andrew J Ballard , Joshua Bambrick , et al. Accurate structure prediction of biomolecular interactions with AlphaFold 3 . Nature , 630 ( 8016 ): 493 – 500 , 2024 . OpenUrl CrossRef PubMed [5]. ↵ Helen M Berman , John Westbrook , Zukang Feng , Gary Gilliland , Talapady N Bhat , Helge Weissig , Ilya N Shindyalov , and Philip E Bourne . The protein data bank . Nucleic acids research , 28 ( 1 ): 235 – 242 , 2000 . OpenUrl CrossRef PubMed Web of Science [6]. 
↵ Mihaly Varadi , Damian Bertoni , Paulyna Magana , Urmila Paramval , Ivanna Pidruchna , Malarvizhi Radhakrishnan , Maxim Tsenkov , Sreenath Nair , Milot Mirdita , Jingi Yeo , et al. AlphaFold Protein Structure Database in 2024: providing structure coverage for over 214 million protein sequences . Nucleic acids research , 52 ( D1 ): D368 – D375 , 2024 . OpenUrl CrossRef PubMed [7]. ↵ Alexander Rives , Joshua Meier , Tom Sercu , Siddharth Goyal , Zeming Lin , Jason Liu , Demi Guo , Myle Ott , C Lawrence Zitnick , Jerry Ma , et al. Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences . Proceedings of the National Academy of Sciences , 118 ( 15 ): e2016239118 , 2021 . OpenUrl Abstract / FREE Full Text [8]. ↵ Thomas Hayes , Roshan Rao , Halil Akin , Nicholas J Sofroniew , Deniz Oktay , Zeming Lin , Robert Verkuil , Vincent Q Tran , Jonathan Deaton , Marius Wiggert , et al. Simulating 500 million years of evolution with a language model . Science , 387 ( 6736 ): 850 – 858 , 2025 . OpenUrl CrossRef PubMed [9]. ↵ Xinyou Wang , Zaixiang Zheng , Fei YE , Dongyu Xue , Shujian Huang , and Quanquan Gu . DPLM-2: A multimodal diffusion protein language model . In The Thirteenth International Conference on Learning Representations , 2025 . [10]. ↵ Ahmed Elnaggar , Michael Heinzinger , Christian Dallago , Ghalia Rehawi , Yu Wang , Llion Jones , Tom Gibbs , Tamas Feher , Christoph Angerer , Martin Steinegger , et al. Prottrans: Toward understanding the language of life through self-supervised learning . IEEE transactions on pattern analysis and machine intelligence , 44 ( 10 ): 7112 – 7127 , 2021 . OpenUrl [11]. ↵ Liang He , Peiran Jin , Yaosen Min , Shufang Xie , Lijun Wu , Tao Qin , Xiaozhuan Liang , Kaiyuan Gao , Yuliang Jiang , and Tie-Yan Liu . SFM-Protein: Integrative co-evolutionary pre-training for advanced protein sequence representation . arXiv preprint arXiv: 2410.24022 , 2024 . [12]. 
↵ Kaiyuan Gao , Lijun Wu , Jinhua Zhu , Tianbo Peng , Yingce Xia , Liang He , Shufang Xie , Tao Qin , Haiguang Liu , Kun He , and Tie-Yan Liu . Pre-training antibody language models for antigen-specific computational antibody design . In Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD) , pages 506 – 517 . ACM , 2023 . [13]. ↵ Wengong Jin , Jeremy Wohlwend , Regina Barzilay , and Tommi S. Jaakkola . Iterative refinement graph neural network for antibody sequence-structure co-design . In International Conference on Learning Representations , 2022 . [14]. ↵ Jared Adolf-Bryfogle , Oleg Kalyuzhniy , Maureen Kubitz , Brian D. Weitzner , Xiaozhen Hu , Yasunori Adachi , William R. Schief , and Roland L. Dunbrack Jr . RosettaAntibodyDesign (RAbD): A general framework for computational antibody design . PLOS Computational Biology , 14 ( 4 ): e1006112 , 2018 . OpenUrl [15]. ↵ Shitong Luo , Yufeng Su , Xingang Peng , Sheng Wang , Jian Peng , and Jianzhu Ma . Antigen-specific antibody design and optimization with diffusion-based generative models for protein structures . In S. Koyejo , S. Mohamed , A. Agarwal , D. Belgrave , K. Cho , and A. Oh , editors, Advances in Neural Information Processing Systems , volume 35 , pages 9754 – 9767 . Curran Associates, Inc ., 2022 . OpenUrl [16]. ↵ Xiangzhe Kong , Wenbing Huang , and Yang Liu . End-to-end full-atom antibody design . In Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett, editors, Proceedings of the 40th International Conference on Machine Learning, volume 202 of Proceedings of Machine Learning Research , pages 17409 – 17429 . PMLR , 23–29 Jul 2023 . [17]. ↵ Xiangzhe Kong , Wenbing Huang , and Yang Liu . Conditional antibody design as 3d equivariant graph translation . In The Eleventh International Conference on Learning Representations , 2023 . [18]. 
↵ Rubo Wang , Fandi Wu , Xingyu Gao , Jiaxiang Wu , Peilin Zhao , and Jianhua Yao . IgGM: A generative model for functional antibody and nanobody design . In The Thirteenth International Conference on Learning Representations , 2025 . [19]. ↵ Tero Karras , Miika Aittala , Timo Aila , and Samuli Laine . Elucidating the design space of diffusion-based generative models . Advances in neural information processing systems , 35 : 26565 – 26577 , 2022 . OpenUrl [20]. ↵ Yang Song , Jascha Sohl-Dickstein , Diederik P Kingma , Abhishek Kumar , Stefano Ermon , and Ben Poole . Score-based generative modeling through stochastic differential equations . In International Conference on Learning Representations , 2021 . [21]. ↵ Jonas Köhler , Leon Klein , and Frank Noé . Equivariant flows: exact likelihood generative learning for symmetric densities . In International conference on machine learning , pages 5361 – 5370 . PMLR , 2020 . [22]. ↵ Minkai Xu , Lantao Yu , Yang Song , Chence Shi , Stefano Ermon , and Jian Tang . GeoDiff: A geometric diffusion model for molecular conformation generation . In International Conference on Learning Representations , 2022 . [23]. ↵ Jeremy Wohlwend , Gabriele Corso , Saro Passaro , Noah Getz , Mateo Reveiz , Ken Leidal , Wojtek Swiderski , Liam Atkinson , Tally Portnoi , Itamar Chinn , et al. Boltz-1 democratizing biomolecular interaction modeling . BioRxiv , pages 2024 – 11 , 2025 . [24]. ↵ Jianlin Su , Yu Lu , Shengfeng Pan , Ahmed Murtadha , Bo Wen , and Yunfeng Liu . RoFormer: Enhanced transformer with rotary position embedding , 2023 . [25]. ↵ William Peebles and Saining Xie . Scalable diffusion models with transformers . In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 4195 – 4205 , 2023 . [26]. ↵ Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . BERT: Pre-training of deep bidirectional transformers for language understanding . In Proceedings of NAACL-HLT , volume 1 , page 2 . 
Minneapolis, Minnesota , 2019 . OpenUrl [27]. ↵ Kejia Wu , Hanlun Jiang , Derrick R Hicks , Caixuan Liu , Edin Muratspahić , Theresa A Ramelot , Yuexuan Liu , Kerrie McNally , Sebastian Kenny , Andrei Mihut , et al. Design of intrinsically disordered region binding proteins . Science , 389 ( 6757 ): eadr8063 , 2025 . OpenUrl PubMed [28]. ↵ Kejia Wu , Hua Bai , Ya-Ting Chang , Rachel Redler , Kerrie E McNally , William Sheffler , TJ Brunette , Derrick R Hicks , Tomos E Morgan , Tim J Stevens , et al. De novo design of modular peptide-binding proteins by superhelical matching . Nature , 616 ( 7957 ): 581 – 589 , 2023 . OpenUrl CrossRef PubMed [29]. ↵ Michiel M Harmsen and Hans J De Haard . Properties, production, and applications of camelid single-domain antibody fragments . Applied microbiology and biotechnology , 77 ( 1 ): 13 – 22 , 2007 . OpenUrl CrossRef PubMed Web of Science [30]. ↵ Claudio Mirabello and Björn Wallner . DockQ v2: improved automatic quality measure for protein multimers, nucleic acids, and small molecules . Bioinformatics , 40 ( 10 ): btae586 , 2024 . OpenUrl CrossRef PubMed [31]. ↵ Fandi Wu , Yu Zhao , Jiaxiang Wu , Biaobin Jiang , Bing He , Longkai Huang , Chenchen Qin , Fan Yang , Ningqiao Huang , Yang Xiao , et al. Fast and accurate modeling and design of antibody-antigen complex using tFold . bioRxiv , pages 2024 – 02 , 2024 . [32]. ↵ Sheng Xu , Qiantai Feng , Lifeng Qiao , Hao Wu , Tao Shen , Yu Cheng , Shuangjia Zheng , and Siqi Sun . FoldBench: An all-atom benchmark for biomolecular structure prediction . bioRxiv , pages 2025 – 05 , 2025 . [33]. ↵ Xiangzhe Kong , Yinjun Jia , Wenbing Huang , and Yang Liu . Full-atom peptide design with geometric latent diffusion . Advances in Neural Information Processing Systems , 37 : 74808 – 74839 , 2025 . OpenUrl [34]. ↵ Chai Discovery . Chai-1: Decoding the molecular interactions of life . bioRxiv , 2024 . [35]. ↵ Gabriele Corso , Hannes Stärk , Bowen Jing , Regina Barzilay , and Tommi Jaakkola . 
DiffDock: Diffusion steps, twists, and turns for molecular docking . In International Conference on Learning Representations (ICLR) , 2023 . [36]. ↵ Gabriele Corso , Arthur Deng , Nicholas Polizzi , Regina Barzilay , and Tommi S. Jaakkola . Deep confident steps to new pockets: Strategies for docking generalization . In The Twelfth International Conference on Learning Representations , 2024 . [37]. ↵ Martin Buttenschoen , Garrett M Morris , and Charlotte M Deane . PoseBusters: AI-based docking methods fail to generate physically valid poses or generalise to novel sequences . Chemical Science , 15 ( 9 ): 3130 – 3139 , 2024 . OpenUrl PubMed [38]. ↵ ByteDance AML AI4Science Team , Xinshi Chen , Yuxuan Zhang , Chan Lu , Wenzhi Ma , Jiaqi Guan , Chengyue Gong , Jincai Yang , Hanyu Zhang , Ke Zhang , et al. Protenix-advancing structure prediction through a comprehensive AlphaFold3 reproduction . BioRxiv , pages 2025 – 01 , 2025 . [39]. ↵ Jiarui Lu , Xiaoyin Chen , Stephen Zhewen Lu , Chence Shi , Hongyu Guo , Yoshua Bengio , and Jian Tang . Structure language models for protein conformation generation . In The Thirteenth International Conference on Learning Representations , 2025 . [40]. ↵ Tadeo Saldaño , Nahuel Escobedo , Julia Marchetti , Diego Javier Zea , Juan Mac Donagh , Ana Julia Velez Rueda , Eduardo Gonik , Agustina García Melani , Julieta Novomisky Nechcoff , Martín N Salas , et al. Impact of protein conformational diversity on alphafold predictions . Bioinformatics , 38 ( 10 ): 2742 – 2748 , 2022 . OpenUrl CrossRef PubMed [41]. ↵ Bowen Jing , Bonnie Berger , and Tommi Jaakkola . AlphaFold meets flow matching for generating protein ensembles . 
In Ruslan Salakhutdinov , Zico Kolter , Katherine Heller , Adrian Weller , Nuria Oliver , Jonathan Scarlett , and Felix Berkenkamp , editors, Proceedings of the 41st International Conference on Machine Learning, volume 235 of Proceedings of Machine Learning Research , pages 22277 – 22303 . PMLR , 21–27 Jul 2024 . [42]. ↵ Minyi Su , Qifan Yang , Yu Du , Guoqin Feng , Zhihai Liu , Yan Li , and Renxiao Wang . Comparative assessment of scoring functions: the CASF-2016 update . Journal of chemical information and modeling , 59 ( 2 ): 895 – 913 , 2018 . OpenUrl PubMed [43]. ↵ Shuangli Li , Jingbo Zhou , Tong Xu , Liang Huang , Fan Wang , Haoyi Xiong , Weili Huang , Dejing Dou , and Hui Xiong . Structure-aware interactive graph neural networks for the prediction of protein-ligand binding affinity . In Proceedings of the 27th ACM SIGKDD conference on knowledge discovery & data mining , pages 975 – 985 , 2021 . [44]. ↵ Shuangli Li , Jingbo Zhou , Tong Xu , Liang Huang , Fan Wang , Haoyi Xiong , Weili Huang , Dejing Dou , and Hui Xiong . GIaNt: Protein-ligand binding affinity prediction via geometry-aware interactive graph neural network . IEEE Transactions on Knowledge and Data Engineering , 36 ( 5 ): 1991 – 2008 , 2023 . OpenUrl [45]. ↵ Seungyeon Choi , Sangmin Seo , and Sanghyun Park . SPIN: SE(3)-invariant physics informed network for binding affinity prediction . In 27th European Conference on Artificial Intelligence (ECAI 2024) , 2024 . [46]. ↵ Michael Ashburner , Catherine A Ball , Judith A Blake , David Botstein , Heather Butler , J Michael Cherry , Allan P Davis , Kara Dolinski , Selina S Dwight , Janan T Eppig , et al. Gene Ontology: tool for the unification of biology . Nature genetics , 25 ( 1 ): 25 – 29 , 2000 . OpenUrl CrossRef PubMed Web of Science [47]. ↵ Amos Bairoch . The ENZYME database in 2000 . Nucleic acids research , 28 ( 1 ): 304 – 305 , 2000 . OpenUrl CrossRef PubMed Web of Science [48]. ↵ Vladimir Gligorijević , P. 
Douglas Renfrew , Tomasz Kosciolek , Julia Koehler Leman , Daniel Berenberg , Tommi Vatanen , Chris Chandler , Bryn C. Taylor , Ian M. Fisk , Hera Vlamakis , Ramnik J. Xavier , Rob Knight , Kyunghyun Cho , and Richard Bonneau . Structure-based protein function prediction using graph convolutional networks . Nature Communications , 12 ( 1 ): 3168 , May 2021 . OpenUrl PubMed [49]. ↵ Zuobai Zhang , Chuanrui Wang , Minghao Xu , Vijil Chenthamarakshan , Aurélie Lozano , Payel Das , and Jian Tang . A systematic study of joint representation learning on protein sequences and structures . arXiv preprint arXiv: 2303.06275 , 2023 . [50]. ↵ Diederik P. Kingma and Jimmy Ba . Adam: A method for stochastic optimization . In 3rd International Conference on Learning Representations (ICLR2015) , 2015 . [51]. ↵ Brian D.O. Anderson . Reverse-time diffusion equation models . Stochastic Processes and their Applications , 12 ( 3 ): 313 – 326 , 1982 . OpenUrl CrossRef [52]. ↵ James Dunbar , Konrad Krawczyk , Jinwoo Leem , Terry Baker , Angelika Fuchs , Guy Georges , Jiye Shi , and Charlotte M. Deane . SAbDab: the structural antibody database . Nucleic Acids Research , 42 ( D1 ): D1140 – D1146 , 11 2013 . OpenUrl PubMed Web of Science [53]. ↵ Kevin E Wu , Kevin K Yang , Rianne van den Berg , Sarah Alamdari , James Y Zou , Alex X Lu , and Ava P Amini . Protein structure generation via folding diffusion . Nature communications , 15 ( 1 ): 1059 , 2024 . OpenUrl PubMed [54]. ↵ Zhihai Liu , Minyi Su , Li Han , Jie Liu , Qifan Yang , Yan Li , and Renxiao Wang . Forging the basis for developing protein–ligand interaction scoring functions . Accounts of chemical research , 50 ( 2 ): 302 – 309 , 2017 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 10, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. 
Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following FlexRibbon: Joint Sequence and Structure Pretraining for Protein Modeling Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share FlexRibbon: Joint Sequence and Structure Pretraining for Protein Modeling Jianwei Zhu , Yu Shi , Ran Bi , Peiran Jin , Chang Liu , Zhe Zhang , Haitao Huang , Zekun Guo , Pipi Hu , Fusong Ju , Lin Huang , Xinwei Tai , Chenao Li , Kaiyuan Gao , Xinran Wei , Huanhuan Xia , Jia Zhang , Yaosen Min , Zun Wang , Yusong Wang , Liang He , Haiguang Liu , Tao Qin bioRxiv 2025.10.08.681293; doi: https://doi.org/10.1101/2025.10.08.681293 Share This Article: Copy Citation Tools FlexRibbon: Joint Sequence and Structure Pretraining for Protein Modeling Jianwei Zhu , Yu Shi , Ran Bi , Peiran Jin , Chang Liu , Zhe Zhang , Haitao Huang , Zekun Guo , Pipi Hu , Fusong Ju , Lin Huang , Xinwei Tai , Chenao Li , Kaiyuan Gao , Xinran Wei , Huanhuan Xia , Jia Zhang , Yaosen Min , Zun Wang , Yusong Wang , Liang He , Haiguang Liu , Tao Qin bioRxiv 2025.10.08.681293; doi: https://doi.org/10.1101/2025.10.08.681293 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Molecular Biology Subject Areas All Articles Animal Behavior and Cognition (7622) Biochemistry (17645) Bioengineering (13867) Bioinformatics (41873) Biophysics (21420) Cancer Biology (18550) Cell Biology (25447) Clinical Trials (138) Developmental Biology (13361) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24289) Genetics (15587) Genomics (22473) Immunology 
(17707) Microbiology (40322) Molecular Biology (17144) Neuroscience (88457) Paleontology (666) Pathology (2826) Pharmacology and Toxicology (4815) Physiology (7634) Plant Biology (15111) Scientific Communication and Education (2042) Synthetic Biology (4285) Systems Biology (9813) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00