Evaluation of De Novo Deep Learning Models on the Protein-Sugar Interactome

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 52,070 characters · extracted from preprint-html · click to expand
Evaluation of De Novo Deep Learning Models on the Protein-Sugar Interactome | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Evaluation of De Novo Deep Learning Models on the Protein-Sugar Interactome View ORCID Profile Samuel W. Canner , Lei Lu , View ORCID Profile Sho S. Takeshita , View ORCID Profile Jeffrey J. Gray doi: https://doi.org/10.1101/2025.09.02.673778 Samuel W. Canner 1 Program in Molecular Biophysics, The Johns Hopkins University , Baltimore, MD, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Samuel W. 
Canner Lei Lu 2 Department of Pharmaceutical Chemistry, University of California San Francisco , San Francisco, California 94143, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sho S. Takeshita 1 Program in Molecular Biophysics, The Johns Hopkins University , Baltimore, MD, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sho S. Takeshita Jeffrey J. Gray 1 Program in Molecular Biophysics, The Johns Hopkins University , Baltimore, MD, United States 3 Department of Chemical and Biomolecular Engineering, Johns Hopkins University , Baltimore, MD, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jeffrey J. Gray For correspondence: jgray{at}jhu.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Advances in deep learning have produced a range of models for predicting the protein-sugar interactome; however, structural docking of noncovalent protein-carbohydrate complexes remains largely unexplored. Although all-atom structure prediction models like AlphaFold3 (AF3), Boltz-1, Chai-1, DiffDock, and RosettaFold-All Atom (RFAA) were validated on protein-small molecule complexes, no benchmark or evaluation exists specifically for noncovalent protein-carbohydrate docking. To address this, we developed a high-quality dataset of experimental structures – Benchmark of CArbohydrate Protein Interactions (BCAPIN). Using BCAPIN and a novel evaluation metric, DockQC, we assessed the performance of all-atom structure prediction models on non-covalent protein-carbohydrate docking. We found all methods achieved comparable results, with an 85% success rate for structures of at least acceptable quality. However, we found that the predictive power of all models declined with increasing carbohydrate polymer length. 
With the capabilities and limitations assessed, we evaluated AF3’s ability to predict binding for a set of putative human carbohydrate binding and carbohydrate non-binding proteins. While current models show promise, further development is needed to enable high-confidence, high-throughput prediction of the complete protein-sugar interactome. Introduction Many new computational prediction tools have recently been developed to decode the protein-sugar interactome. Bonnardel et al. created LectomeXplore, which annotates all known proteomes with a hidden Markov model (HMM) for lectins (glycan-binding proteins). 1 If the protein is identified as a lectin, one could use Lundstrøm et al.’s model LectinOracle to predict which carbohydrate the lectin binds. 2 However, not all carbohydrate binding proteins are lectins, for example native sugar sensors and antibodies. 3 Leveraging this gap, some of us (Canner, Gray) developed PiCAP to predict whether a protein binds to carbohydrates, irrespective of protein family, and released predicted annotations on six different species, predicting a putative list of all the proteins present in the protein-sugar interactome. 4 To further elucidate these protein-carbohydrate interactions, Canner, Shanker et al. and Bibekar et al. created CAPSIF 5 and PeSTo-Carbs, 6 respectively, to predict which residues a protein uses to bind to carbohydrates. The combination of these breakthrough models can be used to predict whether any given protein binds to a carbohydrate (as a lectin or non-lectin), what carbohydrate it binds to (if the protein is a lectin), and what residues are implicated in the protein for carbohydrate binding. Now, with all-atom biomolecular prediction software like AlphaFold3 (AF3), 7 protein-carbohydrate complex structures can be readily predicted. 
AF3 and other deep learning models thereby make possible the development of a complete putative structural dataset of the entire protein-sugar interactome: all protein-carbohydrate interactions across a species. First however, we must evaluate the performance of AF3 and other all-atom biomolecular structure prediction models on protein-carbohydrate complexes. The development of AlphaFold3 7 built upon a string of advances in protein structure prediction, such as the Nobel Prize research of David Baker, Demis Hassabis, and John Jumper: Rosetta and AlphaFold2. 8 In the past few years, the leap in the most recent generation of de novo prediction methods was the ability to model any molecule with all-atom structure prediction. The Jaakkola lab developed DiffDock to predict small molecule docking on a provided protein structure. 9 , 10 The Baker lab developed RosettaFold-AllAtom (RFAA), becoming the first end-to-end all-atom biomolecule structure prediction. 11 Google DeepMind released their first end-to-end all atom prediction model AlphaFold3 (AF3). Building on previous work from DiffDock, the Jaakkola lab developed Boltz-1. 12 With partnerships from OpenAI and other industry representatives, the Chai Discovery team released their (proprietary) model Chai-1. 7 , 13 Given this growing suite of models (albeit non-exhaustive), identification of their performance on specific tasks is critical, with one of the most used metrics being the success rate when benchmarked against a dataset called Posebusters. 14 Posebusters contains non-covalent protein-small molecule complexes. Posebusters provides well-defined specificity of the small molecule and binding protein pocket, with a model’s success measured by its ability to predict small molecule complexes under 2 Å RMSD from the solved structure. In total, DiffDock and RFAA both achieve 42% success on PoseBusters, 10 , 11 while AF3 and Chai-1 achieve 76% and 77% success on PoseBusters, 7 , 13 respectively. 
No success rate was reported on PoseBusters for Boltz-1. 12 While PoseBusters emphasizes strong specific protein-ligand binding, protein-carbohydrate interactions present unique challenges. Unlike protein-small molecule interactions, protein-carbohydrate interactions are commonly less specific, with proteins containing multiple binding sites for long linear heterogeneous polymers containing various epitopes, and therefore sugars require extra attention that is not provided in the dataset. 15 – 17 Moreover, proteins stabilize carbohydrates through a combination of direct contacts (hydrogen bonding, electrostatics), indirect (water-mediated) interactions, and CH-π bonds via aromatic residues. 17 , 18 Finally, the binding affinity of protein-carbohydrate complexes is commonly weak (μM – mM), with binding instead driven by high avidity (nM) from multiple binding sites on the protein or multiple repeats of the glycan epitope. 3 Due to the distinct binding mechanisms involved in noncovalent protein-carbohydrate interactions, solved experimental structures of bound non-covalent carbohydrates to proteins are limited. From all solved structures in the Protein Data Bank, DIONYSUS identifies protein structures with non-covalent specific interactions with carbohydrates to be 2.5% (5,461). 19 , 20 With the advent of high-throughput diazirine photoaffinity linker experimental data of protein-carbohydrate interactions, 21 , 22 we are attaining more knowledge of protein-carbohydrate interactions on a protein level. We therefore propose that all-atom deep learning (DL) structure prediction pipelines may enhance our understanding of the protein-sugar interactome (PSI). Here, we benchmark DL structural models: AF3, Boltz-1, Chai-1, RFAA, and DiffDock on the task of predicting docked de novo protein-carbohydrate structures. To benchmark the models, we constructed a novel dataset of proteins unseen during each model’s training. 
We identify the strengths and shortcomings of these models and evaluate test cases where all models perform poorly. With strengths and limitations identified, we then use AF3 as a proof-of-concept tool for predicting the structural de novo human protein-sugar interactome. This work sets the stage for future integration of deep learning tools in structural glycobiology to fully characterize the protein-sugar interactome across all species. Results BCAPIN and DockQC: Novel datasets and analysis To assess the capabilities of AlphaFold3 (AF3), Boltz-1, Chai-1, DiffDock, and RosettaFold All-Atom (RFAA) at predicting protein-carbohydrate complexes, it is essential to have an independent test set of high-quality experimentally resolved protein-carbohydrate structures and a suitable evaluation metric. For the dataset, we leveraged DIONYSUS, 20 which aggregates all experimentally determined protein-glycan structures from the PDB. We first excluded all protein-nucleic acid complexes and clustered the remaining protein sequences at 50% identity. We removed clusters with structures solved before the latest model’s training cutoff dataset (September 2021). Importantly, due to experimental limitations, not all experimental structures are of equal quality. To ensure structural reliability, we applied a filter using the real space correlation coefficient (RSCC) 23 , which measures the agreement between the calculated and experimental density. Structures with an RSCC greater than or equal to 0.9 were retained (Figure S2). The resulting Benchmark of CArbohydrate Protein INteractions (BCAPIN) test set consists of 20 structures: 9 structures that bind sugar monomers, 3 structures that bind dimers, 5 structures that bind polymers, and 3 structures that bind at least a nucleotide (NTP) and a saccharide ( Table 1 ). View this table: View inline View popup Download powerpoint Table 1: Benchmark of CArbohydrate Protein INteractions (BCAPIN) test set. 
The table lists the PDB 4-letter ID, protein name, UniProt ID, glycan input string for GlyLES, and any secondary ligands if present. To evaluate the performance of predicted protein-carbohydrate complexes, we developed a single continuous scoring metric named DockQC. DockQC is inspired by the DockQ metric from the CASP-CAPRI challenge, which averages the fraction of native contacts ( F nat ), interface root mean squared deviation (IRMS), and ligand RMS (LRMS) to designate a predicted structure’s quality. While DockQ is widely used for protein-protein docking, the native code is unusable on our test cases, and, when reimplemented, it tends to overestimate the quality for protein-carbohydrate complexes, often assigning medium-to-high scores even when the predicted ligand position is incorrect (Figure S1, Table S1). DockQC addresses these issues by averaging three terms: F nat , ring-ring RMSD (rRMS), and LRMS. F nat measures the fraction of native residue-residue contacts, rRMS is a novel metric that measures the RMSD between the center of mass (COM) of each carbohydrate ring in the aligned predicted and experimental structures, and LRMS measures the RMSD of all aligned ligand heavy atoms. With the BCAPIN test set and evaluation metrics established, we investigated the performance of five methods, AlphaFold3 (AF3), Boltz-1, Chai-1, RosettaFold All-Atom (RFAA), and DiffDock, at predicting protein-carbohydrate structure. We first evaluated the behavior of DockQC on the set. Thresholds were chosen after inspecting many predictions and tuning metric weights; some examples are described next. On hedgehog interacting protein (7PGK), which binds a disaccharide heparin analog, Chai-1 failed to predict the protein structure accurately, leading to an incorrect carbohydrate placement with a low DockQC score of 0.11 ( Figure 1A ). 
For chitoporin (7EQR), a β-barrel protein that binds an oligosaccharide with a degree of polymerization (DP) of six, RFAA captured the binding pocket of the carbohydrate, but lacked broader structural accuracy, yielding an acceptable prediction with a DockQC of 0.26 ( Figure 1B ). With sialidase-sialic acid complex (8AXS), Boltz-1 achieved a medium quality prediction, correctly modeling the binding pocket and ring position (but not its orientation), with a DockQC of 0.65 ( Figure 1C ). In contrast, on glycoside hydrolase family 110 protein binding a Gal dimer (7JWF), AF3 nearly recapitulated the experimental structure, delivering a high-quality structure with a 0.96 DockQC ( Figure 1D ). In total, our DockQC quality thresholds were chosen to be incorrect (DockQC < 0.25), acceptable (0.25 <= DockQC < 0.50), medium (0.50 <= DockQC < 0.80), and high (DockQC >= 0.80) (Figure S1, Table S1). Download figure Open in new tab Figure 1: Protein-carbohydrate docked structures across DL methods. (A) Incorrect prediction of Chai-1 (red) on 7PGK (DockQC = 0.11). (B) Acceptable quality prediction of RFAA (orange) on 7EQR (DockQC = 0.26). (C) Medium quality prediction of Boltz-1 (violet) on 8AXS (DockQC=0.65). (D) High quality prediction of AF3 (green) on 7JWF (DockQC=0.96) DL Methods achieve medium or high accuracy on over 80% of cases After tuning our DockQC metric, we evaluated overall model performances on all BCAPIN targets ( Figure 2 ). Across methods, we found comparable results for all end-to-end models: at least 80% of their highest-confidence predictions (top-1) scored with at least acceptable quality. Expanding scoring to include the most accurate of each model’s top 5 confidence predictions (top-5) led to only marginal improvements. AF3 was the best-performing model: its top-1 predictions yielded 10% acceptable, 40% medium, and 35% high quality structures; top-5 predictions improved slightly to 15% acceptable, 35% medium, and 40% high quality structures. 
Download figure Open in new tab Figure 2: DL model success rates on BCAPIN Test Set. Each labeled method has the top-1 model on the left and top-5 model on the right. Given the strong performance of end-to-end models on BCAPIN, we next examined how starting structure influences DiffDock’s predictive power. DiffDock- holo (initialized with the experimentally solved holo protein structure) performed equivalently to the end-to-end models, achieving at least acceptable quality on 85% of all top-1 predictions. In contrast, Diffdock- AF3 (initialized from AF3-predicted apo protein) achieved only 60% acceptable or better quality in top-1 predictions. However, when extending to the top-5 predictions, Diffdock-AF3 improved substantially, yielding 85% acceptable quality structures. Thus, DiffDock is sensitive to the initial input structure. Methods fail to capture all cases Although all models perform strongly on BCAPIN, we sought to identify cases where all models still struggle. Notably, all models fail to predict on two complexes: 8DZD and 7ZON ( Figure 3 ). Download figure Open in new tab Figure 3: Failure of DL prediction algorithms on select proteins from the BCAPIN test set. Experimentally solved structures of (A) secreted protein (8DZD) and (B) glycosidase family 18 (7ZON, right) in gray, alongside AF3 predictions (blue), Boltz-1 (orange), Chai-1 (green), Diffdock (red), and RFAA (magenta). 8DZD is a Mycobacterium smegmatis secreted protein composed entirely of α-helices bound to a fructose-glucose disaccharide. While most models (except Chai-1) accurately predict the protein backbone, none correctly dock the ligand. RFAA places the ligand inside the protein. 7ZON is a glycosidase primarily composed of β-sheets bound to three independent glucose monosaccharides. Although most models correctly predict two of the binding sites, the models consistently misplace the third monosaccharide on the opposite side of the protein surface. 
We further scrutinized all predictions to identify additional cases of sub-optimal performance. We found that all models produced only acceptable to medium quality on 8IC1 and 7RFT ( Figure 4 ). Download figure Open in new tab Figure 4: Low Quality DL predictions on select proteins from the BCAPIN test set. We show the experimentally solved structures of (A) arabinose (8IC1) and (B) SAS protein 20 (7RFT), in gray, alongside AF3 predictions (blue), Boltz-1 (orange), Chai-1 (green), Diffdock (red), and RFAA (magenta). 8IC1 is an arabinose that binds a homogeneous arabinofuranose oligosaccharide of DP 4 along a β-sheet. Several models, such as AF3 and Boltz-1, incorrectly predict binding at an alternative β-sheet, while others (DiffDock and RFAA) incorrectly predict the saccharide conformation ( Figure 4A ). 7RFT is a SAS protein 20 that binds a glucose oligosaccharide of DP 3 at a β-sheet. Although all methods identify the binding pocket of 7RFT correctly, none accurately reproduce the specific experimental conformation, particularly the orientation of the terminal Glc, which experimentally makes minimal contact with the β-strand ( Figure 4B ). These data suggest that current models may have difficulty on α-helical binding pockets of saccharides, simultaneous binding of multiple ligands, and docking longer saccharides. Prediction power decreases with carbohydrate length We next hypothesized that model performance may correlate with saccharide complexity. To explore the role of DP on performance, we plotted the top-1 DockQC score against saccharide length ( Figure 5 ). In total, all models showed similar trends across saccharide length categories: medium quality for monosaccharides, medium to high quality for disaccharides, acceptable quality for oligosaccharides, and acceptable quality for glycosyltransferases (GTs). 
Thus, we observed a decline in performance as complexity increased from simple mono- and disaccharides to DP of three or greater and coordination of small ligands, in the case of GTs. Download figure Open in new tab Figure 5: Comparison of average and standard deviation DockQC of predicted structures versus saccharide length. We group saccharide length into a degree of polymerization (DP) of 1 (mono), 2 (di), and 3+ (oligo), and further group all glycosyltransferases (GTs) together that require multiple inputs (e.g. a saccharide and NTP) and with the number of proteins in each group listed. Dashed lines indicate the DockQC cutoffs between acceptable (red), medium (blue), and high (green) quality structures. Top-1 prediction on BCAPIN with AF3 (blue circle), Boltz-1 (orange square), Chai-1 (green X), Diffdock- holo (red triangle), Diffdock- AF3 (purple triangle), and RFAA (brown diamond). Download figure Open in new tab Figure 6: Comparison of confidence metrics and DockQC accuracy on the BCAPIN test set. Lines of best fit are provided for each plot. (A) Comparison of DockQC and ligand pLDDT for AF3 (blue circle), Boltz-1 (red square), Chai-1 (green X) and RFAA (gray diamond). (B) Comparison of DiffDock confidence for both DiffDock- holo (red circle) and DiffDock- AF3 (blue square). (C) Comparison of DockQC and ipTM for AF3, Boltz-1, and Chai-1. (D) Comparison of DockQC versus the pAE for AF3, Boltz-1 (called pDE), and RFAA. Model confidence moderately predicts accuracy Although all current models perform strongly on BCAPIN, performance varies across predictions. We therefore assessed whether models can reliably self-assess the accuracy of their own predictions using internal confidence metrics, such as predicted local distance difference test (pLDDT), interface predicted template modeling score (ipTM), and predicted aligned error (PAE). 
For average ligand pLDDT, AF3 and Boltz-1 show moderate correlations with DockQC, whereas Chai-1 and RFAA produce strong correlations ( Figure 6 ). Since pLDDT reflects only the ligand confidence, we also evaluated ipTM, which incorporates the protein-ligand interface. Among models reporting ipTM (AF3, Boltz-1, Chai-1), all show moderate correlations, with Chai-1 performing best (Supplemental Figure S3). For PAE, Boltz-1 showed a weak negative correlation of -0.26, AF3 a moderate correlation, and RFAA a strong correlation of -0.7 with DockQC (Supplemental Figure S4). Contrary to the end-to-end models, DiffDock provides only one confidence metric. While both DiffDock-holo and DiffDock-AF3 use the same scoring, DiffDock-AF3 provides a significantly weaker correlation than DiffDock-holo, reinforcing DiffDock’s sensitivity to the starting structure (Supplemental Figure S5). Overall, all end-to-end models show moderate correlations between their internal confidence metrics and DockQC, with RFAA demonstrating the strongest predictive reliability. Contrarily, DiffDock’s confidence metric is more susceptible to small perturbations in the input structure, limiting its reliability. Proteome scale predictions require refinement The BCAPIN dataset is limited to small (less than 600 residues) single- or two-domain structurally resolved proteins with strong binding affinities. Despite being implicated in important physiological interactions, binding characteristics of large multidomain or multichain structures with carbohydrates are less well characterized due to their relatively low binding affinity (but high avidity). To elucidate the protein-sugar interactome, researchers currently employ photoaffinity tag experiments 22 or use computational tools like LectinOracle 2 or PiCAP. 4 However, these tools do not provide structural protein-carbohydrate complex predictions. 
Therefore, we aimed to assess if any end-to-end all atom structure prediction models could provide a high-throughput de novo approach for predicting docked protein-carbohydrate complexes with high confidence. To evaluate a de novo protein-carbohydrate docking pipeline, we selected nine proteins from the human proteome and used AF3 with its ipTM confidence metric to predict their structures in complex with either a GM1 ganglioside or a hybrid N-glycan ( Figure 7 ). We used GM1 ganglioside ligands for proteins experimentally identified to interact with GM1 gangliosides in Zhang et al. and the hybrid N-glycan ligand for all others, as it is a common covalent modification on membrane and secreted proteins. Download figure Open in new tab Figure 7: AlphaFold3 predictions on selected human protein-glycan interactions. PiCAP provides the protein-level prediction, and CAPSIF2 provides residue predictions (cyan). The bound glycan is either a complex N-glycan (green) or a GM1 ganglioside (yellow), with the initial GlcNAc of the N-glycan highlighted in blue and all sialic acids highlighted in magenta. PiCAP predicted interleukin 31 (IL31), sonic hedgehog (SHH), and scrapie-responsive protein 1 (SCRG) as putative carbohydrate binding proteins. Here, we used AF3 to dock these proteins with a hybrid N-glycan, a common branched saccharide where one branch terminates in an oligomannose chain and the other in a sialic acid. CAPSIF2 predicted no carbohydrate-binding residues on IL31; however, AF3 predicted the glycan to bind at an unstructured region of the protein with a high interaction confidence (ipTM = 0.81). Conversely, AF3 docked the N-glycan at the CAPSIF2 predicted residues of SHH and SCRG with a lower confidence (ipTM = 0.49). Experimentally, arachindonyl ether phospholipid synthase (TM164), receptor-type tyrosine-protein phosphatase S (PTPRS), and Frizzled 1 (FZD1) were identified in multiple experiments as ganglioside binding proteins. 
22 These proteins were also predicted by PiCAP to bind a carbohydrate. We therefore modeled these proteins in complex with the GM1 ganglioside glycan ( Figure 7 ). AF3 predicted TM164 to bind GM1 in the CAPSIF2 predicted pocket with high confidence (ipTM = 0.85). However, AF3 predicts PTPRS and FZD1 to bind the ganglioside glycan at sites outside of the CAPSIF2 predicted pockets. Notably, CAPSIF2 predicts an intracellular binding pocket for FZD1, whereas AF3, experimental data, and CAPSIF:V all suggest binding occurs in the extracellular region. 22 While PiCAP predicts approximately 7,000 human proteins to bind carbohydrates, it also predicts ∼13,000 human proteins as non-binders. To assess whether AF3 could also discriminate between physiologically relevant and irrelevant interactions, we selected three proteins: mothers against decapentaplegic homolog 4 (SMAD4), NEDD-8 activating enzyme E1 regulatory subunit (ULA1), and Tudor domain containing 10 (TDR10). Since SMAD4 was previously investigated by Zhang et al. and identified as a putative non-binder of GM1, we modeled the protein with GM1. AF3, however, predicts the SMAD4-GM1 complex with high confidence (ipTM = 0.82). Similarly, AF3 predicted moderate to high confidence interactions for an N-glycan in complex with ULA1 (ipTM = 0.61) and TDR10 (ipTM = 0.79). These findings suggest that ipTM values alone may not be sufficient to distinguish between physiologic and non-physiologic interactions in a high-throughput manner. Discussion We present an evaluation of multiple end-to-end all-atom prediction frameworks for carbohydrate-protein docking and interrogate their capabilities at unveiling the structural secrets of the protein-sugar interactome. Overall, all methods perform incredibly well at this task – all end-to-end models capture 80% of their highest confidence models with at least acceptable quality ( Figure 2 ). 
These models improve upon previous energy-based protein-carbohydrate docking methods like GlycanDock 24 and HADDOCK 25 , which are useful for refinement but not full de novo docking. Although the models we tested improve upon previous methods and models, they still have limitations, including reduced performance with increased complexity. Specifically, the models perform worse on multi-ligand targets (GTs) and saccharides with DP greater than or equal to three. Also, the models lack robust confidence metrics for protein-carbohydrate complexes. Our BCAPIN dataset is the first study of protein-carbohydrate noncovalent docking, including all protein-carbohydrate complexes in the PDB. However, BCAPIN primarily comprises small, globular, single-domain proteins bound to linear glycan chains, which is not representative of the diverse protein-carbohydrate interactions found in physiological contexts. Thus, as more experimental data becomes available, alongside further developments in these prediction techniques, the framework presented here can be iterated to better elucidate the protein-sugar interactome. Several limitations are present in our work. Many protein-oligosaccharide complexes are dominated by non-specific electrostatic interactions. In some instances, a few central residues engage in specific interactions while terminal residues interact primarily through electrostatics, resulting in reduced experimental resolution, as seen in 7PUG and 7EQR in our low quality test set (Table S2). Other cases, like heparin-binding proteins, are driven almost exclusively by electrostatic forces. 26 To address these issues, we applied an average RSCC cutoff on bound ligands, though this approach removes several high-quality specific bound structures with unstructured termini. Additionally, BCAPIN does not include heparin binding proteins, which should be studied in future work. 
The largest limitation in continually iterating and benchmarking this structure prediction software, however, is the availability of high-quality experimental structures. Although the DIONYSUS dataset is impressive in its scope, containing 5,461 protein-carbohydrate complexes, only 1,842 unique protein structures remain after clustering by 95% sequence similarity 20 , 27 . Further, when assessing the individual unique binding pockets of these DIONYSUS proteins, there are only 258 unique clusters of binding pockets. 28 With this limited set of ∼1,800 unique structures and ∼250 unique binding mechanisms, data science and machine learning approaches are restricted. Therefore, discovery of novel carbohydrate binding proteins and their structural interactions is critical. To better improve computational approaches, we believe that one of the most promising sources of future data lies in liquid glycan arrays and photoaffinity labeling experiments (e.g. those using diazirine linkers). 21 , 22 , 29 , 30 These in vivo high throughput techniques enable identification of protein-carbohydrate interactions on a proteome-wide scale; however, they currently lack immediate structural resolution. Computational modeling stands poised to fill this gap by providing structural hypotheses at atomic level detail, thereby accelerating the validation and functional understanding of these experimentally identified interactions. To push the scope of the BCAPIN test set, we selected two branched polysaccharides with distinct properties to explore AF3’s capabilities. Although our study does not demonstrate that AF3 is yet ready to support full scale high-throughput experiments comparable to photoaffinity labeling, it shows that AF3 can generate useful, testable hypotheses on a case-by-case basis that may expedite wet lab investigations. To aid wet lab experiments, our lab has computationally studied protein-carbohydrate structural interactions. 
We developed GlycanDock 24 , CAPSIF 5 , and PiCAP 4 as ways to elucidate these interactions. PiCAP, in particular, represents a significant advancement, as it was the first model to predict whether a protein binds to carbohydrate, irrespective of protein family, on a proteome scale. However, these current models rely on the fundamental work of thousands of scientists solving crystal structures of protein-carbohydrate complexes. While high-throughput technologies are likely to uncover many more non-covalent protein-carbohydrate interactions in vivo , reliably obtaining the bound structure or identifying the full glycan repertoire for each protein remains a computational bottleneck. We envision a full suite of models and methods will fill the gap to identify the full protein-sugar interactome of a species. We advocate for a model that would improve upon LectinOracle 2 by integrating the glycan embeddings from methods like SweetNet 31 or Gifflar 32 with sequence and structural insights from structure prediction models, current photoaffinity experiments, and CAZY 33 to predict the glycan binding repertoire of all proteins. With this addition, one can use PiCAP to predict whether a protein binds carbohydrates, use CAPSIF2 or PeSTo-Carbs to predict how the protein binds the carbohydrate structurally, and finally, use the proposed model to predict which carbohydrates are recognized, all at high-throughput scales. This integrated approach will be essential to fully map the protein-sugar interactome, advancing our understanding of glycan-mediated biology and enabling translational applications in therapeutics and diagnostics. Methods Dataset To evaluate how all-atom prediction software extrapolates to glycans, we used DIONYSUS (access date: October 8, 2024) to construct our dataset. We first selected all protein-carbohydrate complexes after the September 2021 training cutoff date used by all models. 
Of the 5,461 structures identified by DIONYSUS, 614 proteins were deposited in the PDB after the training date cutoff. We then clustered all 5,461 protein sequences using MMSEQS 27 into 50% sequence identity clusters and removed any post-cutoff proteins with sequence homology with any protein published before the training date cutoff, leaving 105 structures. We then selected a single structure from each cluster, selecting the complex with the highest degree of polymerization (DP), leaving 35 protein structures. Of these 35 protein structures, 11 experimentally bind monomers, 6 experimentally bind dimers, 13 bind polymers (3+ saccharides), and 5 bind a saccharide and nucleotide triphosphate (NTP). For each structure, we analyzed the ligand structure quality measures, notably real space R factor (RSR) and real space correlation coefficient (RSCC) (Figure S2). 23 When these metrics weren’t available (7TOH, 7YWF, 8CSF), we provide their root-mean-squared deviation Z-scores (RMSZ). We define the set of high-quality structures with an RSCC greater than 0.9, 23 which contains 20 structures: 9 that bind monomers, 3 that bind dimers, 5 that bind polymers, and 3 that bind at least an NTP and a saccharide. We named our dataset the Benchmark of CArbohydrate Protein INteractions (BCAPIN). Prediction methodology To provide an equivalent and biologically relevant input ligand for all structures, we generated the SMILES strings of the original PDB ligand using GlyLES 35 (part of the Glycowork 34 Python package). In the case of homogeneous polymers, we extended the length of the original carbohydrate by a DP of 2 to provide additional biological context. AF3, Boltz-1, Chai-1, and DiffDock take a SMILES string as input, 7 , 9 , 12 , 13 but RFAA requires an SDF file input (ligand coordinates) to perform the calculations, so we used RDKit to calculate the initial ligand coordinates. 
In the case of heparin binding proteins (8EDI and 7PGK), we used the SMILES retrieved from the PubChem compound instead. 36 For the five glycosyltransferase (GT) targets, we input both the carbohydrate(s) and NTP to the software for multi-body docking. To replicate the process of a simple de novo pipeline, we ran all methods without modifications or customizations. AF3, Boltz-1, and RFAA were run with a local distribution with five random seeds using the SMILES strings (or RDKit generated SDF from the SMILES for RFAA). Chai-1 was run using the Chai-1 servers, which use five random seeds for predictions. All confidence metrics were extracted from provided mmCIF and json files. For predicted absolute error, we instead used Boltz-1’s interface predicted distance error (ipde). DiffDock is not an end-to-end method; therefore, we ran DiffDock in two different contexts: (1) with the solved experimental structure, which we call DiffDock- holo , and (2) with a predicted AF3 protein structure, which we call DiffDock- AF3 . The AF3 structure for the input into DiffDock- AF3 was chosen as the best ranking AF3 apo model from a run with 5 random seeds. We ran both DiffDock methods using the HuggingFace server with the SMILES strings, resulting in 10 total models. On GTs with multiple ligands, we concatenate the structures of the same rank together for a singular prediction. Metrics Carbohydrates differ substantially from conventional small molecules, as they range from small monosaccharides to branched polymers. We therefore selected the following metrics to analyze protein-carbohydrate complex predictions: full ligand F nat ( F nat,full ), residue F nat ( F nat,res ), ligand RMSD (LRMS), and ring-ring RMSD (rRMS). 
F nat is the fraction of native contacts, defined as all residue-residue contacts (any heavy atom to any heavy atom) within 5 Å: $$F_{nat} = \frac{TP}{TP + FN}$$ where TP (True Positives) is the overlap between predicted contacts and experimentally known contacts and FN (False Negatives) are all experimental contacts not observed in the predicted structure. We use this formal definition of residue-residue contacts, which we call F nat,res . In addition, as these are small molecule-like ligands, we define F nat,full , the full-ligand F nat , which instead of carbohydrate residue-protein residue contacts counts any carbohydrate heavy atom-protein residue contacts (effectively treating the full ligand as a singular residue). In addition to F nat , we leverage the root-mean-square deviation (RMSD) metric: $$\mathrm{RMSD} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}\lVert x_i - y_i\rVert^{2}}$$ where N is the number of selected heavy atoms, x i are the coordinates of select heavy atoms of the predicted structure, and y i are the same heavy atom coordinates of the experimentally determined structure after optimal superposition of the protein’s binding pocket (all residues within 10 Å of the ligand). We chose two different RMSDs to indicate the fine-grained nature of carbohydrate polymers: ligand RMSD and ring RMSD. Ligand RMSD ( LRMS ) measures the distance between the predicted and experimental structures of the ligand’s heavy atoms. For LRMS, we use the RDKit implementation that compares the maximal similar substructures. This measurement accounts for the specific orientations (e.g. chirality and epimerization) of the carbohydrate rings. 37 , 38 Ring RMSD ( rRMS ) simplifies the problem to only measuring the distance between the center of mass (COM) of each carbohydrate ring. We use a greedy implementation of rRMS , where each saccharide species is equivariant to any other saccharide species along the polymer chain. We combine the four separate measurements into “DockQC,” which represents the overall quality of the predicted protein-carbohydrate structure on a scale from 0 to 1. 
This metric is inspired by the foundational DockQ metric for measuring protein-protein docking. 39 , 40 DockQ scores a prediction on a scale of [0,1] by combining the fraction of native contacts ( F nat ), LRMS, and interface RMS (iRMS): 39 , 40 $$\mathrm{DockQ} = \frac{1}{3}\left[F_{nat} + \mathrm{RMS}_{scaled}(\mathrm{iRMS}, d_1) + \mathrm{RMS}_{scaled}(\mathrm{LRMS}, d_2)\right]$$ where d 1 = 1.5 Å and d 2 = 8.5 Å and $$\mathrm{RMS}_{scaled}(\mathrm{RMS}, d) = \frac{1}{1 + (\mathrm{RMS}/d)^{2}}$$ Currently, DockQ does not allow the ligands to differ in size between the crystal and predicted structure. Additionally, for small molecules, DockQ only reports the LRMS value. 40 When we reimplemented the DockQ metric with these values accounted for, we found it unrepresentative of the predictions (Table S1, Figure S1). We therefore constructed the DockQC based on the metrics as follows: [display equation not captured in this text extraction; see the publisher copy] where d 1 = 2.0 Å, d 2 = 4.0 Å. We tuned the scaling factors of d 1 and d 2 to fit the DockQC into the four different categories: incorrect (DockQC < 0.25), acceptable (0.25 <= DockQC < 0.50), medium (0.50 <= DockQC < 0.80), and high (DockQC >= 0.80) (Table S1, Figure S1). Human proteome predictions We selected nine proteins from the human proteome to evaluate de novo docking on proteomic scales, where PiCAP predicts six of these proteins as carbohydrate binding proteins and three as non-binding proteins. We used the following purported glycans for docking based on the function of each protein: GM1, Gal(β1-3)GalNAc(β1-4)[Neu5Ac(α1-3)]Gal(β1-4)Glcβ, for ganglioside binding proteins and a hybrid N-glycan for the remaining proteins, Neu5Ac(α1-6)Gal(β1-4)GlcNAc(β1-2)Man(α1-3)[Man(α1-6)[Man(α1-3)]Man(α1-6)]Man(β1-4)GlcNAc(β1-4)GlcNAcβ. Data Availability The BCAPIN dataset and all model inputs, code, and analysis data are available on Github at github.com/graylab/dockqc. Author Contributions S.W.C. conceived the project, performed the research, analyzed data, wrote the manuscript, and created all figures. L.L. conceived the project, performed the research, analyzed data, and wrote the manuscript. S.S.T. performed the research and analyzed data. J.J.G. conceptualized and supervised the project, analyzed data, and wrote the manuscript. 
Funding This work was supported by NIH R35-GM141881 (JJG and SWC), and NIH R01-AI162381 (JJG and SWC) and NSF-2108660 (LL) and NIH T32-GM008403 (SST). Acknowledgements We thank William F. Degrado for his continual support and advice on the project, identifying limitations of benchmarking when comparing experimental and computational protein-carbohydrate structures. We also thank Dr. Matthew O’Meara and Miguel Limcaoco for advice on evaluation of experimentally solved structures and of predicted structures. Computing was performed on the JHU High Performance Computing from ARCH. We thank the Rosetta Commons lab exchange travel support (LL). Funder Information Declared NIH , R35-GM141881 , R01-AI162381 , T32-GM008403 NSF , 2108660 Footnotes https://github.com/graylab/dockqc References 1. ↵ Bonnardel , F. , Mariethoz , J. , Pérez , S. , Imberty , A. & Lisacek , F. LectomeXplore, an update of UniLectin for the discovery of carbohydrate-binding proteins based on a new lectin classification . Nucleic Acids Res 49 , D1548 – D1554 ( 2021 ). OpenUrl CrossRef PubMed 2. ↵ Lundstrøm , J. , Korhonen , E. , Lisacek , F. & Bojar , D. LectinOracle: A Generalizable Deep Learning Model for Lectin–Glycan Binding Prediction . Advanced Science 9 , ( 2022 ). 3. ↵ Varki , A. et al. Essentials of Glycobiology . ( Cold Spring Harbor Laboratory Press, Cold Spring Harbor , 2022 ). 4. ↵ Canner , S. W. , Schnaar , R. L. & Gray , J. J. Predictions from Deep Learning Propose Substantial Protein-Carbohydrate Interplay . Preprint at doi: 10.1101/2025.03.07.641884 ( 2025 ). OpenUrl Abstract / FREE Full Text 5. ↵ Canner , S. W. , Shanker , S. & Gray , J. J. Structure-based neural network protein– carbohydrate interaction predictions at the residue level . Frontiers in Bioinformatics 3 , ( 2023 ). 6. ↵ Bibekar , P. , Krapp , L. & Peraro , M. D. PeSTo-Carbs: Geometric Deep Learning for Prediction of Protein–Carbohydrate Binding Interfaces . J Chem Theory Comput 20 , 2985 – 2991 ( 2024 ). 
OpenUrl CrossRef PubMed 7. ↵ Abramson , J. et al. Accurate structure prediction of biomolecular interactions with AlphaFold 3 . Nature 630 , 493 – 500 ( 2024 ). OpenUrl CrossRef PubMed 8. ↵ The Royal Swedish Academy of Sciences . The Nobel Prize in Chemistry 2024 . ( 2024 ). 9. ↵ Corso , G. , Stärk , H. , Jing , B. , Barzilay , R. & Jaakkola , T. DiffDock: Diffusion Steps, Twists, and Turns for Molecular Docking . ICLR ( 2023 ). 10. ↵ Corso , G. et al. Deep Confident Steps to New Pockets: Strategies for Docking Generalization . ( 2024 ). 11. ↵ Krishna , R. et al. Generalized biomolecular modeling and design with RoseTTAFold All-Atom . Science (1979) 384 , ( 2024 ). 12. ↵ Wohlwend , J. et al. Boltz-1 Democratizing Biomolecular Interaction Modeling . Preprint at doi: 10.1101/2024.11.19.624167 ( 2024 ). OpenUrl Abstract / FREE Full Text 13. ↵ Boitreaud , J. et al. Chai-1: Decoding the molecular interactions of life . Preprint at doi: 10.1101/2024.10.10.615955 ( 2024 ). OpenUrl Abstract / FREE Full Text 14. ↵ Buttenschoen , M. , Morris , G. M. & Deane , C. M. PoseBusters: AI-based docking methods fail to generate physically valid poses or generalise to novel sequences . Chem Sci 15 , 3130 – 3139 ( 2024 ). OpenUrl CrossRef PubMed 15. ↵ Fontana , C. & Widmalm , G. Primary Structure of Glycans by NMR Spectroscopy . Chem Rev 123 , 1040 – 1102 ( 2023 ). OpenUrl CrossRef PubMed 16. Zhang , S. , Chen , K. Y. & Zou , X. Carbohydrate-protein interactions: advances and challenges . Commun Inf Syst 21 , 147 – 163 ( 2021 ). OpenUrl CrossRef PubMed 17. ↵ Hudson , K. L. et al. Carbohydrate–Aromatic Interactions in Proteins . J Am Chem Soc 137 , 15152 – 15160 ( 2015 ). OpenUrl CrossRef PubMed 18. ↵ Nagae , M. et al. Crystal Structure of Anti-polysialic Acid Antibody Single Chain Fv Fragment Complexed with Octasialic Acid . Journal of Biological Chemistry 288 , 33784 – 33796 ( 2013 ). OpenUrl Abstract / FREE Full Text 19. ↵ Berman , H. M. The Protein Data Bank . 
Nucleic Acids Res 28 , 235 – 242 ( 2000 ). OpenUrl CrossRef PubMed Web of Science 20. ↵ Gheeraert , A. et al. DIONYSUS: a database of protein–carbohydrate interfaces . Nucleic Acids Res ( 2024 ) doi: 10.1093/nar/gkae890 . OpenUrl CrossRef 21. ↵ Babulic , J. L. , De León González , F. V. & Capicciotti , C. J. Recent advances in photoaffinity labeling strategies to capture Glycan–Protein interactions . Curr Opin Chem Biol 80 , 102456 ( 2024 ). OpenUrl CrossRef PubMed 22. ↵ Zhang , G.-L. et al. The Human Ganglioside Interactome in Live Cells Revealed Using Clickable Photoaffinity Ganglioside Probes . J Am Chem Soc 146 , 17801 – 17816 ( 2024 ). OpenUrl CrossRef PubMed 23. ↵ Smart , O. S. et al. Validation of ligands in macromolecular structures determined by X-ray crystallography . Acta Crystallogr D Struct Biol 74 , 228 – 236 ( 2018 ). OpenUrl CrossRef PubMed 24. ↵ Nance , M. L. , Labonte , J. W. , Adolf-Bryfogle , J. & Gray , J. J. Development and Evaluation of GlycanDock: A Protein–Glycoligand Docking Refinement Algorithm in Rosetta . J Phys Chem B 125 , 6807 – 6820 ( 2021 ). OpenUrl 25. ↵ Ranaudo , A. , Giulini , M. , Pelissou Ayuso , A. & Bonvin , A. M. J. J. Modeling Protein–Glycan Interactions with HADDOCK . J Chem Inf Model 64 , 7816 – 7825 ( 2024 ). OpenUrl CrossRef PubMed 26. ↵ Mishra , S. & Horswill , A. R. Heparin Mimics Extracellular DNA in Binding to Cell Surface-Localized Proteins and Promoting Staphylococcus aureus Biofilm Formation . mSphere 2 , ( 2017 ). 27. ↵ Hauser , M. , Steinegger , M. & Söding , J. MMseqs software suite for fast and deep clustering and searching of large protein sequence sets . Bioinformatics 32 , 1323 – 1330 ( 2016 ). OpenUrl CrossRef PubMed 28. ↵ Gheeraert , A. , Guyon , F. , Pérez , S. & Galochkina , T. Unraveling the diversity of protein-carbohydrate interfaces: Insights from a multi-scale study . Carbohydr Res 550 , 109377 ( 2025 ). OpenUrl 29. ↵ Lin , C.-L. , Carpenter , E. J. , Li , T. , Ahmed , T. & Derda , R. 
Liquid Glycan Array . in Phage Engineering and Analysis 143 – 159 ( 2024 ). doi: 10.1007/978-1-0716-3798-2_10 . OpenUrl CrossRef 30. ↵ Lima , G. M. et al. The liquid lectin array detects compositional glycocalyx differences using multivalent DNA-encoded lectins on phage . Cell Chem Biol ( 2024 ) doi: 10.1016/j.chembiol.2024.09.010 . OpenUrl CrossRef 31. ↵ Burkholz , R. , Quackenbush , J. & Bojar , D. Using graph convolutional neural networks to learn a representation for glycans . Cell Rep 35 , 109251 ( 2021 ). OpenUrl CrossRef PubMed 32. ↵ Joeres , R. & Bojar , D. Higher-Order Message Passing for Glycan Representation Learning . ( 2024 ). 33. ↵ Henrissat , B. A classification of glycosyl hydrolases based on amino acid sequence similarities . Biochem J 280 (Pt 2 ), 309 – 16 ( 1991 ). OpenUrl Abstract / FREE Full Text 34. ↵ Thomès , L. , Burkholz , R. & Bojar , D. Glycowork: A Python package for glycan data science and machine learning . Glycobiology 31 , 1240 – 1244 ( 2021 ). OpenUrl 35. ↵ Joeres , R. , Bojar , D. & Kalinina , O. V. GlyLES: Grammar-based Parsing of Glycans from IUPAC-condensed to SMILES . J Cheminform 15 , 37 ( 2023 ). OpenUrl 36. ↵ Kim , S. et al. PubChem 2025 update . Nucleic Acids Res 53 , D1516 – D1525 ( 2025 ). OpenUrl CrossRef PubMed 37. ↵ Meli , R. & Biggin , P. C. spyrmsd: symmetry-corrected RMSD calculations in Python . J Cheminform 12 , 49 ( 2020 ). OpenUrl CrossRef 38. ↵ Landrum , G. RDKit: Open-source cheminformatics . RDKit ( 2006 ). 39. ↵ Basu , S. & Wallner , B. DockQ: A Quality Measure for Protein-Protein Docking Models . PLoS One 11 , e0161879 ( 2016 ). OpenUrl CrossRef PubMed 40. ↵ Mirabello , C. & Wallner , B. DockQ v2: improved automatic quality measure for protein multimers, nucleic acids, and small molecules . Bioinformatics 40 , ( 2024 ). View the discussion thread. Back to top Previous Next Posted September 06, 2025. 
Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Evaluation of De Novo Deep Learning Models on the Protein-Sugar Interactome Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Evaluation of De Novo Deep Learning Models on the Protein-Sugar Interactome Samuel W. Canner , Lei Lu , Sho S. Takeshita , Jeffrey J. Gray bioRxiv 2025.09.02.673778; doi: https://doi.org/10.1101/2025.09.02.673778 Share This Article: Copy Citation Tools Evaluation of De Novo Deep Learning Models on the Protein-Sugar Interactome Samuel W. Canner , Lei Lu , Sho S. Takeshita , Jeffrey J. 
Gray bioRxiv 2025.09.02.673778; doi: https://doi.org/10.1101/2025.09.02.673778 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Biophysics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41936) Biophysics (21452) Cancer Biology (18588) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15153) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00