Machine Learning-Driven Glycoproteomic Profiling Identifies Novel Diabetes-Associated Glycosylation Biomarkers

doi:10.1101/2025.02.24.25322514

Machine Learning-Driven Glycoproteomic Profiling Identifies Novel Diabetes-Associated Glycosylation Biomarkers

2025 · doi:10.1101/2025.02.24.25322514

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 47,044 characters · extracted from preprint-html · click to expand

Machine Learning-Driven Glycoproteomic Profiling Identifies Novel Diabetes-Associated Glycosylation Biomarkers | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Machine Learning-Driven Glycoproteomic Profiling Identifies Novel Diabetes-Associated Glycosylation Biomarkers Amr Elguoshy , Keiko Yamamoto , Yoshitoshi Hirao , Kengo Yanagita , Tomohiro Uchimoto , Takashi Kamimura , Tetsuya Takazawa , Tadashi Yamamoto doi: https://doi.org/10.1101/2025.02.24.25322514 Amr Elguoshy 1 Biofluid and Biomarker Center, Kidney Research Center, Graduate School of Medical and Dental Sciences, Niigata University , Japan 2 Departments of Biotechnology, Faculty of Agriculture, Al-Azhar University , Cairo, Egypt Find this author on Google Scholar Find this author on PubMed Search for this author on this site Keiko Yamamoto 1 Biofluid and Biomarker Center, Kidney Research Center, Graduate School of Medical and Dental Sciences, Niigata University , Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yoshitoshi Hirao 1 Biofluid and Biomarker Center, Kidney Research Center, Graduate School of Medical and Dental Sciences, Niigata University , Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kengo Yanagita 1 Biofluid and Biomarker Center, Kidney Research Center, Graduate School of Medical and Dental Sciences, Niigata University , Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Tomohiro Uchimoto 1 Biofluid and Biomarker Center, Kidney Research Center, Graduate School of Medical and Dental Sciences, Niigata University , Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Takashi Kamimura 3 Department of Diabetes and Endocrinology, Shinrakuen Hospital Find this author on Google Scholar Find this author on PubMed Search for this author on this site Tetsuya Takazawa 3 Department of Diabetes and Endocrinology, Shinrakuen Hospital Find this author on Google Scholar Find this author on PubMed Search for this author on this site Tadashi Yamamoto 1 Biofluid and Biomarker Center, Kidney Research Center, Graduate School of Medical and Dental Sciences, Niigata University , Japan 4 Department of Clinical Laboratory, Shinrakuen Hospital , Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: tadashiy-bbc{at}ccr.niigata-u.ac.jp Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Glycosylation plays a critical role in protein function and disease progression, including diabetes mellitus. This study performed a comprehensive glycoproteomic analysis comparing healthy volunteers (HV) and DM samples, identifying 19,374 peptides and 2,113 proteins, of which 1,104 were glycosylated. A total of 287 distinct glycans were mapped to 3,722 glycosylated peptides, revealing significant differences in glycosylation patterns between HV and DM samples. Statistical analysis identified 29 significantly altered glycosylation sites, with 23 upregulated in DM and 6 downregulated in DM. Notably, the glycan HexNAc(2)Hex(2)Fuc(1) at position 215 of Prosaposin was significantly upregulated in DM, marking its first reported association with diabetes. Machine learning models, particularly Support Vector Machines (SVM) and Generalized Linear Models (GLM), achieved high classification accuracy (∼ 92%: 96%) in distinguishing HV and DM samples based on glycosylation features (Glycans, Glycosylated proteins, and Glycosylation sites). These findings suggest that altered glycosylation patterns may serve as potential biomarkers for diabetes-related pathophysiology and therapeutic targeting. Introduction Glycosylation, a ubiquitous and intricate post-translational modification, plays a pivotal role in regulating a multitude of biological processes, including protein folding, stability, cellular signaling, and immune response 1 , 2 . This enzymatic process, which involves the attachment of glycans to proteins or lipids, is highly sensitive to cellular metabolic states and environmental cues. Consequently, dysregulated glycosylation patterns have been linked to the pathogenesis of chronic diseases, particularly metabolic disorders such as diabetes mellitus 3 , 4 , 5 . For instance, disruptions in glycan biosynthesis pathways, such as impaired sialylation or fucosylation, are known to exacerbate insulin resistance and vascular dysfunction in DM 6 Diabetes mellitus is a chronic metabolic disorder characterized by persistent hyperglycemia resulting from defects in insulin secretion, insulin action, or both 7 . Beyond its well-documented metabolic dysregulation, DM is associated with profound changes in protein glycosylation, which contribute to disease progression and the development of complications such as nephropathy, neuropathy, retinopathy, and cardiovascular disease 6 , 8 . Emerging evidence suggests that glycosylation changes in DM influence key pathological processes, including chronic inflammation, insulin resistance, and endothelial dysfunction 9 , 10 . As a result, specific N-glycan profiles have garnered attention as potential biomarkers for early diagnosis, prognosis, and therapeutic monitoring in DM 11 , 3 . Despite significant advances in glycobiology, a comprehensive understanding of glycosylation alterations at both the protein and site-specific levels in DM remains elusive. Most studies to date have focused on global glycan profiling, leaving a critical gap in knowledge regarding the precise molecular mechanisms underlying glycosylation changes and their functional consequences in DM 3 . For example, glycosylation at specific residues can dictate protein stability, ligand binding, or subcellular localization—features critical to pathological processes in DM 12 . Addressing this gap requires advanced analytical approaches capable of capturing the complexity of glycosylation at high resolution. In this study, we conducted a systematic and in-depth analysis of glycosylation data derived from healthy individuals and DM patients. Utilizing a combined approach of statistical analysis and machine learning, we aimed to identify dysregulated glycoproteins, glycans and glycosylation sites associated with DM 13 . This dual-level analysis provided a comprehensive view of glycosylation changes in DM. Our analytical workflow incorporated rigorous statistical methods to identify differentially glycosylated features without imputation, followed by machine learning analyses on imputed data to evaluate the robustness and reproducibility of our findings. This integrated approach ensured the reliability of our results while offering insights into the potential impact of missing data imputation on the identification of significant glycosylation features. Here, we present the key findings from our analysis, highlighting specific glycoproteins and glycosylation sites that are significantly associated with DM pathology. We also discuss the broader implications of these findings for the development of novel biomarkers and therapeutic targets, which could pave the way for improved management and treatment of DM. Materials and Methods 1. Sample Collection and Preparation A total of 40 biological samples were collected, comprising 20 samples from healthy volunteers (HV) and 20 from diabetic patients ( Table1 ). Sample collection followed standardized protocols to minimize pre-analytical variability. View this table: View inline View popup Table 1. presents the demographic and clinical characteristics of the samples used in the study, categorized into healthy volunteers (HV) and diabetes mellitus (DM) patients. The table includes details on sample name, age, sex, HbA1c levels, and urinary protein status. Age: The HV and DM group includes individuals with age averages of 54.4, and 58.4 years respectively; Sex: Both groups comprise male and female participants.; HbA1c Levels: As expected, the DM group exhibits higher HbA1c values, indicative of their diabetic status.; Urinary Protein Status: Represented as (−), (±), and (+), indicating negative, borderline, or positive urinary protein levels, respectively. Urinary proteins, including glycoproteins, were precipitated using methanol/chloroform precipitation. Briefly, 500 µL of urine was used for each sample. Frozen urine samples were thawed at 37 °C in a water bath for 10 minutes prior to processing. An equal volume of methanol and one-quarter volume of chloroform were added to the urine sample, followed by thorough mixing for 5 minutes. The mixture was centrifuged at 19,000 × g at 25 °C for 15 minutes. The supernatant was carefully discarded without disturbing the protein-containing interface layer using a pipette. An equal volume of methanol was then added to the sample, mixed gently for 5 minutes, and centrifuged again at 19,000 × g at 25 °C for 15 minutes. The supernatant was removed, and the precipitated proteins were air-dried. 2. Tryptic Peptide Preparation The precipitated proteins, including glycoproteins, were dissolved in 100 µL of 8 M urea/50 mM Tris-HCl (pH 8.0) buffer. Reduction was performed by adding 1 µL of 1 M dithiothreitol (DTT) and incubating at room temperature (RT) for 1 hour. Alkylation was carried out by adding 8 µL of 500 mM iodoacetamide (IAA) and incubating at RT for 1 hour in the dark. The alkylation reaction was quenched by adding 1 µL of 1 M DTT. The sample was then diluted eightfold with 50 mM Tris-HCl (pH 8.0). For digestion, 1 µg of trypsin (Agilent, Santa Clara, CA, USA) was added, and the sample was incubated at 37 °C for 16 hours with shaking. Digestion was terminated by acidification with 50% trifluoroacetic acid (TFA). The digested peptides were purified using a C18 spin column (GL Science, Tokyo, Japan) according to the manufacturer’s instructions. Briefly, the C18 spin column was activated sequentially with 100% and 50% acetonitrile (ACN), followed by equilibration with 0.2% formic acid (FA) by centrifugation at 3,000 × g for 30 seconds. The sample was loaded onto the column and centrifuged at 3,000 × g for 90 seconds. Trapped peptides, including glycopeptides, were washed twice with 0.2% TFA and eluted with 95% ACN containing 5% FA. The eluted peptides were dried using a VEC-260 vacuum dryer (Iwaki, Tokyo, Japan). The dried peptides were reconstituted in 0.1% FA, and peptide concentration was measured using a NanoDrop 1000 spectrophotometer (Thermo Fisher Scientific, Bremen, Germany). Samples were stored at -80 °C until further analysis. 3. Mass Spectrometry Analysis Peptide samples were analyzed using a Fusion Lumos mass spectrometer equipped with an Orbitrap detector (Thermo Fisher Scientific, Bremen, Germany) in data-dependent acquisition (DDA) mode. Peptides, including glycopeptides, were separated using a nanoflow liquid chromatography system (nLC1000, Thermo Fisher Scientific) on a trap column (2 cm × 75 µm, Acclaim PepMap 100) and an analytical column (12.5 cm × 75 µm, NTCC-360) at a flow rate of 300 nL/min. The mobile phases consisted of (A) 0.1% FA in water and (B) 0.1% FA in ACN. A total of 500 ng of peptides was injected and eluted using a linear gradient from 2% to 35% mobile phase B over 120 minutes. The mass spectrometer was operated in positive ion mode with a scan range of 350–1800 m/z. The 20 most intense peaks with charge states of 2+, 3+ and 4+ were selected for fragmentation, with an automatic gain control (AGC) target of 4.0 × 10^5 and a maximum injection time of 50 ms. To enhance glycopeptide identification, a dual fragmentation strategy was employed, utilizing higher-energy collisional dissociation (HCD) and electron-transfer/higher-energy collisional dissociation (EThcD) for complementary fragmentation of glycopeptides 14 . Peptides, including both deglycosylated and glycopeptides, were identified with a stringent 1% false discovery rate (FDR) at the protein level. Glycosylation sites were mapped using a combination of automated database searches and manual validation to ensure high-confidence site localization. 4. Database Search Parameters MS/MS spectra were analyzed using the Byonic v3.4.0 search engine (Protein Metrics, Inc.) against the Homo sapiens SwissProt database (Version: 2018-07). The search parameters were as follows: - Parent ion mass tolerance: 10 ppm - Fragment ion mass tolerance: 0.02 Da - Fixed modification: Carbamidomethylation (C) - Variable modifications - 309 curated mammalian N-glycans - 6 most common O-glycans Spectral counts of glycosylated and non-glycosylated peptides were extracted for downstream quantitative analyses. 4. Quantification of Glycosylation at Different Levels Relative abundance normalization was applied to compute glycosylation levels at three hierarchical levels: site, glycan, and glycosylated protein. Site-Level Glycosylation Relative Abundance: Glycan-Level Glycosylation Relative Abundance Glycosylated Protein-Level Glycosylation Relative Abundance This relative abundance-based normalization accounted for variations in sample loading and MS performance, ensuring comparability across samples. 5. Statistical Analysis Prior to missing data imputation, statistical tests were applied to detect significant differences in glycosylation patterns between HV and DM groups. Differential Glycosylation Analysis: ∘ Relative abundances of glycans, glycosylated proteins, and glycosylation sites were compared. ∘ The Wilcoxon rank-sum test was used due to non-Gaussian Relative abundances distributions. ∘ Fold-change analysis was performed, considering |log 2 (fold change)| > 0.58 as biologically significant. 6. Missing Data Handling and Imputation To improve dataset robustness, glycans, glycosylated proteins, and glycosylation sites detected in fewer than 10 of the 40 samples were excluded to reduce noise before applying imputation methods. Missing values were imputed using five independent strategies to minimize bias in downstream analyses: Mean imputation (substitution with group mean) k-Nearest Neighbors (KNN) imputation (leveraging data similarity) Random Forest imputation (preserving complex relationships) Minimum imputation (assuming missing values correspond to low abundance) Mixed imputation, which combines KNN for missing at random (MAR) data and minimum imputation for missing not at random (MNAR) data Each imputation method generated a distinct dataset, and their performance was evaluated to ensure imputed values did not distort biologically meaningful patterns. 7. Machine Learning Analysis Machine learning models were developed to classify HV and DM samples based on glycosylation features. Feature Selection: ∘ Boruta Algorithm was used to identify the most discriminative glycosylation features. Boruta algorithm is an extension of the Random Forest algorithm and is particularly useful for high-dimensional datasets. Model Training and Evaluation: ∘ Algorithms applied: Logistic Regression, Decision Trees, Random Forest (RF), and Support Vector Machines (SVM). ∘ Performance was evaluated using 10-fold cross-validation (CV) and a test dataset. ∘ The following metrics were calculated: ▪ Accuracy (overall classification performance) ▪ Precision and Recall (class-wise performance) ▪ F1-score (harmonic mean of precision and recall) ▪ Area Under the Receiver Operating Characteristic Curve (AUC-ROC) Final Model Evaluation Criteria: ∘ Test Performance Priority: Emphasizing test-set metrics over CV metrics for real-world applicability. ∘ Consistency across Metrics: Prioritizing models with balanced accuracy, precision, recall, and F1-score. ∘ Variance in Cross-Validation: Models with lower standard deviation (CV_SD) in performance were favored. Feature Importance Analysis: ∘ Logistic Regression coefficients and Random Forest feature importance scores were analyzed. ∘ Key glycosylation sites contributing to classification were identified. 8. Software and Computational Tools All analyses were conducted in R (version 4.2.0). The following R packages were utilized: ∘ dplyr (data manipulation) ∘ ggplot2 (visualization) ∘ caret (machine learning modeling) ∘ missForest and DMwR (missing data imputation) ∘ pROC (ROC curve analysis) Mass spectrometry data were processed using Byonic software, followed by statistical and machine learning analysis in R. Results and Discussion 1. Glycosylation Profiles in Healthy Volunteers (HV) and Diabetes Mellitus Samples A comprehensive glycoproteomic analysis identified 19,374 peptides (both glycosylated and non-glycosylated), corresponding to 2,113 proteins across all samples. Among these, 287 distinct glycans were mapped to 3,722 glycosylated peptides, covering 9,194 unique glycosylation sites and 1,104 glycosylated proteins. Specifically, 282 N-glycans and 5 O-glycans were identified. This comprehensive dataset underscores the complexity of glycosylation in both physiological and pathological states, consistent with prior studies highlighting the diversity of glycan structures in human proteomes15. The predominance of N-glycans aligns with their established role in stabilizing protein conformations and mediating cellular interactions, processes frequently disrupted in metabolic diseases like DM 2 . Notably, aberrant glycosylation in DM has been linked to immune dysregulation and impaired cell signaling 6 . Our findings reinforce this paradigm, revealing diabetes-specific glycoforms that may contribute to disease progression. For instance, the identification of 5 O-glycans, though fewer in number, suggests potential roles in modulating extracellular matrix interactions or inflammatory pathways, as observed in other chronic inflammatory conditions 16 . 2. Dysregulated Glycans, Glycoproteins and Glycosylation Sites in DM Comparative statistical analysis revealed significant differences in glycosylation patterns between HV and DM cohorts (p < 0.05). Among 1,700 shared glycosylation sites, 29 exhibited altered abundances (23 upregulated in DM; 6 downregulated in DM) (( Figures 1a–1c ).), while 22 of 576 common glycoproteins were dysregulated (16 upregulated in DM; 6 downregulated in DM) ( Figures 2a–2c ) ( Tables-2 ). Download figure Open in new tab Figure 1(a–c): Comparative analysis of glycosylation sites between healthy volunteers (HV) and diabetes mellitus (DM) samples. 1a : Volcano plot displaying significantly dysregulated glycosylation sites in DM compared to HV (p < 0.05). 1b : Boxplots highlighting upregulated glycans at specific glycosylation sites in DM. 1c : Boxplots showing downregulated glycans at specific glycosylation sites in DM. Download figure Open in new tab Figure 2(a–c): Comparative analysis of glycosylated proteins between healthy volunteers (HV) and diabetes mellitus (DM) samples. 2a : Volcano plot displaying significantly dysregulated glycosylated proteins in DM compared to HV (p < 0.05). 2b : Boxplots highlighting upregulated glycosylated proteins in DM. 2c : Boxplots showing downregulated glycosylated proteins in DM. Key findings include the upregulation of the glycan HexNAc(2)Hex(2)Fuc(1) at position 215 of the Prosaposin precursor protein (site_4118) in DM (p < 0.0001) ( Figure 1a – 1b ), representing the first report of this glycan’s association with this specific site in DM. This glycan, previously detected in unrelated contexts such as healthy human urine 17 , Gaucher spleen tissue 18 , and GM1 gangliosidosis urine 19 at undefined positions, has never been linked to diabetes with the glycosylation at position 215. Four of the five identified sphingolipid activator proteins (SAPs), namely SAP-A, SAP-B, SAP-C, and SAP-D, originate from a common precursor protein (Prosaposin) through proteolytic processing. The SAP-precursor is encoded by a gene located on chromosome 10, which comprises 15 exons and 14 introns. Importantly, glycosylation at position 215 is critical for SAP-B function in lysosomal sphingolipid degradation, as its loss reduces protein stability and activity, leading to lysosomal storage disorders. This novel finding emphasizes its potential as a biomarker for diabetes-related lysosomal dysfunction 20 . HexNAc(5)Hex(6)Fuc(1)NeuAc(1)NeuGc(1) at position 78 in Prostaglandin-H2 D-isomerase (site_7054) (p < 0.01) ( Figure 1a, 1b ) ( Table 2 ), which was previously identified in prostate cancer urine; and HexNAc(4)Hex(5)NeuAc(2) at position 115 in AMBP precursor protein (site_6021) (p < 0.01) ( Figure 1a, 1b ). These observations suggest that glycosylation changes in DM may overlap with other pathological states, but their specific roles in DM-related processes, such as inflammation and oxidative stress, warrant further investigation. View this table: View inline View popup Table 2. an overview of the identified glycosylation sites, glycosylated proteins, and glycans associated with diabetes mellitus (DM). The data includes unique glycan feature IDs (used in figures) and their corresponding names, and status in DM (UP: increased, DOWN: decreased). Conversely, downregulated glycans in DM included: HexNAc(1)Hex(1)NeuAc(1) O-glycan at position Thr-38 in Phosphoinositide-3-kinase-interacting protein precursor (site_2019) (p < 0.01) ( Figure 1a, 1c ) ( Table 2 ), previously reported as O-linked in prostate cancer urine 21 , and HexNAc(1)Hex(1)NeuAc(2) O-glycan at position Thr-346 in YIPF3 precursor protein (site_2848) (p < 0.01) ( Figure 1a, 1c ), also identified in urine from a single study 22 . The observed downregulation may suggest an impairment of these glycosylation pathways in diabetes, with potential consequences for protein function and disease progression. Among dysregulated glycoproteins, Prosaposin (GN=PSAP) (Protein_147) and Prothrombin (GN=F2) (Protein_55) were the most significantly upregulated in DM (p < 0.0001) ( Figure 2a, 2b ) ( Table 2 ), while Phosphoinositide-3-kinase-interacting protein (GN=PIK3IP1) (Protein_490) (p < 0.0001) and Golgi membrane protein (GN=GOLM1) (Protein_465) (p < 0.01) were the most significantly downregulated (( Figure 2a, 2c )). Reduced glycosylation here may impair these protein signaling, a pathway central to insulin sensitivity. Among 268 shared glycans, the most significantly upregulated in DM was HexNAc(4)Hex(5)NeuAc(1) (Glycan_80) (p < 0.0001), followed by HexNAc(4)Hex(5)NeuGc(1) (Glycan_83) and HexNAc(5)Hex(3)Fuc(1) (Glycan_98) (p < 0.01) ( Figure 3a ) ( Table 2 ). These dysregulated glycosylation patterns and proteins are strongly linked to processes such as inflammation, oxidative stress, and glucose metabolism, reinforcing their relevance as potential biomarkers or therapeutic targets in DM. The identification of specific glycans and their precise roles in DM pathophysiology provides a foundation for further studies to explore their diagnostic and therapeutic potential. Download figure Open in new tab Figure 3a: Comparative analysis of the identified glycans between healthy volunteers (HV) and diabetes mellitus (DM) samples. 3a : Boxplots highlighting upregulated glycans in DM compared to HV. 3. Data Filtration and Imputation Data filtration retained glycans, glycosylated proteins and glycosylation sites detected in at least 10 out of 40 samples, resulting in 197 distinct glycans, 223 glycosylated proteins, and 450 glycosylation sites. This represented 73%, 38%, and 26% of the shared glycans, proteins, and sites, respectively, highlighting the challenge of analyzing low-abundance glycoproteins in clinical contexts. Missing data analysis revealed proportions of 34.2%, 41.7%, and 49.7% for glycans, proteins, and sites, respectively. To address missing data, five imputation methods were applied, generating distinct datasets for downstream analyses for each level (Glycosylation sites, Glycosylated proteins, Glycans). The impact of each method on data distribution was evaluated to determine the most suitable approach for our highly right-skewed dataset. Both mean imputation ( Figures 4a , 5a , 6a ) and MissForest imputation ( Figures 4c , 5c , 6c ) resulted in noticeable distortions of the data distribution, with a tendency to overestimate missing values. The similarity between the results of MissForest and mean imputation suggests that missing values may be randomly distributed and do not strongly depend on other variables. This indicates that the imputation model lacks sufficient features or variability to make accurate predictions. Download figure Open in new tab Figures 4(a–e): Comparison of different imputation methods for glycosylation site data. 4a : Mean imputation effects on glycan data distribution. 4b : K-nearest neighbors (KNN) imputation. 4c : MissForest imputation. 4d : Minimum imputation. 4e : Mixed imputation. Download figure Open in new tab Figures 5(a–e): Comparison of different imputation methods for glycosylated protein data. 5a : Mean imputation effects on glycan data distribution. 5b : K-nearest neighbors (KNN) imputation. 5c : MissForest imputation. 5d : Minimum imputation. 5e : Mixed imputation. Download figure Open in new tab Figures 6(a–e): Comparison of different imputation methods for glycan data. 6a : Mean imputation effects on glycan data distribution. 6b : K-nearest neighbors (KNN) imputation. 6c : MissForest imputation. 6d : Minimum imputation. 6e : Mixed imputation. In contrast, K-nearest neighbors (KNN) imputation (4b, 5b, 6b) produced imputed values that were better distributed and more closely aligned with the original data structure. This method demonstrated greater adaptability to the inherent variability in the dataset. Minimum imputation (4d, 5d, 6d) preserved the overall distribution but introduced a pronounced peak at the lower extreme of the data range, likely leading to an underestimation of missing values. Similarly, mixed imputation (4e, 5e, 6e), which combines KNN for missing at random (MAR) data and minimum imputation for missing not at random (MNAR) data, maintained the data distribution while introducing a small peak at the lower end. Although this approach may also underestimate missing values, it aligns well with the intrinsic right-skewed nature of the dataset, where most real values are small. Given the highly right-skewed nature of the data, methods incorporating minimum and mixed imputation are expected to yield the most reliable results. However, the final assessment of imputation performance should be based on the evaluation of machine learning models trained on these imputed datasets, as model performance will ultimately determine the most suitable imputation strategy for our data. 4. Machine Learning Analysis Glycosylation Sites Feature selection using Boruta identified between 17 (missForest) and 31 (KNN) significant glycosylation sites out of 450 features. Four sites (“gly_2019,” “gly_4118,” “gly_5450,” and “gly_3504”) were consistently selected across all methods. Support Vector Machine (SVM) models trained on minimum-imputed data achieved optimal classification performance, with an average cross-validation accuracy of 96%, precision of 94%, recall of 100%, specificity of 92%, F1-score of 97%, and AUC-ROC of 96%. The test set performance was similarly robust across all metrics ( Figure 7a, 7b ). Download figure Open in new tab Figures 7(a, b): Machine Learning Classification Performance and Variable Importance for Glycosylation Site Data Imputed by Minimum Imputation. 7a : Feature importance analysis, identifying the glycosylation sites that contributed most to the classification. 7b : Performance metrics of the classification model, including accuracy, precision, recall, specificity, F1-score, and AUC-ROC for both cross-validated (CV) and test data. Since minimum imputation produced the most accurate and reliable results, performance metrics for other imputation methods are not shown. Glycosylated Proteins Feature selection retained 15 to 21 glycoproteins, depending on the imputation method, with nine proteins consistently identified. SVM models trained on mixed-imputed data achieved the highest performance, yielding a cross-validation accuracy of 92%, precision of 93%, and AUC-ROC of 92%. Test set metrics reached 100% accuracy, precision, recall, and AUC ( Figure 8a, 8b ). Download figure Open in new tab Figures 8(a, b): Machine Learning Classification Performance and Variable Importance for Glycoprotein Data Imputed by mixed Imputation. 8a : Feature importance analysis, identifying the glycosylated proteins that contributed most to the classification. 8b : Performance metrics of the classification model, including accuracy, precision, recall, specificity, F1-score, and AUC-ROC for both cross-validated (CV) and test data. Since mixed imputation produced the most accurate and reliable results, performance metrics for other imputation methods are not shown. Glycans Feature selection retained 9 to 12 features depending on the imputation method, with four glycans (“Glycan_18”, “Glycan_80”, “Glycan_98”, “Glycan_254”) consistently identified. Generalized linear models (GLMs) trained on KNN-imputed data achieved superior classification performance, with cross-validation accuracy of 94%, precision of 100%, and AUC-ROC of 95%. Test set performance metrics were similarly high, reinforcing the robustness of the approach ( Figure 9a, 9b ). Download figure Open in new tab Figures 9(a, b): Machine Learning Classification Performance and Variable Importance for Glycan Data Imputed by K-Nearest Neighbor (KNN) Imputation. 8a : Feature importance analysis, identifying the glycans that contributed most to the classification. 8b : Performance metrics of the classification model, including accuracy, precision, recall, specificity, F1-score, and AUC-ROC for both cross-validated (CV) and test data. Since KNN imputation produced the most accurate and reliable results, performance metrics for other imputation methods are not shown. Both SVM and GLM demonstrated superior performance with KNN and minimum imputation strategies, providing reliable tools for distinguishing HV and DM samples based on glycosylation signatures. The differential performance of SVM and GLM in glycosylation data analysis is inherently linked to the complexity, structure, and statistical properties of the data. SVM excels in modeling non-linear and high-dimensional relationships prevalent in glycosylation site and glycoprotein data, whereas GLMs are more suited for compositional glycan data with linear additive effects. The integration of both approaches provides a comprehensive framework for identifying glycosylation-based biomarkers and understanding disease-specific glycosylation alterations. The overlap between statistically significant features and machine learning priorities (e.g., Prosaposin N215) validates their biological relevance. This dual approach mirrors recent advances in integrative glycoproteomics, where hybrid frameworks enhance biomarker discovery. Dysregulated glycosylation features highlighted by these methods offer valuable insights into DM pathology and hold potential as biomarkers or therapeutic targets. 5. Novel Site-Specific Glycosylation in DM In this study, we identified distinct glycosylation patterns at specific amino acid residues in proteins from individuals with Diabetes Mellitus that were absent in healthy controls. Specifically, the Asn-831 and Ser-833 residues in the Fibrinogen alpha chain were found to be systematically glycosylated exclusively in DM patients. The Asn-831 site was modified with a HexNAc(1)Fuc(1) glycan, while the Ser-833 site was modified with a HexNAc(1)Hex(1)NeuAc(1) glycan. Additionally, the Thr-231 residue in Roundabout homolog 4 (ROBO4) was observed to be systematically glycosylated exclusively in DM patients, with an O-linked glycan structure of HexNAc(1)Hex(1)NeuAc(2). These findings represent the first reported instance of glycosylation at these specific residues in the context of DM. Notably, these glycosylation sites have not been previously documented in the GlyConnect database, underscoring their novelty and potential significance in the pathophysiology of DM. Further investigation into the functional implications of these glycosylation events may provide insights into the molecular mechanisms underlying DM and its associated complications. Conclusion This study provides a detailed glycoproteomic landscape of diabetes mellitus, identifying significant alterations in glycosylation patterns. The discovery of novel glycosylation sites and their association with DM highlights the potential of glycosylation as a biomarker for disease progression. The application of statistical and machine learning methods further strengthens the robustness of these findings, demonstrating their utility in distinguishing DM from healthy states. These results pave the way for future functional investigations and biomarker validation in clinical settings. Data Availability All data produced in the present study are available upon reasonable request to the authors Funding This research was supported by MEXT, JST, Tosoh Corp. and Masanori Katagiri Foundation. Institutional Review Board Statement This study was conducted in accordance with the Declaration of Helsinki and was approved by the ethics committees of Shinrakuen Hospital (No. H290005) and Niigata University (No. 2021-0025). Informed Consent Statement Urine samples used in this study were collected from left-over specimens of healthy subjects (volunteers) after laboratory tests for their health check at Shinrakuen Hospital and were anonymized with link information unavailable to the investigators. The ethics committees approved the collection and use of these samples by obtaining informed consent in the form of opt-out. Conflicts of Interest The authors declare no conflict of interest. Acknowledgments COI: BBC is the Collaborative Research Laboratory of Tosoh Corporation. References 1. ↵ In Essentials of Glycobiology , 4th ed.; Varki , A. ; Cummings , R. D. ; Esko , J. D. ; Stanley , P. ; Hart , G. W. ; Aebi , M. ; Mohnen , D. ; Kinoshita , T. ; Packer , N. H. ; Prestegard , J. H. ; Schnaar , R. L. ; Seeberger , P. H. , Eds. Cold Spring Harbor (NY ), 2022 . 2. ↵ Moremen , K. W. ; Tiemeyer , M. ; Nairn , A. V. , Vertebrate protein glycosylation: diversity, synthesis and function . Nature reviews. Molecular cell biology 2012 , 13 ( 7 ), 448 – 62 . OpenUrl CrossRef PubMed 3. ↵ Rudman , N. ; Gornik , O. ; Lauc , G. , Altered N-glycosylation profiles as potential biomarkers and drug targets in diabetes . FEBS letters 2019 , 593 ( 13 ), 1598 – 1615 . OpenUrl CrossRef PubMed 4. ↵ Stambuk , T. ; Gornik , O. , Protein Glycosylation in Diabetes . Advances in experimental medicine and biology 2021 , 1325 , 285 – 305 . OpenUrl PubMed 5. ↵ Ohtsubo , K. ; Marth , J. D. , Glycosylation in cellular mechanisms of health and disease . Cell 2006 , 126 ( 5 ), 855 – 67 . OpenUrl CrossRef PubMed Web of Science 6. ↵ Gornik , O. ; Lauc , G. , Glycosylation of serum proteins in inflammatory diseases . Disease markers 2008 , 25 ( 4-5 ), 267 – 78 . OpenUrl CrossRef PubMed Web of Science 7. ↵ American Diabetes, A ., (2) Classification and diagnosis of diabetes . Diabetes care 2015 , 38 Suppl , S8 – S16 . OpenUrl FREE Full Text 8. ↵ Zou , Y. ; Li , S. ; Chen , W. ; Xu , J. , Urine-derived stem cell therapy for diabetes mellitus and its complications: progress and challenges . Endocrine 2024 , 83 ( 2 ), 270 – 284 . OpenUrl PubMed 9. ↵ Higashioka , M. ; Hirakawa , Y. ; Hata , J. ; Honda , T. ; Sakata , S. ; Shibata , M. ; Kitazono , T. ; Osawa , H. ; Ninomiya , T. , Serum Mac-2 Binding Protein Glycosylation Isomer Concentrations Are Associated With Incidence of Type 2 Diabetes . The Journalof clinical endocrinology and metabolism 2023 , 108 ( 7 ), e425 – e433 . OpenUrl 10. ↵ Masaki , N. ; Feng , B. ; Breton-Romero , R. ; Inagaki , E. ; Weisbrod , R. M. ; Fetterman , J. L. ; Hamburg , N. M. , O-GlcNAcylation Mediates Glucose-Induced Alterations in Endothelial Cell Phenotype in Human Diabetes Mellitus . Journal of the American Heart Association 2020 , 9 ( 12 ), e014046 . OpenUrl CrossRef PubMed 11. ↵ Rao , P. V. ; Laurie , A. ; Bean , E. S. ; Roberts , C. T. , Jr . .; Nagalla , S. R. , Salivary protein glycosylation as a noninvasive biomarker for assessment of glycemia . Journal of diabetes science and technology 2015 , 9 ( 1 ), 97 – 104 . OpenUrl 12. ↵ Mereiter , S. ; Balmana , M. ; Campos , D. ; Gomes , J. ; Reis , C. A. , Glycosylation in the Era of Cancer-Targeted Therapy: Where Are We Heading? Cancer cell 2019 , 36 ( 1 ), 6 – 16 . OpenUrl CrossRef PubMed 13. ↵ Lazar , C. ; Gatto , L. ; Ferro , M. ; Bruley , C. ; Burger , T. , Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative Proteomics Data Sets to Compare Imputation Strategies . Journalof proteome research 2016 , 15 ( 4 ), 1116 – 25 . OpenUrl 14. ↵ Yu , Q. ; Wang , B. ; Chen , Z. ; Urabe , G. ; Glover , M. S. ; Shi , X. ; Guo , L. W. ; Kent , K. C. ; Li , L. , Electron-Transfer/Higher-Energy Collision Dissociation (EThcD)-Enabled Intact Glycopeptide/Glycoproteome Characterization . Journal of the American Society for Mass Spectrometry 2017 , 28 ( 9 ), 1751 – 1764 . OpenUrl CrossRef PubMed 15. Aebersold , R. ; Mann , M. , Mass-spectrometric exploration of proteome structure and function . Nature 2016 , 537 ( 7620 ), 347 – 55 . OpenUrl CrossRef PubMed 16. ↵ Kudelka , M. R. ; Antonopoulos , A. ; Wang , Y. ; Duong , D. M. ; Song , X. ; Seyfried , N. T. ; Dell , A. ; Haslam , S. M. ; Cummings , R. D. ; Ju , T. , Cellular O-Glycome Reporter/Amplification to explore O-glycans of living cells . Nature methods 2016 , 13 ( 1 ), 81 – 6 . OpenUrl PubMed 17. ↵ Faull , K. F. ; Johnson , J. ; Kim , M. J. ; To , T. ; Whitelegge , J. P. ; Stevens , R. L. ; Fluharty , C. B. ; Fluharty , A. L. , Structure of the asparagine-linked sugar chains of porcine kidney and human urine cerebroside sulfate activator protein . Journal of mass spectrometry : JMS 2000 , 35 ( 12 ), 1416 – 24 . OpenUrl PubMed 18. ↵ Ito , K. ; Takahashi , N. ; Takahashi , A. ; Shimada , I. ; Arata , Y. ; O’Brien , J. S. ; Kishimoto , Y. , Structural study of the oligosaccharide moieties of sphingolipid activator proteins, saposins A, C and D obtained from the spleen of a Gaucher patient . European journal of biochemistry 1993 , 215 ( 1 ), 171 – 9 . OpenUrl PubMed 19. ↵ Michalski , J. C. ; Lemoine , J. ; Wieruszeski , J. M. ; Fournet , B. ; Montreuil , J. ; Strecker , G. , Characterization of a novel type of chain-terminator Gal beta 1-6Gal beta 1-4)GlcNAc in an oligosaccharide related to N-glycosylated protein glycans isolated from GM1 the urine of patients with gangliosidosis . Europeanjournalof biochemistry 1991 , 198 ( 2 ), 521 – 6 . OpenUrl 20. ↵ Wrobe , D. ; Henseler , M. ; Huettler , S. ; Pascual Pascual , S. I. ; Chabas , A. ; Sandhoff , K. , A non-glycosylated and functionally deficient mutant (N215H) of the sphingolipid activator protein B (SAP-B) in a novel case of metachromatic leukodystrophy (MLD) . Journal of inherited metabolic disease 2000 , 23 ( 1 ), 63 – 76 . OpenUrl CrossRef PubMed Web of Science 21. ↵ Kawahara , R. ; Ortega , F. ; Rosa-Fernandes , L. ; Guimaraes , V. ; Quina , D. ; Nahas , W. ; Schwammle , V. ; Srougi , M. ; Leite , K. R. M. ; Thaysen-Andersen , M. ; Larsen , M. R. ; Palmisano , G. , Distinct urinary glycoprotein signatures in prostate cancer patients . Oncotarget 2018 , 9 ( 69 ), 33077 – 33097 . OpenUrl PubMed 22. ↵ Darula , Z. ; Pap , A. ; Medzihradszky , K. F. , Extended Sialylated O-Glycan Repertoire of Human Urinary Glycoproteins Discovered and Characterized Using Electron-Transfer/Higher-Energy Collision Dissociation . Journal of proteome research 2019 , 18 ( 1 ), 280 – 291 . OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted February 26, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Machine Learning-Driven Glycoproteomic Profiling Identifies Novel Diabetes-Associated Glycosylation Biomarkers Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Machine Learning-Driven Glycoproteomic Profiling Identifies Novel Diabetes-Associated Glycosylation Biomarkers Amr Elguoshy , Keiko Yamamoto , Yoshitoshi Hirao , Kengo Yanagita , Tomohiro Uchimoto , Takashi Kamimura , Tetsuya Takazawa , Tadashi Yamamoto medRxiv 2025.02.24.25322514; doi: https://doi.org/10.1101/2025.02.24.25322514 Share This Article: Copy Citation Tools Machine Learning-Driven Glycoproteomic Profiling Identifies Novel Diabetes-Associated Glycosylation Biomarkers Amr Elguoshy , Keiko Yamamoto , Yoshitoshi Hirao , Kengo Yanagita , Tomohiro Uchimoto , Takashi Kamimura , Tetsuya Takazawa , Tadashi Yamamoto medRxiv 2025.02.24.25322514; doi: https://doi.org/10.1101/2025.02.24.25322514 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (570) Allergy and Immunology (864) Anesthesia (302) Cardiovascular Medicine (4445) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15236) Forensic Medicine (30) Gastroenterology (1127) Genetic and Genomic Medicine (6610) Geriatric Medicine (669) Health Economics (1000) Health Informatics (4549) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15926) Intensive Care and Critical Care Medicine (1104) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6613) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1147) Occupational and Environmental Health (957) Oncology (3341) Ophthalmology (975) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1694) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5458) Public and Global Health (9244) Radiology and Imaging (2205) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1197) Rheumatology (596) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (713) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a024838e7d9d593a',t:'MTc3OTg3OTYwNQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00