Full text
41,888 characters
· extracted from
preprint-html
· click to expand
AstraPTM: Context-Aware PTM Prediction Model for Large-Scale Proteins | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results AstraPTM: Context-Aware PTM Prediction Model for Large-Scale Proteins Aniruddh Goteti , Çağlar Bozkurt doi: https://doi.org/10.1101/2025.02.20.639276 Aniruddh Goteti 1 Orbion GmbH Find this author on Google Scholar Find this author on PubMed Search for this author on this site Çağlar Bozkurt 1 Orbion GmbH Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: caglar.bozkurt{at}orbion.life Abstract Full Text Info/History Metrics Preview PDF Abstract Post-translational modifications (PTMs) are critical molecular events that pro-foundly influence protein stability, localization, and function. While numerous computational tools exist for PTM site prediction, most struggle with handling large proteins and rely on separate models for each modification. To address these challenges, we introduce AstraPTM , a transformer-based framework that predicts 25 PTMs in a single pass. By leveraging advanced protein embeddings (ESM2) and training on a high-coverage dbPTM dataset, AstraPTM captures both short-range sequence motifs and long-range interactions across proteins without a sequence length limitation. AstraPTM combines a binary classification module—indicating whether a residue is modified—with a multi-label module that pinpoints specific PTM types. This dual approach achieves high accuracy on well-represented PTMs (e.g., phospho-rylation, glycosylation) while maintaining sensitivity for rarer modifications. In benchmarks against existing methods such as MusiteDeep and MIND-S, AstraPTM demonstrates competitive or superior performance, demonstrating AUC-ROC above 99% for well-represented modifications, underscoring its versatility for proteome-wide annotation. Beyond prediction, the model’s capacity to handle full-length proteins offers a powerful resource for researchers investigating PTM crosstalk and disease pathways, ultimately bridging the gap between large-scale omics data and targeted biomedical applications. Introduction Post-translational modifications (PTMs) are chemical changes occurring after protein synthesis, significantly influencing a protein’s stability, localization, and interactions ( 1 ; 2 ; 3 ). Common examples include phosphorylation (adding a phosphate group), glyco-sylation (adding sugar molecules), and ubiquitination (tagging proteins for degradation). By altering how proteins fold and function, PTMs modulate nearly every aspect of cellular biology and can play pivotal roles in disease progression ( 4 ; 15 ). Researchers have developed numerous computational tools to predict PTM sites from protein sequence and structural cues. Some approaches focus on proximity-based features, while others draw on evolutionary history to highlight residues likely to undergo modification ( 6 ; 7 ; 2 ). Understanding these prediction methods is especially relevant in medical contexts, where aberrant PTM patterns are frequently implicated in cancers, neurodegenerative disorders, and other pathologies. By pinpointing these abnormal sites, scientists can design targeted therapies aimed at restoring or inhibiting specific modifications ( 8 ; 9 ; 10 ). Despite notable progress, many existing tools face limitations such as handling large proteins, accommodating highly imbalanced PTM data, or requiring separate models for different modification types. To address these gaps, we introduce AstraPTM : a context-aware, transformer-based model capable of predicting 25 PTMs in a single pass. By combining comprehensive protein embeddings, sequence-level context, and extensive training data, AstraPTM aspires to deliver more robust and scalable PTM predictions for both research and clinical applications. 2 Literature Review Post-translational modifications (PTMs) are key biochemical processes that modify proteins after synthesis, influencing their stability, localization, and interactions ( 11 ; 15 ; 2 ). These modifications include phosphorylation, glycosylation, ubiquitination, and acetylation, each capable of reshaping a protein’s structure and function. Given their dynamic and context-dependent nature, predicting PTMs is vital for understanding protein regulation and complex cellular pathways. Early computational tools such as SAPH-ire leveraged structural data (e.g., solvent accessibility) and family-level modification counts to predict PTM sites ( 11 ). Subsequent models like Sitetack demonstrated that incorporating known PTM locations and co-occurrence patterns (e.g., O-glycosylation with phosphorylation) can further enhance predictive accuracy ( 15 ). These advances underscore the importance of analyzing PTM networks holistically, as co-evolving PTMs may have synergistic effects on protein function ( 2 ). Research has also highlighted the clinical significance of PTM prediction, particularly in cancer and neurodegenerative diseases, where abnormal PTM profiles can reveal therapeutic targets ( 1 ; 16 ). By identifying key phosphorylation or ubiquitination sites, for instance, drug design can focus on inhibiting or modifying these PTMs, thereby improving treatment specificity and efficacy. Tools like jEcho and PEIMAN have further integrated sequence, structure, and evolutionary data to generate user-friendly platforms for PTM visualization and enrichment analysis, assisting researchers in systematically assessing PTM implications ( 17 ; 10 ). Recent developments in machine learning and AlphaFold -driven structural predictions have greatly enhanced PTM site prediction ( 18 ). By combining high-resolution structural data with multi-scale sequence representations, these models offer more nuanced insights into PTM location and function. This synergistic approach signifies a leap forward in our capacity to decode complex regulatory networks governed by PTMs, thereby paving the way for improved therapeutic interventions and a deeper comprehension of protein biology. Recent PTM prediction models have demonstrated notable improvements over established methods ( 12 ; 13 ; 14 ; 15 ), yet they still exhibit considerable shortcomings. One key issue is the reliance on limited context windows, which restricts the ability to capture long-range dependencies crucial for context-sensitive post-translational modifications. Additionally, many models do not include a sufficiently large or diverse set of proteins in their training corpora, hampering the identification of nuanced PTM occurrence patterns across different organisms and protein families. Finally, most current approaches lack an open-set or generative strategy, focusing only on PTMs already established by experimental evidence and rarely venturing beyond them to predict entirely novel modifications. In summary, PTM prediction research continues to expand, encompassing structural, sequence-based, and evolutionary strategies. The integration of these data streams, coupled with advances in deep learning, positions PTM analysis as an essential component of modern proteomic research and drug discovery efforts. 3 Keys to Successful PTM Prediction Careful dataset design is critical for achieving successful predictions with deep learning. When predicting post-translational modifications (PTMs) , the following considerations are especially important: Contextual Awareness PTM occurrence heavily depends on the surrounding protein sequence, so incorporating sufficient context around each residue is crucial for accurate prediction. Feature Enrichment To help the model learn the nuances of PTM sites, additional biochemical and structural properties should be included in the feature set. This enrichment goes beyond raw sequence data to capture meaningful context. Dataset Size and Diversity While a dataset should be homogeneous in format, it must also represent the various PTMs found in real-world data. Ensuring both sufficient size and diversity allows the model to generalize effectively to different PTMs. 4 AstraPTM: Steps to a Context-Aware Model AstraPTM is a deep learning framework designed to predict which residues carry post-translational modifications (PTMs) and to specify the type of PTM at each site. By leveraging powerful ESM2 embeddings and a Transformer architecture, AstraPTM captures both local and global sequence information without being constrained by fixed windows. This enables accurate predictions for proteins of varying lengths—without a limitation in length—while efficiently learning from multiple types of PTM data. Highlights of the AstraPTM Approach Comprehensive Context Capture Unlike models that rely on fixed-size windows, AstraPTM’s transformer-based design uses self-attention to integrate information across the full protein. Coupled with ESM2 embeddings, this allows the model to detect subtle, long-range dependencies crucial for accurate PTM predictions. Scalability for Large Proteins By avoiding manual sequence truncation, AstraPTM can handle proteins without a limit in sequence length. Its attention mechanism dynamically scales to capture interactions among distant regions. Binary & Multi-label Synergy A single framework handles both the identification of PTM sites ( binary task ) and the classification of specific PTM types ( multi-label task ). This approach improves overall robustness, especially for rare or novel modifications. Benefits of Having Binary and Multi-label Models For most PTMs, the scarcity of experimental data constrains the predictive performance of machine learning methods. As a result, many machine learning models designed for PTM prediction restrict the range of PTMs they support within their multi-label frameworks. To overcome this limitation and assist researchers in identifying novel or rare PTMs in their proteins of interest, we frame the PTM prediction task around two closely related questions: Where do PTMs occur along the protein sequence? Which types of PTMs are likely to be present at those locations? This approach stems from the limited availability of PTM data. The dbPTM database, for example, shows a highly imbalanced distribution: only 25 PTMs appear in more than 100 proteins, and just 14 exceed 1,000 annotated proteins. Consequently, a single large multi-class model spanning all 72 dbPTM PTMs would struggle to learn meaningful distinctions among underrepresented PTM types. By contrast, a binary classification model can effectively learn to detect whether a residue is modified at all—even if a specific PTM is rare or unobserved. The companion multi-label classification model then refines the prediction by specifying which PTM(s) are most likely to be present at that site. In practice: Binary PTM Presence The model can potentially capture novel or rare PTMs by leveraging shared contextual cues across different PTMs. Multi-Label PTM Type Where sufficient data exists, the model can discriminate among known PTM categories for more detailed predictions. Together, these models enhance both accuracy and robustness, balancing broad detection with targeted classification. Contextual Awareness via ESM2 and Transformers To capture the rich context crucial for PTM prediction, four key steps define AstraPTM’s pipeline: 1. Incorporated ESM2 Embeddings Each residue is encoded using the esm2_t33_650M_UR50D variant, which provides a 1,280-dimensional embedding trained on large-scale protein data. These embeddings contain evolutionary and structural signals that give AstraPTM a strong starting point for PTM discrimination. 2. Transformer Architecture with Multiple Attention Heads Building on ESM2’s rich embeddings, we use a transformer with eight attention heads per layer. Multiple heads let the model focus on distinct “subspaces” of protein features, from local sequence motifs to long-range structural interactions. This goes beyond raw embeddings by learning how specific residues and neighboring regions influence PTM sites. 3. Full-Length Sequences Rather than trimming proteins, we train on entire sequences (up to 3,000+ residues). This ensures that any remote interactions—which may be critical for PTM formation—are retained. Such a full-sequence approach outperforms methods restricted to short windows. 4. Contextual Labeling with PTM Data We curated a labeled dataset from dbPTM, annotating each residue with the PTM type and relevant structural context (when available). These detailed labels teach the transformer to distinguish which environmental cues correlate with specific modifications. Download figure Open in new tab Figure 1: A diagram visualizing how the features and the labels of AstraPTM model arestructured. Training Data Structure We leveraged dbPTM as our primary data source, which initially contains over 280 , 000 unique proteins . To manage computational constraints, we excluded proteins exceeding 3 , 000 residues , removing approximately 40,000 entries. This filtering struck a balance between covering a wide range of proteins and maintaining tractable sequence lengths for model training. For each protein in the filtered dataset, we generated ESM2 embeddings (see ‘Contextual Awareness”) to capture evolutionary and structural signals. To further enrich these representations, we incorporated additional residue-level and protein-level features that together form a 1 , 351-dimensional input per residue: Residue-level Information Hydrophobicity, Bulkiness, Karplus-Schulz Flexibility Index, Atomical Composition, Side Chain Polarity Index, Charge at pH 7.0, Aromaticity, Van der Waals Radii, Number of Total Bonds Protein-level Information Sequence Length, Molecular Weight, Instability Index, Gravy Score, Secondary Structure (% α -Helices, % β -Sheets, % Coils), Molar Extinction Coefficient, Isoelectric Point, Charge at Different pH Points, Atomical Composition By combining the contextual richness of ESM2 with these specialized physicochemical and structural features, the final dataset ensures that our transformer-based model receives both local (residue-specific) and global (protein-level) contextual information. We divided this enriched dataset into training, validation, and test splits to reliably measure performance and guard against overfitting, thereby supporting robust PTM prediction and generalization to novel proteins. Data Details Binary Classification We included the data from dbPTM for 72 PTMs in a broad “PTM vs. no PTM” classification. Even underrepresented modifications can inform the binary model, enhancing sensitivity to rare but potentially important PTMs. The PTMs included in the model (but not limited to): Phosphorylation, Ubiquitination, Acetylation, N-linked Glycosylation, Succinylation, Methylation, O-linked Glycosylation, Malonylation, Sulfoxidation, S-palmitoylation, Sumoylation, S-nitrosylation, Glutathionylation, Amidation, Hydroxylation, Neddylation, Pyrrolidone-carboxylic-acid, Glutarylation, Oxidation, Gamma-carboxyglutamic-acid, Lactylation, Crotonylation, Myristoylation, Sulfation, Formylation, C-linked Glycosylation, and others. Multi-label Classification For the multi-label classification task, we restricted our dataset to PTMs with a higher number of annotated protein observations. In particular, we included only those PTMs with more than 100 protein annotations; of these, 14 had more than 1,000 protein observations. Consequently, the multi-label model training set comprised PTMs such as Phosphorylation, Ubiquitination, Acetylation, N-linked Glycosylation, Methylation, Succinylation, Malonylation, O-linked Glycosylation, S-palmitoylation, Sulfoxidation, and several others. View this table: View inline View popup Download powerpoint Table 1: Summary of the total number of proteins that are used in training, test, and validation, in each category for both binary and multi-label classifications, separated by number of residues. 5 Model Architecture AstraPTM’s architecture is a sequence-to-sequence Transformer that takes, as input, a batch of protein residues—each residue described by 1 , 351 features —and outputs logits indicating the likelihood of a PTM (for binary classification) or the likelihood of each possible PTM type (for multi-label classification). Download figure Open in new tab Figure 2: The histogram of the sequence length of the proteins that are included in the training of the model. Input Projection A linear layer projects the 1,351-dimensional input to a hidden dimension of 512, compressing raw features into a space compatible with the transformer encoder. Positional Encoding We add sine-cosine positional embeddings to each residue’s input vector, helping the model track ordering across the protein sequence. Transformer Encoder We stack four layers, each with: - 8 self-attention heads - A feedforward network (4 × hidden dim) - Layer normalization and dropout This encoder can capture both short- and long-range dependencies critical for PTM prediction. Final Linear Layer Maps the final hidden dimension (512) back to: - 1 output for binary classification (PTM vs. no PTM), - 25 outputs for multi-label classification (which PTM(s) at each residue). Download figure Open in new tab Figure 3: Diagram of the AstraPTM Transformer architecture for PTM prediction. 6 Results AstraPTM’s performance was evaluated in two key modes: binary classification (whether a residue is modified at all) and multi-label classification (which PTM types are present). These complementary approaches are especially relevant for structural biologists and bioinformaticians who often need both a high-level scan for potential PTM sites and a fine-grained view of specific modifications that might alter protein conformation or function. 6.1 Overall Performance We assessed AstraPTM using common metrics such as AUC-ROC, AUC-PR, MCC , Precision, Recall, F1 score, and Accuracy. Table 2 summarizes the high-level results for each classification task. View this table: View inline View popup Download powerpoint Table 2: High-level performance metrics for AstraPTM in binary and multi-label (25 PTMs) modes. 6.1.1 Binary Classification This model exhibits high accuracy and a strong balance between precision (≈ 86%) and recall (≈ 90%). It effectively identifies whether a residue is modified or not, providing reliable guidance for downstream experiments or protein structure determination. By correctly flagging the majority of modified residues, the binary model offers a reliable “first pass” screen, minimizing missed sites while maintaining good specificity. 6.1.2 Multi-label Classification The multi-label model demonstrates excellent performance overall when aggregating all PTM labels (e.g., Micro AUC-ROC of ≈ 99.57%). However, lower macro metrics (MCC of ≈38%, F1 of ≈49%) reflect variability across individual PTMs, particularly those with fewer training examples. Table 3 details the per-PTM results, highlighting that labels with sufficient data (e.g., Phosphorylation, N-linked Glycosylation) achieve strong metrics, whereas rare PTMs (e.g., Formylation, Neddylation, Lactylation) show high AUC-ROC but very low AUC-PR and MCC. These discrepancies underscore the impact of class imbalance in multi-label prediction. View this table: View inline View popup Download powerpoint Table 3: Per-PTM performance metrics for the multi-label model. Key Insights High-Confidence PTMs Phosphorylation, N-linked Glycosylation, Myristoylation, and several others exhibit strong metrics (AUC-PR, MCC), making them prime targets for experimental validation (e.g., mutagenesis, structural analysis). Moderate-Confidence PTMs Acetylation or Ubiquitination can be influenced by data imbalance in certain families; further structure-based checks (e.g., solvent exposure) can refine predictions. Rare/Underrepresented PTMs Formylation, Neddylation, and Lactylation suffer from low Precision/Recall or near-zero MCC, indicating limited positive examples. Users should treat these predictions with caution and prioritize experimental followup. Overall, the binary model reliably identifies PTM sites, while the multi-label model excels in detecting well-represented PTMs and provides potential leads for rare modifications. In practice, combining both models helps guide scientists on which sites to investigate further. 6.2 Practical Takeaways Binary + Multi-label Synergy The two-step approach helps structural labs focus on residues for biochemical assays and bioinformatics teams plan more targeted in silico analyses. Residues flagged as “modified” in the binary model but missing strong multi-label signals could represent novel or rare modifications. Downstream Experimental Value High recall means labs are less likely to miss crucial modification sites, crucial for large or multi-domain proteins. Focus on Strong PTMs If certain PTMs (phosphorylation, glycosylation) are especially relevant, AstraPTM’s high precision can accelerate site-directed mutagenesis or cryo-EM studies. Guiding Structure Validation Overlaying AstraPTM predictions on 3D models (e.g., AlphaFold, X-ray structures) can reveal if predicted sites are solvent-exposed or near active sites, informing functional hypotheses. Rare PTMs For modifications with sparse data, users may adopt additional validation strategies (mass spectrometry, literature cross-checks) to confirm or refute predicted sites. 7 Comparisons As there are no other binary PTM prediction models aiming to answer the generic question, “where do PTMs occur along the sequence, regardless of PTM type?”, we focus on multi-label comparisons with popular methods. AstraPTM addresses multiple PTMs simultaneously, whereas most other predictors traditionally focus on binary classification (one PTM at a time). We compare AstraPTM to MusiteDeep, ModPred, Sitetack, MIND-S , and a fine-tuned GPT-2 model ( PTMGPT2 ), covering both classical and recent deep-learning approaches. Scope of Comparison Sitetack, ModPred Separate binary classifiers per PTM, potentially more finetuned but burdensome to deploy collectively. MusiteDeep CNN-based deep learning for specific PTMs, uses sliding windows. MIND-S Incorporates structural information (AlphaFold/PDB) for multi-label PTM analysis, but can be slow or require ensembles. PTMGPT2 Uses a GPT-2 language model for multi-label PTM prediction but can be limited by sequence length constraints (~1024 tokens). Dataset and Model Nuances While the comparisons showcase how models perform against each other, there are certain points that needs to be considered for a fair evaluation: The models Sitetack and ModPred are a set of binary classification models, meaning for every PTM, there is another model that needs to be deployed. Not all metrics of all models are publicly available, hence, although some comparisons are possible, the comparisons don’t reflect the full picture. Not all models are trained and evaluated with the same dataset . Hence, the comparisons are not completely reflective. The size of the dataset or the limitations on the dataset, such as protein length, may lead to misleading results, as it may limit models’ performances to only certain types of proteins. Table 4 overviews each model’s dataset size, sequence length limits, PTM coverage, and classification type. Note that AstraPTM’s ability to handle longer proteins and unify multiple PTMs in a single model distinguishes it from older, window-based tools. View this table: View inline View popup Download powerpoint Table 4: Comparison of PTM Prediction Models Note : ‘None (sliding)” indicates a model processes protein sequences in smaller windows (e.g., 25 residues upstream/downstream), which can miss long-range contacts important for structural insight. AstraPTM forgoes fixed-window constraints. 7.1 PTM-Specific AUC-ROC and AUC-PR Tables 5 and 6 compare AstraPTM to MusiteDeep, ModPred, and Sitetack on AUC-ROC and AUC-PR for selected PTMs. Overall, AstraPTM matches or outperforms other methods in most common modifications (e.g., phosphorylation, glycosylation). View this table: View inline View popup Download powerpoint Table 5: PTM-specific AUC-ROC comparison for AstraPTM vs. MusiteDeep, ModPred, and Sitetack. View this table: View inline View popup Download powerpoint Table 6: PTM-specific AUC-PR comparison for AstraPTM vs. MusiteDeep, ModPred, and Sitetack. 7.2 Micro- and Macro-Averaged Performance Table 7 compares micro-average (instance-weighted) and macro-average (class-weighted) performance across all PTMs for AstraPTM, MusiteDeep, and MIND-S. View this table: View inline View popup Download powerpoint Table 7: Micro- and macro-average performance (AUC-ROC, AUC-PR, MCC, and F1). Micro-Average Superiority AstraPTM leads in AUC-ROC (99.57%) and AUCPR (85.29%), confirming strong performance on frequent PTMs. Macro-Average Considerations Lower macro-average AUC-PR and MCC indicate room for improvement on rare modifications. 7.3 Comparison with MIND-S Insights from the MIND-S vs. AstraPTM Comparison High AUC Across Most PTMs AstraPTM achieves high AUC values on almost all modifications, indicating robust overall discrimination despite occasional drops in AUPR or MCC. Trade-off in Precision and Recall MIND-S sometimes secures higher AUPR (e.g., hydroxylysine (K)), suggesting it can excel when focusing on fewer false positives. AstraPTM, on the other hand, may capture more true positives but also more false positives, leading to variable AUPR in specific cases. Variability in Rare PTMs Both methods struggle on certain rare or highly imbalanced modifications (e.g., O-linked glycosylation with a low AUPR in AstraPTM), underscoring the difficulty of modeling less-characterized PTMs. 7.4 Comparison with PTMGPT2 We also compare AstraPTM to PTMGPT2 in Table 9 , showing that AstraPTM often leads on major modifications like phosphorylation and glycosylation, while PTMGPT2 can perform better on certain low-frequency modifications. 7.5 Concluding Remarks High Accuracy for Major PTMs AstraPTM consistently achieves top-tier metrics for phosphorylation, glycosylation, and other high-frequency modifications. Single-Model Multi-Label Efficiency Users can streamline their pipelines without deploying separate binary models per PTM. Room for Improvement on Rare PTMs Like other methods, AstraPTM’s performance dips on heavily imbalanced or less-characterized modifications (e.g., lactylation). Scalability Advantage The transformer-based design accommodates full-length sequences, a key benefit for structural biologists studying large, multi-domain proteins. While AstraPTM’s macro-average metrics indicate the need for improvement in certain rare PTMs, its strong performance on major modifications confirms its value for proteome-wide annotation tasks. The additional comparison against MIND-S ( Table 8 ) further illustrates AstraPTM’s ability to maintain high AUC across diverse modifications, albeit with some trade-offs in precision-recall balance. View this table: View inline View popup Download powerpoint Table 8: PTM-specific performance comparison (AUPR, AUC, F1, and MCC) for MIND-S vs. AstraPTM. View this table: View inline View popup Download powerpoint Table 9: PTMGPT2 vs. AstraPTM on F1, MCC, Precision, and Recall for diverse PTMs. 8 Learnings We introduced over 80 changes to architecture and hyperparameters, transitioning from a simple MLP to a BiLSTM and finally to a Transformer. This progression helped capture increasingly complex sequence patterns and improved overall metrics (Precision, Recall, F1, AUC–ROC). Model-specific Optimizations MLPs Lowering the learning rate (from 10 ≈ 3 to 10 ≈ 4 ) and tuning negative sampling rates pushed F1 from around 70% up to 85–88%. However, MLPs struggled with long-range context, leading us to explore recurrent models. BiLSTMs Integrating sequence context (hidden sizes 256–512, 1–2 layers) further improved recall beyond 90%, with F1 typically reaching 88–90%. We employed focal loss ( α, γ ) and dropout (0.2–0.4) to maintain precision. Transformers Ultimately, multi-head self-attention (4–8 heads) enabled learning of longer-range dependencies. With proper tuning of dropout (0.1–0.2), positional encodings, and thresholding, F1 climbed near 90%, with AUC–ROC at 95–96%. Context-Awareness Improvements BiLSTMs and Transformers surpassed MLPs primarily due to better modeling of sequence context; the former uses recurrent hidden states, while the latter uses self-attention to connect distant residues. For Transformers, refining positional encodings was crucial to improve alignment in the sequence, boosting both precision and recall significantly. Hyperparameter Optimizations Threshold Tuning Small changes in the decision threshold (48–50%) often led to large swings in Precision and Recall, highlighting its importance for imbalanced data. Focal Loss and Sampling Varying negative sampling rates (1.0–3.0) and carefully setting focal loss ( α, γ ) helped reduce false negatives while avoiding too many false positives. Model Dimensions and Data Growth Increasing hidden dimensions (256 to 512, or higher for Transformers) generally elevated F1 by 1–2%, though it required more GPU memory and longer training time. Expanding our dataset from 40K to 94K examples consistently improved all architectures, provided the sequences were diverse and representative of real-world PTM patterns. Key Takeaways A well-tuned Transformer with focal loss, moderate dropout, and balanced negative sampling produced the best F1 (near 90%) and highest AUC–ROC (95–96%). Careful hyperparameter scheduling (e.g., warm-up, OneCycleLR) and threshold calibration were critical to maintaining stable convergence and robust final metrics. Addressing class imbalance with negative sampling and focal loss had a stronger impact than any single architectural change alone. Overall, the transition from MLPs to sequence-aware models (BiLSTMs, Transformers), combined with thorough hyperparameter optimization, was instrumental in achieving state-of-the-art performance for this PTM prediction task. 9 Next Steps Although AstraPTM demonstrates strong potential for identifying PTMs and suggesting likely modification sites, further work is required to ( 1 ) boost accuracy/performance for rare modifications and ( 2 ) improve usability for end users. Enhancing Model Accuracy and Performance Integrate AlphaFold Structure Predictions PTMs are heavily influenced by 3D conformation. Incorporating structural predictions could provide crucial spatial context. Increase Training Data dbPTM was used as the primary source. Supplementing with other databases or curated mass spectrometry datasets may help mitigate data imbalance for rarer PTMs. User-Friendly Interface A web-based or desktop application with interactive visualization can help researchers easily interpret PTM predictions across large protein structures. Footnotes aniruddh.goteti{at}orbion.life caglar.bozkurt{at}orbion.life References [1]. ↵ Zhang , Y. , & Wei , Y. ( 2021 ). Dynamics of post-translational modification inspires drug design in the kinase family . Journal of Medicinal Chemistry , 64 ( 10 ), 6813 – 6825 . doi: 10.1021/acs.jmedchem.1c01076 OpenUrl CrossRef [2]. ↵ Mínguez , P. , et al. ( 2014 ). PTMcode v2: a resource for functional associations of post-translational modifications within and between proteins . Nucleic Acids Research , 42 ( D1 ), D320 – D325 . doi: 10.1093/nar/gku1081 OpenUrl CrossRef PubMed [3]. ↵ Bludau , I. , et al. ( 2022 ). The structural context of posttranslational modifications at a proteome-wide scale . PLOS Biology , 20 ( 1 ), e3001636 . doi: 10.1371/journal.pbio.3001636 OpenUrl CrossRef PubMed [4]. ↵ Huang , C. , et al. ( 2015 ). PTM cross talk: interplay between post-translational modifications of protein in cell fate . Cell Biochemistry and Function , 33 ( 2 ), 84 – 93 . OpenUrl [5]. Gutierrez , C. S. , Kassim , A. A. , Gutierrez , B. D. , Raines , R. T. ( 2024 ). Sitetack: A deep learning model that improves PTM prediction by using known PTMs . Bioinformatics , 40 ( 11 ), in press. [6]. ↵ Chang , Y. H. , et al. ( 2018 ). Systematic investigation of protein post-translational modifications in prokaryotes . BMC Genomics , 19 , 841 . OpenUrl CrossRef PubMed [7]. ↵ Liu , Z. , et al. ( 2018 ). Deep learning in predicting protein post-translational modifications . Methods , 145 , 16 – 25 . OpenUrl CrossRef PubMed [8]. ↵ Li , F. , et al. ( 2014 ). A comprehensive review of current approaches for predicting PTM sites in proteins . Briefings in Bioinformatics , 15 ( 6 ), 913 – 929 . OpenUrl [9]. ↵ Wang , M. , et al. ( 2015 ). Targeting PTMs of proteins in cancer therapy . Oncotarget , 6 ( 22 ), 19543 – 19553 . OpenUrl [10]. ↵ Nickchi , A. , et al. ( 2015 ). PEIMAN: A platform for PTM enrichment, integration, and mapping on 3D protein structures . Nucleic Acids Research , 43 ( W1 ), W525 – W530 . OpenUrl [11]. ↵ Dewhurst , H. M. , Torres , M. P. ( 2017 ). Systematic analysis of PTM hotspots (SAPHire) to identify regulatory modules across the proteome . Scientific Reports , 7 ( 1 ), 40737 . OpenUrl CrossRef PubMed [12]. ↵ Shrestha , P. , Kandel , J. , Tayara , H. , et al. ( 2024 ). Post-translational modification prediction via prompt-based fine-tuning of a GPT-2 model . Nature Communications , 15 , 6699 . OpenUrl CrossRef PubMed [13]. ↵ Wang , D. , Liu , D. , Yuchi , J. , He , F. , Jiang , Y. , Cai , S. , Li , J. , Xu , D. ( 2020 ). MusiteDeep: a deep-learning based webserver for protein post-translational modification site prediction and visualization . Nucleic Acids Research , 48 ( W1 ), W140 – W146 . OpenUrl CrossRef PubMed [14]. ↵ Yan , Y. , Jiang , J. Y. , Fu , M. , Wang , D. , Pelletier , A. R. , Sigdel , D. , Ng , D. C. M. , Wang , W. , Ping , P. ( 2023 ). MIND-S is a deep-learning prediction model for elucidating protein post-translational modifications in human diseases . Cell Reports Methods , 3 ( 3 ), 100430 . OpenUrl CrossRef PubMed [15]. ↵ Gutierrez , C. S. , Kassim , A. A. , Gutierrez , B. D. , Raines , R. T. ( 2024 ). Sitetack: a deep learning model that improves PTM prediction by using known PTMs . Bioinformatics , 40 ( 11 ), btae602 . OpenUrl CrossRef PubMed [16]. ↵ Zhu , Y. , et al. ( 2022 ). Machine learning in post-translational modification site prediction: Current statuses and future opportunities . Computational and Structural Biotechnology Journal , 20 , 3140 – 3151 . OpenUrl CrossRef [17]. ↵ Zhao , X. , et al. ( 2015 ). jEcho: A user-friendly tool for PTM prediction and visualization . Bioinformatics , 31 ( 24 ), 4051 – 4053 . OpenUrl [18]. ↵ Li , G. , et al. ( 2024 ). AlphaFold-driven structural insights for PTM site prediction . Nucleic Acids Research , 52 ( 2 ), 1034 – 1043 . OpenUrl [19]. Zhong , Q. , Xiao , X. , Qiu , Y. , Xu , Z. , Chen , C. , Chong , B. , et al. ( 2023 ). Protein posttranslational modifications in health and diseases: Functions, regulatory mechanisms, and therapeutic implications . MedComm , 4 ( 3 ), e261 . OpenUrl CrossRef [20]. Huang , X. , Feng , Z. , Liu , D. , Gou , Y. , Chen , M. , Tang , D. , et al. ( 2025 ). PTMD 2.0: An updated database of disease-associated post-translational modifications . Nucleic Acids Research , 53 ( D1 ), D554 – D560 . OpenUrl CrossRef PubMed [21]. Qin , Z. , Ren , H. , Zhao , P. , Wang , K. , Liu , H. , Miao , C. , Du , Y. , et al. ( 2024 ). Current computational tools for protein lysine acylation site prediction . Briefings in Bioinformatics , 25 ( 6 ), bbae469 . OpenUrl CrossRef PubMed [22]. Guo , L. , Wang , Y. , Xu , X. , Cheng , K.-K. , Long , Y. , et al. ( 2021 ). DeepPSP: A global–local information-based deep neural network for the prediction of protein phosphorylation sites . Journal of Proteome Research , 20 ( 1 ), 346 – 356 . OpenUrl CrossRef PubMed [23]. Huang , Y.-X. , Liu , R. , et al. ( 2024 ). Improved prediction of post-translational modification crosstalk within proteins using DeepPCT . Bioinformatics , 40 ( 12 ), btae675 . OpenUrl CrossRef PubMed [24]. Jumper , J. , Evans , R. , Pritzel , A. , Green , T. , Figurnov , M. , Ronneberger , O. , et al. ( 2021 ). Highly accurate protein structure prediction with AlphaFold . Nature , 596 ( 7873 ), 583 – 589 . OpenUrl CrossRef PubMed [25]. Le , N.Q.K. ( 2023 ). Leveraging transformers-based language models in proteome bioinformatics . Proteomics , 23 ( 23–24 ), e2300011 . OpenUrl CrossRef PubMed [26]. Chandra , A. , Tuünnermann , L. , Lüofstedt , T. , Grüatz , R. ( 2023 ). Transformer-based deep learning for predicting protein properties in the life sciences . eLife , 12 , e82819 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted February 25, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following AstraPTM: Context-Aware PTM Prediction Model for Large-Scale Proteins Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share AstraPTM: Context-Aware PTM Prediction Model for Large-Scale Proteins Aniruddh Goteti , Çağlar Bozkurt bioRxiv 2025.02.20.639276; doi: https://doi.org/10.1101/2025.02.20.639276 Share This Article: Copy Citation Tools AstraPTM: Context-Aware PTM Prediction Model for Large-Scale Proteins Aniruddh Goteti , Çağlar Bozkurt bioRxiv 2025.02.20.639276; doi: https://doi.org/10.1101/2025.02.20.639276 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41937) Biophysics (21452) Cancer Biology (18589) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15156) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.