iDeepLC: chemical structure information yields improved retention time prediction of peptides with unseen modifications

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 41,794 characters · extracted from preprint-html · click to expand
iDeepLC: chemical structure information yields improved retention time prediction of peptides with unseen modifications | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results iDeepLC: chemical structure information yields improved retention time prediction of peptides with unseen modifications View ORCID Profile Alireza Nameni , View ORCID Profile Arthur Declercq , View ORCID Profile Ralf Gabriels , View ORCID Profile Robbe Devreese , View ORCID Profile Sven Degroeve , View ORCID Profile Lennart Martens , View ORCID Profile Robbin Bouwmeester doi: https://doi.org/10.1101/2025.10.31.685771 Alireza Nameni 1 CompOmics, VIB Center for Medical 
Biotechnology , VIB, Ghent, Belgium 2 Department of Medical Protein Research, Faculty of Medicine and Health Sciences, Ghent University , Ghent, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alireza Nameni Arthur Declercq 1 CompOmics, VIB Center for Medical Biotechnology , VIB, Ghent, Belgium 2 Department of Medical Protein Research, Faculty of Medicine and Health Sciences, Ghent University , Ghent, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Arthur Declercq Ralf Gabriels 1 CompOmics, VIB Center for Medical Biotechnology , VIB, Ghent, Belgium 2 Department of Medical Protein Research, Faculty of Medicine and Health Sciences, Ghent University , Ghent, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ralf Gabriels Robbe Devreese 1 CompOmics, VIB Center for Medical Biotechnology , VIB, Ghent, Belgium 2 Department of Medical Protein Research, Faculty of Medicine and Health Sciences, Ghent University , Ghent, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Robbe Devreese Sven Degroeve 1 CompOmics, VIB Center for Medical Biotechnology , VIB, Ghent, Belgium 2 Department of Medical Protein Research, Faculty of Medicine and Health Sciences, Ghent University , Ghent, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sven Degroeve Lennart Martens 1 CompOmics, VIB Center for Medical Biotechnology , VIB, Ghent, Belgium 2 Department of Medical Protein Research, Faculty of Medicine and Health Sciences, Ghent University , Ghent, Belgium 3 BioOrganic Mass Spectrometry Laboratory (LSMBO), IPHC UMR 7178, University of Strasbourg, CNRS , Strasbourg 67000, France 4 Infrastructure Nationale de Protéomique 
ProFI–FR2048 , Strasbourg 67087, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lennart Martens For correspondence: lennart.martens{at}vib-ugent.be Robbin Bouwmeester 1 CompOmics, VIB Center for Medical Biotechnology , VIB, Ghent, Belgium 2 Department of Medical Protein Research, Faculty of Medicine and Health Sciences, Ghent University , Ghent, Belgium Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Robbin Bouwmeester Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Deep learning has notably advanced the field of liquid chromatography–mass spectrometry-based proteomics. Accurate prediction of peptide retention times significantly enhances our ability to match LC-MS data with the correct peptides and proteins, especially for DIA data. While numerous models predict peptide LC retention times with high accuracy, few can accurately predict the retention times of chemically modified peptides, particularly those with modifications not encountered during model training. In our previously developed DeepLC model, accurate predictions could be made for unseen modifications by leveraging the chemical composition of (modified) residues. Here, however, we present a further enhancement of this model based on chemical structural information. The resulting model, called iDeepLC, shows overall more accurate predictions, and better generalization performance for predicting the retention time of unseen modifications than DeepLC. iDeepLC is freely available as open-source software under the Apache2 license and can be found at https://github.com/CompOmics/iDeepLC . 1 INTRODUCTION In Liquid Chromatography-Mass Spectrometry (LC-MS) proteomics, retention time refers to the specific duration it takes for a peptide to traverse the chromatography column and reach the MS detector. 
This value serves as an additional dimension that separates peptides prior to analysis, and thus substantially reduces sample complexity in the acquired MS dimensions. Deep learning models have been built that can accurately predict this expected retention time for a given peptide sequence under various experimental conditions. These predictions enable the in-silico creation of spectral libraries for the analysis of data-independent acquisition 1 – 4 data, and enable the rescoring of peptide-spectrum-matches (PSMs) to increase identification sensitivity 5 – 9 . Furthermore, they are utilized to optimize experimental design 10 , to identify multiple peptides in chimeric fragmentation spectra 11 , to simulate LC-MS experiments 12 , and to provide for orthogonal validation of PSMs 13 . However, accurate predictions of the retention time of modified peptides remains challenging. Most current predictors use a binary encoding for a limited set of possible modifications, where the model only recognizes the type and location of a specific modification in the sequence. Examples of such models include architectures such as transformers (Pham et al.) 14 , linear residual convolutional neural networks (CNNs) (Chronologer) 15 , capsule CNNs (DeepRT) 16 , neural networks with long short-term memory (LSTM) layers (Guan et al. and AutoRT) 17 , 18 , combinations of an LSTM and a transformer (DeepPhospho) 19 , and encoder-decoder models with gated recurrent units (Prosit) 20 . Only a few models employ a more sophisticated encoding method that includes the atomic composition of modifications, allowing the model to generalize over the modifications in the training set. This approach facilitates the prediction of peptides carrying modifications not observed during training, as seen in models using neural networks with LSTM layers (AlphaPeptDeep and pDeep3) 21 , 22 , and our own branched CNN, DeepLC 23 . 
Here, we have explored the extension of these atomic count models by also providing structural information about the (modified) amino acid to the model. We hypothesized that this approach could further enhance the accuracy of retention time predictions for modified peptides and enable the differentiation of peptide isomers. Importantly, for small molecules, the issue of isomers has been addressed using Quantitative Structure-Selectivity Relationships (QSSR) and molecular descriptors 24 . Different sets of these molecular descriptor features have been used and compared in various machine learning models, with what is known as the MolLogP feature consistently emerging as the most discriminating 25 . This chemical descriptor represents the quantified propensity of a molecule to dissolve in non-polar solvents such as octanol, compared to polar solvents such as water 26 . Here we introduce iDeepLC short for ‘improved DeepLC’, a retention time predictor based on the DeepLC architecture, but that also incorporates structural and molecular information through the chemical descriptor MolLogP, in a branch neural network architecture with five paths. Our findings demonstrate that iDeepLC achieves superior predictive accuracy and generalization capabilities, making it a valuable tool for proteomics research. 2 METHODS 2.1 Model Architecture iDeepLC employs a multi-branch neural network architecture similar to DeepLC 23 , with an additional, fifth branch that encodes chemical structural information. As explained in more detail in section 2.3 , the first three branches consist of convolutional layers 27 that encode chemical properties, diamino chemical properties, and atomic counts. A fourth branch is a fully connected layer that encodes general peptide features. The fifth branch consists of convolutional layers for a one-hot encoding of amino acids. 
The outputs of these five branches are then flattened and concatenated into several connected layers, finally yielding a single numerical output. 2.2 Incorporation of Additional Chemical Descriptor To represent the structure of each, potentially modified, amino acid, we obtained their corresponding SMILES representation. This SMILES representation is then used to compute the MolLogP chemical structure descriptor with the RDKit library 28 version 2023.3.2. By incorporating MolLogP, iDeepLC gains insights into how atomic bonding patterns and individual atomic properties contribute to the hydrophobicity of amino acids. As a result, it can more effectively distinguish between various amino acids and their modifications, including those with identical atom compositions. 2.3 Input Encoding The input of iDeepLC consists of, potentially modified, peptide sequences, each represented as a matrix with dimensions of 41 rows for the features described below, and 60 columns with each column representing a (modified) amino acid in a peptide with a maximum length of 60 amino acids. These features are categorized into five groups: The chemical descriptor, MolLogP , is represented in the first row, encapsulating chemical characteristics of the peptides. Rows two to seven encode the atomic counts of six elements: Carbon (C), Hydrogen (H), Nitrogen (N), Oxygen (O), Phosphorus (P), and Sulfur (S) present in the (modified) amino acids. Diamino chemical descriptor and diamino atoms in rows eight and nine to fourteen, respectively. This branch sums the features of every two adjacent amino acids, enhancing the model’s ability to generalize across different peptide sequences. For peptides with an odd number of amino acids, the last column repeats the previous column’s features to ensure consistency without overlapping. Global features , which spans rows 15 to 21, provides supplementary information about the peptide sequence. 
It includes the normalized length of the peptide relative to the maximum defined length, the atom composition of the first four and the last four amino acids, and the total atom composition of the entire peptide. Rows 22 to 41 hold the one-hot encoding, which provides a clear, binary representation of amino acids within the sequence. This path is particularly important to distinguish between isoleucine and leucine as they are structural isomers 29 . To address the challenge of varying peptide lengths, peptides with fewer than 60 amino acids are padded to maintain uniformity of 60 columns. This facilitates the straightforward encoding of various (modified) peptides regardless of their length. Note that in instances where modifications are present, the chemical descriptor, diamino chemical descriptor, and the atomic composition are adjusted to reflect the modified amino acid. 2.4 Hyperparameter tuning The hyperparameters of the models were optimized on the HeLa HF 30 dataset. This optimization process was conducted with the WandB machine learning platform 31 . In total, 18 different hyperparameters were tuned; their values are available in Supplementary Table 1. The tuned hyperparameters include the learning rate, batch size, dropout rate, kernel sizes, the number of channels, and the number of layers for the four convolutional layers. 2.5 Datasets and evaluations We first evaluated the performance of iDeepLC on 20 proteomics datasets (Supplementary Table 2), ranging from 3,000 to 160,000 sequences. These datasets mainly contain unmodified peptides and common modifications such as oxidation on methionine and carbamidomethyl on cysteine. One exception is the ProteomeTools PTM 32 dataset, which includes synthetic peptides with various modifications. To ensure a fair and accurate comparison, we used the same training, testing, and validation sets that were used in the evaluation of DeepLC 23 . 
Next, we assessed iDeepLC on 14 modifications from the ProteomeTools PTM dataset, an experiment referred to as the 14PTMs experiment that was originally described in DeepLC. In this experiment we trained and optimized 14 iDeepLC models where each model only trained on peptides that did not contain a specific modification. These models were then evaluated on the remaining peptides, which all contained the modification excluded during training. We created two test sets from these remaining peptides to evaluate predictions: one where the excluded modification was encoded and one where it was ignored (not encoded) ( Figure 1 ). Download figure Open in new tab Figure 1. The process of generating datasets to evaluate the generalization of iDeepLC for peptides containing a specific peptide modification. In this example we filtered all peptides carrying a Nitro oxidation modification and trained the model on peptides without nitro oxidation and used those with the modification as a test set. In the next step the test set is duplicated, and all nitro oxidation modifications were removed (Nitro not Encoded) while in the other one we kept the peptides as they were (Nitro Encoded). In an additional evaluation that we call the glycine experiment , we utilized all 20 datasets from Supplementary Table 2. We trained 19 models for each dataset, where each model excluded all peptides that contain a specific amino acid during training and was tested on peptides containing that amino acid. The excluded amino acids from training were initially represented as themselves and subsequently as glycine in a duplicated test set. Any modifications on the amino acids were disregarded when encoding it as glycine. This evaluation method was originally designed and described in DeepLC and was aimed at evaluating the generalization capabilities of iDeepLC. This evaluation assesses the model’s ability to predict the retention times of peptides incorporating amino acids absent from the training set. 
Furthermore, for 17 out of the 20 datasets there are many more training examples compared to the 14PTMs evaluation, and this evaluation should thus reflect actual performance better. Finally, for the comparison with DeepLC, we focused on two specific datasets: DIA HF 4 , which consists of approximately 113,000 sequences, and HeLa DeepRT 33 , with around 3,400 sequences. For all evaluations three different metrics are used to assess performance: Mean Absolute Error (MAE), relative MAE, and the Pearson correlation. The relative MAE is used to make the comparison between different datasets possible. Equation (1) is used for the calculation of the relative MAE: $$\text{relative MAE} = \frac{\text{MAE}}{RT_{\text{last}} - RT_{\text{first}}} \times 100\% \qquad (1)$$ Here, the MAE is divided by the retention time difference between the first ($RT_{\text{first}}$) and last ($RT_{\text{last}}$) identified peptide in the respective dataset. 2.6 Training procedure To initialize all models’ parameters for learning, we utilized the Kaiming uniform initialization 34 method with a leaky ReLU 35 activation function and we initialized biases to zero. This initialization scheme was applied to both convolutional and fully connected layers within the models. All layers use ELU 36 as activation function except for fully connected layers that use ReLU 37 activation function. The training parameters are available in Supplementary Table 1. We trained the models for 1000 epochs for both the evaluation on all 20 datasets and the 14PTMs evaluation . For the glycine evaluation , we trained for either 300 epochs (when the training dataset contained more than 1500 peptides) or 1000 epochs (otherwise). This decision to run fewer epochs is based on early convergence which was achieved much sooner for the larger training set sizes. In the training of all models, we saved the best model based on the MAE of the validation set. The training was done on an NVIDIA GeForce RTX 3090, using Python version 3.10.11 and PyTorch version 2.2.1 with CUDA version 11.8. 
3 RESULTS In this section, we first compare the performance of iDeepLC to DeepLC on 20 different datasets, followed by an evaluation of iDeepLC’s ability to generalize for (modified) peptides. This evaluation serves to assess its capability to generalize and is divided into two parts: first, iDeepLC’s capability to accurately predict the retention time of peptides with a specific modification excluded from training of the model ( 14PTMs evaluation ), and second, the prediction for peptides containing an amino acid not used for training ( glycine evaluation ). It is expected that the addition of the MolLogP descriptor improves the ability of iDeepLC to generalize for modifications, especially for those unseen during training. 3.1 Prediction performance iDeepLC We compared the prediction performance of iDeepLC with DeepLC on a set of 20 LC-MS datasets (see Methods). Figure 2 shows that iDeepLC demonstrates on-par performance with DeepLC. The largest difference in the relative MAE is observed for the ProteomeTools PTM dataset 32 , where the relative MAE differs by more than 0.5%. The second biggest difference is for the Plasma 1h dataset, with a much smaller 0.28% relative MAE difference. It is important to emphasize that both iDeepLC and DeepLC were trained, validated, and evaluated using identical data splits across each dataset. Download figure Open in new tab Figure 2. Performance comparison of DeepLC (orange) and iDeepLC (blue) for 20 datasets with the MAE as metric (lower is better). 3.2 PTM evaluation The improved performance on the dataset consisting primarily of modified peptides (ProteomeTools PTM dataset) indicates that iDeepLC can generalize more effectively for modified peptides. Due to the additional chemical descriptor, iDeepLC should be able to understand the peptides’ physicochemical properties better compared to DeepLC, where modified peptides are only represented with their atomic composition. 
This improvement of iDeepLC to predict for modified peptides is further investigated in the 14PTMs evaluation . In this evaluation the ProteomeTools PTM 32 dataset is used to assess how well iDeepLC understands the modified peptides’ physicochemical properties, without explicitly training on observations for a specific amino acid modification. Figure 3 highlights two modifications, nitro oxidation and deamidation, from the 14PTMs evaluation . For these modifications, iDeepLC can predict the modification-induced retention time shift much better than DeepLC. For these two modifications, the MAEs for deamidation and nitro oxidation are improved by 58% (from 257 to 109 seconds) and 52% (from 567 to 272 seconds), respectively. For completeness, the scatter plots of all modifications are available in Supplementary Figure 1. Download figure Open in new tab Figure 3. Performance comparison of DeepLC and iDeepLC for models that were not trained on peptides containing nitro oxidation ( A ) and deamidation ( B ) but evaluated on their respective modifications. The blue dots and the red dots show the retention time predicted by iDeepLC and DeepLC respectively tested with encoded modifications, and the grey dots show the retention time on an iDeepLC model tested without encoding modifications. This trend of better performance for iDeepLC and DeepLC over the baseline continues for the remaining twelve modifications, where performance is either better or comparable ( Figure 4 ). Furthermore, these results show that the impact of encoding or not encoding for the methyl group modifications is minimal. In contrast, there is a large difference between encoding and not encoding for acetyl, succinyl, propionyl, crotonyl, malonyl, oxidation and deamidation. Also, these modifications are much more accurately predicted when encoded, for both prediction models. The sole exception is the phosphorylated peptides, which are more accurately predicted when not encoded compared to when encoded. 
However, the prediction error of iDeepLC is still smaller than DeepLC when encoding the phosphorylation. This improved prediction accuracy of iDeepLC over DeepLC is also noticeably clear for carbamidomethylation, deamidation, oxidation, and nitro oxidation. Download figure Open in new tab Figure 4. Each modification used for testing is shown on the horizontal axis with its corresponding error of predicted and observed retention time. Each modification has three boxes, when the evaluated modification was not encoded (grey) or encoded and predicted by DeepLC (red) or encoded and predicted by iDeepLC (blue). The relative differences between the not-encoded baseline and DeepLC or iDeepLC are used to further investigate performance differences between these models ( Figure 5 and Supplementary Table 3). In Figure 5 any modifications positioned above the diagonal line indicate a superior performance for iDeepLC. Any point that is below the diagonal means better performance for DeepLC. As observed before, iDeepLC outperforms DeepLC in predicting the retention times for phosphorylation, carbamidomethylation, deamidation, oxidation, and nitro oxidation modifications. For the remaining nine modifications the performance of iDeepLC is on par with DeepLC. Download figure Open in new tab Figure 5. iDeepLC and DeepLC percentage difference in MAE compared to the not encoded test set (baseline) for each modification in the 14PTMs evaluation. 3.3 Modified glycine evaluation In the glycine evaluation , the same methodology as the 14PTMs evaluation is followed, but instead of modifications, amino acids are excluded from training and tested on. Furthermore, instead of not encoding the amino acid, a baseline is created by replacing the specific amino acid with glycine. As expected, this glycine evaluation shows that encoding amino acids as themselves results in a lower MAE in most cases ( Figure 6 ). 
This improved performance can be seen for the individual amino acids in Figure 6 positioned above the diagonal line, where the distance to the diagonal line quantifies this improvement over the baseline where amino acids are encoded as glycine. This also means that amino acids below the diagonal line have the reverse conclusion, encoding the amino acid as glycine performs better. Notably, the largest improvements over the baseline are achieved for hydrophobic amino acids. Indeed, correctly encoding amino acids that have the largest contribution to a peptide’s hydrophobicity, and thus retention time in reversed phase chromatography, greatly impacts the prediction accuracy. Importantly, this large hydrophobic contribution of specific amino acids is very effectively captured in the iDeepLC model. iDeepLC does show slightly worse performance for amino acids like lysine (K), but this is likely due to its distinct polar properties and small training dataset size after excluding tryptic peptides. Download figure Open in new tab Figure 6. Glycine evaluation where each amino acid that was excluded is shown in a circle with its size determining the number of training peptides and its color shows the chemical property. An amino acid’s position indicates the MAE for all peptides containing that amino acid and it is either encoded as glycine (vertical axis) or as its own atomic composition (horizontal axis). Everything above the diagonal line is predicted with a higher accuracy when the amino acid is encoded as itself. Two datasets are used, A , DIA HF (bigger dataset), B , HeLa DeepRT (smaller dataset) We summarized the results of the glycine evaluation for 15 datasets with a reverse-phase stationary phase in Figure 7 . This figure shows the relative MAE for each amino acid in all reverse-phased datasets when encoded as themselves or when encoded as glycine. 
Among the 19 amino acids, encoding the amino acid as itself showed improvements for eleven amino acids (A, E, F, I, L, M, P, R, V, W, and Y). For three amino acids (C, H, and K), the results for encoding the amino acids were worse compared to the baseline. Finally, for the remaining five amino acids (D, N, Q, S, T) there is a minimal difference between encoding them as their respective amino acid and encoding them as glycine. Download figure Open in new tab Figure 7. Relative MAE for each amino acid across all fifteen reverse-phased datasets. The boxplots illustrate the comparison between encoding the amino acid as itself (blue) versus encoding it as glycine (gray). Instead of comparing iDeepLC to a baseline here, the model was also compared to DeepLC with the same glycine evaluation ( Figure 8 ). In the DIA HF dataset, iDeepLC has higher prediction accuracy in 12 out of 19 cases, while for the much smaller dataset, HeLa DeepRT a lower MAE was achieved in 14 out of 19 cases. Notably, in both datasets most of the hydrophobic amino acids such as tryptophan (W), are much better predicted by iDeepLC. Furthermore, for the cases where iDeepLC is outperformed by DeepLC, there is only a small gain in prediction accuracy. Download figure Open in new tab Figure 8. Glycine evaluation where each amino acid that was excluded is shown in a circle with its size determining the number of training peptides and its color shows the chemical property of that amino acid. These figures compare the error of predicted and actual retention time between DeepLC (vertical axis) and iDeepLC (horizontal axis). Everything above the diagonal line shows the better performance of iDeepLC compared with DeepLC. Two datasets are used, A , DIA HF (bigger dataset), B , HeLa DeepRT (smaller dataset) 4 CONCLUSION & DISCUSSION Our results show that adding the MolLogP descriptor enables iDeepLC to learn the physicochemical characteristics of modifications. 
This understanding means that their impact on the LC retention time is more accurately predicted. Notably, iDeepLC delivers state-of-the-art predictive accuracy for unmodified peptides, while also exhibiting superior performance when applied to modified peptides. This was highlighted in two evaluations where iDeepLC outperforms DeepLC for specific modifications and in predicting retention times for previously unobserved amino acids. The only drawback for iDeepLC is that it requires the structure of amino acids and their modifications to obtain the chemical descriptor. However, this is expected to be readily resolved in the future as these LC-MS behavior predictors are gaining traction and the value of incorporating structural information as input to these models is more apparent. 5 DATA AVAILABILITY All data used to train and evaluate iDeepLC are available on Zenodo at https://doi.org/10.5281/zenodo.15011301 and it contains the following projects: HeLa hf 30 , ProteomeTools 38 , SWATH library 39 , Plasma lumos 1h 40 , DIA HF 4 , HeLa lumos 2h 40 , Pancreas 41 , Xbridge 42 , ATLANTIS SILICA 42 , LUNA SILICA 42 , LUNA HILIC 42 , SCX 42 , Yeast 2h 43 , HeLa lumos 1h 40 , Yeast 1h 43 , Arabidopsis 44 , Yeast DeepRT 45 , ProteomeTools PTM 32 , Plasma lumos 2h 40 , and HeLa DeepRT 33 . 6 ACKNOWLEDGEMENTS A.N. acknowledges funding from the European Union’s Horizon 2020 research and innovation programme under the Marie Skłodowska-Curie grant agreement N° 956148. R.D., A.D., R.G., L.M. and R.B. acknowledge funding from the Research Foundation Flanders (FWO) [1SH9O24N, 12B7123N, 1SE3724N, G010023N, G028821N, 12A6L24N]. L.M. acknowledges funding from the Horizon Europe Projects BAXERNA 2.0 [101080544] and COMBINE [101191739], and from the Ghent University Concerted Research Action [BOF21/GOA/033]. L.M. is further supported by the CHIST-ERA project ODEEP-EU [G0GDV23N] and F.I. by Ghent University Starting Grant BOF/STA/202209/011. 
Footnotes https://doi.org/10.5281/zenodo.15011301 https://github.com/CompOmics/iDeepLC 7 REFERENCES 1. ↵ Michalski , A. , Cox , J. & Mann , M. More than 100,000 detectable peptide species elute in single shotgun proteomics runs but the majority is inaccessible to data-dependent LC-MS/MS . J Proteome Res 10 , 1785 – 1793 ( 2011 ). OpenUrl CrossRef PubMed Web of Science 2. Shishkova , E. , Hebert , A. S. & Coon , J. J. Now, More Than Ever, Proteomics Needs Better Chromatography . Cell Systems vol. 3 321 – 324 Preprint at doi: 10.1016/j.cels.2016.10.007 ( 2016 ). OpenUrl CrossRef PubMed 3. Demichev , V. , Messner , C. B. , Vernardis , S. I. , Lilley , K. S. & Ralser , M. DIA-NN: neural networks and interference correction enable deep proteome coverage in high throughput . Nat Methods 17 , 41 – 44 ( 2020 ). OpenUrl CrossRef PubMed 4. ↵ Bruderer , R. et al. Optimization of experimental parameters in data-independent mass spectrometry significantly increases depth and reproducibility of results . Molecular and Cellular Proteomics 16 , 2296 – 2309 ( 2017 ). OpenUrl 5. ↵ Zolg , D. P. et al. INFERYS rescoring: Boosting peptide identifications and scoring confidence of database search results . Rapid Communications in Mass Spectrometry ( 2021 ) doi: 10.1002/rcm.9128 . OpenUrl CrossRef 6. Silva , A. S. C. , Bouwmeester , R. , Martens , L. & Degroeve , S. Accurate peptide fragmentation predictions allow data driven approaches to replace and improve upon proteomics search engine scoring functions . Bioinformatics 35 , 5243 – 5248 ( 2019 ). OpenUrl CrossRef PubMed 7. Declercq , A. et al. MS2Rescore: Data-Driven Rescoring Dramatically Boosts Immunopeptide Identification Rates . Molecular and Cellular Proteomics 21 , ( 2022 ). 8. MacLean , B. et al. Skyline: An open source document editor for creating and analyzing targeted proteomics experiments . Bioinformatics 26 , 966 – 968 ( 2010 ). OpenUrl CrossRef PubMed Web of Science 9. ↵ Yang , K. L. et al. 
MSBooster: improving peptide identification rates using deep learning-based features . Nat Commun 14 , ( 2023 ). 10. ↵ Bertsch , A. et al. Optimal de novo design of MRM experiments for rapid assay development in targeted proteomics . J Proteome Res 9 , 2696 – 2704 ( 2010 ). OpenUrl CrossRef PubMed 11. ↵ Dorfer , V. , Maltsev , S. , Winkler , S. & Mechtler , K. CharmeRT: Boosting Peptide Identifications by Chimeric Spectra Identification and Retention Time Prediction . J Proteome Res 17 , 2581 – 2589 ( 2018 ). OpenUrl CrossRef PubMed 12. ↵ Kösters , M. , Leufken , J. & Leidel , S. A. SMITER—A python library for the simulation of LC-MS/MS experiments . Genes (Basel) 12 , ( 2021 ). 13. ↵ Gong , S. et al. The RNA landscape of the human placenta in health and disease . Nat Commun 12 , ( 2021 ). 14. ↵ Pham , T. V. et al. A transformer architecture for retention time prediction in liquid chromatography mass spectrometry-based proteomics . Proteomics 23 , ( 2023 ). 15. ↵ Wilburn , D. B. et al. Deep learning from harmonized peptide libraries enables retention time prediction of diverse post translational modifications . doi: 10.1101/2023.05.30.542978 . OpenUrl Abstract / FREE Full Text 16. ↵ Ma , C. et al. Improved Peptide Retention Time Prediction in Liquid Chromatography through Deep Learning . Anal Chem 90 , 10881 – 10888 ( 2018 ). OpenUrl CrossRef 17. ↵ Guan , S. , Moran , M. F. & Ma , B. Prediction of LC-MS/MS properties of peptides from sequence by deep learning . Molecular and Cellular Proteomics 18 , 2099 – 2107 ( 2019 ). OpenUrl 18. ↵ Wen , B. , Li , K. , Zhang , Y. & Zhang , B. Cancer neoantigen prioritization through sensitive and reliable proteogenomics analysis . Nat Commun 11 , ( 2020 ). 19. ↵ Lou , R. et al. DeepPhospho accelerates DIA phosphoproteome profiling through in silico library generation . Nat Commun 12 , ( 2021 ). 20. ↵ Gessulat , S. et al. Prosit: proteome-wide prediction of peptide tandem mass spectra by deep learning . 
Nat Methods 16 , 509 – 518 ( 2019 ). OpenUrl CrossRef PubMed 21. ↵ Zeng , W. F. et al. AlphaPeptDeep: a modular deep learning framework to predict peptide properties for proteomics . Nat Commun 13 , ( 2022 ). 22. ↵ Tarn , C. & Zeng , W. F. PDeep3: Toward More Accurate Spectrum Prediction with Fast Few-Shot Learning . Anal Chem 93 , 5815 – 5822 ( 2021 ). OpenUrl CrossRef 23. ↵ Bouwmeester , R. , Gabriels , R. , Hulstaert , N. , Martens , L. & Degroeve , S. DeepLC can predict retention times for peptides that carry as-yet unseen modifications . Nat Methods 18 , 1363 – 1369 ( 2021 ). OpenUrl CrossRef PubMed 24. ↵ Stefaniu , A. & Pintilie , L. Molecular Descriptors and Properties of Organic Molecules . in Symmetry (Group Theory) and Mathematical Treatment in Chemistry (InTech , 2018 ). doi: 10.5772/intechopen.72840 . OpenUrl CrossRef 25. ↵ Bouwmeester , R. , Martens , L. & Degroeve , S. Comprehensive and Empirical Evaluation of Machine Learning Algorithms for Small Molecule LC Retention Time Prediction . Anal Chem 91 , 3694 – 3703 ( 2019 ). OpenUrl CrossRef 26. ↵ Wildman , S. A. & Crippen , G. M. Prediction of physicochemical parameters by atomic contributions . J Chem Inf Comput Sci 39 , 868 – 873 ( 1999 ). OpenUrl CrossRef Web of Science 27. ↵ Fukushima , K. Neocognitron: A Hierarchical Neural Network Capable of Visual Pattern Recognition . vol. 1 ( 1988 ). 28. ↵ Landrum , G. RDKit: A Software Suite for Cheminformatics, Computational Chemistry, and Predictive Modeling . http://rdkit.sourceforge.net ( 2010 ). 29. ↵ J. M. R. Parker , D. G. and R. S. H. New Hydrophilicity Scale Derived from High-Performance Liquid Chromatography Peptide Retention Data: Correlation of Predicted Surface Residues with Antigenicity and x-Ray-Derived Accessible Sites. (Farellibooks , 1986 ). 30. ↵ Kelstrup , C. D. et al. Performance Evaluation of the Q Exactive HF-X for Shotgun Proteomics . J Proteome Res 17 , 727 – 738 ( 2018 ). OpenUrl CrossRef PubMed 31. ↵ Biewald , L. 
Experiment Tracking with Weights and Biases . https://www.wandb.com/ ( 2020 ). 32. ↵ Paul Zolg , D. et al. Proteometools: Systematic characterization of 21 post-translational protein modifications by liquid chromatography tandem mass spectrometry (lc-ms/ms) using synthetic peptides . Molecular and Cellular Proteomics 17 , 1850 – 1863 ( 2018 ). OpenUrl 33. ↵ Sharma , K. et al. Ultradeep Human Phosphoproteome Reveals a Distinct Regulatory Nature of Tyr and Ser/Thr-Based Signaling . Cell Rep 8 , 1583 – 1594 ( 2014 ). OpenUrl CrossRef PubMed Web of Science 34. ↵ He , K. , Zhang , X. , Ren , S. & Sun , J. Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification . ( 2015 ). 35. ↵ Nair , V. & Hinton , G. E. Rectified Linear Units Improve Restricted Boltzmann Machines . 36. ↵ Clevert , D.-A. , Unterthiner , T. & Hochreiter , S. Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) . ( 2015 ). 37. ↵ Agarap , A. F. Deep Learning using Rectified Linear Units (ReLU) . ( 2018 ). 38. ↵ Zolg , D. P. et al. Building ProteomeTools based on a complete synthetic human proteome . Nat Methods 14 , 259 – 262 ( 2017 ). OpenUrl CrossRef PubMed 39. ↵ Rosenberger , G. et al. A repository of assays to quantify 10,000 human proteins by SWATH-MS . Sci Data 1 , ( 2014 ). 40. ↵ Li , W. et al. Assessing the Relationship Between Mass Window Width and Retention Time Scheduling on Protein Coverage for Data-Independent Acquisition . J Am Soc Mass Spectrom 30 , 1396 – 1405 ( 2019 ). OpenUrl CrossRef PubMed 41. ↵ Wang , D. et al. A deep proteome and transcriptome abundance atlas of 29 healthy human tissues . Mol Syst Biol 15 , ( 2019 ). 42. ↵ Gussakovsky , D. , Neustaeter , H. , Spicer , V. & Krokhin , O. V. Sequence-Specific Model for Peptide Retention Time Prediction in Strong Cation Exchange Chromatography . Anal Chem 89 , 11795 – 11802 ( 2017 ). OpenUrl 43. ↵ Jarnuczak , A. F. et al. 
Analysis of Intrinsic Peptide Detectability via Integrated Label-Free and SRM-Based Absolute Quantitative Proteomics . J Proteome Res 15 , 2945 – 2959 ( 2016 ). OpenUrl CrossRef PubMed 44. ↵ Mucha , S. et al. The Formation of a Camalexin Biosynthetic Metabolon . Plant Cell 31 , 2697 – 2710 ( 2019 ). OpenUrl Abstract / FREE Full Text 45. ↵ Nagaraj , N. et al. System-wide Perturbation Analysis with Nearly Complete Coverage of the Yeast Proteome by Single-shot Ultra HPLC Runs on a Bench Top Orbitrap . Molecular & Cellular Proteomics 11 , M111.013722 ( 2012 ). View the discussion thread. Back to top Previous Next Posted November 02, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following iDeepLC: chemical structure information yields improved retention time prediction of peptides with unseen modifications Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share iDeepLC: chemical structure information yields improved retention time prediction of peptides with unseen modifications Alireza Nameni , Arthur Declercq , Ralf Gabriels , Robbe Devreese , Sven Degroeve , Lennart Martens , Robbin Bouwmeester bioRxiv 2025.10.31.685771; doi: https://doi.org/10.1101/2025.10.31.685771 Share This Article: Copy Citation Tools iDeepLC: chemical structure information yields improved retention time prediction of peptides with unseen modifications Alireza Nameni , Arthur Declercq , Ralf Gabriels , Robbe Devreese , Sven Degroeve , Lennart Martens , Robbin Bouwmeester bioRxiv 2025.10.31.685771; doi: https://doi.org/10.1101/2025.10.31.685771 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41910) Biophysics (21436) Cancer Biology (18576) Cell Biology (25480) Clinical Trials (138) Developmental Biology (13368) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40360) Molecular Biology (17163) Neuroscience (88534) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00