Full text
58,204 characters
· extracted from
preprint-html
· click to expand
AllerStack: Predicting Allergenic Proteins with a Stacked Ensemble Approach | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results AllerStack: Predicting Allergenic Proteins with a Stacked Ensemble Approach Harshika Sharma , Mansi Goel , Devkul Sahu , Mazhar Ayaz Sayed , Pratham Mangla , Pratham Shekhawat , Shubham Yadav , View ORCID Profile Ganesh Bagler doi: https://doi.org/10.1101/2025.06.22.660976 Harshika Sharma 1 Department of Computational Biology, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India 3 Infosys Centre for Artificial Intelligence, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India 4 Center of Excellence in Healthcare, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mansi Goel 1 Department of Computational Biology, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India 3 Infosys Centre for Artificial Intelligence, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India 4 Center of Excellence in Healthcare, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Devkul Sahu 2 Department of Computer Science and Engineering, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mazhar Ayaz Sayed 2 Department of Computer Science and Engineering, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Pratham Mangla 2 Department of Computer Science and Engineering, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Pratham Shekhawat 2 Department of Computer Science and Engineering, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Shubham Yadav 2 Department of Computer Science and Engineering, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ganesh Bagler 1 Department of Computational Biology, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India 3 Infosys Centre for Artificial Intelligence, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India 4 Center of Excellence in Healthcare, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) , Okhla Phase III, New Delhi 110020 India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ganesh Bagler For correspondence: bagler{at}iiitd.ac.in Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Accurate prediction of protein allergenicity is essential for ensuring food and drug safety. While machine learning and deep learning models have been explored for this task, limitations remain in dataset scale, feature representation, and model architecture. Here, we introduce AllerStack, a two-stage stacked ensemble model integrating handcrafted and ESM2-based learned features for allergenicity classification. The model was developed using a balanced dataset comprising 11,930 allergenic and 11,930 non-allergenic proteins. We extracted amino acid composition, dipeptide composition, and physicochemical features using the Biopython library, alongside contextual embeddings from the pre-trained ESM2 protein language model. Diverse classifiers (QDA, SVM, KNN, and ANN) were trained separately on these features in the base layer. Their predictions were used as input to a meta-classifier based on XGBoost. AllerStack achieved high predictive performance with 96.87% accuracy, 96.86% F1-score, 93.75% Matthews correlation coefficient (MCC), and an AUC of 0.99. A publicly accessible web server ( https://cosylab.iiitd.edu.in/allerstack/ ) enables real-time allergenicity prediction from protein sequences. AllerStack provides a robust, interpretable, and user-friendly platform for allergen detection in computational biology. Highlights An extensive dataset of 23,860 proteins. Combines handcrafted and ESM2-derived features. Stacked ensemble model with an accuracy of 96.87%. SHAP-based model interpretability at the model and feature level. Public web server, AllerStack ( https://cosylab.iiitd.edu.in/allerstack/ ). 1. Introduction Allergic diseases are a growing global health concern, affecting nearly 20% of the population and contributing to chronic conditions such as asthma, rhinitis, dermatitis, and respiratory distress [ 1 , 2 ]. These diseases result from aberrant immune responses to allergens, such as chemical substances, aeroallergens, or food proteins. While proteins are vital for nutrition and cellular function, specific proteins, such as those found in peanuts, shellfish, and gluten, can trigger severe allergic reactions in some individuals, leading to issues like anaphylaxis, respiratory distress, or gastrointestinal problems. Computational biology offers a powerful approach to addressing these challenges by enabling the prediction of allergenic proteins through data-driven modeling. By integrating machine learning with molecular biology and nutritional science, researchers can analyze large-scale protein datasets to identify patterns associated with allergenicity. These predictive models help uncover the underlying determinants of immune reactivity and support the identification of safer protein sources for food and therapeutic applications. The underlying immune mechanism involves the production of immunoglobulin E (IgE) antibodies, which sensitize mast cells to release inflammatory mediators like histamine and cytokines upon re-exposure [ 3 ]. The increasing prevalence and healthcare burden of allergic diseases underscore the urgent need for robust allergen screening tools [ 4 ]. Proteins with at least six consecutive matching amino acids or more than 35% sequence similarity to known allergens are considered potential allergens, according to the Food and Agriculture Organization. However, these threshold-based approaches often overlook structural and physicochemical determinants of allergenicity. Computational models have emerged to address these limitations by leveraging machine learning (ML) and deep learning (DL) for pattern recognition in biological sequences. Several methods have been proposed for allergenicity prediction. AlgPred 2.0 integrates BLAST and MERCI motif search, achieving an AUC of 0.98 [ 5 ]. With a similar spirit, AllerHunter employs SVM-based pairwise comparisons across 1,356 allergens to identify cross-reactivity [ 6 ]. AllerCatPro extends this by incorporating 3D epitope mapping with 84% accuracy [ 7 ]. Alignment-free models have also been developed to overcome limitations of sequence similarity. AllerTop applies the k-nearest neighbors (KNN) algorithm using ACC-transformed features from 2,210 allergens [ 8 ]. AllergenFP, another alignment-free tool, leverages a descriptor-based fingerprint approach to identify allergens and non-allergens [ 9 ]. Recent DL approaches have further enhanced predictive performance. ProAll-D employs LSTM on a 2,427-sequence dataset with an AUC of 91.5% [ 10 ]. In a similar spirit, BERT-based model by Wang et al. achieved an AUC of 95% using transformer and ensemble techniques on food allergens [ 11 ]. Other notable contributions include DeepAlgPro (accuracy: 91.24%) [ 12 ], ALLERDET (97.26%) [ 13 ], and hybrid frameworks like AllerHybrid, which combine ANN and SVM models using k-mer and physicochemical features, attaining 94.28% accuracy [ 14 ]. Researchers implemented a deep learning-based ensemble method that uses Extra Tree, Deep Belief Network, and CatBoost models to identify protein allergens [ 15 ]. Despite these advancements, challenges remain in scaling models to larger datasets, integrating interpretable features, and generalizing across diverse protein families. We present AllerStack, a stacked ensemble framework that fuses classical feature engineering and deep contextual protein embeddings to address these gaps. Our model is trained on a balanced and curated dataset of 11,930 allergenic and 11,930 non-allergenic proteins. Features include amino acid composition, dipeptide composition, physicochemical descriptors (via Biopython), and embeddings from the transformer-based ESM2 model. It integrates predictions with multiple base classifiers (QDA, SVM, KNN, ANN) and an XGBoost meta-classifier. The ensemble approach captures complex sequence-to-function relationships across multiple feature domains. Our best-performing configuration achieves an accuracy of 96.87% and an AUC of 0.99. Furthermore, a publicly accessible web server ( https://cosylab.iiitd.edu.in/allerstack/ ) supports end-user prediction from raw sequences. 2. Materials and Methods 2.1. Dataset Collection A comprehensive dataset of allergenic proteins was constructed by integrating multiple publicly available and validated sources. Allergenic protein sequences were obtained from COMPARE (2,784) [ 16 ], AllergenOnline (2,291) [ 17 ], ADFS (2,204) [ 18 ], IUIS (1,109) [ 19 ], ALLERMATCH (391) [ 20 ], ALLERBASE (1,737) [ 21 ], SDAP (1,887) [ 22 ], and Swiss-Prot (1,288), using the query “allergen AND reviewed: yes” [ 23 ]. 2.2. Data Preprocessing Sequences lacking UniProt identifiers or without literature validation were removed. Entries containing non-natural amino acids (B, O, J, U, X, Z) were excluded. After refinement, 11,930 allergenic sequences were retained, incorporating filtered entries from the databases above. Since proteins explicitly labeled as “non-allergen” are not available, we randomly selected 11,930 non-allergenic sequences from Swiss-Prot using the query “NOT allergenic AND reviewed: yes” [ 24 ], resulting in a balanced dataset of 23,860 sequences. 2.3. Redundancy Removal and Dataset Partitioning To minimize redundancy and avoid bias due to sequence similarity, clustering was performed using CD-HIT with a 90% sequence identity threshold [ 25 , 26 ]. Allergenic and non-allergenic sequences resulted in 2,158 and 4,829 clusters, respectively. These clusters were split into training and external validation sets using an 80:20 split. The training data is further divided into training and internal validation subsets. Stratified sampling ensured equal representation of allergenic and non-allergenic sequences in each split. 2.4. Feature Extraction Feature extraction transforms protein sequences into discrete data of a defined length. Three main types of features were extracted: composition-based, physicochemical-based, and deep contextual embeddings. 2.4.1 Composition-Based Features Amino acid composition (AAC) and dipeptide composition (DPC) were computed using Biopython v2.11 [ 27 ]. AAC generates a 20-dimensional vector representing residue frequency, while DPC produces a 400-dimensional vector by quantifying dipeptide frequencies. 2.4.2 Physicochemical-based Features Physicochemical descriptors were derived using Pseudo Amino Acid Composition (PAAC), considering composition and sequence [ 28 ]. Additionally, molecular weight, aromaticity, isoelectric point (pI), instability index, and secondary structure fractions (alpha-helices, beta-sheets, coils) were calculated via Biopython [ 27 ]. These attributes relate to protein stability, solubility, and immunogenic potential [ 29 – 31 ]. 2.4.3 ESM2 We extracted 1,280-dimensional embeddings using Meta AI’s pre-trained ESM (Evolutionary Scale Modeling 2) model [ 32 ]. The contextualized representations were obtained by averaging transformer-layer outputs across amino acid tokens, capturing high-level biological information [ 33 ]. 2.5. Feature Scaling and Selection All features were scaled to prevent variables with large magnitudes from dominating model performance. GradientBoosting-based feature selection was used to reduce dimensionality and select the most informative features [ 34 , 35 ]. This approach reduced the risk of overfitting and enhanced the model’s generalization ability. 2.6. Stacked Ensemble Model Framework We initially evaluated multiple machine learning models on Biopython features, including Quadratic Discriminant Analysis (QDA), Linear Discriminant Analysis (LDA), Extreme Gradient Boosting (XGBoost), Light Gradient Boosting Machine (LGBM), Adaptive Boosting (AdaBoost), Random Forest (RF), RidgeClassifierCV (RCV), and Extremely Randomized Tree (ExtraTree) with five-fold cross-validation. ESM2-derived features were evaluated using Extremely Randomized Tree (ExtraTree), Support Vector Classifier (SVC), K-nearest neighbors (KNN), RCV, LGBM, and deep learning models: Artificial Neural Network (ANN), Recurrent Neural Network (RNN), and Long Short Term Memory (LSTM). The most effective models were selected to form the final stacked ensemble pipeline. The final stacked model consists of four base classifiers: QDA, ANN, KNN, and SVC, followed by an XGBoost meta-classifier trained on their predictions [ 36 – 38 ]. The architecture of the stacked model is depicted in Figure 1 . Download figure Open in new tab Figure 1: Schematic representation of the AllerStack framework. The architecture includes four base classifiers (QDA, ANN, KNN, and SVC), whose outputs are used by a meta-classifier (XGBoost) for final allergenicity prediction. 2.7. Evaluation Metrics Model performance was assessed using standard classification metrics: Sensitivity (SENS), Specificity (SPEC), Precision (PREC), Accuracy (ACC), Matthews Correlation Coefficient (MCC), and F1 score, and Area under the Receiver Operating Characteristic curve (AUC-ROC) used in many previous studies [ 5 , 10 , 39 , 40 ]. These metrics capture both threshold-dependent and threshold-independent performances, enabling robust evaluation of binary classification under balanced and imbalanced conditions. 3. Results 3.1. Compositional Analysis We analyzed the amino acid composition of allergenic and non-allergenic proteins to identify residue-level trends associated with allergenicity. Our study showed that amino acids such as Aspartic Acid (D), Glutamic Acid (E), Isoleucine (I), Lysine (K), Leucine (L), and Glutamine (Q) were more prevalent in allergenic sequences, suggesting a potential role in eliciting immune responses. The enrichment of negatively charged residues like D and E may facilitate immunogenic interactions, thereby enhancing allergenic potential [ 41 ]. Sicherer and Sampson also reported that hydrophobic amino acids such as I and L can contribute to increased protein stability and persistence within the host, potentially prolonging immune system exposure[ 42 ]. Conversely, non-allergenic sequences exhibited higher frequencies of Cysteine (C), Phenylalanine (F), Glycine (G), Asparagine (N), Arginine (R), Serine (S), Threonine (T), and Tryptophan (W). Prior studies have shown that residues like C and N are commonly involved in glycosylation and disulfide bond formation, which influence protein folding and may reduce immunogenicity [ 43 ]. Additionally, amino acids such as R and T have been implicated in maintaining protein solubility and structural integrity, which may further reduce allergenic potential [ 44 ]. Supplementary Figure 1 illustrates the average amino acid composition differences between allergenic and non-allergenic proteins. 3.2. Physicochemical Analysis We evaluated a range of physicochemical properties of allergenic and non-allergenic proteins, including molecular weight, sequence length, aromaticity, isoelectric point (pI), instability index, secondary structure composition (alpha-helices, beta-sheets), acidity, basicity, hydrophobicity, hydrophilicity, flexibility, net charge at pH 7, and sequence complexity, as shown in Supplementary Figure 2. Allergenic proteins generally exhibit higher molecular weight and longer sequences, characteristics that may contribute to increased structural complexity and allergenic potential [ 45 ]. These proteins also display greater aromaticity, indicating enrichment of aromatic residues such as phenylalanine, tyrosine, and tryptophan, which are known to stabilize protein structures through hydrophobic interactions and π-π stacking [ 46 , 47 ]. The pI values of allergenic proteins are typically higher, potentially influencing their interactions with immune receptors and antibody binding [ 48 ]. Allergenic proteins also tend to have lower instability indices, suggesting enhanced structural stability and resistance to degradation [ 49 ]. In terms of secondary structure, allergenic proteins exhibit balanced distributions of alpha-helices and beta-sheets, which may contribute to overall structural integrity [ 50 ]. The distribution of acidic and basic residues influences solubility and immune recognition, and allergenic proteins show greater variability. Similarly, their hydrophobic character is elevated, which can promote aggregation and enhance immunogenic persistence [ 42 , 51 ]. Increased flexibility in allergenic proteins may facilitate conformational changes that expose epitopes to immune cells. Furthermore, a higher net charge at physiological pH and elevated sequence complexity may enhance immunogenic potential by increasing epitope diversity and immune engagement [ 44 , 52 ]. 3.3. Motif Analysis To explore conserved sequence patterns characteristic of allergenic and non-allergenic proteins, we performed motif discovery using STREME from the MEME Suite [ 53 ]. Motif analysis helps reveal biologically meaningful residue patterns that may be associated with allergenicity and immune recognition. These motifs, shown in Figure 2 , highlight the discriminative sequence patterns between allergenic and non-allergenic proteins. Download figure Open in new tab Figure 2: Allergenic and non-allergenic motif logos identified using STREME from the MEME suite. (a) Allergens: motif LLLLLLLAL (p = 6.3 × 10 -5 ) identified in 30.5%. (b) Non-allergens: motif GACGY (5.8 × 10 -4 ) found in 25.5%. In allergenic proteins, STREME identified a highly conserved 9-mer motif: LLLLLLLAL, present in 30.5% of sequences, with a statistically significant p-value of 6.3 × 10 -5 . This motif is dominated by hydrophobic residues, particularly Leucine (L) and Alanine (A), suggesting a possible role in maintaining protein stability or forming epitope-rich regions that facilitate immune recognition. The prevalence of Leucine-rich segments has been previously linked to structural rigidity and resistance to proteolysis traits commonly associated with allergenic proteins. In contrast, non-allergenic proteins were enriched for a distinct 5-mer motif: GACGY, occurring in 25.5% of sequences with a p-value of 5.8 × 10 -4 . This motif includes a mix of polar (Glycine, Cysteine) and acidic (Aspartic acid) residues, which may contribute to flexible and hydrophilic regions that are less likely to engage IgE-mediated immune responses. The presence of Cysteine (C) in this motif could also indicate a role in disulfide bond formation, potentially enhancing structural stability without eliciting allergenic responses. 3.4. Performance of Traditional Machine Learning Models We evaluated nine traditional machine learning (ML) classifiers: Quadratic Discriminant Analysis (QDA), Linear Discriminant Analysis (LDA), XGBoost, Light Gradient Boosting Machine (LGBM), AdaBoost, Random Forest (RF), RidgeClassifierCV (RCV), Extremely Randomized Trees (ExtraTree), and Ridge Regression to distinguish between allergenic and non-allergenic sequences. Each model was optimized through hyperparameter tuning, and their performance wa assessed on both test and validation datasets, as shown in Table 1 . Among these, QDA achieved the highest classification performance, with an accuracy of 90.83% on the test set and 91.26% on the validation set. Its F1 scores were 89.78% and 88.23%, respectively, indicating reliable prediction capabilities. View this table: View inline View popup Download powerpoint Table 1: Performance of traditional machine learning classifiers on Biopython-derived features across test and validation datasets. Metrics include sensitivity (SENS), specificity (SPEC), precision (PREC), accuracy (ACC), Matthews correlation coefficient (MCC), and F1 score. However, most ML models demonstrated limited performance overall, likely due to the heterogeneous nature of the input features. The variability in amino acid composition, physicochemical properties, and sequence length introduced complexity that conventional classifiers struggled to capture effectively. The high-dimensional and learned representations were then used to train a broader set of machine learning and deep learning models to improve allergenicity prediction performance. 3.5. Feature Selection Feature extraction is a critical step in developing machine learning-based classifiers for macromolecules, as it enables the transformation of protein sequences into fixed-length numerical representations suitable for computational analysis. In this study, we explored multiple feature generation techniques, including handcrafted features using the Biopython library, contextual embeddings from pre-trained models such as ESM2, ProtFlash, and ProtGPT-2, and binary vectors via one-hot encoding. The performance of these feature sets was evaluated using three classifiers with varying complexity: QDA, RidgeClassifierCV (RCV), and XGBoost, listed in Supplementary Table 1. On the validation dataset, Biopython-derived features achieved the highest accuracy with QDA (91.26%), followed by 82.87% and 84.95% using RCV and XGBoost, respectively. ESM2 embeddings yielded strong performance with RCV (91.31%) and XGBoost (90.27%), although QDA performed poorly on this representation. One-hot encoded features, by contrast, consistently underperformed, with all classifiers achieving accuracies below 63%. Given their superior performance, we selected Biopython features alongside ESM2-derived embeddings for the downstream model development. 3.6. Feature Optimization We employed dimensionality reduction and feature selection techniques to improve model performance and mitigate issues arising from high-dimensional and potentially redundant features. We used t-distributed stochastic neighbor embedding (t-SNE) to visualize the feature distribution in a two-dimensional space [ 37 , 54 ]. Figure 3a illustrates the original Biopython feature space, where allergenic and non-allergenic sequences exhibit considerable overlap. To enhance class separability, we applied feature selection using a GradientBoosting classifier, which identified the most informative features based on their contribution to classification. The refined Biopython feature set, reduced to 53 dimensions, exhibited improved separation in t-SNE visualization ( Figure 3b ). Download figure Open in new tab Figure 3: t-SNE visualization of Biopython and ESM2 features before and after feature selection. (a) Raw Biopython feature space (437 features); (b) Biopython features after selection (53 features); (c) Raw ESM2 embeddings (1,280 features); (d) ESM2 features after selection (133 features) using GradientBoosting-based feature importance. Similarly, we evaluated the 1,280-dimensional ESM2 embeddings ( Figure 3c ). Although these embeddings provided inherent separation between classes, their high dimensionality posed a risk of overfitting and increased computational complexity. GradientBoosting-based feature selection reduced the feature space to 133 dimensions, enhancing both interpretability and computational efficiency ( Figure 3d ). 3.7. Performance of ESM2 Features on Machine Learning and Deep Learning Models To evaluate the effectiveness of ESM2-derived embeddings, we implemented five machine learning (ML) models, including: Extremely Randomized Trees (ET), Support Vector Classifier (SVC), k-Nearest Neighbors (KNN), RidgeClassifierCV (RCV), and Light Gradient Boosting Machine (LGBM), alongside three deep learning (DL) architectures: Artificial Neural Network (ANN), Recurrent Neural Network (RNN), and Long Short-Term Memory (LSTM). Each model was fine-tuned for optimal performance. Among ML models, ET achieved the highest accuracy of 90.28% and an F1 score of 90.45% on the validation set. Among DL models, ANN performed best, achieving 91.71% accuracy and an F1 score of 91.42%. These results indicate that deep contextual embeddings from ESM2 can be effectively leveraged by both traditional and deep learning models for allergenicity classification, as depicted in Table 2 . View this table: View inline View popup Download powerpoint Table 2: Evaluation of machine learning and deep learning models on ESM2-derived features. Performance is reported on both test and validation datasets using standard classification metrics. 3.8. Performance of the Stacked Ensemble Framework We developed a stacked ensemble model, AllerStack, to further enhance predictive performance, which integrates diverse classifiers trained on heterogeneous features (Biopython-derived and ESM2 embeddings). Stacking combines multiple first-stage base classifiers, whose outputs are input features to a second-stage meta-classifier [ 36 , 38 , 55 ]. The base classifiers selected for stacking included QDA (91.26% accuracy, F1: 88.23%), ANN (91.71%, F1: 91.42%), and the top-performing ML models—ET (90.28%), SVC (90.21%), and KNN (90.02%). We evaluated seven stacked model configurations (SM1-SM7), combining different subsets of these classifiers. SM6, which included QDA, ANN, SVC, and KNN, achieved the highest performance with an accuracy of 96.87% and an F1 score of 96.86% on validation data, as shown in Table 3 . View this table: View inline View popup Download powerpoint Table 3: Performance of different stacked model configurations using combinations of base classifiers. The seven possible stacked models (SM) were: SM1: QDA, ANN, ET; SM2: QDA, ANN, SVC; SM3: QDA, ANN, KNN; SM4: QDA, ANN, ET, SVC; SM5: QDA, ANN, ET, KNN; SM6: QDA, ANN, SVC, KNN; and SM7: QDA, ANN, ET, SVC, KNN. We compared three options for the meta-classifier: logistic regression (LR), XGBoost, and CatBoost. While LR is widely used due to its interpretability [ 56 – 59 ], XGBoost achieved superior performance, likely due to its ability to model non-linear relationships and manage high-dimensional data efficiently. Supplementary Table 2 shows that XGBoost yielded the best results, with a validation accuracy of 96.87% and an F1 score of 96.86%. Figure 4 illustrates the ROC curves of the base models and the stacked ensemble model. The SM6 configuration achieved the highest area under the ROC curve (AUC = 0.99), indicating superior discrimination between allergenic and non-allergenic proteins. QDA showed the strongest individual performance among base models, followed by SVC and KNN. AllerStack thus demonstrated robust classification capability with high sensitivity (True Positive Rate =96.52%), outperforming all constituent models. Download figure Open in new tab Figure 4: Receiver Operating Characteristic (ROC) curves for base classifiers and the stacked model (SM6). The stacked model exhibits the highest AUC (0.99), indicating superior classification performance compared to individual models. 3.9. SHAP Analysis of the Stacked Ensemble Model SHAP (SHapley Additive exPlanations) is a model-agnostic interpretability method that quantifies the contribution of each feature to individual predictions, enabling transparency in complex machine learning models [ 60 – 62 ]. We applied SHAP analysis to interpret the meta-classifier (XGBoost) and its constituent base models (QDA, SVC, KNN, and ANN) within the AllerStack framework ( Figure 5 ). Download figure Open in new tab Figure 5: SHAP-based model interpretation of the base classifiers and meta model. (a) Contribution of each base model to the final prediction of XGBoost (meta-model); (b-e) Top 10 contributing features for each base model: (b) QDA (base-model); (c) ANN (base-model); (d) KNN (base-model); (e) SVC (base-model). Figure 5a highlights the relative contributions of base models to the final prediction. QDA and SVC display a broader range of SHAP values, indicating a more substantial and variable influence on the stacked model’s output compared to KNN and ANN, which was also supported by Figure 4 . Figure 5b - 5e depicts the top ten features influencing each base model. For QDA ( Figure 5b ), key features include PAAC_5, PAAC_9, aromaticity, alpha-helical content (Secondary_Structure_fraction_1), negatively charged residue E, positively charged residue K, and specific dipeptides such as NW, WC, CC, and GW. Notably, feature_440 appears across all three ML base models (SVC, KNN, ANN), while 515, 382, and 802 are shared between SVC and ANN. KNN and ANN commonly use features 989 and 1213. The limited overlap among top features suggests that each classifier captures distinct aspects of the sequence space, reinforcing the value of stacking diverse models to improve generalization. 3.10. Performance Comparison with Literature To contextualize the performance of AllerStack, we compared it against several state-of-the-art allergenicity prediction models. It integrates classical handcrafted features like AAC, DPC, physicochemical descriptors, and deep contextual embeddings from the pre-trained transformer-based ESM2 model. Unlike most prior models, which are trained on datasets ranging from 4,018 to 20,150 sequences [ 5 , 6 ]. AllerStack was trained and validated on a significantly larger, balanced dataset of 23,860 sequences. This enhanced dataset’s diversity and size contributed to its robust performance. The model architecture also uses a two-stage stacked ensemble, where predictions from optimized base classifiers serve as meta-classifiers’ features. As shown in Table 4 , AllerStack achieved the highest accuracy (96.87%), sensitivity (96.52%), and Matthews Correlation Coefficient (93.75%) among all models. These results reflect its ability to detect allergenic proteins while minimizing false positives and negatives. Additionally, the F1 score of 96.86% underscores the model’s balance between precision and recall, especially in imbalanced datasets. Overall, AllerStack demonstrates improved predictive power, scalability, and interpretability compared to existing allergenicity prediction methods. View this table: View inline View popup Download powerpoint Table 4: Performance comparison of AllerStack with existing allergenicity prediction. Metrics are based on published literature or reported values. 3.11. Web Server Interface We have created a user-friendly web server, AllerStack ( https://cosylab.iiitd.edu.in/allerstack/ ), based on the best stacked model, to predict allergenic proteins. The server enables the user to predict the allergenicity of proteins by providing a single protein sequence or by uploading a TXT, CSV, or FASTA format file as input. This server implements a stacked ensemble approach using AAC, DPC, physicochemical, and ESM2-derived advanced features. The web server uses a technology stack optimized for efficiency and scalability. The frontend was built with ReactJS, a JavaScript toolkit that allows for the building of dynamic and interactive user interfaces. HTML and CSS were used to structure and style web pages. Flask API endpoints, a lightweight Python web framework, were utilized on the backend. The database is stored in MongoDB, a NoSQL database that enables flexible and scalable data management. 4. Conclusions The accurate prediction of allergenic proteins is critical for preventing allergen exposure through food, pharmaceuticals, and consumer products. With allergies affecting approximately 10–30% of the global population, particularly in developed countries, there is an urgent need for robust computational tools to identify potential allergens before they reach the public domain [ 42 , 63 , 64 ]. Such predictive models are essential for regulatory compliance and developing hypoallergenic formulations in food and the therapeutic industries [ 45 ]. AllerStack is an ensemble-based allergenicity prediction framework trained on a comprehensive dataset of 23,860 allergenic and non-allergenic proteins. By integrating handcrafted features from Biopython (e.g., AAC, DPC, physicochemical properties) with contextual embeddings from the pre-trained ESM2 model, AllerStack effectively captures sequence-level, evolutionary, and structural information. Unlike conventional ML or DL models, it employs a stacked ensemble strategy that leverages the strengths of multiple base learners. This approach achieved a high discriminative performance with an AUC of 0.99, highlighting its reliability in distinguishing allergenic from non-allergenic proteins. Our findings contribute a scalable and interpretable framework for allergenicity prediction that can support applications in food safety, drug development, and cosmetic product design [ 65 , 66 ]. Future work will explore quantifying allergenic potential, integrating host-specific allergy variability, and expanding to multi-label or multi-class allergen types. AllerStack lays the groundwork for the next generation of predictive models in allergen science and bioinformatics [ 65 – 68 ]. CRediT Authorship Contribution Statement Harshika Sharma: Writing – original draft, Writing – review and editing, Project administration, Visualization, Methodology, Formal analysis. Mansi Goel: Writing – review and editing, Supervision, Software. Devkul Sahu: Data curation. Mazhar Ayaz Sayed: Data curation. Pratham Mangla: Software. Pratham Shekhawat: Software. Shubham Yadav: Software. Ganesh Bagler: Conceptualization, Investigation, Supervision, Project administration, Writing – review and editing. Funding Sources NA Data and Code Availability All the datasets and code are available on GitHub . Additional Information See Supplementary information. Acknowledgments GB thanks Infosys Centre for Artificial Intelligence, Center of Excellence in Healthcare, and Department of Computational Biology, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi) for the computational support. HS is a research scholar in the Complex Systems Laboratory and is thankful to the Department of Science and Technology (DST), Government of India, DST-INSPIRE, for providing the research fellowship. Footnotes https://github.com/cosylabiiit/AllerStack References [1]. ↵ B. Singh , A. Karnwal , A. Tripathi , A.K. Upadhyay , Food allergens and related computational biology approaches: a requisite for a healthy life , Bioinforma. Agric. High-Throughput Approaches ( 2021 ) 145 – 160 . [2]. ↵ A. Navines-Ferrer , E. Serrano-Candelas , G.-J. Molina-Molina , M. Martin , IgE-related chronic diseases and anti-IgE-based treatments , J. Immunol. Res . 2016 ( 2016 ) 8163803 . OpenUrl PubMed [3]. ↵ M. Masoli , D. Fabian , S. Holt , R. Beasley , G.I. for Asthma (GINA) Program , The global burden of asthma: executive summary of the GINA Dissemination Committee report , Allergy 59 ( 2004 ) 469 – 478 . OpenUrl CrossRef PubMed Web of Science [4]. ↵ B. Anibarro , F.J. Seoane , M. V Mugica , Involvement of hidden allergens in food allergic reactions , J. Investig. Allergol. Clin. Immunol . 17 ( 2007 ) 168 . OpenUrl PubMed [5]. ↵ N. Sharma , S. Patiyal , A. Dhall , A. Pande , C. Arora , G.P.S. Raghava , AlgPred 2.0: An improved method for predicting allergenic proteins and mapping of IgE epitopes , Brief. Bioinform . 22 ( 2021 ). doi: 10.1093/bib/bbaa294 . OpenUrl CrossRef PubMed [6]. ↵ H.C. Muh , J.C. Tong , M.T. Tammi , AllerHunter: a SVM-pairwise system for assessment of allergenicity and allergic cross-reactivity in proteins , PLoS One 4 ( 2009 ) e5861 . OpenUrl CrossRef PubMed [7]. ↵ S. Maurer-Stroh , N.L. Krutz , P.S. Kern , V. Gunalan , M.N. Nguyen , V. Limviphuvadh , F. Eisenhaber , G.F. Gerberick , AllerCatPro—prediction of protein allergenicity potential from the protein sequence , Bioinformatics 35 ( 2019 ) 3020 – 3027 . OpenUrl CrossRef PubMed [8]. ↵ I. Dimitrov , D.R. Flower , I. Doytchinova , AllerTOP-a server for in silico prediction of allergens , in: BMC Bioinformatics , 2013 : pp. 1 – 9 . [9]. ↵ I. Dimitrov , L. Naneva , I. Doytchinova , I. Bangov , AllergenFP: allergenicity prediction by descriptor fingerprints , Bioinformatics 30 ( 2014 ) 846 – 851 . OpenUrl CrossRef PubMed [10]. ↵ P.M. Shanthappa , R. Kumar , ProAll-D: protein allergen detection using long short term memory - a deep learning approach , ADMET DMPK 10 ( 2022 ) 231 – 240 . doi: 10.5599/admet.1335 . OpenUrl CrossRef [11]. ↵ L. Wang , D. Niu , X. Zhao , X. Wang , M. Hao , H. Che , A comparative analysis of novel deep learning and ensemble learning models to predict the allergenicity of food proteins , Foods 10 ( 2021 ) 809 . OpenUrl CrossRef PubMed [12]. ↵ C. He , X. Ye , Y. Yang , L. Hu , Y. Si , X. Zhao , L. Chen , Q. Fang , Y. Wei , F. Wu , others, DeepAlgPro: an interpretable deep neural network model for predicting allergenic proteins , Brief. Bioinform . 24 ( 2023 ) bbad246 . OpenUrl CrossRef PubMed [13]. ↵ F.M. Garcia-Moreno , M.A. Gutiérrez-Naranjo , ALLERDET: a novel web app for prediction of protein allergenicity , J. Biomed. Inform . 135 ( 2022 ) 104217 . OpenUrl CrossRef PubMed [14]. ↵ M.R. Islam , M.A. Kashem , L. Mia , AllerHybrid: A Hybrid System to Predict the Allergen Using K-mer and Physicochemical Properties , in: 2021 5th Int. Conf. Electr. Eng. Inf. Commun. Technol. , 2021 : pp. 1 – 6 . [15]. ↵ A. Kumar , P.S. Rana , A deep learning based ensemble approach for protein allergen classification , PeerJ Comput. Sci . 9 ( 2023 ) e1622 . OpenUrl CrossRef [16]. ↵ R. van Ree , D. Sapiter Ballerda , M.C. Berin , L. Beuf , A. Chang , G. Gadermaier , P.A. Guevera , K. Hoffmann-Sommergruber , E. Islamovic , L. Koski , J. Kough , G.S. Ladics , S. McClain , K.A. McKillop , S. Mitchell-Ryan , C.A. Narrod , L. Pereira Mouriès , S. Pettit , L.K. Poulsen , A. Silvanovich , P. Song , S.S. Teuber , C. Bowman , The COMPARE Database: A Public Resource for Allergen Identification, Adapted for Continuous Improvement, Front . Allergy 2 ( 2021 ) 700533 . doi: 10.3389/FALGY.2021.700533/BIBTEX . OpenUrl CrossRef [17]. ↵ R.E. Goodman , M. Ebisawa , F. Ferreira , H.A. Sampson , R. van Ree , S. Vieths , J.L. Baumert , B. Bohle , S. Lalithambika , J. Wise , S.L. Taylor , AllergenOnline: A peer-reviewed, curated allergen database to assess novel food proteins for potential cross-reactivity , Mol. Nutr. Food Res . 60 ( 2016 ) 1183 – 1198 . doi: 10.1002/mnfr.201500769 . OpenUrl CrossRef PubMed [18]. ↵ A. Mari , C. Rasi , P. Palazzo , E. Scala , Allergen databases: Current status and perspectives , Curr. Allergy Asthma Rep . 9 ( 2009 ) 376 – 383 . doi: 10.1007/s11882-009-0055-9 . OpenUrl CrossRef PubMed Web of Science [19]. ↵ S. Sudharson , T. Kalic , C. Hafner , H. Breiteneder , Newly defined allergens in the WHO/IUIS Allergen Nomenclature Database during 01/2019-03/2021, Allergy Eur . J. Allergy Clin. Immunol . 76 ( 2021 ) 3359 – 3373 . doi: 10.1111/all.15021 . OpenUrl CrossRef [20]. ↵ M.W.E.J. Fiers , G.A. Kleter , H. Nijland , A.A.C.M. Peijnenburg , J.P. Nap , R.C.H.J. van Ham , Allermatch TM , a webtool for the prediction of potential allergenicity according to current FAO/WHO Codex alimentarius guidelines , BMC Bioinformatics 5 ( 2004 ) 1 – 6 . doi: 10.1186/1471-2105-5-133/TABLES/2 . OpenUrl CrossRef PubMed Web of Science [21]. ↵ K. Kadam , R. Karbhal , V.K. Jayaraman , S. Sawant , U. Kulkarni-Kale , AllerBase: a comprehensive allergen knowledgebase , Database 2017 ( 2017 ) 1 – 12 . doi: 10.1093/DATABASE/BAX066 . OpenUrl CrossRef [22]. ↵ C.H. Schein , Identifying Similar Allergens and Potentially Cross-Reacting Areas Using Structural Database of Allergenic Proteins (SDAP) Tools and D-Graph , Methods Mol. Biol . 2717 ( 2024 ) 269 – 284 . doi: 10.1007/978-1-0716-3453-0_18 . OpenUrl CrossRef PubMed [23]. ↵ allergen in UniProtKB search (1286) | UniProt, (n.d.). https://www.uniprot.org/uniprotkb?query=allergen&facets=reviewed%3Atrue (accessed August 23, 2024). [24]. ↵ NOT allergen in UniProtKB search (570578) | UniProt , (n.d.). https://www.uniprot.org/uniprotkb?query=NOT+allergen&facets=reviewed%3Atrue (accessed August 23, 2024). [25]. ↵ M. De Angelis , G. Garruti , F. Minervini , L. Bonfrate , P. Portincasa , M. Gobbetti , The Food-gut Human Axis: The Effects of Diet on Gut Microbiota and Metabolome , Curr. Med. Chem . 26 ( 2017 ) 3567 – 3583 . doi: 10.2174/0929867324666170428103848 . OpenUrl CrossRef [26]. ↵ L. Fu , B. Niu , Z. Zhu , S. Wu , W. Li , CD-HIT: accelerated for clustering the next-generation sequencing data , Bioinformatics 28 ( 2012 ) 3150 – 3152 . doi: 10.1093/BIOINFORMATICS/BTS565 . OpenUrl CrossRef PubMed Web of Science [27]. ↵ P.J.A. Cock , T. Antao , J.T. Chang , B.A. Chapman , C.J. Cox , A. Dalke , I. Friedberg , T. Hamelryck , F. Kauff , B. Wilczynski , M.J.L. De Hoon , Biopython: freely available Python tools for computational molecular biology and bioinformatics , Bioinformatics 25 ( 2009 ) 1422 . doi: 10.1093/BIOINFORMATICS/BTP163 . OpenUrl CrossRef PubMed Web of Science [28]. ↵ P.K. Meher , T.K. Sahu , V. Saini , A.R. Rao , Predicting antimicrobial peptides with improved accuracy by incorporating the compositional, physico-chemical and structural features into Chou’s general PseAAC , Sci. Reports 2017 71 7 ( 2017 ) 1 – 12 . doi: 10.1038/srep42362 . OpenUrl CrossRef [29]. ↵ J.V. Kringelum , M. Nielsen , S.B. Padkjær , O. Lund , Structural analysis of B-cell epitopes in antibody:protein complexes , Mol. Immunol . 53 ( 2013 ) 24 – 34 . doi: 10.1016/J.MOLIMM.2012.06.001 . OpenUrl CrossRef PubMed [30]. E. Öztemiz Topcu , G. Gadermaier , To stay or not to stay intact as an allergen: the endolysosomal degradation assay used as tool to analyze protein immunogenicity and T cell epitopes, Front . Allergy 5 ( 2024 ) 1440360 . doi: 10.3389/FALGY.2024.1440360/BIBTEX . OpenUrl CrossRef [31]. ↵ K.M. Makwana , R. Mahalakshmi , Implications of aromatic–aromatic interactions: From protein structures to peptide models , Protein Sci . 24 ( 2015 ) 1920 – 1933 . doi: 10.1002/PRO.2814 . OpenUrl CrossRef PubMed [32]. ↵ A. Rives , J. Meier , T. Sercu , S. Goyal , Z. Lin , J. Liu , D. Guo , M. Ott , C.L. Zitnick , J. Ma , R. Fergus , Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences , Proc. Natl. Acad. Sci. U. S. A . 118 ( 2021 ) e2016239118 . doi: 10.1073/PNAS.2016239118/SUPPL_FILE/PNAS.2016239118.SAPP.PDF . OpenUrl Abstract / FREE Full Text [33]. ↵ Y. Zhang , X. Chen , B. Jin , S. Wang , S. Ji , W. Wang , J. Han , A Comprehensive Survey of Scientific Large Language Models and Their Applications in Scientific Discovery , ( 2024 ). https://arxiv.org/abs/2406.10833v1 (accessed August 23, 2024). [34]. ↵ D. Upadhyay , J. Manero , M. Zaman , S. Sampalli , Gradient Boosting Feature Selection with Machine Learning Classifiers for Intrusion Detection on Power Grids , IEEE Trans. Netw. Serv. Manag . 18 ( 2021 ) 1104 – 1116 . doi: 10.1109/TNSM.2020.3032618 . OpenUrl CrossRef [35]. ↵ Z. Xu , G. Huang , K.Q. Weinberger , A.X. Zheng , Gradient boosted feature selection , Proc. ACM SIGKDD Int. Conf. Knowl. Discov. Data Min . ( 2014 ) 522 – 531 . doi: 10.1145/2623330.2623635 . OpenUrl CrossRef [36]. ↵ A.I. Albu , M.I. Bocicor , G. Czibula , MM-StackEns: A new deep multimodal stacked generalization approach for protein–protein interaction prediction , Comput. Biol. Med . 153 ( 2023 ) 106526 . doi: 10.1016/J.COMPBIOMED.2022.106526 . OpenUrl CrossRef PubMed [37]. ↵ C. Lei , Z. Lu , M. Wang , M. Li , StackCPA: A stacking model for compound-protein binding affinity prediction based on pocket multi-scale features , Comput. Biol. Med . 164 ( 2023 ) 107131 . doi: 10.1016/J.COMPBIOMED.2023.107131 . OpenUrl CrossRef PubMed [38]. ↵ Y. Chen , L. Gao , T. Zhang , Stack-VTP: prediction of vesicle transport proteins based on stacked ensemble classifier and evolutionary information , BMC Bioinformatics 24 ( 2023 ) 1 – 18 . doi: 10.1186/S12859-023-05257-5/TABLES/7 . OpenUrl CrossRef PubMed [39]. ↵ M.N. Nguyen , N.L. Krutz , V. Limviphuvadh , A.L. Lopata , G.F. Gerberick , S. Maurer-Stroh , AllerCatPro 2.0: a web server for predicting protein allergenicity potential , Nucleic Acids Res . 50 ( 2022 ). doi: 10.1093/nar/gkac446 . OpenUrl CrossRef PubMed [40]. ↵ I. Dimitrov , I. Bangov , D.R. Flower , AllerTOP v. 2-a server for in silico prediction of allergens , (n.d.). doi: 10.1007/s00894-014-2278-5 . OpenUrl CrossRef PubMed [41]. ↵ C. Radauer , H. Breiteneder , Evolutionary biology of plant food allergens , J. Allergy Clin. Immunol . 120 ( 2007 ) 518 – 525 . doi: 10.1016/J.JACI.2007.07.024 . OpenUrl CrossRef PubMed Web of Science [42]. ↵ S.H. Sicherer , H.A. Sampson , Food allergy: Epidemiology, pathogenesis, diagnosis, and treatment , J. Allergy Clin. Immunol . 133 ( 2014 ) 291 – 307 .e5. doi: 10.1016/J.JACI.2013.11.020 . OpenUrl CrossRef PubMed Web of Science [43]. ↵ A.M. Chiriac , Y. Wang , R. Schrijvers , P.J. Bousquet , T. Mura , N. Molinari , P. Demoly , Designing Predictive Models for Beta-Lactam Allergy Using the Drug Allergy and Hypersensitivity Database , J. Allergy Clin. Immunol. Pract . 6 ( 2018 ) 139 – 148 .e2. doi: 10.1016/j.jaip.2017.04.045 . OpenUrl CrossRef PubMed [44]. ↵ H. Breiteneder , E.N.C. Mills , Molecular properties of food allergens , J. Allergy Clin. Immunol . 115 ( 2005 ) 14 – 23 . doi: 10.1016/J.JACI.2004.10.022 . OpenUrl CrossRef PubMed Web of Science [45]. ↵ H. Djeghim , I. Bellil , O. Benslama , S. Lekmine , E. Temim , H. Boufendi , I. Postigo , P. Sánchez , D. Khelifi , Effects of genetic diversity on the allergenicity of peanut (Arachis hypogaea) proteins: identification of the hypoallergenic accessions using BALB/c mice model and in silico analysis of Ara h 3 allergen cross-reactivity , J. Proteomics 306 ( 2024 ) 105264 . doi: 10.1016/J.JPROT.2024.105264 . OpenUrl CrossRef PubMed [46]. ↵ R. Sharma , A.K. Singh , V. Umashankar , Characterization of allergenic epitopes of Ory s1 protein from Oryza sativa and its homologs , Bioinformation 4 ( 2009 ) 12 . doi: 10.6026/97320630004012 . OpenUrl CrossRef PubMed [47]. ↵ W.J. Pichler , A. Beeler , M. Keller , M. Lerch , S. Posadas , D. Schmid , Z. Spanou , A. Zawodniak , B. Gerber , Pharmacological Interaction of Drugs with Immune Receptors: The p-i Concept , Allergol. Int . 55 ( 2006 ) 17 – 25 . doi: 10.2332/ALLERGOLINT.55.17 . OpenUrl CrossRef PubMed [48]. ↵ Q. Geng , Y. Zhang , M. Song , X. Zhou , Y. Tang , Z. Wu , H. Chen , Allergenicity of peanut allergens and its dependence on the structure , Compr. Rev. Food Sci. Food Saf . 22 ( 2023 ) 1058 – 1081 . doi: 10.1111/1541-4337.13101 . OpenUrl CrossRef PubMed [49]. ↵ J. Cui , L.Y. Han , H. Li , C.Y. Ung , Z.Q. Tang , C.J. Zheng , Z.W. Cao , Y.Z. Chen , Computer prediction of allergen proteins from sequence-derived protein structural and physicochemical properties , Mol. Immunol . 44 ( 2007 ) 514 – 520 . doi: 10.1016/J.MOLIMM.2006.02.010 . OpenUrl CrossRef PubMed Web of Science [50]. ↵ J. Pekar , D. Ret , E. Untersmayr , Stability of allergens , Mol. Immunol. 100 ( 2018 ) 14 – 20 . doi: 10.1016/J.MOLIMM.2018.03.017 . OpenUrl CrossRef PubMed [51]. ↵ A. Cianferoni , J.M. Spergel , Food Allergy: Review, Classification and Diagnosis , Allergol. Int. 58 ( 2009 ) 457 – 466 . doi: 10.2332/ALLERGOLINT.09-RAI-0138 . OpenUrl CrossRef PubMed [52]. ↵ K.D. Collins , G.W. Neilson , J.E. Enderby , Ions in water: Characterizing the forces that control chemical processes and biological structure , Biophys. Chem . 128 ( 2007 ) 95 – 104 . doi: 10.1016/J.BPC.2007.03.009 . OpenUrl CrossRef PubMed [53]. ↵ T.L. Bailey , STREME: accurate and versatile sequence motif discovery , Bioinformatics 37 ( 2021 ) 2834 – 2840 . doi: 10.1093/BIOINFORMATICS/BTAB203 . OpenUrl CrossRef PubMed [54]. ↵ A. Sharma , A. Lysenko , K.A. Boroevich , E. Vans , T. Tsunoda , DeepFeature: feature selection in nonimage data using convolutional neural network , Brief. Bioinform . 22 ( 2021 ) 1 – 12 . doi: 10.1093/BIB/BBAB297 . OpenUrl CrossRef PubMed [55]. ↵ C. Chen , Q. Zhang , B. Yu , Z. Yu , P.J. Lawrence , Q. Ma , Y. Zhang , Improving protein-protein interactions prediction accuracy using XGBoost feature selection and stacked ensemble classifier , Comput. Biol. Med . 123 ( 2020 ) 103899 . doi: 10.1016/J.COMPBIOMED.2020.103899 . OpenUrl CrossRef PubMed [56]. ↵ A. Bashir , M. Mohammed , F. Mussallum , M. Khalafallah Elbashir , A.B. Musa , A. Mussallum , SVM and Naïve Bayes Stacking Approach for Improving Gene Expression Data Classification Using Logistic Regression , Artic. Int. J. Adv. Soft Comput. Its Appl . 13 ( 2021 ). https://www.researchgate.net/publication/355960625 (accessed August 23, 2024). [57]. M. Samieinasab , S.A. Torabzadeh , A. Behnam , A. Aghsami , F. Jolai , Meta-Health Stack: A new approach for breast cancer prediction , Healthc. Anal . 2 ( 2022 ) 100010 . doi: 10.1016/J.HEALTH.2021.100010 . OpenUrl CrossRef [58]. P. Rajendra , S. Latifi , Prediction of diabetes using logistic regression and ensemble techniques , Comput. Methods Programs Biomed. Updat . 1 ( 2021 ) 100032 . doi: 10.1016/J.CMPBUP.2021.100032 . OpenUrl CrossRef [59]. ↵ J. Liu , X. Dong , H. Zhao , Y. Tian , Predictive Classifier for Cardiovascular Disease Based on Stacking Model Fusion , Process . 2022 , Vol. 10 , Page 749 10 (2022) 749. doi: 10.3390/PR10040749 . OpenUrl CrossRef [60]. ↵ H. Chen , S. Lundberg , S.I. Lee , Explaining Models by Propagating Shapley Values of Local Components, Stud . Comput. Intell . 914 ( 2021 ) 261 – 270 . doi: 10.1007/978-3-030-53352-6_24 . OpenUrl CrossRef [61]. H. Sahlaoui , E.A.A. Alaoui , A. Nayyar , S. Agoujil , M.M. Jaber , Predicting and Interpreting Student Performance Using Ensemble Models and Shapley Additive Explanations , IEEE Access 9 ( 2021 ) 152688 – 152703 . doi: 10.1109/ACCESS.2021.3124270 . OpenUrl CrossRef [62]. ↵ Y. Nohara , K. Matsumoto , H. Soejima , N. Nakashima , Explanation of machine learning models using shapley additive explanation and application for real data in hospital , Comput. Methods Programs Biomed . 214 ( 2022 ) 106584 . doi: 10.1016/J.CMPB.2021.106584 . OpenUrl CrossRef PubMed [63]. ↵ S. Prescott , K.J. Allen , Food allergy: Riding the second wave of the allergy epidemic , Pediatr. Allergy Immunol . 22 ( 2011 ) 155 – 160 . doi: 10.1111/J.1399-3038.2011.01145.X . OpenUrl CrossRef PubMed [64]. ↵ W. Loh , M.L.K. Tang , The Epidemiology of Food Allergy in the Global Context , Int. J. Environ. Res. Public Heal . 2018 , Vol. 15 , Page 2043 15 (2018) 2043. doi: 10.3390/IJERPH15092043 . OpenUrl CrossRef [65]. ↵ J.M. Larsen , C.H. Bang-Berthelsen , K. Qvortrup , A.I. Sancho , A.H. Hansen , K.I.H. Andersen , S.S.N. Thacker , T. Eiwegger , J. Upton , K.L. Bøgh , Production of allergen-specific immunotherapeutic agents for the treatment of food allergy , Crit. Rev. Biotechnol . 40 ( 2020 ) 881 – 894 . doi: 10.1080/07388551.2020.1772194 . OpenUrl CrossRef PubMed [66]. ↵ X. Pi , Y. Sun , J. Cheng , G. Fu , M. Guo , A review on polyphenols and their potential application to reduce food allergenicity , Crit. Rev. Food Sci. Nutr . 63 ( 2023 ) 10014 – 10031 . doi: 10.1080/10408398.2022.2078273 . OpenUrl CrossRef PubMed [67]. J.S. Kandula , V.P.K. Rayala , R. Pullapanthula , Chirality: An inescapable concept for the pharmaceutical, bio-pharmaceutical, food, and cosmetic industries , Sep. Sci. Plus 6 ( 2023 ) 2200131 . doi: 10.1002/SSCP.202200131 . OpenUrl CrossRef [68]. ↵ S. Gajbhiye , K. Pal , Toxic and Allergic Responses Caused by Secondary Metabolites Used in Cosmetic Formulations , Bioprospecting Nat. Sources Cosmeceuticals ( 2024 ) 73 – 104 . doi: 10.1039/9781837672288-00073 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted June 27, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following AllerStack: Predicting Allergenic Proteins with a Stacked Ensemble Approach Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share AllerStack: Predicting Allergenic Proteins with a Stacked Ensemble Approach Harshika Sharma , Mansi Goel , Devkul Sahu , Mazhar Ayaz Sayed , Pratham Mangla , Pratham Shekhawat , Shubham Yadav , Ganesh Bagler bioRxiv 2025.06.22.660976; doi: https://doi.org/10.1101/2025.06.22.660976 Share This Article: Copy Citation Tools AllerStack: Predicting Allergenic Proteins with a Stacked Ensemble Approach Harshika Sharma , Mansi Goel , Devkul Sahu , Mazhar Ayaz Sayed , Pratham Mangla , Pratham Shekhawat , Shubham Yadav , Ganesh Bagler bioRxiv 2025.06.22.660976; doi: https://doi.org/10.1101/2025.06.22.660976 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41911) Biophysics (21436) Cancer Biology (18578) Cell Biology (25482) Clinical Trials (138) Developmental Biology (13371) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15599) Genomics (22482) Immunology (17728) Microbiology (40363) Molecular Biology (17163) Neuroscience (88536) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.