Full text
70,526 characters
· extracted from
preprint-html
· click to expand
Zero-Shot Protein–Ligand Binding Site Prediction from Protein Sequence and SMILES | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Zero-Shot Protein–Ligand Binding Site Prediction from Protein Sequence and SMILES View ORCID Profile Mahdi Pourmirzaei , View ORCID Profile Salhuldin Alqarghuli , Kai Chen , Mohammadreza Pourmirzaei , View ORCID Profile Dong Xu doi: https://doi.org/10.1101/2025.09.28.679103 Mahdi Pourmirzaei 1 University of Missouri 2 ProGene Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mahdi Pourmirzaei Salhuldin Alqarghuli 1 University of Missouri Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Salhuldin Alqarghuli Kai Chen 1 University of Missouri Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mohammadreza Pourmirzaei 3 Politecnico di Milano , Milan, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dong Xu 1 University of Missouri Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Dong Xu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Accurate identification of protein–ligand binding sites is critical for mechanistic biology and drug discovery, yet performance varies widely across ligand families and data regimes. We present a systematic prediction and evaluation framework that stratifies ligands into three settings, overrepresented (many examples), under-represented (tens of examples; few-shot), and zero-shot (unseen at training). We developed a novel three-stage, sequence-based modeling suite that progressively adds ligand conditioning and zero-shot capability, and used an evaluation frame-work to assess the suite. Stage 1 trains per-ligand predictors using a pretrained protein language model (PLM). Stage 2 introduces ligand-aware conditioning via an embedding table, enabling a single multi-ligand model. Stage 3 replaces the table with a pretrained chemical language model (CLM) operating on SMILES, enabling zero-shot generalization. We show Stage 2 improves Macro F 1 on the overrepresented test set from 0.4769 (Stage 1) to 0.5832 and outperforms sequence- and structure-based baselines. Stage 3 attains zero-shot performance ( F 1 = 0.3109) on 5 612 previously unseen ligands while remaining competitive on represented ligands. Ablations across five PLM scales and multiple CLMs reveal larger PLM backbones consistently increase Macro F 1 across all regimes, whereas scaling the CLM yields modest or inconsistent gains, which need further investigation. Our results demonstrate that zero-shot residue-level prediction from sequence and SMILES is feasible and identifies the PLM scale as the dominant lever for further advances. The code is fully open source at GitHub . 1. Introduction Reliable identification of protein-ligand binding sites plays a critical role in understanding fundamental biological processes such as gene expression regulation, signal transduction pathways, and antigen–antibody interactions [ 1 – 3 ]. Additionally, precise determination of protein-ligand interaction sites is fundamental for effective drug discovery and rational therapeutic development [ 4 ]. However, pinpointing ligand-binding sites accurately remains challenging, particularly when high-resolution structural data for proteins are unavailable [ 5 ]. Although experimental techniques like nuclear magnetic resonance and absorption spectroscopy provide high-quality data, they are expensive and labor-intensive, underscoring the critical need for computational approaches capable of rapid, high-throughput binding site predictions [ 6 ]. Despite notable advancements in computational prediction methodologies for protein-ligand interactions, several limitations persist. Many current computational tools lack versatility, restricting their effectiveness to specific ligand categories [ 1 , 7 – 9 ]. This constraint is particularly challenging given the vast chemical diversity of ligands encountered biologically. Furthermore, data scarcity significantly impacts model performance, particularly affecting ligand types with limited available training examples, which hampers effective generalization [ 10 – 12 ]. To assess the effects of data scarcity on protein-ligand binding site prediction, we systematically establish a prediction and evaluation framework comprising three distinct test scenarios: overrep-resented, underrepresented (few-shot), and zero-shot ligand cases. Initially, we develop a baseline model designed to predict binding sites for a single ligand type. Subsequently, we enhance this foun-dational architecture to accommodate multiple ligands within a single model and further expand it to support zero-shot ligand predictions. Our approach not only achieves state-of-the-art performance but also quantifies how prediction performance shifts across these three evaluation scenarios. Our work introduces several novel contributions, which are summarized as follows: A homology-controlled, three-regime benchmark . We establish a standardized evaluation with ligand-wise splits for overrepresented, underrepresented , and zero-shot ligands, enabling directly comparable assessments across methods. A progressive, sequence-based modeling suite . We introduce a three-stage framework: (i) per-ligand baselines, (ii) a single multi-ligand model conditioned by a learned ligand embedding, and (iii) a zero-shot model that conditions on SMILES via a pretrained chemical language model (CLM). Scaling analysis across PLMs and CLMs . Through controlled ablations, we quantify that enlarging the protein encoder (ESM-2) consistently dominates gains across all regimes, while CLM choice/scale yields smaller or inconsistent improvements. 2 Related Work State-of-the-art sequence-based predictors of protein-ligand binding sites increasingly center on pretrained protein language models (PLMs). PLMs address two long-standing challenges: (i) modeling long-range dependencies via transformers and (ii) improving sample efficiency through large-scale pretraining. We group recent PLM-driven approaches by how they adapt pretrained representations to binding-site prediction. A common line of work keeps the PLM frozen (or lightly tuned) and adds lightweight classifiers. BindEmbed21 [ 13 ] feeds ProtT5 embeddings into a shallow CNN and combines them with homology-based inference, using class-weighted losses to counter label imbalance. CLAPE-SMB [ 9 ] freezes ESM-2 and employs a five-layer MLP trained with class-balanced focal loss plus a tripletcenter term to better separate binding from non-binding sites. For metal ions, LMetalSite [ 6 ] regularizes PLM embeddings with Gaussian noise, shares transformer layers across tasks, and uses ion-specific MLP heads within a multi-task framework to capture cross-ion commonalities while remaining ligand-aware. IonPred [ 14 ] instead adopts a generator–discriminator setup, a masked language model (MLM)-style generator with an ELECTRA-like discriminator to learn residue-substitution regularities, which improves overall data efficiency by reducing the need for labeled data during fine-tuning. We developed Prot2Token [ 15 ] using a sequence-only design in which a pretrained ESM-2 encoder provides context to a causal decoder via cross-attention, framing binding-site prediction as next-token generation over sorted residue indices. Each ligand type is represented by a learned task token given to the decoder, and the model is trained multi-task over 41 ligand classes. Because conditioning is done over a closed vocabulary, the approach is effective for a fixed set of ligands but does not generalize zero-shot to unseen ligands without adding new tokens and retraining. LaMPSite [ 16 ] augments ESM-2 sequence embeddings with an estimated contact map and pairs them with a graph neural network (GNN) over the ligand’s 2D molecular graph (plus a fast conformer-derived distance map). A residue–atom interaction module integrates protein contacts and ligand distances to refine residue-level binding likelihoods, yielding a sequence-anchored yet ligand-aware predictor. Across PLM-based approaches, common tactics for class imbalance include weighted losses [ 13 ], class-balanced focal objectives with metric-learning terms [ 9 ], multi-task sharing [ 6 ], and decoder pretraining for stability in few-shot regimes [ 15 ]. Conditioning on ligand identity via task tokens [ 15 ] or explicit ligand graphs [ 16 ] further improves specificity over purely ligand-agnostic sequence models. We de-emphasize earlier families—traditional ML, non-pretrained deep learning, and template-based methods—because they either train from scratch on limited data (hurting sample efficiency and long-range modeling) or depend on close structural templates (limiting coverage and ligand specificity). Brief summaries of such methods appear in Appendix A. 3 Method We systematically evaluated the impact of ligand representation, specifically overrepresented, underrepresented , and zero-shot ligands, on protein-ligand binding site prediction. We first established an evaluation framework tailored to these categories, then introduced a three-stage modeling frame-work that progressively enhances a baseline transformer architecture. Stage 1 focuses on single-ligand prediction; subsequent stages extend to multiple ligands and enable zero-shot prediction. This structure supports a systematic comparison of architectural modifications across diverse ligand contexts. 3.1 Data Preparation and Evaluation Setup We utilized BioLiP2 [ 17 ] as our primary dataset, a comprehensive database of biologically relevant protein-ligand binding interactions sourced from the Protein Data Bank (PDB). After applying quality filters and data preprocessing steps detailed in Appendix B, we obtained 5 780 ligands and 41 327 protein sequences. To systematically evaluate model performance across different data availability scenarios, we categorized ligands into three distinct groups based on the number of associated protein-ligand pairs ( Figure 1 and Table 1 ). View this table: View inline View popup Download powerpoint Table 1: Dataset split statistics across overrepresented, underrepresented , and zero-shot ligand categories. Numbers marked with † indicate values after compatibility-based ligand exclusions explained in Appendix B. Download figure Open in new tab Figure 1. Evaluation strategies showing the distribution of ligands across three categories based on the number of associated protein sequences. Numbers (in the log 10 scale) reflect the original dataset before compatibility-based exclusions. Overrepresented ligands (45 ligands) have more than 100 samples and represent well-studied binding interactions. These were split using standard train-validation-test ratios, with higher training proportions for ligands with abundant data. Underrepresented ligands (122 ligands) have 20-99 samples, representing moderately studied interactions. These were split using balanced ratios (34%-33%-33%) to ensure adequate representation across all sets. Zero-shot ligands (5,612 ligands) have fewer than 20 samples and were reserved exclusively for testing to evaluate the model’s ability to generalize to entirely unseen binding interactions. To pre-vent data leakage between splits, we applied CD-HIT [ 18 ] clustering with a 40% identity threshold to group similar protein sequences, ensuring that clusters rather than individual sequences were assigned to different sets. This approach maintains the independence of training and evaluation data while preserving the biological diversity within each category. Complete details of the clustering procedure and data splitting methodology are provided in Appendix B. To ensure compatibility across different CLMs, we applied minor modifications to the dataset during integration. Specifically, 29 ligands were excluded from the final evaluation dataset. Complete details of these compatibility adjustments are provided in Appendix B. Although we used the reduced version for our experiments, we release the original unmodified dataset as well to facilitate future research. 3.2 Predictor Architecture The proposed architecture leverages transformer-based models, specifically utilizing Bert-style pretrained language models tailored separately for proteins (PLMs) and chemicals (CLMs). PLMs such as ESM-2 [ 19 ] provide contextualized residue-level embeddings based solely on protein sequences. CLMs like MolFormer [ 20 ] are employed to represent ligand structures using their SMILES notation. These pre-trained models provide robust feature representations essential for accurately predicting binding sites. 3.2.1 Stage 1: Baseline Model with Single-Ligand Prediction Stage 1 establishes our baseline architecture, which predicts binding sites for one ligand type at a time. Let p be an input protein sequence of fixed length L . The sequence is tokenized and passed through a pre-trained protein language model G θ , whose parameters are denoted by θ . This model outputs contextualised residue-level embeddings G θ ( p ) ∈ ℝ L × d , where d is the hidden dimension. The embeddings are then fed into a linear predictor (binary classification head) C ϕ , parameterised by ϕ , to produce a logit for each residue ( Equation 1 ). At this stage, the architecture operates independently of the ligand-specific context, focusing exclusively on individual ligand predictions without cross-ligand generalization ( Figure 2 ). Download figure Open in new tab Figure 2. Stage-1 (single-ligand, sequence-only) baseline. A protein sequence p is tokenized and encoded by a pretrained PLM G θ to yield residue embeddings G θ ( p ) ∈ ℝ L × d . A linear head C ϕ produces residue-wise binding logits ŷ ∈ ℝ L ×2 ( Equation 1 ). No ligand conditioning is used; a separate model is trained per ligand type. Gray cells indicate padding/masks. The calculated loss for a given sample first uses the standard binary cross-entropy (BCE) formulation. To emphasize hard-to-classify residues while correcting for dataset- and class-specific imbalance, we multiply each residue by a sample weight w i , yielding the weighted loss in ( Equation 2 ). Here w i ≥ 0 is the product of a dataset-level weight and, when enabled, a positive-class token weight. 3.2.2 Stage 2: Multi-Ligand Prediction with Ligand-Specific Conditioning Stage 2 augments the baseline by conditioning on ligand identity, enabling the simultaneous prediction of binding sites for multiple pre-defined ligand types. Protein sequences p are encoded by the same G θ used in Stage 1, producing residue embeddings G θ ( p ) ∈ ℝ L × d . Ligand information is supplied as an index ℓ that queries a trainable embedding table E η , parameterized by η , yielding an embedding table E η ( ℓ ) ∈ ℝ d (analogous to word embeddings, e.g. word2vec [ 21 ]). A bidirectional transformer decoder T ψ , whose parameters are indicated by ψ , integrates protein and ligand representations through cross-attention, after which, similar to stage 1, the linear binary classification head C ϕ produces residue level logits ( Equation 3 ). This stage supports all ligand types seen during training, but cannot yet be generalized to novel ligands ( Figure 3 ). Download figure Open in new tab Figure 3. Stage-2 (multi-ligand with learned ligand embedding). The protein is encoded by G θ as in Stage 1. Ligand identity ℓ indexes a trainable embedding table E η ( ℓ ) ∈ ℝ d . A bidirectional decoder T ψ fuses protein and ligand signals via cross-attention, and the classifier C ϕ outputs residue-wise logits ( Equation 3 ). This design supports all ligands observed during training but does not generalize to novel ligands. Gray cells indicate padding/masks; the right panel sketches the predefined ligand set. 3.2.3 Stage 3: Zero-Shot Generalization with a Pre-Trained Chemical Language Model Stage 3 equips the model with zero-shot capability by replacing the fixed ligand-embedding mechanism with a pre-trained chemical language model. Protein sequences p are encoded by the same G θ as previous stages. Each ligand is supplied directly as a SMILES string s and embedded by the chemical language model F ω , whose parameters are denoted by ω , yielding an embedding F ω ( s ) ∈ ℝ L × c whose dimensionality c generally differs from the protein feature dimension d . To align these feature spaces we introduce a learnable linear projector P ρ : ℝ l × c → ℝ L × d , parame-terised by ρ , producing P ρ F ω ( s ) ∈ ℝ L × d for the cross-attention layers of the same decoder T ψ . The fused representation is then passed to the binary classifier head C ϕ . Because F ψ is pre-trained on large chemical corpora, the architecture ( Equation 4 ) can technically predict binding sites for entirely new ligand-protein combinations, including those that have not been encountered during training ( Figure 4 ). Download figure Open in new tab Figure 4. Stage-3 ( zero-shot ) architecture. A protein sequence p of length L is embedded by a pretrained PLM G θ (left), and the ligand SMILES s with l tokens is embedded by a pretrained CLM F ω (right). A learnable projector P ρ maps CLM features to the protein feature dimension, and a bidirectional decoder T ψ fuses the two streams via cross-attention; a linear head C ϕ then outputs residue-wise binding logits ŷ ∈ ℝ L ×2 ( Equation 4 ). Gray cells indicate padding/masks. Conditioning on SMILES rather than a fixed ligand ID enables inference on unseen ligands ( zero-shot residue-level prediction). 3.3 Evaluation Metrics We evaluate residue-level binary predictions with four standard metrics: Accuracy, F 1 , Macro F 1 , and Matthews correlation coefficient (MCC). Let y i ∈ {0, 1} denote the ground-truth label for residue i and ŷ i ∈ [0, 1 ] the predicted binding probability. We obtain a hard label via a fixed threshold τ (default τ = 0.5): . Summing over an evaluation subset yields counts TP, FP, TN, FN. Accuracy F 1 score With precision and recall , Macro F 1 For a ligand set ℒ (e.g., overrepresented, underrepresented ), we compute sepa-rately for each ligand ℓ and report the unweighted mean: For the zero-shot group, we report both the pooled F 1 (computed over all residues from unseen ligands jointly) and Macro F 1 (mean of per-ligand F 1 ). Matthews Correlation Coefficient (MCC) Unless otherwise noted, all metrics are computed at the residue level on the held-out test sets using the threshold τ = 0.5. 4 Experiments We constructed a base architecture and systematically extended it in two progressive stages to enable comprehensive evaluation across all ligand representation categories. These categories include overrepresented, underrepresented , and zero-shot ligands. Throughout our experiments, we evaluated multiple combinations of pre-trained language models: specifically, PLMs including different scales of ESM-2, and CLMs including MolFormer and UniMol-2 [ 22 ]. Optimization procedures employed the AdamW optimizer [ 23 ], configured with a weight decay of 0.01, beta-1 and beta-2 parameters set to β 1 = 0.9, β 2 = 0.98, respectively, and ϵ = 1 e − 7. We implemented a cosine annealing learning rate schedule [ 24 ], without incorporating an initial warm-up phase, wherein the learning rate starts from 5e-5 and ends at 0. All experiments were implemented using PyTorch 2.6.0 framework [ 25 ], and employed mixed-precision BF16 training [ 26 , 27 ] on an NVIDIA A100 GPU with 80GB of memory. Detailed hyperparameter configurations are documented in Appendix D. 4.1 Stage 1: Single Ligand In Stage 1, we trained individual models for each ligand type without using any ligand-specific embeddings or chemical representations. We focused this stage on the 45 overrepresented ligands in our dataset and maintained the same hyperparameter setup for each model. The result of each ligand was obtained from a single training run. Training was monitored using a validation set, with the best checkpoint from each run selected based on the peak validation F 1 score. Final evaluations were conducted on the held-out overrepresented test set. It is worth mentioning that we only used ESM-2 650m for the PLM at this stage. The overall results are provided in Tables 2 and 4 . The results, summarized in Table S3, provide a detailed performance baseline for residue-level binary classification. View this table: View inline View popup Download powerpoint Table 2: Benchmarking the three Stages on the test sets. Macro F 1 scores are based on the average of F 1 for each type of ligand. Despite uniform training procedures, we observed considerable variation in performance across lig- and types. While several ligands such as FAD and ZN achieved high F 1 scores, others, such as BMA and MAN , achieved zero or near-zero predictive accuracy. These discrepancies may reflect differences in sample size, but also suggest that certain ligands have inherently more complex binding behaviors. 4.2 Stage 2: Multiple Ligands For stage 2, we tested the effects of the extended architecture and exposure to multiple ligand types on the performance of the model. The introduction of ligand-specific information, implemented through the embedding table, enabled a single model to be trained on a compiled dataset consisting of multiple ligands. We evaluated this architecture under two training conditions: first, using only the 45 overrepresented ligands from Stage 1, and second, using an expanded training set that combined these overrepresented ligands with the set of underrepresented ligands. For both scenarios, results were obtained by averaging the performance across three training runs using random seeds, reported in Table 2 . In both settings, the Stage 2 model showed significantly improved predictive performance over the Stage 1 baseline with a Macro F 1 score of 0.4769, demonstrating the benefits of using multi-task learning. The comparison with Prot2Token on the overrepresented test set is shown in Table 4 . Need to mention that we only used ESM-2 650m PLM for this stage (Table S1). Fair benchmarking across sequence- and structure-based predictors is hampered primarily by data contamination: models pretrained or tuned on PDB-derived sequences/complexes can overlap (via homology or near-duplicates) with evaluation sets and inflate reported scores. We mitigate within-dataset leakage by CD-HIT clustering at 40% identity (Appendix B), but cross-corpus contamination remains a caveat when comparing to external tools. With that in mind, we include Boltz-2x [ 28 ]—a recent, high-performing open-source protein–ligand complex predictor—as a structure-based base-line alongside sequence-only methods. We ran Boltz-2x without multiple sequence alignments (MSAs) and truncated protein sequences to 1280, using only single protein sequences as input, so that its predictions were not advantaged by evolutionary context unavailable to our models. To evaluate the prediction of Boltz-2x at the residue level, we converted each predicted complex into binding-site labels by enumerating intermolecular atomic contacts: a contact is any protein–ligand atom pair whose distance is less than or equal the sum of their van der Waals radii + 0.5 Å ; a residue is designated binding if it forms more than 2 such contacts with the ligand. This yields residue-level labels directly comparable to our model’s outputs. 4.3 Stage 3: Zero-Shot Ligands In Stage 3, we evaluated the effect of using the MolFormer CLM to generate chemical embeddings, a novel approach that replaces the ligand embedding table from Stage 2 and abstracts ligands through the use of SMILES strings. We trained this architecture using the combined set of overrepresented and underrepresented training sets. Evaluation was conducted on three test sets and reported in Table S3. The reported results for this stage were obtained by averaging performance across three independent training rounds with randomized seeds. The comparison to Prot2Token [ 29 ] on the overrepresented test set is shown in Table 4 . We use Prot2Token as a primary comparator because it is among the strongest sequence-based predictors reported to date and has been shown to outper-form comparable methods. For fairness, our training/validation/test splits closely mirror those used for Prot2Token (from the same BioLiP2 source), differing only by minor per-ligand sample-count variations. When evaluating with the same set of hyperparameters used for the prior stages, we found that the Stage 3 model underperformed on both the overrepresented (0.5526 vs. 0.5832 Macro F 1 ) and underrepresented (0.3603 vs. 0.3752 Macro F 1 ) test sets compared to Stage 2. Nevertheless, Stage 3 offered a distinct advantage in its ability to perform zero-shot inference, achieving a F 1 score of 0.3109 on the test set of completely new ligands. 4.4 Scaling comparison To better understand the impact of model scale on protein-ligand binding site prediction, we conducted a series of ablation experiments varying both the size of the PLM backbone and the choice of CLM on the Stage 3 architecture. Specifically, we evaluated model performance across five ESM2 sizes (8M, 35M, 150M, 650M, and 3B), paired with different CLM configurations to assess how scaling affects predictive capability. In addition to varying the backbone size, we compared multiple CLMs, including MolFormer, UniMol-2 (both 84M and 570M parameter variants), and partially unfrozen versions of the 84M UniMol-2 model. To ensure a fair comparison across architectures, we held the training recipe fixed: similar optimization hyperparameters and the same architecture configurations (Tables S1 and S2). Each configuration was trained with three random seeds, and we report the mean results. As shown in Figure 5 , scaling the protein encoder dominates performance: larger ESM-2 backbones consistently yield higher Macro F 1 across overrepresented, underrepresented , and zero-shot test sets, largely independent of the paired CLM. In contrast, increasing the chemical encoder’s scale (e.g., UniMol-2 84M → 570M) produces negligible or inconsistent gains. Among CLMs, MolFormer is the most reliable—typically matching or exceeding the UniMol-2 variants with lower variability. Download figure Open in new tab Figure 5. Results of ablation studies on the three ligand test sets based on F 1 : (a) overrepresented, (b) underrepresented , and (c) zero-shot test sets. 5 Discussion In this study, we asked whether residue-level binding prediction can be made both ligand-aware and zero-shot from sequence and SMILES alone. The three-stage progression answers yes , and clarifies where the gains come from. Moving from per-ligand models to a single multi-ligand model (Stage 1→Stage 2) delivers a large improvement on overrepresented ligands (Macro F 1 0.4769→0.5832; Table 2 ), while remaining competitive or better than strong sequence baselines across many ions and cofactors (e.g., Zn 2+ F 1 = 0.818 vs. 0.557 for Boltz-2x under residue-label conversion; Table 3 ). Because our approach avoids 3D inference, it is also highly efficient: empirically, we observe ≥ 100× higher throughput than structure-based pipelines such as Boltz-2x under our evaluation setting, making it well-suited as a front-end, high-throughput virtual screening stage prior to more expensive 3D modeling or docking. View this table: View inline View popup Download powerpoint Table 3: Comparison of our method’s best performance for each ligand with other available methods on selected ligands in the overrepresented test set based on F 1 score. The main values are taken from the original papers, and * indicates methods evaluated on our test sets. View this table: View inline View popup Download powerpoint Table 4: Comparison of Prot2Token with our approach on the top 40 ligands in the overrepresented test set. †: values are reported from the original paper. Replacing the lookup embedding with a CLM over SMILES (Stage 3) unlocks zero-shot generalization: on the held-out set of unseen ligands, the model reaches F 1 = 0.3109 (Macro F 1 = 0.2338; Table 2 ). The price is a modest drop on represented ligands relative to Stage 2 ( overrepresented Macro F 1 0.5526 vs. 0.5832; underrepresented 0.3603 vs. 0.3752). Together with the ablation in Fig. 5 , which shows consistent gains from scaling the protein encoder (ESM-2) but only small or inconsistent effects from scaling the CLM, these results suggest that richer protein context is the dominant driver of binding-site discrimination, while current cross-modal fusion likely leaves accuracy on the table for seen ligands. Two factors likely limit headroom. First, label imbalance and noise (99%+ accuracy with far lower F 1 ) make optimization sensitive to calibration and thresholds, especially for sugars and small polar ligands where Stage 1 underperforms (e.g., BMA , MAN ; Table S3). Second, CLM–PLM alignment is learned through a shallow projector; stronger fusion could both recover the small Stage 3 degradation on seen ligands and actively elevate the importance of chemical representations, allowing the CLM to contribute more decisively (and potentially make CLM scale matter) without sacrificing zero-shot ability. 6 Conclusion We introduced a homology-controlled benchmark and a progressive, sequence-based modeling suite for residue-level protein–ligand binding prediction that spans overrepresented, underrepresented , and zero-shot regimes. Because the method avoids 3D inference, it is computationally lightweight and, in our setting, achieves ≥ 100× higher throughput than structure-based pipelines, making it well-suited as a front-end filter in high-throughput screening: from sequence and SMILES alone, one can triage protein-ligand pairs and prioritize a small subset for docking or structure prediction. We release code, data, and model weights to catalyze reproducible progress on zero-shot, residue-level binding prediction: https://github.com/mahdip72/ProteinLigand Appendix A Extended Related Work In this section, we categorize existing protein–ligand binding site prediction studies into distinct groups. For each study, we briefly describe two key aspects: the structure of the proposed method and the strategy used to address the class imbalance between binding and non-binding sites, which remains one of the main challenges in developing effective deep learning models for this task. While some previous works have grouped prediction methods into sequence and structure-based approaches [ 32 , 3 ], our focus is primarily on sequence-based methods. We further classify these studies into three groups based on the techniques used and the order in which they were developed. The earliest approaches applied traditional machine learning methods [ 5 , 33 ], followed by deep learning-based methods [ 34 , 4 ], and more recently, pretrained models-based methods [ 6 , 16 ]. We also include some studies that combine sequence and structure-based methods due to their novelty in extracting sequence features or addressing the class imbalance problem, as these provide valuable insights and broaden the scope for the reader. A.1 Traditional Machine Learning Methods Yu et al. [ 5 ] proposed TargetS, which integrates multiple features for binding residue prediction, including position-specific scoring matrices (PSSMs) to capture protein evolutionary information, predicted secondary structures obtained using PSIPRED, and ligand-specific binding propensities. The ligand-specific propensities are calculated based on the frequencies of amino acids among binding sites for each ligand type. A 17-dimensional window centered at the target residue is used to capture not only the residue’s own propensity but also the influence of its local environment. To enhance prediction accuracy, the authors employed a modified AdaBoost (MAdaBoost) ensemble scheme that combines multiple classifiers. To address the class imbalance between binding and non-binding sites, they applied a random undersampling strategy within the AdaBoost framework. Yang et al. [ 33 ] proposed TM-SITE and S-SITE, two template-based methods that leverage structural and sequence similarity, respectively, to predict ligand-binding sites. TM-SITE performs structural comparisons by aligning the subsequence spanning from the first to the last binding residue (SSFL) of the query protein against a pre-calculated SSFL library using TM-align. Proteins with the highest alignment scores are selected as putative templates. A secondary scan is then conducted using the entire query structure against the BioLiP database. Final predictions are obtained through a clustering and voting scheme. S-SITE, in contrast, relies on sequence-based alignment. PSI-BLAST is used to generate multiple sequence alignments for the query protein, from which a position-specific frequency matrix (PSFM) is derived. Template profiles in the BioLiP library, represented as position-specific scoring matrices (PSSMs), are precomputed in advance. The PSFM of the query is then compared to these template PSSMs using sequence alignment techniques, and a voting scheme is applied to determine the final binding sites. Hu et al. [ 35 ] introduced two ligand-specific methods for predicting ion-binding sites. IonSeq is a sequence-based approach that employs sequence profiles together with a modified AdaBoost algorithm to distinguish between binding and non-binding sites. IonCom is a composite method that combines IonSeq with multiple template-based predictors, including COFACTOR, TM-SITE, S-SITE, and COACH, to improve prediction accuracy and robustness. To address class imbalance and reduce the risk of overfitting, both methods use a modified AdaBoost framework that applies selective random sampling to negative (non-binding) residues while fully utilizing all positive (binding) residues in every training round. Hu et al. [ 36 ] proposed ATPbind, a method that combines sequence-based features, including position-specific scoring matrices (PSSMs), predicted secondary structure, and solvent accessibility, with the outputs of S-SITE and TM-SITE, which are sequence- and structure-template-based predictors, respectively. To address the imbalance between binding and non-binding sites, ATPbind employs multiple support vector machines (SVMs) trained with a random undersampling strategy and integrates their outputs using a mean-ensemble approach. Lin et al. [ 37 ] presents the MIB webserver, which utilizes available templates from the Protein Data Bank (PDB) to identify binding sites for 12 types of metal ions. The method compares the structure of query proteins with these templates without any data training. Metal ion–binding residue templates are extracted from the PDB, and homologous proteins are filtered out to ensure diversity. The server aligns the query protein with the templates and assigns a binding score based on sequence and structure similarity. MIB also predicts the docking positions of metal ions in protein structures. A.2 Deep Learning-Based Methods Cui et al. [ 34 ] introduced DeepCSeqSite (DCS-SI), a novel sequence-based approach for predicting protein–ligand binding sites. DCS-SI is built on a deep convolutional neural network with an encoder–decoder architecture. The encoder transforms entire amino acid sequences into hierarchical representations that capture both local and long-range dependencies between residues. The decoder then uses these features to predict binding sites. To address the class imbalance issue, the authors did not apply explicit oversampling or undersampling techniques. Instead, they leveraged the strong representation capability of their deep convolutional network and processed entire amino acid sequences as input units during mini-batch grouping. This approach preserved the natural proportion of positive (binding) and negative (non-binding) samples within each mini-batch, reducing the risk of batches dominated by negative samples and enabling the model to learn from a more representative distribution. Xia et al. [ 4 ] proposed DELIA, a hybrid deep learning model that combines 1D sequence-based features with a 2D structure-based distance matrix representing the Euclidean distances between residues in a protein structure. The model is composed of three main modules: a feature extractor, a ResNet module, and a BiLSTM module. The feature extractor utilizes various tools such as PSI-BLAST, HHblits, SCRATCH-1D, and S-SITE to generate features, which are then concatenated. The ResNet and BiLSTM modules are employed to extract high-level representations, and their outputs are concatenated and passed through a softmax layer to generate the final predictions. To address the class imbalance issue, the authors applied a combination of random undersampling and oversampling. For undersampling, they constructed multiple training subsets, each containing all positive samples and 20% of the original negative samples. For oversampling, they ensured that each negative sample was used only once per epoch, while positive samples were randomly selected and reused multiple times across mini-batches. Essien et al. [ 30 ] utilized a Capsule Network (CapsNet) for predicting zinc-binding sites. The model processes protein sequences using a sliding window approach, where each window consists of 25 amino acids. These fixed-length segments are extracted from longer sequences and processed individually. Convolutional layers extract increasingly abstract features, while the PrimaryCaps layer captures higher-level representations. Dynamic routing further refines these features, and the output layer uses them to predict zinc-binding sites. To address class imbalance, the authors applied a bootstrapping strategy. Specifically, in each iteration, a deep learning classifier was trained on a balanced subset of the data, containing all positive samples and a portion of negative samples. This process was repeated multiple times to ensure that all negative samples were used across different iterations, resulting in several independent classifiers. The final prediction was obtained by averaging the outputs of these classifiers. Xia et al. [ 38 ] proposed GraphBind, a model that leverages both structure-based and sequence-based features to identify binding sites. GraphBind consists of three main modules: feature extraction, structural context extraction, and graph construction. In the feature extraction module, both sequence and structure-based features are collected. Sequence features include position-specific scoring matrices (PSSMs) and hidden Markov models (HMMs), generated using PSI-BLAST and HHblits. Structure features include atomic properties and secondary structure (SS) information derived from DSSP analysis. The structural context extraction module defines a local environment around each target residue using a sliding sphere, creating pseudo-positions that represent the 3D context of the residue. In the graph construction module, each residue is treated as a node. Node feature vectors, a distance matrix, an adjacency matrix, and edge feature vectors are used to build the graph. The resulting graph-based representations are then passed to a classifier that determines whether a residue is part of a binding site. To address the class imbalance issue, the authors first applied BL2SEQ and TM-align to assess sequence identity and structural similarity between protein chain pairs. Chains with sequence identity > 0.8 and TM-score > 0.5 were clustered together. Binding site annotations from chains within each cluster were transferred to the chain with the largest number of residues. Finally, CD-HIT was used to remove redundant sequences, reducing sequence identity in the training set to below 30%. Xia et al. [ 7 ] presented BindWeb, a model that integrates predictions from GraphBind and DELIA, using mean shift clustering to identify binding sites. The binding scores predicted by GraphBind and DELIA are averaged to generate the final prediction. For each residue, the averaged binding score from both methods is compared to an averaged threshold (derived from the individual thresholds of GraphBind and DELIA) to classify the residue as binding or non-binding. To address the data imbalance issue, DELIA employs a random undersampling-based ensemble strategy. Essien et al. [ 39 ] proposed GPred, a model with four main modules for predicting the location of coordinated metal ion–ligand binding sites in proteins. The Point Neighborhood Grouper finds the nearby atoms around each atom in the protein’s point cloud. The Point Transformer combines information about atomic properties, shapes, and evolutionary data, using self-attention and position encoding. The Residual Pooler changes the information from the atom level to the residue level using a MaxPooling layer, helping the model see how atoms work together in the protein. The Classifier then decides if each residue is a binding site or not. The authors used a weighted binary cross-entropy (BCE) loss function to reduce the effect of data imbalance. Chelur and Priyakumar [ 40 ] proposed BiRDS, a deep Residual Neural Network designed to predict a protein’s most active binding site using only sequence information. It extracts features such as token embeddings, positional embeddings, segment embeddings, Position-Specific Scoring Matrix, information content, secondary structure, and solvent accessibility, which are combined through simple concatenation to form the input feature vector. This vector is fed into the BiRDS model for classification, and to address the imbalance between binding and non-binding sites, the model is trained using a weighted binary cross-entropy loss function. A.3 Pretrained Model-Based Methods Yuan et al. [ 6 ] proposed LMetalSite, a method in which the protein sequence is input into a pretrained protein language model to generate embedding representations. To prevent overfitting during training, Gaussian noise is added to these embeddings. The noise-augmented embeddings are then passed through shared transformer networks composed of multiple layers. The output from these shared layers is fed into ion-specific multilayer perceptrons (MLPs), each tailored to predict the binding patterns of one of four different metal ions. To address the limited availability of training data and to capture shared patterns among different ions, the method adopts a multi-task learning framework. Essien et al. [ 14 ] proposed IonPred, a two-part framework for ion-binding site prediction. The first component is a MLM generator, which takes protein sequences with certain amino acids masked and predicts the original residues based on the contextual information from surrounding residues. The second component is an ELECTRA-based discriminator, which evaluates the full sequence and determines whether each amino acid is original or replaced by the generator. To generate training samples, the frequency distribution of all 20 amino acids was computed for each ligand type to identify candidate residues. A sliding window centered on each candidate residue was then used to define positive and negative samples. Pourmirzaei et al. [ 15 ] proposed Prot2Token, a model that combines a protein language model with an autoregressive transformer. The model takes protein sequences and task tokens (representing ligand types) as input to predict the residue indices of binding sites. The protein encoder processes the input sequence, and its output is combined with learnable positional embeddings before being projected. This projected context, together with task-specific token embeddings, is used by the decoder to generate autoregressive outputs for binding site prediction. The decoder is pre-trained in a self-supervised way to provide initial weights, which is especially helpful when the number of training samples is small. Littmann et al. [ 13 ] proposed bindEmbed21, which has two main components: bindEmbed21DL (deep learning) and bindEmbed21HBI (homology-based inference). bindEmbed21DL uses embeddings from the Transformer-based protein language model ProtT5 as input, with a relatively shallow two-layer convolutional neural network (CNN) as its core. bindEmbed21HBI is based on the idea that proteins with high sequence similarity are often evolutionarily related and therefore share similar functions, including binding sites. The final bindEmbed21 method combines the strengths of both bindEmbed21DL and bindEmbed21HBI. To address class imbalance, a weighted cross-entropy loss function was used, with individual weights assigned to each ligand class. Wang et al. [ 9 ] proposed CLAPE-SMB, a method designed to predict the probability of general small molecule binding sites on a protein, rather than binding sites for a specific ligand. The model uses ESM-2 to extract informative features relevant to binding sites from the input protein sequence. The weights of ESM-2 are kept fixed during training to avoid the high computational cost of fine-tuning and to prevent catastrophic forgetting. The backbone of CLAPE-SMB is a multi-layer perceptron (MLP) composed of five fully connected layers, which uses the features extracted by ESM-2 to classify residues as binding or non-binding for small molecules. To address class imbalance, the model employs a combined loss function that integrates class-balanced focal loss and triplet center loss. Zhang and Xie [ 16 ] proposed LaMPSite, which takes a protein’s amino acid sequence and a ligand’s 2D molecular graph as input. The protein sequence is processed by the ESM-2 protein language model to generate residue embeddings and a contact map indicating which residues are likely close in 3D space. The ligand’s molecular graph is processed by a graph neural network (GNN) to produce atom embeddings and a distance map estimating atom–atom distances from a quick 3D conformer. Each residue vector is then paired with each atom vector to create an interaction map, which is refined by an interaction module using both the protein contact map and ligand distance map. For each residue, interaction scores are averaged across all ligand atoms to estimate binding likelihood, and nearby high-scoring residues are grouped based on the protein contact map into predicted binding pockets. B Data Pre-Processing BioLiP2 [ 17 ] was utilized in this work as a comprehensive and curated database for biologically relevant protein–ligand binding interactions. It primarily sources data from the Protein Data Bank (PDB), supplemented with annotations from literature and other specialized databases such as Binding MOAD [ 41 ] and BindingDB [ 42 ]. Three main files were used: BioLiP _nr.txt.gz , which provides annotations for each ligand–protein interaction site, including binding site residues with PDB residue numbering, binding site residues re-numbered starting from 1, and additional information such as structure resolution and binding affinity; protein _nr.fasta.gz , which contains protein receptor sequences clustered at a 90% identity cutoff, formatted in FASTA where lines starting with “ > ” represent sequence IDs (e.g., > 1hv2A) followed by amino acid sequences; and ligand.tsv.gz , which provides a ligand summary including the Chemical Component Dictionary (CCD), chemical formula, SMILES strings, and other identifiers. These resources were used to retrieve protein sequences, ligand SMILES representations, and binding site information for subsequent analysis. We removed any sequences containing fewer than 50 residues and excluded ligands without SMILES representations. Binary labels representing binding and non-binding sites were generated based on the binding _residues_ renum column, which contains the positions of binding sites. For each sequence, an equal number of zeros was initially assigned, and then the positions corresponding to binding sites were set to ones, representing binding sites, while the remaining positions retained zeros to indicate non-binding sites. First, the ligands were categorized based on the number of associated samples: Overrepresented : ligands with more than 100 samples Underrepresented : ligands with fewer than 100 samples Zero-shot : ligands with fewer than 20 samples After categorizing the ligands and their corresponding samples, we generated a separate FASTA file for each ligand. Each FASTA file contained all sequences that bind to that ligand, with the sequence ID followed by the amino acid sequence. Next, we applied CD-HIT to each ligand’s FASTA file individually, clustering the sequences into groups of similar sequences using a 40% identity threshold. The resulting clusters were then assigned to the training, validation, and testing sets according to the predefined split ratios. Finally, the overrepresented and underrepresented training sets were combined into a single training set, and their evaluation sets were merged into one evaluation set. For testing, we maintained three separate test sets: the overrepresented test set, the underrepresented test set, and the zero-shot test set. The Chemical Component Dictionary (CCD) was used to retrieve SMILES strings for each ligand from ligand.tsv.gz . This file contains multiple SMILES variants, although the BioLiP2 documentation does not specify what each variant represents. For consistency, we used the first SMILES string provided. A comparison with RCSB PDB [ 43 ] confirmed that this first entry corresponds to the standard SMILES and Canonical SMILES generated by OpenEye OEToolkits. We ended up with 5 780 ligands and 41 327 sequences. In order to have a systematic evaluation, we divided them into three sets. The first category includes overrepresented ligands, we broke it into two parts: Defined as those that bind to at least 200 protein sequences. This contained 32 ligands, which were split into training, evaluation, and testing sets using a 70%, 10%, and 20% ratio, respectively. Defined as those that bind to fewer than 200 sequences but at least 100 sequences. This category contained 13 ligands, and their data were split into training, evaluation, and testing sets using a 50%, 20%, and 30% ratio, respectively. The second category includes underrepresented ligands, defined as those that bind to fewer than 100 sequences but at least 20 sequences. This category contained 123 ligands, and their data were split into training, evaluation, and testing sets using a 34%, 33%, and 33% ratio, respectively. CD-HIT clusters sequences into groups based on similarity, ensuring that all sequences within a cluster are similar up to a specified threshold, and no sequence in one cluster is similar (beyond that threshold) to sequences in another cluster. After clustering, the next step is to assign clusters to the training, evaluation, and testing sets according to the predefined splitting ratios. However, for ligands with a small number of highly similar sequences, CD-HIT may produce fewer than three clusters. In such cases, there are not enough clusters to split across training, evaluation, and testing sets properly. This occurred for only one ligand in the underrepresented category ( CL0 ), which is why it was excluded. For the zero-shot category, we did not use CD-HIT because this category was reserved exclusively for the testing phase. The third category is the zero-shot category, which includes ligands that bind to fewer than 20 sequences but at least one. For this category, 100% of the sequences were used for testing, resulting in 5 612 ligands. During integration with the UniMol-2 CLMs, we applied minor changes to the dataset to ensure compatibility. UniMol-2’s built-in SMILES parser imposed strict parsing restraints compared to the MolFormer CLM. To standardize ligand representation in different CLMs, we attempted to use a standard canonical SMILES generated by OpenEye OEToolkits [ 44 ]. However, some SMILES representations under this format were rejected by the UniMol-2 parser. In such cases, we attempted to substitute alternative SMILES variants. These alternatives were selected from the additional SMILES strings listed in the CCD file. If no alternatives were accepted or available, the ligand was excluded from the dataset. A ligand, TRP , appeared in the underrepresented sets, required SMILES modification to be parsed successfully. In the zero-shot test set, 88 ligands required SMILES substitutions in order to be parsed, while 29 ligands had no valid alternatives and were excluded from the zero-shot test set at the end. The excluded ligands from the zero-shot test set included 08T , 7RZ , 8M0 , AOH , B1M , COB , D6N , DVT , GCR , ICS , K6G , KEG , MO7 , NFV , OEX , OZN , PQJ , R5Q , S5Q , S9F , SIW , TEW , U00 , UZC , V9G , WJS , WO2 , X3P , and ZRW . C Architecture View this table: View inline View popup Download powerpoint Table S1: Key hyperparameters for Stage 2 and Stage 3 architectures across different model scales. View this table: View inline View popup Download powerpoint Table S2: Key hyperparameters for Stage 3 architectures across different CLM scales. D Experiments View this table: View inline View popup Download powerpoint Table S3: Benchmark of stage 1 architecture on the ligands of the overrepresented test set. View this table: View inline View popup Download powerpoint Table S4: Ligand-based F 1 comparison of Prot2Token with our three approaches on the overrepresented test set. All values of Prot2Token are reported from the original paper. Footnotes { mpngf{at}missouri.edu , saakdr{at}missouri.edu , dc57y{at}missouri.edu , xudong{at}missouri.edu } mohammadreza.pourmirzaeioliaei{at}mail.polimi.it https://github.com/mahdip72/ProteinLigand References [1]. ↵ Jingtian Zhao , Yang Cao , and Le Zhang . Exploring the computational methods for proteinligand binding site prediction . Computational and structural biotechnology journal , 18 : 417 – 426 , 2020 . OpenUrl [2]. Ashwin Dhakal , Cole McKay , John J Tanner , and Jianlin Cheng . Artificial intelligence in the prediction of protein–ligand interactions: recent advances and future directions . Briefings in Bioinformatics , 23 ( 1 ): bbab476 , 2022 . OpenUrl CrossRef PubMed [3]. ↵ Ying Xia , Xiaoyong Pan , and Hong-Bin Shen. A comprehensive survey on protein-ligand binding site prediction . Current Opinion in Structural Biology , 86 : 102793 , 2024 . OpenUrl CrossRef PubMed [4]. ↵ Chun-Qiu Xia , Xiaoyong Pan , and Hong-Bin Shen. Protein–ligand binding residue prediction enhancement through hybrid deep heterogeneous learning of sequence and structure data . Bioinformatics , 36 ( 10 ): 3018 – 3027 , 2020 . OpenUrl CrossRef PubMed [5]. ↵ Dong-Jun Yu , Jun Hu , Jing Yang , Hong-Bin Shen, Jinhui Tang , and Jing-Yu Yang . Designing template-free predictor for targeting protein-ligand binding sites with classifier ensemble and spatial clustering . IEEE/ACM transactions on computational biology and bioinformatics , 10 ( 4 ): 994 – 1008 , 2013 . OpenUrl [6]. ↵ Qianmu Yuan , Sheng Chen , Yu Wang , Huiying Zhao , and Yuedong Yang . Alignment-free metal ion-binding site prediction from protein sequence through pretrained language model and multi-task learning . Briefings in bioinformatics , 23 ( 6 ): bbac444 , 2022 . OpenUrl CrossRef PubMed [7]. ↵ Ying Xia , Chunqiu Xia , Xiaoyong Pan , and Hong-Bin Shen. Bindweb: a web server for ligand binding residue and pocket prediction from protein structures . Protein Science , 31 ( 12 ): e4462 , 2022 . OpenUrl CrossRef PubMed [8]. Yitian Fang , Yi Jiang , Leyi Wei , Qin Ma , Zhixiang Ren , Qianmu Yuan , and Dong-Qing Wei . Deepprosite: structure-aware protein binding site prediction using esmfold and pretrained language model . Bioinformatics , 39 ( 12 ): btad718 , 2023 . OpenUrl CrossRef PubMed [9]. ↵ Jue Wang , Yufan Liu , and Boxue Tian . Protein-small molecule binding site prediction based on a pre-trained protein language model with contrastive learning . Journal of cheminformatics , 16 ( 1 ): 125 , 2024 . OpenUrl PubMed [10]. ↵ Gelany Aly Abdelkader and Jeong-Dong Kim . Advances in protein-ligand binding affinity prediction via deep learning: A comprehensive study of datasets, data preprocessing techniques, and model architectures . Current drug targets , 25 ( 15 ): 1041 – 1065 , 2024 . OpenUrl CrossRef PubMed [11]. Amit Gangwal , Azim Ansari , Iqrar Ahmad , Abul Kalam Azad , and Wan Mohd Azizi Wan Sulaiman . Current strategies to address data scarcity in artificial intelligence-based drug discovery: A comprehensive review . Computers in Biology and Medicine , 179 : 108734 , 2024 . OpenUrl CrossRef PubMed [12]. ↵ Tobias Harren , Torben Gutermuth , Christoph Grebner , Gerhard Hessler , and Matthias Rarey . Modern machine-learning for binding affinity estimation of protein–ligand complexes: Progress, opportunities, and challenges . Wiley Interdisciplinary Reviews: Computational Molecular Science , 14 ( 3 ): e1716 , 2024 . OpenUrl [13]. ↵ Maria Littmann , Michael Heinzinger , Christian Dallago , Konstantin Weissenow , and Burkhard Rost . Protein embeddings and deep learning predict binding residues for various ligand classes . Scientific reports , 11 ( 1 ): 23916 , 2021 . OpenUrl PubMed [14]. ↵ Clement Essien , Lei Jiang , Duolin Wang , and Dong Xu . Prediction of protein ion–ligand binding sites with electra . Molecules , 28 ( 19 ): 6793 , 2023 . OpenUrl CrossRef PubMed [15]. ↵ Mahdi Pourmirzaei , Salhuldin Alqarghuli , Farzaneh Esmaili , Mohammadreza Pourmirzaei , Mohsen Rezaei , and Dong Xu . Using autoregressive-transformer model for protein-ligand binding site prediction . bioRxiv , pages 2025 – 03 , 2025 . [16]. ↵ Shuo Zhang and Lei Xie . Protein language model-powered 3d ligand binding site prediction from protein sequence . arXiv preprint arxiv: 2312.03016 , 2023 . [17]. ↵ Chengxin Zhang , Xi Zhang , Peter L Freddolino , and Yang Zhang . Biolip2: an updated structure database for biologically relevant ligand–protein interactions . Nucleic Acids Research , 52 ( D1 ):D404–D412, 2024 . [18]. ↵ Weizhong Li and Adam Godzik . Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences . Bioinformatics , 22 ( 13 ): 1658 – 1659 , 2006 . OpenUrl CrossRef PubMed Web of Science [19]. ↵ Zeming Lin , Halil Akin , Roshan Rao , Brian Hie , Zhongkai Zhu , Wenting Lu , Nikita Smetanin , Robert Verkuil , Ori Kabeli , Yaniv Shmueli , et al. Evolutionary-scale prediction of atomic-level protein structure with a language model . Science , 379 ( 6637 ): 1123 – 1130 , 2023 . OpenUrl CrossRef PubMed [20]. ↵ Fang Wu , Dragomir Radev , and Stan Z Li . Molformer: Motif-based transformer on 3d heterogeneous molecular graphs . In Proceedings of the AAAI Conference on Artificial Intelligence, volume 37 , pages 5312 – 5320 , 2023 . [21]. ↵ Kenneth Ward Church . Word2vec . Natural Language Engineering , 23 ( 1 ): 155 – 162 , 2017 . OpenUrl CrossRef [22]. ↵ Xiaohong Ji , Zhen Wang , Zhifeng Gao , Hang Zheng , Linfeng Zhang , Guolin Ke , et al. Unimol2: Exploring molecular pretraining model at scale . arXiv preprint arxiv: 2406.14969 , 2024 . [23]. ↵ I Loshchilov . Decoupled weight decay regularization . arXiv preprint arxiv: 1711.05101 , 2017 . [24]. ↵ Ilya Loshchilov and Frank Hutter . xSgdr: Stochastic gradient descent with warm restarts . arXiv preprint arxiv: 1608.03983 , 2016 . [25]. ↵ Jason Ansel , Edward Yang , Horace He , Natalia Gimelshein , Animesh Jain , Michael Voznesensky , Bin Bao, Peter Bell , David Berard , Evgeni Burovski , et al. Pytorch 2: Faster machine learning through dynamic python bytecode transformation and graph compilation . In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 , pages 929 – 947 , 2024 . [26]. ↵ Dhiraj Kalamkar , Dheevatsa Mudigere , Naveen Mellempudi , Dipankar Das , Kunal Banerjee , Sasikanth Avancha , Dharma Teja Vooturi , Nataraj Jammalamadaka , Jianyu Huang , Hector Yuen , et al. A study of bfloat16 for deep learning training . arXiv preprint arxiv: 1905.12322 , 2019 . [27]. ↵ Paulius Micikevicius , Sharan Narang , Jonah Alben , Gregory Diamos , Erich Elsen , David Garcia , Boris Ginsburg , Michael Houston , Oleksii Kuchaiev , Ganesh Venkatesh , et al. Mixed precision training . arXiv preprint arxiv: 1710.03740 , 2017 . [28]. ↵ Saro Passaro , Gabriele Corso , Jeremy Wohlwend , Mateo Reveiz , Stephan Thaler , Vignesh Ram Somnath , Noah Getz , Tally Portnoi , Julien Roy , Hannes Stark , et al. Boltz-2: Towards accurate and efficient binding affinity prediction . BioRxiv , pages 2025 – 06 , 2025 . [29]. ↵ Mahdi Pourmirzaei , Farzaneh Esmaili , Salhuldin Alqarghuli , Mohammadreza Pourmirzaei , Ye Han , Kai Chen , Mohsen Rezaei , Duolin Wang , and Dong Xu . Prot2token: A unified framework for protein modeling via next-token prediction . arXiv preprint arxiv: 2505.20589 , 2025 . [30]. ↵ Clement Essien , Duolin Wang , and Dong Xu . Capsule network for predicting zinc binding sites in metalloproteins . In 2019 IEEE International Conference on Bioinformatics and Biomedicine (BIBM) , pages 2337 – 2341 . IEEE , 2019 . [31]. Chih-Hao Lu , Chih-Chieh Chen , Chin-Sheng Yu , Yen-Yi Liu , Jia-Jun Liu , Sung-Tai Wei , and Yu-Feng Lin . Mib2: metal ion-binding site prediction and modeling server . Bioinformatics , 38 ( 18 ): 4428 – 4429 , 2022 . OpenUrl CrossRef PubMed [32]. ↵ Daniel Barry Roche , Danielle Allison Brackenridge , and Liam James McGuffin . Proteins and their interacting partners: An introduction to protein–ligand binding site prediction methods . International journal of molecular sciences , 16 ( 12 ): 29829 – 29842 , 2015 . OpenUrl PubMed [33]. ↵ Jianyi Yang , Ambrish Roy , and Yang Zhang . Protein–ligand binding site recognition using complementary binding-specific substructure comparison and sequence profile alignment . Bioinformatics , 29 ( 20 ): 2588 – 2595 , 2013 . OpenUrl CrossRef PubMed Web of Science [34]. ↵ Yifeng Cui , Qiwen Dong , Daocheng Hong , and Xikun Wang . Predicting protein-ligand binding residues with deep convolutional neural networks . BMC bioinformatics , 20 : 1 – 12 , 2019 . OpenUrl CrossRef PubMed [35]. ↵ Xiuzhen Hu , Qiwen Dong , Jianyi Yang , and Yang Zhang . Recognizing metal and acid radical ion-binding sites by integrating ab initio modeling with template-based transferals . Bioinformatics , 32 ( 21 ): 3260 – 3269 , 2016 . OpenUrl CrossRef PubMed [36]. ↵ Jun Hu , Yang Li , Yang Zhang , and Dong-Jun Yu . Atpbind: accurate protein–atp binding site prediction by combining sequence-profiling and structure-based comparisons . Journal of chemical information and modeling , 58 ( 2 ): 501 – 510 , 2018 . OpenUrl CrossRef PubMed [37]. ↵ Yu-Feng Lin , Chih-Wen Cheng , Chung-Shiuan Shih , Jenn-Kang Hwang , Chin-Sheng Yu , and Chih-Hao Lu . Mib: metal ion-binding site prediction and docking server . Journal of chemical information and modeling , 56 ( 12 ): 2287 – 2291 , 2016 . OpenUrl CrossRef PubMed [38]. ↵ Ying Xia , Chun-Qiu Xia , Xiaoyong Pan , and Hong-Bin Shen. Graphbind: protein structural context embedded rules learned by hierarchical graph neural networks for recognizing nucleicacid-binding residues . Nucleic acids research , 49 ( 9 ): e51 – e51 , 2021 . OpenUrl CrossRef PubMed [39]. ↵ Clement Essien , Ning Wang , Yang Yu , Salhuldin Alqarghuli , Yongfang Qin , Negin Manshour , Fei He , and Dong Xu . Predicting the location of coordinated metal ion-ligand binding sites using geometry-aware graph neural networks . Computational and Structural Biotechnology Journal , 27 : 137 – 148 , 2025 . OpenUrl [40]. ↵ Vineeth R Chelur and U Deva Priyakumar . Birds-binding residue detection from protein sequences using deep resnets . Journal of Chemical Information and Modeling , 62 ( 8 ): 1809 – 1818 , 2022 . OpenUrl PubMed [41]. ↵ Mark L Benson , Richard D Smith , Nickolay A Khazanov , Brandon Dimcheff , John Beaver , Peter Dresslar , Jason Nerothin , and Heather A Carlson . Binding moad, a high-quality protein– ligand database . Nucleic acids research , 36 ( Suppl 1 ): D674 – D678 , 2007 . OpenUrl PubMed [42]. ↵ Tiqing Liu , Yuhmei Lin , Xin Wen , Robert N Jorissen , and Michael K Gilson . Bindingdb: a web-accessible database of experimentally determined protein–ligand binding affinities . Nucleic acids research , 35 ( Suppl 1 ): D198 – D201 , 2007 . OpenUrl CrossRef PubMed Web of Science [43]. ↵ Helen M Berman , John Westbrook , Zukang Feng , Gary Gilliland , Talapady N Bhat , Helge Weissig , Ilya N Shindyalov , and Philip E Bourne . The protein data bank . Nucleic acids research , 28 ( 1 ): 235 – 242 , 2000 . OpenUrl CrossRef PubMed Web of Science [44]. ↵ Paul CD Hawkins , A Geoffrey Skillman , and Anthony Nicholls . Comparison of shapematching and docking as virtual screening tools . Journal of medicinal chemistry , 50 ( 1 ): 74 – 82 , 2007 . OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted September 30, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Zero-Shot Protein–Ligand Binding Site Prediction from Protein Sequence and SMILES Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Zero-Shot Protein–Ligand Binding Site Prediction from Protein Sequence and SMILES Mahdi Pourmirzaei , Salhuldin Alqarghuli , Kai Chen , Mohammadreza Pourmirzaei , Dong Xu bioRxiv 2025.09.28.679103; doi: https://doi.org/10.1101/2025.09.28.679103 Share This Article: Copy Citation Tools Zero-Shot Protein–Ligand Binding Site Prediction from Protein Sequence and SMILES Mahdi Pourmirzaei , Salhuldin Alqarghuli , Kai Chen , Mohammadreza Pourmirzaei , Dong Xu bioRxiv 2025.09.28.679103; doi: https://doi.org/10.1101/2025.09.28.679103 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7636) Biochemistry (17698) Bioengineering (13896) Bioinformatics (41959) Biophysics (21457) Cancer Biology (18597) Cell Biology (25523) Clinical Trials (138) Developmental Biology (13381) Ecology (19904) Epidemiology (2067) Evolutionary Biology (24323) Genetics (15612) Genomics (22511) Immunology (17738) Microbiology (40401) Molecular Biology (17184) Neuroscience (88624) Paleontology (667) Pathology (2833) Pharmacology and Toxicology (4825) Physiology (7644) Plant Biology (15158) Scientific Communication and Education (2046) Synthetic Biology (4296) Systems Biology (9825) Zoology (2271)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.