NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 41,545 characters · extracted from preprint-html · click to expand
NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations Ke Ding , Brian John Parker , Jiayu Wen doi: https://doi.org/10.1101/2025.08.17.670700 Ke Ding 1 Division of Genome Science and Cancer, John Curtin School of Medical Research, Australian National University 2 The Shine-Dalgarno Centre for RNA Innovation, Australian National University 3 ARC Centre of Excellence for the Mathematical Analysis of Cellular Systems Find 
this author on Google Scholar Find this author on PubMed Search for this author on this site Brian John Parker 3 ARC Centre of Excellence for the Mathematical Analysis of Cellular Systems 4 School of Computing, Australian National University 5 Biological Data Science Institute, Australian National University Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: Brian.Parker{at}anu.edu.au Jiayu.Wen{at}anu.edu.au Jiayu Wen 1 Division of Genome Science and Cancer, John Curtin School of Medical Research, Australian National University 2 The Shine-Dalgarno Centre for RNA Innovation, Australian National University 3 ARC Centre of Excellence for the Mathematical Analysis of Cellular Systems Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: Brian.Parker{at}anu.edu.au Jiayu.Wen{at}anu.edu.au Abstract Full Text Info/History Metrics Preview PDF Abstract Pre-training large language models on genomic sequences has become a powerful approach for learning biologically meaningful representations. While masked language modeling (MLM)-based approaches, such as DNABERT and Nucleotide Transformer (NT), achieve strong performance, they are hindered by inefficiencies due to partial token supervision, pre-training/fine-tuning mismatches, and high computational costs. We introduce NucEL, the first ELECTRA-style pre-training framework for genomic foundation models, which overcomes these challenges. Through a discriminator network identifying tokens modified by a generator, NucEL achieves comprehensive token-level supervision across all sequence positions, thereby markedly improving training efficiency relative to the partial supervision of masked positions inherent in MLM frameworks. 
By integrating ModernBERT’s architectural advancements, including hybrid local-global attention and flash attention mechanisms, NucEL establishes an optimized BERT architecture for genomic sequence modeling. Unlike traditional methods that tokenize genomic sequences into 6-mers, NucEL implements single-nucleotide tokenization, enabling fine-grained resolution and improving both efficiency and interpretability. Pre-trained on the human genome only, NucEL achieves state-of-the-art performance on benchmark datasets across diverse downstream tasks in both human and non-human species, including regulatory element identification (e.g., promoters, enhancers), transcription factor binding prediction in human and mouse, open chromatin region classification, and histone modification profiles, surpassing MLM-based models of similar size and rivaling models 25 times larger, such as NT. Ablation studies provide critical insights into tokenization and masking strategies, optimizing ELECTRA-style pretraining for DNA sequences. Attention analyses reveal NucEL’s superior ability to capture biologically relevant sequence motifs compared to NT, offering valuable insights into its hierarchical learning process and regulatory element modeling capabilities. This work highlights the potential of ELECTRA-style pretraining as an efficient and effective strategy for advancing genomic representation learning with broad implications for future genomic research. 1 Introduction Understanding genomic sequences is a fundamental challenge in biology, as DNA encodes the instructions for cellular function, development, and evolution. Large language models (LLMs), which have transformed natural language processing (NLP), are now being adapted for biological sequences, showing remarkable success across genomics, transcriptomics, proteomics, metagenomics, and epigenomics [ Dalla-Torre et al., 2025 , Sasse et al., 2024 , Jumper et al., 2021 , Abramson et al., 2024 ]. 
Genomic language models (gLMs) are particularly promising for decoding the functional logic of genomes, revealing how DNA sequences are organized and interact to produce the complexity of biological systems. Traditional deep learning methods for genomics, such as convolutional neural networks (CNNs) [ LeCun et al., 2015 ] and recurrent neural networks (RNNs) [ Hochreiter and Schmidhuber, 1997 ], are limited: CNNs struggle with long-range dependencies in DNA sequences, while RNNs process sequences inefficiently. Transformer-based architectures overcome these shortcomings by leveraging pre-training on vast unlabeled genomic datasets to learn meaningful representations of functional elements such as promoters and enhancers, facilitating effective transfer learning for downstream tasks [ Devlin et al., 2019 ]. However, existing transformer-based models, such as DNABERT [ Ji et al., 2021 ] and Nucleotide Transformer [ Dalla-Torre et al., 2025 ], rely on masked language modeling (MLM), which is computationally inefficient. MLM learns from only a small subset of the input—typically 15% of masked tokens—underutilizing the majority of the sequence during training [ Clark et al., 2020 ]. Additionally, the pre-training (with masking) and fine-tuning (without masking) mismatch can impair transfer learning efficacy. To address these limitations, we introduce NucEL, the first genomic foundation model to adopt the ELECTRA framework’s replaced token detection (RTD) objective [ Clark et al., 2020 ]. Unlike MLM, RTD employs a generator-discriminator architecture, enabling dense token-level supervision across all input positions and significantly boosting training efficiency. NucEL integrates ModernBERT’s architectural innovations [ Warner et al., 2024 ], including hybrid local-global attention [ Beltagy et al., 2020 ] and flash attention mechanisms [ Dao, 2023 ], to capture both short and long-range genomic dependencies effectively. 
Unlike k-mer [ Dalla-Torre et al., 2025 , Ji et al., 2021 ] or BPE tokenization [ Zhou et al., 2023 , Sanabria et al., 2024 ], NucEL uses single-nucleotide tokenization, which may yield more tokens for long sequences but ensures fine-grained resolution and interpretability [ Dotan et al., 2024 ]. Combining single-nucleotide resolution with RTD optimizes efficiency and precision: it preserves base-level detail for tasks like mutation analysis, while dense supervision and hybrid attention mitigate the computational cost of longer sequences and enhance long-range interaction modeling. We evaluate NucEL across a range of benchmark tasks—including promoter classification, transcription factor binding prediction, and epigenetic mark profiling—and demonstrate that it achieves state-of-the-art performance while using significantly fewer parameters than larger MLM-based models. Our contributions include: Introduced the first ELECTRA-style genomic foundation model, leveraging Replaced Token Detection (RTD) to enhance training efficiency over MLM-based approaches by utilizing all input tokens. Achieved state-of-the-art performance on genome foundation model benchmarks with significantly fewer parameters than MLM-based models. Utilized single-nucleotide resolution and fine-grained attention analyses to enable precise genomic modeling while enhancing interpretability by revealing complex biological representations. Integrated ModernBERT’s hybrid local-global attention and flash attention mechanisms to efficiently capture genomic dependencies while optimizing computational performance. Our results demonstrate that NucEL provides a practical and efficient approach to learning biologically meaningful genomic representations, advancing genomic language modeling while reducing computational demands, thus making advanced genomic models more accessible to the broader research community. 
2 Related Work Genomic sequence modeling has evolved rapidly with deep learning, particularly through transformer-based models adapted from natural language processing. Here, we review key developments in genomic foundation models, pretraining objectives and tokenization strategies, positioning NucEL within this landscape. 2.1 Genomic Language Models Transformer-based models have become a cornerstone of genomic representation learning. DNABERT [ Ji et al., 2021 ] pioneered this approach by adapting BERT’s masked language modeling (MLM) to DNA sequences, using k -mer tokenization to predict masked tokens for tasks like promoter prediction. DNABERT-2 [ Zhou et al., 2023 ] refined this with Byte Pair Encoding (BPE) tokenization and architectural improvements, enhancing performance on genomic benchmarks. The Nucleotide Transformer (NT) series [ Dalla-Torre et al., 2025 ] scaled this paradigm to billions of parameters, pretrained on multi-species datasets, demonstrating strong transfer learning capabilities. Additionally, ModernBERT [ Warner et al., 2024 ] introduced architectural innovations such as hybrid local-global attention and flash attention mechanisms, improving transformer efficiency for sequence modeling. However, MLM-based models are limited by partial token supervision (only ∼ 15% of tokens) and a pretraining-finetuning mismatch, reducing efficiency. Alternatives to transformers include state-space models (SSMs) like HyenaDNA [ Nguyen et al., 2023 ] and Caduceus [ Schiff et al., 2024 ], which use single-nucleotide resolution and reverse-complement awareness for efficient long-sequence modeling. Convolutional neural networks (CNNs), such as DeepBind [ Alipanahi et al., 2015 ] and Enformer [ Avsec et al., 2021 ], remain relevant for capturing local motifs but struggle with long-range dependencies compared to transformers. 2.2 Pretraining Objectives Pretraining objectives shape the efficiency and effectiveness of genomic models. 
MLM, used in DNABERT [ Ji et al., 2021 ] and NT [ Dalla-Torre et al., 2025 ], masks a subset of tokens for prediction, but its sparse supervision leaves most of the forward pass unutilized. Next-Sentence Prediction (NSP), used alongside MLM in BERT [ Devlin et al., 2019 ], is rarely adopted in genomics due to DNA’s lack of paragraph-like structure, offering limited benefits [ Poli et al., 2023 ]. In contrast, Replaced Token Detection (RTD), introduced by ELECTRA [ Clark et al., 2020 ], uses a generator-discriminator framework to provide dense supervision across all tokens, significantly improving efficiency. RTD has proven effective in NLP (e.g., DeBERTa-v3 [ He et al., 2021 ]), but its potential in genomics remains untapped, motivating NucEL’s approach. 2.3 Tokenization Strategies Tokenization critically shapes a model’s ability to represent genomic sequences. K -mer tokenization, as used in DNABERT [ Ji et al., 2021 ], compresses sequences by grouping nucleotides into k -length substrings but fractures single-nucleotide context, exponentially inflating vocabulary size and diluting base-level detail. Byte Pair Encoding (BPE), adopted by DNABERT-2 [ Zhou et al., 2023 ] and GROVER [ Sanabria et al., 2024 ], learns variable-length subwords to balance efficiency and motif capture but can obscure single-nucleotide variations and introduce inconsistencies for similar sequences. Single-nucleotide tokenization, as in HyenaDNA [ Nguyen et al., 2023 ] and Caduceus [ Schiff et al., 2024 ], preserves base-level resolution and avoids vocabulary explosion, making it ideal for tasks requiring fine-grained precision, such as modeling point mutations and identifying regulatory motifs (e.g., transcription factor binding motifs). 2.4 Positioning NucEL While MLM-based models perform well on downstream tasks, their inefficiencies in supervision and computational demands drive the need for alternatives. 
NucEL introduces the first ELECTRA-style framework for genomic modeling, leveraging RTD to achieve dense supervision and superior sample efficiency compared to MLM-based approaches. By integrating ModernBERT’s hybrid attention and flash attention mechanisms [ Warner et al., 2024 ] and adopting single-nucleotide tokenization, NucEL maximizes interpretability and efficiency. This enables state-of-the-art performance with fewer parameters, positioning NucEL as a scalable and effective solution for genomic representation learning. 3 Methods This section describes NucEL’s training strategies, model architecture, pre-training data and tokenization, optimization, and fine-tuning approach. 3.1 Training Strategies NucEL uses an RTD objective, training two transformer encoders in a generator-discriminator framework. The generator, a smaller transformer, performs MLM to predict masked tokens, while the discriminator, a larger transformer, identifies whether tokens are original or replaced by the generator. This provides dense supervision across all tokens, enhancing sample efficiency. Quantitatively, RTD supervises all N tokens in a sequence of length N , unlike MLM’s sparse supervision of r · N tokens (e.g., r = 0.15), yielding a supervision ratio of approximately 6.67 (i.e., 1/ r ). The total loss combines the generator’s MLM loss and the discriminator’s RTD loss, weighted by λ = 50.0: $$\mathcal{L}_{\text{total}} = \mathcal{L}_{\text{generator}} + \lambda \, \mathcal{L}_{\text{discriminator}},$$ where $\mathcal{L}_{\text{generator}}$ is the cross-entropy over masked tokens, and $\mathcal{L}_{\text{discriminator}}$ is the binary cross-entropy over all tokens. For a sequence $\chi = \{x_1, x_2, \ldots, x_N\}$ with masked positions $\mathcal{M} \subseteq \{1, 2, \ldots, N\}$, the generator loss is: $$\mathcal{L}_{\text{generator}} = -\sum_{i \in \mathcal{M}} \log p_G\left(x_i \mid \chi^{\text{masked}}\right),$$ and the discriminator loss is: $$\mathcal{L}_{\text{discriminator}} = -\sum_{i=1}^{N} \left[ y_i \log D(\chi', i) + (1 - y_i) \log\left(1 - D(\chi', i)\right) \right],$$ where $\chi^{\text{masked}}$ is $\chi$ with the tokens at positions $\mathcal{M}$ replaced by [MASK], $\chi'$ is the sequence with masked tokens replaced by the generator’s predictions, $D(\chi', i)$ is the discriminator’s predicted probability that the token at position $i$ is original, and $y_i = 1$ if $x'_i = x_i$, else 0. 3.2 Model Architecture NucEL’s generator-discriminator framework is shown in Figure 1 . The generator comprises 11 transformer layers, a hidden size of 256, and 8 attention heads. 
The discriminator, NucEL’s primary model, has 22 layers, a hidden size of 512, and 16 attention heads. Both use a hybrid attention mechanism, combining local attention windows (128 tokens) with global attention every third layer, balancing efficiency and long-range dependency modeling (see details in Appendix). Download figure Open in new tab Figure 1. Overview of NucEL, the ELECTRA-style Genomic Pre-training Framework. Illustrates the two-stage ELECTRA-style pre-training framework for NucEL, depicting the generator-discriminator architecture during the pre-training stage and the supervised fine-tuning stage for downstream genomic tasks. 3.3 Pre-training Data and Tokenization NucEL was pre-trained on the human genome (GRCh38/Hg38) using single-nucleotide tokenization, where each base (A, C, G, T) is a distinct token. Sequences were extracted using an overlapping sliding window (1224 bp, 100 bp overlap), with 1024 bp segments randomly sampled during training. The vocabulary consists of 27 tokens: 4 nucleotides, 7 special tokens ([PAD], [UNK], [SEP], [CLS], [MASK], [BOS], [EOS]), and 16 reserved for future extensions (see Appendix). Ambiguous bases (N) are mapped to [UNK] or omitted, though they are rare. Single-nucleotide tokenization preserves base-level precision but increases sequence length compared to alternatives like k-mer (k=6) and Byte Pair Encoding (BPE). For a sequence of length $L$, the tokenized length for single-nucleotide tokenization is $N_{\text{1-mer}} = L$, while for k-mer tokenization with overlapping windows, it is $N_{\text{k-mer}} = L - k + 1$ (e.g., $L - 5$ for $k = 6$), and for BPE, it is approximately $N_{\text{BPE}} \approx \alpha L$, where $0.1 < \alpha < 0.5$ depending on the learned vocabulary. The vocabulary size for single-nucleotide tokenization is $|\mathcal{V}_{\text{1-mer}}| = 27$, significantly smaller than $|\mathcal{V}_{\text{k-mer}}| = 4^k$ (e.g., $4^6 = 4096$ for $k = 6$) for k-mer tokenization, and $|\mathcal{V}_{\text{BPE}}| \approx 5000$ to $8000$ for BPE. 
While single-nucleotide tokenization increases sequence length, this is mitigated by NucEL’s local attention mechanism, which reduces computational complexity while preserving base-level precision essential for fine-grained genomic tasks. 3.4 Pre-training and Fine-tuning Details NucEL was trained using the AdamW optimizer (learning rate $1 \times 10^{-4}$, $\beta_1 = 0.9$, $\beta_2 = 0.999$) for 50 epochs, with a global batch size of 192 and mixed precision (FP16) on 8 NVIDIA A100 GPUs. Training stability was ensured by a 1000-step warmup schedule and a maximum gradient norm of 1.0 (see Appendix). For downstream tasks, a linear layer was added atop the discriminator’s [CLS] token output, and the entire model was fine-tuned end-to-end. 4 Experiments We evaluated NucEL on three human genome-focused benchmark suites: Genome Understanding Evaluation (GUE) Zhou et al. [2023], Genomic Benchmarks (GB) Grešová et al. [2023], and Nucleotide Transformer (NT) Dalla-Torre et al. [2025] tasks. These cover diverse genomic tasks, including gene regulation, epigenomics, and sequence classification (see Appendix for details). Performance was assessed using the Matthews Correlation Coefficient (MCC) for all benchmarks, ensuring robust evaluation across imbalanced datasets. In addition, accuracy was reported for GB tasks. 4.1 Evaluating Regulatory Sequence Recognition on GUE Benchmark We first evaluate NucEL on the human subset of the GUE benchmark, including transcription factor binding (TF-H), promoter detection (PD), core promoter detection (CPD), and splice site prediction (SSP). Following DNABERT2’s protocol [ Zhou et al., 2023 ], we report average MCC across three random seeds, selecting the model with the lowest validation loss. Compared models include DNABERT (DB1, 89M parameters), DNABERT-2 (DB2, 117M), and Nucleotide Transformer variants (500M and 2.5B parameters, trained on human or multi-species data)[ Zhou et al., 2023 ]. 
As shown in Table 1, NucEL (93M parameters) demonstrates competitive performance across human genomic tasks, achieving top-1 results on CPD and SSP tasks. We further evaluate NucEL’s generalization ability across different species to assess its cross-domain transferability. Despite being trained exclusively on human genomic data, NucEL demonstrates remarkable performance on diverse genomic tasks across mouse (Transcription Factor binding, TF-M), yeast (Epigenetic Marks Prediction, EMP), and viral genomes (Covid Variant Classification, CVC), as shown in Table 1 . Notably, NucEL outperforms models trained on multi-species data, even though NT-multi contains 25 times more parameters than NucEL. The model achieves the highest overall average performance of 75.16 across all seven genomic tasks and sets top-2 results on 6 of 7 cross-species tasks, demonstrating that fundamental genomic patterns learned from human data can effectively generalize to other species. This cross-species transferability, combined with its parameter efficiency, highlights NucEL’s ability to capture universal genomic representations that transcend species boundaries. View this table: View inline View popup Download powerpoint Table 1: Performance comparison on GUE (MCC for all tasks except CVC which uses F1; Averaged over 3 Seeds; Best, Second-Best ) 4.2 Assessing Regulatory DNA Classification on GB Benchmark We assessed NucEL on seven human-centric regulatory sequence classification tasks from the Genomic Benchmarks dataset [ Grešová et al., 2023 ], evaluating its ability to distinguish functional DNA classes across diverse sequence lengths. Adopting Caduceus’s protocol [ Schiff et al., 2024 ], we report average accuracy and standard deviation across five random seeds, selecting the model with the lowest validation loss. Compared models include HyenaDNA, Caduceus variants (Ph and PS), and Nucleotide Transformer 2 (NT2-100M) [ Schiff et al., 2024 ]. 
Table 2 shows NucEL achieves the highest average accuracy of 89.9%, outperforming NT2-100M (89.0%) and Caduceus-Ph (88.2%), with state-of-the-art results on four tasks and second-best on two, underscoring its robustness in regulatory DNA classification. View this table: View inline View popup Download powerpoint Table 2: Performance comparison on GB (Accuracy; Averaged over 5 Seeds with Standard Deviation; Best , Second-Best ) 4.3 Profiling Genomic Element Prediction on NT Benchmark We evaluated NucEL on the revised Nucleotide Transformer benchmark [ Dalla-Torre et al., 2025 ], encompassing histone marker prediction, regulatory element identification, and splice site detection. Following Dalla-Torre et al. [2025], we report average MCC across ten random seeds. Table 3 shows NucEL (93M parameters) outperforms similarly sized models like DNABERT2 (117M) and NT2-Multi (100M), achieving state-of-the-art on 11 of 18 tasks. Its average MCC of 0.664 matches NT2-Multi (500M parameters) and slightly exceeds NT-Multi (2.5B parameters, 0.661), despite using only human genome data for pre-training, unlike the multi-species data used by DNABERT2 and NT2. NucEL’s competitive performance on chromatin profiles (e.g., histone markers) is detailed in the Appendix. With five times fewer parameters than NT2-500M and 27 times fewer than NT-Multi 2.5B, NucEL demonstrates exceptional efficiency and performance. View this table: View inline View popup Download powerpoint Table 3: Performance comparison on NT (MCC; Averaged over 10 Seeds with Standard Deviation; Best , Second-Best ) 4.4 Impact of Tokenization Strategies Tokenization significantly impacts genomic language model performance and efficiency. Figure 2(A) compares NucEL’s performance across tokenization schemes—single-nucleotide ( k = 1), k-mer ( k = 6), and Byte Pair Encoding (BPE)—over training epochs on GUE datasets. 
Single-nucleotide tokenization outperforms k-mer and BPE, preserving base-level detail critical for genomic tasks. K-mer tokenization, as used in NT-500M-1000g, reduces sequence length but dilutes single-nucleotide context, compromising performance. BPE, employed by DNABERT2, improves computational efficiency but obscures single-nucleotide variations and risks inconsistent tokenization for similar sequences due to mutation-induced boundary shifts. Paired with ELECTRA’s RTD objective, single-nucleotide tokenization enables efficient learning across all sequence positions, yielding superior representations. Download figure Open in new tab Figure 2. (A) Performance Across Tokenization Schemes. Performance of NucEL with different tokenization schemes (single-nucleotide, k-mer with k=6, and BPE) across training epochs on human GUE datasets, highlighting the superior representation learning of single-nucleotide tokenization. (B) Efficiency-Performance Tradeoff for Genomic Language Models . Efficiency-performance tradeoff on human GUE tasks, with GPU time (petaflop-days, log scale) on the x-axis, average MCC on the y-axis, and bubble size representing model parameter count. NucEL-93M (in red) achieves a strong balance, outperforming larger models like NT-multi-2.5B. 4.5 Efficiency-Performance Tradeoff Analysis We analyzed NucEL’s efficiency-performance tradeoff against BERT-based genomic models, focusing on training time, performance (MCC), and model size. Figure 2(B) illustrates this tradeoff on GUE tasks, with GPU time (petaflop-days, log scale) on the x-axis, average MCC on the y-axis, and bubble size representing parameter count. NucEL-93M (red) achieves a superior balance, outperforming larger models like NT-multi-2.5B and NT-1000g-2.5B, which are approximately 25 times larger in parameter count and require over 100 times more computational resources. 
Compared to models with similar computational demands (e.g., NT-1000g-500M, NT-human-500M), NucEL delivers over 10% higher MCC. This highlights the ELECTRA-style framework’s ability to provide robust genomic understanding with significantly reduced computational and parameter requirements. 4.6 Embedding Analysis by Gene Biotype To assess NucEL’s ability to capture biological signals, we extracted sequences from the most prevalent Ensembl biotypes and generated embeddings using NucEL, HyenaDNA (small 32K), DNABERT2 (117M), and NT2-100M. We visualized embeddings using t-SNE ( Figure 3 ) and trained an XGBoost classifier to predict biotypes, evaluating performance with macro, micro, and weighted F1 scores ( Table 4 ). NucEL achieved the highest F1 scores across all metrics, demonstrating superior separation of biotype classes, particularly for small RNAs (red circle in Figure 3 ). Notably, lincRNA and protein-coding genes, which are often hard to distinguish due to sequence similarities and shared exon-intron structures, were effectively clustered together in NucEL’s embeddings, highlighting its robust representation of complex biological relationships compared to competing models. View this table: View inline View popup Download powerpoint Table 4: Embedding quality Weighted F1 classification score on biotypes. Download figure Open in new tab Figure 3. t-SNE Visualization of Gene Biotype Embeddings. t-SNE visualizations of embeddings generated by NucEL, HyenaDNA, DNABERT2, and NT2-100M for the prevalent biotypes, demonstrating NucEL’s superior separation. 4.7 Visualizing Attention and Model Interpretability Predictive models for genomic sequences aim to uncover novel biological insights through interpretable analyses, surpassing traditional probabilistic enrichment studies by capturing complex interactions between regulatory elements. 
To evaluate model interpretability, we designed a synthetic “motif-order” classification task featuring two randomly placed motifs, A and B, where sequences with Motif A before B are positive and B before A are negative. This task mimics biological scenarios, such as RNA-binding proteins binding in a specific order to activate cis-regulatory elements. The motifs share identical base composition and are separated by a random gap, forcing the model to learn long-range interactions rather than relying on simple motif detection. See Appendix for detailed methods and results. We compared NucEL and NT2-100M on this motif-order task, with both models fine-tuned to near-perfect accuracy. Despite similar predictive performance, their interpretability differed substantially. We extracted attention weights from the global attention layers (layers 0, 3, 6, 9, 12, 15, 18, and 21) and computed averaged attention maps across 100 test sequences aligned at the motif positions. Figure 4 shows the signal-to-noise ratio (SNR) at true motif locations, quantifying how strongly attention signals rise above background noise. NucEL consistently exhibits higher SNRs across global layers, indicating more effective localization of informative features. Attention heatmaps (see Appendix) further reveal that NucEL focuses sharply on the embedded motif regions, while NT2-100M displays more diffuse attention with elevated background activity. Quantitatively, NucEL achieves substantial improvements in maximum SNR—65% higher for Motif A and 152% higher for Motif B—highlighting its superior capacity to distinguish motif signals from noise. Since high SNR is critical for minimizing false discovery rates in regulatory motif identification, these findings underscore NucEL’s strength in fine-grained genomic modeling and its ability to capture long-range regulatory dependencies with greater precision and interpretability than existing models. Download figure Open in new tab Figure 4. 
Signal-to-Noise Ratio (SNR) Comparison: Comparison of SNR for attention weights in NucEL and NT2-100M across global attention layers on the motif-order classification task, showing NucEL’s superior performance in detecting Motif A and Motif B with higher SNR values, indicating clearer motif identification and reduced background noise compared to NT2-100M. 5 Limitations While NucEL achieves state-of-the-art performance on human-genome benchmarks and demonstrates strong zero-shot generalization to non-human species (e.g., mouse, yeast, virus), its current pretraining remains confined to the human genome. Expanding pretraining to include diverse taxonomic groups (e.g., invertebrates, microbes) could enhance cross-species transferability, particularly for clade-specific regulatory elements or non-conserved motifs. This is a promising direction for future work. 6 Conclusion This study introduces NucEL, the first ELECTRA-style pre-training framework for genomic sequences, shifting from the conventional masked language modeling (MLM) paradigm. Extensive experiments demonstrate that NucEL’s replaced token detection (RTD) approach combined with single-nucleotide tokenization surpasses MLM, offering superior efficiency, accuracy, and interpretability in genomic representation learning. NucEL’s innovations include: (1) an ELECTRA-style generator-discriminator architecture for dense token-level supervision; (2) single-nucleotide tokenization, enabling precise modeling of fine-grained genomic features; and (3) ModernBERT’s hybrid attention mechanisms, capturing both local and global genomic dependencies. Combining single-nucleotide resolution with RTD enhances efficiency and precision, preserving base-level detail while dense supervision and hybrid attention mitigate computational costs. These advancements allow NucEL to achieve state-of-the-art performance across diverse genomic benchmarks with fewer parameters than competing models. 
Enhanced by fine-grained tokenization and attention mechanisms, NucEL provides clear insights into genomic features driving predictions, improving model interpretability. This work advances genomic language modeling by delivering a computationally efficient and interpretable tool, demonstrating the ability of new pre-training strategies to extract additional information in genomic analysis. Acknowledgments and Disclosure of Funding This work was supported by the Australian Research Council Centre of Excellence for the Mathematical Analysis of Cellular Systems (CE230100001), and the Talo Scholarship and the Talo Innovative Grant funded by Taiyang Zhang and Loong Wang. The authors acknowledge computational resources provided by the National Computational Infrastructure (NCI), Argonne National Laboratory, and Google Cloud, as well as the NCI HPC-AI Talent Program. We are especially grateful to Dr. Jingbo Wang and Dr. Arvind Ramanathan for their valuable discussions and constructive suggestions, which contributed substantially to this work. Funder Information Declared Australian Research Council Centre of Excellence for the Mathematical Analysis of Cellular Systems References ↵ Josh Abramson , Jonas Adler , Jack Dunger , Richard Evans , Tim Green , Alexander Pritzel , Olaf Ronneberger , Lindsay Willmore , Andrew J Ballard , Joshua Bambrick , et al. Accurate structure prediction of biomolecular interactions with alphafold 3 . Nature , 630 ( 8016 ): 493 – 500 , 2024 . OpenUrl CrossRef PubMed ↵ Babak Alipanahi , Andrew Delong , Matthew T Weirauch , and Brendan J Frey . Predicting the sequence specificities of dna-and rna-binding proteins by deep learning . Nature biotechnology , 33 ( 8 ): 831 – 838 , 2015 . OpenUrl CrossRef PubMed ↵ Žiga Avsec , Vikram Agarwal , Daniel Visentin , Joseph R Ledsam , Agnieszka Grabska-Barwinska , Kyle R Taylor , Yannis Assael , John Jumper , Pushmeet Kohli , and David R Kelley . 
Effective gene expression prediction from sequence by integrating long-range interactions . Nature methods , 18 ( 10 ): 1196 – 1203 , 2021 . OpenUrl PubMed ↵ Iz Beltagy , Matthew E Peters , and Arman Cohan . Longformer: The long-document transformer . arXiv preprint arxiv: 2004.05150 , 2020 . ↵ Kevin Clark , Minh-Thang Luong , Quoc V Le , and Christopher D Manning . Electra: Pre-training text encoders as discriminators rather than generators . arXiv preprint arxiv: 2003.10555 , 2020 . ↵ Hugo Dalla-Torre , Liam Gonzalez , Javier Mendoza-Revilla , Nicolas Lopez Carranza , Adam Henryk Grzywaczewski , Francesco Oteri , Christian Dallago , Evan Trop , Bernardo P de Almeida , and Hassan Sirelkhatim . Nucleotide transformer: building and evaluating robust foundation models for human genomics . Nature Methods , 22 ( 2 ): 287 – 297 , 2025 . ISSN 1548-7091 . URL https://pmc.ncbi.nlm.nih.gov/articles/PMC11810778/ . OpenUrl PubMed ↵ Tri Dao . Flashattention-2: Faster attention with better parallelism and work partitioning . arXiv preprint arxiv: 2307.08691 , 2023 . ↵ Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . Bert: Pre-training of deep bidirectional transformers for language understanding . In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers) , pages 4171 – 4186 , 2019 . ↵ Edo Dotan , Gal Jaschek , Tal Pupko , and Yonatan Belinkov . Effect of tokenization on transformers for biological sequences . Bioinformatics , 40 ( 4 ): btae196 , 2024 . OpenUrl CrossRef PubMed ↵ Katarína Grešová , Vlastimil Martinek , David Čechák , Petr Šimeček , and Panagiotis Alexiou . Genomic benchmarks: a collection of datasets for genomic sequence classification . BMC Genomic Data , 24 ( 1 ): 25 , 2023 . OpenUrl PubMed ↵ Pengcheng He , Jianfeng Gao , and Weizhu Chen . 
Debertav3: Improving deberta using electra-style pre-training with gradient-disentangled embedding sharing . arXiv preprint arxiv: 2111.09543 , 2021 . ↵ Sepp Hochreiter and Jürgen Schmidhuber . Long short-term memory . Neural computation , 9 ( 8 ): 1735 – 1780 , 1997 . OpenUrl CrossRef PubMed Web of Science ↵ Yanrong Ji , Zhihan Zhou , Han Liu , and Ramana V Davuluri . Dnabert: pre-trained bidirectional encoder representations from transformers model for dna-language in genome . Bioinformatics , 37 ( 15 ): 2112 – 2120 , 2021 . ISSN 1367-4803 . OpenUrl CrossRef PubMed ↵ John Jumper , Richard Evans , Alexander Pritzel , Tim Green , Michael Figurnov , Olaf Ronneberger , Kathryn Tunyasuvunakool , Russ Bates , Augustin Žídek , Anna Potapenko , et al. Highly accurate protein structure prediction with alphafold . nature , 596 ( 7873 ): 583 – 589 , 2021 . OpenUrl CrossRef PubMed ↵ Yann LeCun , Yoshua Bengio , and Geoffrey Hinton . Deep learning . nature , 521 ( 7553 ): 436 – 444 , 2015 . OpenUrl CrossRef PubMed ↵ Eric Nguyen , Michael Poli , Marjan Faizi , Armin Thomas , Michael Wornow , Callum Birch-Sykes , Stefano Massaroli , Aman Patel , Clayton Rabideau , and Yoshua Bengio . Hyenadna: Long-range genomic sequence modeling at single nucleotide resolution . Advances in neural information processing systems , 36 : 43177 – 43201 , 2023 . OpenUrl ↵ Michael Poli , Stefano Massaroli , Eric Nguyen , Daniel Y Fu , Tri Dao , Stephen Baccus , Yoshua Bengio , Stefano Ermon , and Christopher Ré . Hyena hierarchy: Towards larger convolutional language models . In International Conference on Machine Learning , pages 28043 – 28078 . PMLR , 2023 . ↵ Melissa Sanabria , Jonas Hirsch , Pierre M Joubert , and Anna R Poetsch . Dna language model grover learns sequence context in the human genome . Nature Machine Intelligence , 6 ( 8 ): 911 – 923 , 2024 . ISSN 2522-5839 . OpenUrl ↵ Alexander Sasse , Maria Chikina , and Sara Mostafavi . 
Unlocking gene regulation with sequence-to-function models . Nature methods , 21 ( 8 ): 1374 – 1377 , 2024 . OpenUrl PubMed ↵ Yair Schiff , Chia-Hsiang Kao , Aaron Gokaslan , Tri Dao , Albert Gu , and Volodymyr Kuleshov . Caduceus: Bi-directional equivariant long-range dna sequence modeling . arXiv preprint arxiv: 2403.03234 , 2024 . ↵ Benjamin Warner , Antoine Chaffin , Benjamin Clavié , Orion Weller , Oskar Hallström , Said Taghadouini , Alexis Gallagher , Raja Biswas , Faisal Ladhak , and Tom Aarsen . Smarter, better, faster, longer: A modern bidirectional encoder for fast, memory efficient, and long context finetuning and inference . arXiv preprint arxiv: 2412.13663 , 2024 . ↵ Zhihan Zhou , Yanrong Ji , Weijian Li , Pratik Dutta , Ramana Davuluri , and Han Liu . Dnabert-2: Efficient foundation model and benchmark for multi-species genome . arXiv preprint arxiv: 2306.15006 , 2023 . View the discussion thread. Back to top Previous Next Posted August 17, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations Ke Ding , Brian John Parker , Jiayu Wen bioRxiv 2025.08.17.670700; doi: https://doi.org/10.1101/2025.08.17.670700 Share This Article: Copy Citation Tools NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations Ke Ding , Brian John Parker , Jiayu Wen bioRxiv 2025.08.17.670700; doi: https://doi.org/10.1101/2025.08.17.670700 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genomics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41911) Biophysics (21436) Cancer Biology (18578) Cell Biology (25482) Clinical Trials (138) Developmental Biology (13371) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15599) Genomics (22482) Immunology (17728) Microbiology (40363) Molecular Biology (17163) Neuroscience (88536) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00