Genomic Tokenizer: Toward a biology-driven tokenization in transformer models for DNA sequences

doi:10.1101/2025.04.02.646836

Genomic Tokenizer: Toward a biology-driven tokenization in transformer models for DNA sequences

2025 · doi:10.1101/2025.04.02.646836

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 27,061 characters · extracted from preprint-html · click to expand

Genomic Tokenizer: Toward a biology-driven tokenization in transformer models for DNA sequences | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Genomic Tokenizer: Toward a biology-driven tokenization in transformer models for DNA sequences View ORCID Profile Bell Raj Eapen doi: https://doi.org/10.1101/2025.04.02.646836 Bell Raj Eapen 1 University of Illinois Springfield , Springfield, IL 62703 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Bell Raj Eapen For correspondence: bpunn{at}uis.edu Abstract Full Text Info/History Metrics Preview PDF 
Abstract Summary Transformer models are revolutionizing sequence analysis across various domains, from natural language processing to genomics. These models rely on tokenizers to split input sequences into manageable chunks — a straightforward task in natural language but more challenging for long DNA sequences that lack distinct “words.” Most biological tokenizers are data-driven and do not align with the “central dogma of molecular biology”: DNA is transcribed into RNA, which is then translated into proteins, with each three-letter codon specifying a particular amino acid, some of which are synonymous for the amino acids they represent. Start codons signal the beginning of protein synthesis, while stop codons signal its termination. The Genomic Tokenizer (GT) incorporates this biological process flow into a standard tokenizer interface within the HuggingFace transformer package. GT can be used to pre-train foundational transformer models on DNA sequences. We compare the performance of GT with two alternate tokenization strategies and discuss its potential applications. Availability and implementation The source code of GT is available from https://github.com/dermatologist/genomic-tokenizer under the MPL-2.0 license. It can be installed from Python Package Index (PyPI) and used as a tokenizer in transformer model training pipelines. 1 Introduction The success and popularity of the use of transformer-based large language models (LLMs) in various domains have led to the exploration of the transformer architecture and attention mechanism in genomic sequence analysis. However, genomic sequences, whether DNA, RNA, or protein, vary from a sequence of words in human languages in many ways. Genomic sequences, unlike human language, lack distinct words, are highly repetitive, and have a complex structure that includes genes and regulatory elements. 
Despite these differences, models leveraging transformer architectures can still be beneficial for genomic sequence analysis due to their ability to capture complex patterns and dependencies within the sequence data. The attention mechanism [ 1 ] allows models to focus on relevant parts of genomic sequences, which is crucial when dealing with large-scale biological datasets where specific regions may hold critical information regarding gene expression or mutations associated with diseases. However, genomic sequences can be extremely long, making it computationally expensive and memory-intensive for standard transformers to process them in their entirety. The initial step in processing any sequence with a transformer architecture is to divide the sequence into small, manageable chunks known as tokens. Unlike natural language, which can be tokenized based on words, tokenizing DNA sequences presents unique challenges as meaningful units are not clearly defined. A simple character tokenization method is to consider each nucleotide (A, G, C, and T) as a single token. A, G, C, and T stand for the nucleotides: adenine, guanine, cytosine, and thymine, respectively, in DNA sequences. This approach makes the transformer models attend to every nucleotide and may fail to capture higher-order information, especially in long sequences. An alternative approach, k-mer tokenization, breaks the sequence into overlapping or non-overlapping segments of a predefined length k. While k-mer tokenization captures more contextual information than single-nucleotide tokenization, the optimal k value is often context-dependent and requires careful consideration. Several advanced data-driven approaches, such as byte-pair encoding (BPE) [ 2 ], have been proposed, with varying degrees of success [ 3 ]. If annotations are available, high-level features such as gene regulatory elements can be used as tokens [ 4 ]. 
Data-driven methods can be trained on co-occurrence patterns within a large corpus of DNA data [ 5 ]. However, most techniques are resource-intensive and not widely generalizable. Developing algorithms for efficient tokenization of DNA remains an active area of research, and the choice of tokenization strategy depends heavily on the specific downstream application. According to the central dogma of molecular biology [ 6 ], DNA is first transcribed into RNA (specifically mRNA), which then serves as a template for protein synthesis during translation. Codons are sequences of three nucleotides (k=3) that correspond to a specific amino acid or a stop signal during protein synthesis. Transcription begins with a start codon (typically AUG) and continues through several codons (reading frames), each representing specific amino acids, till one of the three stop codons is encountered. Some of the codons are synonymous as they code for the same amino acid. Exons are the gene’s coding regions that are expressed, whereas introns are non-coding regions that are excised during RNA processing. A mutation is a change in the DNA sequence that can lead to changes in protein structure and function. Addition or deletion of even a single nucleotide can shift reading frames, leading to extensive changes in the resulting protein. Single nucleotide polymorphisms (SNPs) are variations at a single position in the DNA sequence among individuals. These variations can influence gene expression and contribute to diverse phenotypic traits and diseases. In summary, the biology of DNA sequences is fundamentally different from the structure of word sequences. Genomic Tokenizer (GT) incorporates start codons, synonymous codons, and stop codons into a tokenizer interface of the HuggingFace transformer package [ 7 ], giving it the ability to handle shifts in reading frames caused by nucleotide additions or deletions within DNA sequences. 
By doing so, GT ensures that biological nuances inherent in genetic variations are preserved during tokenization, potentially enhancing model performance on tasks related to phenotypic predictions. 2 Materials and methods Hugging Face Transformers is an open-source Python library that provides tools for manipulating transformer models. It includes tokenizers implementing methods for splitting a string into chunks and encoding them into numerical representations (token IDs) and vice versa for input into a transformer model [ 8 ]. The core functionality takes a string or a list of strings as input and returns a dictionary containing the token IDs (input_ids) and the attention mask used to differentiate between actual data and padding. The PreTrainedTokenizer is a general-purpose Python base class [ 9 ] from which specific tokenizer classes such as BertTokenizer and GPT2Tokenizer inherit, implementing their respective tokenization strategies. Each tokenizer uses a vocabulary – a mapping between tokens and their corresponding IDs – specific to the tokenization strategy. Additionally, tokenizers manage special tokens like CLS, SEP, PAD, UNK, and MASK, each serving specific roles in transformer models. These roles include marking the start of a sequence, separating segments, padding shorter sequences, representing unknown tokens, and masking tokens for prediction tasks, respectively. The GenomicTokenizer class extends the PreTrainedTokenizer class and implements the required interfaces. The vocabulary includes all possible codons, but synonymous codons that code for the same amino acids are assigned the same IDs, effectively reducing the vocabulary size and improving the efficiency of the tokenizer. The IDs one to six are reserved for special tokens. “ATG” is assigned as the start codon and “TAA”, “TAG” and “TGA” as the stop codons. These are customizable for use in the context of certain prokaryotic genomes where they differ. 
The start codon is treated as the BOS token and the stop codons as SEP tokens. Tokenization begins at the start codon if one is identified within the sequence. If no start codon is found, tokenization defaults to the beginning of the sequence. If a stop codon is found, all subsequent codons are marked as UNK tokens until another start codon is encountered. UNK tokens at the end are trimmed off so that padding can be applied as required. This process ensures that only the presumed coding sequence, delineated by start and stop codons, is processed and assigned meaningful tokens. The UNK tokens are attended to, but do not contribute towards loss calculation. The coding of introns as UNK tokens can be turned off to reduce the token count further. The tokenization algorithm is summarized in Table 1 . View this table: View inline View popup Download powerpoint Table 1: The tokenization algorithm 3 Result The complexity of genomic data and the variability of datasets and downstream tasks make comparison of tokenizers difficult. We present a preliminary comparison of GT with two other tokenizers: HyenaDNA’s character tokenizer [ 10 ] (henceforth CT) with a vocabulary size of 4 and DNABERT-2’s [ 5 , 11 ] data-driven adaptation of SentencePiece [ 12 ] (henceforth BPE) with a vocabulary size of 2048. We used GV-Rep, a pipeline for creating clinician-verified genetic variant sequences of specified lengths from a reference genome [ 13 ]. Using GV-Rep, we generated a subset of sequences linked to lung cancer as positive samples and an equal number of sequences representing non-lung cancer conditions as negative samples from ClinVar (a public archive of reported variants associated with diseases) [ 14 ], with sequence lengths of 512, 1024, 2048 and 4096, for the comparison of tokenizers. Using this generated dataset, we trained a simplified BERT architecture for sequence classification from scratch. 
We used the following hyperparameters throughout: a learning rate of 3e-4 and a batch size of 12, with 3 hidden layers and 3 attention heads each. The hidden layer dimension was 192 and the optimizer AdamW was used to train the model for one epoch. We compared three classification evaluation metrics: Accuracy (ACC), Area Under the Curve (AUC) and Matthews Correlation Coefficient (MCC) for CT, BPE and GT tokenizers using the BERT model and dataset described above. Accuracy (ACC) measures the proportion of true positive and true negative predictions among the total number of cases, reflecting the overall correctness of a model. The Area Under the Curve (AUC) of the Receiver Operating Characteristic (ROC) curve quantifies the ability of a model to distinguish between positive and negative classes, with higher values indicating better discrimination. Matthews Correlation Coefficient (MCC) measures the quality of binary classifications, especially for unbalanced datasets. It ranges from -1 to +1, where +1 means perfect prediction, 0 means random guessing, and -1 means total disagreement between predictions and actual outcomes. Data-driven BPE performed best in this task, though direct comparison of tokenizers is unfair due to the differences in vocabulary size, which in turn lead to minor changes in the BERT architecture used for comparison. A larger vocabulary size typically results in an increased number of trainable parameters, as each distinct word in the vocabulary needs its own embedding vector. The CT’s performance steadily decreased with increasing sequence length, exhibiting a clear sensitivity to the length of the input sequence. This suggests that the fixed-length representation inherent in character-level tokenization struggles to capture the long-range dependencies crucial for understanding longer sequences. In contrast, GT demonstrated greater robustness to variations in sequence length (see Figure 1 B-D ). Download figure Open in new tab Figure 1. 
Comparison showing A: Number of tokens generated and B,C,D: Accuracy (ACC), Matthews Correlation Coefficient (MCC) and Area Under Curve (AUC) for BPE, GT and CT tokenizers against the sequence length. Additionally, we have compared the average token count for each tokenizer. CT treats each character as a separate token, while BPE tokenization merges frequent character pairs iteratively, creating tokens that represent more than one character. GT adopts a 3-mer tokenization that accounts for synonymous codons and exons. As expected, the average token count of GT was lower compared to the character tokenizer but higher than BPE (see Figure 1A ). Furthermore, a simplified BERT model for masked language modeling (MLM) demonstrated the highest accuracy for CT, the lowest for BPE, and intermediate results for GT, consistent with the vocabulary size for each tokenizer. 4 Discussion The focus of data driven tokenizers is on finding repetitive elements in DNA sequences reducing the number of tokens to process. However, this leads to an increase in the size of the dictionary making them computationally resource intensive compared to non-data driven tokenizers such as CT or k-mer tokenizers. Furthermore, despite the apparent simplicity of having only four nucleotides as alphabets, the information that DNA encodes is highly variable across organisms, chromosomes and genomic regions, many of which are still unknown. GT’s biology-driven algorithm has the potential to maintain compact vocabulary, which significantly enhances computational efficiency. Additionally, it may capture long token dependencies crucial for modeling the complex and intricate relationships within DNA sequences. This needs to be confirmed by using GT for foundational model training. Overlapping k-mer tokenization leads to considerable redundancy and information leakage in masked language modelling (MLM) when adjacent tokens are not masked. 
As the vocabulary size increases, the search space in MLM increases along with the computational complexity. GT does not have either of these potential drawbacks. Specific mutations or genetic variations in the DNA sequence can affect an organism’s phenotype (observable characteristics). These alterations can range from single nucleotide changes (point mutations) to large-scale chromosomal rearrangements. Even single nucleotide polymorphisms (SNPs) and mutations can significantly influence biological properties [ 15 ]. Point mutations can be the substitution, insertion, or deletion of a nucleotide. If the substitution is “synonymous”, it will not alter the amino acid sequence of the resulting protein due to the redundancy of the genetic code. A missense change, resulting in a different amino acid being incorporated into the protein, can have varying effects, from no noticeable change to complete loss of function of the protein. A “nonsense” mutation creating a premature stop codon leads to a truncated protein. Insertion or deletion of one or more nucleotides can cause a frameshift, altering the downstream amino acid sequence and leading to a completely different protein. Understanding specific mutations and genetic variations is crucial for diagnosing and treating genetic diseases, developing personalized medicine, and understanding evolutionary processes. GT can capture these biological variations during model training. Additionally, start and stop codons can be customized to accommodate variations in mitochondrial genomes and certain prokaryotic organisms. Furthermore, customizable intron encoding enhances its utility for various tasks. No single tokenization strategy is optimal for all datasets [ 3 ], and the optimal choice depends on several factors such as the dataset, task, and the computational resources available. GT may be useful in existing architectures such as the convolutional long-context model of HyenaDNA [ 10 ]. 
Ultimately, a thorough comparative analysis across various tokenization methods, vocabulary sizes, and base models is crucial for determining the most effective approach for a given application. We offer GT to the open-source community, encouraging exploration with diverse datasets and tasks. 5 Acknowledgements We gratefully acknowledge the infrastructural support provided by Orion Lab at the University of Illinois Springfield (UIS). Footnotes ↵ * Department of Management Information Systems; College of Business & Management. References [1]. ↵ Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , Łukasz Kaiser , and Illia Polosukhin . Attention is All you Need . In Advances in Neural Information Processing Systems , volume 30 . Curran Associates, Inc ., 2017 . URL https://papers.nips.cc/paper_files/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html . [2]. ↵ Paul McNamee and James Mayfield . Character N-Gram Tokenization for European Language Text Retrieval . Information Retrieval , 7 ( 1 ): 73 – 97 , January 2004 . ISSN 1573-7659 . doi: 10.1023/B:INRT.0000009441.78971.be . URL https://doi.org/10.1023/B:INRT.0000009441.78971.be. OpenUrl CrossRef [3]. ↵ Edo Dotan , Gal Jaschek , Tal Pupko , and Yonatan Belinkov . Effect of tokenization on transformers for biological sequences . Bioinformatics , 40 ( 4 ): btae196 , April 2024 . ISSN 1367-4811 . doi: 10.1093/bioinformatics/btae196 . URL https://doi.org/10.1093/bioinformatics/btae196. OpenUrl CrossRef PubMed [4]. ↵ Zijing Gao , Qiao Liu , Wanwen Zeng , Rui Jiang , and Wing Hung Wong . EpiGePT: a pretrained transformer-based language model for context-specific human epigenomics . Genome Biol , 25 ( 1 ): 310 , December 2024 . ISSN 1474-760X . doi: 10.1186/s13059-024-03449-7 . URL https://doi.org/10.1186/s13059-024-03449-7. OpenUrl CrossRef PubMed [5]. ↵ Zhihan Zhou , Yanrong Ji , Weijian Li , Pratik Dutta , Ramana Davuluri , and Han Liu . 
DNABERT-2: Efficient Foundation Model and Benchmark For Multi-Species Genome , March 2024 . URL http://arxiv.org/abs/2306.15006 . arxiv:2306.15006 [q-bio]. [6]. ↵ Francis Crick . Central Dogma of Molecular Biology . Nature , 227 ( 5258 ): 561 – 563 , August 1970 . ISSN 1476-4687 . doi: 10.1038/227561a0 . URL https://www.nature.com/articles/227561a0 . Publisher: Nature Publishing Group . OpenUrl CrossRef PubMed Web of Science [7]. ↵ Shashank Mohan Jain Shashank Mohan Jain. Hugging Face . In Shashank Mohan Jain , editor, Introduction to Transformers for NLP: With the Hugging Face Library and Models to Solve Problems , pages 51 – 67 . Apress, Berkeley, CA , 2022 . ISBN 978-1-4842-8844-3 . doi: 10.1007/978-1-4842-8844-3_4 . URL https://doi.org/10.1007/978-1-4842-8844-3_4. OpenUrl CrossRef [8]. ↵ Hugging Face . Tokenizers ,. URL https://huggingface.co/docs/tokenizers/index . [9]. ↵ Hugging Face . Tokenizer ,. URL https://huggingface.co/docs/transformers/main_classes/tokenizer . [10]. ↵ Eric Nguyen , Michael Poli , Marjan Faizi , Armin Thomas , Callum Birch-Sykes , Michael Wornow , Aman Patel , Clayton Rabideau , Stefano Massaroli , Yoshua Bengio , Stefano Ermon , Stephen A. Baccus , and Chris Ré . HyenaDNA: Long-Range Genomic Sequence Modeling at Single Nucleotide Resolution , November 2023 . URL http://arxiv.org/abs/2306.15794 . arxiv:2306.15794 [cs]. [11]. ↵ Yanrong Ji , Zhihan Zhou , Han Liu , and Ramana V Davuluri . DNABERT: pre-trained Bidirectional Encoder Representations from Transformers model for DNA-language in genome . Bioinformatics , 37 ( 15 ): 2112 – 2120 , August 2021 . ISSN 1367-4803 . doi: 10.1093/bioinformatics/btab083 . URL https://doi.org/10.1093/bioinformatics/btab083. OpenUrl CrossRef PubMed [12]. ↵ Katrin Erk and Noah A. Smith Rico Sennrich , Barry Haddow , and Alexandra Birch . Neural Machine Translation of Rare Words with Subword Units . In Katrin Erk and Noah A. 
Smith , editors, Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1715 – 1725 , Berlin, Germany , August 2016 . Association for Computational Linguistics . doi: 10.18653/v1/P16-1162 . URL https://aclanthology.org/P16-1162/ . OpenUrl CrossRef [13]. ↵ Zehui Li , Vallijah Subasri , Guy-Bart Stan , Yiren Zhao , and Bo Wang . GV-Rep: A Large-Scale Dataset for Genetic Variant Representation Learning , December 2024 . URL http://arxiv.org/abs/2407.16940 . arxiv:2407.16940 [cs]. [14]. ↵ Melissa J. Landrum , Jennifer M. Lee , George R. Riley , Wonhee Jang , Wendy S. Rubinstein , Deanna M. Church , and Donna R. Maglott . ClinVar: public archive of relationships among sequence variation and human phenotype . Nucleic Acids Res , 42 (Database issue): D980 – 985 , January 2014 . ISSN 1362-4962 . doi: 10.1093/nar/gkt1113 . OpenUrl CrossRef PubMed Web of Science [15]. ↵ Joseph Nasser , Drew T. Bergman , Charles P. Fulco , Philine Guckelberger , Benjamin R. Doughty , Tejal A. Patwardhan , Thouis R. Jones , Tung H. Nguyen , Jacob C. Ulirsch , Fritz Lekschas , Kristy Mualim , Heini M. Natri , Elle M. Weeks , Glen Munson , Michael Kane , Helen Y. Kang , Ang Cui , John P. Ray , Thomas M. Eisenhaure , Ryan L. Collins , Kushal Dey , Hanspeter Pfister , Alkes L. Price , Charles B. Epstein , Anshul Kundaje , Ramnik J. Xavier , Mark J. Daly , Hailiang Huang , Hilary K. Finucane , Nir Hacohen , Eric S. Lander , and Jesse M. Engreitz . Genomewide enhancer maps link risk variants to disease genes . Nature , 593 ( 7858 ): 238 – 243 , May 2021 . ISSN 1476-4687 . doi: 10.1038/s41586-021-03446-x . URL https://www.nature.com/articles/s41586-021-03446-x . Publisher: Nature Publishing Group . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted April 09, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. 
NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Genomic Tokenizer: Toward a biology-driven tokenization in transformer models for DNA sequences Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Genomic Tokenizer: Toward a biology-driven tokenization in transformer models for DNA sequences Bell Raj Eapen bioRxiv 2025.04.02.646836; doi: https://doi.org/10.1101/2025.04.02.646836 Share This Article: Copy Citation Tools Genomic Tokenizer: Toward a biology-driven tokenization in transformer models for DNA sequences Bell Raj Eapen bioRxiv 2025.04.02.646836; doi: https://doi.org/10.1101/2025.04.02.646836 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17635) Bioengineering (13859) Bioinformatics (41846) Biophysics (21401) Cancer Biology (18534) Cell Biology (25422) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24285) Genetics (15582) Genomics (22463) Immunology (17700) Microbiology (40298) Molecular Biology (17141) Neuroscience (88424) Paleontology (666) Pathology (2825) Pharmacology and Toxicology (4813) Physiology (7633) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00