Full text
104,413 characters
· extracted from
preprint-html
· click to expand
Comprehensive molecular impact mapping of common and rare variants at GWAS loci | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Comprehensive molecular impact mapping of common and rare variants at GWAS loci View ORCID Profile Brad Balderson , View ORCID Profile Sanjana Tule , Mei-Lin Okino , William JF Rieger , Sierra Corban , Jeff Jaureguy , View ORCID Profile Nathan Palpant , View ORCID Profile Kyle J. Gaulton , View ORCID Profile Mikael Bodén , View ORCID Profile Graham McVicker doi: https://doi.org/10.1101/2025.06.05.658079 Brad Balderson 1 Salk Institute for Biological Studies , La Jolla CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Brad Balderson For correspondence: gmcvicker{at}salk.edu Sanjana Tule 2 School of Chemistry and Molecular Biosciences, University of Queensland , Brisbane, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sanjana Tule Mei-Lin Okino 3 Biomedical Sciences Graduate Program, University of California San Diego , La Jolla CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site William JF Rieger 2 School of Chemistry and Molecular Biosciences, University of Queensland , Brisbane, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sierra Corban 4 Department of Pediatrics, University of California San Diego , La Jolla CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jeff Jaureguy 1 Salk Institute for Biological Studies , La Jolla CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nathan Palpant 5 Institute for Molecular Biosciences , Brisbane, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nathan Palpant Kyle J. Gaulton 4 Department of Pediatrics, University of California San Diego , La Jolla CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Kyle J. Gaulton Mikael Bodén 2 School of Chemistry and Molecular Biosciences, University of Queensland , Brisbane, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mikael Bodén For correspondence: gmcvicker{at}salk.edu Graham McVicker 1 Salk Institute for Biological Studies , La Jolla CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Graham McVicker For correspondence: gmcvicker{at}salk.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Deep learning sequence to function models can predict the molecular effects of genetic variants, but their predictions are limited to the cell types and assays they are trained on. Here we describe DNACipher , a deep learning model that predicts the effects of genetic variants across diverse biological contexts—including those not directly measured. DNACipher takes 196 kb of genome sequences as input and imputes variant effects across 38,582 cell type–assay combinations. DNACipher generates predictions for >7 times as many contexts as Enformer, which allows for better detection of variant effects at expression quantitative trait loci (eQTLs). We also introduce DNACipher Deep Variant Impact Mapping (DVIM), a method to identify variants with molecular effects at genome-wide association study (GWAS) loci. Application of DVIM to type 1 diabetes (T1D) reduced the mean fine-mapping credible set size from 24 to 1.4 variants per signal. DVIM variants had significantly higher fine-mapping posterior probabilities, and their predicted effects were supported by single-nucleus ATAC-seq and luciferase assays. DVIM also detected 6547 rare variants with molecular effects at 96% of GWAS T1D loci, and these were enriched for associations with immune traits. In summary, DNACipher DVIM prioritises common and rare variants at GWAS loci by predicting molecular effects across a broad range of contexts. Introduction GWAS have identified thousands of variants associated with complex traits and diseases, however it remains difficult to distinguish causal variants of phenotype associations from non-causal variants in linkage disequilibrium (LD) 1 . Statistical fine-mapping can reduce the number of candidate variants substantially, but often leaves substantial ambiguity, especially within high LD regions. Additionally, GWAS are underpowered to detect associations with rare genetic variants, due to the small number of individuals that harbour the minor allele 1 . Moreover, how the causal variants affect genome function in specific cellular contexts is largely unknown 2 . Methods that intersect variants with functional genomic data or evolutionary sequence conservation can prioritize likely causal variants and provide biological context for their potential molecular effects 3 – 9 . However, because these approaches rely on overlap with genome annotations that are hundreds to thousands of bases long, they cannot disentangle causal variants from non-causal variants that are nearby 10 , 11 . Sequence to function (S2F) deep learning models predict genomic measurements from DNA sequence, enable single base-pair predictions, and have the potential to distinguish causal from non-causal GWAS variants at risk loci 12 – 19 . By comparing predicted signals from reference and alternative sequences, these models can also infer the effects of a genetic variant for specific cell types and assays, and can do so for both common and rare genetic variants 20 . Since most GWAS variants are non-coding 21 and may affect the expression of genes that are over 1 million bp away 22 , 23 , recent improvements to S2F models have concentrated on expanding the maximum sequence length they take as input 15 , 17 , 19 . The largest models, Borzoi 19 and Enformer 17 , have sequence inputs (‘receptive fields’) of 524 kb and 196 kb, which can be used to predict the long range regulatory effects of noncoding variants. A major limitation of current S2F models is that they can only generate predictions for the experimental assays that they are trained on. While models such as Enformer have been trained on a large number of experiments (5,313 human and 1,643 mouse) 17 , this represents only a small fraction of the experiments that could be performed on different cell types and tissues 24 , 25 . In addition, the available training data used by these models are imbalanced. For example, of the 12,869 human genomics experiments on ENCODE, 11% are DNase-seq experiments and 16% are from just two cell lines (K562 and HepG2) 26 . The sparsity of experimental genomics data has motivated methods to impute missing experiments, such as ChromImpute 25 , 27 and Avocado 24 . Avocado is a deep learning model that utilizes multiple one-hot-encoded inputs—including genome location, cell type, and assay—to predict experimental signals for these inputted biological contexts. By encoding cell type and assay as model input, Avocado can predict experimental signals for experiments that have not been performed 24 . However, Avocado does not utilise DNA sequences during prediction, and therefore cannot predict how genetic variants will affect the unobserved experiments that it imputes. Recently, several S2F models have been developed to predict unobserved experimental signals including EpiGePT 28 , Enformer Celltyping 29 , and DeepCT 30 . Both EpiGePT and Enformer Celltyping take long sequence inputs (128kb and 100kb), but they only predict signals for a small number of assays (8 and 6 assays) and require cell type-specific gene expression or chromatin accessibility to be provided as inputs. On the other hand, DeepCT makes predictions for a large number of experiments (31,760 cell type-assay combinations), however it does not predict gene expression or long-range variant effects because its input sequences are limited to 1kb. Thus, current methods for predicting experimental signals for unobserved experiments from DNA sequences are limited because they either cannot take long sequences as input or cannot impute signals for many unobserved experiments. Here, we develop a new model, DNACipher, which combes the long-sequence modelling capability of Enformer 17 with the imputation capability of Avocado 24 to make it possible to predict long-range variant effects across a broad range of observed and unobserved biological contexts ( Figure 1A-C ). After training on ENCODE data, DNACipher can predict functional variant effects in 38,582 distinct contexts (191 biological samples across 202 assays) over distances of 196 kb ( Figure 1D-E ). We validate DNACipher on held-out experiments and fine-mapped GTEx expression quantitative trait loci (eQTLs), and demonstrate that it can detect >7-fold more molecular effects of high-confidence causal-eQTLs compared to Enformer and DeepCT. Download figure Open in new tab Figure 1. Schematic illustration of DNACipher for inference of DNA variant effects for unseen cell type/assay combinations. A . Schematic of DNACipher input, which includes Enformer embeddings extracted from 196,608bp of DNA sequence, and experiments representing combinations of cell types (e.g. heart, fibroblast) and assays (e.g. RNA-seq, ATAC-seq, ChIP-seq). Cell type/assay combinations which have been experimentally observed are highlighted, while cell type/assay combinations which are imputable are blank. B . Schematic of the DNACipher model, which takes 3 inputs: one-hot encoded cell type, one-hot-encoded assay, and a DNA sequence embedding from Enformer. Linear layers parse each of the one-hot-encoded inputs independently, before they are concatenated and parsed through deeper neural network layers to predict the mean signal of the inputted sequence for the given cell type/assay combination. C . DNACipher infers variant effects for unseen cell type/assay combinations by contrasting predicted signal predictions between DNA sequence variants for all cell type and assay combinations. D . The number of predicted cell type/assay contexts for DNACipher, Enformer, and DeepCT. E . The sequence length used as context for DNACipher, Enformer, and DeepCT. F . The cell type/assay matrix of observed and unobserved experiments from ENCODE, with the figure legend indicating which experiments were used to train and test DNACipher. Most contexts are not experimentally performed, but are imputable. We also develop Deep Variant Impact Mapping (DVIM), a method to prioritize trait-associated variants based on their DNACipher-predicted effects in relevant cell types (‘impact’ variants). We apply DVIM to Type 1 diabetes (T1D) GWAS loci, and find that impact variants have significantly higher fine-mapping posterior probabilities. DVIM can be applied to both common and rare genetic variants, and we use it to identify 6547 rare impact variants at 96% of T1D GWAS loci. Results The DNACipher model takes three inputs: embeddings for the genomic sequence, one-hot encoded cell type, and one-hot encoded assay. Latent representations of cell type and assay are learned by the model, whereas the sequence embeddings are provided from another pre-trained model. These inputs are then passed into a multi-layer perceptron with ∼1.3 million parameters ( Figure 1A-B ). This design makes it possible to predict variant effects for all combinations of cell types and assays, including those which have not been observed ( Figure 1C ). In addition, because DNACipher takes sequence embeddings from pre-trained models, different sequence models can be used, including those with long sequence contexts, without requiring high compute resources. Hence, DNACipher enables long-range variant effect predictions in unobserved biological contexts and it can be trained on a single A40 GPU ( Figure 1D-E ). DNACipher-NT-Loc predicts genomic signals from DNA sequence for unobserved cell types and assays We trained several versions of the DNACipher model that take different sequence embeddings as input. For most analyses, we use a model that uses sequence embeddings from Enformer, which allows for long sequence inputs. However a potential concern is that Enformer is pre-trained on ENCODE data 17 , 26 , and data leakage could cause DNACipher’s performance to be overestimated 31 . To address this potential issue, we trained an initial version of DNACipher that does not use Enformer, and instead takes short sequence embeddings from Nucleotide Transformer (NT) 32 as well as genome locations as model inputs. We call this model DNACipher-NT-Loc ( Figure 2A-B ). NT is trained on 6kb DNA sequences using a self-supervised masked token prediction approach 32 , and is not trained on functional genomics measurements. Therefore, DNACipher-NT-Loc performance metrics on ENCODE will not be biased from the pretraining step. Similar to Avocado 24 , we encoded the genome locations of sequences on model input at multiple levels of resolution (12kb, 48kb, and 120kb). Within 12kb bins on chromosomes 2 and 16, we allocated one 6kb sequence to training and the other for testing, to evaluate model performance on held-out sequences ( Figure 2A ). In total, our sequence data consisted of ∼1.3 million training sequences and 54,440 test sequences. Download figure Open in new tab Figure 2. DNACipher-NT-Loc signal prediction for unseen sequences and experiments maintains predictive performance across diverse assay types. A . Schematic of the DNACipher-NT-Loc model inputs, which include sequence embeddings for 6kb sequences from the Nucleotide Transformer model, location of the input sequence in the genome (provided as a one-hot-encoding at 120kb, 48kb, and 12kb bins within each chromosome), and the allocation of train and test sequences within each 12kb genome bin on chromosomes 2 and 16. B . Depiction of the DNACipher-NT-Loc model, which takes the inputs depicted in A, to predict the mean genomic signal in the centre 300bp of the inputted 6kb sequence. C . Pearson’s correlation (R) between the predicted and observed genomic signals stratified by training sequences in testing (unobserved) experiments, test sequences in training (observed) experiments, and test sequences in test (unobserved) experiments. D . Pearson’s correlations of predicted versus actual signal values when stratified by assay type (x-axis) and by testing stratification. Error bars indicate the interquartile range of correlations across cell types. For test experiment stratifications, there is only one held-out experiment per assay type, so error bars are not depicted. We trained DNACipher-NT-Loc on 2,676 experiments (cell type / assay combinations), and created a test dataset consisting of one held-out cell type for each assay so that we could evaluate predictions on unobserved experiments (206 total, Figure 2A , Table S1). This data split strategy allowed us to assess prediction accuracy for combinations of observed vs. unobserved experiments and sequences. We evaluated the overall performance of DNACipher-NT-Loc across all possible experiments (cell type / assay combinations) by computing the Pearson correlation (R) between predicted and actual experimental values. We obtained Pearson correlations of R=0.68 for predicting training experiments from unobserved test sequences and R=0.67 for predicting unobserved test experiments from training sequences ( Figure 2C ). This indicates that DNACipher-NT-Loc can impute signals for unobserved experiments ( Figure 2C ). We next evaluated the performance of DNACipher-NT-Loc on unobserved (test) sequences and unobserved (test) experiments, and obtained a Pearson correlation of R=0.60 ( Figure 2C ). This demonstrates that DNACipher-NT-Loc can impute experimental signals even when the sequence and experiment has not been observed. We next quantified the performance of DNACipher-NT-Loc by assay type. For chromatin accessibility assays (ATAC-seq and DNase-seq), we observed consistently high Pearson correlations (R≈0.50) across observed/unobserved experiment and sequence stratifications ( Figure 2D , Table S2). For RNA-seq assays (total RNA-seq, mRNA-seq, and polyA depleted mRNA-seq), we observed correlations of R≈0.70 across testing stratifications ( Figure 2D , Table S2). Histone modification ChIP-seq assays were more variable. Certain marks (H3K4me3, H3K79me3, H3K9ac, H3K4me2, H3K27ac, and H3K36me3) achieved Pearson correlations of 0.40-0.64, while others achieved lower correlations of ∼0.20 ( Figure 2D , Table S2). TF ChIP-seq prediction performance was generally consistent across stratifications ( Figure 2D , Table S2), however, certain TFs had substantially reduced performance when they were predicted in unobserved cell types and sequences (PATZ1, ZBTB8A, TRIM22, ZNF274, REST, GATA3, MYNN, ZNF707, and YBX1) ( Figure 2D , Table S2). These results show that DNACipher-NT-Loc was able to infer signal measurements in unobserved cell types across a diverse range of assays, including for unobserved sequences, and has consistently high performance for chromatin and gene expression assays. DNACipher with Enformer embeddings generalizes to unobserved experiments After establishing that DNACipher-NT-Loc can predict unobserved experimental signals, we next evaluated a model that utilized Enformer embeddings, which would enable a long-sequence input and provide a potentially more informative representation. Since this is the version of the model we use for most analysis, we refer to it as simply ‘DNACipher’. With this long-sequence input model of DNACipher, we removed the genome location encoding component, and allocated all sequences on chr2 and chr16 for testing. Surprisingly, the performance metrics for DNACipher with Enformer embeddings were somewhat worse than the DNACipher-NT-Loc model, even without removing experiments that intersected the Enformer pre-training dataset ( Figure 3A ). DNACipher’s predictions achieved a correlation of R=0.57 with observations for train experiments and test sequences, R=0.64 for test experiments and train sequences, and R=0.48 for test experiments and test sequences ( Figure 3A ). This suggests that the pre-training data of the Enformer model did not improve predictive performance. Download figure Open in new tab Figure 3. DNACipher with Enformer embeddings on independent non-Enformer experiments validates generalisation to unobserved contexts. A . Pearson’s correlation (R) between the predicted and observed genomic signals stratified by training sequences in testing (unobserved) experiments, test sequences in training (observed) experiments, and test sequences in test (unobserved) experiments. B . Pearson’s correlation (R) between DNACipher predicted and experimentally observed ATAC-seq signals for peripheral blood mononuclear cells (PBMCs) and endocrine pancreas data for training sequences. These two samples are independently performed ATAC-seq experiments and these specific tissues do not have ATAC-seq in ENCODE, and were thus not seen by Enformer in pre-training. C . Venn diagram depicting the intersection between the Enformer experiments based on ENCODE accession numbers and the allocated train and test experiments for DNACipher. D . Venn diagram depicting the intersection between Enformer experiments and the DNACipher test experiments based upon cell type and assay annotations of the experiments (intersections by ENCODE accession number are removed), indicating 71 non-intersecting experiments based on both accession and cell type / assay annotations. E . For the 71 non-intersecting test experiments with Enformer, bar plots indicate Pearson correlations of predicted versus actual signal values when stratified by assay type (x-axis) and by testing stratification. Error bars indicate the interquartile range of correlations across cell types. For test experiment stratifications, there is one held-out experiment per assay type and so error bars are not depicted. We then controlled for potential pre-training leakage from Enformer by evaluating DNACipher predictions against ATAC-seq data generated independently of ENCODE. These ATAC signals were processed to generate bulk ATAC-seq signals representing total peripheral blood mononuclear cells (PBMCs) and endocrine pancreas cells 33 , tissue types which do not have ATAC-seq data in ENCODE 26 (although cell types derived from PBMCs and whole pancreas do have ATAC-seq data and DNase-seq data in ENCODE (Table S1)). For this dataset, DNACipher had a mean Pearson correlation of R=0.79 on observed sequences (i.e. those used in training) for the PBMCs and R=0.64 for the endocrine pancreas ATAC-seq signals ( Figure 3B ). For unobserved test sequences, a correlation of R=0.62 for PBMCs and R=0.57 for endocrine pancreas was achieved ( Figure 3B ). We also evaluated the degree of overlap between the ENCODE data used to train and test DNACipher with the Enformer ENCODE datasets ( Figure 3C-D ). In total, 90 / 206 (43.9%) of our test experiments were different ENCODE experiments ( Figure 3C ), and of these, 71 represented cell type / assay combinations that did not intersect cell type / assay combinations used to train Enformer ( Figure 3D ). For these 71 testing experiments which did not intersect Enformer training data, we observed similar predictive performance of DNACipher across testing stratifications as observed for the DNACipher-NT-Loc model ( Figure 3E , Table S3). When we systematically compared DNACipher implementations with different combinations of short or long sequence context or genome location on model inputs, we found that short sequences (6kb) alone did not outperform long sequence context (196kb) but that short sequences combined with genome location had comparable performance for predicting local signals (Supp. Note 1, Figure S1, Table S4). Taken together, these results suggest that DNACipher infers chromatin accessibility with high accuracy on an independent validation dataset, and also that our performance metrics on the held-out (test) ENCODE experiments were not inflated due to data-leakage from Enformer pre-training. To further assess the validity of the trained DNACipher model, we examined its internal latent representations and predictions and found that they reflect known biological relationships between cell types, assays, and sequences. Within the cell type latent space, similar cell types (e.g. different subsets of lymphocytes) clustered together, and within the assays latent space, related assays (e.g. those associated with polycomb repressive complex 2) clustered together (Supp. Note 2, Figure S2A-C, Table S5). We also found that signals that were predicted to co-occur across experiments matched co-occurrence patterns in the real data showing that DNACipher maintains the covariance structure of experimental signals across unobserved cell types and assays (Supp. Note 3, Figure S2D-E, Table S6). We also confirmed that DNACipher predictions consistently outperformed those made by a simple mean-based estimator (Figure S2F-H). In combination, these results indicate that DNACipher integrates information from sequence, cell type, and assay embeddings to accurately predict molecular measurements in unobserved cell types and DNA sequences. DNACipher long-range variant effect prediction detects a causal eQTL effect missed by Enformer Although versions of DNACipher with long (DNACipher) and short (DNACipher-NT-Loc) sequence contexts performed similarly for local signal prediction, the long context model offers an important advantage. Namely, it can predict the effects of variants over long genomic distances because it outputs experimental signals continuously across the input sequence (the central 114kb / 196kb input at 128bp resolution) 17 . To evaluate whether DNACipher can predict long-range effects of genetic variants, we first focused on the WRN locus as an example to highlight the benefit of imputation ( Figure 4 ), before performing a systematic comparison across thousands of eQTLs in GTEx ( Figure 5 ). We also focused specifically on prostate tissue because the WRN locus contains a fine-mapped prostate eQTL from GTEx 23 , which has a single intronic variant with a high posterior inclusion probability (PIP=0.9999)( Figure 4 ). Download figure Open in new tab Figure 4. DNACipher predicts long-range effects of an eQTL for WRN that are not detected with Enformer. A . DNACipher signal predictions for the top 30 prostate-specific assays (of 202 total) at the WRN locus based on the reference genome sequence. The x-axis is the genome position. Assays marked with stars are imputed. The WRN transcription start site (TSS), WRN exons, and an ENCODE candidate Cis-Regulatory Element (cCRE) are annotated. B . Equivalent to A, except with Enformer prostate predictions, which are limited to two assays (CAGE-seq and H3K36me3 ChIP-seq). C . Predicted change in assay signals between the reference and alternate alleles of a likely causal expression quantitative trait locus (eQTL) variant for WRN . Δ values are the percent predicted change in signal (Alt-Ref sequence) across the WRN locus for the likely causal eQTL. D . Equivalent to C, except with Enformer. Download figure Open in new tab Figure 5. DNACipher improves detection of causal eQTL molecular effects. A . Schematic of balanced causal variant selection for benchmarking. Briefly, eQTLs and fine-mapping statistics were collected from GTEx v8. For each tissue and within 100kb TSS-distance bins we selected likely causal (PIP > 0.9) and non-causal (PIP < 0.1) eQTL variants. We performed DNACipher variant effect inference on these variants, and recorded tissue-specific variant effects across all predicted assays. The mean causal and non-causal absolute variant effects for each assay were then compared with a t-test. B . Violin plot with boot-strapped (n=1000) predicted variant effect means from DNACipher for likely causal and non-causal eQTL variants on the y-axis. The top 20 most significant molecular assays are shown on the x-axis. C . Equivalent to B, except utilising Enformer predictions. D . Bar plots, with assays ranked by t-value on the y-axis, and t-values from comparing predicted variant effects of likely causal and non-causal variants on the x-axis. Significant assays are highlighted and expanded, while non-significant assays are collapsed in grey. Results are shown for DNACipher, DNACipher-NT-Loc, Enformer, and DeepCT. E . The number of assays with effect sizes that differ between likely causal and non-causal eQTL variants for DNACipher (n=202), DNACipher-NT-Loc (n=202), Enformer (n=40), and DeepCT (n=40). Error bars indicate the minimum and maximum values observed when repeating variant effect detection 10 times, re-sampling 80% of the variants each time with replacement. DNACipher predicted signals for 202 prostate assays ( Figure 4A ), but because prostate tissue has sparse experimental data within ENCODE, Enformer could only generate predictions for two prostate assays (CAGE-seq and H3K36me3 ChIP-seq) ( Figure 4B ). DNACipher predicted prostate-specific positive-strand RNA-seq expression that was specific to WRN ’s exons and corresponded to the orientation of the gene ( Figure 4A ). This is notable, because Enformer was not trained on RNA-seq, and its expression predictions are limited to CAGE-seq. At the WRN transcription start site (TSS), DNACipher predicted chromatin accessibility (DNAse-seq and ATAC-seq) and transcription-associated histone modifications (H3K4me3 and H3K27ac) ( Figure 4A ), and at an ENCODE candidate cis-regulatory element (cCRE), DNACipher predicted CTCF and RAD21 ChIP-seq signals as well as nearby enhancer-associated histone modifications (H2BK20ac and H2BK12ac) 34 . Together, these observations illustrate that DNACipher reconstructs complex regulatory landscapes even in unobserved experiments. To test DNACipher’s ability to predict the long-range effects of genetic variants, we utilized the likely causal eQTL variant for WRN (rs4733225, PIP=0.9999, T>C substitution) and compared DNACipher’s predictions from the reference and alternative sequences. For each assay, we summed the difference between the reference and alternative predictions across the entire locus to obtain a predicted signal change (Δ; Figure 4C ). DNACipher predicted an overall increase in gene expression for the alternative allele across the WRN locus (Δ10.5% plus strand polyA plus RNA-seq; Figure 4C ) consistent with the GTEx eQTL effect direction (β=0.34). The eQTL variant is located just upstream of exon 24, and DNACipher predicted the alternative allele decreases the expression of this exon but increases the overall WRN expression, potentially indicating a change in exon inclusion ( Figure 4C ). Other assays remained relatively unchanged, except for an increase in predicted H3K36me3 signal in the gene body (Δ7.08%). Enformer’s prostate-specific predictions for the WRN locus were limited to CAGE-seq and H3K36me3 ChIP-seq signals ( Figure 4D ). CAGE-seq captures 5’ reads, and signal predictions were correctly localized to the WRN TSS, however exon-specific expression is not captured by this assay. As a result, Enformer could not predict the exon-specific gene expression of WRN , and the eQTL effect was not captured (Δ0.01% CAGE-seq; Figure 4D ). Enformer’s predicted effect on H3K36me3 ChIP-seq was equivalent to DNACipher’s predictions (Δ6.05%; Figure 4D ). While we found Enformer missed this eQTL due to a lack of assay representation, previous studies have also found that Enformer is unable to predict the direction of gene expression differences from personalized genome sequences 35 , 36 . However, Scooby 37 , a recent S2F model that finetunes Borzoi 19 on single cell multi-ome data, predicts effect directions correctly for likely causal eQTL variants that have high predicted effects on expression. To determine if DNACipher had similar behaviour, we predicted likely causal eQTL (PIP > 0.9) effects on the associated eGene across all GTEx tissues (62,558 eQTLs, mean PIP = 0.98) (Figure S3). We found that eQTLs predicted to have strong gene expression effects had accurate magnitudes and directions (Spearman’s ρ=0.56, Cohen’s κ for negative effect eQTLs=0.84, Cohen’s κ for positive effect eQTLs=0.49) (Figure S3E-F). Overall, these findings illustrate that DNACipher predicts variant effects over large genome regions, provides a comprehensive molecular interpretation of genetic variants across observed and imputed assays, and predicts the correct direction of strong eQTL effects. Inference to unobserved experimental contexts enhances detection of eQTL molecular effects Based on our observations at the WRN locus, we hypothesized that DNACipher would outperform other S2F models at identifying context-specific molecular effects of genetic variants, due to its ability to impute effects for unmeasured experiments. Specifically, since DNACipher can generate comprehensive predictions for assays spanning transcription, chromatin accessibility, histone modifications, and TF binding, we expected that DNACipher would be able to detect a larger number of differences between likely causal and non-causal eQTLs compared to models that do not perform imputation. To test this, we compared likely causal and non-causal eQTL variants from GTEx across 49 tissues. For each tissue, we binned eQTLs by their distance to the eGene’s TSS and selected 3 likely-causal eQTLs (PIP > 0.9) and 3 non-causal eQTLs (PIP < 0.1) within each bin ( Figure 5A , Figure S4A). This yielded 23,058 eQTL variants with a balanced representation of tissue types, TSS distances, and expected regulatory impacts ( Figure 5A , Figure S4B-D). We then matched each GTEx tissue to a corresponding DNACipher tissue and predicted variant effects across all 202 assays. For each assay, we compared the absolute predicted effect sizes of likely causal variants and non-causal variants (Student’s two-sided t-test) ( Figure 5A-B , Table S7). For comparison, we repeated this analysis with Enformer, DeepCT, and DNACipher-NT-Loc ( Figure 5C-E ). Enformer’s coverage was limited to 40 assays because it cannot predict assays for many of the GTEx tissues (Table S7). For DNACipher, the assays with the most significant differences between likely causal- and non-causal eQTLs were RNA-seq assays ( Figure 5D , Table S7). DNACipher-NT-Loc showed a similar pattern ( Figure 5D , Table S7). In contrast, Enformer ranked H3K36me3 and H3K4me3 ChIP-seq as the assays with the most significant differences rather than a gene expression assay, which may be because it is not trained to predict RNA-seq, but instead uses CAGE-seq as an expression output. DNACipher, DNACipher-NT-Loc, and DeepCT all found that likely causal variants had smaller effects on heterochromatin associated marks (H3K9me3, H3K9me2, SETDB1) compared to non-causal variants ( Figure 5D , Table S7) consistent with the depletion of eQTLs in heterochromatin 38 , 39 . Enformer failed to capture this difference in effect sizes, likely due to its restricted assay coverage ( Figure 5D , Table S7). Furthermore, when comparing the range of t-statistics as a measure of effect size contrast, DNACipher had the largest range (−2.5, 15), followed by Enformer (0, 10), DNACipher-NT-Loc (−4, 4), and DeepCT (−4, 4) ( Figure 5D , Table S7). This pattern suggests that context length has a greater influence on the magnitude of detected differences compared to the number of imputed contexts, likely because a longer context makes it possible to capture distal eQTL effects. We evaluated the number of assays with significant differences in effect sizes between likely causal and non-causal eQTL variants for each method. DNACipher detected 176 molecular effects (out of 202), DNACipher-NT-Loc detected 115 (out of 202), Enformer detected 30 (out of 40), and DeepCT detected 15 (out of 40) ( Figure 5D , Table S7). Notably, DNACipher-NT-Loc detected substantially more assays with significant differences in effect sizes compared to Enformer despite its shorter context window. This underscores that imputation to unobserved biological contexts is important for the detection of molecular variant effects. While long-sequence context is required to detect the distal effects of genetic variants, imputation to unobserved experiments allows molecular effects to be detected across biological contexts. DNACipher Deep Variant Impact Mapping (DVIM) identifies likely causal variants underlying Type 1 Diabetes (T1D) GWAS loci Since DNACipher can predict molecular effects across a broad range of cell types and assays, we reasoned that it could be used to prioritize variants at GWAS loci and to determine the specific biological contexts they act in. To test this, we applied DNACipher to genetic variants at GWAS hit loci for type 1 diabetes (T1D) 33 , and developed a variant prioritization method we term Deep Variant Impact Mapping (DVIM). We first stratified variants at each GWAS signal into candidate causal and background variants, based on their significance (GWAS association p-value) and minor allele frequency (MAF). At each GWAS locus, we defined ‘background variants’ as non-significant common variants (P>0.001; MAF > 5%), and ‘candidate common variants’ as those with low p-values (P 5 × 10 −7 ) with MAF < 5% at GWAS signals ( Figure 6A ). Considering rare variants separately is important because, while DNACipher can assess variant function independently of MAF, GWAS are underpowered to detect associations with rare variants. All other rare variants outside the GWAS signal at each locus were excluded ( Figure 6A ). Download figure Open in new tab Figure 6. DNACipher Deep Variant Impact Mapping (DVIM) identifies functional effects of fine-mapped genetic variants in Type 1 Diabetes (T1D) GWAS. A . GWAS associations with T1D (−log10 p-values, y-axis) for both rare and common genetic variants at the RUNX3 locus (genome coordinates, x-axis). Variants are categorized as candidate common variants (orange), candidate rare variants (pink), background variants (black), and rare variants (gray) based on their GWAS p-values and minor allele frequencies. The red vertical line marks the center of the GWAS signal, determined as the highest posterior probability variant by FINEMAP. The black line indicates the RUNX3 gene body and transcription start site. B . Predicted variant effects from DNACipher DVIM. The y-axis shows the number of significant predicted effects for each variant computed using a null distribution generated by bootstrapping the background variant effects (common variants not associated with T1D). The top-ranked common variant (blue outline) was also identified as the most likely causal variant by statistical fine-mapping with FINEMAP. C . Reduction in FINEMAP credible set sizes after filtering for significant variants by DNACipher DVIM (“impact variants”). The y-axis is the FINEMAP credible set size before (gray) and after (blue/red) filtering. The x-axis is the genomic position of T1D GWAS signals. Red markers indicate loci where all variants were removed after DVIM filtering. D . Scatter plot comparing the FINEMAP credible set sizes before (x-axis) and after (y-axis) applying DNACipher DVIM filtering. E . Ranked bar chart showing the number of variants in the FINEMAP credible sets after DNACipher DVIM filtering (y-axis) and the number of GWAS signals corresponding to each set size (x-axis). F . Distribution of mean posterior probabilities (PPA) from FINEMAP across 10,000 randomly selected credible set variants (grey; mean=0.085), compared to the mean PPA of variants prioritized by DNACipher DVIM (blue; mean=0.117) (empirical p = 0.0047). G . Jurkat T cell luciferase assay measurements as box-and-whisker plots for three genetic variants—one predicted as non-functional, and the other two predicted as impact variants. The y-axis is the normalized Relative Luciferase Units (RLU) which quantifies enhancer activity, and the x-axis indicates the allele substitution used in the DNA sequence of the luciferase reporter. Two conditions were tested for each variant and allele, a T cell activation condition (+IL10 and CD3/CD28 Dynabeads) and an unstimulated condition. H . Strip plot of DNACipher predicted effect sizes across all assays on the y-axis, separated by background, common, and rare variants on the x-axis. Only significant effects are shown for common and rare variants. Background variant effects are randomly down-sampled to the same number as the rare variants. I . Log2-odds-ratio of overlap between rare impact variants and rare variants previously associated with human traits compiled by the Rare Variant Association Repository (RAVAR). The horizontal dotted line indicates the random expectation, and the error bars indicate the 95% confidence interval for the log2-odds-ratio. ***padj<0.001, **padj<0.01, *padj<0.05. Next, we used DNACipher to predict the functional effects of the variants in these categories in contexts related to T1D (immune cells and pancreatic tissue). We estimated the significance of the predicted effects for each assay by comparing the candidate common and rare variant effects to a null distribution created by bootstrap resampling of the background variant effects. Candidate common or rare variants that had at least one significant predicted effect above the background variants we term as ‘impact’ variants. We performed this analysis at all 136 T1D GWAS signals at 92 loci and examined the intersection of impact variants with the FINEMAP 95% credible set variants at each signal 33 ( Figure 6B-C , Table S8, Supp. Material 1, Supp. Material 2). Taking detected impact variants as a filter, the average credible set size could be reduced from 24 variants to 1.4 variants ( Figure 6D , Table S8). The majority (n=71; 52%) of the GWAS signals had at least one credible set variant that was an impact variant, with 21 (15%) credible sets containing a single impact variant and 53 credible sets (39%) containing 3 or less impact variants ( Figure 6E ). The impact variants within credible sets had higher posterior probabilities of association (mean PPA = 0.117), compared to an equivalent number of randomly selected credible set variants (mean PPA = 0.085; empirical p=0.0047) ( Figure 6F ). This indicates that impact variants are more likely to be causal variants. We highlight the RUNX3 locus as an example application of DVIM ( Figure 6A-B ), and provide equivalent example plots for all of the T1D signals (Supp. Material 1). Here, 4 out of 49 (8%) of the candidate common variants associated with T1D were impact variants ( Figure 6A-B ), and the impact variant with the highest number of predicted effects was also the FINEMAP credible set variant with the highest PPA ( Figure 6A-B ). Rare impact variants had more significant predicted variant effects than the common impact variants (mean significant contexts n=11 vs. n=7), and the rare impact variants were concentrated at the RUNX3 TSS ( Figure 6B ). To test whether DNACipher accurately predicted the cell type context of molecular effects, we tested whether impact variants for each molecular effect were enriched in accessible chromatin of the predicted cell type using independently generated single nucleus ATAC-seq (snATAC-seq) that was not within the DNACipher training data 33 . Impact variants were significantly enriched within accessible chromatin of the cell types predicted to be affected across molecular modalities (Figure S5, Table S9). This demonstrates DNACipher provides context aware predictions across molecular modalities that are corroborated by independent snATAC-seq measurements. We also experimentally tested the regulatory function of impact variants, by performing luciferase assays in Jurkat T cells for two impact variants (rs6908626 and rs72928038 at the BACH2 locus) and one non-impact variant (rs17574546 at the RASGRP1 locus) ( Figure 6G , Table S10). The sequences containing the impact variants drove higher luciferase expression compared to the sequences containing the predicted non-functional variant ( Figure 6G ). In addition, both impact variants had significant allele-specific differences in luciferase expression, but the non-functional variant did not ( Figure 6G , Table S10). DNACipher DVIM detects functional rare variants at T1D GWAS loci GWAS are underpowered to detect associations with rare variants 1 , and most rare variants are not included in fine-mapping credible sets. Nonetheless, 17 of the 136 T1D GWAS signals (12.5%) contained rare (MAF < 5%) impact variants within FINEMAP credible sets (Figure S6A). When all rare variants were considered, not just those within credible sets, DNACipher DVIM found 6547 rare impact variants (5.2% of the 126,742 rare variants tested) at 131 of the 136 T1D signals (96%) (Figure S6B). These rare variants had greater magnitudes of predicted effects compared with common impact variants and the background variants ( Figure 6H ), consistent with purifying selection that prevents large-effect alleles from reaching high population frequencies 40 , 41 . We next assessed whether these rare variants were previously found to be functional, by intersecting them with the Rare Variant Association Repository (RAVAR) 42 . While only 30 of the 6451 trait-associated RAVAR variants intersected with T1D GWAS loci (Table S11), 5 of these were rare impact variants, which is significantly more than expected by chance (odds-ratio=3.7, p=0.018, Figure 6I , Table S11). The traits associated with these rare impact variants were mostly related to immune cell abundance, such as platelet counts (rs754838420 43 MAF=0.001, rs149765041 41 , 44 MAF=0.02), monocyte counts (rs540639423 44 , 45 MAF=0.008), and eosinophil/granulocyte counts (rs12722502 41 , 44 MAF=0.02) (Table S11). Autoimmune-diagnostic blood measurements such as IgA, IgM, ILR5, SLAM7 and Albumin protein measurements were also associated with one of the rare impact variants (rs188468174 41 , 46 MAF=0.01). Overall, these results show that DNACipher DVIM can detect functional rare variants. Since very few rare variants currently have trait associations, this is a promising step toward understanding their molecular function and role in human disease 1 . Discussion DNACipher combined with Deep Variant Impact Mapping predicts the functional effects of genetic variants in precise biological contexts, providing specific hypotheses of causal genetic variants at GWAS hit loci and their influence on gene regulation. This approach is particularly promising for the analysis of rare genetic variants. Greater than 97% of the 400 million human genetic variants catalogued by the Trans-omics for Precision Medicine project programme are rare (MAF < 1%) 47 , and rare variants can be important risk variants for common diseases 41 . Unfortunately GWAS are underpowered to detect associations with most rare variants. DNACipher DVIM can be used to identify rare variants with potential disease-influencing molecular effects that cannot be detected with GWAS. Indeed, we identified 6547 rare impact variants with molecular effects at T1D risk loci. This lays the foundation for future high-throughput screening studies, such as massively parallel reporter assays 10 , 48 , 49 , which could validate their predicted functions. A major advance in S2F models has been the expansion of the “receptive-field” or input length, which has enabled long-range variant effect prediction 17 , 19 . DNACipher provides another important advance by greatly increasing the “context-field”--the range of biological contexts that can be predicted. DNACipher not only has a substantial receptive field (196,608bp), but it can also impute signals for 38,582 contexts corresponding to 202 assays and 191 cell types. This scale surpasses previous models that are restricted to very short sequence contexts 30 , or lack the ability to represent DNA sequence directly 24 . Alternatives to DNACipher are more limited because they require cell type-specific inputs like gene expression 28 , 29 , 50 or they can only make predictions for the same contexts that they are trained on. In contrast, DNACipher’s latent representation of cell types and assays allows scalable predictions across a wide range of assays and cell types. While DNACipher imputes most assays effectively, predicting cell type-specific TF binding remains challenging. This could be because the sparse TF ChIP-seq data that is available for training is insufficient to learn the highly cell type-specific binding of some TFs, which depend on TF binding partners and local chromatin state 51 , 52 . Expanding the available training data could potentially improve the accuracy and generalization of TF binding predictions across cell types. This could be achieved by collecting more cell type-specific TF ChIP-seq experimental data (e.g. using multiplexed methods like ChIP-DIP 53 ), or alternatively by creating TF footprinting profiles from base-pair resolution chromatin-accessibility signals 54 , 55 . Looking ahead, additional biological contexts—beyond cell type and assay metadata—could be modeled as inputs to S2F-like frameworks, such as developmental stage or perturbation conditions. This would enable inference of how genetic variants influence phenotypes across development or in response to perturbation. Recent efforts have sought to impute missing molecular measurements during development 56 and predict perturbation responses in unobserved cell types 57 . However, these models typically overlook DNA sequence as an input for variant effect prediction. Extending such models to incorporate DNA sequence—using an approach similar to the one developed here—could make it possible to pinpoint when, and under what conditions, genetic variants exert their effects. In this way, the limited experimental genomics data available across relevant biological conditions could be overcome 58 , enabling the discovery of highly context-specific variants such as response eQTLs 59 . Methods ENCODE data preprocessing We accessed the ENCODE 26 data portal ( https://www.encodeproject.org ) in September of 2023, and obtained metadata for 748,860 processed human ( Homo sapiens , GRCh38) files which had “released” status, no quality control issues, and which were derived from untreated biosamples. We restricted the dataset to biosample types “cell line”, “tissue”, or “primary cell”, thereby excluding in-vitro differentiated and cell free samples. We included the following assay types: “ATAC-seq”, “Histone ChIP-seq”, “Mint-ChIP-seq”, “TF ChIP-seq”, “DNase-seq”, “polyA minus RNA-seq”, “polyA plus RNA-seq”, and “total RNA-seq”. For RNA-seq assays, we retained files that provided strand-specific or overall “signal of unique reads” normalized by reads-per-million. If duplicate files existed for a given experiment, we selected the most recent file or the file with the largest size as a proxy for sequencing depth. For non RNA-seq assays, we used bigBed peak files. A single peak set per experiment was selected by prioritizing peaks files that aggregated the most replicates or that had the most recent release date. When files could not be prioritized with these criteria for a given experiment, we prioritized them by their “Output type” in the following order: “pseudoreplicated peaks”, “IDR thresholded peaks”, “replicated peaks”, and “optimal IDR thresholded peaks”. For DNAse-seq, “peak” files were selected. This data preprocessing methodology resulted in a single bigBed file per cell type-assay combination. Cell type and assay labeling Cell type labels were derived from the ENCODE “Biosample term name” metadata field, while assays were labeled according to the “Assay annotation” field. For ChIP-seq experiments we appended the target name (e.g., a specific histone modification or TF) to the assay label to ensure that each assay mapped to a clearly defined molecular target. Following criteria similar to Avocado 24 , we only retained cell types that had >= 5 assays, and assays present in >=3 cell types. For each assay, we designated the cell type represented in the most assays as the test cell type for that assay, to avoid depleting training data for cell types with extremely sparse experimental measurements. This selection process was iterative: once a test cell type was chosen for an assay, we updated the training set counts before proceeding to the next assay. A complete list of the ENCODE files used and their associated cell types and assay labels is provided in Table S1. Signal representation and genomic tiling For ChIP-seq, ATAC-seq, and DNase-seq assays, we generated continuous “peak signal” profiles by taking the -log10(p-value) of each peak and assigning this value to every base pair within the genomic peak coordinate range. This approach reduced experimental background noise and enabled us to represent peaks as continuous signal tracks. When p-values were unavailable (e.g., DNase-seq or “IDR thresholded peaks”), we substituted the “signal value” instead. Next, we converted each track into bigWig format 60 , generating one “peak signal” file per cell type-assay combination. To define genome regions for training and testing, we used bedtools 61 to tile the hg38 reference genome into 5,994bp windows, with a 1,998bp step size to obtain 3-fold coverage. This window size matches the 5,994bp input length for Nucleotide Transformer 32 . We excluded regions overlapping the ENCODE blacklist 62 . Chromosomes 2 and 16 were allocated for the test dataset, leaving a total of 1,344,175 regions for training and 163,745 for testing. For the alternate DNACipher models which utilized genome location on model input, we utilized a different train/test region allocation, to avoid leakage between train and test sets due to intersecting sequences (which were originally tiled at ⅓ overlaps across the genome). For these models, the train and test regions were generated by tiling at 5994bp across the genome using bedtools 61 with a 5994bp step size. Within each genome location encoded at 12kb resolution on chr2 and chr16, the first 6kb sequence was allocated to the training set and the second sequence was allocated for testing ( Figure 2A ). This resulted in a total of 54,440 test regions and 1,399,036 training regions for the genome-location encoding models. Normalization For each region, we extracted the average signal of the central 294bp using pyBigWig 63 . We then applied the arcsinh transformation to mitigate the influence of outliers, similar to Avocado 24 . All normalization parameters were derived exclusively from the training data to prevent data leakage, and the same normalization steps were then applied to the test data. For RNA-seq assays, we first applied a log2-transformation to non-zero signals, then subtracted the minimum log2 signal value to ensure all signals were positive. Across all assays, we then computed the mean non-zero signal for each assay across all training cell types and training regions. From these per-assay means (μ i ), we calculated the median of those means ( m ). Using this median as a reference, we defined a scaling factor ( s i ) for each assay i as per equation 1 . Multiplying each assay’s signals by s i , we aligned their average magnitudes, resulting in a uniform mean non-zero signal across assays (Figure S7). This normalization approach allowed for more direct comparisons of signals from diverse assays and cell types, and was essential to minimise systematic scaling differences between assays that otherwise dominated the loss function. Enformer embeddings We extracted Enformer embeddings by tiling across the hg38 reference genome with a window size of 196,608bp (the Enformer sequence input size) and a step size of 114,688bp (the Enformer signal output length) 17 . For each tile, we extracted the corresponding reference genome sequence, and passed the sequence through Enformer to extract the latent features from the layer immediately prior to the species-specific output heads. We introduced “N” padding at chromosome boundaries to maintain an input sequence length of 196,608bp for each tile, adding it to either the start of the sequence for chromosome start positions or at the end of the sequence for chromosome end positions. For each sequence a 896 × 3,072 embedding matrix was generated, where the 896 dimension refers to 128bp sequence bins for the middle 114,688bp of the input sequence, and 3,072 represents the latent feature dimension.To generate sequence embeddings for each train and test region, the Enformer embeddings at the closest 3 positions to each region mid-point were averaged, to generate 3,072 latent sequence features per region. We used the Pytorch-based Enformer implementation for the embedding extractions. Nucleotide Transformer (NT) embeddings We obtained Nucleotide Transformer (NT) embeddings from the last layer (layer 24) by passing the full 5994bp train/test region through the hg38 trained, 500 million parameter model 32 . This produced a 999 × 1,280 feature matrix per region, where the 999 dimension represents contiguous 6bp sequence bins, and 1,280 denotes the latent sequence feature dimension. To obtain a single embedding, we averaged the middle 48 rows of this matrix (representing the central 288bp of the region) generating a 1,280-dimensional sequence embedding per region. DNACipher model architectures We implemented the DNACipher model in Pytorch 64 , resulting in approximately 1.3 million parameters. DNACipher has 3 input layers: (1) a cell type embedding layer (input size=191, output size = 128), (2) an assay embedding layer (input size=202, output size=256), and (3) a fully-connected layer that projects Enformer embeddings (3072 features) down to 128 dimensions. The outputs from the 3 input layers (cell type=128, assay=256, sequence=128) are concatenated to one tensor of size 512, and then passed through 3 fully-connected layers with 512 input sizes and 512 output sizes. Layer normalization was applied after each of the deep layers 65 . All fully connected linear layers included learnable bias parameters. Gelu was used as the activation function for all fully connected linear layers. The final output layer projects to a single continuous signal value. DNACipher models that incorporated NT embeddings instead of Enformer embeddings had a 1280 dimensional input for the sequence embedding layer. For models that included genome location, there were an additional 3 input embedding layers to encode genome location at different binning-sizes: 11,988bp, 47,952bp, and 119,880bp. At each bin size, the bin within which the corresponding input sequence occurred was one-hot encoded and passed into the model at the corresponding input layer. Bin size 11,988bp had 257,628 bins, bin size 47,952bp had 64,417 bins, and the largest bin size of 119,880bp had 25,772 bins across the hg38 genome. Each genome location input layer was projected to 64 dimensions. Models utilizing NT embeddings and genome location had 23.3 million parameters, models with NT embeddings and without genome location had 1 million parameters, and models with genome location and no sequence input had 23.14 million parameters. Training strategy Training batches consisted of 256 regions with the full set of signals across training experiments. We utilized Adam as the optimizer 66 with a fixed learning rate of 0.0014526, which was determined from hyper-parameter testing of different learning rates ranging from 0.001 to 0.01 using RayTune 67 . For regularisation, we used dropout at a 5% rate across all layers 68 . DNACipher models that included genome location were prone to overfitting and therefore a more stringent dropout regularization 68 was employed specifically at the genome location input layers at a rate of 20%. An initial round of training was conducted for 25 epochs with MSE as the loss function and without ReLU activation applied to the model output. A secondary round of training for an additional 25 epochs was then performed with ReLU activation applied to the model output and where each experiment was weighted by importance. Experiments representing rarer cell types or assays were more heavily weighted. In this case, the squared error between the predicted and observed signals across the training experiments was multiplied by the weight assigned to each experiment. The mean of the weighted squared error was the loss for this second round of training. The weight of each experiment was determined using a sampling strategy that aimed to reduce the inherent bias in the ENCODE dataset due to assays which were overrepresented across cell types (e.g. DNase-seq) and cell types which were overrepresented across assays (e.g. K562 cells) ( Figure 1F ). To achieve a uniform distribution of experiments for each cell type, would require over-sampling some assays. Similarly to achieve a uniform distribution of cell types for each assay would require oversampling some cell types. To reduce representation bias, we empirically tested a range of assay ( p_assay ) and cell type ( p_cell = 1 - p_assay ) sampling biases. For a given assay bias ( p_assay ), we conducted 100,000 samplings of the available experiments. At each sampling, we chose whether to prioritize reducing assay bias or cell type bias with probability p_assay or p_cell , respectively. If reducing assay bias was selected, a random assay was selected with a probability inversely proportional to its frequency within the dataset. Of the available cell types for the chosen assay, a cell type was then chosen with probability inversely proportional to the cell type’s frequency within the dataset. Conversely, if reducing cell type bias was selected for the sampling, the cell type was chosen first, and then the assay. The number of times a given experiment was chosen using this sampling process was counted. For different values of p_assay chosen uniformly between 0 and 1, the bias of chosen assays and cell types was quantified using the standard deviation of cell type counts across assays (assay bias) and assay counts across cell types (cell type bias), revealing an optimal p_assay=0 . 428 that minimized both the cell type and assay bias (Figure S8). The sampling count of each training experiment c i was then used to determine the weight of the experiment ( w i ) as per equation 2 . Where n is the total number of training experiments (2676). Comparison to independent ATAC-seq dataset We downloaded 10X single nuclei ATAC-seq 33 paired-end fastq files for PBMCs and endocrine pancreas from GEO (GSE163160). Because PBMCs and endocrine pancreas are represented as bulk ATAC-seq in DNACipher and we needed to ensure consistency with our training data, we processed these data as bulk ATAC-seq using the ENCODE 26 ATAC-seq pipeline ( https://github.com/ENCODE-DCC/atac-seq-pipeline ). We then processed the outputted pseudoreplicate peak files equivalently to the ENCODE peaks described above, and also applied the normalization settings learned on the ENCODE training data to this independently generated and processed ATAC-seq data. DNACipher was then used to predict ATAC-seq signals for PBMCs and endocrine pancreas, and we compared the predicted signals at training and test regions to the observed signals at equivalent regions from this processed dataset using Pearson’s correlation. Latent embedding annotation We obtained latent representations of cell types, assays, and sequences directly from DNACipher’s input layers. For cell types and assays, we passed one-hot encoded vectors through their respective embedding layers and extracted the resulting latent features (128 dimensions for cell type and 256 for assay embeddings). 10,984 test sequence embeddings from Enformer (see generation details above, size=3072), were passed through DNACipher’s sequence input layer to obtain 128 dimensional latent features. We used Scanpy 69 to apply UMAP to these latent representations, constructing k-nearest-neighbour graphs with cosine distance. For cell type and assay embeddings, we used k=3, while for sequence embeddings we used k=15. After dimensionality reduction with UMAP, we applied Leiden clustering 70 to identify assay clusters using default parameters. Clusters were annotated by examining the biological themes of the grouped assays. Cell types were clustered and annotated using the same approach, but at Leiden resolution of 4. For sequences, we intersected regions with ENCODE 26 cCREs and NCBI RefSeq annotations 71 . We excluded un-annotated or repetitive regions prior to visualization with UMAP, since these regions lacked annotations for functional interpretation. Mean baseline comparison To assess DNACipher’s predictive accuracy relative to a simple baseline, we compared its prediction against a mean-based estimate. For each batch of 256 regions and their corresponding experimental signals, we computed the mean signal across those 256 regions for each experiment. This per-experiment mean was then used as baseline prediction for all regions in a batch. We calculated the MSE between these mean-based estimates and the observed signals and compared it to the MSE from DNACipher’s predictions. A one-tailed paired t-test was then used to determine whether DNACipher’s errors were significantly smaller than those from the mean-based baseline. By testing if DNACipher’s MSE was lower than the baseline MSE, we could confirm that DNACipher provided a more accurate prediction than the mean estimate. GTEx eQTL effect size benchmark Significant eQTLs and DAPG fine-mapping summary statistics for GTEx version 8 were downloaded from the data portal 23 ( https://gtexportal.org/home/ ). We selected all variants with a fine-mapping PIP > 0.9 that were within 30 kb of the TSS for the associated eGene across all 49 GTEx tissues as likely causal eQTL variants. We manually matched the GTEx tissues to the tissues that were predictable by DNACipher. We then predicted the reference and alternative allele gene expression for plus-strand polyA+ RNA-seq and minus-strand polyA+ RNA-seq for each eQTL centred on the eQTL variant location. To ensure our predictions were specific for the eGene, we only considered output signals that intersected the eGene location and for the strand-specific RNA-seq predictions that corresponded to the strand the gene was encoded on. To score the effect of the eQTL on the eGene, we used two different scoring approaches; 1) log2 fold-change between the alternative and reference sequence gene expression predictions, and 2) the ‘signed’ sum of the absolute difference between the alternate and reference sequence predictions, as per equation 6 . Where equation 3 is the absolute sum of differences between the reference and alternative gene expression predictions for the eQTL at the strand-specific eGene expression predictions ( s ), equation 4 is the sum of differences (Δ), and equation 5 is +1 if Δ is positive and -1 if Δ is negative, and equation 6 is the ‘signed’ sum of the absolute differences. We then correlated the DNACipher tissue-specific predicted effects for each eQTL and eGene with the tissue-specific eQTL effect sizes (Figure S3A). We also performed a sensitivity analysis by considering eQTLs with predicted absolute effect sizes above different cutoffs using the two different scoring methods (Figure S3B-F). This showed the log2 fold-change scoring approach performed best across all cutoffs of predicted eQTL effect sizes for correctly ranking the eQTLs by effect size (measured by Spearman’s ϱ) (Figure S3E) and also predicting the correct direction of eQTL effects (measured by Cohen’s ϰ) (Figure S3F). GTEx likely causal and non-causal eQTL molecular effect detection GTEx 23 variants were stratified by tissue and distance from the TSS of the associated eGene in bins of 100kb. Within each stratification, the top and bottom 3 variants with the highest posterior inclusion probability (PIP) which had a PIP>0.9 or PIP<0.1 were chosen as example likely causal and non-causal variants, respectively (Figure S4A). This resulted in an equal representation of variants across TSS distances and tissues, with a range of effect sizes (Figure S4B-D). Selected eQTLs were scored by constructing the reference and alternative sequences for each eQTL, one-hot encoding each sequence, and parsing these through DNACipher, DNACipher-NT-Loc, Enformer, or DeepCT. All sequences were centred at the eQTL location and constructed to be the maximum size that could be inputted to each model: 196,608bp for DNACipher and Enformer, 5994bp for DNACipher-NT-Loc, and 1000bp for DeepCT. The full set of possible predictions for each model were determined for each sequence, corresponding to 38,582 experiments for DNACipher and DNACipher-NT-Loc, 5313 for Enformer, and 31,800 for DeepCT. For models that utilize sequence embeddings across the inputted DNA sequence (DNACipher, Enformer, DNACipher-NT-Loc), variant effect sizes were determined by summing the absolute difference between the signals predicted along the reference and alternative sequences for each experiment ( equation 3 ). For DeepCT which only provides a single signal estimate in the centre sequence for each experiment, the difference between the reference and alternative sequence signals was used as the variant effect estimate. The 49 GTEx tissues were then manually matched to the most similar tissue represented in the experimental outputs for each model. For Enformer, since there is inconsistent tissue metadata associated with each experiment on account of multiple data sources, a consistent tissue ontology was first determined. For Enformer output experiments from ENCODE (4435/5313), ENCODE metadata was used to determine tissue labels equivalently to the DNACipher data. The remaining 878 experiments corresponded to FANTOM5 CAGE-seq 72 or various other experiments 73 – 75 available from the Gene Expression Omnibus (GEO). For the GEO experiments, the tissue types were determined by extracting the ‘characteristics_ch1’ tag from the GEO metadata using GEOParse ( https://github.com/guma44/GEOparse?tab=readme-ov-file ). Tissue labels for the FANTOM5 data were extracted directly from the Enformer description associated with each experiment by removing the assay, pooling, age, and sex information from the experiment description. For each eQTL, predicted effects specific to the matched tissue were then extracted. Within each assay type, tissue-specific causal- and non-causal eQTL predicted effects were then compared using a two-side independent t-test. P-values across assay types were then adjusted using the Benajamini-Hochberg false-discovery-rate correction 76 . Assays with an adjusted p-value < 0.05 were taken as having significantly different predicted effects between causal- and non-causal eQTLs for each model. Deep Variant Impact Mapping variant allocation We downloaded summary statistics of variant associations with T1D, FINEMAP posterior probabilities of variant causality for GWAS signals, and variant chromatin accessibility peak annotations from the supplemental data provided by Chiou, et al . 33 . For each of the 136 independent GWAS signals, variants were stratified as background, candidate common variants, candidate rare variants, and other rare variants. Variants within ±57kb from the GWAS signal center were considered relevant for testing, since this falls within the 114kb range where signal predictions can be made with DNACipher. Candidate common variants were selected as variants with a nominal p-value of association with T1D less than 5 × 10 −7 . Candidate rare variants were taken as any variant with MAF < 5% that were within the same genomic region as the selected candidate common variants (defined as the minimum and maximum positions of the selected candidate common variants). Background variants were variants which did not meet the criteria of candidate common or rare variants, that were non-significant (nominal p-value > 0.001), and had a MAF > 5%. Because these initial criteria resulted in some of the GWAS signals having few background variants, if less than 100 background variants were selected with these criteria, the top 100 non-significant variants with the highest MAFs were used as the background variants instead. This resulted in a total of 197,148 variants selected across the 136 T1D signals, with all remaining variants within ±57kb from the GWAS signals considered ‘other rare variants’ that were not used for DVIM. Deep Variant Impact Mapping statistical testing For each of the 197,148 variants selected, we used DNACipher to predict the effect of the variants (scored as per equation 3 ) for T cells, B cells, natural killer cells, and pancreas. Predictions were made for a total of 9 assays, which included minus strand polyA plus RNA-seq, plus strand polyA plus RNA-seq, minus strand total RNA-seq, plus strand total RNA-seq, ATAC-seq, H3K27ac ChIP-seq, H2BK20ac ChIP-seq, H3K27me3 ChIP-seq, and CTCF ChIP-seq. To ensure the variant effects at each GWAS signal were comparable, variants associated with each GWAS signal had fixed query sequence locations so that each variant was compared against the same reference sequence (hg38). The reference sequence for each signal was fixed to the centre of the signal as defined by Chiou, et al . 33 , which is the location of the most likely causal variant (highest PPA) from FINEMAP for each signal. Because the original study utilized hg19 mapped variants and lifted over all coordinates to hg38, we replaced the reference allele with the corresponding hg19 nucleotide in the cases where the hg19 and hg38 reference sequences did not match. The predicted effect of the alternate alleles of variants was scored as per equation 3 . In summary, we predicted variant effects for 36 different cell type and assay combinations for each of the 197,148 variants across 136 T1D GWAS signals. This generated 7.097 million variant effect predictions. Within each T1D GWAS signal, we tested whether the predicted effect of each candidate (common or rare) variant differed significantly from a null distribution of variant effects. To account for uncertainty in the estimated null distribution due to a limited number of background variants 77 , 78 , we performed bootstrap resampling of the background variant effects. For each molecular context—defined as a cell type–assay combination m —we resampled the predicted effects of background variants B times ( B = 1,000,000). In each bootstrap iteration b , we calculated the mean ( μ bm ) and standard deviation (σ bm ) of the resampled background effects, enforcing a minimum standard deviation of 0.01 to prevent spurious significance from zero-variance. We then counted the number of bootstraps in which each candidate variant’s predicted effect was not statistically significant ( N vm ) when compared to the bootstrapped null using a two-sided one-sample z-test ( equation 7 ). The critical z-score threshold ( z crit ) was derived using a Bonferroni correction for the total number of variant effects tested within the GWAS signal ( equation 8 ). Where Φ −1 is the inverse cumulative density function of the normal distribution, 0.05 is the un-adjusted critical p-value, V is the total number of variants tested at the GWAS signal, T is the total number of cell types tested, A is the total number of assays tested, and the 2 is because it is a two-sided test. We then estimated a p-value per variant effect that considered the uncertainty in the null distribution 77 ( p vm ), as the proportion of times a given candidate variant effect ( vm ) was non-significant across the boot-strapped null distributions ( equation 9 , pseudocounts added to prevent 0 p-values). To determine the number of GWAS signals with impact variants, while accounting for multiple hypothesis testing, we took a hierarchical approach typical for eQTL studies 79 . For each GWAS signal, we recorded the minimum observed Bonferroni-corrected p-value as the p-value for that signal, and then adjusted p-values across signals using the Benjamini-Hochberg (BH) False Discovery Rate method. GWAS signals with an adjusted BH p-value less than 0.05 were considered as having at least one impact variant. This was repeated for rare and common variants separately (Figure S6), and also for variants that intersected the credible set variants ( Figure 6C ), to determine the number of signals containing impact variants of each type. For the RUNX3 example shown ( Figure 6A-B ), we predicted variant effects for a larger number of cell type contexts initially using the same approach described above. Specifically we predicted variant effects in a total of 18 cell types (all PBMC derived cell types and pancreas) and 6 assays (ATAC-seq, DNase-seq, plus strand polyA minus RNA-seq, plus strand polyA plus RNA-seq, plus strand total RNA-seq, and signal of polyA plus RNA-seq). The statistical testing approach was the same, except we used B =10,000 and also applied an absolute log-fold-change cutoff above the mean background variant effect sizes of 2 for a particular effect to be considered significant. We found this had negligible effect on which variants were called as impact variants at this locus (Supp. Material 2). Deep Variant Impact Mapping comparison with statistical fine mapping After calling impact variants at each GWAS signal, we intersected them with the FINEMAP credible set variants for each signal provided by Chiou, et al . 33 . To determine if the impact variants were more likely to be causal, we calculated an empirical p-value for the observed mean FINEMAP posterior probability (PPA) for the impact variants. We randomly sampled the same number of credible set variants from each GWAS signal as the number of called impact variants at each signal, and took the mean PPA of these randomly selected variants . We repeated this 10,000 times ( R =10,000) to generate a random background distribution of mean PPAs. We then calculated the empirical p-value as the number of random mean PPA values greater than or equal to the observed mean PPA of the impact variants, divided by the number of random selections performed ( R ) ( equation 10 ). Deep Variant Impact Mapping comparison with independent snATAC-seq data To evaluate whether the predicted cell type-specific variant effects were consistent with functional elements in the same cell types, we examined the overlap between impact variants and chromatin accessibility annotations from Chiou et al . 33 . These variant annotations included cell types determined from PBMCs and pancreatic snATAC-seq. Because DNACipher does not represent the cell types at the same level of resolution as the snATAC-seq data, we collapsed cell types into broader categories including pancreas, T cells, B cells, and natural killer cells. We then constructed contingency tables to represent the intersection between predicted variant effects and overlap with accessible chromatin in the same cell types. The contingency tables were constructed across all variants at all GWAS signals, counting the number of variants in the following categories: accessible in cell type and predicted to affect the cell type by the given assay, accessible in cell type and not predicted to affect the cell type by the given assay, not accessible and predicted to affect the cell type by the given assay, and not accessible in the cell type and not predicted to affect the cell type by the given assay (Table S9). We then performed Fisher’s exact test to determine if variants predicted to affect particular cell types were enriched in accessible chromatin of the corresponding cell types. The p-values were adjusted for multiple hypothesis testing using the Bonferroni method (Table S9). Luciferase assays We tested three variants for enhancer activity using luciferase assays. We previously published the luciferase assay corresponding to impact variant rs72928038 80 . The assays for impact variant rs6908626 and non-impact variant rs17574546 are new to this study (Table S10). We cloned short sequences containing the reference alleles for rs72928038, rs6908626, and rs17574546 and alternate alleles for rs6908626, rs17574546 from human DNA samples (Coriell) into the luciferase reporter vector pGL4.23 (Promega) in the forward direction with restriction enzymes SacI-HF (NEB) and KpnI-HF (NEB). The alternate allele for rs72928038 was introduced via site-directed mutagenesis using the Q5 SDM kit (NEB). Plasmids were prepared from competent DH5alpha E.coli (NEB) using the Qiaspin Miniprep Kit (Qiagen) and sequences were verified via Sanger sequencing. Sequences used for cloning and Sanger sequencing are listed in Table S10. Jurkat cells from ATCC were maintained in culture according to manufacturer’s recommendations at a concentration of 1×10E5/mL-1×10E6/mL and tested every 10 passages to confirm lack of mycoplasma contamination. Approximately 0.5×10E6 cells per replicate (3 replicates) were co-transfected with 500 ng of firefly luciferase vector containing either the reference or alternate allele or an empty pGL4.23 vector as a control, and 50 ng pRL-SV40 Renilla luciferase vector (Promega), using the Lipofectamine LTX reagent. Media was refreshed 24 hours after transfection with standard culture media containing 30U/mL IL-2 and Human T-Activator CD3/CD28 Dynabeads (Invitrogen) at a 1:1 ratio with cell counts where indicated. Cells were collected 48 hours post transfection and assayed using the Dual-Luciferase Reporter system (Promega). Firefly activity was normalized to the Renilla activity and expressed as fold change compared to the luciferase activity of the empty vector (RLU). A two-sided t-test was used to compare the luciferase activity between the two alleles in the stimulated and unstimulated treatment conditions, and separately to compare normalized enhancer activity for each allele in the stimulated and unstimulated conditions. Rare impact variant comparison with the Rare Variant Association Repository RAVAR 42 variant-trait associations were downloaded from http://www.ravar.bio/api/download/static/snp_fulltable.txt . RAVAR variants were then intersected with all DVIM tested rare variants at the T1D GWAS hit loci by the position of the genetic variants, since not all of the rare variants had associated reference SNP cluster identifiers (RSIDs). We then constructed a contingency matrix, representing the overlap of impact variants and non-impact variants with RAVAR variants with previously-determined trait associations (Table S11). We then calculated odds ratios, confidence intervals, and performed Fisher’s exact test using Scipy 81 . Data and code availability All data utilised are publicly available. ENCODE 26 data tracks used (Table S1) are publicly available at https://www.encodeproject.org/ . GTEx 23 v8 data utilised is available at https://gtexportal.org/home/ . Independent validation ATAC-seq data 33 is available via the Gene Expression Omnibus at accession GSE163160. DNACipher and DVIM are available with a Python interface and command-line tool at https://github.com/BradBalderson/DNACipher . DNACipher variant effect inference is also available via Google collab ( https://colab.research.google.com/gist/BradBalderson/c4389baa0d789314259b8479cfd35747/dnacipher_inference_local.ipynb ). DVIM is also available via Google Collab ( https://colab.research.google.com/drive/17GiWLt_SigpVa6hl6A9yP_edM4IcQeEy?usp=sharing#scrollTo=1plovJ7slx0r ). Contributions BB conceived the project, secured funding, designed the approach and analysis, developed software, performed benchmarking, made figures, and wrote the manuscript. ST consulted on DNACipher training strategy, implemented determination of Enformer embeddings, benchmarked DeepCT for the eQTL analysis, edited the manuscript, and tested software. MO and SC designed, performed, and analyzed the luciferase assay experiments in Jurkat cells. WJFR benchmarked Enformer for the causal eQTL analysis, edited the manuscript, and tested software. JJ edited the manuscript and tested software. NP edited the manuscript and secured funding. KJG secured funding and supervised the luciferase experiments. MB conceived the project, supervised the project, secured funding, and edited the manuscript. GM conceived the project, supervised the project, secured funding, and edited the manuscript. Acknowledgements BB was supported by a Hewitt Foundation Fellowship and Ekhart Scholarship, as well as a Medical Research Future Fund (MRFF) grant to NP and MB (grant number 2016033). ST was also supported by the same MRFF grant. MO was supported by NIGMS/NIH T32 award GM008666. WJFR was supported by an Australian Research Training Program (RTP) scholarship. JJ was supported by NIH F31 fellowship HG013262. KJG was supported by NIH grants HG012059 and DK138512, and a grant from the Larry L Hillblom Foundation. GM was supported by NIH grant HG011315 and a grant from the Larry L Hillblom Foundation. We thank Qiongyi Zhao, Weston Elison, Chris Duroiu, Han Chen, and David Laub for testing the DNACipher software and providing feedback. Funder Information Declared Hewitt Foundation Fellowship and Ekhart Scholarship National Institutes of Health , GM008666 , HG013262 , HG012059 , DK138512 , HG011315 Medical Research Future Fund , 2016033 Footnotes https://github.com/BradBalderson/DNACipher References 1. ↵ McCarthy , M. I. et al. Genome-wide association studies for complex traits: consensus, uncertainty and challenges . Nat. Rev. Genet . 9 , 356 – 369 ( 2008 ). OpenUrl CrossRef PubMed Web of Science 2. ↵ Zou , Y. , Carbonetto , P. , Wang , G. & Stephens , M. Fine-mapping from summary data with the ‘Sum of Single Effects’ model . PLoS Genet . 18 , e1010299 ( 2022 ). OpenUrl CrossRef PubMed 3. ↵ Dong , S. et al. Annotating and prioritizing human non-coding variants with RegulomeDB v.2 . Nat. Genet . 55 , 724 – 726 ( 2023 ). OpenUrl CrossRef PubMed 4. Dong , S. & Boyle , A. P. Predicting functional variants in enhancer and promoter elements using RegulomeDB: DONG and BOYLE . Hum. Mutat . 40 , 1292 – 1298 ( 2019 ). OpenUrl CrossRef PubMed 5. Ritchie , G. R. S. , Dunham , I. , Zeggini , E. & Flicek , P. Functional annotation of noncoding sequence variants . Nat. Methods 11 , 294 – 296 ( 2014 ). OpenUrl CrossRef PubMed Web of Science 6. Rentzsch , P. , Witten , D. , Cooper , G. M. , Shendure , J. & Kircher , M. CADD: predicting the deleteriousness of variants throughout the human genome . Nucleic Acids Res . 47 , D886 – D894 ( 2019 ). OpenUrl CrossRef PubMed 7. Rentzsch , P. , Schubach , M. , Shendure , J. & Kircher , M. CADD-Splice-improving genome-wide variant effect prediction using deep learning-derived splice scores . Genome Med . 13 , 31 ( 2021 ). OpenUrl CrossRef PubMed 8. Gao , H. et al. The landscape of tolerated genetic variation in humans and primates . Science 380 , eabn8153 ( 2023 ). OpenUrl 9. ↵ Frazer , J. et al. Disease variant prediction with deep generative models of evolutionary data . Nature 599 , 91 – 95 ( 2021 ). OpenUrl CrossRef PubMed 10. ↵ McQuerry , J. A. et al. Massively parallel identification of functionally consequential noncoding genetic variants in undiagnosed rare disease patients . Sci. Rep . 12 , 7576 ( 2022 ). OpenUrl CrossRef PubMed 11. ↵ Liu , L. et al. Biological relevance of computationally predicted pathogenicity of noncoding variants . Nat. Commun . 10 , 330 ( 2019 ). OpenUrl CrossRef PubMed 12. ↵ Zhou , J. & Troyanskaya , O. G. Predicting effects of noncoding variants with deep learning-based sequence model . Nat. Methods 12 , 931 – 934 ( 2015 ). OpenUrl CrossRef PubMed 13. Kelley , D. R. , Snoek , J. & Rinn , J. L. Basset: learning the regulatory code of the accessible genome with deep convolutional neural networks . Genome Res . 26 , 990 – 999 ( 2016 ). OpenUrl Abstract / FREE Full Text 14. Zhou , J. et al. Deep learning sequence-based ab initio prediction of variant effects on expression and disease risk . Nat. Genet . 50 , 1171 – 1179 ( 2018 ). OpenUrl CrossRef PubMed 15. ↵ Kelley , D. R. et al. Sequential regulatory activity prediction across chromosomes with convolutional neural networks . Genome Res . 28 , 739 – 750 ( 2018 ). OpenUrl Abstract / FREE Full Text 16. Kelley , D. R. Cross-species regulatory sequence activity prediction . PLoS Comput. Biol . 16 , e1008050 ( 2020 ). OpenUrl CrossRef PubMed 17. ↵ Avsec , Ž. et al. Effective gene expression prediction from sequence by integrating long-range interactions . Nat. Methods 18 , 1196 – 1203 ( 2021 ). OpenUrl CrossRef PubMed 18. Chen , K. M. , Wong , A. K. , Troyanskaya , O. G. & Zhou , J. A sequence-based global map of regulatory activity for deciphering human genetics . Nat. Genet . 54 , 940 – 949 ( 2022 ). OpenUrl CrossRef PubMed 19. ↵ Linder , J. , Srivastava , D. , Yuan , H. , Agarwal , V. & Kelley , D. R. Predicting RNA-seq coverage from DNA sequence as a unifying model of gene regulation . Nat. Genet . ( 2025 ) doi: 10.1038/s41588-024-02053-6 . OpenUrl CrossRef 20. ↵ Wesolowska-Andersen , A. et al. Deep learning models predict regulatory variants in pancreatic islets and refine type 2 diabetes association signals . Elife 9 , ( 2020 ). 21. ↵ Lee , P. H. et al. Principles and methods of in-silico prioritization of non-coding regulatory variants . Hum. Genet . 137 , 15 – 30 ( 2018 ). OpenUrl CrossRef PubMed 22. ↵ Smemo , S. et al. Obesity-associated variants within FTO form long-range functional connections with IRX3 . Nature 507 , 371 – 375 ( 2014 ). OpenUrl CrossRef PubMed Web of Science 23. ↵ GTEx Consortium . The GTEx Consortium atlas of genetic regulatory effects across human tissues . Science 369 , 1318 – 1330 ( 2020 ). OpenUrl Abstract / FREE Full Text 24. ↵ Schreiber , J. , Durham , T. , Bilmes , J. & Noble , W. S. Avocado: a multi-scale deep tensor factorization method learns a latent representation of the human epigenome . Genome Biol . 21 , 81 ( 2020 ). OpenUrl CrossRef PubMed 25. ↵ Boix , C. A. , James , B. T. , Park , Y. P. , Meuleman , W. & Kellis , M. Regulatory genomic circuitry of human disease loci by integrative epigenomics . Nature 590 , 300 – 307 ( 2021 ). OpenUrl CrossRef PubMed 26. ↵ ENCODE Project Consortium et al . Expanded encyclopaedias of DNA elements in the human and mouse genomes . Nature 583 , 699 – 710 ( 2020 ). OpenUrl CrossRef PubMed 27. ↵ Ernst , J. & Kellis , M. Large-scale imputation of epigenomic datasets for systematic annotation of diverse human tissues . Nat. Biotechnol . 33 , 364 – 376 ( 2015 ). OpenUrl CrossRef PubMed 28. ↵ Gao , Z. , Liu , Q. , Zeng , W. , Jiang , R. & Wong , W. H. EpiGePT: a pretrained transformer-based language model for context-specific human epigenomics . Genome Biol . 25 , 310 ( 2024 ). OpenUrl CrossRef PubMed 29. ↵ Murphy , A. E. , Beardall , W. , Rei , M. , Phuycharoen , M. & Skene , N. G. Predicting cell type-specific epigenomic profiles accounting for distal genetic effects . Nat. Commun . 15 , 9951 ( 2024 ). OpenUrl CrossRef PubMed 30. ↵ Sindeeva , M. et al. Cell type-specific interpretation of noncoding variants using deep learning-based methods . Gigascience 12 , ( 2023 ). 31. ↵ Hermann , L. , Fiedler , T. , Nguyen , H. A. , Nowicka , M. & Bartoszewicz , J. M. Beware of data leakage from protein LLM pretraining . bioRxiv 261 , 106 – 116 ( 2024 ). OpenUrl 32. ↵ Dalla-Torre , H. et al. Nucleotide Transformer: building and evaluating robust foundation models for human genomics . Nat. Methods 22 , 287 – 297 ( 2025 ). OpenUrl CrossRef PubMed 33. ↵ Chiou , J. et al. Interpreting type 1 diabetes risk with genetics and single-cell epigenomics . Nature 594 , 398 – 402 ( 2021 ). OpenUrl CrossRef PubMed 34. ↵ Narita , T. et al. Acetylation of histone H2B marks active enhancers and predicts CBP/p300 target genes . Nat. Genet . 55 , 679 – 692 ( 2023 ). OpenUrl CrossRef PubMed 35. ↵ Huang , C. et al. Personal transcriptome variation is poorly explained by current genomic deep learning models . Nat. Genet . 55 , 2056 – 2059 ( 2023 ). OpenUrl CrossRef PubMed 36. ↵ Sasse , A. et al. Benchmarking of deep neural networks for predicting personal gene expression from DNA sequence highlights shortcomings . Nat. Genet . 55 , 2060 – 2064 ( 2023 ). OpenUrl CrossRef PubMed 37. ↵ Hingerl , J. C. et al. scooby: Modeling multi-modal genomic profiles from DNA sequence at single-cell resolution . bioRxivorg ( 2024 ) doi: 10.1101/2024.09.19.613754 . OpenUrl Abstract / FREE Full Text 38. ↵ Pagadala , M. et al. Germline modifiers of the tumor immune microenvironment implicate drivers of cancer risk and immunotherapy response . Nat. Commun . 14 , 2744 ( 2023 ). OpenUrl CrossRef PubMed 39. ↵ Liu , H. et al. Epigenome-augmented eQTL-hotspots reveal genome-wide transcriptional programs in 36 human tissues . Brief. Bioinform . 25 , ( 2024 ). 40. ↵ Quintana-Murci , L. Understanding rare and common diseases in the context of human evolution . Genome Biol . 17 , 225 ( 2016 ). OpenUrl CrossRef PubMed 41. ↵ Fiziev , P. P. et al. Rare penetrant mutations confer severe risk of common diseases . Science 380 , eabo1131 ( 2023 ). OpenUrl CrossRef PubMed 42. ↵ Cao , C. et al. RAVAR: a curated repository for rare variant-trait associations . Nucleic Acids Res . 52 , D990 – D997 ( 2024 ). OpenUrl CrossRef PubMed 43. ↵ Barton , A. R. , Sherman , M. A. , Mukamel , R. E. & Loh , P.-R. Whole-exome imputation within UK Biobank powers rare coding variant association and fine-mapping analyses . Nat. Genet . 53 , 1260 – 1269 ( 2021 ). OpenUrl CrossRef PubMed 44. ↵ Vuckovic , D. et al. The polygenic and monogenic basis of blood traits and diseases . Cell 182 , 1214 – 1231 .e11 ( 2020 ). OpenUrl CrossRef PubMed 45. ↵ Astle , W. J. et al. The Allelic landscape of human blood cell trait variation and links to common complex disease . Cell 167 , 1415 – 1429 .e19 ( 2016 ). OpenUrl CrossRef PubMed 46. ↵ Pietzner , M. et al. Mapping the proteo-genomic convergence of human diseases . Science 374 , eabj1541 ( 2021 ). OpenUrl CrossRef PubMed 47. ↵ Taliun , D. et al. Sequencing of 53,831 diverse genomes from the NHLBI TOPMed Program . Nature 590 , 290 – 299 ( 2021 ). OpenUrl CrossRef PubMed 48. ↵ Klein , J. C. et al. A systematic evaluation of the design and context dependencies of massively parallel reporter assays . Nat. Methods 17 , 1083 – 1091 ( 2020 ). OpenUrl CrossRef PubMed 49. ↵ Mouri , K. et al. Prioritization of autoimmune disease-associated genetic variants that perturb regulatory element activity in T cells . Nat. Genet . 54 , 603 – 612 ( 2022 ). OpenUrl CrossRef PubMed 50. ↵ Nair , S. , Kim , D. S. , Perricone , J. & Kundaje , A. Integrating regulatory DNA sequence and gene expression to predict genome-wide chromatin accessibility across cellular contexts . Bioinformatics 35 , i108 – i116 ( 2019 ). OpenUrl CrossRef PubMed 51. ↵ Ng , A. H. M. et al. A comprehensive library of human transcription factors for cell fate engineering . Nat. Biotechnol . 39 , 510 – 519 ( 2021 ). OpenUrl CrossRef PubMed 52. ↵ Awdeh , A. , Turcotte , M. & Perkins , T. J. Identifying transcription factors with cell-type specific DNA binding signatures . BMC Genomics 25 , 957 ( 2024 ). OpenUrl CrossRef PubMed 53. ↵ Perez , A. A. et al. ChIP-DIP maps binding of hundreds of proteins to DNA simultaneously and identifies diverse gene regulatory elements . Nat. Genet . 56 , 2827 – 2841 ( 2024 ). OpenUrl CrossRef PubMed 54. ↵ Pampari , A. et al. ChromBPNet: bias factorized, base-resolution deep learning models of chromatin accessibility reveal cis-regulatory sequence syntax, transcription factor footprints and regulatory variants . bioRxivorg ( 2025 ) doi: 10.1101/2024.12.25.630221 . OpenUrl Abstract / FREE Full Text 55. ↵ Hu , Y. et al. Multiscale footprints reveal the organization of cis-regulatory elements . Nature 638 , 779 – 786 ( 2025 ). OpenUrl CrossRef PubMed 56. ↵ Zhang , R. et al. Multi-condition and multi-modal temporal profile inference during mouse embryonic development . bioRxivorg ( 2024 ) doi: 10.1101/2024.03.03.583179 . OpenUrl Abstract / FREE Full Text 57. ↵ Ji , Y. et al. Scalable and universal prediction of cellular phenotypes . bioRxiv ( 2024 ) doi: 10.1101/2024.08.12.607533 . OpenUrl Abstract / FREE Full Text 58. ↵ Umans , B. D. , Battle , A. & Gilad , Y. Where are the disease-associated eQTLs? Trends Genet . 37 , 109 – 124 ( 2021 ). OpenUrl CrossRef PubMed 59. ↵ Kim-Hellmuth , S. et al. Genetic regulatory effects modified by immune activation contribute to autoimmune disease associations . Nat. Commun . 8 , 266 ( 2017 ). OpenUrl CrossRef PubMed 60. ↵ Kent , W. J. , Zweig , A. S. , Barber , G. , Hinrichs , A. S. & Karolchik , D. BigWig and BigBed: enabling browsing of large distributed datasets . Bioinformatics 26 , 2204 – 2207 ( 2010 ). OpenUrl CrossRef PubMed Web of Science 61. ↵ Quinlan , A. R. & Hall , I. M. BEDTools: a flexible suite of utilities for comparing genomic features . Bioinformatics 26 , 841 – 842 ( 2010 ). OpenUrl CrossRef PubMed Web of Science 62. ↵ Amemiya , H. M. , Kundaje , A. & Boyle , A. P. The ENCODE blacklist: Identification of problematic regions of the genome . Sci. Rep . 9 , 9354 ( 2019 ). OpenUrl CrossRef PubMed 63. ↵ deeptools/pyBigWig: 0.3.19. 64. ↵ Paszke , A. et al. PyTorch: An imperative style, high-performance deep learning library . arXiv [cs.LG] ( 2019 ). 65. ↵ Ba , J. L. , Kiros , J. R. & Hinton , G. E. Layer Normalization . arXiv [stat.ML] ( 2016 ). 66. ↵ Kingma , D. P. & Ba , J. Adam: A method for stochastic optimization . arXiv [cs.LG] ( 2014 ). 67. ↵ Liaw , R. et al. Tune: A research platform for distributed model selection and training . arXiv [cs.LG] ( 2018 ). 68. ↵ Srivastava , N. , Hinton , G. , Krizhevsky , A. , Sutskever , I. & Salakhutdinov , R. Dropout: A Simple Way to Prevent Neural Networks from Overfitting . Journal of Machine Learning Research 15 , 1929 – 1958 ( 2014 ). OpenUrl 69. ↵ Wolf , F. A. , Angerer , P. & Theis , F. J. SCANPY: large-scale single-cell gene expression data analysis . Genome Biol . 19 , 15 ( 2018 ). OpenUrl CrossRef PubMed 70. ↵ Traag , V. A. , Waltman , L. & van Eck , N.J. From Louvain to Leiden: guaranteeing well-connected communities . Sci. Rep . 9 , 5233 ( 2019 ). OpenUrl CrossRef PubMed 71. ↵ O’Leary , N. A. et al. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation . Nucleic Acids Res . 44 , D733 – 45 ( 2016 ). OpenUrl CrossRef PubMed 72. ↵ Noguchi , S. et al. FANTOM5 CAGE profiles of human and mouse samples . Sci. Data 4 , 170112 ( 2017 ). OpenUrl CrossRef PubMed 73. ↵ Buenrostro , J. D. et al. Integrated single-cell analysis maps the continuous regulatory landscape of human hematopoietic differentiation . Cell 173 , 1535 – 1548 .e16 ( 2018 ). OpenUrl CrossRef PubMed 74. Yan , J. et al. Transcription factor binding in human cells occurs in dense clusters formed around cohesin anchor sites . Cell 154 , 801 – 813 ( 2013 ). OpenUrl CrossRef PubMed Web of Science 75. ↵ Cohen , D. M. et al. ATF4 licenses C/EBPβ activity in human mesenchymal stem cells primed for adipogenesis . Elife 4 , e06821 ( 2015 ). OpenUrl CrossRef PubMed 76. ↵ Benjamini , Y. , Drai , D. , Elmer , G. , Kafkafi , N. & Golani , I. Controlling the false discovery rate in behavior genetics research . Behav. Brain Res . 125 , 279 – 284 ( 2001 ). OpenUrl CrossRef PubMed Web of Science 77. ↵ Efron , B. Large-scale simultaneous hypothesis testing: The choice of a null hypothesis . J. Am. Stat. Assoc . 99 , 96 – 104 ( 2004 ). OpenUrl CrossRef Web of Science 78. ↵ Johansson , R. , Strålfors , P. & Cedersund , G. Combining test statistics and models in bootstrapped model rejection: it is a balancing act . BMC Syst. Biol . 8 , 46 ( 2014 ). OpenUrl CrossRef PubMed 79. ↵ Ongen , H. , Buil , A. , Brown , A. A. , Dermitzakis , E. T. & Delaneau , O. Fast and efficient QTL mapper for thousands of molecular phenotypes . Bioinformatics 32 , 1479 – 1485 ( 2016 ). OpenUrl CrossRef PubMed 80. ↵ Benaglio , P. et al. Mapping genetic effects on cell type-specific chromatin accessibility and annotating complex immune trait variants using single nucleus ATAC-seq in peripheral blood . PLoS Genet . 19 , e1010759 ( 2023 ). OpenUrl CrossRef PubMed 81. ↵ Virtanen , P. et al. SciPy 1.0: fundamental algorithms for scientific computing in Python . Nat. Methods 17 , 261 – 272 ( 2020 ). OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted June 06, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Comprehensive molecular impact mapping of common and rare variants at GWAS loci Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Comprehensive molecular impact mapping of common and rare variants at GWAS loci Brad Balderson , Sanjana Tule , Mei-Lin Okino , William JF Rieger , Sierra Corban , Jeff Jaureguy , Nathan Palpant , Kyle J. Gaulton , Mikael Bodén , Graham McVicker bioRxiv 2025.06.05.658079; doi: https://doi.org/10.1101/2025.06.05.658079 Share This Article: Copy Citation Tools Comprehensive molecular impact mapping of common and rare variants at GWAS loci Brad Balderson , Sanjana Tule , Mei-Lin Okino , William JF Rieger , Sierra Corban , Jeff Jaureguy , Nathan Palpant , Kyle J. Gaulton , Mikael Bodén , Graham McVicker bioRxiv 2025.06.05.658079; doi: https://doi.org/10.1101/2025.06.05.658079 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetics Subject Areas All Articles Animal Behavior and Cognition (7642) Biochemistry (17715) Bioengineering (13907) Bioinformatics (42005) Biophysics (21472) Cancer Biology (18624) Cell Biology (25534) Clinical Trials (138) Developmental Biology (13391) Ecology (19935) Epidemiology (2067) Evolutionary Biology (24356) Genetics (15617) Genomics (22529) Immunology (17753) Microbiology (40437) Molecular Biology (17200) Neuroscience (88697) Paleontology (667) Pathology (2840) Pharmacology and Toxicology (4829) Physiology (7653) Plant Biology (15171) Scientific Communication and Education (2046) Synthetic Biology (4304) Systems Biology (9827) Zoology (2272)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.