Multiple instance fine-mapping: predicting causal regulatory variants with a deep sequence model

doi:10.1101/2025.06.13.25329551

Multiple instance fine-mapping: predicting causal regulatory variants with a deep sequence model

2025 · doi:10.1101/2025.06.13.25329551

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 61,378 characters · extracted from preprint-html · click to expand

Multiple instance fine-mapping: predicting causal regulatory variants with a deep sequence model | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Multiple instance fine-mapping: predicting causal regulatory variants with a deep sequence model View ORCID Profile Alexander Rakowski , View ORCID Profile Christoph Lippert doi: https://doi.org/10.1101/2025.06.13.25329551 Alexander Rakowski 1 Digital Health Machine Learning, Hasso Plattner Institute for Digital Engineering , Potsdam, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alexander Rakowski For correspondence: alexander.rakowski{at}hpi.de Christoph Lippert 1 Digital Health Machine Learning, Hasso Plattner Institute for Digital Engineering , Potsdam, Germany 2 Hasso Plattner Institute for Digital Health at Mount Sinai , New York, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Christoph Lippert Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Identifying causal genetic variants in a computational manner remains an open problem. Training end-to-end prediction models is not possible without large ground-truth datasets, while results of genome-wide association studies (GWAS) are entangled by linkage disequilibrium (LD), and gene expression datasets do not contain genetic variation at individual-level. Here, we propose Multiple Instance Fine-mapping (MIFM) – a multiple instance learning (MIL) objective to overcome the lack of strong labels by grouping putatively causal variants together based on their LD scores. Using MIFM, we trained a deep classifier on a dataset aggregating over 13, 000 GWAS to predict causal variants based on their underlying DNA sequences. We validated variants prioritized by MIFM by constructing polygenic risk scores which transferred better to different target ancestries. Furthermore, we demonstrated how MIFM can be used to disentangle effect sizes of highly-correlated variants to better fine-map GWAS results. Author summary Genome-wide association studies have identified tens of thousands genetic variants associated with traits or diseases. However, the majority of identified variants is only spuriously correlated with the phenotype of interest, having no causal effect on it. Instead, these variants are often inherited together with nearby biologically causal variants, thus creating the spurious associations. Fine-mapping, i.e., predicting which variants are causal, is crucial for downstream tasks, such as uncovering the biological mechanisms affecting the phenotype or robustly identifying individuals with high genetic risk of a disease. While most fine-mapping methods are based on the available association statistics or functional annotations of genetic regions, it should be possible to identify causal variants based on their neighboring DNA sequences. However, training a standard machine learning classifier for that task is obstructed by the scarcity of strong, ground-truth labels. Here, we proposed a method to train sequence models predicting variant causality using weakly-labeled data. We trained a model on a large set of associated variants, and demonstrated its utility by improving cross-ancestry predictions of genetic risk, or disentangling the effect sizes of highly correlated variants. 2 Introduction Genome-wide association studies (GWAS) remain a powerful tool for identifying genetic variants associated with phenotypes or diseases, with recent studies detecting up to thousands of associations per trait. However, while one usually assumes a clear genotype → phenotype causal direction, only a small fraction of variants significant in a GWAS are expected to be truly causal [ 46 ]. Due to linkage disequilibrium (LD), single nucleotide polymorphisms (SNPs) in proximity to causal variants become associated with the trait and can even have lower p-values than the causal SNPs. Identifying the true causal variants is important for understanding the underlying mechanisms such as transcription factor (TF) binding and for making robust predictions in populations with different LD structures than the one where the GWAS was performed. Without in-vivo experimental validation, one can employ computational fine-mapping methods, which aim to narrow down the set of putative causal variants from GWAS summary statistics. The simplest approach is to test the significance of all candidate SNPs in a joint model. While yielding unbiased estimates of the true effect sizes, it is not feasible in scenarios with large numbers of variants or strong LD. More advanced methods utilize the Bayesian framework to estimate credible sets of variants given prior knowledge on the distribution of effect sizes [ 22 , 6 , 51 ], optionally incorporating functional annotations as additional priors [ 26 , 11 , 55 ]. While powerful, the above methods require selecting hyperparameters, such as the assumed number of causal variants, rely on availability of functional annotations for the regions of interest, and are sensitive to LD patterns, yielding large credible sets for strongly correlated variants. Another approach is to use machine learning (ML) prediction models to assign a score to each variant as a proxy of its likelihood of being causal. A common choice are deep neural networks (DNNs) trained to predict functional genomic annotations or gene expression values from DNA, with architectures typically based on a convolutional neural network (CNN) [ 59 , 25 , 3 ] or a transformer backbone [ 2 ]. The difference in predictions for a sequence with the reference allele and a sequence with the alternative allele is then taken as a measure of the potential causality of a variant. As opposed to the statistical methods, the ML-based approaches are independent of GWAS results or the LD structure. However, while they take DNA data at base pair level resolution as inputs, they are typically trained on reference genome data, which limits the SNP-level variability of DNA motifs to ones present at population-level. Furthermore, the performance of such models can be hindered by coarseness and noise of the labels [ 29 , 8 ]. Here, we introduce Multiple Instance Fine-mapping (MIFM), a framework for training deep learning models to predict the causality of non-coding variants directly from the underlying DNA sequence, without the need for summary statistics or functional annotations at test time. We circumvent the lack of ground-truth labels at SNP resolution by formulating the training objective as a multiple instance learning (MIL) problem, where putatively causal variants in LD with each other are grouped together to form a single, weakly-labeled positive example, and fit the model on a dataset of more than 2 million associated variants from over 13, 000 studies. We demonstrate the robustness of MIFM-prioritized variants by creating polygenic scores (PGS) of 20 traits, which transferred better from European to non-European ancestries, compared to variants selected using existing fine-mapping methods. Furthermore, we show how MIFM can be used to detect additional signals in analyses of GWAS results by prioritizing variants for joint tests, even in strongly-correlated cases. Finally, we report the results of model analysis which revealed enrichment of regulatory elements, existing and putatively novel TF motifs, as well as context-dependent mechanisms. The corresponding code as well as the trained model used in our experiments are available at github.com/HealthML/multiple-instance-fine-mapping 3 Results 3.1 Overview of the method We proposed Multiple Instance Fine-mapping (MIFM), a framework for training models prioritizing causal genetic variants with the multiple instance learning (MIL) paradigm, using GWAS associations as training data ( Figure 1 ). Our goal was to obtain a classifier predicting the probability of a variant being causal, given the DNA sequence around it. To train such a model in the standard supervised manner one would need ground-truth labels for each individual instance (variant). However, variants discovered with GWAS typically contain a large number of false positives due to LD between SNPs. To circumvent the lack of ground-truth labels, we trained a model to classify bags of instances (LD blocks of variants), instead of individual SNPs. We constructed the training dataset by selecting significant variants from a large set of GWAS results and grouping SNPs in LD with each other into positive bags. Conversely, we created negative examples by selecting common variants from the human reference genome which were not significantly associated in any of the GWAS. During training, the model makes instance-level predictions for each SNP within an LD block, which are then pooled by selecting the highest score as the bag-level prediction. Once trained, the pooling operation is discarded, and the instance-level model can be used to make predictions for individual variants. See Section 5.1 for a detailed description of the method, and Section 5.2 regarding dataset construction. Download figure Open in new tab Figure 1: Overview of Multiple Instance Fine-mapping (MIFM) – a framework for training variant prioritization models. We frame the task of identifying causal variants as a multiple instance learning (MIL) problem, where loci of GWAS-associated SNPs are grouped to form positive “bags” for the MIL algorithm, to overcome the lack of instance-level (per-variant) labels. We assume that at each positive bag contains at least one causal variant, while negative bags contain none. Dataset creation: We construct the training dataset using a large set of GWAS results, by selecting SNPs significantly associated in any study (marked in red). Since we do not have variant-level labels, we treat whole LD-blocks of associated SNPs as positive bags. Conversely, we construct the set of negative examples (marked in blue) by selecting the remaining, non-significant variants, to form single-element negative bags. Model training: we train a prediction model, e.g., a deep neural network, to classify the MIL bags, based on the underlying DNA sequences of the variants. The model first makes separate predictions for each element in the bag, which are then aggregated using the max operator to yield a single bag-level prediction. After the model is trained, we can discard the max operation and predict the causality of single variants. 3.2 Polygenic risk scores created with MIFM transfer better to non-European ancestries The predictive performance of PGS can decrease when applied to populations different from the one where the GWAS summary statistics were obtained from [ 33 , 13 , 32 ]. Due to varying LD patterns across ancestries, SNPs associated with the phenotype in the GWAS population might not be tagging the causal variants in the target population. As most studies are biased towards European populations [ ? , 47, 16], this can increase health disparities, e.g., by failing to identify individuals at risk in minority ancestries [ 34 ]. On the other hand, there is evidence for causal variants and their effect sizes being consistent across ancestries [ 54 , 42 , 23 ]. Thus, identifying causal variants should improve the cross-ancestry transferability of PGS. We created PGS by prioritizing variants using MIFM and 9 baseline methods, and evaluated their performance on non-European ancestries ( Section 5.3 ). Each method was evaluated for 20 traits and 5 ancestries, yielding a total of 100 scenarios per model. Within each scenario, we compared the performance of PGS created with MIFM and each baseline, and counted the total number of scenarios where MIFM would perform significantly better or worse than a baseline ( Figure 2 ). Overall, MIFM performed better in 19% and worse in 5% of all scenarios. The net number of scenarios where MIFM performed better was positive regardless of the baseline, ranging from a net difference of 5% for ABF and Enformer, to 10% for Polyfun FINEMAP. The smallest improvements were obtained for the East Asian (EAS) ancestry (6% of scenarios better, 4% worse), while the largest improvements were for the African (AFR) ancestry (17% better, 2% worse) (Supplementary Figure 1). With respect to individual traits, MIFM performed the worst for breast cancer (4% of times better, 16% worse) and prostate cancer (11% worse), while the most consistent improvements were for serum urate levels (91% better) and coronary artery disease (CAD) (24% better) (Supplementary Figure 2). Download figure Open in new tab Figure 2: Performance comparison of PGS created with MIFM and 9 baseline methods on 5 non-European ancestries and 20 traits. We created PGS using results from 20 GWAS performed on European samples and evaluated them on 5 non-European samples, yielding 100 test scenarios per model. For each baseline, we counted the number of scenarios where MIFM would perform better than the baseline (in green), worse (in red), or not significantly different (in gray). 3.3 MIFM enables discovery of additional GWAS signals Joint regression models of multiple variants can estimate the causal effect sizes instead of the marginal ones [ 56 ]. However, in the presence of a large number of highly correlated SNPs, large sample sizes are needed to disentangle the signals. Thus it is often infeasible to test all the putative variants jointly, especially for studies of modest sizes. We used MIFM to prioritize variants for conditional testing of highly-correlated ( R 2 ≥ 0.9) SNPs and compared the results with a naive approach of selecting all variants in high LD in 4 moderately sized GWAS in Table 1 . In each GWAS, the joint regression of MIFM-prioritized SNPs yielded a larger number of significant variants, yielding 47 variants in total, compared to 32 variants from the baseline models. 2 out of the 47 variants had significantly different effect size estimates in the larger model, and might be false positives. One of the variants identified by the MIFM joint model was above the significance threshold in the GWAS and would otherwise be undetected by the marginal effect size estimates. We also counted secondary signals, i.e., cases where more than 1 SNP was significant in a joint model, where MIFM identified two cases more than the baseline models ( Table 2 ). View this table: View inline View popup Download powerpoint Table 1: Number of variants identified by conditional analyses of GWAS results for selected phenotypes. We tested SNPs highly correlated with each lead SNP using a joint model of all variants in LD (2nd column) and of variants prioritized by p-values (3rd column) and by MIFM (4th column). In the 5th column we report the number of variants identified with MIFM whose effect size estimates were matching the estimates from the full model. View this table: View inline View popup Download powerpoint Table 2: Number of secondary signals identified by conditional analyses of GWAS results for selected phenotypes. We tested SNPs highly correlated with each lead SNP using a joint model of all variants in LD (2nd column) and of variants prioritized by p-values (3rd column) and by MIFM (4th column), and report the number of secondary signals, i.e., cases where more than one variant was significant in the joint model. 3.4 MIFM variants are enriched for enhancer, repressed, and silencer chromatin signatures To characterize variants prioritized by MIFM, we analyzed whether they enrich for regulatory elements compared to all putatively causal variants from CAUSALdb2. We computed the enrichment for 20 GenoSTAN-defined states [ 57 ], which we divided into 6 groups: enhancers, promoters, repressed regions, transcriptional elongations, repressed-enhancer regions, and low-signal regions ( Figure 3 ). MIFM-prioritized variants are enriched significantly for repressed regions (odds ratio (OR) from 1.02 for the lowest quantile to 1.13 for the highest quantile), repressed-enhancer regions (OR from 1.01 to 1.13), and the enhancer elements (OR from 1.01 to 1.10), with the highest enrichment for “strongly” defined enhancers subgroup (“Enh.6”, up to an OR of 1.16). Conversely, we observed a significant depletion of low-signal regions at higher model score quantiles (OR from 0.98 at the 0.7 quantile to 0.97 at the 0.9 quantile), and for transcription elongations (OR 0.99 for the 0.3 quantile to 0.98 for the 0.9 quantile), with the exceptions of “Gen5’.13” which was enriched for, with an OR up to 1.12. We did not observe significant deviations from 1 in the odds ratios for promoter regions. To understand why MIFM enriches for repressed regions, we further analyzed the repressed and repressed-enhancer variants prioritized by MIFM, and observed a significant enrichment for enhancer regions compared to all repressed and repressed-enhancer variants of CAUSALdb2 (up to 1.08 for repressed regions and up to 1.10 for repressed enhancer-like regions) (Supplementary Tables 1 and 2), indicating that repressed regions prioritized by MIFM are often active enhancers in other cell lines. Additionally, we analyzed repressed, repressed-enhancer, and enhancer regions in terms of silencers, and observed a significant enrichment in MIFM-prioritized repressed regions and in enhancer regions (up to 1.07 in both subsets), and no significant change in OR for repressed-enhancer regions (Supplementary Tables 3–5). The presence of silencers is associated with the H3K27me3 histone modification [ 9 ], which also characterizes the repressed GENOSTAN states, further suggesting that MIFM enriches for repressed regions with functional elements. Silencers can act as enhancers depending on the cellular context [ 19 , 36 , 12 ], and their enrichment in the enhancer subset can either indicate a preference of the model for such “dual” elements. Download figure Open in new tab Figure 3: Enrichment analysis of regulatory elements in variants prioritized by MIFM. We plot the odds ratios (y-axis) of variants with MIFM scores above each quantile (x-axis) versus all variants in the CAUSALdb2 data. Each plot represents a different regulatory group based on GenoSTAN [ 57 ] chromatin state annotations (plots a to f ) and silencerDB [ 58 ] validated and predicted silencer elements (plot g ). 3.5 Syntax analysis of a MIFM trained model We analyzed the trained MIFM model using Transcription-Factor Motif Discovery from Importance Scores (TF-MoDISco) and identified 161 unique DNA patterns in total, consisting of 67 patterns with positive attribution scores for models predictions and 94 patterns with negative attributions. The negative patterns had a smaller support in terms of corresponding seqlets (instances of similar patterns), with a median number of 798 seqlets compared to a median 1, 257 of seqlets for a positive pattern. In total, 60% of the positive patterns and 40% of the negative ones had a support of at least 1, 000 corresponding seqlet instances in the data. 20 positive and 49 negative patterns were significantly matched to at least one known human TF binding motif using TOMTOM [ 48 ]. TF motifs with the highest number of matched patterns are shown in Supplementary Table 6. Overall, we observed several motifs that were matched to both positive and negative patterns at the same time. We further analyzed how predictions of MIFM are influenced by the positive and the negative patterns by performing in silico mutagenesis of sequences with patterns matching the binding motif of the ARID3A protein, a TF with reported interactions with other regulatory elements [ 18 , 41 , 43 ]. We modified sequences containing positive patterns upstream of the SNP position by replacing the positive pattern with the negative one, and vice versa, and compared MIFM predictions for the original and modified sequences ( Figure 4 ). Adding negative patterns to sequences with a positive pattern shifted the distributions of scores towards lower values (from an average mean score of 0.54 to 0.48) and increased their spread (average standard deviation from 0.03 to 0.08). Download figure Open in new tab Figure 4: In silico mutagenesis of ARID3A-matching patterns and the corresponding MIFM predictions. We selected 3 positive patterns (rows) and 3 negative patterns (columns) matching the ARID3A motif which were identified by TF-MoDISco as influencing MIFM predictions. For each positive-negative pattern pair, we computed MIFM scores for sequences containing the positive pattern (odd subcolumns) and sequences with the negative pattern (even subcolumns) and plot the density functions of the scores in blue. We modified the sequences by adding the negative pattern to the positive sequences (odd subcolumns) and vice versa (even subcolumns), scored the modified sequences with MIFM, and plot the density functions of the modified scores in red. The vertical lines denote the means of MIFM scores of the original (blue) and modified sequences (red). Adding positive patterns to “negative” sequences slightly shifted their scores towards higher values (average mean from 0.17 to 0.20, average standard deviation from 0.08 to 0.09). This suggests that the context of a variant is necessary, but not sufficient, for it to be predicted as causal by MIFM — one can “disable” the function of a variant by modifying its context, but it is not enough to modify the context to make a variant being predicted as causal. 4 Discussion Identifying causal non-coding variants is typically done with fine-mapping methods which rely on summary statistics and population LD structure, without directly using the underlying DNA sequences. Alternatively, one can employ sequence models of gene expression, which, however, are trained on reference genome data and do not observe individual-level DNA variation. To this end, we proposed a problem formulation of fine-mapping using the multiple instance learning objective, where we predict the presence of GWAS associations within LD-blocks containing multiple variants. By using the underlying DNA sequences as input features we can exploit similarities in DNA patterns between causal variants, while by constructing the labels using GWAS summary statistics we indirectly incorporate the individual-level genetic variation which drives SNP associations. Using this approach, we trained a DNN model which predicts the probability of a SNP being causal given its neighboring DNA sequence, allowing us to prioritize variants of interest. One of the motivations for identifying causal variants is to robustly predict genetic liability of a phenotype across different populations, especially those which are under-represented in GWAS. By evaluating MIFM prioritized variants across a range of traits and ancestries, we were able to increase the robustness of polygenic scores predictions compared to a wide range of baselines. Furthermore, we showed that utilizing sequence information can be useful for disentangling highly-correlated GWAS variants, a task otherwise statistically infeasible with typical sample sizes. We note that our goal was to propose a framework for training variant-prioritization models, rather than developing a new DNN architecture. We employed a relatively lightweight model for our experiments (less than 25, 000 parameters), and while we did not observe improvements with more complex architectures, we did not conduct an exhaustive comparison of possible DNN models. Besides improving the predictive performance, a valuable extension would be to increase the interpretability of the model with interpretable-by-design architectures [ 37 , 49 ]. We further note that as MIFM utilizes a database of GWAS results, it can be continuously fine-tuned whenever new summary statistics are available, each time further narrowing down the MIL objective. We showed how one can utilize the vast amount of GWAS results available to train machine learning models for variant prioritization, overcoming the problem of inaccurate labels due to confounding from LD. Such models can complement traditional fine-mapping methods, being able to reduce the number of putative variants to be analyzed, even without the access to the corresponding test statistics. Finally, by introducing base-pair level variations in the training data, this paradigm can be used to increase the robustness of existing DNA sequence models. 5 Methods 5.1 Fine-mapping as a multiple instance learning problem We begin by introducing the MIL paradigm for a binary classification task. We assume that the data consists of pairs of { x i,j , y i,j }, where x i,j ∈ ℝ d are input features and y i,j ∈ {0, 1} are binary labels, and the input instances are grouped into bags of examples X i = { x i ,1 , … , x i,m }, where m can differ across X i . We use i to index along the bag-level (e.g., X i , y i ) and j to index individual instances within a bag (e.g., x i,j , y i,j ). At training time, we only have access to bag-level labels, which indicate whether at least one instance in a bag is positive, and are defined as: This can be interpreted as a form of weak labeling, since we do not know which particular instance(s) in a positive bag are positive, as opposed to strong, instance-level labels. Our goal is then to learn an instance-level classifier f : ℝ d → [0, 1], given the bag-level data { X i , y i }. A common approach to train f is to define a bag-level classifier F : and train it to minimize the cross-entropy loss ℒ: wrt. the bag-level examples { X i , y i }. As GWAS estimate the marginal effect size of each SNP, the significantly associated variants typically comprise groups of SNPs in LD with each other, out of which only a small fraction is truly causal, and most SNPs are spuriously associated with the trait of interest through their correlations with causal variants. On the other hand, we assume that the causal signals are driven by DNA patterns around the variants, e.g., by TF binding motifs, and could be identified given enough data. Databases such as GWAS Catalog [ 10 ] or CAUSALdb [ 52 ] aggregate the results of thousands of GWAS, providing a large set of putatively causal variants of the human genome across a range of traits and populations. In order to use these data to infer about causal variants, we propose the following MIL scenario: let each x i,j represent the DNA sequence centered at a SNP and y i,j be unobserved ground-truth labels indicating causal variants. We construct positive bags as independent LD-blocks of putatively causal variants, e.g., surpassing a given significance threshold in any study and grouped using a clumping procedure, while the negative ones are single-element bags of variants without any significant associations. If at least one causal variant is present in each positive bag, this is a valid MIL objective. Assuming that the causal variants share DNA patterns between each other, we can successfully train a sequence model f , e.g., a neural network, to predict causal variants. Formally, given s GWAS over v SNPs, let S i ∈ ℝ v , i ∈ [ s ] be the summary statistics of the i -th GWAS and S i,j be the p-value for the k -th variant in that study. Furthermore, let b be the total of independent LD-blocks, and L 1 , … , L b be pair-wise disjoint sets of integers indicating which variants belong to which LD block. Given a significance threshold T , the positive bags are then defined as: while the negative bags are defined as: 5.2 Dataset and model training We constructed the training dataset using data from the CAUSALdb2 database [ 53 ], which aggregates 13, 709 GWAS summary statistics, resulting in over 2, 618, 834 putative causal variants from the GRCh37 human genome, which are grouped into 2, 772 independent LD-blocks. We further divided the blocks into primary and secondary signals, using labels provided by CAUSALdb2, yielding 4, 790 smaller blocks, and excluded blocks with fewer than 10 variants, to reduce the chance of a small number of false positive SNPs driving the training signal. We treated the resulting blocks as separate bag-level positive examples . Since CAUSALdb2 only contains associated variants, we created the negative bags from common (MAF ≥ 0.01) GRCh37 variants which were not present in CAUSALdb2, excluding SNPs further than 128 base pairs away from any CAUSALdb2 variant, yielding a total of 1, 045, 506 training examples, each forming a separate single-element negative bag. For model training, we employed a modified, smaller version of the Basenji2 CNN architecture [ 25 ], with 4 blocks in the first model stage, 2 blocks in the second stage, a final number of 64 filters, and a single output. We one-hot encoded the 512 base pair DNA sequences around each variant to serve as the inputs x i . As per Equation 2 , for each block X i = { x i ,1 , … , x i,m } the CNN makes a separate prediction y i,j = f ( x i,j ) for each individual input x i,j , and the maximum value across x i,j ∈ X i is taken as the bag-level prediction. We trained the model for 100 epochs using the Adam optimizer [ 27 ] with a learning rate of 10 −4 and exponential decay of γ = 0.9 per epoch, with a batch size of 32 bags. For regularization, we used a dropout on the residual layer connections of the model with a rate of 0.5, and randomly shifted the input sequences by up to 8 base pairs. We used a modified DNA one-hot encoding by introducing a 6-th special token “V”, which we replaced the nucletotide value in the middle of the sentence, i.e., at the SNP position ( A = [1, 0, 0, 0, 0], C = [0, 1, 0, 0, 0], G = [0, 0, 1, 0, 0], T = [0, 0, 0, 1, 0], N = [0, 0, 0, 0, 0], V = [0, 0, 0, 0, 1]). The use of this additional token allowed us to employ random shift augmentations by signaling the position of the variant of interest to the network, and thus potentially increasing its precision to the base pair level. Using the standard 5-token encoding (e.g., as in [ 2 ]) with random shifts would instead force the model to make the same predictions for any neighboring SNPs within the shift range of the variant of interest. To further increase robustness, we employed model ensembling [ 17 ] by repeating the training with 5 different random seeds, and training a student model [ 21 ] to predict the averaged output of the 5 models. We used the same network architecture for the student model with the exception of using the standard 5 token encoding and not using data augmentations. We implemented the models using the PyTorch [ 38 ] and PyTorch Lightning [ 14 ] software libraries, and trained them using a single NVIDIA A40 48GB GPU and 8 CPU cores per-model, with an average training time of 34 hours. We did not observe a benefit of using larger versions of the model, other network architectures (Enformer [ 2 ], BPNet [ 3 ], a pretrained MFD [ 40 ]), or differentiable alternatives to the max operator in the formulation of F in Equation 2 (log-sum-exponential [ 7 ], generalized mean [ 4 ], attention-based [ 24 ]). Finally, we note that since only “weak” labels are used for model fitting, the network can be further retrained, or fine-tuned, “at test time”, i.e., whenever one obtains results from a new GWAS, by updating the positive bags with the set of new putative variants. 5.3 Construction and evaluation of polygenic risk scores We selected GWAS of 10 continuous and 10 binary traits from the CAUSALdb database [ 53 ] which were performed in populations of European ancestry and had matching traits in UK Biobank (UKB). For each study, we divided the corresponding variants into LD-blocks according to the CAUSALdb2 labels, treating the primary and secondary signals within a single block as separate blocks. We then constructed PGS by selecting the variant with the highest fine-mapping annotation score from each block. This resulted in 10 PGS per GWAS, using annotation scores from: MIFM, “raw” p-values, a pretrained Enformer model [ 2 ], and each of the 7 fine-mapping tools included in the CAUSALdb2 annotations: ABF [ 50 ], CAVIARBF [ 11 ], FINEMAP [ 6 ], PAINTOR [ 26 ], SuSiE [ 51 ], PolyFun FINEMAP [ 55 , 6 ], and PolyFun SuSiE [ 55 , 51 ]. For the raw p-values, we calculated the annotation scores as 1 minus the p-value of a variant. For Enformer, we calculated the annotation scores as the maximum differences in predictions for the alternative versus reference allele over all model outputs. We evaluated the scores on the African (AFR), Admixed American (AMR), Central/South Asian (CSA), East Asian (EAS) and Middle Eastern (MID) ancestry subsets of UKB, which we defined using the ancestry analysis functionality of pgs-calc [ 30 ], with a 1,000 Genomes LD reference panel [ 1 ]. This resulted in a total of 100 scenarios (20 traits × 5 ancestries). Within each scenario, we divided the samples into 5 folds and fitted 5 linear models of the PGS and covariates (age, sex, UKB assessment center, genotyping batch, and the first 10 genetic principal components). Each time we selected a different set of 4 folds for model training and the remaining fold for evaluation, and averaged the final outcome. For each of the 100 scenarios, we assessed the significance of the difference between the performances in terms of the R 2 score (we used the McFadden pseudo- R 2 [ 35 ] for binary traits) of MIFM and each baseline using a permutation test with 10 8 permutations. 5.4 GWAS and conditional analyses We performed GWAS of 4 traits — height, red blood cell count, systolic blood pressure, and heel bone mineral density — on a sample of N = 40, 000 unrelated individuals from UKB using the standard linear regression functionality of the BOLT-LMM software [ 31 ]. We filtered the SNPs with the following criteria: minor allele frequency (MAF)≥ 0.01, Hardy-Weinberg Equilibrium with a significance level of 10 −12 , and included imputed variants with an INFO score ≥ 0.01, which resulted in 9, 637, 426 SNPs in total. We transformed the phenotypes using the rank-based inverse normal transformation [ 5 ] and adjusted them for confounders using age, sex, the identifiers of the genotyping array and UKB assessment center, and the first 10 genetic principal components. For each trait, we constructed a set of independent loci using the clumping functionality of the PLINK software [ 39 ] with a significance threshold of 5 · 10 −8 for the lead SNPs (variants with the lowest p-value per locus) and a threshold of 5 · 10 −6 for secondary variants associated with the lead SNPs. Since we focus our analysis on disentangling the effect sizes of highly correlated variants, we used an R 2 threshold of 0.9 and a physical distance threshold of 1, 000 kb. For each lead SNP and its secondary variants, we fitted three joint models of SNPs and confounders: a baseline model with all variants in the clump, and two models, which only included the lead SNP and variants filtered based on either their p-values of MIFM scores. For the filtered models we retained secondary variants with p-values below the 30-th percentile of p-values of all GWAS-associated SNPs, or above the 70-th percentile of scores for the MIFM model. For each locus we additionally fitted a full joint model of all variants on a second, independent sample, and tested for differences in effect size estimates of all significant variants from the previous step between the filtered, and full models. This was to exclude cases where a non-causal variant becomes significant as a “substitute”, due to the true causal variant beyond removed 5.5 Functional annotation of the training data We utilized GenoSTAN [ 57 ] and silencerDB [ 58 ] data to annotate CAUSALdb2 variants in terms of their regulatory functions. For GenoSTAN data, we mapped all the variants from CAUSALdb2 into hg38 coordinates using the UCSC genome browser LiftOver tool [ 20 ] and annotated them with chromatin state annotations of 127 cell lines for the hg38 genome which we downloaded from https://www.cmm.in.tum.de/public/paper/GenoSTAN/ . For each variant, we assigned to it the chromatin states which were repeated in at least 5 different cell lines. A single variant could thus have multiple assigned states due to heterogeneity of the cell line experiments. For example, it could be marked as an enhancer-like element in one experiment and marked as a repressed region in a different cell type. For silencerDB, we downloaded annotations for the GRCh37 genome from http://health.tsinghua.edu.cn/SilencerDB/download/Species/Homo_sapiens.bed , and marked all CAUSALdb2 variant within each silencerDB region as silencers. We performed enrichment analyses for each annotation class by computing the odds ratios of SNPs with the given label compared to a subset of variants passing a given MIFM score threshold, and computed the p-values for enrichment using the Fisher exact test [ 15 ]. 5.6 Motif discovery We used TF-MoDISco [ 3 , 45 ] to identify motifs contributing to MIFM predictions. We computed the attribution scores for all training sequences using DeepLIFT [ 44 ] and ran TF-MoDISco with 200, 000 positive and 200, 000 negative seqlets (motif occurences). Finally, we matched the resulting patterns to known human TF binding models from HOCOMOCO 11 [ 28 ] using the TOMTOM [ 48 ]. For the in silico mutagenesis analysis of the contributions of positive and negative patterns, we selected sequences containing the 3 top positive and 3 top negative TF-MoDISco patterns matching the ARID3A binding motif. For each positive-negative pattern pair, we did the following: We computed the offset with respect to the ARID3A motif for the positive and negative pattern. As we had access to the starting positions of the TF-MoDISco patterns for each example, we also computed the start position of ARID3A motif in them. We obtained MIFM predictions for all positive and negative examples. We selected the positions in the negative pattern where the probability of the top nucleotide exceeded 0.4. For each positive example, we replaced the nucleotides at the positions matching the nucleotides selected from the negative pattern. We computed MIFM predictions for the modified positive examples. We repeated steps 3–5 by modifying the negative examples with the positive pattern. Data Availability All data produced in the present study are available upon reasonable request to the authors https://github.com/HealthML/multiple-instance-fine-mapping References [1]. ↵ Adam Auton , Gonçalo R Abecasis , David M Altshuler , Richard M Durbin , David R Bentley , Aravinda Chakravarti , Andrew G Clark , Peter Donnelly , Evan E Eichler , Paul Flicek , et al. A global reference for human genetic variation . Nature , 526 ( 7571 ): 68 , 2015 . OpenUrl CrossRef PubMed [2]. ↵ čiga Avsec , Vikram Agarwal , Daniel Visentin , Joseph R Ledsam , Agnieszka Grabska-Barwinska , Kyle R Taylor , Yannis Assael , John Jumper , Pushmeet Kohli , and David R Kelley . Effective gene expression prediction from sequence by integrating long-range interactions . Nature Methods , 18 ( 10 ): 1196 – 1203 , 2021 . OpenUrl CrossRef PubMed [3]. ↵ čiga Avsec , Melanie Weilert , Avanti Shrikumar , Sabrina Krueger , Amr Alexandari , Khyati Dalal , Robin Fropf , Charles McAnany , Julien Gagneur , Anshul Kundaje , et al. Base-resolution models of transcriptionfactor binding reveal soft motif syntax . Nature Genetics , 53 ( 3 ): 354 – 366 , 2021 . OpenUrl CrossRef PubMed [4]. ↵ Boris Babenko . Multiple instance learning: algorithms and applications . View Article PubMed/NCBI Google Scholar , 19 , 2008 . [5]. ↵ T Mark Beasley , Stephen Erickson , and David B Allison . Rank-based inverse normal transformations are increasingly used, but are they merited? Behavior Genetics , 39 : 580 – 595 , 2009 . OpenUrl CrossRef PubMed Web of Science [6]. ↵ Christian Benner , Chris CA Spencer , Aki S Havulinna , Veikko Salomaa , Samuli Ripatti , and Matti Pirinen . FINEMAP: efficient variable selection using summary data from genome-wide association studies . Bioinformatics , 32 ( 10 ): 1493 – 1501 , 2016 . OpenUrl CrossRef PubMed [7]. ↵ Stephen P Boyd and Lieven Vandenberghe . Convex optimization . Cambridge University Press , 2004 . [8]. ↵ RV Broekema , OB Bakker , and IH Jonkers . A practical view of fine-mapping and gene prioritization in the post-genome-wide association era . Open biology , 10 ( 1 ): 190221 , 2020 . OpenUrl CrossRef PubMed [9]. ↵ Yichao Cai , Ying Zhang , Yan Ping Loh , Jia Qi Tng , Mei Chee Lim , Zhendong Cao , Anandhkumar Raju , Erez Lieberman Aiden , Shang Li , Lakshmanan Manikandan , et al. H3K27me3-rich genomic regions can function as silencers to repress gene expression via chromatin interactions . Nature Communications , 12 ( 1 ): 719 , 2021 . OpenUrl PubMed [10]. ↵ Maria Cerezo , Elliot Sollis , Yue Ji , Elizabeth Lewis , Ala Abid , Karatuğ Ozan Bircan , Peggy Hall , James Hayhurst , Sajo John , Abayomi Mosaku , et al. The NHGRI-EBI GWAS Catalog: standards for reusability, sustainability and diversity . Nucleic Acids Research , 53 ( D1 ): D998 – D1005 , 2025 . OpenUrl CrossRef PubMed [11]. ↵ Wenan Chen , Beth R Larrabee , Inna G Ovsyannikova , Richard B Kennedy , Iana H Haralambieva , Gregory A Poland , and Daniel J Schaid . Fine mapping causal variants with an approximate Bayesian method using marginal test statistics . Genetics , 200 ( 3 ): 719 – 736 , 2015 . OpenUrl Abstract / FREE Full Text [12]. ↵ Monica Della Rosa and Mikhail Spivakov . Silencers in the spotlight . Nature Genetics , 52 ( 3 ): 244 – 245 , 2020 . OpenUrl CrossRef PubMed [13]. ↵ Laramie Duncan , Hanyang Shen , Bizu Gelaye , J Meijsen , K Ressler , M Feldman , R Peterson , and Ben Domingue . Analysis of polygenic risk score usage and performance in diverse human populations . Nature Communications , 10 ( 1 ): 3328 , 2019 . OpenUrl PubMed [14]. ↵ William Falcon and The PyTorch Lightning team . PyTorch Lightning , March 2019 . [15]. ↵ Ronald A Fisher . On the interpretation of χ2 from contingency tables, and the calculation of P . Journal of the Royal Statistical Society , 85 ( 1 ): 87 – 94 , 1922 . OpenUrl CrossRef Web of Science [16]. Hugo Fitipaldi and Paul W Franks . Ethnic, gender and other sociodemographic biases in genome-wide association studies for the most burdensome non-communicable diseases: 2005–2022 . Human Molecular Genetics , 32 ( 3 ): 520 – 532 , 2023 . OpenUrl CrossRef PubMed [17]. ↵ Mudasir A Ganaie , Minghui Hu , Ashwani Kumar Malik , Muhammad Tanveer , and Ponnuthurai N Suganthan . Ensemble deep learning: A review . Engineering Applications of Artificial Intelligence , 115 : 105151 , 2022 . OpenUrl [18]. ↵ Joshua Garton , Malini Shankar , Brittany Chapman , Kira Rose , Patrick M Gaffney , and Carol F Webb . Deficiencies in the DNA binding protein ARID3a alter chromatin structures important for early human erythropoiesis . ImmunoHorizons , 5 ( 10 ): 802 – 817 , 2021 . OpenUrl Abstract / FREE Full Text [19]. ↵ Stephen S Gisselbrecht , Alexandre Palagi , Jesse V Kurland , Julia M Rogers , Hakan Ozadam , Ye Zhan , Job Dekker , and Martha L Bulyk . Transcriptional silencers in Drosophila serve a dual role as transcriptional enhancers in alternate cellular contexts . Molecular Cell , 77 ( 2 ): 324 – 337 , 2020 . OpenUrl CrossRef PubMed [20]. ↵ Angela S Hinrichs , Donna Karolchik , Robert Baertsch , Galt P Barber , Gill Bejerano , Hiram Clawson , Mark Diekhans , Terrence S Furey , Rachel A Harte , Fan Hsu , et al. The UCSC genome browser database: update 2006 . Nucleic Acids Research , 34 ( suppl_1 ): D590 – D598 , 2006 . OpenUrl CrossRef PubMed Web of Science [21]. ↵ Geoffrey Hinton , Oriol Vinyals , and Jeff Dean . Distilling the knowledge in a neural network . arXiv preprint arxiv: 1503.02531 , 2015 . [22]. ↵ Farhad Hormozdiari , Emrah Kostem , Eun Yong Kang , Bogdan Pasaniuc , and Eleazar Eskin . Identifying causal variants at loci with multiple signals of association . In Proceedings of the 5th ACM Conference on Bioinformatics, Computational Biology, and Health Informatics , pages 610 – 611 , 2014 . [23]. ↵ Kangcheng Hou , Yi Ding , Ziqi Xu , Yue Wu , Arjun Bhattacharya , Rachel Mester , Gillian M Belbin , Steve Buyske , David V Conti , Burcu F Darst , et al. Causal effects on complex traits are similar for common variants across segments of different continental ancestries within admixed individuals . Nature Genetics , 55 ( 4 ): 549 – 558 , 2023 . OpenUrl CrossRef PubMed [24]. ↵ Maximilian Ilse , Jakub Tomczak , and Max Welling . Attention-based deep multiple instance learning . In International Conference on Machine Learning , pages 2127 – 2136 . PMLR , 2018 . [25]. ↵ David R Kelley . Cross-species regulatory sequence activity prediction . PLoS Computational Biology , 16 ( 7 ): e1008050 , 2020 . OpenUrl [26]. ↵ Gleb Kichaev , Wen-Yun Yang , Sara Lindstrom , Farhad Hormozdiari , Eleazar Eskin , Alkes L Price , Peter Kraft , and Bogdan Pasaniuc . Integrating functional data to prioritize causal variants in statistical fine-mapping studies . PLoS Genetics , 10 ( 10 ): e1004722 , 2014 . OpenUrl [27]. ↵ Diederik P Kingma . Adam: A method for stochastic optimization . arXiv preprint 1412.6980 , 2014 . [28]. ↵ Ivan V Kulakovskiy , Yulia A Medvedeva , Ulf Schaefer , Artem S Kasianov , Ilya E Vorontsov , Vladimir B Bajic , and Vsevolod J Makeev . HOCOMOCO: a comprehensive collection of human transcription factor binding sites models . Nucleic Acids Research , 41 ( D1 ): D195 – D202 , 2013 . OpenUrl CrossRef PubMed Web of Science [29]. ↵ Natsuhiko Kumasaka , Andrew J Knights , and Daniel J Gaffney . High-resolution genetic mapping of putative causal interactions between regions of open chromatin . Nature Genetics , 51 ( 1 ): 128 – 137 , 2019 . OpenUrl CrossRef PubMed [30]. ↵ Samuel A Lambert , Benjamin Wingfield , Joel T Gibson , Laurent Gil , Santhi Ramachandran , Florent Yvon , Shirin Saverimuttu , Emily Tinsley , Elizabeth Lewis , Scott C Ritchie , et al. Enhancing the Polygenic Score Catalog with tools for score calculation and ancestry normalization . Nature Genetics , pages 1 – 6 , 2024 . [31]. ↵ Po-Ru Loh , George Tucker , Brendan K Bulik-Sullivan , Bjarni J Vilhjalmsson , Hilary K Finucane , Rany M Salem , Daniel I Chasman , Paul M Ridker , Benjamin M Neale , Bonnie Berger , et al. Efficient Bayesian mixed-model analysis increases association power in large cohorts . Nature Genetics , 47 ( 3 ): 284 – 290 , 2015 . OpenUrl CrossRef PubMed [32]. ↵ Nina Mars , Sini Kerminen , Yen-Chen A Feng , Masahiro Kanai , Kristi Läll , Laurent F Thomas , Anne Heidi Skogholt , Pietro della Briotta Parolo , Benjamin M Neale , Jordan W Smoller , et al. Genome-wide risk prediction of common diseases across ancestries in one million people . Cell Genomics , 2 ( 4 ), 2022 . [33]. ↵ Alicia R Martin , Christopher R Gignoux , Raymond K Walters , Genevieve L Wojcik , Benjamin M Neale , Simon Gravel , Mark J Daly , Carlos D Bustamante , and Eimear E Kenny . Human demographic history impacts genetic risk prediction across diverse populations . The American Journal of Human Genetics , 100 ( 4 ): 635 – 649 , 2017 . OpenUrl CrossRef PubMed [34]. ↵ Alicia R Martin , Masahiro Kanai , Yoichiro Kamatani , Yukinori Okada , Benjamin M Neale , and Mark J Daly . Clinical use of current polygenic risk scores may exacerbate health disparities . Nature Genetics , 51 ( 4 ): 584 – 591 , 2019 . OpenUrl CrossRef PubMed [35]. ↵ Daniel McFadden . Regression-based specification tests for the multinomial logit model . Journal of Econometrics , 34 ( 1-2 ): 63 – 82 , 1987 . OpenUrl [36]. ↵ Chew Yee Ngan , Chee Hong Wong , Harianto Tjong , Wenbo Wang , Rachel L Goldfeder , Cindy Choi , Hao He , Liang Gong , Junyan Lin , Barbara Urban , et al. Chromatin interaction analyses elucidate the roles of PRC2-bound silencers in mouse development . Nature Genetics , 52 ( 3 ): 264 – 272 , 2020 . OpenUrl CrossRef PubMed [37]. ↵ Gherman Novakovsky , Oriol Fornes , Manu Saraswat , Sara Mostafavi , and Wyeth W Wasserman . ExplaiNN: interpretable and transparent neural networks for genomics . Genome Biology , 24 ( 1 ): 154 , 2023 . OpenUrl CrossRef PubMed [38]. ↵ Adam Paszke , Sam Gross , Francisco Massa , Adam Lerer , James Bradbury , Gregory Chanan , Trevor Killeen , Zeming Lin , Natalia Gimelshein , Luca Antiga , et al. Pytorch: An imperative style, high-performance deep learning library . Advances in Neural Information Processing Systems, 32 , 2019 . [39]. ↵ Shaun Purcell , Benjamin Neale , Kathe Todd-Brown , Lori Thomas , Manuel AR Ferreira , David Bender , Julian Maller , Pamela Sklar , Paul IW De Bakker , Mark J Daly , et al. PLINK: a tool set for whole-genome association and population-based linkage analyses . The American Journal of Human Genetics , 81 ( 3 ): 559 – 575 , 2007 . OpenUrl CrossRef PubMed [40]. ↵ Alexander Rakowski , Remo Monti , Viktoriia Huryn , Marta Lemanczyk , Uwe Ohler , and Christoph Lippert . Metadata-guided feature disentanglement for functional genomics . Bioinformatics , 40 ( Supplement 2 ): ii4 – ii10 , 2024 . OpenUrl PubMed [41]. ↵ Khandakar ASM Saadat , Widya Lestari , Endrawan Pratama , Teng Ma , Sachiko Iseki , Megumi Tatsumi , and Masa-Aki Ikeda . Distinct and overlapping roles of ARID3A and ARID3B in regulating E2F-dependent transcription via direct binding to E2F target genes . International Journal of Oncology , 58 ( 4 ): 1 – 12 , 2021 . OpenUrl CrossRef PubMed [42]. ↵ Marie Saitou , Andy Dahl , Qingbo Wang , and Xuanyao Liu . Allele frequency differences of causal variants have a major impact on low cross-ancestry portability of PRS . medRxiv , pages 2022 – 10 , 2022 . [43]. ↵ Mengting Shen , Shengli Li , Yiming Zhao , Yizhe Liu , Zhen Liu , Lin Huan , Yejun Qiao , Lu Wang , Leng Han , Zhiao Chen , et al. Hepatic ARID3A facilitates liver cancer malignancy by cooperating with CEP131 to regulate an embryonic stem cell-like gene signature . Cell Death & Disease , 13 ( 8 ): 732 , 2022 . OpenUrl PubMed [44]. ↵ Avanti Shrikumar , Peyton Greenside , and Anshul Kundaje . Learning important features through propagating activation differences . In International conference on machine learning , pages 3145 – 3153 . PMlR , 2017 . [45]. ↵ Avanti Shrikumar , Katherine Tian , čiga Avsec , Anna Shcherbina , Abhimanyu Banerjee , Mahfuza Sharmin , Surag Nair , and Anshul Kundaje . Technical note on transcription factor motif discovery from importance scores (TF-MoDISco) version 0.5. 6.5 . arXiv preprint arxiv: 1811.00416 , 2018 . [46]. ↵ Nasa Sinnott-Armstrong , Sahin Naqvi , Manuel Rivas , and Jonathan K Pritchard . GWAS of three molecular traits highlights core genes and pathways alongside a highly polygenic background . Elife , 10 : e58615 , 2021 . OpenUrl CrossRef PubMed [47]. Giorgio Sirugo , Scott M Williams , and Sarah A Tishkoff . The missing diversity in human genetic studies . Cell , 177 ( 1 ): 26 – 31 , 2019 . OpenUrl CrossRef PubMed [48]. ↵ Emi Tanaka , Timothy Bailey , Charles E Grant , William Stafford Noble , and Uri Keich . Improved similarity scores for comparing motifs . Bioinformatics , 27 ( 12 ): 1603 – 1609 , 2011 . OpenUrl CrossRef PubMed Web of Science [49]. ↵ Alex M Tseng , Gokcen Eraslan , Tommaso Biancalani , and Gabriele Scalia . A mechanistically interpretable neural network for regulatory genomics . arXiv preprint arxiv: 2410.06211 , 2024 . [50]. ↵ Jon Wakefield . A bayesian measure of the probability of false discovery in genetic epidemiology studies . The American Journal of Human Genetics , 81 ( 2 ): 208 – 227 , 2007 . OpenUrl CrossRef PubMed Web of Science [51]. ↵ Gao Wang , Abhishek Sarkar , Peter Carbonetto , and Matthew Stephens . A simple new approach to variable selection in regression, with application to genetic fine mapping . Journal of the Royal Statistical Society Series B: Statistical Methodology , 82 ( 5 ): 1273 – 1300 , 2020 . OpenUrl CrossRef PubMed [52]. ↵ Jianhua Wang , Dandan Huang , Yao Zhou , Hongcheng Yao , Huanhuan Liu , Sinan Zhai , Chengwei Wu , Zhanye Zheng , Ke Zhao , Zhao Wang , et al. CAUSALdb: a database for disease/trait causal variants identified using summary statistics of genome-wide association studies . Nucleic Acids Research , 48 ( D1 ): D807 – D816 , 2020 . OpenUrl CrossRef PubMed [53]. ↵ Jianhua Wang , Liao Ouyang , Tianyi You , Nianling Yang , Xinran Xu , Wenwen Zhang , Hongxi Yang , Xianfu Yi , Dandan Huang , Wenhao Zhou , et al. Causaldb2: an updated database for causal variants of complex traits . Nucleic Acids Research , 53 ( D1 ): D1295 – D1301 , 2025 . OpenUrl CrossRef PubMed [54]. ↵ Ying Wang , Jing Guo , Guiyan Ni , Jian Yang , Peter M Visscher , and Loic Yengo . Theoretical and empirical quantification of the accuracy of polygenic scores in ancestry divergent populations . Nature Communications , 11 ( 1 ): 3865 , 2020 . OpenUrl CrossRef PubMed [55]. ↵ Omer Weissbrod , Farhad Hormozdiari , Christian Benner , Ran Cui , Jacob Ulirsch , Steven Gazal , Armin P Schoech , Bryce Van De Geijn , Yakir Reshef , Carla Màrquez-Luna, et al. Functionally informed fine-mapping and polygenic localization of complex trait heritability . Nature genetics , 52 ( 12 ): 1355 – 1363 , 2020 . OpenUrl CrossRef PubMed [56]. ↵ Jian Yang , Teresa Ferreira , Andrew P Morris , Sarah E Medland , Genetic Investigation of ANthropometric Traits (GIANT) Consortium, DIAbetes Genetics Replication , Meta analysis (DIAGRAM) Consortium, Pamela AF Madden , Andrew C Heath , Nicholas G Martin , Grant W Montgomery , et al. Conditional and joint multiple-SNP analysis of GWAS summary statistics identifies additional variants influencing complex traits . Nature Genetics , 44 ( 4 ): 369 – 375 , 2012 . OpenUrl CrossRef PubMed [57]. ↵ Benedikt Zacher , Margaux Michel , Bjorn Schwalb , Patrick Cramer , Achim Tresch , and Julien Gagneur . Accurate promoter and enhancer identification in 127 encode and roadmap epigenomics cell types and tissues by genostan . PloS one , 12 ( 1 ): e0169249 , 2017 . OpenUrl CrossRef PubMed [58]. ↵ Wanwen Zeng , Shengquan Chen , Xuejian Cui , Xiaoyang Chen , Zijing Gao , and Rui Jiang . SilencerDB: a comprehensive database of silencers . Nucleic Acids Research , 49 ( D1 ): D221 – D228 , 2021 . OpenUrl CrossRef PubMed [59]. ↵ Jian Zhou and Olga G Troyanskaya . Predicting effects of noncoding variants with deep learning-based sequence model . Nature Methods , 12 ( 10 ): 931 – 934 , 2015 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted June 14, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Multiple instance fine-mapping: predicting causal regulatory variants with a deep sequence model Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Multiple instance fine-mapping: predicting causal regulatory variants with a deep sequence model Alexander Rakowski , Christoph Lippert medRxiv 2025.06.13.25329551; doi: https://doi.org/10.1101/2025.06.13.25329551 Share This Article: Copy Citation Tools Multiple instance fine-mapping: predicting causal regulatory variants with a deep sequence model Alexander Rakowski , Christoph Lippert medRxiv 2025.06.13.25329551; doi: https://doi.org/10.1101/2025.06.13.25329551 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4411) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15205) Forensic Medicine (30) Gastroenterology (1119) Genetic and Genomic Medicine (6575) Geriatric Medicine (666) Health Economics (994) Health Informatics (4511) Health Policy (1365) Health Systems and Quality Improvement (1608) Hematology (537) HIV/AIDS (1263) Infectious Diseases (except HIV/AIDS) (15903) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6573) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1139) Occupational and Environmental Health (954) Oncology (3319) Ophthalmology (968) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (662) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5423) Public and Global Health (9205) Radiology and Imaging (2191) Rehabilitation Medicine and Physical Therapy (1367) Respiratory Medicine (1191) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9febe0ef49474193',t:'MTc3OTI4NTc0MQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00