Accounting for uncertainty in residual variances improves calibration for fine-mapping with small sample sizes

preprint OA: closed CC-BY-4.0
📄 Open PDF Full text JSON View at publisher
Full text 44,527 characters · extracted from preprint-html · click to expand
Accounting for uncertainty in residual variances improves calibration for fine-mapping with small sample sizes | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Accounting for uncertainty in residual variances improves calibration for fine-mapping with small sample sizes William R.P. Denault , Peter Carbonetto , Ruixi Li , The Alzheimer’s Disease Functional Genomics Consortium , Gao Wang , Matthew Stephens doi: https://doi.org/10.1101/2025.05.16.654543 William R.P. Denault 1 Department of Human Genetics, University of Chicago , Chicago, IL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: wdenault{at}uchicago.edu mstephens{at}uchicago.edu Peter Carbonetto 1 Department of Human Genetics, University of Chicago , Chicago, IL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ruixi Li 2 Center for Statistical Genetics, The Gertrude H. Sergievsky Center, Columbia University , New York, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gao Wang 2 Center for Statistical Genetics, The Gertrude H. Sergievsky Center, Columbia University , New York, NY, USA 3 Department of Neurology, Columbia University , New York, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Matthew Stephens 1 Department of Human Genetics, University of Chicago , Chicago, IL, USA 4 Department of Statistics, University of Chicago , Chicago, IL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: wdenault{at}uchicago.edu mstephens{at}uchicago.edu Abstract Full Text Info/History Metrics Preview PDF Abstract The Sum of Single Effects (SuSiE) model is a widely adopted method for genetic fine-mapping. We show that, in small-sample studies, the original SuSiE fitting procedure produces substantially higher rates of false positive findings. We show that a simple modification to SuSiE improves performance in small-sample studies. This modification is particularly important for emerging molecular QTL applications in rare cell types and primary tissues where sample sizes are inherently limited. Main Wang et al [ 1 ] introduced the “Sum of Single Effects” (SuSiE) model as a way to capture uncertainty in variable selection in multiple linear regression. This work was motivated by settings like genetic fine-mapping which often involves highly correlated variables, and where it is difficult to confidently identify the relevant variables, i.e., those with a non-zero regression coefficient. They pointed out that, even in such settings, it may be possible, and useful, to narrow down the potentially relevant variables by identifying sets of (highly correlated) variables that, with high probability, contain at least one variable with a non-zero coefficient. They refer to these sets of variables as “Credible Sets” (CSs), and the ability of SuSiE to efficiently generate reliable CSs is a key feature of the method. Since its introduction, a number of studies have extended the method in various ways [ 2 – 7 ]. Here we highlight a limitation of the original SuSiE method: it can produce severely miscalibrated CSs when applied to data sets with small sample sizes ( n≤ 50). We show that this miscalibration occurs even in the simplest case where exactly one variable is assumed to have a non-zero coefficient. While many fine-mapping studies are large, this miscalibration issue is particularly important in smaller studies such as pilot experiments, complex experimental designs (e.g., dynamic eQTLs from [ 8 ], n = 19, and more recently [ 9 ], n = 53), costly multimodal-omics experiments (e.g., multimodal QTL mapping in influenza-infected cells [ 10 ], n = 35), and studies of rare cell types (e.g., eQTL mapping in microglia from the subventricular zone [ 11 ], n = 47). To address this limitation, we show that the miscalibration can be greatly improved by a simple modification of the procedures in [ 1 ] to account for uncertainty in the residual variance parameter. Specifically, whereas the original SuSiE method estimates the residual variance by maximum likelihood, here we instead integrate it out under a diffuse prior, following ideas from the classical Bayesian literature [ 12 – 15 ]. Breifly, the SuSiE model builds on the simpler “Single Effect Regression” (SER) model, which assumes that exactly one variable has a nonzero effect on the outcome. The original SER formulation from [ 1 ] does not account for uncertainty in the residual variance; here we augment it by placing a conjugate inverse-gamma prior on this parameter. This augmented SER is, in essence, the model considered in [ 13 ], so we refer to it as the “SS-SER” model (short for “Servin and Stephens”). This simple modification substantially improves CS calibration in simulations, and provides more realistic results when applying the SuSiE model to small-sample QTL association studies ( Fig. 1 ). Download figure Open in new tab Fig. 1. Panel A: Comparison of 95% CSs from SuSiE with the default SER (red) and SuSiE with the SS-SER (blue) in a simulation setting where the genotype accounts for 25% of the trait variance. The top row displays the observed empirical coverage of the two methods for N= 20,30, and 50. The black line represents the target 95% coverage. Coverage was calculated as where FP and TP denote the number of false positives and true positives, respectively. The second row gives the mean CS size. The third row gives the power. Note the y-axis is on a logarithmic scale. The fourth row presents the median CS purity. Panels B, C: Illustration of severe miscalibration of SuSiE with the default SER in fine-mapping ADAM17 expression using data from microglia in the subventricular zone from [ 11 ], N = 47. Panel B shows the estimated posterior inclusion probabilities (PIP) and credible sets (CS) for both SuSiE with the default SER and SS-SER. The x-axis represents the SNP index, while the y-axis gives the PIPs. Colors correspond to different CSs. SuSiE with the default SER identifies 9 CSs, whereas SuSiE with SS-SER reports 1 CS (this CS overlaps a CS from SuSiE with default SER). Panel C compares the predicted ADAM17 gene expression under both models against normalized gene expression. The default SER exhibits explains > 99% of the variance, suggesting overfitting, while the SS-SER model provides a more realistic fit (68% variance explained). Panel D: summary of genome-wide fine-mapping using RNA-seq data from microglia in the subventricular zone [ 11 ]. It compares of the number of 95% credible sets (CSs) reported by SuSiE using the default SER (x-axis) versus SuSiE with the SS-SER (y-axis). For improved visualization, the point at (0,0) is removed; in 12,232 genes, both methods give no CSs. We compared our new approach (SuSiE with the SS-SER ) with the original approach (SuSiE with the default SER ) on simulated data. For each simulation, we sampled genotypes by randomly selecting loci from the 1000 Genomes Project Phase 3 [ 16 ], including only unrelated individuals. Within these loci, we retained biallelic single nucleotide polymorphisms (SNPs) with minor allele frequencies ≥ 5%. From the remaining SNPs, we randomly selected 1–10 causal variants that collectively explained a specified proportion of the phenotypic variance ( h 2 = 25%, 30%, 50%, 75%). We evaluated different sample sizes ( N = 20, 30, 50, 75, 100). Both methods were run using an upper bound of L = 10 causal variants, and with the purity filtering step from [ 1 ], which removes CSs containing pairs of variables with correlation less than 0.5. See Methods for details. Our results ( Fig. 1A , Fig. 2 ) show that, for small-sample studies ( n ≤ 50), SuSiE with the default SER had consistently poor CS coverage, substantially below the nominal 95% level. By comparison, SuSiE with SS-SER substantially improved coverage (although not attaining 95%). For example, with N = 20 and h 2 = 0.25, the default SER CS coverage dropped as low as 44.1%, but increased to 82.1% using the SS-SER ( Fig. 1A ). This improved coverage occurs because SS-SER produces larger CSs ( Fig. 1A , Fig. 3 ) with lower purity ( Fig. 1A , Fig. 4 ). Consequently, more CSs are discarded by the purity filter, resulting in a corresponding drop in power ( Fig. 1A , Fig. 5 ). This loss of power and localization appear to be an inevitable consequence of improving calibration. Download figure Open in new tab Fig. 2. Comparison of coverage (y-axis) for SuSiE with the default SER (red) and SuSiE with the SS-SER (blue). The black line represents the target 95% coverage. Coverage was calculated as where FP and TP denote, respectively, the number of false positives and true positives. Download figure Open in new tab Fig. 3. Comparison of mean CS size (y-axis) for SuSiE with the default SER (red) and SuSiE with the SS-SER (blue). Download figure Open in new tab Fig. 4. Comparison of median CS purity (y-axis) for SuSiE with the default SER (red) and SuSiE with the SS-SER (blue). Download figure Open in new tab Fig. 5. Comparison of power (y-axis) for SuSiE using the default SER (red) and the SS-SER (blue). Note the y-axis is on a logarithmic scale. To gauge the impact of estimating the number of causal variants (as well as the purity filtering step), we conducted additional simpler simulations where a single variant affects the trait, assessing CS coverage when (i) allowing SuSiE to estimate the number of causal effects (setting L = 10 as above), and (ii) running SuSiE with the correct number of causal effects ( L = 1). We ran (i) and (ii) both with and without purity filtering. Note that fitting SuSiE with L = 1 does not involve any variational approximation, so (ii) removes any potential error from the approximation. Results are summarized in Figures 6 and 7 . Download figure Open in new tab Fig. 6. Comparison of coverage between SuSiE using the default SER (red) and SuSiE using the SS-SER (blue) under a single causal SNP scenario without purity filtering. The solid line denotes the target 95% coverage level, while the dashed line indicates 100% coverage. Simulations were conducted with the L = 1, a configuration under which SuSiE computes exact posteriors for both SERs. Download figure Open in new tab Fig. 7. Comparison of coverage between SuSiE using the default SER (red) and SuSiE using the SS-SER (blue) under a single causal SNP scenario with purity filtering. The solid line denotes the target 95% coverage level, while the dashed line indicates 100% coverage. Round markers represent simulations where SuSiE dynamically estimates the number of causal effects (with an upper bound of L = 10), whereas triangle markers correspond to simulations with the L parameter fixed to 1. In these simpler simulations, the miscalibration issue appeared only with the smallest sample sizes ( n = 20, 30) and smallest effect sizes ( h 2 = 0.25, 0.3), and only in the cases with purity filtering. Allowing SuSiE to estimate the number of causal effects had only a minor impact on the calibration in all cases. These results may seem to suggest avoiding the purity filter altogether; however, in practice this produces many “noninformative CSs” that contain numerous variables. These noninformative CSs technically improve calibration bby including almost every variable, and thus likely including a relevant variable by chance. However, outputting many uninformative CSs to improve calibration in this way is typically undesirable. Therefore, in small-sample studies we recommend maintaining the purity filter, using the SS-SER, and bearing in mind that the resulting CSs may still be somewhat less reliable than in large studies. To compare the methods on real data, we fine-mapped gene expression QTL (eQTL) from two RNA-seq studies. The first dataset from Aracena et al [ 10 ] examined the transcriptional response in primary macrophages (n = 35) before and after infection with influenza. The second dataset from Lopes et al [ 11 ]investigated eQTLs in microglia, a key cell type involved in brain aging. For each gene, we selected SNPs within ± 200 kb of the gene body with MAF > 0.05, and fine-mapped these SNPs using both approaches (default SuSiE and SuSiE with SS-SER). For the Aracena et al [ 10 ] study, across 13,680 fine-mapped genes, default SuSiE reported 1,020 unique CSs, whereas SuSiE with SS-SER reported 389 CSs. Among the genes where SuSiE with SS-SER identified a single CS (381 out of 389), this CS consistently overlapped with a CS reported by default SuSiE. For the Lopes et al [ 11 ] study, across 17,132 fine-mapped genes, default SuSiE identified 6,918 CSs, while SuSiE with SS-SER reported 1,556 CSs. Again, nearly all genes with a single CS from SuSiE with SS-SER (844 out of 858) had overlapping CSs from default SuSiE, which exhibited clear signs of miscalibration ( Fig. 1B–D ), with 146 genes having 4 or more CSs, including up to 12 CSs for GAL3ST2 . By contrast, SuSiE with SS-SER reported only one gene with 4 CSs. In conclusion, we have highlighted a critical limitation of the SuSiE method: it produces poorly calibrated CSs in studies with small sample sizes. We present a simple modification that uses the SS-SER to better account for uncertainty in the residual variance parameter, substantially improving CS calibration. Our method retains the computational advantages of SuSiE and can be applied to summary statistics [ 17 ], making it practical for a wide range of fine-mapping studies. While our approach improves credible set coverage, some miscalibration remains in certain settings, and further improving performance for small-sample studies remains an area for future work. Code availability SuSiE with SS-SER is implemented into the original susieR package available at https://github.com/stephenslab/susieR . The scripts to replicate these simulations are available at https://github.com/william-denault/susie_small_sample . Methods Single effect regression accounting for uncertainty in residual variance estimates The Sum of Single Effects (SuSiE) model is built on the simpler “Single Effect Regression” (SER) model, which assumes that exactly one of the variables in the regression has a non-zero effect on the outcome. The SER model from [ 1 ] is where y is the vector of observed outcomes, X is a matrix of explanatory variables, β is the vector containing the effects of these variables on the phenotype, and ϵ is Gaussian noise with variance σ 2 . Note that γ is a vector in which exactly one entry is non-zero. The prior probability for which entry is non-zero is encoded by the vector π (a vector of non-negative entries that sum to one). The hyperparameter determines the prior on the effect size relative to the residual variance. In [ 1 ], the residual variance σ 2 was estimated by maximizing the likelihood (or, in SuSiE, by maximizing the “evidence lower bound”). Given σ 2 and , it is straightforward to compute the posterior distribution on β analytically, essentially by summing over all the possible values for γ [ 1 , 13 ]. To account for uncertainty in small-sample studies, we augment the SER model by placing a (conjugate) inverse-gamma prior on the residual variance: This augmented SER is, in essence, the model considered in [ 13 ], and so we call it the “SS-SER” (“SS” being short for “Servin and Stephens”). Given the hyperparameters κ, λ and , it is straightforward to compute the posterior distribution on β analytically. Servin and Stephens [ 13 ] showed that this posterior distribution has a sensible limit as κ → 0, λ → 0, which we adopt for our implementation. Extension to the SuSiE model: SuSiE with the SS-SER model The SER model is limited because it allows for only a single non-zero effect. Therefore, Wang et al [ 1 ] addressed this by allowing for L≥ 1 effects through summing L SER models, yielding the “sum of single effects” (SuSiE) model. They developed a fitting procedure, “iterative Bayesian stepwise selection” (IBSS), which fits the SuSiE model by iteratively fitting a sequence of SER models. They showed that this IBSS algorithm, when used with their SER, corresponds to optimizing a variational approximation [ 18 ] to the posterior distribution. Here we use the same IBSS algorithm as in [ 1 ], but replace their SER (which fixes σ at a point estimate), with the SS-SER (which integrates out σ ); see supplementary materials for details. Unlike the original IBSS algorithm, this modified IBSS is not guaranteed to exactly optimize a variational approximation to the posterior distribution; however, our empirical results show that in small-sample studies it produces better CS calibration while maintaining the computational simplicity of the original IBSS algorithm. Univariate regression model For simplicity, we present all derivations assuming κ and λ are strictly positive, ensuring a well-defined prior. However, the quantities can be computed with κ = 0, λ = 0. Consider the following univariate regression model where σ 2 is treated as unknown: where y and x are in ∈ ℝ n , β is an unknown parameter, ϵ is Gaussian noise, and is a scaling factor on the effect size. IG( κ, λ ) denotes the inverse gamma distribution with shape κ and scale λ . The posterior distribution of β conditional on the other inputs and σ 2 is where The posterior distribution of σ 2 is where Thus, the posterior distribution of β is The Bayes factor (BF) for this model is given by Further details of these derivations are provided in the Supporting derivations. Bayes factor and posterior distribution for SS-SER model Under the SS-SER model (1–2), the posterior distribution of β j is a mixture of a point mass at zero and a t -distribution: where , and λ 1 j follow the same definitions as in the univariate case, substituting x with X j in equation (3). Here, y = X j β j + ϵ . The mixture weights are given by: with where BF j is the Bayes factor for the univariate association between column j of X and y (see eq. 9 ). Simulations In all simulations, we randomly sample a genomic region from the 1000 Genomes Phase 3 dataset [ 16 ] and select N unrelated individuals within this locus. We retain only biallelic SNPs with a minor allele frequency (MAF) ≥ 5% among the selected individuals. Multiple causal SNPs simulations For each simulation, we vary the number of individuals as N = 20, 30, 50, 75, and 100 and set the heritability, defined as the variance explained by the genotype, to h 2 = 25%, 30%, 50%, and 75%. For a given sample size N and heritability h 2 , we randomly select N individuals as described above. We then randomly determine the number of causal SNPs, L , where L is drawn uniformly from { 1, …, 10 } . The phenotype for individual i is generated as: y i = µ i + ϵ i , where . Where x il denotes the genotype at the l th causal SNP for individual i , and ϵ i is a Gaussian noise term. To ensure that the in-sample variance explained by the genotype matches the specified h 2 , we rescale the error term ϵ i to satisfy the following identity: where and are the empirical means of µ and y , respectively. In these simulations, we set the upper bound for the number of effects to L = 10 when running SuSiE, regardless of the selected single-effect regression (SER) variant, and allow the method to estimate L . Single causal SNP simulations Simulations for the single causal SNP scenario follow the same setup as in the multiple causal SNP scenario, except that we fix the number of causal SNPs at L = 1. For each combination of N and h 2 , we evaluate the performance of SuSiE and its variants using the following configurations: SuSiE with default SER, setting the upper bound on L to 1, with and without purity filtering. SuSiE with default SER, setting the upper bound on L to 10, with and without purity filtering. SuSiE with SS-SER, setting the upper bound on L to 1, with and without purity filtering. SuSiE with SS-SER, setting the upper bound on L to 10, with and without purity filtering. Supplementary Notes Here, we provide detailed derivations of SuSiE with SS-SER model with sample code implementation appended to the end of this document. Bayes factors Consider the simple regression model to account for uncertainty in σ 2 where IG( a, b ) denotes the inverse gamma distribution with shape a and scale b , which has probability density function and we note the IG normalizing factor. We note x := ( x , …, x ) T ∈ R n , y := ( y 1 , …, y n ) T ∈ R n and e := ( e 1 , …, e n ) T ∈ R n , b ∈ R is the regression coefficient, σ 2 > 0 is the residual variance, is the prior variance of the regression coefficient b , I n denotes the n × n identity matrix, N ( µ, σ 2 ) denotes the univariate normal distribution with mean µ ∈ R and variance σ 2 > 0, and N d ( µ , Σ ) denotes the multivariate normal distribution with mean µ ∈ R d , and d × d covariance matrix Σ . Here we assume that both x and y are centered to have mean zero, which removes the need for an intercept in the model [ 19 ]. In practice, we scale the prior variance of b by the sample variance of y to ensure that the prior is invariant to the scale of y , but we can consider this to be “baked in” to the parameter , which is assumed to be known and is considered a fixed quantity in most of these derivations. Given σ 2 and , the posterior computations for this model are simple: they can be conveniently written in terms of the usual least-squares estimate of b , its variance, and the z -score, which are Note that scaling the prior variance of b by σ 2 is important to obtain analytical expressions for marginal likelihoods and Bayes factors. A few side notes: Placing an inverse gamma prior on σ 2 is equivalent to a gamma prior on the “precision” σ 2 = 1 /σ 2 with shape a and inverse scale b . The mean of IG( a, b ) is (for a > 1), its variance is (for a > 2), and the mode is . The “noninformative” prior for σ 2 is recovered with α → 0, β → 0, and this was the prior used in [ 14 ]. For simplicity, we assume α and β are strictly positive so that the prior is well-defined. However, the derived quantities can be computed with α = 0, β = 0. From [ 13 ], the posterior distribution of b | σ 2 is where Writing the posterior mean and variance in this way is helpful for intuition: as n becomes large, the “prior” term approaches 1 (because 1 / ( x T x ) gets small), and therefore approach the least-squares estimate and its variance s 2 . Also note that when . The posterior distribution of σ 2 | b is where is the residual sum of squares. It is interesting to note that the posterior mode of σ 2 when α = 0, β = 0 is simply RSS /n , the mean of the residual squared errors. The posterior distribution of σ 2 (averaging over b ) is where Here, RSS can be viewed as a Bayesian analog of the RSS [ 22 ]. This expression agrees with [ 13 ] (specifically, see eq. 4 in Protocol S1 of [ 13 ]). It is interesting to note that when n is large and a = 0, b = 0, the posterior mode of σ 2 approaches , which is the sample variance of y scaled by a correction term for the correlation between x and y . Also, when α = 0, β = 0, , the posterior mean of σ 2 is . Therefore, the posterior distribution of b is From this result, one can easily obtain the posterior mean and variance of b . Note that when α = 0, β = 0 and n is large, the posterior variance of b approaches The Bayes factor for this model is The dependencies on α, β are implicit in the expressions above and were not included to simplify notation. Let’s denote the marginal likelihood (integrated over σ 2 ) as Then the Bayes factor for b in model 13 is This integral works out to using the α 1 and β 1 defined in (24) and (25). When , this simplifies to Therefore, the Bayes factor for this model is This expression agrees with the Bayes factor given in [ 13 ] (specifically, see eq. 12 in Protocol S1 of [ 13 ]). This also appears to agree with eq. 16 in [ 22 ].When β = 0, the Bayes factor simplifies to the somewhat intuitive expression Single effect regression For convenience, we restate the single effect regression (SER) model from [ 1 ]. The SER extends (13) to a multiple linear regression model in which exactly one of the p variables affects the outcome, y : where X ∈ R n×p , y ∈ R n , γ ∈ { 0, 1 } p , and Mulitnom( n , π ) denotes the multinomial distribution with multinomial probabilities π = ( π 1 , …, π p ) and n trials. We assume y and the columns of X are centered to have mean zero, which implicitly accounts for an intercept. We will use x j to denote a column of X . Since our ultimate aim here is to extend this model to account for uncertainty, here we reparameterize the SER model slightly so that the prior variance is scaled by the residual variance, . This has no effect on the derivations except for the part where we update σ 2 . From our derivations above, it follows that the posterior distribution of b j , σ 2 under the SER model has the following form [the dependence on α, β and π is implicit]: where , in which , are given in equations (19 , 20 , 25 ). Here, we are reusing the expressions we have derived previously, with one change made to the notation to make explicit the dependence of these expressions on x, y . As in the previous section, does not actually depend on y , but we write it this way for consistency. The mixture weights are where BF j := BF( x j , y ), and BF( x, y ) is given by the previous result (35), again with the notation changed slightly to make the dependence on x, y explicit. And α 1 j = α + n for all j = 1, …, p . Following the previous notation we refer to SER as the function that outputs the quantities to compute the posterior quantities under the SER model with unknown σ 2 . Updating The only parameter that may need to be updated is the (unscaled) prior variance, . The EM update for this parameter is , in which is given by the posterior second moment of b 2 . For the SER with unknown σ 2 , this works out to Posterior for σ 2 Following the fact that Assuming independence between on p post ( b j | σ 2 , γ j = 1) leads to the following quantities The “Sum of Single Effects” (SuSiE) model with unknown σ 2 We now extend the SER model with unknown σ 2 to the “Sum of Single Effects” (SuSiE) model with unknown σ 2 . Instead of a single variable affecting the outcome y , the SuSiE model allows at most L variables to affect y : Algorithm 1 General Iterative Bayesian Stepwise Selection (GIBBS) for Servin and Stephens prior Download figure Open in new tab In the seminal SuSiE work [ 1 ], the authors assumed that the residual variance σ 2 is either known or could be estimated via maximum marginal likelihood. They fit the SuSiE model using a variational approximation (VA) of the posterior, where the variational posterior assumes that the L single-effect coefficients are independent under the posterior: Wang and colleagues [ 1 ] further demonstrated that under this conditional independence assumption, each q l must be a valid posterior distribution for the corresponding SER model. When extending SuSiE to the case where σ 2 is unknown, the same conditional independence assumption leads to the following variational form: A key challenge in this variational approach is that q σ ( σ 2 ) turns out to be a mixture of p × L inverse Gamma distributions, making it computationally intractable for efficient evaluation. However, given a known form for q σ ( σ 2 ), obtaining q l ( b ( l ) | σ 2 ) is straightforward. Despite this, employing such a VA necessitates numerical shortcuts and additional approximations to maintain computational efficiency. To circumvent these computational difficulties, we adopt an alternative heuristic approach called Generalized Iterative Bayesian Stepwise Selection (GIBSS) for fitting model 45, as described below. GIBSS for SuSiE with unknown σ 2 A key feature of the (Gaussian SER) SuSiE model introduced by [ 1 ] is that, given estimates of b 1 , …, b L− 1 , estimating b L reduces to fitting a single-SER model with an offset: As noted by Wang and colleagues [ 1 ], this observation suggests an iterative approach to fitting the SuSiE model by repeatedly fitting SER models—an algorithm they refer to as Iterative Bayesian Stepwise Selection (IBSS). We build on this idea, we SuSiE with the Servin and Stephens prior using an iterative algorithm in which each iteration fits the SER with the Servin and Stephens prior. We refer to the resulting algorithm as Generalized IBSS (GIBSS), see Algorithm 1 . In short, GIBBS computes the approximate posterior distribution for each single effect b l , which allows computing 1) the posterior mean of each estimated effect and their corresponding credible set (CS). The CSs are computing following [ 1 ]. As opposed to the original IBSS, it is not proven that GIBSS optimizes a proper variational approximation of the posterior distribution of b under the Servin and Stephens prior. Nonetheless, in our numerical experiment, GIBSS showed particularly good performances. In summary, GIBSS computes an approximate posterior distribution for each single-effect coefficient b l , enabling the derivation of both (1) the posterior mean of each estimated effect and (2) the corresponding credible sets (CSs), which are constructed following [ 1 ]. Unlike the original IBSS, GIBSS is not proven to optimize a formal variational approximation of the posterior distribution of b under the Servin and Stephens prior. Nevertheless, our numerical experiments demonstrate strong empirical performance of GIBSS. GIBSS stopping criterion As described in Algorithm 1 , GIBSS terminates when either the maximum number of iterations is reached (set to 100 by default) or a predefined stopping criterion is satisfied. In standard VA methods, it is common practice to monitor the evidence lower bound (ELBO) [ 18 ] and halt the algorithm when the increment in ELBO between consecutive iterations falls below a certain threshold (e.g., 10 − 6 ). Since GIBSS does not explicitly optimize a VA objective, we do not have direct access to an ELBO. Instead, we assess the variation in the parameters ( π 1 , …, π L ) across iterations and stop when the change between consecutive iterations is sufficiently small, i.e., where ϵ is a small threshold, set to 10 − 6 by default. While this stopping criterion is heuristic, we note that some fine-mapping methods that explicitly optimize a variational approximation employ a similar approach in practice (see the implementation of SuSiE-inf [ 6 ]). Some notes on the t distribution Let x ~ t ν be random variable drawn from the t distribution with ν degrees of freedom. The probability density function is Now let y = µ + σx . We denote its distribution as y ~ t ν ( µ, σ 2 ), and its probability density function is The mean of y is µ (for ν > 1) and its variance is (for ν > 2). A useful identity: if and x | σ 2 ~ N ( µ, s 2 σ 2 ), then . Posterior distribution of σ 2 , and b when σ 2 is unknown Let’s denote Then the posterior first and second moment of b j under the SER SS model 37 is given by R script: Below the script to compute the posterior mean and variance for the SS model α = β = 0 Sample code Here the R code to compute the key quantities under the Servin and Stephens prior. posterior_mean_SS<- function(x,y, s0_t=1){ omega <- ((1/s0_t^2)+crossprod(x))^-1 b_bar<- omega%*%(crossprod(x,y)) return(b_bar) } posterior_mean_SS_suff <- function(xtx,xty, s0_t=1){ omega <- ((1/s0_t^2)+xtx)^-1 b_bar<- omega%*%(xty) return(b_bar) } posterior_var_SS <- function (x,y, s0_t=1){ omega <- ((1/s0_t^2)+crossprod(x))^-1 b_bar<- omega%*%(crossprod(x,y)) post_var_up <- 0.5*(crossprod(y) - b_bar *(omega ^(-1))*b_bar) post_var_down <- 0.5*(length(y)*(1/omega)) post_var <- (post_var_up/post_var_down)* length(y)/(length(y)-2) return(post_var) } posterior_var_SS_suff<- function (xtx,xty, n,s0_t=1){ omega <- ((1/s0_t^2)+xtx)^-1 b_bar<- omega%*%(xty) post_var_up <- 0.5*(yty - b_bar *(omega ^(-1))*b_bar) post_var_down <- 0.5*(n*(1/omega)) post_var <- (post_var_up/post_var_down)* n/(n-2) return(post_var) } compute_log_ssbf <- function (x, y, s0) { x <- x - mean(x) y <- y - mean(y) n <- length(x) xx <- sum(x*x) xy <- sum(x*y) yy <- sum(y*y) r0 <- s0/(s0 + 1/xx) sxy <- xy/sqrt(xx*yy) return((log(1 - r0) - n*log(1 - r0*sxy^2))/2) } # assuming centered x and y compute_log_ssbf_suff <- function (xtx, xty, yty, s0,n) { r0 <- s0/(s0 + 1/xtx) sxty <- xty/sqrt(xtx*yty) return((log(1 - r0) - n*log(1 - r0*sxty^2))/2) } Footnotes We have updated and changed the format of the manuscript References [1]. ↵ Wang , G. , A. Sarkar , P. Carbonetto , and M. Stephens ( 2020 ). A simple new approach to variable selection in regression, with application to genetic fine mapping . Journal of the Royal Statistical Society, Series B 82 ( 5 ), 1273 – 1300 . OpenUrl [2]. ↵ Gao , B. and X. Zhou ( 2024 ). MESuSiE enables scalable and powerful multi-ancestry fine-mapping of causal variants in genome-wide association studies . Nature Genetics 56 , 170 – 179 . OpenUrl CrossRef PubMed [3]. Rossen , J. , H. Shi , B. J. Strober , M. J. Zhang , M. Kanai , Z. R. McCaw , L. Liang , O. Weissbrod , and A. L. Price ( 2024 ). MultiSuSiE improves multi-ancestry fine-mapping in All of Us whole-genome sequencing data . medRxiv , doi: 10.1101/2024.05.13.24307291 . OpenUrl Abstract / FREE Full Text [4]. Yuan , K. , R. J. Longchamps , A. F. Pardiñas , M. Yu , T.-T. Chen , S.-C. Lin , et al. ( 2024 ). Fine-mapping across diverse ancestries drives the discovery of putative causal variants underlying human complex traits and diseases . Nature Genetics 56 ( 9 ), 1841 – 1850 . OpenUrl CrossRef PubMed [5]. Zou , Y. , P. Carbonetto , D. Xie , G. Wang , and M. Stephens ( 2023 ). Fast and flexible joint fine-mapping of multiple traits via the Sum of Single Effects model . bioRxiv , doi: 10.1101/2023.04.14.536893 . OpenUrl Abstract / FREE Full Text [6]. ↵ Cui , R. , R. A. Elzur , M. Kanai , J. C. Ulirsch , O. Weissbrod , M. J. Daly , B. M. Neale , Z. Fan , and H. K. Finucane ( 2024 ). Improving fine-mapping by modeling infinitesimal effects . Nature Genetics 56 , 162 – 169 . OpenUrl CrossRef PubMed [7]. ↵ Weissbrod , O. , F. Hormozdiari , C. Benner , R. Cui , J. Ulirsch , S. Gazal , A. P. Schoech , B. van de Geijn , Y. Reshef , C. Márquez-Luna , L. O’Connor , M. Pirinen , H. K. Finucane , and A. L. Price ( 2020 , November ). Functionally informed fine-mapping and polygenic localization of complex trait heritability . Nature Genetics , 1 – 9 . Publisher: Nature Publishing Group . [8]. ↵ Strober , B. , R. Elorbany , K. Rhodes , N. Krishnan , K. Tayeb , A. Battle , and Y. Gilad ( 2019 ). Dynamic genetic regulation of gene expression during cellular differentiation . Science 364 ( 6447 ), 1287 – 1290 . OpenUrl Abstract / FREE Full Text [9]. ↵ Popp , J. M. , K. Rhodes , R. Jangi , M. Li , K. Barr , K. Tayeb , A. Battle , and Y. Gilad ( 2024 ). Cell type and dynamic state govern genetic regulation of gene expression in heterogeneous differentiating cultures . Cell Genomics 4 ( 12 ), 100701 . OpenUrl PubMed [10]. ↵ Aracena , K. A. , Y.-L. Lin , K. Luo , A. Pacis , S. Gona , Z. Mu , et al. ( 2024 ). Epigenetic variation impacts individual differences in the transcriptional response to influenza infection . Nature Genetics 56 ( 3 ), 408 – 419 . OpenUrl CrossRef PubMed [11]. ↵ Lopes , K. P. , G. J. L. Snijders , J. Humphrey , A. Allan , M. A. M. Sneeboer , E. Navarro , B. M. Schilder , R. A. Vialle , M. Parks , R. Missall , W. van Zuiden , F. A. J. Gigase , R. Kübler , A. B. van Berlekom , E. M. Hicks , C. Böttcher , J. Priller , R. S. Kahn , L. D. de Witte , and T. Raj ( 2022 , Jan ). Genetic analysis of the human microglial transcriptome across brain regions, aging and disease pathologies . Nature Genetics 54 , 4 – 17 . OpenUrl CrossRef PubMed [12]. ↵ Gönen , M. , W. O. Johnson , Y. Lu , and P. H. Westfall ( 2005 ). The Bayesian two-sample t test . American Statistician 59 ( 3 ), 252 – 257 . OpenUrl CrossRef Web of Science [13]. ↵ Servin , B. and M. Stephens ( 2007 ). Imputation-based analysis of association studies: Candidate regions and quantitative traits . PLoS Genetics 3 ( 7 ), e114 . OpenUrl [14]. ↵ Liang , F. , R. Paulo , G. Molina , M. A. Clyde , and J. O. Berger ( 2008 ). Mixtures of g priors for Bayesian variable selection . Journal of the American Statistical Association 103 ( 481 ), 410 – 423 . OpenUrl CrossRef Web of Science [15]. ↵ Rouder , J. N. , P. L. Speckman , D. Sun , R. D. Morey , and G. Iverson ( 2009 ). Bayesian t tests for accepting and rejecting the null hypothesis . Psychonomic Bulletin & Review 16 , 225 – 237 . OpenUrl PubMed [16]. ↵ The 1000 Genomes Project Consortium ( 2015 ). A global reference for human genetic variation . Nature 526 ( 7571 ), 68 – 74 . OpenUrl CrossRef PubMed [17]. ↵ Zou , Y. , P. Carbonetto , G. Wang , and M. Stephens ( 2022 ). Fine-mapping from summary data with the “Sum of Single Effects” model . PLoS Genetics 18 ( 7 ), e1010299 . OpenUrl PubMed [18]. ↵ Blei , D. M. , A. Kucukelbir , and J. D. McAuliffe ( 2017 ). Variational inference: a review for statisticians . Journal of the American Statistical Association 112 ( 518 ), 859 – 877 . OpenUrl CrossRef References for Supplementary Notes [19]. ↵ George , E. I. and R. E. McCulloch ( 1997 , April ). Approaches for Bayesian Variable Selection . Statistica Sinica 7 ( 2 ), 339 – 373 . Publisher: Institute of Statistical Science, Academia Sinica . OpenUrl CrossRef [14]. Liang , F. , R. Paulo , G. Molina , M. A. Clyde , and J. O. Berger ( 2008 ). Mixtures of g priors for Bayesian variable selection . Journal of the American Statistical Association 103 ( 481 ), 410 – 423 . OpenUrl CrossRef Web of Science [13]. Servin , B. and M. Stephens ( 2007 ). Imputation-based analysis of association studies: Candidate regions and quantitative traits . PLoS Genetics 3 ( 7 ), e114 . OpenUrl [22]. ↵ Stephens , M. ( 2013 , July ). A Unified Framework for Association Analysis with Multiple Related Phenotypes . PLOS ONE 8 ( 7 ), e65245 . Publisher: Public Library of Science . OpenUrl CrossRef PubMed [1]. Wang , G. , A. Sarkar , P. Carbonetto , and M. Stephens ( 2020 ). A simple new approach to variable selection in regression, with application to genetic fine mapping . Journal of the Royal Statistical Society, Series B 82 ( 5 ), 1273 – 1300 . OpenUrl [18]. Blei , D. M. , A. Kucukelbir , and J. D. McAuliffe ( 2017 ). Variational inference: a review for statisticians . Journal of the American Statistical Association 112 ( 518 ), 859 – 877 . OpenUrl CrossRef [6]. Cui , R. , R. A. Elzur , M. Kanai , J. C. Ulirsch , O. Weissbrod , M. J. Daly , B. M. Neale , Z. Fan , and H. K. Finucane ( 2024 ). Improving fine-mapping by modeling infinitesimal effects . Nature Genetics 56 , 162 – 169 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted September 01, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Accounting for uncertainty in residual variances improves calibration for fine-mapping with small sample sizes Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Accounting for uncertainty in residual variances improves calibration for fine-mapping with small sample sizes William R.P. Denault , Peter Carbonetto , Ruixi Li , The Alzheimer’s Disease Functional Genomics Consortium , Gao Wang , Matthew Stephens bioRxiv 2025.05.16.654543; doi: https://doi.org/10.1101/2025.05.16.654543 Share This Article: Copy Citation Tools Accounting for uncertainty in residual variances improves calibration for fine-mapping with small sample sizes William R.P. Denault , Peter Carbonetto , Ruixi Li , The Alzheimer’s Disease Functional Genomics Consortium , Gao Wang , Matthew Stephens bioRxiv 2025.05.16.654543; doi: https://doi.org/10.1101/2025.05.16.654543 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7637) Biochemistry (17705) Bioengineering (13899) Bioinformatics (41970) Biophysics (21463) Cancer Biology (18605) Cell Biology (25526) Clinical Trials (138) Developmental Biology (13385) Ecology (19911) Epidemiology (2067) Evolutionary Biology (24329) Genetics (15615) Genomics (22514) Immunology (17743) Microbiology (40424) Molecular Biology (17194) Neuroscience (88650) Paleontology (667) Pathology (2835) Pharmacology and Toxicology (4827) Physiology (7648) Plant Biology (15160) Scientific Communication and Education (2046) Synthetic Biology (4302) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-23T02:00:01.238055+00:00
License: CC-BY-4.0