Full text
63,240 characters
· extracted from
preprint-html
· click to expand
A Bayesian Informative Shrinkage Approach for Large-scale Multiple Hypothesis Testing (BISHOT): with Applications in Differential Analysis of Omics Data | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results A Bayesian Informative Shrinkage Approach for Large-scale Multiple Hypothesis Testing (BISHOT): with Applications in Differential Analysis of Omics Data Ya Su , Mary Eunice Joy Z. Clark , Chi Wang doi: https://doi.org/10.1101/2025.09.11.675690 Ya Su 1 Department of Statistical Sciences and Operational Research, Virginia Commonwealth University , Richmond, VA 23284-3083, U.S.A. 
Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: suyaf{at}vcu.edu Mary Eunice Joy Z. Clark 1 Department of Statistical Sciences and Operational Research, Virginia Commonwealth University , Richmond, VA 23284-3083, U.S.A. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Chi Wang 2 Division of Cancer Biostatistics, Department of Internal Medicine and Markey Cancer Center, University of Kentucky , Lexington, 40536, U.S.A. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract A major goal of many omics studies is to identify differential features, e.g. differentially expressed genes, between experimental groups. When performing differential analysis for a given dataset, relevant information from another platform or species is often available. Incorporating such prior information can help identify features that show consistent differential patterns across platforms or species, which are more likely to reflect shared biological processes, and thereby enhance the robustness and generalizability of the findings. However, existing differential analysis methods typically analyze only the data from the current study and do not leverage prior knowledge about the magnitude or direction of changes from other platforms or species. We address this challenge, and the associated multiple testing problem, using a Bayesian framework that enables the incorporation of prior knowledge obtained from different platforms or species. We propose a new test statistic, Bayesian Credible Ratio (BCR), based on a heteroscedastic global local shrinkage prior, and a new multiple testing criterion, sign-adjusted FDR (SFDR), that emphasizes information regarding the direction of the differential features. 
We prove that BCR achieves the largest count of sign-based true positives among all legitimate SFDR-controlling methods. Simulation results offer numerical evidence of its advantage compared to an empirical Bayesian method. The approach is demonstrated through the analysis of RNAseq and single-cell RNAseq datasets. 1 Introduction The statistical differential analysis plays an important role in many omics studies that involve two or more experimental groups. Various statistical methods have been developed to identify differentially expressed genes from RNAseq or single cell RNAseq (scRNAseq) ( Finak et al., 2015 ; Love et al., 2014 ; Robinson et al., 2010 ; Smyth, 2005 ; Wu et al., 2013 ), differentially bound sites from ChIPseq ( Ross-Innes et al., 2012 ), differentially accessible peaks from ATACseq ( Gontarz et al., 2020 ), or differentially abundant proteins/metabolites from mass spectrometry ( Huang et al., 2020 ; Li et al., 2019 ) between experimental groups. When conducting the differential analysis for a given omics dataset, there is often relevant information available from another platform or species. Such prior information enables investigators to distinguish features that exhibit consistent differential patterns across platforms or species from those specific to a single platform or species. Because features that are consistent across platforms or species likely reflect shared underlying biological processes ( Vitorino, 2024 ), prioritizing them can enhance the robustness and generalizability of the findings ( Altenbuchinger et al., 2017 ; Wang et al., 2022 ). A few such examples are as follows: Example 1: Hohmann et al. (2016) performed an RNAseq experiment on a human acute myeloid leukemia cell line, MV4, to investigate changes in the gene expression profile with the treatment of BI-7273, a BRD9 inhibitor. In addition to this experiment, the investigators also performed an RNAseq experiment on a mice cell line, RN2, using the same treatment. 
Hohmann et al. (2016) found that MYC expression was downregulated in BRD9 inhibitor–treated cells in both human (MV4) and mouse (RN2) cell lines. This consistent suppression of MYC across species provides strong evidence that BRD9 plays a critical role in sustaining MYC transcription. Example 2: Angelidis et al. (2019) conducted a scRNAseq experiment that profiled the cell types and gene expressions in old and young mice to study lung aging. An analysis of interest is to identify differentially expressed genes in type-2 pneumocytes, one of the most prevalent cell types in the samples, between old and young mice. In parallel, the authors also conducted a bulk RNAseq experiment on type-2 pneumocytes, selected by flow cytometry sorting, from old and young mice. Angelidis et al. (2019) found significant agreement between the scRNAseq results and bulk RNAseq results, supporting the robustness of the age-related genes identified by scRNAseq. Example 3: Li et al. (2022) performed an ATACseq experiment to identify chromatin regions related to resistance to second-generation androgen receptor inhibitors, e.g. enzalutamide, by comparing enzalutamide-sensitive and resistant cell lines. Prior to this experiment, the investigators had already performed an RNAseq experiment on the same cell lines ( Li et al., 2020 ). Li et al. (2022) focused on consensus genes that were both differentially expressed in RNAseq data and had differentially accessible promoter peaks in ATACseq data, thereby narrowing down the list of candidate genes and ultimately identifying a potential therapeutic target, GSTM2, for overcoming resistance to enzalutamide. 
A common approach for leveraging prior information is to consider a two-step procedure, where the first step is to perform differential analysis on the current dataset alone to identify significant features, and the second step is to only keep the subset of significant features that are consistent with prior information from other platform/species ( Chen et al., 2024 ; Li et al., 2022 ). However, since all features are analyzed together and treated equally in the first step, the significance levels of true consistently differential features can be reduced due to the presence of platform/species-specific differential features, e.g. a relatively small (but significant) effect claimed by both prior and current data, therefore consistent, can be declared as insignificant due to low signal noise ratio in the current data, e.g., in the analysis of MV4 cell line data in Section 5 , the current standard approach fails to identify MYC which is claimed to be significant by BISHOT. In addition, downstream analyses, such as gene set enrichment analysis, may become complicated since ranking genes based on significance levels is difficult with the filtering based on prior information. Despite the demand for a rigorous procedure to incorporate these prior information, most existing differential analysis methods perform the analysis based on the given dataset alone without allowing historical prior information to be supplied ( Robinson et al., 2010 ; Smyth, 2005 ; Sun and McLain, 2012 ). Although leveraging historical data has been explored ( Li et al., 2017 , 2016 ), the focus has primarily been on stabilizing the estimation of mean and variance parameters, either by treating historical data as informative priors ( Li et al., 2016 ) or by grouping genes with similar variations from historical data ( Li et al., 2017 ). To our knowledge, there is a lack of methods that explicitly incorporate prior knowledge about the magnitude or direction of changes, e.g. 
fold changes in gene expression, between experimental groups. Another potential area of research is regarding multiple testing error control, given the number of genes in expression data grows substantially. The property of false discovery rate (FDR) has been extensively studied in the literature ( Benjamini and Yekutieli, 2001 ; Genovese and Wasserman, 2004 ; Storey, 2003 , among others). Various methods consider alternatives to the traditional p -value based FDR control approach ( Benjamini and Hochberg, 1995 ), by using some ‘local’ measure for evidence towards the alternative hypothesis H ag ( Efron, 2008 ; Stephens, 2017 ; Sun and McLain, 2012 ), which quantifies individualized ‘significance’ under data heteroscedasticity and arguably enhances the power in large scale testing problems. More evidence regarding benefits of empirical Bayes based procedures in handling multiple comparisons can be found in Efron (2012) , Kendziorski et al. (2003) , Muralidharan (2010) and references therein. As a comparison method, we briefly describe Sun and McLain (2012) below. For simplicity, we will derive our method based on a linear regression model for RNAseq data. An extension to a more complicated model for scRNAseq data is provided in Section 6 . For gene g ( g = 1, …, G ) in subject i ( i = 1, …, n ), let Y ig be the normalized expression level in the form of log 2 (CPM + 1), where CPM denotes counts per million. We consider a classical regression model that allows for heteroscedasticity for Y ig : $$Y_{ig} = c_g + X_i^{\top}\beta_g + \varepsilon_{ig}, \qquad \varepsilon_{ig} \sim N(0, \sigma_g^2). \tag{1}$$ Generally speaking, X i ∈ R p is a set of covariates for subject i, c g is the intercept, $\sigma_g^2$ is the variance and β g is a vector of coefficients for gene g . The methodology is illustrated for the application of comparing RNAseq gene expressions between two groups, e.g. control and treatment, corresponding to model (1) with p = 1 where X i ∈ { 0, 1 } represents a treatment group indicator, equivalently, β g is the log 2 fold change (LFC) quantifying the treatment effect for gene g . 
The problem we are interested in is whether β g is different from 0 for g = 1, …, G . A natural statistical formulation is via hypothesis test. Specifically we consider the null hypotheses to be composite, that is $$H_{0g}: \beta_g \in A \quad \text{vs} \quad H_{ag}: \beta_g \notin A, \qquad g = 1, \ldots, G. \tag{2}$$ In this context we consider A = (− ϵ, ϵ ), a small neighborhood containing 0. The composite null hypothesis ensures that identified differences are sufficiently large to be biologically meaningful, with the choice of ϵ informed by domain experts ( McCarthy and Smyth, 2009 ). A popular multiple hypothesis testing procedure is built upon some summary statistic $\hat{\beta}_g$ which is normally distributed and centered on β g , that is, $\hat{\beta}_g \mid \beta_g \sim N(\beta_g, s_g^2)$, where $s_g$ denotes its standard error. A common shrinkage prior with a zero component is typically utilized for β g , g = 1, …, G . That is, β g ∼ f ( · ), with f ( x ) = π 0 δ 0 ( x ) + (1 − π 0 ) g ( x ), where δ 0 ( x ) is a point mass at 0, g ( x ) is the distribution for non-zero signals and the probability of observing 0 is given by π 0 . Sun and McLain (2012) is specifically designed for composite null and the proposed ‘local’ estimate is proven to be optimal within a family of estimators called MLRC in large scale hypothesis testing. There are several potential limitations in the above and similar procedures. First, they do not apply when there is prior knowledge about β g as those procedures make heavy use of a common prior distribution for β g . Second, they rely on the existence of the summary statistic and its marginal distribution over β g , which are crucial in estimating π 0 and g ( · ). Besides, estimations of π 0 and g ( · ) are shown to be unstable from our simulation studies especially for smaller sample sizes. We introduce a new Bayesian informative shrinkage multiple hypothesis testing (BISHOT) procedure that addresses the question of elucidating consistent features between two data sources with proven error control. 
Specifically, we propose a new shrinkage prior distribution where the local parameter plays a key role in governing the prior influence and is numerically shown to be adaptive to the feature-level coherence from prior to present. In addition, a new test statistic, Bayesian Credible Ratio (BCR), based on the posterior distribution is proposed which accounts for the heteroscedasticity of genes in favor of one side of tail over the other. While conducting multiple tests, we also take the tail-favoring perspective and propose a new criterion called the sign-adjusted FDR (SFDR). The calculation of BCRs for all genes combined naturally promotes parallelization across all genes. The decision process according to BCR is proven to attain the maximum number of true positives among all valid sign-adjusted false positive controlled procedures. To the best of our knowledge, it is a difficult task for the available approaches to work with a more complicated model or when a vector β g is present due to the lack of explicit distribution of as well as multivariate deconvolution methods in the presence of unknown point mass on zero. The fully Bayesian framework we propose lays out a potential path to this setting without the need of establishing and carefully modeling f ( · ) and thus it is free from estimating non-null probability and non-null distribution. We present our solution for a two component model for scRNAseq data in Section 6 . Outline of this paper is as follows. Section 2 describes the construction of the prior distribution and the test statistic named the Bayesian credible ratio, its connection with the decision rule. Section 3 introduces a classification risk-based framework for large-scale differential analysis, detailing threshold selection and establishing the procedure’s optimality. Section 2 and Section 3 combined lead to the proposed BISHOT. Section 4 includes simulation results and comparisons with the above popular method. 
Applications to RNAseq and scRNAseq gene expression data are illustrated in Sections 5 and 6 . We conclude with several discussions in Section 7 . 2 Bayesian credible ratio We illustrate the proposed method using model (1); we later dive into a more complicated model applicable to scRNAseq data in Section 6 . The likelihood function corresponding to (1) for a given gene feature g along with observed expressions Y ig , i = 1, …, n is: $$L(c_g, \beta_g, \sigma_g \mid D_g) = \prod_{i=1}^{n} \phi_{\sigma_g}\!\left(Y_{ig} - c_g - X_i^{\top}\beta_g\right),$$ where, by convention, ϕ σ denotes the normal density with mean zero and standard deviation σ . We propose a heteroscedastic global local shrinkage (HGLS) prior for β g based on our knowledge about β g : $$\beta_g \mid \lambda_g, \tau, \sigma_g \sim N\!\left(h_g,\ \lambda_g^2 \tau^2 \sigma_g^2\right). \tag{3}$$ The input of non-central parameter h g is reflective of the information given about β g ; in this work, h g is a fixed value and set as the prior estimate for the treatment effect associated with feature g based on findings from earlier studies. The prior variance consists of the local parameter λ g and the global parameter τ , in addition to the standard deviation σ g from error distribution in (1). HGLS for β g builds upon the classical global and local prior by explicitly modeling the substantial heteroscedasticity present in gene expression data. Contrary to zero, it has a shrinkage effect towards h g , the existing knowledge about β g . The shrinkage is also relative to the variability of that specific gene feature captured by σ g . The local shrinkage λ g governs the prior influence and is learned in a data-driven manner based on the gene-specific coherence between historical and current data. As illustrated in Figure 5 , λ g increases with the discrepancy between historical and current treatment effects, thereby reducing the impact of historical values when they are not aligned with current findings. In this work we adopt standard prior distributions for shrinkage parameters λ g and τ from horseshoe. That is, λ g , τ ∼ Half-Cauchy(0, 1). 
The heteroscedastic error variance $\sigma_g^2$ takes a conjugate (inverse-gamma) prior, with hyperparameters ξ 1 and ξ 2 set according to the moments of empirical estimates of $\sigma_g^2$ given the observed gene expressions. The overall expression c g is assigned a prior distribution in the general case. Under the proposed prior (3) and model (1) for the expression data, we can derive the (conditional) posterior distribution of β g , c g , $\sigma_g^2$ , λ g and τ given data D g = {Y ig , i = 1, …, n} and h g , where λ g and τ are sampled using slice sampling algorithm, similar to Bhattacharya et al. (2016) . For better mixing of the chain, we sample β g given λ g and τ while integrating out σ g . The conditional posterior distributions for all parameters are explicitly written in the Appendix A. The proposed test procedure is based on the tail evidence from the posterior distribution of the targeting parameter. For the purpose of illustration, in the remainder of the section we simplify the notation from β g and D g to β and D correspondingly, e.g., posterior distribution of β is denoted as Pr( β | D ). We define the Bayesian credible ratio (BCR) for the hypothesis test H 0 : β ∈ (− ϵ, ϵ ) vs H a : β ∉ (− ϵ, ϵ ) by C β = C 0, β (1 − C a,β ) −1 , where C 0, β = Pr( β ∈ (− ϵ, ϵ )| D ) and C a,β = min { Pr( β ≤ − ϵ | D ), Pr( β ≥ ϵ | D ) } are the posterior probability for β over (− ϵ, ϵ ) and its smaller tail over (−∞, − ϵ ) or ( ϵ , ∞). The null hypothesis is rejected if C β is smaller than a threshold 0 ≤ λ ≤ 1. Given the model and the prior structures, the hypothesis test in (2) is analyzed by BCR based on the posterior samples of β g . BCR compares evidence towards the null/alternative hypotheses by C 0, β and 1 − C a,β . The notion of BCR is model and prior free and might be of interest for its own purpose. When hypotheses regarding all feature effects are considered, we suggest the threshold λ be selected to control the sign-adjusted FDR (SFDR) in the multiple testing regime, see details in Section 3 . 
2.1 BCR as a Bayesian decision rule Generally speaking, a hypothesis testing problem H 0 : β ∈ Θ 0 vs H 1 : β ∈ Θ 1 can be formulated as a Bayesian decision process ( Berger, 2013 ). Let a 0 / a 1 be the action of accepting H 0 / H 1 , the loss function for a i is L ( a i , β ), i = 0, 1. Consider a simple ‘0 − K i ’ loss function, L ( a i , β ) = 0 if β ∈ Θ i and L ( a i , β ) = K i if β ∉ Θ i . The expected posterior loss for a 0 and a 1 is K 0 P (Θ 1 | D ) and K 1 P (Θ 0 | D ) correspondingly. The Bayes rule opts for the action which minimizes the expected posterior loss, that is, accepting H 0 when P (Θ 0 | D ) > K 0 / ( K 0 + K 1 ) and accepting H 1 otherwise. When Θ 0 = (− ϵ, ϵ ), we revise the above decision process as follows. First since the alternative set is composed of two non-overlapping sets, we consider performing two hypotheses with alternatives being H 1 : β ∈ Θ 1 = (−∞, − ϵ ) and H 2 : β ∈ Θ 2 = ( ϵ , ∞). In this circumstance, we define three actions a 0 , a 1 or a 2 corresponding to accepting H 0 , H 1 or H 2 . Suppose the ‘0 − K i ’ loss functions are chosen for a i , i = 0, 1, 2 under a constraint K 1 = K 2 (no distinction of loss under a 1 or a 2 ), the Bayes rule regarding H 0 and H 1 is accepting H 0 when P (Θ 0 | D ) K 1 > K 0 P (Θ 1 | D ) and accepting H 1 otherwise; the Bayes rule regarding H 0 and H 2 is accepting H 0 when P (Θ 0 | D ) K 1 > K 0 P (Θ 2 | D ) and accepting H 2 otherwise. We propose the ‘universal’ Bayes rule regarding H 0 : β ∈ Θ 0 vs H a : β ∈ Θ 1 ∪ Θ 2 as accepting H 0 when both decisions made above agree with action a 0 and accepting H a when either of the above decisions says otherwise. Equivalently, the universal decision process is accepting H 0 when P (Θ 0 | D ) > ( K 0 /K 1 ) max {P (Θ 1 | D ), P (Θ 2 | D ) } and accepting H a otherwise. Define λ = K 0 / ( K 0 + K 1 ), 0 < λ < 1. Since 1 − C a,β = P (Θ 0 | D ) + max {P (Θ 1 | D ), P (Θ 2 | D ) }, the universal decision process is equivalent to accepting H 0 if C β > λ or accepting H a if C β < λ , where C β is the BCR defined previously. 
3 Optimal decision rule for differential analysis via sign-adjusted classification risk In the presence of a large number of features, the issues of multiple testing emerge when hypotheses tests (2) are carried out for each feature individually. To design an optimal decision rule while controlling the marginal false discovery rate (mFDR), Sun and Cai (2007) shows the equivalence to study the weighted classification risk corresponding to β ∈ (− ϵ, ϵ ) or β ∈ / (− ϵ, ϵ ). The optimal decision rule is shown to be the minimizer to the classification risk, while the associated statistic is optimal in that it minimizes the false nondiscovery rate, subject to a constraint on the false discovery rate. As detailed in Section 2.1 , the Bayes decision rule governed by BCR, is based on two separate decisions regarding β g ≤ − ϵ or β g ≥ ϵ . We will show that this Bayes rule is an estimator of the optimal decision/minimizer to a weighted classification risk with sign preferences for all features. Furthermore, it is proven to capture the maximum number of sign-adjusted expected true positives while controlling for the marginal SFDR (mSFDR). The proofs to all Theorems in this Section can be found in the Appendix A. 3.1 BCR as minimizer of classification risk utilizing sign preferences Define We purposely treat β g ≤ − ϵ and β g ≥ ϵ separately since the combination of the data and the informative shrinkage prior should provide evidence (if any) favoring one of them rather than both. Define δ g = 1 if the null hypothesis for feature g is rejected and 0 otherwise, δ and ω represent the concatenated vector of δ g and ω g for g = 1, …, G . Let λ ∈ [0, 1] represent the weight governing the false positive/negative losses. Define , where y g = {Y 1 g , …, Y ng } is the vector of observations for feature g . Consider first two loss functions to losses of detecting negative effects ( ω g = 1) or positive effects ( ω g = 2) correspondingly: Theorem 1. 
For k = 1, 2, the minimizer of EL k,λ (the expected value of L k,λ ) is denoted by δ k , which is the vector form of {δ kg , g = 1, …, G} where . EL k,λ is defined over the probability law of δ and ω . Next, we define a partition of all features with regards to the preference of positive or negative signs when rejecting. This partition exists under Assumption 1 below. Incorporating information about the sign of the parameter to construct decision, or to choose between the terms in L 1, λ or L 2, λ , leads to a more conservative decision than ignoring the sign. It is preferred in our setting when features are provided with additional confidence per prior knowledge (3). Assumption 1 . For a fixed , either (i) or (ii) holds: (i) φ ϵ ,∞ ( λ ) ≤ φ −∞,− ϵ ( λ ) for all λ ∈ (0, 1) (ii) φ ϵ ,∞ ( λ ) ≥ φ −∞,− ϵ ( λ ) for all λ ∈ (0, 1). Assumption 1 requires that the sign preference is invariant to λ , that is, case (i) or case (ii) holds universally for any choice of λ. φ −∞,− ϵ ( λ ) has an equivalent form and similarly . Therefore, under case (i) or (ii) is expected to be larger or smaller when the decision is rejection. Let 𝒮 or 𝒮 c denote the set of features which belong to case (i) or (ii) correspondingly. Based on Assumption 1, loss in L 1, λ tends to concern features in 𝒮 more than those in 𝒮 c , this motivates us to prioritize ω g = 1 or ω g = 2 in each scenario and thus combine δ 1 and δ 2 into one. The combined decision we propose is and . Denote δ OPT as the concatenated vector. Theorem 2. δ OPT minimizes EL 3, λ where The optimal decision δ OPT has one missing piece: the set 𝒮 is unknown and needs to be estimated. When δ g = 1, it is expected that if g ∈ 𝒮 and if g ∈ 𝒮 c . Therefore the optimal decision can be estimated empirically as . According to the definitions of and , one comes to an equivalent expression where and . This establishes the relationship that BCR is an estimator of . 
3.2 Selection of λ in the multiple testing regime The choice of λ can be selected by controlling the SFDR at the desired level α . In practice, which denotes the marginal SFDR, and it can be shown that mSFDR = SFDR + O ( G −1 / 2 ) using the same technique as Genovese and Wasserman (2002) for the asymptotic equivalence for mFDR and FDR. mSFDR is the mFDR based on sign of the parameters and has a natural link to the classification risk L 3, λ . Although the explicit form of mSFDR( δ ) is generally unknown, we can estimate it using the following procedure (a slight modification of Section 3.2 in Sun and McLain (2012) ): The numerator of mSFDR( δ ) can be unbiasedly estimated by . The same arguments hold true for the denominator of mSFDR( δ ), which can be estimated by . Together with these estimators, we select λ ∈ (0, 1) to be the largest such that . 3.3 Optimal property of δ OP T The discovery of true positives is one criterion to compare procedures given the same FDR α . Under additional knowledge about the sign of the signal (given rejection), as the proposed mSFDR, the true positives count is modified accordingly, that is, true positives count are refined with information about sign appropriately. Definition 1. The sign-adjusted expected number of true positives is defined as follows: Clearly, compared to the traditional ETP, SETP only counts when the parameter β g is in the correct sign given rejection, which is divided up by either 𝒮 or 𝒮 c (Remark 3.1). The following Theorem states that the proposed decision is optimal in achieving the greatest SETP under the same mSFDR threshold. Theorem 3. Assume that λ is chosen such that mSFDR = α for a fixed α . The procedure δ OP T has the largest SETP among all valid mSFDR procedures. 4 Simulations 4.1 Simulation setup Our simulation studies is based on model (1) which can be useful for comparing gene expressions in two groups, e.g., control and treatment. 
This is achieved by setting X i to be an indicator (0 or 1) variable of whether the i th expression is in the treatment group; β g thus becomes the treatment effect. We select a simulation scheme where 100(1 − p )% of the genes share the same expression mechanisms between two groups. That is, β g is set to zero except for 100 p % of the genes that are randomly selected. The proportion p is normally referred to as the proportion of differentially expressed (DE) genes. The following choices are used: p = 0.1 or p = 0.5 corresponding to low or moderate presence of DE genes. Denote the set of DE genes as Ω, hence β g = 0 when g ∉ Ω. For g ∈ Ω, β g ∼ N(log 2 (2), 0.1 2 ) (up to a random sign placement). In the Appendix A, a separate simulation with expressions of DE genes generated from a unimodal distribution is conducted. In model (1) the following values are used: c g = 4, σ g = 1 for a total G = 5000 genes. The sample size for treatment/control group is n = 10 or n = 50 with the total sample size for the two groups doubled. The rest of the hyperparameters required are set as follows. The hyperparameters ξ 1 and ξ 2 are determined such that the first and second moments for all (pooled) sample variances of the expressions (combining treatment and control group) match with those under the prior. We consider two options for ϵ in the hypothesis tests (2), log 2 (1.5) or log 2 (1.2) targeting at moderate or small treatment effects in DE analysis. The prior distribution for β g is centered on h g if g ∈ Ω. For h g , we purposely perturb the true value of β g with h g ∼ N( β g , 0.5 2 ), that is, the information regarding β g is only within a vicinity of its true value. We generate N = 50 replicated data sets under each scenario and results are summarized in Section 4.2 . We provide a concise description of the method that is compared to BISHOT, denoted as SunSpike0. 
As mentioned in the introduction, the algorithm in Sun and McLain (2012) is theoretically-justified and suitable for composite null hypothesis. Since it relies on estimating the distribution of β g with a zero component and a distribution for DE genes given , it is not directly applicable to any data-generating model (when the distribution of does not have an explicit form as such) but can be adapted to model (1) if is selected to be the MLE and as the standard error. To the best of our knowledge the code underlying Sun and McLain (2012) is not publicly available. Alternatively, the R package ashr ( Stephens, 2017 ) can extract the distribution of β g assuming the distribution for DE genes is unimodal and symmetric with which we estimate the test statistic in Sun and McLain (2012) . The resulting estimate is then used in controlling mSFDR and is compared with BISHOT in the simulation above. Similar findings are present in simulation II where the unimodal and symmetric distribution is used as the true distribution for β g (the Appendix A). 4.2 Simulation results Under a collection of SFDR nominal values ranging over [0, 0.25], we validate the actual SFDR achieved using (4) with δ g and the true ω g . Figure 1 and 2 display the actual SFDR values averaged over 50 simulations against the corresponding nominal SFDR thresholds for the four scenarios given by n and p when ϵ = log 2 (1.2) ( Figure 1 ) and ϵ = log 2 (1.5) ( Figure 2 ). It can be seen that almost always BISHOT produces SFDR much closer to its nominal value while SunSpike0 tends to yield much higher SFDR and not surprisingly the advantage becomes more prominent when the sample size is smaller and/or there are fewer DE genes (small n and/or small p ). It is noteworthy that SunSpike0 behaves quite differently when the proportion of DE genes p and sample size n change; conversely, BISHOT is dramatically more stable/robust among all scenarios given, thanks to the informative prior. 
Generally speaking, ϵ has an impact on the conservativeness of both methods: genes with small or zero β g become more distinct from the genes with large β g relative to a larger threshold, hence the actual false discovery proportion tends to decrease when ϵ increases. Hence the desired conservativeness of the method can provide some guidance for choosing ϵ . Download figure Open in new tab Figure 1: The actual signed false discovery rate (SFDR) when ϵ = log 2 (1.2) averaged over 50 simulated data using mSFDR threshold 0.001 to 0.25 corresponding to 10 (left panel) or 50 (right panel) samples in the treatment and control group. The top and bottom panels display the cases when p = 0.1 and p = 0.5, respectively. The solid black line corresponds to the y = x line. Download figure Open in new tab Figure 2: The actual signed false discovery rate (SFDR) when ϵ = log 2 (1.5) averaged over 50 simulated data using mSFDR threshold 0.001 to 0.25 corresponding to 10 (left panel) or 50 (right panel) samples in the treatment and control group. The top and bottom panels display the cases when p = 0.1 and p = 0.5, respectively. The dashed black line corresponds to the y = x line. We next compare how the top-ranked genes match the true DE genes. For this purpose, we select the first R genes, $g_{(1)}, \dots, g_{(R)}$, ranked by their SFDR-based q values (the minimum SFDR you can achieve by calling a given test significant) and report the proportion of those that fall in Ω and whose absolute values of β g are greater than ϵ as well, that is, $p_R = \frac{1}{R} \sum_{r=1}^{R} \mathbb{1}\{ g_{(r)} \in \Omega,\ |\beta_{g_{(r)}}| > \epsilon \}$. The value R is selected from 1 to the total number of DE genes plus 100, that is, Gp + 100. The averaged proportion p R for all simulations against R is shown in Figures 3 and 4 corresponding to ϵ = log 2 (1.2) and ϵ = log 2 (1.5). The R highest ranked genes by BISHOT are never falsely detected even for relatively large R (relative to the size of Ω), while SunSpike0 does not perform as well, especially when expression sample size is small, regardless of p and ϵ . 
Hence BISHOT is more robust and its top-ranked genes are more accurate. Download figure Open in new tab Figure 3: The proportion of true DE genes for the first R genes with smallest q values when ϵ = log 2 (1.2) averaged over 50 simulated data for R = 1, …, 5000 p + 100 corresponding to 10 (left panel) or 50 (right panel) samples in the treatment and control group. The top and bottom panels display the cases when p = 0.1 and p = 0.5, respectively. Download figure Open in new tab Figure 4: The proportion of true DE genes for the first R genes with smallest q values when ϵ = log 2 (1.5) averaged over 50 simulated data for R = 1, …, 5000 p + 100 corresponding to 10 (left panel) or 50 (right panel) samples in the treatment and control group. The top and bottom panels display the cases when p = 0.1 and p = 0.5, respectively. 5 Transcriptional effects of BRD9 inhibition in acute myeloid leukemia Acute myeloid leukemia (AML) is a type of blood cancer with a five-year survival rate of 31.9% ( Surveillance Research Program, National Cancer Institute, 2025 ). Hohmann et al. (2016) investigated the potential of BRD9, a subunit of the SWI/SNF chromatin remodeling complex that promotes cell proliferation, to serve as a therapeutic target for AML. Hohmann et al. (2016) conducted two RNAseq experiments, one using murine RN2 cell line and the other using human MV4 cell line. In each experiment, cells were treated with BI-7273, a BRD9 inhibitor, or control with two replicates per group. RNAseq data were generated to elucidate the transcriptional effects of BRD9 inhibition. We applied BISHOT to the Hohmann RNAseq data, which were transformed into log 2 (CPM + 1) values. We focused on identifying DE genes from human MV4 cell line while using the RNAseq data from murine RN2 cell line as prior knowledge. Specifically, the parameter h g was specified based on the LFC for gene g in the data from the murine RN2 cell line. 
Two options for ϵ are considered, log 2 (1.2) or log 2 (1.5), the same as those used in Section 4.2 . In Figure 5 we illustrate the effect of shrinkage by examining the relationship of the local shrinkage λ g with the consistency between prior knowledge based on RN2 and what the MV4 data conveys about β g , measured by the distance between h g and the maximum likelihood estimator $\hat{\beta}_g$. The shrinkage is in accordance with this consistency; to be more specific, it is prominent when $|\hat{\beta}_g - h_g|$ is small and less so when $|\hat{\beta}_g - h_g|$ is large. Download figure Open in new tab Figure 5: The relationship between the gene-specific shrinkage parameter and the distance between the two sources of information. The former is estimated by the posterior mean and the latter is by the absolute value of the difference between the Maximum likelihood estimator and the center of the prior distribution. Figure 6 presents the identified DE genes at SFDR < 0.05. Genes with consistently large LFCs ( > ϵ ) in both RN2 and MV4 (top-right and bottom-left regions) were identified as DE, except for a few genes with high variations. Conversely, no genes with consistently small LFCs ( < ϵ ) in both RN2 and MV4 (middle region) were identified as significant. Hence, by sharing information, the prior data from RN2 reinforced the analysis of MV4 data in determining DE genes, thanks to the reduction of posterior uncertainty when information is consistent. Download figure Open in new tab Figure 6: Gene expression LFCs between drug-treated and control groups based on RN2 (x-axis) and MV4 (y-axis) cell lines. Each dot represents a gene, colored according to DE results from the MV4 cell line. Left panel: ϵ = log 2 (1.2); right panel: ϵ = log 2 (1.5). For genes with large LFCs in MV4 but relatively small LFCs in RN2 (vertical band outside the middle square), many were also identified as DE, demonstrating that our method prioritizes genes strongly supported by observed data, even when prior information is limited. 
This is important, as differences between prior data (e.g., mice) and the data under analysis (e.g., human) are expected, and the method should be able to detect features specific to the current dataset. Interestingly, many genes with large LFCs in RN2 but relatively small LFCs in MV4 (horizontal band outside the middle square) were not identified as differentially expressed, likely due to inconsistent LFCs or high variability in MV4. This result highlights that prior information alone is insufficient. Genes must be supported by the data under analysis to be considered significant. Figure 6 also highlights the difference in DE results between BISHOT and SunSpike0. Genes that were called as DE by SunSpike0 only (green dots) tend to be the ones with absolute LFCs > ϵ in the MV4 but not in the RN2 data. In contrast, genes that were called as DE by BISHOT only (red dots) tend to be the ones with LFCs consistently greater than ϵ in both MV4 and RN2 data. In other words, BISHOT preferentially identifies signals that exhibit consistency between the two datasets. A Venn diagram comparing the number of DE genes between BISHOT and SunSpike0 is provided in Appendix B. We also compared the results from BISHOT with other popular methods available in R including limma ( Smyth, 2005 ), voom ( Law et al., 2014 ), and edgeR ( Robinson et al., 2010 ). The limma (using the lmFit function in the limma package with log 2 (CPM + 1) values as the input), voom (using the voomLmFit function in the limma package with raw counts as the input), and edgeR (using the edgeR package with raw counts as the input) methods did not identify any DE genes at FDR < 0.05. A key gene of interest in Hohmann et al. (2016) is MYC , where the authors showed that BRD4 plays a critical role in sustaining MYC transcription. We compared the significance level of the MYC gene across different differential analysis methods. 
BISHOT with an ϵ = log 2 (1.2) yielded a q -value of 0.02, whereas all other methods produced non-significant q -values ( Figure 7 ). Because MYC had a moderate LFC of −0.33 in the MV4 data, it was missed by differential analysis methods relying solely on that dataset. However, because MYC also showed a LFC of −0.36 in the RN2 data, which was highly consistent with that in the MV4 data, BISHOT with an ϵ = log 2 (1.2) was able to leverage such prior information to identify the consistent down-regulation of MYC . In addition, we noted that BISHOT with an ϵ = log 2 (1.5) produced a non-significant q -value for MYC because the method emphasizes genes with large fold changes and therefore did not capture MYC given its moderate fold change. Download figure Open in new tab Figure 7: Comparison of the q -value for MYC across different methods. The dashed horizontal line indicates q -value=0.05. 6 The scRNAseq analysis of the aging lung Angelidis et al. (2019) conducted a scRNAseq study in old (24 months) and young (3 months) mice to investigate cell type and gene expression alterations in lung aging. One of the most prevalent cell types identified in the study was type-2 pneumocytes. In parallel, the authors also conducted a bulk RNAseq experiment on type-2 pneumocytes, selected by flow cytometry sorting, from old and young mice. We applied BISHOT to scRNAseq data for type-2 pneumocytes to identify differentially expressed genes between old and young mice. The flow-sorted bulk RNAseq data for type-2 pneumocytes were used as our prior knowledge. We first extend our model to handle scRNAseq data, which contains dropouts. Let Y ig be the log 2 (CPM+ 1) expression value of gene g in cell i . Suppose Z ig is a dropout indicator, where Z ig = 1 if Y ig ≠ 0 and Z ig = 0 if Y ig = 0. We adopt the two-component model developed in Finak et al. 
(2015) : a logistic regression for Z ig and a classic regression model for Y ig conditioning on Z ig = 1, where corresponding to the probability component and the nonzero component, the parameters are superscripted by D or C . One would identify two treatment effects $\beta_g^D$ and $\beta_g^C$, representing differentiations in terms of log odds for the probability of nonzero and the expected nonzero expression. The question becomes which genes should be claimed significant based on either β D or β C . Equivalently, we aim at a bivariate hypothesis for each gene, $H_{0g}: \beta_g \in A$ vs $H_{ag}: \beta_g \notin A$ with $\beta_g = (\beta_g^D, \beta_g^C)^\top$ and $A = \{ (\beta^D, \beta^C)^\top : |\beta^D| \le \epsilon_1,\ |\beta^C| \le \epsilon_2 \}$. The choices of ϵ 1 ∈ { log(1.2), log(1.5) } and ϵ 2 ∈ { log 2 (1.2), log 2 (1.5) } correspond to an LFC or log odds ratio of 1.2 or 1.5 converted to the scale of Z or Y . To this end, we need to adapt the BCR and SFDR to the bivariate case. BCR is motivated to incorporate the information about sign of the parameter, in this case, four scenarios regarding the signs for $\beta_g^D$ and $\beta_g^C$ are possible, so the bivariate BCR is defined as where and . Similarly, mSFDR can be adjusted based on these four cases (expressions are omitted here for conciseness). We applied BISHOT to the scRNAseq data while incorporating LFCs from bulk RNAseq (denoted as µ g ) in the prior, specifically, h g = ( κµ g , µ g ) ⊤ where κ is a parameter potentially used to specify the sign and size of $\beta_g^D$ through µ g . With ϵ 1 = ϵ 2 = log 2 (1.5), we identified 436 DE genes at SFDR = 0.05. When ϵ 1 = ϵ 2 = log 2 (1.2), this number increased to 2642. For comparison, we applied MAST ( Finak et al., 2015 ) to the data using the FindMarkers function in the R Seurat package ( Hao et al., 2024 ), which yielded 108 DE genes. Figure 8 left panel presents the overlaps in DE genes across these methods: BISHOT with larger ϵ = ( ϵ 1 , ϵ 2 ) ⊤ , identified 65 out of the 108 DE genes from MAST. For lower ϵ , BISHOT was able to identify almost all the DE genes from MAST. 
Notably, BISHOT identified 371 additional DE genes, within the intersection of genes identified using both thresholds, that were not identified by MAST. Download figure Open in new tab Figure 8: Venn diagrams comparing different methods in terms of identified DE genes (left panel) and enriched GO terms (right panel). To further investigate the biological interpretations of the DE genes, we performed Gene Ontology (GO) enrichment analysis based on the GSEA software ( Subramanian et al., 2005 ). As shown in Figure 8 right panel, there were 60 significant (FDR < 0.05) GO terms that were identified by BISHOT with both thresholds, but not by MAST. The Appendix B provides a list of these GO terms, many of which are related to lung aging. The endothelial barrier (GO:0061028) is crucial for lung function, regulating substance exchange between blood and tissue. Senescence of endothelial cells during aging compromises this barrier, increasing susceptibility to acute lung injury and acute respiratory distress syndrome ( Barabutis et al., 2016 ; Najari Beidokhti et al., 2025 ). Mitochondria, essential for energy, senescence, apoptosis, and regeneration of type-2 pneumocytes, show impaired function with age due to altered expression of mitochondrial genes and ribosomal subunits (GO:0140053, GO:0005763, GO:0005762) ( Cloonan et al., 2020 ). Cell morphogenesis (GO:0000902) disruptions contribute to lung aging, marked by alveolar enlargement and structural changes known as the “senile lung” ( Wang et al., 2024 ). Protein serine/threonine kinases (GO:0004674) influence age-related lung disease. For example, PINK1 regulates mitophagy, with its dysregulation linked to fibrosis and COPD ( Mizumura et al., 2014 ). Overexpression of MAP kinaseinteracting serine/threonine kinase 2 promotes lung cancer progression ( Guo et al., 2017 ), while protein kinase B (Akt) signaling protects myofibroblast from apoptosis, making it a therapeutic target for lung fibrosis ( Wang et al., 2022 ). 
7 Discussions As far as we know, no multiple testing methodology in the literature has been applied to $\beta_g \in \mathbb{R}^p$ when p > 1. We have seen a success for p = 2 in the single cell data application in Section 6 on non-normally distributed data. This conveys a positive signal of the flexibility of BISHOT regarding model and parameter choices and therefore is of interest on a separate note to the multiple testing regime. We consider a composite null hypothesis in this paper as desired by the practical problems we described. It is observed that the role of ϵ is connected with conservativeness in terms of the actual SFDR. More thorough investigation regarding the combined effect of ϵ , the true distribution of β g and the accuracy of h g (these factors are what we believe contribute to the degree of conservativeness of our method) is beyond the scope of this paper and is left as potentially a future work. BISHOT offers a general framework for Bayesian differential analysis that incorporates prior knowledge with relatively low computational cost and remarkable accuracy. In this paper, we illustrate the framework using normal linear and two-component models, though it is broadly applicable to other models. Extending BISHOT to other frequently used models, such as the negative binomial model ( Robinson et al., 2010 ; Wu et al., 2013 ), is a direction for future research. In this work, we limit ourselves to the specifications of the shrinkage parameters λ g , τ based on the original horseshoe. As we explained in Section 5, there exists a close proximity between the notion of sparsity (in nonzero parameters) and the consistency between h g and $\hat{\beta}_g$. Given the well-known drawbacks for the horseshoe prior distributions, we can extend the proposed framework to be combined with more recent developments regarding the horseshoe prior ( Bhadra et al., 2017 ), in order to tackle the various scenarios regarding the overall consistency between the two studies. 
Data availability The AML data ( Hohmann et al., 2016 ) used in Section 5 is available from GEO under accession number GSE79284. The lung aging data ( Angelidis et al., 2019 ) used in Section 6 is available from Zenodo at https://doi.org/10.5281/zenodo.5048449 , which is provided by Squair et al. (2021) . Supporting Information Appendix A referenced in Sections 2 and 4 and Appendix B, referenced in Sections 5 and 6 , is available with this paper at the journal website. Acknowledgements High Performance Computing resources provided by the High Performance Research Computing (HPRC) core facility at Virginia Commonwealth University ( https://hprc.vcu.edu ) were used for conducting the research reported in this work. This research was supported by the Biostatistics and Bioinformatics Shared Resource of the University of Kentucky Markey Cancer Center (P30CA177558). Footnotes clarkmz{at}vcu.edu chi.wang{at}uky.edu Introduction updated; Abstract updated; Figure 7 added; References ↵ Altenbuchinger , M. , Schwarzfischer , P. , Rehberg , T. , Reinders , J. , Kohler , C. W. , Gronwald , W. , Richter , J. , Szczepanowski , M. , Masqué-Soler , N. , Klapper , W. , et al. ( 2017 ). Molecular signatures that can be transferred across different omics platforms . Bioinformatics , 33 , i333 – i340 . OpenUrl PubMed ↵ Angelidis , I. , Simon , L. M. , Fernandez , I. E. , Strunz , M. , Mayr , C. H. , Greiffo , F. R. , Tsitsiridis , G. , Ansari , M. , Graf , E. , Strom , T.-M. , et al. ( 2019 ). An atlas of the aging lung mapped by single cell transcriptomics and deep tissue proteomics . Nature Communications , 10 , 963 . OpenUrl PubMed ↵ Barabutis , N. , Verin , A. , and Catravas , J. D. ( 2016 ). Regulation of pulmonary endothelial barrier function by kinases . American Journal of Physiology-Lung Cellular and Molecular Physiology , 311 , 832 – 845 . OpenUrl ↵ Benjamini , Y. and Hochberg , Y. ( 1995 ). 
Controlling the false discovery rate: a practical and powerful approach to multiple testing . Journal of the Royal Statistical Society: Series B (Methodological) , 57 , 289 – 300 . OpenUrl CrossRef PubMed Web of Science ↵ Benjamini , Y. and Yekutieli , D. ( 2001 ). The control of the false discovery rate in multiple testing under dependency . Annals of statistics , pages 1165 – 1188 . ↵ Berger , J. O. ( 2013 ). Statistical decision theory and Bayesian analysis . Springer Science & Business Media . ↵ Bhadra , A. , Datta , J. , Polson , N. G. , and Willard , B. ( 2017 ). The horseshoe+ estimator of ultra-sparse signals . Bayesian Analysis , 12 , 1105 – 1131 . OpenUrl ↵ Bhattacharya , A. , Chakraborty , A. , and Mallick , B. K. ( 2016 ). Fast sampling with gaussian scale mixture priors in high-dimensional regression . Biometrika , 103 , 985 – 991 . OpenUrl CrossRef PubMed ↵ Chen , J. , Xu , J. , Gou , L. , Zhu , Y. , Zhong , W. , Guo , H. , and Du , Y. ( 2024 ). Integrating transcriptomic and proteomic data for a comprehensive molecular perspective on the association between sarcopenia and osteoporosis . Archives of Gerontology and Geriatrics , 125 , 105486 . OpenUrl PubMed ↵ Cloonan , S. M. , Kim , K. , Esteves , P. , Trian , T. , and Barnes , P. J. ( 2020 ). Mitochondrial dysfunction in lung ageing and disease . European Respiratory Review , 29 , 200165 . OpenUrl Abstract / FREE Full Text ↵ Efron , B. ( 2008 ). Microarrays, empirical Bayes and the two-groups model . Statistical Science , 23 , 1 – 22 . OpenUrl CrossRef Web of Science ↵ Efron , B. ( 2012 ). Large-scale inference: empirical Bayes methods for estimation, testing, and prediction, volume 1 . Cambridge University Press . ↵ Finak , G. , McDavid , A. , Yajima , M. , Deng , J. , Gersuk , V. , Shalek , A. K. , Slichter , C. K. , Miller , H. W. , McElrath , M. J. , Prlic , M. , et al. ( 2015 ). 
MAST: a flexible statistical framework for assessing transcriptional changes and characterizing heterogeneity in singlecell RNA sequencing data . Genome Biology , 16 , 278 . OpenUrl CrossRef PubMed Fu , L. , Gang , B. , James , G. M. , and Sun , W. ( 2022 ). Heteroscedasticity-adjusted ranking and thresholding for large-scale multiple testing . Journal of the American Statistical Association , 117 , 1028 – 1040 . OpenUrl ↵ Genovese , C. and Wasserman , L. ( 2002 ). Operating characteristics and extensions of the false discovery rate procedure . Journal of the Royal Statistical Society: Series B (Statistical Methodology) , 64 , 499 – 517 . OpenUrl ↵ Genovese , C. and Wasserman , L. ( 2004 ). A stochastic process approach to false discovery control . Annals of Statistics , pages 1035 – 1061 . ↵ Gontarz , P. , Fu , S. , Xing , X. , Liu , S. , Miao , B. , Bazylianska , V. , Sharma , A. , Madden , P. , Cates , K. , Yoo , A. , et al. ( 2020 ). Comparison of differential accessibility analysis strategies for atac-seq data . Scientific reports , 10 , 10150 . OpenUrl PubMed ↵ Guo , Z. , Peng , G. , Li , E. , Xi , S. , Zhang , Y. , Li , Y. , Lin , X. , Li , G. , Wu , Q. , and He , J. ( 2017 ). Map kinase-interacting serine/threonine kinase 2 promotes proliferation, metastasis, and predicts poor prognosis in non-small cell lung cancer . Scientific Reports , 7 , 10612 . OpenUrl PubMed ↵ Hao , Y. , Stuart , T. , Kowalski , M. H. , Choudhary , S. , Hoffman , P. , Hartman , A. , Srivastava , A. , Molla , G. , Madad , S. , Fernandez-Granda , C. , et al. ( 2024 ). Dictionary learning for integrative, multimodal and scalable single-cell analysis . Nature Biotechnology , 42 , 293 – 304 . OpenUrl CrossRef PubMed ↵ Hohmann , A. F. , Martin , L. J. , Minder , J. L. , Roe , J.-S. , Shi , J. , Steurer , S. , Bader , G. , McConnell , D. , Pearson , M. , Gerstberger , T. , et al. ( 2016 ). Sensitivity and engineered resistance of myeloid leukemia cells to brd9 inhibition . 
Nature Chemical Biology , 12 , 672 – 679 . OpenUrl PubMed ↵ Huang , Z. , Lane , A. N. , Fan , T. W. , Higashi , R. M. , Weiss , H. L. , Yin , X. , and Wang , C. ( 2020 ). Differential abundance analysis with Bayes shrinkage estimation of variance (dasev) for zero-inflated proteomic and metabolomic data . Scientific Reports , 10 , 876 . OpenUrl PubMed ↵ Kendziorski , C. , Newton , M. , Lan , H. , and Gould , M. ( 2003 ). On parametric empirical Bayes methods for comparing multiple groups using replicated gene expression profiles . Statistics in Medicine , 22 , 3899 – 3914 . OpenUrl CrossRef PubMed Web of Science ↵ Law , C. W. , Chen , Y. , Shi , W. , and Smyth , G. K. ( 2014 ). voom: Precision weights unlock linear model analysis tools for rna-seq read counts . Genome Biology , 15 , 1 – 17 . OpenUrl CrossRef PubMed ↵ Li , B. , Li , Y. , and Qin , Z. S. ( 2017 ). Improving hierarchical models using historical data with applications in high-throughput genomics data analysis . Statistics in biosciences , 9 , 73 – 90 . OpenUrl PubMed ↵ Li , B. , Sun , Z. , He , Q. , Zhu , Y. , and Qin , Z. S. ( 2016 ). Bayesian inference with historical data-based informative priors improves detection of differentially expressed genes . Bioinformatics , 32 , 682 – 689 . OpenUrl CrossRef PubMed ↵ Li , C. , Lanman , N. A. , Kong , Y. , He , D. , Mao , F. , Farah , E. , Zhang , Y. , Liu , J. , Wang , C. , Wei , Q. , et al. ( 2020 ). Inhibition of the erythropoietin-producing receptor ephb4 antagonizes androgen receptor overexpression and reduces enzalutamide resistance . Journal of Biological Chemistry , 295 , 5470 – 5483 . OpenUrl Abstract / FREE Full Text ↵ Li , C. , Liu , J. , He , D. , Mao , F. , Rao , X. , Zhao , Y. , Lanman , N. A. , Kazemian , M. , Farah , E. , Liu , J. , et al. ( 2022 ). Gstm2 is a key molecular determinant of resistance to sg-aris . Oncogene , 41 , 4498 – 4511 . OpenUrl PubMed ↵ Li , Y. , Fan , T. W. , Lane , A. N. , Kang , W.-Y. , Arnold , S. M. , Stromberg , A. 
J. , Wang , C. , and Chen , L. ( 2019 ). Sda: A semi-parametric differential abundance analysis method for metabolomics and proteomics data . BMC bioinformatics , 20 , 1 – 10 . OpenUrl CrossRef PubMed ↵ Love , M. I. , Huber , W. , and Anders , S. ( 2014 ). Moderated estimation of fold change and dispersion for rna-seq data with deseq2 . Genome biology , 15 , 1 – 21 . OpenUrl CrossRef PubMed ↵ McCarthy , D. J. and Smyth , G. K. ( 2009 ). Testing significance relative to a fold-change threshold is a treat . Bioinformatics , 25 , 765 – 771 . OpenUrl CrossRef PubMed Web of Science ↵ Mizumura , K. , Cloonan , S. M. , Nakahira , K. , Bhashyam , A. R. , Cervo , M. , Kitada , T. , Glass , K. , Owen , C. A. , Mahmood , A. , Washko , G. R. , et al. ( 2014 ). Mitophagy-dependent necroptosis contributes to the pathogenesis of copd . The Journal of Clinical Investigation , 124 , 3987 – 4003 . OpenUrl CrossRef PubMed Web of Science ↵ Muralidharan , O. ( 2010 ). An empirical Bayes mixture method for effect size and false discovery rate estimation . The Annals of Applied Statistics , 4 , 422 – 438 . OpenUrl ↵ Najari Beidokhti , M. , Villalba , N. , Ma , Y. , Reynolds , A. , Villamil , J. H. , and Yuan , S. Y. ( 2025 ). Lung endothelial cell senescence impairs barrier function and promotes neutrophil adhesion and migration . GeroScience , pages 1 – 17 . ↵ Robinson , M. D. , McCarthy , D. J. , and Smyth , G. K. ( 2010 ). edger: a bioconductor package for differential expression analysis of digital gene expression data . Bioinformatics , 26 , 139 – 140 . OpenUrl CrossRef PubMed Web of Science ↵ Ross-Innes , C. S. , Stark , R. , Teschendorff , A. E. , Holmes , K. A. , Ali , H. R. , Dunning , M. J. , Brown , G. D. , Gojis , O. , Ellis , I. O. , Green , A. R. , et al. ( 2012 ). Differential oestrogen receptor binding is associated with clinical outcome in breast cancer . Nature , 481 , 389 – 393 . OpenUrl CrossRef PubMed Web of Science ↵ Smyth , G. K. ( 2005 ). 
Limma: linear models for microarray data . In Bioinformatics and computational biology solutions using R and Bioconductor , pages 397 – 420 . Springer . ↵ Squair , J. W. , Gautier , M. , Kathe , C. , Anderson , M. A. , James , N. D. , Hutson , T. H. , Hudelle , R. , Qaiser , T. , Matson , K. J. , Barraud , Q. , et al. ( 2021 ). Confronting false discoveries in single-cell differential expression . Nature Communications , 12 , 5692 . OpenUrl PubMed ↵ Stephens , M. ( 2017 ). False discovery rates: a new deal . Biostatistics , 18 , 275 – 294 . OpenUrl CrossRef PubMed ↵ Storey , J. D. ( 2003 ). The positive false discovery rate: a Bayesian interpretation and the q-value . The annals of statistics , 31 , 2013 – 2035 . OpenUrl ↵ Subramanian , A. , Tamayo , P. , Mootha , V. K. , Mukherjee , S. , Ebert , B. L. , Gillette , M. A. , Paulovich , A. , Pomeroy , S. L. , Golub , T. R. , Lander , E. S. , et al. ( 2005 ). Gene set enrichment analysis: a knowledge-based approach for interpreting genome-wide expression profiles . Proceedings of the National Academy of Sciences , 102 , 15545 – 15550 . OpenUrl Abstract / FREE Full Text ↵ Sun , W. and Cai , T. T. ( 2007 ). Oracle and adaptive compound decision rules for false discovery rate control . Journal of the American Statistical Association , 102 , 901 – 912 . OpenUrl CrossRef Web of Science ↵ Sun , W. and McLain , A. C. ( 2012 ). Multiple testing of composite null hypotheses in heteroscedastic models . Journal of the American Statistical Association , 107 , 673 – 687 . OpenUrl ↵ Surveillance Research Program, National Cancer Institute ( 2025 ). Seer*explorer: An interactive website for SEER cancer statistics . ↵ Vitorino , R. ( 2024 ). Transforming clinical research: the power of high-throughput omics integration . Proteomes , 12 , 25 . OpenUrl PubMed ↵ Wang , J. , Hu , K. , Cai , X. , Yang , B. , He , Q. , Wang , J. , and Weng , Q. ( 2022 ). Targeting pi3k/akt signaling for treatment of idiopathic pulmonary fibrosis . 
Acta Pharmaceutica Sinica B , 12 , 18 – 32 . OpenUrl PubMed Wang , K. Y. , Pupo , G. M. , Tembe , V. , Patrick , E. , Strbenac , D. , Schramm , S.-J. , Thompson , J. F. , Scolyer , R. A. , Muller , S. , Tarr , G. , et al. ( 2022 ). Cross-platform omics prediction procedure: a statistical machine learning framework for wider implementation of precision medicine . npj Digital Medicine , 5 , 85 . OpenUrl PubMed ↵ Wang , Y. , Huang , X. , Luo , G. , Xu , Y. , Deng , X. , Lin , Y. , Wang , Z. , Zhou , S. , Wang , S. , Chen , H. , et al. ( 2024 ). The aging lung: microenvironment, mechanisms, and diseases . Frontiers in Immunology , 15 , 1383503 . OpenUrl PubMed ↵ Wu , H. , Wang , C. , and Wu , Z. ( 2013 ). A new shrinkage estimator for dispersion improves differential expression detection in rna-seq data . Biostatistics , 14 , 232 – 243 . OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted November 29, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A Bayesian Informative Shrinkage Approach for Large-scale Multiple Hypothesis Testing (BISHOT): with Applications in Differential Analysis of Omics Data Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A Bayesian Informative Shrinkage Approach for Large-scale Multiple Hypothesis Testing (BISHOT): with Applications in Differential Analysis of Omics Data Ya Su , Mary Eunice Joy Z. 
Clark , Chi Wang bioRxiv 2025.09.11.675690; doi: https://doi.org/10.1101/2025.09.11.675690 Share This Article: Copy Citation Tools A Bayesian Informative Shrinkage Approach for Large-scale Multiple Hypothesis Testing (BISHOT): with Applications in Differential Analysis of Omics Data Ya Su , Mary Eunice Joy Z. Clark , Chi Wang bioRxiv 2025.09.11.675690; doi: https://doi.org/10.1101/2025.09.11.675690 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetics Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17633) Bioengineering (13856) Bioinformatics (41841) Biophysics (21399) Cancer Biology (18529) Cell Biology (25422) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24282) Genetics (15582) Genomics (22462) Immunology (17700) Microbiology (40295) Molecular Biology (17140) Neuroscience (88419) Paleontology (666) Pathology (2823) Pharmacology and Toxicology (4813) Physiology (7632) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.