Penalized Semiparametric Estimation for Causal Inference with Possibly Invalid Instruments

doi:10.1101/2024.01.19.24301518

Penalized Semiparametric Estimation for Causal Inference with Possibly Invalid Instruments

2024 · doi:10.1101/2024.01.19.24301518

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 47,539 characters · extracted from preprint-html · click to expand

Penalized Semiparametric Estimation for Causal Inference with Possibly Invalid Instruments | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Penalized Semiparametric Estimation for Causal Inference with Possibly Invalid Instruments Yunlong Cao , Yuquan Wang , Dapeng Shi , Dong Chen , Yue-Qing Hu doi: https://doi.org/10.1101/2024.01.19.24301518 Yunlong Cao 1 Institute of Biostatistics, School of Life Sciences, Fudan University , Shanghai, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yuquan Wang 1 Institute of Biostatistics, School of Life Sciences, Fudan University , 
Shanghai, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dapeng Shi 2 Shanghai Center for Mathematical Sciences, Fudan University , Shanghai, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dong Chen 1 Institute of Biostatistics, School of Life Sciences, Fudan University , Shanghai, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yue-Qing Hu 1 Institute of Biostatistics, School of Life Sciences, Fudan University , Shanghai, China 2 Shanghai Center for Mathematical Sciences, Fudan University , Shanghai, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: yuehu{at}fudan.edu.cn Abstract Full Text Info/History Metrics Data/Code Preview PDF S ummary Inferring causal effects with unmeasured confounder is a main challenge in causal inference. Many researchers impose parametric assumptions on the distribution of unmeasured confounder. However, due to the unobservable nature of the unmeasured confounder, it is more reasonable to leave its distribution unrestricted. Another key challenge in causal inference is the involvement of invalid instrumental variables, which may lead to biased inference and possibly misleading scientific conclusions. To this end, we employ a flexible semiparametric model that allows for possibly invalid instruments without specifying the distribution of unmeasured confounder in this work. A penalized semiparametric estimator for causal effects is constructed and its oracle and asymptotic properties are well established for statistical inference. We evaluate the performance of the estimator through simulation studies, revealing that our proposed estimator exhibits asymptotic unbiasedness and robustness in estimating causal effects, along with consistent selection of invalid instruments. 
We also demonstrate its application using the Atherosclerosis Risk in Communities Study data set, which further validates its robustness in the presence of invalid instruments. Additionally, we have implemented the proposed method in R, and the corresponding R code is available for free download. 1. Introduction Causal inference is vital for elucidating cause-and-effect relationships. For observational data, it is rather difficult to make inference due to the existence of unmeasured confounders. Instrumental variables (IV) stand out as a widely used technique for detecting causality and estimating the causal effect of an exposure on an outcome in this situation. A valid IV, which is suitable for estimating causal effects, must adhere to three fundamental assumptions (Angrist, 1996; Sargan, 1958 ), namely: (1) Relevance: the IV is related to the exposure; (2) Exchangeability: the IV is independent of unmeasured confounders; (3) Exclusion Restriction: the IV has no direct effect on the outcome. The relevance of instruments can be scrutinized through observed data of exposure and instruments. However, checking assumptions (2) and (3) in a data-dependent manner necessitates substantial domain expertise to discern valid IVs. In certain instances, causal effects can be deduced even with the presence of invalid IVs. In the context of a linear outcome model, Kolesár et al. (2015) and Bowden et al. (2015) provided solutions wherein all candidate IVs may be invalid, but the strength of the IV and its direct effect on the outcome are nearly orthogonal. Kang et al. (2016) and Windmeijer et al. (2018) put forth consistent estimators for causal effects, assuming the majority rule that at least 50% of the IVs are valid. Li and Guo (2020) extended the majority rule to nonlinear outcome models, presenting the three-step inference procedure SpotIV for estimating the conditional average treatment effect (CATE). Semiparametric methodologies are extensively employed in causal inference. Sun et al. 
(2023) introduced a class of g-estimators guaranteed to maintain consistency and asymptotic normality in estimating the causal effect of interest, even in the presence of invalid instrumental variables. Zhang and Tchetgen Tchetgen (2022) proposed a robust estimator reaching the efficiency bound for the semiparametric model, without imposing parametric assumptions on the unmeasured confounder. They established the consistency and asymptotic normality of the estimator under appropriate identification and regularity conditions. However, a general identification condition for the semiparametric model is not explicitly stated. Considering the assumption of majority rule in the semiparametric model setting, the penalized semiparametric estimating approaches were developed to estimate causal effects. Diverging from various penalties imposed on the loss function in the conventional parametric models, the semiparametric approach to estimating causal effects does not involve minimizing any objective function. Fu (2003) proposed penalizing the estimating function, instead of the loss function, for generalized linear models with a bridge penalty ( Frank and Friedman, 1993 ; Fu and Knight, 2000 ). Subsequently, Johnson et al. (2008) presented comprehensive asymptotic properties of estimators derived from a broad class of penalized estimating functions. Given the semiparametric model setting and the majority rule, we explore the penalized semiparametric estimating method to simultaneously estimate causal effects and select invalid instrumental variables in this work. The article is organized as follows. Section Methods serves to introduce our model setting and discuss the identifiability of the model. Subsequently, we present the semiparametric estimating equations (SEE) for the model and introduce the penalized semiparametric estimating equations (PSEE). Section Implementation and Results establishes the algorithm and asymptotic theory for PSEE. 
In Section Simulation Study, numerical results from simulation studies are presented. Moving to Section Real Data Analysis, we apply the PSEE method to the Atherosclerosis Risk in Communities Study (ARIC) dataset. Section Discussions is dedicated to providing some discussions. 2. Methods 2.1. Semiparametric model setting Consider the causal effect of an exposure D ∈ ℝ on an outcome Y ∈ ℝ. Let Z ∈ ℝ q denote the q -dimensional vector of instrumental variables for inferring the causality, and let U ∈ ℝ be a scalar unmeasured confounder. We consider the following outcome model, $$\mathbb{E}(Y \mid D, Z, U) = g_1\left(\beta D + \alpha^{T} Z + c_1 U\right). \quad (1)$$ For the exposure D , we consider $$\mathbb{E}(D \mid Z, U) = g_2\left(\gamma^{T} Z + c_2 U\right), \quad (2)$$ where g 1 , g 2 are link functions, β ∈ ℝ represents the causal parameter of interest, α ∈ ℝ q and γ ∈ ℝ q represent the direct effect of the instruments on the outcome and exposure, respectively, c 1 , c 2 are fixed sensitivity parameters used to adjust the influence of the confounder on the outcome and exposure, respectively. Let θ = ( γ T , β, α T ) T ∈ ℝ p denote the finite dimensional parameters with p = 2 q + 1. In many applications ( Harbord et al., 2013 ), the distribution of U is often assumed to follow a parametric distribution, such as the normal distribution in Shi et al. (2023) . Since U is unobservable, it may be more appropriate to refrain from imposing any parametric assumptions on its distribution. Therefore we are exploring a semiparametric model in which both the outcome model and exposure model are accurately specified, as denoted by Equations (1) and (2) , respectively. The joint distribution of ( U , Z ) in this model remains unrestricted. Note that the model (1)-(2) is highly versatile, encompassing linear and nonlinear outcome as well as exposure as special cases. For example, when g 1 is the identity function, it includes continuous outcome as $$\mathbb{E}(Y \mid D, Z, U) = \beta D + \alpha^{T} Z + c_1 U; \quad (3)$$ when g 1 is the standard logistic function, it includes binary outcome as $$P(Y = 1 \mid D, Z, U) = \frac{\exp\left(\beta D + \alpha^{T} Z + c_1 U\right)}{1 + \exp\left(\beta D + \alpha^{T} Z + c_1 U\right)}.$$ The exposure model (2) is similar; it includes continuous and binary exposure depending on g 2 . 2.2. 
Identifiability of Model The presence of direct effects of instruments on the outcome poses a significant challenge for instrumental variable inference. Previous studies have delved into the identifiability conditions within some models, as discussed by Kang et al. (2016) and Li and Guo (2020) . This section provides an overview of these works, laying the foundation for the subsequent discussion of our estimating method for the model parameters. Let s = ∥ α ∥ 0 denote the number of invalid instruments, i.e. the number of nonzero components of α . For a continuous outcome, Kang et al. (2016) proved the identifiability of the model under the condition of s < q/ 2, known as the majority rule . Additionally, they provided a method sisVIVE for estimating the causal effect, which utilizes l 1 penalization on α . Li and Guo (2020) studied the nonlinear causal inference and proposed a method SpotIV to estimate the CATE. Therefore in this article, we assume that majority rule holds and our model (1)-(2) is identifiable under majority rule. In particular, when outcome Y is continuous and Equation (3) holds, and 𝔼( U | Z ) = 0, then our model is actually identifiable under majority rule according to Kang et al. (2016) ; similarly when exposure D is continuous and 𝔼( U | Z ) = 0, our model is identifiable according to Li and Guo (2020) . 2.3. Semiparametric Estimating Equation In standard semiparametric theory, we only consider estimators that are regular and asymptotically linear (RAL) ( Newey, 1990 ; Bickel et al., 1993 ; Van der Vaart, 2000 ; Tsiatis, 2006 ). Note that the full data ℱ = {ℱ i = ( Y i , D i , Z i , U i ), i = 1, 2 …, n} , the observed data only consist of 𝒪 = {O i = ( Y i , D i , Z i ), i = 1, 2, …, n} as U is not observed. 
An asymptotically linear estimator of model parameter θ based on the full data satisfies where the measurable random function φ ( F i ) is referred to as the i -th influence function of the estimator and satisfies E{φ ( F ) } = 0, E ( φφ T ) is finite and nonsingular. Regularity conditions are imposed to exclude super-efficient estimators, which are unnatural and have undesirable local properties. Any RAL estimator is asymptotically normally distributed; i.e., We hope to find the efficient influence function φ eff ( F ), which is the influence function with the smallest variance matrix in the sense that for any influence function φ ( F ) ≠ φ eff ( F ), var{ φ eff (F)} − var{ φ (F)} is negative definite. By standard semiparametric theory, the efficient influence function based on the full data, φ eff ( F ), is proportional to the full data efficient score S eff ( Y, D , Z , U ), which can be obtained by projecting the full data score S θ ( Y, D , Z , U ) onto the orthogonal component of the full data nuisance tangent space Λ F , which is given by , where Accordingly, for observed data efficient score S eff ( Y, D , Z ), we need to calculate corresponding observed data score S θ ( Y, D , Z ) = 𝔼[ S θ ( Y, D , Z , U ) | Y, D , Z ] and the observed data nuisance tangent space Λ = 𝔼[Λ F | Y, D , Z ]. Zhang and Tchetgen Tchetgen(2022) derived the specific form of the observed data efficient score for the considered model and introduced a working model f * ( U | Z ; ξ ) instead of the unknown f ( U | Z ) to proceed calculation: where a ( U , Z ) satisfies the integral equation: Note that the true data distribution 𝒫 F can be factored as and the misspecified data distribution with working model is 𝔼[ · ] denotes expectation taken with respect to 𝒫 F and 𝔼 * [ · ] taken with respect to . The efficient score has an important property: Equation (4) yields an estimator for θ that exhibits appealing robustness and efficiency properties. 
This is achieved by substituting the expectation with its empirical analogue and formulating the following estimating equation: Under suitable identification and regularity conditions, the solution to the estimating equation (5) is consistent and asymptotically normal, with variance given by If the conditional distribution f ( U | Z ) is correctly specified, then $\hat{\theta}$ is locally efficient with asymptotic variance . 2.4. Penalized Semiparametric Estimating Equation Given estimating equation (5) and the assumed identification condition, the majority rule, we consider the penalized semiparametric estimating equation for simultaneous estimation and invalid instrumental variable selection. Specifically, the penalized semiparametric estimating functions are defined as where and the second term is the componentwise product of q λ and sgn( θ ). To select invalid instrumental variables, we design q λ (| θ |) as follows: (i) for j = 1, 2, …, q + 1, set q λ,j (| θ j |) = 0; (ii) for j = q + 2, …, p , set q λ,j (| θ j |) as the SCAD penalty: $$q_{\lambda,j}(|\theta_j|) = \lambda \left\{ I(|\theta_j| \le \lambda) + \frac{(a\lambda - |\theta_j|)_{+}}{(a-1)\lambda}\, I(|\theta_j| > \lambda) \right\},$$ with a > 2. Here we adopt the nonconvex SCAD penalty proposed by Fan and Li (2001) , which results in an estimator with the oracle property: that is, the estimator has the same limiting distribution as an estimator that knows the true model a priori. Note that we only penalize the last q components of S ( θ ), which correspond to the parameter α ; therefore γ and β will not be shrunken. Intuitively, q λ (| α j |) is zero for a large value of | α j |, while it increases significantly for a small value of | α j |. Consequently, the jth component of the semiparametric estimating function S ( θ ), denoted as S j ( θ ), is not penalized when | α j | is large. Conversely, S j ( θ ) is heavily penalized if | α j | is close (but not equal) to zero, compelling the estimator of α j to shrink to zero. When α j is shrunk to zero, it implies that the jth instrumental variable is deemed valid and is consequently excluded from the outcome model. 3. 
Implementation and Results 3.1. Algorithm To solve the penalized semiparametric estimating equation, we employ an iterative algorithm similar to that utilized in Johnson et al. (2008) , Wang et al. (2012 , 2013 ). This algorithm combines the minorization-maximization algorithm for the nonconvex penalty introduced by Hunter and Li (2005) with the Newton-Raphson algorithm: Where the constant ϵ represents a small perturbation, set to 10 − 6 in our simulation studies. We initialize the algorithm with the adaptive lasso estimator . For a chosen tuning parameter, the algorithm iterates until the convergence criterion is satisfied. Typically, this criterion is met within 20 iterations in our simulation studies. Additionally, any coefficient that becomes sufficiently small is constrained to zero; specifically, if upon convergence, then the estimator for this coefficient is set to exactly zero. We need to select ( a, λ ) for the SCAD penalty. Fan and Li (2001 , 2002 ) demonstrated that the choice a ≡ 3.7 performs well across various scenarios, and we adopt this recommendation for our numerical analyses. In practice, cross-validation is a widely used data-driven method for choosing λ . In the same vein, we employ k -fold cross-validation, minimizing the L 2 norm of the estimating equation S ( θ ). This approach aligns with the fact that the parameter of interest is θ , which sets the expected value of the estimating equation to zero (see Equation (4) ). We derive the following sandwich formula from the algorithm to estimate the asymptotic covariance matrix of : where 3.2. Asymptotic Theory for Penalized Semiparametric Estimator Let denote the true value of θ , where and . Without loss of generality, suppose that for j ⩽ q + 1 + s and for j > q + 1 + s . For the asymptotic theory, we require the following regularity conditions. 
exists and is continuous in an open neighborhood of β 0 ; converges uniformly to its limit in a neighborhood of θ 0 ; is invertible; λ n → 0 and Remark 1 Conditions (a)-(c) are imposed by Zhang and Tchetgen Tchetgen (2022) to ensure the consistency and asymptotic normality of the estimator derived from the semiparametric estimating equation S ( θ ) = 0. Condition (d) represents a standard requirement concerning the rate of the tuning parameter to attain the oracle property ( Fan and Li, 2001 ). Theorem 1 Assuming conditions (a)-(d), the following results hold: There exists a root-n-consistent approximate solution of , in the sense that . (Oracle Property). For any root-n-consistent approximate solution , we have that for j > q + 1 + s . Furthermore, if , then has the asymptotic normality where A 11 , V 11 are the first ( q + 1 + s ) × ( q + 1 + s ) submatrices of and V (as defined by Equation (6) ), , and Let denote the first ( q + 1 + s )-components of S P ( θ ), then there exists such that that is, the solution is exact. 4. Simulation Study We consider a binary response Y and continuous exposure D in this section. Assume there are n = 1000 individuals and q = 10 candidate instruments. The observations ( Y i , D i , Z i ), i = 1, …, n are generated by where Z i· is drawn from a multivariate normal with zero mean and identity covariance matrix. We set β = 2, the fixed effect γ are drawn from N (0, 1). We vary (i) the direct effect parameter α = (1, 1, … , 0, 0) where we change s in ∥ α ∥ 0 = s , (ii) the distribution of unmeasured confounder U to test the robustness of our PSEE estimator. Under each simulation scenario, we conduct 1000 replications. Our evaluation involves comparing the proposed PSEE method for estimating β with the original SEE method developed by Zhang and Tchetgen Tchetgen (2022) . 
Additionally, we compute estimates from the “naive” Two-Stage Least Squares (TSLS) method under the assumption that all instruments are valid, and the “oracle” TSLS method, assuming perfect knowledge of which instruments are valid. Our focus is on the estimation accuracy and invalid instrument selection properties of these methods, assessed through bias and root mean square error (RMSE), along with the average number of correct (C) and incorrect (I) zero estimates, respectively. Additionally, we calculate the sample standard deviation of $\hat{\beta}$ and the mean of the estimated standard deviation using the sandwich variance, denoted as SD 1 and SD 2 , respectively. We also employ the sandwich variance formula to construct approximate 95% confidence intervals, relying on asymptotic normality theory, and report the corresponding empirical coverage probabilities. Table 1 summarizes the performance of the naive TSLS, oracle TSLS, PSEE, and SEE for different numbers of invalid instruments s . The true distribution of unmeasured confounder U is Bernoulli(0.2) and c 1 = c 2 = 1. The PSEE (correct) and SEE (correct) denote estimators with correctly specified working model U ∼ Bernoulli(0.2); the PSEE (incorrect) and SEE (incorrect) denote estimators with incorrectly specified working model U ∼ Bernoulli(0.5). We observe that when majority rule holds, PSEE performs close to oracle TSLS in terms of instrument selection properties, and the estimated standard deviation closely approximates the empirical standard deviation, and the empirical coverage probability closely approaches 95%. These numerical results suggest the effective performance of the sandwich variance formula. Additionally, it is evident that PSEE outperforms naive TSLS and SEE in bias and RMSE, even when the majority rule is violated. We also observe that PSEE performs well with an incorrectly specified working model, which is consistent with the robustness property implied by Equation (4) . 
View this table: View inline View popup Download powerpoint Table 1: Estimation Results for β ( n = 1000, q = 10, U ∼ Bernoulli(0.2), c 1 = c 2 = 1): comparison of naive TSLS, oracle TSLS, PSEE (correct), SEE (correct), PSEE (incorrect) and SEE (incorrect). Here “correct” denotes estimator with correctly specified working model, i.e., f * ( U | Z ; ξ ) = Bernoulli(0.2); “incorrect” denotes estimator with incorrectly specified working model, i.e., f * ( U | Z ; ξ ) = Bernoulli(0.5). SD 1 and SD 2 denote the sample standard deviation and the mean of the estimated standard deviation using the sandwich variance. C and I denote the average number of correct and incorrect zero estimates, respectively. True β equals 2.0. Table 2 summarizes the performance when the unmeasured confounder is continuous, U ∼ N (0, 1), and c 1 = c 2 = 0.25. Our working model for U is a discrete uniform distribution on the interval [ − 0.5, 0.5] with mesh size h . For computational efficiency, we take h = 0.5. We observe that when majority rule holds, PSEE performs close to oracle TSLS in terms of bias, RMSE and instrument selection properties, and coverage approaches 95%. When majority rule does not hold, PSEE also has good performance in bias and RMSE. Table 3 presents the results for an alternative continuous unmeasured confounder setting, U ∼ t (3), with c 1 = c 2 = 0.25. The working model is consistent with that of Table 2 . It is concluded from Table 3 again that PSEE exhibits robust performance across various evaluation metrics in this scenario as well. View this table: View inline View popup Download powerpoint Table 2: Estimation Results for β ( n = 1000, q = 10, U∼ N (0, 1), c 1 = c 2 = 0.25): comparison of naive TSLS, oracle TSLS, PSEE, SEE. Working model for U is a discrete uniform distribution on the interval [ − 0.5, 0.5] with mesh size h , here we take h = 0.5. 
SD 1 and SD 2 denote the sample standard deviation and the mean of the estimated standard deviation using the sandwich variance. C and I denote the average number of correct and incorrect zero estimates, respectively. True β equals 2.0. View this table: View inline View popup Download powerpoint Table 3: Estimation Results for β ( n = 1000, q = 10, U∼ t (3), c 1 = c 2 = 0.25): comparison of naive TSLS, oracle TSLS, PSEE, SEE. Working model for U is a discrete uniform distribution on the interval [ − 0.5, 0.5] with mesh size h , here we take h = 0.5. SD 1 and SD 2 denote the sample standard deviation and the mean of the estimated standard deviation using the sandwich variance. C and I denote the average number of correct and incorrect zero estimates, respectively. True β equals 2.0. 5. Real Data Analysis We demonstrate the potential advantages of our method in Mendelian randomization (MR) by analyzing the effect of BMI on suffering stroke. For this analysis, we leverage data from the Atherosclerosis Risk in Communities Study (ARIC), which is a prospective longitudinal epidemiological study conducted in four U.S. communities in North Carolina, Massachusetts, Maryland, and Minnesota. Similar to another analysis with the ARIC data, we include individuals of white origin and extract European ancestry individuals and impute the data set on Michigan Imputation Center with EUR population from 1000 Genomes Phase 3 v5 reference panel ( Shi et al., 2023 ). We remove 0.82% missing data in the following analysis. Finally, 8739 individuals are selected. We consider potential candidate instruments for our MR analysis using the following SNPs in the ARIC data that have been previously associated with BMI: rs725959, rs1147199, rs3817334, and rs6477694. While we have no specific reason to believe any of these SNPs are invalid IVs, uncertainty arises due to incomplete knowledge about their biological functions. 
Additionally, the inability to control all confounders precisely is a common scenario in MR studies. Under the assumption that all instruments are valid, the TSLS method estimates a causal effect of 0.0805 (OR = 1.0838). In contrast, PSEE (with working model U ∼ Bernoulli(0.5)) estimates a causal effect of 0.1107 (SE: 0.0265, OR = 1.1171) with a 95% confidence interval [0.0588, 0.1626], excluding 0. The difference between TSLS and PSEE may stem from the underlying distribution of unmeasured confounder, as demonstrated in our simulations. Importantly, PSEE does not identify any SNPs as invalid IVs. To further validate our method, we introduce another instrument, rs42039, associated with both BMI and stroke. Under the assumption that all five instruments are valid, TSLS estimates an effect of -0.0331 (OR = 0.9674). Conversely, PSEE (with working model U ∼ Bernoulli(0.5)) estimates a causal effect of 0.1108 (SE: 0.0265, OR = 1.1171), similar to the estimates when using four instruments. PSEE also excludes rs42039, suspected to be invalid. In the real data analysis, PSEE provides similar estimates and consistently excludes the suspected invalid instrument (rs42039) when an additional instrument is introduced; this indicates its robustness to possibly invalid instruments compared to TSLS. Furthermore, Harshfield et al. (2021) found that heightened obesity will increase the risk of ischemic, large artery, and small vessel stroke, which supports our PSEE results. Therefore, it is recommended to focus on interventions that reduce obesity to mitigate the risk of stroke. 6. Discussions In this paper, we consider a flexible semiparametric instrumental variable model accommodating continuous/binary exposure/outcome. We assume identifiability of this model under majority rule and propose a penalized semiparametric approach to estimate causal effect. The asymptotic and oracle properties are outlined in Theorem 1 and further illustrated in comprehensive simulation studies. 
Specifically, our proposed method, PSEE, demonstrates performance close to that of the oracle TSLS regarding bias, RMSE, and instrument selection properties for both binary and continuous unmeasured confounder scenarios when majority rule holds. Even when majority rule does not hold, PSEE performs well in terms of bias and RMSE. Additionally, the robustness of our estimator is evident in both simulation experiments and real data analysis. We emphasize that PSEE does not impose specific requirements on whether the outcome and exposure are continuous or binary; it only requires the ability to specify the conditional probability distributions of the outcome and exposure. However, in our simulations, we specifically consider scenarios with binary outcome and continuous exposure. Nevertheless, it is worth noting that PSEE exhibits a lower coverage for increased values of sensitivity parameters c 1 and c 2 , as illustrated in the Appendix C. Further work could involve extending the considered model to the presence of additional complexities, such as nonlinear ( Staley and Burgess, 2017 ) or time-varying exposure ( Labrecque and Swanson, 2019 ), and vector-valued confounder. One could also consider different penalty functions, such as MCP ( Zhang, 2010 ) and ALASSO ( Zou, 2006 ) penalty, to compare their estimation accuracy and instrument selection properties. Moreover, the computational challenges posed by a considerable number of instrumental variables need to be tackled in future research. In our simulation experiments, involving a sample size of 1000 and 10 instrumental variables, the task of conducting 1000 replications on an 80-node cluster required an average of 30 hours. This highlights the necessity of developing efficient algorithms or optimization techniques, which would significantly enhance the scalability and practical applicability of our proposed method. 
Data Availability All data produced are available online at https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000090.v1.p1 https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000090.v1.p1 https://www.ebi.ac.uk/gwas/ Acknowledgements We express our gratitude to ARIC for generously providing the data. This research was supported partially by the National Key R&D Program of China [2023YFF1205101]. Appendix Appendix A: Proof of Theorem 1 Proof: To prove part a , we consider , where . For j = 1, 2, …, p , we have where A j is the j th row of A . To prove part b , we consider the sets in the probability . We show that for any ε > 0, when n is sufficiently large, P ( C j ) < ε . Because , there exists some M such that when n is large enough, Using the jth component of the penalized estimating function and the definition of the approximate solution, we obtain that on the set of , The first three terms on the right side are of order O p (1). As a result, there exists some M ′ such that for large n , Because by condition and imply that for large n . Thus . Therefore, We next show the asymptotic normality of when the order of is o p (1). We have Employing Taylor expansions of at θ 10 yields Where And we then obtain that where V 11 is the ( q + 1 + s ) × ( q + 1 + s ) submatrix in the upper-left corner of V . This completes the proof. To establish part c, we examine θ 1 ∈ ℝ q +1+ s situated on the boundary of a ball centered around θ 10 , defined as θ 1 = θ 10 + n − 1 / 2 u with | u | = r for a constant r . Leveraging the penalized estimating function , we derive the following expression: where lies between θ j and θ 0 j for j = 1, …, s . As A 11 is nonsingular, the second term on the right side exceeds a 0 r 2 n − 1 / 2 , where a 0 denotes the smallest eigenvalue of . The first term is of order rO p ( n − 1 / 2 ). Due to the convergence of to 0, the third term is dominated by the second term. 
Therefore, by selecting r adequately large such that, for large n , the probability that the absolute value of the first term surpasses the second term is less than ϵ , we obtain Applying the Brouwer fixed-point theorem to the continuous function , we see that implies that has a solution within this ball. In other words, has a solution within this ball, or equivalently, has a solution within this ball. Thus, we can select an exact solution to with . Appendix B: Boxplots and histograms for simulation studies U ∼ Bernoulli(0.2) . Download figure Open in new tab Figure A1: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ Bernoulli (0.2), c 1 = c 2 = 1, 1 invalid IV, 1000 replications. Download figure Open in new tab Figure A2: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ Bernoulli (0.2), c 1 = c 2 = 1, 3 invalid IVs, 1000 replications. Download figure Open in new tab Figure A3: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ Bernoulli(0.2), c 1 = c 2 = 1, 5 invalid IVs, 1000 replications. Download figure Open in new tab Figure A4: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ Bernoulli(0.2), c 1 = c 2 = 1, 7 invalid IVs, 1000 replications. U ∼ N (0, 1). Download figure Open in new tab Figure A5: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.25, 1 invalid IV, 1000 replications. Download figure Open in new tab Figure A6: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.25, 3 invalid IVs, 1000 replications. Download figure Open in new tab Figure A7: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.25, 5 invalid IVs, 1000 replications. 
Download figure Open in new tab Figure A8: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.25, 7 invalid IVs, 1000 replications. U ∼ t (3). Download figure Open in new tab Figure A9: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.25, 1 invalid IV, 1000 replications. Download figure Open in new tab Figure A10: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.25, 3 invalid IVs, 1000 replications. Download figure Open in new tab Figure A11: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.25, 5 invalid IVs, 1000 replications. Download figure Open in new tab Figure A12: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.25, 7 invalid IVs, 1000 replications. Appendix C: Simulation results for increased values of sensitivity parameters U ∼ N (0, 1) and c 1 = c 2 = 0.5. View this table: View inline View popup Download powerpoint Table A1: Estimation Results for β ( n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.5): comparison of naive TSLS, oracle TSLS, PSEE, SEE. Working model for U is a discrete uniform distribution on the interval [ − 0.5, 0.5] with mesh size h , here we take h = 0.5. SD 1 and SD 2 denote the sample standard deviation and the mean of the estimated standard deviation using the sandwich variance. C and I denote the average number of correct and incorrect zero estimates, respectively. True β equals 2.0. Download figure Open in new tab Figure A13: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.5, 1 invalid IV, 1000 replications. Download figure Open in new tab Figure A14: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.5, 3 invalid IVs, 1000 replications. 
Download figure Open in new tab Figure A15: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.5, 5 invalid IVs, 1000 replications. Download figure Open in new tab Figure A16: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ N (0, 1), c 1 = c 2 = 0.5, 7 invalid IVs, 1000 replications. U ∼ t (3) and c 1 = c 2 = 0.5. View this table: View inline View popup Download powerpoint Table A2: Estimation Results for β ( n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.5): comparison of naive TSLS, oracle TSLS, PSEE, SEE. Working model for U is a discrete uniform distribution on the interval [ − 0.5, 0.5] with mesh size h , here we take h = 0.5. SD 1 and SD 2 denote the sample standard deviation and the mean of the estimated standard deviation using the sandwich variance. C and I denote the average number of correct and incorrect zero estimates, respectively. True β equals 2.0. Download figure Open in new tab Figure A17: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.5, 1 invalid IV, 1000 replications. Download figure Open in new tab Figure A18: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.5, 3 invalid IVs, 1000 replications. Download figure Open in new tab Figure A19: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.5, 5 invalid IVs, 1000 replications. Download figure Open in new tab Figure A20: Boxplot and histogram of estimates of the causal effect β, n = 1000, q = 10, U ∼ t (3), c 1 = c 2 = 0.5, 7 invalid IVs, 1000 replications. References Angrist , J. D. , Imbens , G. W. , and Rubin , D. B. ( 1996 ). Identification of causal effects using instrumental variables . Journal of the American Statistical Association 91 , 444 – 455 . OpenUrl CrossRef Web of Science ↵ Bickel , P. J. , Klaassen , C. A. , Ritov , Y. , and Wellner , J. A. 
( 1993 ). Efficient and Adaptive Estimation for Semiparametric Models . Johns Hopkins University Press, Baltimore . ↵ Bowden , J. , Davey Smith , G. , and Burgess , S. ( 2015 ). Mendelian randomization with invalid instruments: effect estimation and bias detection through egger regression . International Journal of Epidemiology 44 , 512 – 525 . OpenUrl CrossRef PubMed ↵ Fan , J. and Li , R. ( 2001 ). Variable selection via nonconcave penalized likelihood and its oracle properties . Journal of the American Statistical Association 96 , 1348 – 1360 . OpenUrl CrossRef Web of Science ↵ Fan , J. and Li , R. ( 2002 ). Variable selection for Cox’s proportional hazards model and frailty model . The Annals of Statistics 30 , 74 – 99 . OpenUrl ↵ Frank , L. E. and Friedman , J. H. ( 1993 ). A statistical view of some chemometrics regression tools . Technometrics 35 , 109 – 135 . OpenUrl CrossRef Web of Science ↵ Fu , W. and Knight , K. ( 2000 ). Asymptotics for lasso-type estimators . The Annals of Statistics 28 , 1356 – 1378 . OpenUrl ↵ Fu , W. J. ( 2003 ). Penalized estimating equations . Biometrics 59 , 126 – 132 . OpenUrl PubMed ↵ Harbord , R. M. , Didelez , V. , Palmer , T. M. , Meng , S. , Sterne , J. A. , and Sheehan , N. A. ( 2013 ). Severity of bias of a simple estimator of the causal odds ratio in Mendelian randomization studies . Statistics in Medicine 32 , 1246 – 1258 . OpenUrl CrossRef PubMed ↵ Harshfield , E. L. , Georgakis , M. K. , Malik , R. , Dichgans , M. , and Markus , H. S. ( 2021 ). Modifiable lifestyle factors and risk of stroke: A Mendelian randomization analysis . Stroke 52 , 931 – 936 . OpenUrl ↵ Hunter , D. and Li , R. ( 2005 ). Variable selection using MM algorithms . The Annals of Statistics 33 , 1617 – 1642 . OpenUrl ↵ Johnson , B. A. , Lin , D. , and Zeng , D. ( 2008 ). Penalized estimating functions and variable selection in semiparametric regression models . Journal of the American Statistical Association 103 , 672 – 680 . 
OpenUrl CrossRef PubMed ↵ Kang , H. , Zhang , A. , Cai , T. T. , and Small , D. S. ( 2016 ). Instrumental variables estimation with some invalid instruments and its application to Mendelian randomization . Journal of the American Statistical Association 111 , 132 – 144 . OpenUrl CrossRef ↵ Kolesár , M. , Chetty , R. , Friedman , J. , Glaeser , E. , and Imbens , G. W. ( 2015 ). Identification and inference with many invalid instruments . Journal of Business and Economic Statistics 33 , 474 – 484 . OpenUrl ↵ Labrecque , J. A. and Swanson , S. A. ( 2019 ). Interpretation and potential biases of Mendelian randomization estimates with time-varying exposures . American Journal of Epidemiology 188 , 231 – 238 . OpenUrl CrossRef PubMed ↵ Li , S. and Guo , Z. ( 2020 ). Causal inference for nonlinear outcome models with possibly invalid instrumental variables . arXiv preprint arXiv:2010.09922 . ↵ Newey , W. K. ( 1990 ). Semiparametric efficiency bounds . Journal of Applied Econometrics 5 , 99 – 135 . OpenUrl ↵ Sargan , J. D. ( 1958 ). The estimation of economic relationships using instrumental variables . Econometrica 26 , 393 – 415 . OpenUrl CrossRef Web of Science ↵ Shi , D. , Wang , Y. , Zhang , Z. , Cao , Y. , and Hu , Y.-Q. ( 2023 ). MR-BOIL: Causal inference in one-sample Mendelian randomization for binary outcome with integrated likelihood method . Genetic Epidemiology 47 , 332 – 357 . OpenUrl ↵ Staley , J. R. and Burgess , S. ( 2017 ). Semiparametric methods for estimation of a non-linear exposure-outcome relationship using instrumental variables with application to Mendelian randomization . Genetic Epidemiology 41 , 341 – 352 . OpenUrl CrossRef PubMed ↵ Sun , B. , Liu , Z. , and Tchetgen Tchetgen , E. ( 2023 ). Semiparametric efficient G-estimation with invalid instrumental variables . Biometrika 110 , 953 – 971 . OpenUrl ↵ Tsiatis , A. A. ( 2006 ). Semiparametric Theory and Missing Data . Springer , New York . ↵ Van der Vaart , A. W. ( 2000 ). 
Asymptotic Statistics . Cambridge University Press . ↵ Wang , L. , Kai , B. , Heuchenne , C. , and Tsai , C.-L. ( 2013 ). Penalized profiled semiparametric estimating functions . Electronic Journal of Statistics 7 , 2656 – 2682 . OpenUrl ↵ Wang , L. , Zhou , J. , and Qu , A. ( 2012 ). Penalized generalized estimating equations for high-dimensional longitudinal data analysis . Biometrics 68 , 353 – 360 . OpenUrl CrossRef PubMed Web of Science ↵ Windmeijer , F. , Farbmacher , H. , Davies , N. , and Smith , G. D. ( 2018 ). On the use of the lasso for instrumental variables estimation with some invalid instruments . Journal of the American Statistical Association 114 , 1339 – 1350 . OpenUrl ↵ Zhang , B. and Tchetgen Tchetgen , E. J. ( 2022 ). A semi-parametric approach to model-based sensitivity analysis in observational studies . Journal of the Royal Statistical Society Series A: Statistics in Society 185 , S668 – S691 . OpenUrl ↵ Zhang , C.-H. ( 2010 ). Nearly unbiased variable selection under minimax concave penalty . The Annals of Statistics 38 , 894 – 942 . OpenUrl ↵ Zou , H. ( 2006 ). The adaptive lasso and its oracle properties . Journal of the American Statistical Association 101 , 1418 – 1429 . OpenUrl CrossRef Web of Science View the discussion thread. Back to top Previous Next Posted January 21, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Penalized Semiparametric Estimation for Causal Inference with Possibly Invalid Instruments Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Penalized Semiparametric Estimation for Causal Inference with Possibly Invalid Instruments Yunlong Cao , Yuquan Wang , Dapeng Shi , Dong Chen , Yue-Qing Hu medRxiv 2024.01.19.24301518; doi: https://doi.org/10.1101/2024.01.19.24301518 Share This Article: Copy Citation Tools Penalized Semiparametric Estimation for Causal Inference with Possibly Invalid Instruments Yunlong Cao , Yuquan Wang , Dapeng Shi , Dong Chen , Yue-Qing Hu medRxiv 2024.01.19.24301518; doi: https://doi.org/10.1101/2024.01.19.24301518 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (303) Cardiovascular Medicine (4457) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (610) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15244) Forensic Medicine (30) Gastroenterology (1132) Genetic and Genomic Medicine (6620) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4557) Health Policy (1372) Health Systems and Quality Improvement (1614) Hematology (543) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15935) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6634) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3348) Ophthalmology (980) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5463) Public and 
Global Health (9256) Radiology and Imaging (2210) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1198) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (532) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a0310cef5f9b85c8',t:'MTc4MDAxMTA2MQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00