α-KIDS: A novel feature evaluation in the ultrahigh-dimensional right-censored setting, with application to Head and Neck Cancer

doi:10.1101/2024.08.13.24311946

α-KIDS: A novel feature evaluation in the ultrahigh-dimensional right-censored setting, with application to Head and Neck Cancer

2024 · doi:10.1101/2024.08.13.24311946

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

⚙ AI-generated deep summary by claude@2026-06, 2026-06-24 · read from full text ⓘ

The paper introduces α-KIDS, a model-free two-stage feature screening and selection method for ultrahigh-dimensional right-censored survival data, motivated by identifying genes predictive of time to death in head and neck squamous cell carcinoma using TCGA HNSCC data and validation in GEO. The first stage performs nonparametric reproducing-kernel-based ANOVA statistics with a dual screening mechanism, while the second stage uses a unified knockoff approach with directional FDR control to refine features; finite-sample behavior is assessed via simulation studies. A key limitation emphasized is that survival/censoring methods require assumptions for proper performance, and α-KIDS is designed to be robust to unknown censoring mechanisms but still relies on the available right-censored framework and the screening/knockoff construction. This paper does not explicitly discuss endometriosis or adenomyosis; it was included in the corpus via a keyword match in the upstream search index.

Read from the paper's body, not the abstract. Not a substitute for reading the paper. No clinical advice. How this works

Full text 79,202 characters · extracted from preprint-html · click to expand

α-KIDS: A novel feature evaluation in the ultrahigh-dimensional right-censored setting, with application to Head and Neck Cancer | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search α -KIDS: A novel feature evaluation in the ultrahigh-dimensional right-censored setting, with application to Head and Neck Cancer Atika FArzana Urmi , Chenlu Ke , Dipankar Bandyopadhyay doi: https://doi.org/10.1101/2024.08.13.24311946 Atika FArzana Urmi 1 Department of Biostatistics, Virginia Commonwealth University , VA, USA Ph.D Find this author on Google Scholar Find this author on PubMed Search for this author on this site Chenlu Ke 2 Department of Statistical Sciences and 
Operations Research, Virginia Commonwealth University , VA, USA Ph.D Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: kec2{at}vcu.edu Dipankar Bandyopadhyay 1 Department of Biostatistics, Virginia Commonwealth University , VA, USA Ph.D Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF A bstract Recent advances in sequencing technologies have allowed collection of massive genome-wide information that substantially enhances the diagnosis and prognosis of head and neck cancer. Identifying predictive markers for survival time is crucial for devising prognostic systems, and learning the underlying molecular driver of the cancer course. In this paper, we introduce α -KIDS, a model-free feature screening procedure with false discovery rate (FDR) control for ultrahigh dimensional right-censored data, which is robust against unknown censoring mechanisms. Specifically, our two-stage procedure initially selects a set of important features with a dual screening mechanism using nonparametric reproducing-kernel-based ANOVA statistics, followed by identifying a refined set (of features) under directional FDR control through a unified knockoff procedure. The finite sample properties of our method, and its novelty (in light of existing alternatives) are evaluated via simulation studies. Furthermore, we illustrate our methodology via application to a motivating right-censored head and neck (HN) cancer survival data derived from The Cancer Genome Atlas, with further validation on a similar HN cancer data from the Gene Expression Omnibus database. The methodology can be implemented via the R package DSFDRC, available in GitHub. 1. Introduction Recent advancements in automated data collection techniques has led to an escalating prevalence of ultrahighdimensional data in biomedical sciences. 
Such data frequently exhibits an abundance of features that far surpasses the available number of observations. A salient example is evident in the massive data generated through high-throughput sequencing processes. With the ability to capture a wide spectrum of molecular, genetic, and phenotypic information on a large scale, researchers can now explore complex biological systems with unprecedented granularity and unveil novel insights into precision medicine, biomarker identification, and the exploration of pathways and networks. However, traditional statistical learning methods tend to falter when confronted with ultrahigh-dimensional data a predicament known as the ‘curse of dimensionality’. The present work was motivated by a study of the head and neck squamous cell carcinomas (HNSCC). Constituting around 4% of all cancer cases in the United States, head and neck cancer predominantly manifests as squamous cell carcinomas[ 1 ]. Despite surgery, radiation and chemotherapy, the 5-year survival rate stands at only 40-50% among all patients[ 30 ]. Extensive studies have found high biological and clinical heterogeneity in HNSCC patients, underscoring the need for a deeper molecular-level understanding of the disease. Our goal in this paper is to identify which genes, among hundreds of thousands, contribute to survival prognosis of HNSCC, utilizing data from the HNSCC cohort available in the Cancer Genome Atlas (TCGA) network. The primary endpoint is the time to death (due to HNSCC). Due to loss to follow-up or no event occurrence until the study’s conclusion, more than half of cases were right-censored. As it is widely acknowledged that only a small subset of molecular features are truly relevant to specific clinical outcomes, feature selection has become one of the cornerstones for biomarker identification. 
While regularization methods such as LASSO [ 43 ], SCAD [ 13 ], adaptive Lasso [ 53 ], and Dantzig selector [ 6 ] have been the most popular feature selection tools, they can suffer from various issues including computational expediency, statistical inaccuracy and algorithmic instability [ 16 ] when applied to ultrahigh dimensional settings. A pragmatic solution is to perform feature screening before embarking on exact feature selection. A screening procedure applies a coarse filter to individual features to winnow out a significant portion of noise, thus circumventing a concurrent analysis of all features that gives rise to the aforementioned issues. This is frequently achieved through dependence learning between the outcome and individual features, which can be model-based [ 14 , 23 , 12 ], or model-free [ 52 , 32 , 35 ]. Given the challenges of verifying model assumptions in ultrahigh dimensional data, coupled with the risk of overlooking vital features due to model misspecification, opting for model-free screening approaches is a more prudent choice in practice. In the regime of survival analysis, a plethora of model-free feature screening procedures have been developed for survival outcomes, subject to right censoring [40, 31, 51, 26, 7]. Since the true survival time is not fully observable, these approaches focus on the association between the estimated survival probabilities and individual features through, for examples, inverse-probability-of-censoring-weighted (IPCW) rank correlation [ 40 ], a generalized Kolmogorov statistic for covariate-stratified survival distribution [ 26 ], and distance correlation [ 7 ]. Nonetheless, there is no free lunch in estimating the survival function; assumptions on censoring are imposed, explicitly or implicitly, to ensure proper behavior of the survival estimators. 
The efficacy of the most widely used Kaplan-Meier estimation requires censoring to be independent of the event, as well as the predictive features for the survival time. Violations of independent censoring are not uncommon; subjects may tend to withdraw from the study due to either favorable or unfavorable prognosis, which can be further complicated by the effect of prognostic covariates. The ability of the IPCW method to adjust for dependent censoring hinges on the assumptions of exchangeability, and correct model specification used to estimate the weights. While a biased estimator undermines screening accuracy, it is extremely difficult to conjecture the censoring mechanism in practice to ensure unbiasedness, given ultrahigh dimensional covariates. Heavy censoring can also lead to less reliable estimators as the equivalent number of subjects at risk decreases at later times. The main impetus of this paper is the general lack of flexible and reliable screening tools for ultrahigh dimensional survival analysis, that are robust to heavy censoring and uncertain censoring mechanism as presented in the TCGA HNSCC dataset. Moreover, in order to ensure the retention of important features with high confidence, feature screening procedures tend to opt for a conservative threshold to distinguish signal from noise. This approach, however, often leads to an excess of false discoveries. Consequently, it is essential to supplement the feature screening process with a more precise feature selection step to control the false discovery rate (FDR) in biomarker identification, thereby enhancing the accuracy of prognostic modeling. Traditionally, feature selection and feature screening have been treated as separate domains in the literature. This division persisted until recent groundbreaking contributions in the form of the knockoff methodology [ 4 , 5 , 3 , 33 ], which bridged this gap. 
The knockoff features are strategically designed to replicate the correlation structure inherent in the original variables, serving as negative controls to aid in the identification of truly significant features while controlling the FDR. This innovative approach can be extended to ultrahigh dimensional survival analysis, offering a solution to the longstanding challenge of FDR control in feature screening. In this paper, we propose a novel feature screening procedure with FDR control for ultra-high dimensional right-censored survival outcomes. The proposed method operates in two key stages. First, we find a preliminary set of potentially important features with a dual screening mechanism. Specifically, two filters are implemented to screen out irrelevant information through nonparametric dependence learning between the raw survival outcome and individual features, with no need for intermediate survival function estimation. The contribution of each feature is quantified by reproducing-kernel-based ANOVA statistics [ 28 ] in a model-free way. Then, we further identify a refined set of important features under directional FDR through a unified knockoff procedure based on the same utility measures adopted in the initial screening step. Our proposed method enjoys several distinctive advantages. First, it requires no pre-specification of the model structure and minimizes the assumption on the censoring mechanism. As a result, it exhibits more resilience to dependent and heavy censoring, than existing alternatives. Second, it effectively detects both linear and nonlinear features by capitalizing on the kernel-based utility measures. Third, it coherently controls FDR, and thus protects prognostic modeling from excessive noise. Finally, it boasts general applicability to other censored regression settings characterized by ultrahigh dimensional data with easy and fast implementation. 
All these advantages greatly facilitate the utilization of the proposed method in real applications. We substantiate both theoretically and numerically that the proposed feature evaluation procedure enjoys the sure screening property with rigorous control over FDR. Furthermore, through empirical analysis conducted on the TCGA HNSCC dataset and external validation, we showcase the efficacy and practical utility of the proposed method. The rest of the paper is organized as follows. In Section 2 , we develop our new framework of feature screening for ultrahigh dimensional survival analysis, and provide necessary theoretical justification. Simulation studies assessing finite sample properties of our proposal, and comparisons to existing alternatives are provided in Section 3 . We illustrate our proposed methodology via application to the motivating TCGA HNSCC data in Section 4 . Finally, Section 5 concludes, with a short discussion. All technical proofs, along with additional simulation results are deferred to the Web Supplement accompanying the paper. 2 Statistical Methods In this section, we introduce a model-free dual screening framework for ultra-high dimensional right-censored data with FDR control. The proposed method is implemented via two main steps. First, it finds a crude set of potentially important features through a dual screening mechanism. Then, it further identifies a refined set of important features under directional FDR. 2.1 Assumptions and Dual Screening Let T be the survival time and X = ( X 1 , …, X p ) T be a p-dimensional set of covariates. We denote the censoring time by C . In reality, the observable survival response variables are ( Y, δ ), where Y = min {T, C} and δ = I{T ≤ C} with I{·} being the indicator function. 
Ideally, we would like to identify the smallest active set, denoted as $X_{\mathcal{A}_T} = \{X_j : j \in \mathcal{A}_T\}$, satisfying $T \perp X_{\mathcal{A}_T^c} \mid X_{\mathcal{A}_T}$ (1). Since T is not fully observable, our focus shifts to identifying the smallest active set relevant to the observable outcome ( Y, δ ), denoted as X 𝒜 = {X j : j ∈ 𝒜} , ensuring that $(Y, \delta) \perp X_{\mathcal{A}^c} \mid X_{\mathcal{A}}$ (2). Nonetheless, the relation between the two active sets $\mathcal{A}_T$ and $\mathcal{A}$ can be established under a relatively simple condition, as shown in the following proposition. Proposition 1 Let $X_{\mathcal{A}_T}$ and X 𝒜 be the active sets that satisfy (1) and (2), respectively. Assuming that $C \perp X_{\mathcal{A}_T^c} \mid (T, X_{\mathcal{A}_T})$ (3), we have $\mathcal{A} \subseteq \mathcal{A}_T$. The pair of conditions (1) and (3) is equivalent to $(T, C) \perp X_{\mathcal{A}_T^c} \mid X_{\mathcal{A}_T}$, which implies $(Y, \delta) \perp X_{\mathcal{A}_T^c} \mid X_{\mathcal{A}_T}$ because ( Y, δ ) is a function of ( T, C ). It follows immediately that under condition (3), the important predictors for the observable outcome ( Y, δ ) are also important predictors for the true survival time T . Moreover, it is expected that in practice the equality $\mathcal{A} = \mathcal{A}_T$ will normally hold since proper containment requires carefully balanced conditions. We note that assumption (3) is a mild condition since it allows censoring to vary with the true survival time and all the prognostic features. By contrast, the independent censoring condition, $C \perp (T, X)$, is more stringent and implies assumption (3). While many existing survival data screening methods assume independent censoring [ 40 , 7 , 10 , 34 ], this assumption can be unrealistic in cases with complex censoring mechanisms and a large number of features. Another common assumption for ensuring identifiability [ 51 , 26 , 48 ], $T \perp C \mid X$, does not ensure the equivalence of $X_{\mathcal{A}_T}$ and X 𝒜 . Throughout this paper, we assume that condition (3) holds, based on which a new framework of feature screening for right-censored data is developed. Specifically, we propose a new screening approach that directly applies to the raw survival outcome and thus avoids estimating the survival function. The following proposition lays the cornerstone for our method. 
Proposition 2 The pair of the following conditions ( b 1) and ( b 2) is equivalent to condition ( a ): ( a ) $(Y, \delta) \perp X_j$; ( b 1) $\delta \perp X_j$; ( b 2) $Y \perp X_j \mid \delta$, for j = 1, …, p . According to Proposition 2, the pair of conditions ( b 1) and ( b 2) jointly implies the irrelevance of X j . Or, in other words, important features must be either marginally correlated with δ or conditionally correlated with Y given δ . Since δ is simply binary, the two conditions can be easily assessed by a wide range of independence measures. As a result, feature screening for right-censored data boils down to traditional univariate independence learning on complete data, bypassing the need to estimate the survival function. In this paper, we adopt a recently developed nonparametric independence measure, namely Expected Conditional Characteristic function Based Independence Criterion[ 28 ] (ECCFIC), as the filter in our screening procedure. We briefly review ECCFIC in the following subsection and illustrate its advantages over some existing celebrated independence measures. 2.2 Independence Measures Let U, V ∈ ℝ be two random variables. Also, let ϕ U denote the characteristic function of U , and ϕ U | V denote the conditional characteristic function of U given V . Elicited by the fact that U ⊥ V if and only if ϕ U | V = ϕ U , the ECCFIC for quantifying the association between U and V is defined by $\mathcal{H}_w^2(U \mid V) = E_V \int_{\mathbb{R}} |\phi_{U \mid V}(t) - \phi_U(t)|^2 \, dw(t)$ (4), where w ( · ) is a finite nonnegative Borel measure on ℝ. An equivalent formula to (4) is given by $\mathcal{H}_K^2(U \mid V) = E_V \{ E_{U, U_1 \mid V} K(U - U_1) \} - E K(U - U_1)$ (5), where ( U 1 , V 1 ) is an i.i.d. copy of ( U, V ), $E_{U, U_1 \mid V}$ denotes conditional expectation E ( · | V = v, V 1 = v ), and K : ℝ → ℂ is a translation-invariant positive definite kernel induced by w , such that K ( x ) = ∫ ℝ e − ixt dw ( t ) for x ∈ ℝ by Bochner's theorem [ 46 ]. Henceforth, we consider the alternative representation in (5), as it is easier to estimate for a given kernel. As a special case, we have $\mathcal{H}_K^2(U \mid U) = K(0) - E K(U - U_1)$. It can be shown that $0 \le \mathcal{H}_K^2(U \mid V) \le \mathcal{H}_K^2(U \mid U)$. Moreover, if K is characteristic [ 19 ], then $\mathcal{H}_K^2(U \mid V) = 0$ if and only if U ⊥ V . As a result, we can define an R 2 -type statistic as $R_K^2(U \mid V) = \mathcal{H}_K^2(U \mid V) / \mathcal{H}_K^2(U \mid U)$ and $0 \le R_K^2(U \mid V) \le 1$. 
In particular, the kernel-based R 2 statistic equals 0 if and only if U ⊥ V , and equals 1 if and only if U is a measurable function of V . Intuitively, this kernel-based R 2 statistic can be regarded as a nonlinear generalization of the classical R 2 as it requires no linearity or distributional assumptions for the regression of U on V . It is worth noting that ECCFIC is closely related to a well-known family of measures, called Hilbert-Schmidt independence criterion[ 20 ] (HSIC), which includes distance covariances[ 42 ] as a special case. To assess the association between U and V , HSIC considers the discrepancy between the joint characteristic function ϕ U,V and the product of the marginals ϕ U ϕ V [ 21 ], where the two random variables are treated symmetrically. Although HSIC being equal to zero also indicates independence and vice versa, it is not clear under what circumstances HSIC approaches its upper bound or how the random variables are related when the upper bound is attained. Compared to HSIC, ECCFIC better quantifies the contribution of a feature to the outcome, since it characterizes both independence and functional dependence in a supervised way, thereby making ECCFIC a more appealing alternative for model-free feature screening. In a similar vein, the marginal effect associated with V (given another variable Z ∈ ℝ already contained in the model to explain U ) can be measured by the expected conditional characteristic function based conditional independence criterion (ECCFCIC [ 28 ]): Then, a kernel-based partial R 2 statistic can be defined by and with if and only if U ⊥ V | Z . Although here we restrict ourselves to translation-invariant kernels for ease of presentation, ECCFIC can be generalized using any positive definite characteristic kernel in the associated reproducing kernel Hilbert space[ 28 ]. Examples of characteristic kernels include, but are not limited to, the Gaussian, Laplacian, and inverse multiquadric kernels. 
2.3 The Screening Procedure Returning to the context of feature screening for right-censored data, we propose to use the following two utility measures to evaluate each feature based on conditions ( b 1) and ( b 2), respectively: The first measure is the kernel-based R 2 for the inverse regression of X j | δ and the second measure is the kernel-based partial R 2 for the inverse regression of X j | Y while adjusting for δ . Here the inverse regression is to facilitate sample estimation as δ is a binary variable. We note again that any appropriate nonparametric dependence measures may be used to assess conditions ( b 1) and ( b 2). For example, the marginal measure can be replaced by the Kolmogorov filter [ 35 ] or the MV-SIS filter [ 8 ], while the conditional measure can be replaced by by exchanging X j and Y . Given sample data { ( X i , Y i , δ i ) : i = 1, …, n} , we develop the estimators for the proposed utility measures. Let 𝒥 s = {i : δ i = s}, n s = | 𝒥 s |, and for s = 0, 1. Denote and for s = 0, 1. The marginal utility can be estimated as The conditional utility can be estimated as where, is the Nadaraya-Watson estimator of given δ = s , relying on a smoothing kernel G : ℝ → ℝ and an associated tuning bandwidth h s = h s ( n s ) ∈ ℝ . Specifically, where, . According to Proposition 2, features making a discernible marginal or conditional contribution to the survival outcome should be retained. Therefore, we estimate the active index set by where, c 1 , c 2 , γ 1 and γ 2 are some threshold values relying on the strength of the true signal, which is to be defined in condition v below. Henceforth, we refer to the proposed screening procedure as kernel-based independence dual screening (abbreviated as KIDS). The proposed procedure embraces the sure screening property as well as the rank consistency property, which are established in the Sure Screening and Rank Consistency theorems below. Let $\mathcal{A}_1 = \{j \in \mathcal{A} : X_j \not\perp \delta\}$ and $\mathcal{A}_2 = \{j \in \mathcal{A} : X_j \not\perp Y \mid \delta\}$. Then 𝒜 = 𝒜 1 ∪ 𝒜 2 . 
The following regularity conditions are imposed to facilitate the technical proofs, although they may not be the weakest ones. (i) The characteristic kernel K is bounded. (ii) The smoothing kernel G : ℝ → ℝ satisfies $\int_{\mathbb{R}} y^i G(y)\,dy = I\{i = 0\}$ for i = 0 and 1, and $G(y) = O((1 + |y|^4)^{-1})$. (iii) h s → 0 and as n s → ∞, for s = 0, 1. (iv) The density of Y given δ = s , denoted as f Y,s ( y ), is bounded away from zero, for s = 0, 1. In addition, the first derivative of f Y,s ( y ) is uniformly bounded by some constant that does not depend on y , for s = 0, 1. (v) There exist c 1 , c 2 > 0 and γ 1 , γ 2 ∈ [0, 1 / 2), such that and . (vi) There exist c 3 , c 4 > 0 and γ 3 , γ 4 ∈ [0, 1 / 2), such that and . Condition i is satisfied for many popular kernels [ 2 ]. Conditions ii-iv are commonly assumed for Nadaraya-Watson estimators. Conditions v and vi are also standard in the literature of variable screening, requiring that the true signal is detectable and is distinguishable from noise. Theorem [Sure Screening] Under conditions i-v, where, a > 0 is some constant. Theorem [Rank Consistency] Under conditions i-iv and vi, almost surely for . Proofs of the Sure Screening and Rank Consistency theorems appear in Web Supplement 6.2 and 6.3, respectively. The Sure Screening theorem suggests that all the important features are selected asymptotically almost surely, and the Rank Consistency theorem further indicates that active features can be well separated from inactive ones. Both properties hold with NP-dimensionality for some γ ∈ [0, 1 / 2). There is no established way of determining the threshold values in a finite sample setting. As it is commonly assumed that the cardinality of the truly important set is small, one may specify a model size d < n and select , where for d 1 + d 2 = d . Typical choices of d are [ n/ log( n )], 2[ n/ log( n )], 3[ n/ log( n )], and n − 1 [ 14 , 32 ]. 
We can simply set d 1 = d 2 = [ d/ 2], in which case the marginal and conditional utility measures are equally weighted in the selection of and be the two rankings of variables by and , respectively. A joint ranking can be acquired by ascending . Then selecting the top d variables is identical to the trivial choice of with d 1 = d 2 . The sure screening property entails that the probability of selecting all the active predictors is close to one when d is sufficiently large. Inevitably, false discoveries can be inflated simultaneously with a generous choice of d . We address this issue in the next two subsections. 2.4 FDR Control via Knockoff The most important assumption of ultrahigh dimensional problems is the sparsity principle, which assumes that the cardinality of 𝒜 is very small compared to p . In most cases, it is very hard, if not impossible, to recover A exactly without error. Ensuring all the active predictors are selected with high probability in the preceding screening procedure may introduce too much noise to the downstream analysis in the meanwhile. Therefore, a natural interest is to find a balancing trade-off between the sure screening property and the false discovery rate (FDR). In this subsection, we develop a dual selection procedure for right-censored data with FDR controlling using knockoff features. We say is a knockoff copy of X if Swapping X j with does not change the joint distribution of ( X , ); . The second condition is trivially achieved as long as is constructed without using ( Y, δ ). However, if the distribution of X is unknown, how to obtain exact knockoff copies that satisfy the first condition remains elusive. Nonetheless, we may construct approximate second-order knockoff features, such that is pairwise exchangeable with respect to the first two moments. Suppose µ = E ( X ), Σ = Cov ( X ). Mean invariance can be easily achieved by forcing . 
The second-order pairwise exchangeable condition is equivalent to and h ∈ ℝ p is a vector that makes G a positive semidefinite covariance matrix. Different approaches are available to select h [ 4 ]. For example, one may find h by solving subject to h j ≥ 0 and 2Σ − diag { h } being positive semidefinite. If we treat X as fixed [ 4 ] and normalize each feature such that the sample covariance and with X ∈ ℝ n×p being the data matrix, then the knockoff data matrix can be obtained by where, is an n × p orthonormal matrix that is orthogonal to the span of X , and is a Cholesky decomposition. In a more general Model- X setting [ 5 ] where X has an unknown distribution, we can generate approximate knockoff features from conditional normal distribution as Note that, if X is Gaussian, the equivalence of the first two moments implies the equivalence of the joint distribution, such that (8) yields exact knockoff features. Then, we quantify the contribution of X j to ( Y, δ ) by the following two measures: . . Given sample data , we estimate W j ,1 by , where and are calculated using (6) Similarly, we estimate W j ,2 by using (7). Intuitively, a large value of either or indicates the significance of X j as X j outperforms . On the other hand, it is expected that irrelevant variables behave similarly to their knockoff counterparts, resulting in small sample utilities that bounce around 0 as shown in the following proposition. Proposition 3 Let be an exact knockoff copy of X . Then, for j / ∈ 𝒜 , W j ,1 = W j ,2 = 0; Conditioning on and Bernoulli(0 . 5) , where I{·} is the indicator function . For fixed thresholds t 1 , t 2 > 0, the false discovery proportion is and FDR( t 1 , t 2 ) = E [FDP( t 1 , t 2 )]. Note, from Proposition 3, which leads to a conservative estimator of FDP: The proof of Proposition 3 appear in Appendix 6.4. 
The offset of 1 in the numerator, yielding a slightly more conservative estimator, is necessary both theoretically and empirically to control the FDR. Define and let . Then $(\mathcal{T}, \preceq)$ is a partially ordered set, where $(t_1, t_2) \preceq (t_1', t_2')$ if $t_1 \le t_1'$ and $t_2 \le t_2'$, for $(t_1, t_2), (t_1', t_2') \in \mathcal{T}$. To control FDR at a pre-specified level α , we choose the thresholds T α ,1 and T α ,2 as where $\min_{\preceq}$ represents the minimal element of the set with respect to $\preceq$. Then, the selected active set is given by Note that there can be more than one minimal element in $\mathcal{T}$; so, the choice of ( T α ,1 , T α ,2 ) may not be unique, leading to different estimates of the active set. In practice, one can choose the minimal element that yields the largest average utility of the selected features, , where if X j is ranked higher based on the marginal statistic than the conditional statistic and vice versa. This approach works well in our simulation studies (see, Section 3 ). Although this dual selection procedure controls FDR, it is not readily applicable to ultrahigh dimensional data because constructing knockoff features becomes computationally intractable for large p . However, feature screening and knockoff-based selection naturally complement each other under ultrahigh dimensionality: one can perform screening to reduce p , and then apply the knockoff technique to further control FDR [ 3 , 33 ]. We elaborate on this adaptation in the next subsection and show that sure screening is still attainable with FDR under control. 2.5 Refined Screening with FDR Control Consider splitting n observations into two disjoint groups of size n 1 and n 2 = n − n 1 , denoted as and . We follow the next two steps: We start with conducting the screening procedure as described in 2.3 using to select a small index subset of potentially relevant features for d < n 2 . Next, we run the knockoff procedure as described in 2.4 on the remaining data , ignoring features that were not selected in the screening step. 
Specifically, we first obtain the knockoff matrix for the original design matrix on the remaining data. Then, we compute and for . For a pre-specified FDR level α , we ultimately select where T α ,1 and T α ,2 are determined by solving (9). Hereafter, we refer to this screening-and-knockoff procedure as α -controlled k ernel-based i ndependence d ual s creening ( α -KIDS for short). It is critical that the two steps of α -KIDS are performed on distinct data. The following procedure would not control the FDR: we perform the screening step using the full data set to select and run the knockoff procedure on the dimension-reduced data . The problem is that can be viewed as a function of ( X , Y, δ ) because is selected using all data. As a result, there is no guarantee that , even if is constructed without using ( Y, δ ). The loss of FDR control is not merely theoretical; an unimportant feature X j that is kept by the screening step is generally more likely to appear as a false positive when running the knockoff filter, leading to a much higher FDR [ 3 ]. With the data splitting mechanism, as long as the screening step correctly identifies all the relevant features (which happens asymptotically almost surely as shown in Theorem 2.3), the knockoff step will control the FDR as desired. Moreover, the sure screening property is inherited. That is, the α -KIDS procedure achieves a balanced trade-off between type I and type II errors. This appealing property is justified in Theorem 2.5 below. Denote the event that all the important features are selected in the screening step. We further require that the true signal cannot be too weak to be captured by the knockoff filter: There exist c 5 , c 6 > 0 and γ 5 , γ 6 ∈ [0, 1 / 2), such that Theorem 2.5 (FDR-Controlled Sure Screening) For any α ∈ (0, 1), we have Furthermore, under condition v, for α ≥ 1 / | A |, where b > 0 is a constant. The proof of Theorem 2.5 appears in Appendix 6.5. 
Although data splitting is a straightforward approach to handle ultrahigh-dimensionality, there is certainly a loss of power since each step only uses part of the data. One solution to the issue is to smartly recycle the data used in the screening step to raise power while retaining the FDR control property for the knockoff procedure [ 3 ]. We modify the α -KIDS procedure as follows: The screening step remains the same. Use to select a small index subset of potentially relevant features for d < n 2 . For the knockoff step, we still start with obtaining the knockoff matrix for the original design matrix on the remaining data. Then we concatenate the original design matrix on the first n 1 observations with the knockoff matrix on the next n 2 observations as Now, we calculate and using the full data for , where is the i th row of the knockoff matrix . For a pre-specified FDR level α , we ultimately select where T α ,1 and T α ,2 are determined by solving (9). Here, we follow the convention to treat as fixed when creating in the knockoff step [ 3 ]. In other words, although was selected using the first portion of the data, we think of as being independent of . As a result, gives legitimate knockoff features and the procedure controls the directional FDR. On the other hand, there is an inherent gain of power compared to the data splitting approach as the first n 1 observations weigh in. If a feature X j is important, the first portion of data will contribute to large R 2 or partial R 2 values for both X j and since by design, and the second portion of data will help separate X j from its knockoff counterpart, resulting in a positive value of W j ,1 or W j ,2 . 
3 Simulation Studies In this section, we evaluate the performance of our method on simulated ultra-high dimensional datasets, and make comparisons with several other competing methods, including censored rank independence screening (CRIS) [ 40 ], integrated powered density (IPOD) [ 26 ], and robust censored distance correlation screening (RCDCS) [ 7 ]. Our method is conducted with the Gaussian kernel being the reproducing kernel as well as the smoothing kernel for density estimation. The bandwidths of the two Gaussian kernels are set to the heuristic median pairwise distance [ 22 ] and , where n is the sample size and is the sample standard deviation [ 39 ]. We generate correlated features X from N p ( 0 , Σ) with p = 5, 000 and Σ having a first-order autoregressive (AR) structure, and consider a variety of survival models under independent or dependent censoring. The design of correlated features is to mimic the phenomenon that features tend to be correlated, even purely by chance, in ultrahigh dimensional space [ 15 ], which makes it more challenging to distinguish truly important features from spurious ones and achieve exact feature selection. We report the following results based on 200 replicates: the τ th quantiles of the minimum model size (MMS), denoted as M τ , that includes all active features for the screening methods, where the MMS for KIDS is defined as min {M 1 + M 2 } such that ; the proportion of selecting a certain active predictor X j , denoted as P j , and the proportion of including all active predictors, denoted as P A , for all the screening methods and α -KIDS; the average model size (AMS) determined by α -KIDS; and empirical FDR (EFDR) for α -KIDS. Example 1 In this example, we evaluate the efficacy of KIDS in comparison to the other screening methods. Let X ∼ N p ( 0 , Σ), where Σ = AR (0.5). Given X , the true survival time is generated from the following accelerated failure time (AFT) model and proportional hazard (PH) model: 1. 
Model 1: , where E ∼ N (0, 1) independently; 2. Model 2: , where E follows the standard extreme value distribution independently, which corresponds to a PH model [ 27 ]. For each model, the survival time is subject to two censoring mechanisms: independent censoring time C generated from a uniform distribution on [0, c 0 ]; dependent censoring time C generated from an exponential distribution with mean , where the constant c 0 is chosen to achieve 30% or 50% censoring rate (CR). The results are summarized in Table 1 for n = 200 and d = [ n/ log n ] = 38. In all scenarios, KIDS outperforms the other methods with higher selection proportions, and minimum model sizes closer to the truth, i.e., | 𝒜 | = 3. The three competitors are not as robust to heavy censoring or dependent censoring as KIDS. In addition, CRIS barely detects the feature ( X 7 ) that is non-linearly related to the endpoint. In the Supplements, we further consider a linear design with varying signal strength (Example 3), and a more complex nonlinear design (Example 4) for both AFT- and PH-type models under varying censoring mechanisms. Once again, our method performs consistently well compared to the other methods. View this table: View inline View popup Download powerpoint Table 1: Quantiles of MMS ( M τ ) and selection proportions ( P j ’s and P A ) for models in Example 1 based on 200 replicates with n = 200, p = 5000 and d = [ n/ log n ] = 38. Example 2 This example is to verify Theorem 2.5 for the α -KIDS procedure. Similar to Example 1, we generate X from N ( 0 , Σ) with Σ = AR (0.3) and simulate the true survival time from the following two models: 1. Model 3: log T = µ ( X ) + E , where and E ∼ N (0, 1), independently; 2. Model 4: log(.5( e 2 T − 1)) = µ ( X ) + E , where µ ( X ) is the same as in Model 3 and E follows the standard extreme value distribution, independently. 
The censoring time is simulated from: (a) uniform distribution on [0, c 0 ]; (b) exponential distribution with mean , where the constant c 0 is chosen to yield 30% or 50% CR. We set n = 2, 000, n 1 = 500, n 2 = 1, 500, d = 100 and vary the nominal level α from .1 to .3 [ 3 , 33 ]. We report the overall selection proportion for the screening step, whereas, for the knockoff step, we report P j ’s, P 𝒜 , AMS and EFDR given . The results are summarized in Table 2 . Although the models involve linear and nonlinear terms of correlated features, the α -KIDS procedure in general controls the FDR at the desired level fairly well and inherits the sure screening property across different censoring settings. Note, at α = .1, the procedure has to precisely identify 𝒜 (in theory) to control the FDR and maintain power simultaneously because the FDP is exactly .1 with | 𝒜 | = 10 – a challenging borderline scenario. If α < .1, with high probability, the procedure ends up with an empty set. View this table: View inline View popup Download powerpoint Table 2: Selection proportions ( , P j ’s and P A ), AMS and EFDR for models in Example 2 based on 200 replicates with n 1 = 500, n 2 = 1500, p = 5000 and d = 100. The choices of n 1 /n 2 ratio and the model size d for the screening step in our setting appear to give a favorable balance between finding a sufficiently good screened set at the first stage, and retaining a large enough sample size for powerful inference in the second stage. In practice, we also suggest a split with n 2 > n 1 to allow more information for accurate selection via knockoff. On the other hand, d cannot be too small to ensure the coverage of 𝒜 for the screening step and to provide adequate amount of noise as reference to control FDR for the knockoff step, while the computational cost for knockoff may prevent us from choosing an arbitrarily large d . 
Whether we can determine theoretically the optimal split and model size for making the most discoveries is worthy of future research. 4 Application: Head And Neck Cancer Data We investigated the head and neck squamous cell carcinoma (HNSCC) cohort in the Cancer Genome Atlas (TCGA) network. Upper quartile normalized RSEM TPM mRNA expression values for 518 primary-solid tumor samples with matched clinical information were obtained using the R package curatedTCGAData. Genes with low expression, as indicated by a zero interquartile range, were excluded from the analysis. The remaining genes were log-transformed. The endpoint of interest in our study was the number of days to death, which was subject to right censoring either due to loss to follow-up or no event occurrence until the end of the study. The observed survival time ranged from 2 to 6417 days with a median of 649.5 days and with 57.53% censoring rate. For external validation of our findings, gene expressions and clinical data for 253 HNSCC primary tumor samples were acquired from the Gene Expression Omnibus (GEO) database (accession number GSE65858 [ 47 ]). After preliminary data processing, a total of 15,887 genes commonly available in the TCGA and the GSE65858 datasets, along with important clinical covariates (as summarized in Table 8 in the Web Supplement), were included for our analysis. The R package DSFDRC available in GitHub ( https://github.com/urmiaf/DSFDRC ) implements the methodology. 4.1 Model Selection The TCGA samples were partitioned into training and testing subgroups using a 4:1 ratio. The training cohort consisted of 414 samples, while the testing cohort had 104 samples, and both cohorts shared the same censoring proportion. We employed various competing methods on the training data to identify prognostic gene signatures for HNSCC, and subsequently evaluated their performance using the testing samples. 
The models were as follows: α -KIDS, with specific parameters set to n 1 = 200, d = 100 and α = 0.1, followed by a Cox proportional hazard model built on the selected genes; KIDS/CRIS/IPOD/RCDCS to pre-select d = [104 / log(104)] = 22 candidate genes, followed by a penalized Cox model (CoxNet) applied on the dimension-reduced data for further gene selection and prognostic modeling; α -KIDS followed by a Cox gradient boosting machine (CoxGBM [ 18 ]) built on the selected genes; KIDS/CRIS/IPOD/RCDCS to pre-select d = 22 candidate genes, followed by the double-slicing assisted procedure (DS [ 9 ]) for further gene selection and finally a prognostic CoxGBM applied on the dimension-reduced data after the screening and selection steps. To ensure a fair comparison between α -KIDS (which performs both screening and selection), and the screening-only procedures (KIDS, CRIS, IPOD, and RCDCS), we augmented the screening procedures with more precise selection techniques, namely CoxNet, a model-based approach, and DS, a model-free method. The DS procedure identifies low-dimensional sparse linear combinations of features Γ T X , such that ( Y, δ ) ⊥ X |Γ T X , where Γ is a p × q matrix with q (the number of linear combinations) usually being much smaller than p . It achieves simultaneous feature selection through regularization without assuming any parametric distribution of ( Y, δ ) or linear relation between ( Y, δ ) and Γ T X . For the purpose of gene selection, we only leveraged the ability of DS to extract relevant genes rather than utilizing the linear combinations it produced. Both linear Cox model and nonlinear CoxGBM were used to construct prognostic signatures based on the selected genes. Optimal tuning parameters for CoxNet, DS, and CoxGBM were determined through cross-validation. 
A patient’s gene signature loading was calculated as the linear predictor for the fitted Cox/CoxNet model, or the link function value of the fitted CoxGBM model, which can also be viewed as a risk score. Subsequently, patients were classified into high-risk and low-risk groups, using the median risk score of the training cohort as the cutoff. The log-rank tests were conducted to compare the survival functions of the two risk groups and the p-values are reported in Table 3 . Furthermore, we assessed the gene signatures by the time-dependent dynamic receiver operating characteristic (ROC) curves [ 24 ] at 1, 3 and 5 years. The corresponding area under curve (AUC) values are summarized in Table (3). According to the log-rank tests and the ROC curves, relevant genes selected by α -KIDS and KIDS led to more informative prognostic signatures for HNSCC in terms of risk stratification and survival prediction. Notably, the α -KIDS+CoxGBM model demonstrated the most favorable overall performance on the testing samples. We then proceeded to refit the model to the entire TCGA dataset, and validated the resulting gene signature with the external data. View this table: View inline View popup Download powerpoint Table 3: Gene signature sizes (number of genes in a signature), p-values for the log-rank tests on the risk stratification of the TCGA samples, and AUCs for 1-, 3-, 5-year ROC curves of the risk scores across competing methods. 4.2 10-gene Signature and External Validation The α -KIDS procedure was applied to the full TCGA dataset, resulting in the selection of 10 genes: OLR1, SPOCK1, DDX19A, FADS3, P2RX6, C9ORF4, C15ORF21, TMED6, TFB2M and C22ORF15. A 10-gene prognostic signature was constructed using CoxGBM subsequently. The GEO platform data was used to validate the effectiveness of the gene signature. 
Patients were classified as having a high-risk gene signature or a low-risk gene signature on the basis of the link function values, with the median score of the TCGA samples as the threshold. Patients with a high-risk 10-gene signature exhibited significantly lower median survival compared to those with a low-risk gene signature in both the TCGA cohort (727 days vs. 2717 days) and the validation cohort (1068 days vs. 1962 days), as supported by the Kaplan-Meier curves and the log-rank test p-values ( Figure 1 ). An interesting finding is that patients with HPV infection were associated with lower risk scores in both cohorts (p-values < 0.001 based on two-sample t-tests). This aligns with previous reports that patients with HPV-positive cancers generally experience better prognoses than those with HPV-negative cancers, particularly for tumors arising in the oropharynx [ 29 ]. Therefore, the gene signature may offer insights into the underlying molecular mechanisms of the HPV heterogeneity. Download figure Open in new tab Figure 1: Kaplan–Meier estimates of overall survival (solid) with 95% confidence interval (dash) and log-rank test p-values for risk stratification of training (TCGA) and validation (GEO) samples according to the 10-gene signature identified by the α -KIDS+CoxGBM model. Additionally, multivariate Cox proportional-hazards regression analysis was used to evaluate independent prognostic factors associated with survival, and the 10-gene signature, age, sex, tumor stage, HPV status, alcohol history and smoking history were used as covariates. The fitted models are summarized in Table 4 . For the TCGA cohort, the 10-gene signature was a strong predictor with a hazard ratio of 7.62 (p-value < 0.001), after adjusting for other clinical covariates. There was a 2% increase in the expected hazard relative to a one year increase in age (p-value < 0.001). 
Patients with IV-stage cancer experienced a remarkable 98% increase in hazard (p-value < 0.001) compared to those in early stages (I/II). Similar results were observed in the validation cohort, indicating the potential clinical utility of the 10-gene signature in enhancing prognostic assessments and guiding personalized treatment decisions for HNSCC patients beyond conventional phenotype-based predictors. View this table: View inline View popup Download powerpoint Table 4: Multivariate Cox regression analysis based on the 10-gene signature and other clinical covariates. CI denotes confidence interval. 4.3 Integrated Clinicogenomics Modeling Results from the above analysis motivated us to consider a CoxGBM combining the 10 genes selected by α -KIDS and important clinical covariates, namely age, sex, tumor stage, HPV status, alcohol history and smoking history. The composite model was compared against two other CoxGBM models: the one based solely on the 10 selected genes (which was used to discover the 10-gene signature in the previous subsection) and the other based solely on the clinical covariates. Again, the TCGA cohort was utilized as the training data and the GEO platform data served as the external validation data. Differences in survival between the high-risk group and the low-risk group were analyzed with the log-rank test. ROC curves and associated AUCs were calculated to assess time-dependent predictive performance of the three models. The results, as summarized in Table 5 , revealed that the model integrating both clinical and genetic information had improved prognostic accuracy over the other two models. View this table: View inline View popup Download powerpoint Table 5: P-values for the log-rank tests on the risk stratification, and AUCs for 1-, 3-, 5-year ROC curves of the risk scores across three competing CoxGBMs. Finally, we highlight some biological implications of the genes selected by α -KIDS. 
OLR1 is a scavenger receptor for oxidized low-density lipoprotein (LDL) on endothelial cells and other cell types. OLR1 up-regulation in different tumors has evidenced its involvement in cancer onset, progression and metastasis, including HNSCC [ 36 , 50 ]. High expression of FADS3, located at the cancer genomic hotspot 11q13 locus, has been reported to predict poor prognosis in HNSCC [ 41 ]. The oncogenic functions of SPOCK1, C15orf21, and TMED6 have also been investigated in several cancer cells [ 38 , 17 , 44 , 11 , 49 ]. 5 Discussion Large-scale collaborative efforts, such as TCGA, have provided researchers with access to vast and curated data, enabling investigations into the underlying molecular mechanisms of HNSCC prognosis at various levels of complexity. A notable characteristic of such datasets is their ultra-high dimensionality, which places particular demands on the methods used to build prognostic models: they must be able to handle data where the number of features far exceeds the number of observations. Moreover, in the context of survival analysis, how to handle censoring appropriately is paramount to avoid biased estimations and drawing incorrect conclusions, especially in the presence of heavy censoring. Feature screening emerges as a crucial step to efficiently reduce dimension before undertaking more accurate analyses. However, existing methods often impose explicit or implicit assumptions on censoring that are rather difficult to verify given the large number of features, creating impediments to their practical uses. Our proposed novel feature screening procedure quickly reduces irrelevant information under ultrahigh-dimensional right-censored settings, along with a unified selection procedure to control FDR. The proposed framework requires no pre-specification of the model structure and makes minimal assumptions on the censoring mechanism. 
The flexibility is achieved by direct nonparametric learning of the survival outcome, without the need for intermediate estimation of survival probabilities. Our methodology is also readily generalizable for feature evaluation in other cancer types. We remark that even if our assumption in equation 3 is not met, our procedure still serves the purpose of feature screening by identifying 𝒜 ( T,C ) , the active set for ( T, C ), jointly, which inherently contains A T . To further isolate the important features for T from the estimated active set, more precise feature selection methods [ 9 ] that are tailored for lower dimensional data, can be further applied. The FDR control step ensures that only the most informative features enter the downstream analyses to construct accurate prognostic models. Although our initial motivation was to address the challenges lying within the TCGA HNSCC dataset, the developed method is generally applicable to devise robust prognostic systems for new patient cohorts and other cancers. Along the line, future research should explore how to account for sample heterogeneity and integrate domain knowledge into the feature screening procedure, especially for HNSCC data encompassing samples from diverse sites and HPV subtypes. Additional avenues for future research include extending the methodology to more complex (cancer) endpoints, such as interval-censoring, and multistate models, etc. Our current exploration only considers patient mRNA genomic information. However, an integrative approach that analyzes and combines multiple -omics data, such as genomic, transcriptomic, and methylome data via identifying and validating a multi-omics signature may enhance HNSCC prognosis[ 37 ]. An extension of our current methodology for the multi-omics case, although non-trivial, is relevant. 
Data Availability All data produced in the present study are available upon reasonable request to the authors. https://github.com/urmiaf/DSFDRC Supplementary Materials 6 Theorems and Proofs 6.1 Lemmas and Proofs In this subsection, we first show some useful lemmas as preliminary results for Supplements 6.2 and 6.3. Lemma 1 (Deviation bound for U-statistics, [ 25 ]). Let g ( U 1 , …, U r ) be a kernel of a U-statistic U n , i.e., , where n > r , and is taken over all r-tuples {i 1 , …, i r } drawn without replacement from { 1, …, n}. If b 1 ≤ g ( U 1 , …, U r ) ≤ b 2 , then for any ϵ > 0, the following bound holds: where w := [ n/r ], the largest integer contained in n/r . This lemma gives a uniform bound for any U-statistic of arbitrary dimensional data, as long as the associated kernel is bounded. We repeatedly use this result to prove the next two lemmas. Lemma 2 (Deviation bound for marginal utilities). Under condition i, for any ϵ ∈ (0, 1), where j = 1, …, p, and a 1 > 0 is a constant . Proof . We aim to show the uniform consistency of the denominator and the numerator of Ω j ,1 under regularity conditions respectively. Because the denominator of Ω j ,1 has a similar form as the numerator, we deal with its numerator only below. Let where are V-statistics. Let be corresponding U-statistics with . Under condition i, without loss of generality, we assume that the kernel K is bounded above by 1. Hence, 0 ≤ E l ≤ 1 for s = 0, 1. Denote , where P s = P ( δ = s ). For any ϵ ∈ (0, 1), Let us consider T 1 first. where the last inequality follows from Lemma 1. Also, and . Combining T 1 , T 2 and T 3 , we have for some a 1 > 0. Lemma 3 (Deviation bound for conditional utilities). Under conditions i-iv, for any ϵ ∈ (0, 1), where j = 1, …, p, and a 2 > 0 is a constant . Proof . For a given j ∈ { 1, …, p} and s ∈ { 0, 1 } , let , where , then . 
The kernel regression estimator where is the density function of Y given and Without loss of generality, we assume that f Y,s ( y ) is bounded below by some L > 0 in condition iv. We first show some intermediate results. (R1) Note that and conditions ii and iii. Denote . Then by Taylor expansion and conditions ii and iv. Hence, (R2) . Denote the corresponding U-statistic of as , that is, where and Σ π represents summation over the 5! permutations of ( i 1 , …, i 5 ). Under conditions ii and iii, . Next, we will show that in two parts. Firstly, by Taylor expansion and conditions ii and iv. Similarly, we can show Therefore, . Then Now, for arbitrary ϵ ∈ (0, 1), By (R2), T 1 ≤ 2 exp { − n w ϵ 2 / 32 } . Moreover, By (R1), . Let and be the corresponding U-statistic. Similar to (R2), we can show that Hence, for n s sufficiently large, Finally, we have where is some constant depending on L . Consequently, for some a 2 > 0. 6.2 Proof of Theorem 2.3 Proof . Following from Lemmas 2 and 3 , and Under condition v, if , there must exist some j ∈ 𝒜 1 such that or some j ∈ 𝒜 2 such that but and . Therefore, where b is a constant depending on c 1 and c 2 , and γ = max {γ 1 , γ 2 } . In other words, 6.3 Proof of Theorem 2.3 Proof . By condition vi and Lemma 2 , for some a 3 > 0 depending on c 3 . Since , we have for n sufficiently large. For some n 0 sufficiently large, By the Borel–Cantelli lemma, We can derive similarly that 6.4 Proof of Proposition 3 Proof . For any j , let ( X , ) ( j ) be the vector by swapping the entries X j and in ( X , ); let ( x , ) ( j ) be the vector by swapping the entries x j and in ( x , ) ∈ ℝ 2 p ; and let x − j denote the vector of x excluding x j . Let f U | V ( u | v ) denote the conditional distribution of U given V = v . For j ∉ 𝒜 1 , where the second and the last equations are due to ( Y, δ ) , and the two equations in between follow from ( Y, δ ) ⊥ X j | X − j for j ∉ 𝒜 . 
That is, Since by the definition of knockoff copies, it follows that which implies that and . Therefore, W j ,1 = W j ,2 = 0. In fact, we can show by repeating the above arguments that for any 𝒮 ⊂ 𝒜 c , where ( X , ) S is the vector by swapping the entries X j and in ( X , ) for all j ∈ 𝒮 . Let and let be a function such that . Define E 1 , · · ·, E p such that E j = 1 for j ∈ A 1 and E j is i.i.d. coin flip of { +1, −1 } for j ∉ 𝒜 1 . Consider 𝒮 = {j : E j = −1 } ⊂ , then The statement for can be shown analogously. 6.5 Proof of Theorem 2.5 Throughout this proof, we restrict ourselves to the event . Let . Denote by the k th largest absolute value of the marginal statistics , k = 1, …, d . Define analogously for the conditional statistics. For ease of presentation, we further define . Then we have The first inequality holds since and the second inequality is due to the definition of ( T α ,1 , T α ,2 ) in (9). Consider a partially-ordered discrete time process for , where Let F ( k 1 , k 2 ) be the σ -field generated by knowing {V ± ( j 1 , j 2 ) : d + 1 ≥ j 1 ≥ k 1 , d + 1 ≥ j 2 ≥ k 2 } as well as all the non-null statistics. The collection {F ( k 1 , k 2 ) : k 1 , k 2 = d + 1, d , …, 1 } of σ -fields is monotonic (thus a filtration) since for and . Next, we show that {M ( k 1 , k 2 ) : k 1 , k 2 = d + 1, d , …, 1 } is a supermartingale (running backward) with respect to {F ( k 1 , k 2 ) : k 1 , k 2 = d + 1, d , …, 1 } . In other words, E [ M ( k 1 − 1, k 2 )| F ( k 1 , k 2 )] ≤ M ( k 1 , k 2 ) and E [ M ( k 1 , k 2 − 1)| F ( k 1 , k 2 )] ≤ M ( k 1 , k 2 ), ∀ k 1 , k 2 . Suppose that for . The filtration F ( k 1 , k 2 ) informs us about whether ∈ 𝒜 or not. On the one hand, if ∈ A or , then M ( k 1 − 1, k 2 ) = M ( k 1 , k 2 ). On the other hand, if ∈ ℬ and , then where . Since I j ∼ Bernoulli(0.5) for j ∈ B by Proposition 3, it follows that given ℱ ( k 1 , k 2 ). As a result, Therefore, E [ M ( k 1 − 1, k 2 )| ℱ ( k 1 , k 2 )] ≤ M ( k 1 , k 2 ). 
We can show that E [ M ( k 1 , k 2 − 1)| ℱ ( k 1 , k 2 )] ≤ M ( k 1 , k 2 ) in the similar vein. In this process, ( T α ,1 , T α ,2 ) can be regarded as a stopping time with respect to the filtration ℱ ( k 1 , k 2 ) as ∈ ℱ ( k 1 , k 2 ), where and denote the indices such that and . According to the optional sampling theorem [ 45 ] and Proposition 3, we deduce where d 0 = | ℬ |, Y 1 = # { and }, Y 2 = # { and }. Let Y 3 = # { and } and Y 4 = # { and }. Then ( Y 1 , …, Y 4 ) follow a multinomial distribution with equal event probabilities and . It follows that Since Y 2 ∼ Binomial by Proposition 3, we have And Therefore, As a consequence, FDR≤ α . In the next, we show the sure screening property. We can deduce from Lemma 2 that for some b 5 > 0. Furthermore, since W j = 0 for j / ∈ A and d 0, That is, important features are ranked above unimportant ones with probability approaching 1. Given and , the knockoff procedure stops at and as , in which case . 7 Additional Simulation Results As additional simulation studies, we further consider a linear design with varying signal strength (Example 3) and a more complex nonlinear design (Example 4) for both AFT- and PH-type of models under different censoring mechanisms. Example 3 . Let X ∼ N p ( 0 , Σ), where Σ = AR (0.5). Given X , the true survival time is generated from the following accelerated failure time (AFT) model and proportional hazard (PH) model: 1. Model 1: log T = 2 X 1 + .8 X 2 + .9 X 3 + X 4 + 2 X 5 + E , where E ∼ N (0, 1) independently; 2. Model 2: log(.5( e 2 T − 1)) = 2 X 1 + .8 X 2 + .9 X 3 + X 4 + 2 X 5 + E , where E follows the standard extreme value distribution independently. For each model, the survival time is subject to two censoring mechanisms: independent censoring time C generated from uniform distribution on [0, c 0 ]; dependent censoring time C generated from exponential distribution with mean c 0 e X 3, where the constant c 0 is chosen to achieve 30% or 50% censoring rate (CR). 
The results are summarized in Table 6 for n = 200 and d = [ n/ log n ] = 38. View this table: View inline View popup Download powerpoint Table 6: Quantiles of MMS ( M τ ) and selection proportions ( P j ’s and P A ) for models in Example 3 based on 200 replicates with n = 200, p = 5000 and d = [ n/ log n ] = 38. Example 4 The setup of this example is identical to Example 3, except that the true survival time is generated from the following accelerated failure time (AFT) model and proportional hazard (PH) model: 1. Model 3: log T = g 1 + g 2 + g 3 + g 10 + E ,where E ∼ N (0, 1) independently and , g 3 = 2[ exp (−3( X 3 − 1) 2 ) + exp (−4( X 3 − 3) 2 )] and : 2. Model 4: log(.5( e 2 T − 1)) = g 1 + g 2 + g 3 + g 10 + ϵ, where ϵ follows the standard extreme value distribution independently. The results are summarized in Table 7 for n = 200 and d = [ n/ log n ] = 38. View this table: View inline View popup Download powerpoint Table 7: Quantiles of MMS ( M τ ) and selection proportions ( P j ’s and P A ) for models in Example 4 based on 200 replicates with n = 200, p = 5000 and d = [ n/ log n ] = 38. 8 Clinical Characteristics of The TCGA and GSE65858 Primary Tumor Samples A summary of the clinical characteristics of the TCGA and GSE65858 primary tumor samples is presented in Table 8 . View this table: View inline View popup Download powerpoint Table 8: Subgroup frequency (and percentage in parentheses) for clinical characteristics of the TCGA and GSE65858 Primary Tumor Samples. Acknowledgements This research was partially funded by grants P20CA252717, P20CA264067 and R21DE031879 from the United States National Institutes of Health (NIH), and the VCU Quest fund. Services and products in support of the research project were generated by the VCU Massey Comprehensive Cancer Center Biostatistics Shared Resource, supported, in part, with funding from NIH-NCI Cancer Center Support Grant P30CA016059. Footnotes urmiaf{at}vcu.edu dbandyop{at}vcu.edu References [1]. 
↵ Athanassios Argiris , Michalis V Karamouzis , David Raben , and Robert L Ferris . Head and Neck Cancer . The Lancet , 371 ( 9625 ): 1695 – 1709 , 2008 . OpenUrl [2]. ↵ Krishnakumar Balasubramanian , Bharath Sriperumbudur , and Guy Lebanon . Ultrahigh dimensional feature screening via rkhs embeddings . In Carlos M. Carvalho and Pradeep Ravikumar , editors, Proceedings of the Sixteenth International Conference on Artificial Intelligence and Statistics, volume 31 of Proceedings of Machine Learning Research , pages 126 – 134 . PMLR , 2013 . [3]. ↵ Rina Foygel Barber and Emmanuel J Candès . A knockoff filter for high-dimensional selective inference . The Annals of Statistics , 47 ( 5 ): 2504 – 2537 , 2019 . OpenUrl [4]. ↵ Rina Foygel Barber and Emmanuel J. Candès . Controlling the false discovery rate via knockoffs . The Annals of Statistics , 43 ( 5 ): 2055 – 2085 , 2015 . OpenUrl [5]. ↵ Emmanuel Candes , Yingying Fan , Lucas Janson , and Jinchi Lv . Panning for gold: ‘model-X’ knockoffs for high dimensional controlled variable selection . Journal of the Royal Statistical Society: Series B (Statistical Methodology) , 80 ( 3 ): 551 – 577 , 2018 . OpenUrl CrossRef [6]. ↵ Emmanuel Candes and Terence Tao . The dantzig selector: Statistical estimation when p is much larger than n . The Annals of Statistics , 35 ( 6 ): 2313 – 2351 , 2007 . OpenUrl [7]. ↵ Xiaolin Chen , Xiaojing Chen , and Hong Wang . Robust feature screening for ultra-high dimensional right censored data via distance correlation . Computational Statistics & Data Analysis , 119 : 118 – 138 , 2018 . OpenUrl [8]. ↵ Hengjian Cui , Runze Li , and Wei Zhong . Model-free feature screening for ultrahigh dimensional discriminant analysis . Journal of the American Statistical Association , 110 ( 510 ): 630 – 641 , 2015 . OpenUrl [9]. ↵ Shanshan Ding , Wei Qian , and Lan Wang . Double-slicing assisted sufficient dimension reduction for high-dimensional censored data . 
The Annals of Statistics , 48 ( 4 ): 2132 – 2154 , 2020 . OpenUrl [10]. ↵ Dominic Edelmann , Manuela Hummel , Thomas Hielscher , Maral Saadati , and Axel Benner . Marginal variable screening for survival endpoints . Biometrical Journal , 62 ( 3 ): 610 – 626 , 2020 . OpenUrl [11]. ↵ João Fadista , Petter Vikman , Emilia Ottosson Laakso , Inês Guerra Mollet , Jonathan Lou Esguerra , Jalal Taneera , Petter Storm , Peter Osmark , Claes Ladenvall , Rashmi B Prasad , et al. Global genomic and transcriptomic analysis of human pancreatic islets reveals novel genes influencing glucose metabolism . Proceedings of the National Academy of Sciences , 111 ( 38 ): 13924 – 13929 , 2014 . OpenUrl Abstract / FREE Full Text [12]. ↵ Jianqing Fan , Yang Feng , and Rui Song . Nonparametric independence screening in sparse ultra-high-dimensional additive models . Journal of the American Statistical Association , 106 ( 494 ): 544 – 557 , 2011 . OpenUrl CrossRef PubMed [13]. ↵ Jianqing Fan and Runze Li . Variable selection via nonconcave penalized likelihood and its oracle properties . Journal of the American statistical Association , 96 ( 456 ): 1348 – 1360 , 2001 . OpenUrl CrossRef Web of Science [14]. ↵ Jianqing Fan and Jinchi Lv . Sure independence screening for ultrahigh dimensional feature space . Journal of the Royal Statistical Society: Series B (Statistical Methodology) , 70 ( 5 ): 849 – 911 , 2008 . OpenUrl CrossRef PubMed [15]. ↵ Jianqing Fan and Jinchi Lv . A selective overview of variable selection in high dimensional feature space . Statistica Sinica , 20 ( 1 ): 101 , 2010 . OpenUrl [16]. ↵ Jianqing Fan , Richard Samworth , and Yichao Wu . Ultrahigh dimensional feature selection: beyond the linear model . The Journal of Machine Learning Research , 10 : 2013 – 2038 , 2009 . OpenUrl [17]. ↵ Li-Ching Fan , Yung-Ming Jeng , Yueh-Tong Lu , and Huang-Chun Lien . 
Spock1 is a novel transforming growth factor-β–induced myoepithelial marker that enhances invasion and correlates with poor prognosis in breast cancer . PLoS One , 11 ( 9 ): e0162933 , 2016 . OpenUrl [18]. ↵ Jerome H Friedman . Greedy function approximation: a gradient boosting machine . Annals of statistics , pages 1189 – 1232 , 2001 . [19]. ↵ Kenji Fukumizu , Arthur Gretton , Gert Lanckriet , Bernhard Schölkopf , and Bharath K. Sriperumbudur . Kernel choice and classifiability for rkhs embeddings of probability distributions . In Y. Bengio , D. Schuurmans , J. Lafferty , C. Williams , and A. Culotta , editors, Advances in Neural Information Processing Systems , volume 22 , pages 1750 – 1758 . Curran Associates, Inc. , 2009 . OpenUrl [20]. ↵ Arthur Gretton , Olivier Bousquet , Alex Smola , and Bernhard Schölkopf . Measuring statistical dependence with hilbert-schmidt norms . In Algorithmic Learning Theory: 16th International Conference, ALT 2005, Singapore, October 8-11, 2005. Proceedings 16 , pages 63 – 77 . Springer , 2005 . [21]. ↵ Arthur Gretton , Kenji Fukumizu , and Bharath K Sriperumbudur . Discussion of: Brownian distance covariance . The annals of applied statistics , 3 ( 4 ): 1285 – 1294 , 2009 . OpenUrl [22]. ↵ Arthur Gretton , Kenji Fukumizu , Choon Teo , L. Song , Bernhard Schölkopf , and Alex Smola . A kernel statistical test of independence . In J. Platt , D. Koller , Y. Singer , and S. Roweis , editors, Advances in Neural Information Processing Systems , volume 20 , pages 585 – 592 . MIT Press , 2008 . OpenUrl [23]. ↵ Peter Hall and Hugh Miller . Using generalized correlation to effect variable selection in very high dimensional problems . Journal of Computational and Graphical Statistics , 18 ( 3 ): 533 – 550 , 2009 . OpenUrl [24]. ↵ Patrick J Heagerty and Yingye Zheng . Survival model predictive accuracy and roc curves . 
Biometrics , 61 ( 1 ): 92 – 105 , 2005 . OpenUrl CrossRef PubMed Web of Science [25]. ↵ Wassily Hoeffding . Probability inequalities for sums of bounded random variables . Journal of the American Statistical Association , 58 ( 301 ): 13 – 30 , 1963 . OpenUrl CrossRef Web of Science [26]. ↵ Hyokyoung G Hong , Xuerong Chen , David C Christiani , and Yi Li . Integrated powered density: Screening ultrahigh dimensional covariates with survival outcomes . Biometrics , 74 ( 2 ): 421 – 429 , 2018 . OpenUrl [27]. ↵ John D Kalbfleisch and Ross L Prentice . The Statistical Analysis of Failure Time Data . John Wiley & Sons , 2011 . p. 241 . [28]. ↵ Chenlu Ke and Xiangrong Yin . Expected conditional characteristic function-based measures for testing independence . Journal of the American Statistical Association , 115 ( 530 ): 985 – 996 , 2020 . OpenUrl [29]. ↵ Randall J Kimple and Paul M Harari . The prognostic value of hpv in head and neck cancer patients undergoing postoperative chemoradiotherapy . Annals of translational medicine , 3 ( Suppl 1 ), 2015 . [30]. ↵ C René Leemans , Boudewijn JM Braakhuis , and Ruud H Brakenhoff . The molecular biology of head and neck cancer . Nature reviews cancer , 11 ( 1 ): 9 – 22 , 2011 . OpenUrl CrossRef PubMed Web of Science [31]. Jialiang Li , Qi Zheng , Limin Peng , and Zhipeng Huang . Survival impact index and ultrahigh-dimensional model-free screening with survival outcomes . Biometrics , 72 ( 4 ): 1145 – 1154 , 2016 . OpenUrl [32]. ↵ Runze Li , Wei Zhong , and Liping Zhu . Feature screening via distance correlation learning . Journal of the American Statistical Association , 107 ( 499 ): 1129 – 1139 , 2012 . OpenUrl CrossRef PubMed [33]. ↵ Wanjun Liu , Yuan Ke , Jingyuan Liu , and Runze Li . Model-free feature screening and fdr control with knockoff features . Journal of the American Statistical Association , 117 ( 537 ): 428 – 443 , 2022 . OpenUrl [34]. ↵ Yi Liu , Xiaolin Chen , and Gang Li . 
A new joint screening method for right-censored time-to-event data with ultra-high dimensional covariates . Statistical methods in medical research , 29 ( 6 ): 1499 – 1513 , 2020 . OpenUrl [35]. ↵ Qing Mai and Hui Zou . The kolmogorov filter for variable screening in high-dimensional binary classification . Biometrika , 100 ( 1 ): 229 – 234 , 2013 . OpenUrl CrossRef [36]. ↵ M Murdocca , C De Masi , S Pucci , R Mango , G Novelli , C Di Natale , and F Sangiuolo . Lox-1 and cancer: an indissoluble liaison . Cancer gene therapy , 28 ( 10-11 ): 1088 – 1098 , 2021 . OpenUrl CrossRef [37]. ↵ Ilda Patrícia Ribeiro , Luísa Esteves , Francisco Caramelo , Isabel Marques Carreira , and Joana Barbosa Melo . Integrated multi-omics signature predicts survival in head and neck cancer . Cells , 11 ( 16 ): 2536 , 2022 . OpenUrl [38]. ↵ Yi-Jun Shu , Hao Weng , Yuan-Yuan Ye , Yun-Ping Hu , Run-Fa Bao , Yang Cao , Xu-An Wang , Fei Zhang , Shan-Shan Xiang , Huai-Feng Li , et al. Spock1 as a potential cancer prognostic marker promotes the proliferation and metastasis of gallbladder cancer cells by activating the pi3k/akt pathway . Molecular cancer , 14 ( 1 ): 1 – 14 , 2015 . OpenUrl [39]. ↵ Bernard W Silverman . Density Estimation for Statistics and Data Analysis . Routledge , 2018 . [40]. ↵ Rui Song , Wenbin Lu , Shuangge Ma , and X Jessie Jeng . Censored rank independence screening for high-dimensional survival data . Biometrika , 101 ( 4 ): 799 – 814 , 2014 . OpenUrl CrossRef [41]. ↵ Kuiwei Su , Ying Wang , Hefeng Gu , Lan Ma , and Guihong Xuan . Overexpression of fatty acid desaturase 3 predicts poor prognosis in head and neck squamous cell carcinoma . Medicine , 101 ( 49 ), 2022 . [42]. ↵ Gábor J. Székely , Maria L. Rizzo , and Nail K. Bakirov . Measuring and testing dependence by correlation of distances . The Annals of Statistics , 35 ( 6 ): 2769 – 2794 , 2007 . OpenUrl [43]. ↵ Robert Tibshirani . Regression shrinkage and selection via the lasso . 
Journal of the Royal Statistical Society: Series B (Methodological) , 58 ( 1 ): 267 – 288 , 1996 . OpenUrl CrossRef Web of Science [44]. ↵ Scott A Tomlins , Bharathi Laxman , Saravana M Dhanasekaran , Beth E Helgeson , Xuhong Cao , David S Morris , Anjana Menon , Xiaojun Jing , Qi Cao , Bo Han , et al. Distinct classes of chromosomal rearrangements create oncogenic ets gene fusions in prostate cancer . Nature , 448 ( 7153 ): 595 – 599 , 2007 . OpenUrl CrossRef PubMed Web of Science [45]. ↵ Robert Buchanan Washburn . The optional sampling theorem for partially ordered time processes and multiparameter stochastic calculus . PhD thesis, Massachusetts Institute of Technology , 1979 . [46]. ↵ Holger Wendland . Scattered data approximation , volume 17 . Cambridge university press , 2004 . [47]. ↵ Gunnar Wichmann , Maciej Rosolowski , Knut Krohn , Markus Kreuz , Andreas Boehm , Anett Reiche , Ulrike Scharrer , Dirk Halama , Julia Bertolini , Ulrike Bauer , et al. The role of hpv rna transcription, immune response-related gene expression and disruptive tp53 mutations in diagnostic and prognostic profiling of head and neck cancer . International journal of cancer , 137 ( 12 ): 2846 – 2857 , 2015 . OpenUrl CrossRef PubMed [48]. ↵ Jinfeng Xu , Wai Keung Li , and Zhiliang Ying . Variable screening for survival data in the presence of heterogeneous censoring . Scandinavian Journal of Statistics , 47 ( 4 ): 1171 – 1191 , 2020 . OpenUrl [49]. ↵ Lijing Yao , Yu Gyoung Tak , Benjamin P Berman , and Peggy J Farnham . Functional annotation of colon cancer risk snps . Nature communications , 5 ( 1 ): 5114 , 2014 . OpenUrl [50]. ↵ Peng Zhang , Yan Zhao , Xin Xia , Song Mei , Yixuan Huang , Yingying Zhu , Shuting Yu , and Xingming Chen . Expression of olr1 gene on tumor-associated macrophages of head and neck squamous cell carcinoma, and its correlation with clinical outcome . Oncoimmunology , 12 ( 1 ): 2203073 , 2023 . OpenUrl CrossRef [51]. ↵ Tingyou Zhou and Liping Zhu . 
Model-free feature screening for ultrahigh dimensional censored regression . Statistics and Computing , 27 ( 4 ): 947 – 961 , 2017 . OpenUrl [52]. ↵ Li-Ping Zhu , Lexin Li , Runze Li , and Li-Xing Zhu . Model-free feature screening for ultrahigh-dimensional data . Journal of the American Statistical Association , 106 ( 496 ): 1464 – 1475 , 2011 . OpenUrl CrossRef PubMed [53]. ↵ Hui Zou . The adaptive lasso and its Oracle properties . Journal of the American Statistical Association , 101 ( 476 ): 1418 – 1429 , 2006 . OpenUrl CrossRef Web of Science View the discussion thread. Back to top Previous Next Posted August 14, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following α-KIDS: A novel feature evaluation in the ultrahigh-dimensional right-censored setting, with application to Head and Neck Cancer Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share α -KIDS: A novel feature evaluation in the ultrahigh-dimensional right-censored setting, with application to Head and Neck Cancer Atika Farzana Urmi , Chenlu Ke , Dipankar Bandyopadhyay medRxiv 2024.08.13.24311946; doi: https://doi.org/10.1101/2024.08.13.24311946 Share This Article: Copy Citation Tools α -KIDS: A novel feature evaluation in the ultrahigh-dimensional right-censored setting, with application to Head and Neck Cancer Atika Farzana Urmi , Chenlu Ke , Dipankar Bandyopadhyay medRxiv 2024.08.13.24311946; doi: https://doi.org/10.1101/2024.08.13.24311946 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Oncology Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (302) Cardiovascular Medicine (4453) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15242) Forensic Medicine (30) Gastroenterology (1131) Genetic and Genomic Medicine (6615) Geriatric Medicine (669) Health Economics (1001) Health Informatics (4552) Health Policy (1372) Health Systems and Quality Improvement (1614) Hematology (543) HIV/AIDS (1270) Infectious Diseases (except HIV/AIDS) (15929) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6625) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3344) Ophthalmology (979) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5461) Public and Global Health (9252) Radiology and Imaging (2207) Rehabilitation Medicine and Physical Therapy (1371) 
Respiratory Medicine (1197) Rheumatology (597) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a02ee705de42c13d',t:'MTc3OTk4ODUzNw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00