Generative prediction of causal gene sets responsible for complex traits

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 106,378 characters · extracted from preprint-html · click to expand
Generative prediction of causal gene sets responsible for complex traits | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Generative prediction of causal gene sets responsible for complex traits View ORCID Profile Benjamin Kuznets-Speck , View ORCID Profile Buduka K. Ogonor , View ORCID Profile Thomas P. Wytock , View ORCID Profile Adilson E. Motter doi: https://doi.org/10.1101/2025.04.17.649405 Benjamin Kuznets-Speck a Department of Physics and Astronomy, Northwestern University , Evanston, IL, 60208 b Center for Network Dynamics, Northwestern University , Evanston, IL, 60208 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Benjamin Kuznets-Speck Buduka K. Ogonor a Department of Physics and Astronomy, Northwestern University , Evanston, IL, 60208 b Center for Network Dynamics, Northwestern University , Evanston, IL, 60208 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Buduka K. Ogonor Thomas P. Wytock a Department of Physics and Astronomy, Northwestern University , Evanston, IL, 60208 b Center for Network Dynamics, Northwestern University , Evanston, IL, 60208 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Thomas P. Wytock Adilson E. Motter a Department of Physics and Astronomy, Northwestern University , Evanston, IL, 60208 b Center for Network Dynamics, Northwestern University , Evanston, IL, 60208 c Department of Engineering Sciences and Applied Mathematics, Northwestern University , Evanston, IL, 60208 d Northwestern Institute on Complex Systems, Northwestern University , Evanston, IL, 60208 e National Institute for Theory and Mathematics in Biology , Evanston, IL, 60208 f Chemistry of Life Processes Institute, Northwestern University , Evanston, IL, 60208 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Adilson E. Motter For correspondence: motter{at}northwestern.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract The relationship between genotype and phenotype remains an outstanding question for organism-level traits because these traits are generally complex . The challenge arises from complex traits being determined by a combination of multiple genes (or loci), which leads to an explosion of possible genotype-phenotype mappings. The primary techniques to resolve these mappings are genome/transcriptome-wide association studies, which are limited by their lack of causal inference and statistical power. Here, we develop an approach that leverages transcriptional data endowed with causal information and a generative machine learning model to strengthen statistical power. Our implementation of the approach—dubbed TWAVE—includes a variational autoencoder trained on human transcriptional data, which is incorporated into an optimization framework. Given a trait phenotype, TWAVE generates expression profiles, which we dimensionally reduce by identifying independently varying generalized pathways (eigengenes). We then conduct constrained optimization to find causal gene sets that are the gene perturbations whose measured transcriptomic responses best explain trait phenotype differences. By considering several complex traits, we show that the approach identifies causal genes that cannot be detected by the primary existing techniques. Moreover, the approach identifies complex diseases caused by distinct sets of genes, meaning that the disease is polygenic and exhibits distinct subtypes driven by different genotype-phenotype mappings. We suggest that the approach will enable the design of tailored experiments to identify multi-genic targets to address complex diseases. Significance summary Researchers have long sought to bridge the gap between phenotypes and the genotypes that cause them. This gap remains open because current methods focus on associating phenotypes to a combinatorially explosive number of genotypic possibilities, resulting in a loss of statistical power. We overcome this limitation by employing transcriptomic data from complex, polygenic, human diseases combined with measured transcriptomic responses to gene perturbations in cell lines. The former data allow us to perform generative modeling and dimensional reduction to map transcriptome to phenotype, while the latter incorporate causal information regarding how gene regulation shapes phenotype. We predict sets of genes that explain the emergence of complex traits, which suggest possible multi-target disease treatments. Introduction Complex traits are polygenic, orchestrated by networks of correlated genes that work together to produce phenotypic variation [ 1 , 2 , 3 ]. An outstanding question in the study of such traits is the identification of the specific combinations of gene variants that give rise to the different phenotypic expressions [ 4 , 5 , 6 , 7 , 8 , 9 , 10 ]. Association studies have been performed to search for genetic loci significant to a complex trait phenotype by conducting hypothesis tests on individual genetic loci, from which independent mutations/genes are statistically associated with the phenotype in question [ 11 , 12 , 13 , 14 , 15 ]. We innovate on these techniques by developing a framework to jointly predict sets of genes while accounting for collective behavior not captured by statistical tests on individual genes. Association studies, such as genome/transcriptome-wide association studies (GWAS/TWAS), have been broadly adopted in over 5,700 studies and 3,300 traits as of 2021 [ 12 ]. A common critique of these studies is that they have low statistical power due to the combinatorial explosion in the number of gene sets that must be tested [ 11 , 13 , 16 ]. Post-GWAS/TWAS analyses such as fine-mapping attempt to address this limitation by considering the correlation structure of the genetic data [ 17 , 18 , 19 , 20 , 21 ]. However, they rely on an initial association study to select what variants to fine-map, potentially leaving behind genes that would be significant collectively but have low independent effect size. In the framework presented here, we develop and apply an approach that considers all genes simultaneously, regardless of individual effect size. The framework combines generative machine learning, dimensionality reduction, and constrained optimization ( Fig. 1 ). Download figure Open in new tab Figure 1: Schematic of the proposed approach. Synthetic transcriptomic profiles are generated from a machine learning model that learns from RNA-Seq data on complex traits. The generated data are projected onto eigengenes, which are linear combinations of genes that vary independently and retain important gene correlations that differentiate between complex trait phenotypes. From here, gene perturbations whose transcriptional responses bridge the gap between trait phenotypes are found by constrained optimization. A key aspect of our approach is the use of increasingly available trait-labeled transcriptomic data from bulk and single-cell RNA-Seq experiments, which contends with the biological networks that influence complex traits [ 22 , 23 ]. To better extract patterns from our transcriptional data, we develop the Transcriptome-Wide conditional Variational auto-Encoder (TWAVE), a machine learning model that generates denoised transcriptional profiles for the relevant phenotypes. To understand how regulatory changes affect expression levels, we also incorporate complementary data on transcriptional response to gene perturbations (knockdowns and overexpressions) [ 24 ]. These transcriptional data are dimensionally reduced while maintaining causal information, which we achieve using the concept of eigengenes, to facilitate the optimization over gene perturbations [ 25 ]. Together, these data sources allow us to explore how the regulatory network drives phenotypic changes without prior knowledge of network structure. We focus on the human disease traits in Table 1 . Throughout, we take care to distinguish between traits (e.g., eye color) and their phenotype variants (e.g., blue, brown, green), and we consider traits that have a baseline and variant phenotype. Moreover, we interpret the states associated with each variant as defined by distinct attractors of the gene regulatory network [ 26 , 27 , 28 ]. The problem of identifying the genes that cause a trait phenotype can thus be mapped to an optimization over combinations of transcriptional perturbations that steer transcriptomic states from baseline to variant attractors and vice-versa. The resulting framework reveals groups of gene perturbations that most influence phenotypic variation, pinpointing the molecular underpinnings that determine complex traits. View this table: View inline View popup Download powerpoint Table 1: Gene Expression Omnibus and DepMap datasets. The columns represent (from left to right) the traits considered, the tissues of origin, the type of sequencing, the number of samples of the baseline and variant phenotypes, and the GEO or DepMap series accession numbers. Results Generating complex trait transcriptomes Identifying sets of differentially expressed genes is complicated by the fact that transcriptional measurements include a large number of genes and a comparatively small number of samples. It is precisely this feature that makes it challenging to distinguish real biological differences from random variance when using statistical tests that treat genes as independent variables. Our method recognizes that genes operate in concert rather than independently to orchestrate cell function, which transforms the problem into learning an effective representation of these relationships from data. Fig. 2 presents TWAVE, which solves this problem by looking at the data as a whole, using a neural network encoder to embed high-dimensional gene expression profiles onto a low-dimensional latent space ( Z ), where data points can be classified and new representative points can be generated. Points in the latent space, including newly generated ones, are decoded back up to the full gene expression space as in Fig. 2A . The model (consisting of the encoder, decoder, and latent space classifier) is trained with a combination of three loss terms. The first term accounts for how accurately the autoencoder can reconstruct the original data. The second term is the Kullback-Leibler (KL) divergence loss, which regularizes Z so that each dimension contributes roughly equally to the overall variance in the latent space. The third term is a classification loss to mold the structure of the latent space so that different phenotypes of the trait (baseline and variant for example) can be distinguished by their transcriptional states, as detailed in Materials and Methods. Download figure Open in new tab Figure 2: TWAVE construction and validation, presented for the inflammatory bowel disease trait. ( A ) TWAVE architecture, where gene expression profiles are projected onto a low-dimensional latent space ( Z ) and subsequently reconstructed with a neural network decoder. ( B ) First two principal components (PC) of the latent space Z , showing a clear separation of complex trait phenotypes (baseline and variant). A linear interpolation between the means of the two populations in the latent space (blue-to-red stars) falls along the first principal component. ( C ) Comparison between original (blue) and TWAVE-reconstructed (red) distributions of gene expression for 4 different genes, conveying strong agreement. ( D ) Receiver operating characteristic for significant gene associations in the reconstructed versus original data, confirming that TWAVE retains significant associations found in the original data. Fig. 2B shows that transcriptional measurement associated with the baseline and variant populations (blue and red clusters) segregate to coordinates along the first principal component, which is a natural outcome of the latent space learned by TWAVE. Here, we use the representative case of inflammatory bowel disease to illustrate the construction of TWAVE, but we observe similar performance among the other traits we considered in Table 1 . The figure also shows that a linear interpolation between the two clusters in Z -space lies along the first principal component, which accounts for the largest fraction of variation in the data. Fig. 2C demonstrates a close agreement between the original gene expression distributions and the reconstructions from TWAVE. To test whether our auto-encoder retains associations between genes and the complex trait of interest, we compare the differentially expressed genes identified by t -tests on both the original and reconstructed expression profiles using the area under the receiver operating characteristic curve (AUROC). Fig. 2D shows this curve, which is constructed by: 1) arranging both sets of genes from smallest to largest in terms of their p -values, 2) varying the threshold at which genes are statistically significant, and 3) counting the fraction of significant genes in the original data that are selected by TWAVE (true positive rate) as a function of the fraction of genes selected by TWAVE that are not significant in the original data (false positive rate). For all traits, we find that the AUROC approaches 1, indicating that the sets of differentially expressed genes identified by TWAVE and within the original data are nearly identical. Full technical details concerning the construction of TWAVE are provided in Materials and Methods. For each of the complex trait datasets described in Table 1 , we use TWAVE to estimate the distributions of the transcriptional data in the latent space arising from the baseline and variant phenotypes, while retaining the fundamental features that distinguish the two populations. We then draw points from these distributions in the latent space, decoding them as depicted in Fig. 2A . By choosing an equal number of each phenotype (baseline and variant), estimates of the distributions from the data are equally precise for each of these trait phenotypes. It is instructive to compare against the method of “extreme pseudo-sampling,” in which the latent space of a variational auto-encoder (VAE) is sampled randomly [ 29 , 30 ]. Our method is different from extreme pseudo-sampling in two crucial ways. First, TWAVE employs a conditional VAE (i.e., it includes a latent space explicitly trained to classify between baseline and variant). Second, TWAVE draws from a probability distribution in the latent space associated with the trait phenotype label instead of drawing randomly from any state in the latent space. Overall, TWAVE allows us to make the most from limited transcriptional data, drawing new representative samples in a way that would be unfeasible without generative modeling. Causal dimensions of complex trait variation We select the eigengenes that are most determinative of the trait phenotypes (i.e., are causal). Conceptually, eigengenes are weighted combinations of genes that vary in concert within the eigengene, but any given eigengene can vary independently of the others. Mathematically, they are eigenvectors of the TWAVE-generated gene expression matrix Y and form an orthogonal basis e i in gene expression space [ 25 ]. This basis corresponds to the columns of the unitary matrix U = [ e 0 , …, e i , …, e n ] in the singular value decomposition Y T = U Σ V T , where Σ is a diagonal matrix of singular values in descending order and V T contains the left eigenvectors of Y T . For each dataset, we perform singular value decompositions of the m × n matrix Y , where m is the number of sample expression profiles in the TWAVE dataset and n the number of genes in each sample. We proceed to determine which eigengenes are most likely to capture differences between the baseline and variant trait phenotype by adapting Bayesian fine-mapping [ 17 ] to eigengenes. This procedure seeks a small set of r eigengenes that can accurately distinguish between the phenotypes according to the posterior inclusion probability, which quantifies how well a proposed set of eigengenes explains the data (i.e., how causal the set is). The fine-mapping procedure starts by projecting the data onto the d = 200 eigengenes with the largest singular values X = Y U n × d . This choice of d ensures that the set of eigengenes from which the causal set is selected accounts for the large majority of the variance, as shown in Fig. S1 . A logistic regression model over eigengenes is then fit using the expression profiles and associated trait phenotype labels, achieving high accuracy, F1-score, and recall, for all datasets analyzed. We also show that a maximum likelihood estimator for the posterior distribution of causal eigengenes can be formed from: 1) the odds ratios ζ = log[ ρ/ (1 − ρ )] from the logistic regression, where ρ is the probability of a data point belonging to the variant class; and 2) the projected expression matrix X . To reduce from d to r eigengenes, we perform Markov chain Monte Carlo (MCMC) sampling to maximize the likelihood of causal eigengenes given the regression data. For each of these d eigengenes, we evaluate the posterior inclusion probability that each eigengene is causal p ( e i causal | data = { X, ζ }) and build sets from the top r = 50 causal eigengenes ( r = 10 for allergic asthma) with the largest posterior inclusion probability. We show p ( e i | X, ζ ) for the first d principal components of Y in Fig. 3A , ordered from non-causal ( p = 0) to causal ( p = 1). Download figure Open in new tab Figure 3: Dimension reduction by selecting the most causal eigengenes for the inflammatory bowel disease trait. ( A ) Posterior inclusion probability for eigengenes to be causal arranged in descending order of their singular values. ( B ) Classification accuracy from logistic regression on data projected onto an increasing subset of the top d eigengenes arranged in descending order of the posterior probability of being causal (blue), in descending order of the singular values (gray), and randomly (red). As a validation test for the selected eigengenes, we perform the logistic regressions on X summarized in Fig. 3B . The regression accuracy for including data projected onto the first i eigengenes sorted in order of causality p ( e i | X, ζ ) quickly climbs to above 0.9 within r top eigenvectors. This validates our choice of keeping r eigengenes in our dimensionality reduction. On the other hand, including eigengenes in principal component order (i.e., ordered by the fraction of variance that aligns along each eigenvector), yields significantly poorer classification results. Arranging the eigengenes randomly can actually produce a better result than arranging them in order of the singular values when keeping less than 60 eigengenes, but principal component ordering outperforms the random one as the dimension of this reduced space is increased. Complex trait transitions via eigengene perturbations Constrained optimization To implicate genes responsible for transitioning between the baseline and variant phenotypes, we explore extensive data on transcriptional responses to gene perturbations. Specifically, we define a perturbation response matrix, B , whose rows represent eigengenes in the original dataset and whose columns are average transcriptional responses to a transcriptional perturbation (one response profile for each column). This matrix consists of 10% overexpressions and 90% knockdowns (most of the latter are implemented through RNA interference), as indicated in Dataset S1. With this matrix in hand, we investigate which combinations of perturbations can cause the baseline transcriptional profile to match the variant, and vice-versa. Formally, this question is answered by solving the following constrained optimization problem: where and || · || denotes the Euclidean distance. The choice of Euclidean distance reflects our assumptions 1) that there is a single phenotype for each transcriptional state and 2) that differences in the expression of each eigengene are equally likely to contribute to phenotypic differences. We discuss alternative choices of the distance metric in the Supplementary Information (SI). Here, perturbations add with weight u in causal eigengene space to transition from the state x baseline to the state x variant ( Fig. 4A ). Before considering transitions between individual states in the baseline and variant clusters, we consider transitions between average baseline and variant states. Download figure Open in new tab Figure 4: Attributing complex trait phenotypes to sets of genes. ( A ) Identification of gene sets that differ between baseline and variant trait phenotypes, where blue and red dots represent individual states. A state in the baseline cluster (blue background) transitions to a state in the variant cluster (red background) upon targeted transcriptional perturbations. ( B ) Coefficient of determination for controlled transitions between phenotypes of complex traits, where R 2 close to 1 indicates that the final transcriptional state approaches that of the target phenotype. Since we expect the baseline and variant phenotypes to be stable with respect to the fluctuations inherent to transcription, the closer we approach states known to belong to a given phenotype transcriptionally, the more likely that state is to exhibit that phenotype. We quantify this likelihood with the coefficient of determination, where R 2 ≤ 1 and R 2 close to 1 indicates high efficacy. In particular, R 2 > 0.5 indicates that the distance between the two states has been at least halved. The relationship between distance and cell behavior becomes less precise in the full high-dimensional expression space because there are many points that are a given distance away from a target point. Thus, reducing the number of dimensions of our problem by working with a select set of eigengenes is crucial, which is implemented through our choice of matrix B . We use the Python function minimize from SciPy (which implements the L-BFGS-B method) to solve the constrained optimization problem in Eq. 1 for all of our RNA-Seq datasets. The optimal u * yields R 2 ≈ 1 for all complex traits we consider, namely allergic asthma, inflammatory bowel disease, food allergy, cancer metastasis (where baseline and variant refer to primary and metastatic tumors), age related macular degeneration, type-1 diabetes, and non-small cell lung cancer ( Fig. 4B ). This is further confirmed by examining the selected perturbations for individual trait phenotypes and noting that they generally point towards the phenotype expression in question. Table 2 shows the genes with the top 12 perturbation weights in u * for allergic asthma along with a brief annotation of their function. Many of these top-selected genes have been implicated in allergic asthma, lung and airway function, and inflammation and immunity. Mutations in BMPR2 , for example, have been shown to cause asthma-like symptoms and pulmonary hypertension in response to mild antigens in the airway [ 34 , 35 ]. In addition, TCF7 promotes T-cell differentiation to Th2 or memory T-cells [ 36 ], consistent with allergic asthma being the result of immune system dysregulation. Other identified genes, such as TARDBP, TENT4B , and HNRNPL , have not been previously implicated in allergic asthma. However, these genes are associated with RNA metabolism and modifications including alternative splicing and poly-A tail alteration, which in turn are related to immune response [ 46 ]. In particular, TARDBP ( TDP-43 ) has been shown to regulate alternative splicing and alternative polyadenylation in CD8+ T-cells, and specific RNA splicing and polyadenylation events depend on the presence of TARDBP during CD8+ T-cell co-stimulation [ 31 ]. TENT4B is involved in mRNA stabilization, influencing B-cell proliferation and the cellular response to viral infections [ 32 ], whereas HNRNPL participates in the regulation of inflammatory responses, particularly through its interaction with long non-coding RNAs and its role in regulating TNF-α transcription [ 44 , 45 ]. We performed the optimization between the average baseline and variant states for all 6 other complex traits shown in Fig. 4B , and refer the reader to Supplementary Tables S1 - S6 for the top selected genes for each trait. View this table: View inline View popup Download powerpoint Table 2: Allergic asthma transcriptional perturbations (GSE96783) Optimization for individual baseline-variant states We perform optimizations across individual baseline and variant pairs to account for the fact that the measured transcriptional signatures of a given a phenotype vary heterogeneously across cell samples and individuals ( Fig. 4A ). An analogous approach in TWAS would require sub-sampling an already small number of measurements, leading to a large uncertainty in the variance and an inability to detect baseline-variant differences [ 47 ]. By recasting the hypothesis test as an optimization problem, we avoid this issue using information on how the regulatory network responds to perturbations. This reformulation allows us to investigate how the gene perturbations responsible for a trait may change across individual baseline-variant pairs. We break down our strategy into two steps: 1) find the the optimal set of perturbations for a large number of baseline-variant state pairs and 2) compare the observed co-occurrence of perturbation pairs with a null model. The null model is designed to preserve both the frequency with which each perturbation is selected and the number of perturbations needed for each pair of states. The transition between any x i and x j can be induced by applying the perturbation where the variant state is x i and the baseline state is x j . We solve this optimization problem over a range of different λ values, taking the largest λ (the sparsest solution) such that R 2 = 1 − D ( u ij ) /D ( 0 ) > 0.99. This is repeated for N = 2500 randomly selected pairs in the forward (baseline-to-variant) and reverse directions. We then construct a bipartite network represented by the (adjacency) matrix where the columns of A are perturbed genes and the rows are different accepted i, j pairs. We take the dot-product between the columns µ and ν of A to get the frequency f µν at which the corresponding perturbed genes co-occur in the same u ij : To identify statistically significant pairs µ, ν , we compare f µν to the frequency at which µ and ν occur together in a null model of a random maximum entropy graph whose row and column sums are fixed to that of A [ 48 , 49 ]. The expected co-occurrence frequency in the null model is where p ij is the probability of an edge existing between nodes i and j of the maximum entropy graph (see Materials and Methods). In addition to ⟨ f µν ⟩, the standard error σ µν can be approximated via error propagation using the fact that the probability of edge occurrences are Bernoulli random variables. From these ensemble statistics a z-value can be constructed as z µν = ( f µν − ⟨ f µν ⟩) /σ µν . Now that we have established a null model for the co-occurrence of perturbations in causing/reversing the variant behavior, we can examine the network formed by the statistically significant co-occurrences that deviate from the maximum entropy model. To determine where co-occurrences begin to deviate from the null model, we identify the set of significant pairs with high z µν above a threshold defined by inspection of the quantile-quantile plot (see SI Fig. S2 ). Application to allergic asthma In the case of allergic asthma, pairs with z µν > 20 were kept for analysis. The corresponding network is depicted in Fig. 5A for the baseline-to-variant transition (i.e., the onset of asthma). Each node represents a perturbed gene within a significantly co-occurring pair and is color-coded according to the nature of the perturbation (knockdown or overexpression). Edges represent significant co-occurrences and are color-coded by the expression correlations across responses between the genes they connect. We find that genes with many connections in this network representation, such as ADAR, PAN3 , and MAPK1 have been implicated in allergic asthma before [ 50 , 51 , 52 ]. We also optimized over gene perturbations in the reverse direction, as shown in Fig. 5B . Among the genes featured in this network, we again find several linked to allergic inflammation: SUZ12 inhibition is associated with the reduction of allergic inflammation through is role in the protein complex PRC2, JAK2 inhibitors have been proposed to alleviate asthma because of JAK2 ‘s role in the JAK-STAT signaling pathway, and knockdown of MYC has been shown to repress ILC2 (immune cell) activity, which in turn reduced airway inflammation and immune hyperresponsiveness [ 53 , 54 , 55 ]. Download figure Open in new tab Figure 5: Genes perturbed in transitions between baseline and variant clusters for allergic asthma. (A, B ) Gene perturbation co-occurrence networks for forward (A) and reverse (B) transitions. Edges appear between a pair of perturbed genes if they frequently co-occur in successful transitions (i.e., they both occur with high frequency in u ij compared to a maximum entropy graph null model). The edges are colored by gene-gene correlation in the perturbation response dataset. The nodes are sized proportionally to the number of edges and color-coded according to whether the perturbation is a knockdown (blue) or overexpression (red). ( C ) Histograms of the number of gene perturbations required to induce the forward transition (red) and the reverse transition (blue). Note that many of the genes in the forward and reverse co-occurrence graphs in Fig. 5A, B are distinct . In a dynamical network, reversing a perturbation does not generally restore the state of the system, a phenomenon that is accounted for by the bounds placed on u in Eqs. (1) and (4). These bounds may prevent the same genes from being selected in the forward and reverse directions, which within our approximation reflects the fact that the responses to knockdowns and overexpressions are not exactly anti-aligned. In our case, this phenomenon gives rise to the observation that the genes causing a given trait phenotype are not necessarily the ones that mitigate it. Moreover, as shown in Fig. 5C , the number of perturbations required to make the transition in the forward and reverse directions are also different. Remarkably, it takes a combination of fewer single-gene perturbations to induce a transition in the reverse direction (i.e., from the variant to the baseline state) than in the forward one. This may be because it takes more perturbations to go to a particular variant state than to a generic baseline state (“all roads lead to Rome” but not necessarily the inverse). A second possibility is that there is an overrepresentation of certain genes in the library of gene perturbations (see Materials and Methods). Importantly, both genes involved in the forward direction and genes involved in the reverse direction can be associated with the trait because they can promote or reverse a change from the baseline to variant phenotype. For example, the involvement of MYC in the reverse direction may be due to its known function in promoting plasticity between transcriptional states and amplifying gene expression overall when overexpressed [ 56 , 57 ]. Moreover, pluripotent stem cells with MYC knocked-down have been shown to decrease allergic reactions in mice by inhibiting T-helper cell immune reaction [ 58 ]. Aberrant translation of the CEBPA gene, which is implicated in the forward direction, has also been associated with causing bronchial smooth muscle cells—a tissue that plays a key role in asthma—to proliferate faster [ 59 ]. Notably, there are genes featured in both the forward and reverse co-occurrence networks that play roles in both promoting and attenuating allergic asthma. For example, FOXO1 overexpression in mice has been shown to promote allergic asthma through macrophage polarization, Th9 (T-helper 9 cell) differentiation, and regulation of IRF4 expression, though inhibition of FOXO1 led to attenuation of immune response and asthmatic inflammation through regulation of IRF4 [ 60 , 61 ]. Likewise, the role of JUNB depends on the state of other transcription factors. Though JUNB significantly influences Th2 (T-helper 2 cell) differentiation and the production of Th2 cytokines, promoting allergic inflammation, it also plays a role in maintaining homeostasis. Specifically, knockdown of JUNB limits excessive inflammation by modulating regulatory T-cell differentiation [ 62 ]. We also find that, except for BMPR2 and TENT4B , the genes appearing in the co-occurrence graphs from the optimization between individual baseline-variant states are distinct from those identified by the average variant-baseline optimization ( Table 2 ). The apparent contrast between optimizing over average transcriptional states and individual pairs highlights the fact that there can be multiple paths (defined by different perturbation sets) through which the disease progresses and is mediated and that these paths are not necessarily the ones connecting the average transcriptional states. Consequently, the distinct co-occurring genes in Fig. 5 could potentially relate to different mechanisms playing a role in allergic asthma. We emphasize that such a co-occurrence structure cannot be inferred from studying population averages alone, as typically done in GWAS/TWAS [ 47 ]. It is instructive to consider the transcription factors that regulate the genes in our perturbation response library ( upstream transcription factors). Not all upstream transcription factors have transcriptional responses measured in our library, so we use the Enrichr gene set enrichment database to find them. We focus on the case where the upstream factors simultaneously regulate both genes in a co-occurring pair so that a single transcription factor could explain their combined influence on the trait phenotype. For instance, GATA2, TET2 , and TWIST1 are enriched for more than one gene co-occurrence pair and are known to influence allergic asthma [ 63 , 64 , 65 ]. The most parsimonious explanation for the enrichment of these transcription factors is that they exert their influence on the phenotype (at least in part) through the genes that appear in our perturbation response dataset. The enrichment of these additional factors shows that we may be able to infer trait-associated genes outside our dataset. Learning across different contexts with TWAVE Thus far, we have considered 7 complex traits, each based on data from a unique tissue type and previously examined by differential expression. Next, we generalize to contexts where 1) the data (and trait phenotype) in question are associated with multiple disparate tissues and 2) the trait phenotype is caused by a mutation that affects the function of the protein instead of its transcriptional expression. We consider a phenotype that manifests itself through multiple tissues by studying the trait of cancer metastasis in the cancer dependency map (DepMap) dataset [ 66 ]. Because these pan-cancer data come from many different cell types, there are confounding variables that render a simple differential expression analysis unable to detect any differentially expressed genes common to the process of metastasis across all tissues. Indeed, we found no statistically significant associations by performing such an analysis. However, using TWAVE, we are able to disentangle the effects of the confounding variables associated with different disease contexts (e.g., cell type, tissue origin, tumor location, and systematic effects) to uncover the common biological mechanisms driving cancer metastasis (see SI Fig. S3 ). In this case, we again find that our co-occurring perturbation networks contain many genes previously found to promote or mitigate cancer metastasis, including NF1 knockdown, SOX5 overexpression, CBFB overexpression, TOX4 knockdown, PROX1 over-expression, and EHF knockdown [ 67 , 68 , 69 , 70 , 71 , 72 ]. An Enrichr search for the co-occurring genes reveals out-of-sample upstream transcription factors that are known to affect metastasis as well, such as STAT3 and CTCF [ 73 , 74 ]. Although both of these genes appear to be essential for growth [ 75 , 76 ], which limits opportunities to perturb them in cell-line experiments, one can detect their influence on trait phenotype variation through the pairs of genes in the co-occurrence graph that they regulate. To examine the scenario in which a causal mutation affects a gene’s protein function but not its transcriptional expression, we consider maturity-onset diabetes of the young type 3 (MODY3). MODY3 is known to be a largely monogenic trait caused by mutations to the transcription factor HNF1A that impact beta cell function and diabetes in general. Since HNF1A is one of the genes perturbed in our perturbation response matrix B , MODY3 provides an excellent example where the solution is known. Documented mutations of HNF1A alter its protein function, which in turn alters the expression of other genes as opposed to its own. In fact, HNF1A overexpression appears in 30.2% of baseline-variant pair optimizations. This is compared to the top overall perturbation, NEAT1 knockdown, appearing 57.4% of the time. However, HNF1A , as opposed to NEAT1 , also appears in the forward co-occurrence network (see SI Fig. S4 ). Of the top 13 gene perturbations, three of them— MED1 knockdown, HNF1A overexpression, and GATA2 overexpression—were also in the forward co-occurrence network and have been implicated in diabetic function. This narrows down the large list of possible genes to a number that could be tested in low-throughput lab experiments. For instance, MED1 knockout mice show a heightened sensitivity to insulin and an improved glucose tolerance [ 77 ]. All of the other genes in the co-occurrence network exhibit transcriptional responses that are highly positively correlated with that of HNF1A , meaning that their corresponding column vectors in B all point in the same direction. This pattern is markedly different from those observed in the complex traits above, in which we also find transcriptional responses that are negatively correlated and uncorrelated. Among the perturbations correlated with HNF1A overexpression is ALOX5 overexpression, which also impacts beta cell function in diabetes via increased insulin resistance [ 78 , 79 ]. Finally, we directly compare the genes identified by our method, differential expression, and TWAS in the case of inflammatory bowel disease in SI Fig. S5 . We find that the only 8% of the differentially expressed genes identified in the dataset [ 80 ] overlap with the TWAS genes, which is reflective of the challenges inherent to reconciling results produced by different approaches. Applying our method, we find that 36% of the genes participating in over 54% of the solutions to Eq. (4) overlap with TWAS. We emphasize that this improvement by our method occurs because it accounts for the downstream impacts of the gene perturbations through the B matrix, which naturally filters out spurious differentially expressed genes. Discussion The approach presented here leverages existing transcriptomic data to address the challenges that complex traits pose to traditional mutation-association screening methods [ 13 ]. We implement this by identifying generalized cellular pathways (eigengenes) relevant to a complex trait and by calculating optimal sets of gene perturbations whose transcriptional responses change the combined state of these pathways from one phenotype to another. Our approach accounts for limited data, heterogeneity within phenotypes, confounding biological variation, and combinatorial explosion in gene sets in ways that traditional methods cannot [ 13 ]. In particular, limited data is addressed by our development of the generative model TWAVE; inference of heterogeneous pathways is illustrated in the example of allergic asthma; common drivers of cancer progression across biological subtypes are found in the DepMap example; and finally, a combinatorial explosion is avoided by casting the identification of causal genes as an optimization problem. The approach can also implicate candidate genes through their known downstream effects on the gene regulatory network obtained from experiments. An overarching goal of our approach is to narrow the scope of candidate gene combinations to a number amenable to targeted low-throughput experiments. As in previous successful applications of Boolean networks [ 81 , 82 , 83 , 84 , 85 , 86 ] and principal component-based techniques that uncover low-dimensional structure in gene regulatory networks [ 87 ], we aim to uncover causal influences. The main advantage of our approach is that it can generate predictions solely from publicly available data without explicit network reconstruction or specific knowledge concerning the gene functions and interactions, making it broadly applicable. It is constructive to reflect on the key assumptions underlying our approach. First, we assume that cellular traits are well reflected by gene expression, which is validated by the fact that we and others [ 24 , 88 , 89 , 90 , 91 , 91 , 92 ] can accurately classify gene expression profiles by their phenotypic labels. While transcriptional data do not directly account for post-transcriptional/translational regulation [ 93 ], they do account for downstream impacts on gene expression. Nevertheless, it is straightforward to incorporate multi-omic [ 94 , 95 , 96 , 97 ] data to directly account for mechanisms beyond transcription. Second, we assume that transcriptional responses combine additively, which has been demonstrated to be a good approximation to control cell behavior [ 24 ]. Recent work has applied VAEs to the forward problem of estimating nonadditive transcriptional responses to combinatorial perturbations [ 98 ], raising the possibility of going beyond the additive assumption in the future. Integration of this technique into our method to solve the inverse problem of mapping causes to trait phenotypes, as considered here, would require targeted experiments to train the VAE to recognize nonadditivity. Finally, we assume that our library of transcriptional responses is sufficiently large and diverse to comprehensively capture the impact of genes. Notwithstanding, we demonstrate that enrichment analysis [ 99 , 100 ] can implicate upstream transcription factors that are not included in our library. The success of our approach has several far-reaching implications. First, it suggests that cell line perturbations in vitro are informative of the gene behavior in situ [ 101 ]. Second, it shows that the genes needed to drive a phenotypic change are distinct from those that reverse the change. This is a consequence of our optimization model acknowledging the fundamentally different network impacts of reversing a knockdown versus overexpressing a given gene, which is consistent with irreversibility to transient perturbations observed in gene regulatory networks [ 102 , 103 ]. Third, the success of TWAVE suggests that gene expression can be represented in a low-dimensional space [ 87 ], which might be a general feature across many complex network systems [ 104 ]. Ultimately, our approach provides a new tool to investigate genotype-phenotype relationships in complex traits, which is applicable across a range of organisms and traits. In humans, our approach also lays a new ground for the design of multi-target disease treatment strategies. Materials and Methods TWAVE Architecture We employ a conditional variational autoencoder in PyTorch, which is tailored for the analysis of genomic data and leverages class labels (baseline or variant) to impart enhanced interpretability and classification precision. The architectural blueprint consists of three neural network: an encoder, a decoder, and a classifier. The encoder comprises two fully connected hidden layers, each embedding 256 and 128 units, respectively, with Rectified Linear Unit (ReLU) activation functions. The input layer takes a single gene expression profile, length aligned with the number of genes, and is concatenated with pertinent class labels. This design not only captures the intricate gene expression patterns but also incorporates class-specific information for a more nuanced latent representation. The decoder component reconstructs the input RNA-Seq data through a series of ReLU-activated layers, culminating in a sigmoid activation function. This reconstruction process aims for the faithful reconstruction of the input profile from the encoded latent space. Simultaneously, the classifier, featuring a linear layer, facilitates class predictions grounded in the extracted latent representation. TWAVE Training and Sampling During the training phase, a set of loss functions drives the optimization process. The combination of reconstruction loss and Kullback-Leibler divergence loss is deployed, ensuring a balance between accurate data reproduction and the regularization of the latent space. The training regimen spans 500−10 4 epochs, depending on the dataset, with mini-batches consisting of 50−200 samples. We use an Adam optimizer, with a learning rate set at 0.0001. For sampling, we generate synthetic profiles within the latent space of TWAVE. Latent vectors are obtained by sampling from clusters corresponding to distinct class labels. The latent vectors corresponding to different classes are then clustered, and marginal distributions of baseline and variant profiles in the latent space are extracted using kernel density estimation (KDE). The band-width parameter for our KDE is set to 0.2 to control the smoothness of the estimated density. We then sample new latent space points from these two clusters using our density estimator, and these points are decoded to the full gene expression space with the decoder, producing synthetic profiles for both class labels. Bayesian Inference of Causal Eigengenes We perform Bayesian inference of causal eigengenes from the gene expression data projected onto the top d = 200 eigengenes, X = Y U n × d , where we observe that these eigengenes account for over the overwhelming majority of the variance in all traits. In Fig. S1 , we show that the remaining percent variance after d = 200 eigengenes is less than 1% for all traits except (lung) cancer metastasis and type 1 diabetes, where the remaining eigengenes account for about 22% and 23.5% of the total variance, respectively. Our choice d trims the number of eigengenes to a tractable one that allows for relatively fast Monte-Carlo optimization. Our inference procedure adapts the fine-mapping of causal variants in GWAS [ 17 ] to eigengenes. The first step of the fine-mapping is a logistic regression using class labels. We fit the log-odds ratios ζ to the data X with effect sizes β : where ϵ is a noise vector and ρ denotes the probability that the (binary) label is 1. In this scheme, we seek to infer an optimal d -dimensional vector of causal effects γ = ( γ i ), γ i ∈ {0, 1}, which take on values of zero or one depending on whether eigengene i is causal or not. It can be shown that the likelihood of ζ, X given γ is where ξ = X T ζ/d is the z-value, is the eigengene correlation matrix, Σ γ = ds 2 diag( γ ), and s is a hyper-parameter. We set an initial s = 0.05 as in FINEMAP [ 17 ]. Taking a uniform prior on the number of causal effects k , we can then express the posterior distribution of causal effects given our data as We use MCMC to optimize this distribution over s and γ , though other techniques such as FINEMAP or the sum of single effects (Susie) model could be employed as well [ 19 ]. Posterior inclusion probabilities are calculated as an average over MCMC samples of the posterior distribution where N is the number of samples. The Monte Carlo steps consist of flipping a causal effect (eigengene) on or off at random and we attempt 10 5 steps with a burn-in period of 10 3 steps, according to a Metropolis acceptance criterion. Maximum Entropy Null Graph Model As a null model for our gene concurrence graph, we construct a maximum entropy graph constrained by the row and column sums of our matrix A . The null model G maximizes the entropy S ( G ) = − ∑ G P ( G ) ln P ( G ), where P ( G ) is the canonical distribution P ( G ) ∝ exp[−∑ i β i k i − ∑ i γ i k i ]. Here, k i =∑ j A ij is the row sum and κ i =∑ j A ij is the column sum of node i , whereas β i and γ i are the respective Lagrange multipliers that enforce k i and κ i to be fixed as all other degrees of freedom equilibrate. The row and column sums sequences must follow the maximum entropy conditions These conditions can be solved for the Lagrange multipliers iteratively as and where l denotes the iteration and r ( x, y ) = 1 / ( e − y + e x ). From here, the link probabilities in Eq. 7 can be computed as Complex Disease Data Curation We obtained our 7 datasets from GEO (see Table 1 ). The expression matrices, originally in raw counts, are curated keeping those genes and samples that meet the following criterion: average counts in a gene > 5 and total counts in a sample > 10 5 . The data was normalized to the number of transcripts per million ( N TPM ) using reference transcript lengths mapped from gene ENSEMBL id, and the final expression data was saved as log 10 ( N TPM + 10 −10 ) + 10. Labels, whether they are baseline or variant, were one hot encoded. Transcriptional Response Library The transcriptional response data was curated as described in Ref. [ 24 ]. The list of GEO series accession numbers and associated gene perturbations are listed in Dataset S1. The inclusion of a knockdown in the library does not imply the inclusion of its overexpression and vice versa. Data, Materials, and Software Availability Raw gene expression counts data are available through GEO. Relevant accession numbers are included in Materials and Methods. The software and processed data for employing the method are available from the GitHub repository: https://github.com/biophysben/Generative-prediction-of-causal-gene-sets-responsible-for-complex-traits . Source data for training TWAVE are stored on Dryad at https://doi.org/10.5061/dryad.s4mw6m9hf . Author contributions B.K.S., T.P.W. and A.E.M. designed research; B.K.S. and B.K.O performed research; B.K.S. analyzed data; and B.K.S., B.K.O., T.P.W., and A.E.M. wrote the paper. Author Declaration The authors declare no competing interests exist. Supporting Information for SI Overview This Supplementary Information file contains a detailed discussion of the choice of the Euclidean metric underlying our optimization, figures that support the analysis presented in the main text, an extended description of the genes associated with changes in trait phenotypes beyond those discussed in the main text, and figures analyzing the results for the pan-cancer, MODY3, and inflammatory bowel disease (IBD) traits. The text contains a detailed discussion of the genes associated with each trait in Table 1 that is not explicitly discussed in the main text, with the corresponding genes laid out in Supplementary Tables S1 - S6 . Supplementary Figs. S1 and S2 contain numerical and statistical analysis relevant to the identification of causal genes and significant gene pairs, respectively. The results for co-occurring genes in pan-cancer are contained in Supplementary Fig. S3A,B , and the corresponding results for IBD are in Fig. S3C,D . Supplementary Figs. S4 - S5 concern a comparison of our method with existing ones in the cases of MODY3 and IBD. The first trait phenotype is purported to be caused by a mutation to a single gene, and in Supplementary Fig. S4 we show that the genes identified by our method overlap with the known downstream effects of that mutation. IBD contains over 1,000 TWAS-associated genes, 44 of which overlap with our perturbations. In Supplementary Fig. S5 , we show that the occurrence frequency of a perturbation is associated with the gene being identified by TWAS at a much higher rate than for differentially expressed genes. Justification of the Euclidean metric In this section, we motivate why we choose to formulate optimization in Eqs. (1)-(2) and (4) of the main text in terms of a Euclidean distance between the baseline and variant states. We recall that the eigengenes are selected for causality via the fine mapping procedure, and thus the Euclidean norm posits that any selected eigengene could equally lead to a change in phenotype. We comment on four representative alternative choices for the distance: Minkowski distances, which allow for p —the polynomial order of the norm—to be different from 2 (the Euclidean case); scaled Euclidean metrics, which allow for the different dimensions to be differently weighted; similarity metrics, such as the cosine distance, which characterize the alignment between state vectors; and arbitrary curvilinear metrics, which are well-suited for measuring distance in curved spaces. Type (1) notions of distance satisfy the triangle inequality when p ≥ 1, so we focus on those cases. When p = 1, the decrease in the objective function is piecewise constant, and the optimal points occur at cusps, but the location of optimal point remains the same. Thus, the perturbations identified by the method presented in the paper is expected to yield similar results, though the value of the regularization parameter λ could change. At the same time, the optimization would likely be more expensive due to the large number of non-differentiable points. For p > 1, the objective function is smooth, but the optimal point is again the same, so we expect the selected genes to remain largely similar for other values of p , although some specifics may change in the process of optimization. Type (2) notions of distance can readily incorporate the existing data and/or trait-specific biological knowledge to reweight the causal eigengenes. One natural weighting is to rescale the projection along each causal eigengene by the (dataset-wide) standard deviation of expression across that same eigengene. This choice would remove any extensive effects of the selected eigengenes (i.e., eigengenes with more weight among highly-expressed genes would be treated on equal footing with lower-expressed genes). In general, reweighting implicitly reflects a choice that the absolute change in expression of certain genes is more important to the phenotype than others. To remain unbiased as to which absolute changes in gene expression influence phenotype, we did not employ such a reweighting here. Type (3) notions of distance concern only the direction, but not the magnitude of the changes. However, as far as cell behavior is concerned, both the magnitude and direction of the change are expected to be important. We explicitly account for this by restricting the range of perturbation strengths we allow in Eqs. (1)-(2) and (4) of the main text, and we find cases in which multiple perturbations that point in almost the same direction in transcriptional space are needed to realize a change in the cell behavior. Type (4) notions of distance require the specification of a global metric tensor at all points in transcriptional space, which implicitly makes substantial assumptions about the transcriptional regulatory network that are difficult to verify with the existing data. As a result, we eschewed implementing metrics of this type in Eqs. (1)-(2) and (4) of the main text. Speaking loosely, the Euclidean metric is a parsimonious choice that treats the causal eigengenes on equal footing. Genes implicated in complex traits We now discuss the remaining complex traits mentioned in Table 1 but not otherwise discussed in detail in the main text, and the gene perturbations our method associates with their phenotypes. We recall that we use italicized capital letters to refer to genes, while unitalicized capital letters are used for acronyms denoting traits. Inflammatory bowel disease IBD, which includes ulcerative colitis and Crohn’s disease, is a chronic intestinal inflammatory disease associated with immune system dysregulation in which an imbalance occurs between anti-inflammatory and pro-inflammatory responses (S1). Accordingly, some of the perturbations that account for the largest transcriptional differences between the average healthy transcriptional profile and the average IBD-associated transcriptional profile are associated with genes involved in immune system function (see Table S1 ). Several of the genes in Table S1 have also been found in the literature to be differentially expressed in inflammatory tissues. These genes include: CPSF3 , which has been found to be markedly higher in ulcerative colitis tissues than in healthy tissues (S1); CD86 , which is upregulated in B-cells of mouse models and human patients with IBD (S2); PROX1 , which is upregulated in Crohn’s disease patients (S3); and CDK12 , which is often downregulated to mitigate inflammatory disease (S4). All of these genes make sense biologically since they are known to have roles in regulating immune function. Food allergy Food allergies are a sustained overreaction of the immune system to harmless environmental factors, with disruptions in early immune development likely playing a role (S5; S6). Our method implicates several genes crucial to immune system development and immune response modulation, including CEBPA, LEF1, ERG, TCF7L1, ONECUT2, ETV1 , and CBFB (see Table S2 ). Further, LEF1 coexpression with TCF1 (which is in the same protein family as TCF7L1 ) has been shown to lead to chronic activation of helper T-cells, likely due to their combined roles in the Wnt pathway critical for T-cell memory (S5). Cancer metastasis Cancer cells undergo a series of transcriptomic and physiological changes that give rise to metastases in different tissues, with the cells’ phenotypic plasticity playing a role (S7). Consistent with this observation, we find that genes related to countervailing processes of the epithelial to mesenchymal transition (EMT) and tumor suppression are implicated in cancer metastasis by our method ( Table S3 ). For example, our method identifies PTHLH , which is associated with cell migration (S8). In other cases, our method identifies related genes whose polarity of perturbation (up or down regulation) runs counter to that observed in previous experiments. Such genes include FAM83H-AS1 , also associated with cell migration (S9), and MYCN , which is associated with tumor proliferation (S10). In these instances, the same cellular processes are targeted, but outside factors may alter the way that the specific gene upregulations and downregulations influence phenotype. Age-related macular degeneration Age-related macular degeneration (AMD) is a leading cause of vision impairment and loss (S11). Disease progression is characterized by the build-up of extracellular deposits in the macula (an anatomical structure within the retina of the eye) and the ensuing degeneration of photoreceptors and nearby tissue (S12). Dysregulation of lipid, vascular, inflammatory, and extracellular matrix pathways have been implicated in this disease (S11). Accordingly, several of the perturbations for AMD identified in Table S4 are in genes involved in these pathways. Specifically, MIR29A was found to disrupt the formation of blood vessels in the eye (S13), while a polymorphism in IGF1R (a gene with a role in inflammatory responses and angiogenesis) is significantly associated with advanced AMD (S14). Type 1 diabetes Type 1 diabetes (T1D) progresses through autoimmune attacks on insulin-producing β -cells within the pancreas (S15). In agreement with this, our method identifies perturbations to genes involved in immune development and response, like MIR126, CEBPA , and ZAP70 , as causal in T1D. Interestingly, we also find perturbations to genes with functions relevant to the pathogenesis of type 2 diabetes (T2D), including insulin resistance and storage and metabolic breakdown of fats (S16). We further find perturbations to genes involved in vascularization, as in processes known to be disrupted as part of the long-term complications of T2D (S16). Non-small cell lung cancer Non-small cell lung cancer (NSCLC) represents 85% of new lung cancer cases (S17). Our predicted perturbations involve several genes associated with cell growth, proliferation, and tumor suppression. These include EGFR , which is expressed in a variety of human tumors, including NSCLC (S18). Furthermore, BCL11B has been shown to have tumor suppressing functionality (S19). Supplementary Figures Download figure Open in new tab Figure S1: Justification for using d = 200 eigengenes. The percent of unexplained variance in the data is plotted as a function of the number of principal components (ordered by the fraction of variance explained). The top 200 eigengenes explain > 99% variance for all traits, except cancer metastasis and type 1 diabetes, where > 77% of the variance is explained. Download figure Open in new tab Figure S2: Quantile-quantile plots for allergic asthma ( A ), inflammatory bowel disease ( B ) and pan-cancer metastasis ( C ). Horizontal lines indicate thresholds above which co-occurring gene pairs are considered significant. Each case shows the baseline-to-variant (left) and reverse (right) directions. Download figure Open in new tab Figure S3: Gene pair co-occurrence networks for ( A, B ) pan-cancer metastasis and ( C, D ) inflammatory bowel disease. The baseline-to-variant pairs are indicated in (A, C) and reverse pairs in (B, D). The edge colors indicate correlations between the genes, the node colors indicate the type of perturbation, and the node sizes are proportional to the number of edges. Download figure Open in new tab Figure S4: Co-occurring genes in the MODY3 trait and the correlations between them. ( A ) Gene co-occurrence network in the baseline-to-variant direction. ( B ) Color-coded gene expression correlations of the genes in (A) across all transcriptional responses in the library. The transcriptional responses to SRRM4 and GYPC appear in the library but their expressions are not measured in the dataset, so they do not appear in (B). Download figure Open in new tab Figure S5: Comparison of genes identified by differential expression, u opt (our method), and TWAS for inflammatory bowel disease. ( A ) Venn diagram of the differentially expressed genes in GSE193677 (S20) and all TWAS genes associated with inflammatory bowel disease. Only 8% of the differentially expressed (DE) genes are also implicated by TWAS. ( B ) Precision-recall curve of the 719 unique genes associated with columns of the perturbation matrix B versus the 44 TWAS genes that overlap with these perturbed genes. Genes were ranked by the number of times they appear in the point-to-point forward optimizations for IBD. For the top n genes, the precision is the fraction of the n genes that are in the set of TWAS genes, while the recall is the fraction of the 44 TWAS genes contained in the top n genes. The red dot indicates n = 14 genes of which 5 overlap with TWAS, as indicated in the Venn diagram. Supplementary Tables View this table: View inline View popup Download powerpoint Table S1: Transcriptional perturbations associated with inflammatory bowel disease. View this table: View inline View popup Download powerpoint Table S2: Transcriptional perturbations associated with food allergy. View this table: View inline View popup Download powerpoint Table S3: Transcriptional perturbations associated with cancer metastasis to the lung. View this table: View inline View popup Download powerpoint Table S4: Transcriptional perturbations associated with age-related macular degeneration. View this table: View inline View popup Download powerpoint Table S5: Transcriptional perturbations associated with type 1 diabetes. View this table: View inline View popup Download powerpoint Table S6: Transcriptional perturbations associated with non-small cell lung cancer. Supplementary Datasets Dataset_S1-transcriptional_response_metadata.xlsx Excel file of metadata including the SRA accession numbers of the publicly available data used to specify the transcriptional response matrix B . The NCBI Bioproject, cell line, cell type, treatment, and genotype of the initial (unperturbed) and treated (perturbed) states are recorded (left to right) in this table. Acknowledgments This work was supported by NIH/NCI grant No. P50-CA221747 through the Malnati Brain Tumor Institute and leveraged research from NSF grant No. MCB-2206974. The authors also acknowledge support from the National Institute for Theory and Mathematics in Biology (NSF Grant No. DMS-2235451 and Simons Foundation Grant No. MP-TMPS-00005320) and the use of Quest High-Performance Computing Cluster at Northwestern University. Funding National Cancer Institute, https://ror.org/040gcmg81 , P50-CA221747 National Science Foundation, https://ror.org/021nxhr62 , DMS-2235451 Simons Foundation, https://ror.org/01cmst727 , MP-TMPS-00005320 Footnotes https://github.com/biophysben/Generative-prediction-of-causal-gene-sets-responsible-for-complex-traits https://datadryad.org/dataset/doi:10.5061/dryad.s4mw6m9hf References [1]. ↵ KA Frazer , SS Murray , NJ Schork , EJ Topol , Human genetic variation and its contribution to complex traits . Nat. Rev. Genet . 10 , 241 – 251 ( 2009 ). OpenUrl CrossRef PubMed Web of Science [2]. ↵ EA Boyle , YI Li , JK Pritchard , An expanded view of complex traits: from polygenic to omnigenic . Cell 169 , 1177 – 1186 ( 2017 ). OpenUrl CrossRef PubMed [3]. ↵ K Watanabe , et al. , A global overview of pleiotropy and genetic architecture in complex traits . Nat. Genet . 51 , 1339 – 1348 ( 2019 ). OpenUrl CrossRef PubMed [4]. ↵ O Canela-Xandri , K Rawlik , A Tenesa , An atlas of genetic associations in UK Biobank . Nat. Genet . 50 , 1593 – 1599 ( 2018 ). OpenUrl CrossRef PubMed [5]. ↵ JZ Liu , et al. , Association analyses identify 38 susceptibility loci for inflammatory bowel disease and highlight shared genetic risk across populations . Nat. Genet . 47 , 979 – 986 ( 2015 ). OpenUrl CrossRef PubMed [6]. ↵ EE Laing , et al. , Blood transcriptome based biomarkers for human circadian phase . eLife 6 , e20214 ( 2017 ). OpenUrl CrossRef PubMed [7]. ↵ J Wang , et al. , CAUSALdb: a database for disease/trait causal variants identified using summary statistics of genome-wide association studies . Nucleic Acids Res . 48 , D807 – D816 ( 2020 ). OpenUrl CrossRef PubMed [8]. ↵ B Pasaniuc , AL Price , Dissecting the genetics of complex traits using summary association statistics . Nat. Rev. Genet . 18 , 117 – 127 ( 2017 ). OpenUrl CrossRef PubMed [9]. ↵ AM Glazier , JH Nadeau , TJ Aitman , Finding genes that underlie complex traits . Science 298 , 2345 – 2349 ( 2002 ). OpenUrl Abstract / FREE Full Text [10]. ↵ S Tang , et al. , Novel Variance-Component TWAS method for studying complex human diseases with applications to Alzheimer’s dementia . PLoS Genet . 17 , e1009482 ( 2021 ). OpenUrl CrossRef PubMed [11]. ↵ P Visscher , et al. , 10 years of GWAS Discovery: Biology, Function, and Translation . Am. J. Hum. Genet 101 , 5 – 22 ( 2017 ). OpenUrl CrossRef PubMed [12]. ↵ E Uffelmann , et al. , Genome-wide association studies . Nat. Rev. Methods Primers . 1 ( 2021 ). [13]. ↵ V Tam , et al. , Benefits and limitations of genome-wide association studies . Nat. Rev. Genet . 20 , 467 – 484 ( 2019 ). OpenUrl CrossRef PubMed [14]. ↵ E Gamazon , et al. , A gene-based association method for mapping traits using reference transcriptome data . Nat. Genet . 47 , 1091 – 1098 ( 2015 ). OpenUrl CrossRef PubMed [15]. ↵ M Wainberg , et al. , Opportunities and challenges for transcriptome-wide association studies . Nat. Genet . 51 , 592 – 599 ( 2019 ). OpenUrl CrossRef PubMed [16]. ↵ W Wei , G Hemani , C Haley , Detecting epistasis in human complex traits . Nat. Rev. Genet . 15 , 722 – 733 ( 2014 ). OpenUrl CrossRef PubMed [17]. ↵ C Benner , et al. , FINEMAP: efficient variable selection using summary data from genomewide association studies . Bioinform . 32 , 1493 – 1501 ( 2016 ). OpenUrl CrossRef PubMed [18]. ↵ Y Zou , P Carbonetto , G Wang , M Stephens , Fine-mapping from summary data with the “Sum of Single Effects” model . PLoS Genet . 18 , e1010299 ( 2022 ). OpenUrl CrossRef PubMed [19]. ↵ G Wang , A Sarkar , P Carbonetto , M Stephens , A simple new approach to variable selection in regression, with application to genetic fine mapping . J. R. Stat. Soc. Series B Stat. Methodol . 82 , 1273 – 1300 ( 2020 ). OpenUrl CrossRef PubMed [20]. ↵ A Hutchinson , J Asimit , C Wallace , Fine-mapping genetic associations . Hum. Mol. Genet . 29 , R81 – R88 ( 2020 ). OpenUrl CrossRef PubMed [21]. ↵ KKH Farh , et al. , Genetic and epigenetic fine mapping of causal autoimmune disease variants . Nature 518 , 337 – 343 ( 2015 ). OpenUrl CrossRef PubMed [22]. ↵ AR Sonawane , ST Weiss , K Glass , A Sharma , Network medicine in the age of biomedical big data . Front. Genet . 10 , 294 ( 2019 ). OpenUrl CrossRef PubMed [23]. ↵ G Zhang , NM Roberto , D Lee , SR Hahnel , EC Andersen , The impact of species-wide gene expression variation on Caenorhabditis elegans complex traits . Nat. Commun . 13 , 3462 ( 2022 ). OpenUrl CrossRef PubMed [24]. ↵ TP Wytock , AE Motter , Cell reprogramming design by transfer learning of functional transcriptional networks . Proc. Natl. Acad. Sci. USA 121 , e2312942121 ( 2024 ). OpenUrl CrossRef PubMed [25]. ↵ O Alter , PO Brown , D Botstein , Singular value decomposition for genome-wide expression data processing and modeling . Proc. Natl. Acad. Sci. USA 97 , 10101 – 10106 ( 2000 ). OpenUrl Abstract / FREE Full Text [26]. ↵ S Kauffman , Metabolic stability and epigenesis in randomly constructed genetic nets . J. Theoret. Biol . 22 , 437 – 467 ( 1969 ). OpenUrl CrossRef PubMed Web of Science [27]. ↵ S Huang , G Eichler , Y Bar-Yam , DE Ingber , Cell Fates as High-Dimensional Attractor States of a Complex Gene Regulatory Network . Phys. Rev. Lett . 94 , 128701 ( 2005 ). OpenUrl CrossRef PubMed [28]. ↵ R Meyers S Huang , S Kauffman , Complex Gene Regulatory Networks – from Structure to Biological Observables: Cell Fate Determination in Encyclopedia of Complexity and Systems Science , ed. R Meyers . ( Springer New York, NY ), pp. 1180 – 1213 ( 2009 ). [29]. ↵ R Shemirani , S Wenric , E Kenny , JL Ambite , EPS: automated feature selection in case– control studies using extreme pseudo-sampling . Bioinform . 37 , 3372 – 3373 ( 2021 ). OpenUrl CrossRef PubMed [30]. ↵ S Wenric , R Shemirani , Using supervised learning methods for gene selection in RNA-Seq case-control studies . Front. Genet . 9 , 297 ( 2018 ). OpenUrl CrossRef PubMed [31]. ↵ TA Karginov , A Ménoret , AT Vella , Optimal CD8+ T cell effector function requires costimulation-induced RNA-binding proteins that reprogram the transcript isoform landscape . Nat. Commun . 13 , 3540 ( 2022 ). OpenUrl CrossRef PubMed [32]. ↵ S Yu , VN Kim , A tale of non-canonical tails: gene regulation by post-transcriptional RNA tailing . Nat. Rev. Mol. Cell Biol . 21 , 542 – 556 ( 2020 ). OpenUrl CrossRef PubMed [33]. X Feng , et al. , Integrative analysis of the expression profiles of whole coding and non-coding RNA transcriptomes and construction of the competing endogenous RNA networks for chronic obstructive pulmonary disease . Front. Genet . 14 , 1050783 ( 2023 ). OpenUrl CrossRef PubMed [34]. ↵ EM Mushaben , GK Hershey , MW Pauciulo , WC Nichols , TD Le Cras , Chronic allergic inflammation causes vascular remodeling and pulmonary hypertension in BMPR2 hypomorph and wild-type mice . PLoS One 7 , e32468 ( 2012 ). OpenUrl CrossRef PubMed [35]. ↵ SH Park , et al. , Modification of hemodynamic and immune responses to exposure with a weak antigen by the expression of a hypomorphic BMPR2 gene . PloS One 8 , e55180 ( 2013 ). OpenUrl CrossRef PubMed [36]. ↵ Y Zhu , W Wang , X Wang , Roles of transcriptional factor 7 in production of inflammatory factors for lung diseases . J. Transl. Med . 13 , 273 ( 2015 ). OpenUrl CrossRef PubMed [37]. N Hernandez-Pacheco , et al. , Genome-wide association study of inhaled corticosteroid response in admixed children with asthma . Clin. Exp. Allergy 49 , 789 – 798 ( 2019 ). OpenUrl CrossRef PubMed [38]. AK Kheirallah , CH de Moor , A Faiz , I Sayers , IP Hall , Lung function associated gene Integrator Complex subunit 12 regulates protein synthesis pathways . BMC Genom . 18 , 1 – 20 ( 2017 ). OpenUrl CrossRef [39]. X Li , S Ye , Y Lu , Long non-coding RNA NEAT1 overexpression associates with increased exacerbation risk, severity, and inflammation, as well as decreased lung function through the interaction with microRNA-124 in asthma . J. Clin. Lab. Anal . 34 , e23023 ( 2020 ). OpenUrl CrossRef PubMed [40]. DG Roy , et al. , Methionine metabolism shapes T helper cell responses through regulation of epigenetic reprogramming . Cell Metab . 31 , 250 – 266 ( 2020 ). OpenUrl CrossRef PubMed [41]. M Shang , et al. , The folate cycle enzyme MTHFD2 induces cancer immune evasion through PD-L1 up-regulation . Nat. Commun . 12 , 1940 ( 2021 ). OpenUrl CrossRef PubMed [42]. RD Britt Jr . , et al. , Sterols and immune mechanisms in asthma . J. Allergy. Clin. Immunol . 151 , 47 – 59 ( 2023 ). OpenUrl CrossRef [43]. A García Del Río , et al. , The mitochondrial isoform of FASTK modulates nonopsonic phagocytosis of bacteria by macrophages via regulation of respiratory complex I . J. Immunol . 201 , 2977 – 2985 ( 2018 ). OpenUrl Abstract / FREE Full Text [44]. ↵ MR Hadjicharalambous , MA Lindsay , Long non-coding RNAs and the innate immune response . Non-coding RNA 5 , 34 ( 2019 ). OpenUrl PubMed [45]. ↵ Z Li , et al. , The long noncoding RNA THRIL regulates TNFα expression through its interaction with hnRNPL . Proc. Natl. Acad. Sci. USA 111 , 1002 – 1007 ( 2014 ). OpenUrl Abstract / FREE Full Text [46]. ↵ Z Su , D Huang , Alternative splicing of pre-mRNA in the control of immune activity . Genes 12 , 574 ( 2021 ). OpenUrl CrossRef [47]. ↵ P Zeng , et al. , Statistical analysis for genome-wide association study . J. Biomed. Res . 29 , 285 ( 2015 ). OpenUrl CrossRef PubMed [48]. ↵ S Chatterjee , P Diaconis , A Sly , Random Graphs with a Given Degree Sequence . Ann. Appl. Probab . 21 , 1400 – 1435 ( 2011 ). OpenUrl CrossRef [49]. ↵ G Bianconi , Entropy of network ensembles . Phys. Rev. E 79 , 036114 ( 2009 ). OpenUrl CrossRef [50]. ↵ KM Magnaye , et al. , A-to-I editing of miR-200b-3p in airway cells is associated with moderate-to-severe asthma . Eur. Respir. J . 58 , 2003862 ( 2021 ). OpenUrl Abstract / FREE Full Text [51]. ↵ V Tubita , et al. , Role of microRNAs in inflammatory upper airway diseases . Allergy 76 , 1967 – 1980 ( 2020 ). OpenUrl PubMed [52]. ↵ T Xia , J Ma , Y Sun , Y Sun , Androgen receptor suppresses inflammatory response of airway epithelial cells in allergic asthma through MAPK1 and MAPK14 . Hum. Exp. Toxicol . 41 , 09603271221121320 ( 2022 ). OpenUrl CrossRef [53]. ↵ CR Keenan , et al. , Polycomb repressive complex 2 is a critical mediator of allergic inflammation . JCI Insight 4 , e127745 ( 2029 ). OpenUrl [54]. ↵ SN Georas , P Donohue , M Connolly , ME Wechsler , JAK inhibitors for asthma . J. Allergy. Clin. Immunol . 148 , 953 – 963 ( 2021 ). OpenUrl CrossRef [55]. ↵ L Ye , et al. , A critical role for c-Myc in group 2 innate lymphoid cell activation . Allergy 75 , 841 – 852 ( 2020 ). OpenUrl CrossRef PubMed [56]. ↵ V Poli , et al. , MYC-driven epigenetic reprogramming favors the onset of tumorigenesis by inducing a stem cell-like state . Nat. Commun . 9 , 1024 ( 2018 ). OpenUrl CrossRef PubMed [57]. ↵ S Patange , et al. , MYC amplifies gene expression through global changes in transcription factor dynamics . Cell Rep . 38 ( 2022 ). [58]. ↵ CY Wang , et al. , Induced pluripotent stem cells without c-Myc reduce airway responsiveness and allergic reaction in sensitized mice . Transplant . 96 , 958 – 965 ( 2013 ). OpenUrl CrossRef PubMed [59]. ↵ P Borger , et al. , Impaired translation of CCAAT/enhancer binding protein α mRNA in bronchial smooth muscle cells of asthmatic patients . JACI 123 , 639 – 645 ( 2009 ). OpenUrl [60]. ↵ S Chung , et al. , FoxO1 regulates allergic asthmatic inflammation through regulating polarization of the macrophage inflammatory phenotype . Oncotarget 7 , 17532 ( 2016 ). OpenUrl CrossRef PubMed [61]. ↵ X Rao , et al. , MiR-493-5p inhibits Th9 cell differentiation in allergic asthma by targeting FOXO1 . Respir. Res . 23 , 286 ( 2022 ). OpenUrl CrossRef PubMed [62]. ↵ FJ Ren , XY Cai , Y Yao , GY Fang , JunB: a paradigm for Jun family in immune response and cancer . Front. Cell. Infect. Microbiol . 13 , 1222265 ( 2023 ). OpenUrl CrossRef PubMed [63]. ↵ S Rojo-Tolosa , et al. , Influence of genetics on the response to omalizumab in patients with severe uncontrolled asthma with an allergic phenotype . Int. J. Mol. Sci . 24 , 7029 ( 2023 ). OpenUrl CrossRef PubMed [64]. ↵ C Meng , et al. , Ten-eleven translocation 2 modulates allergic inflammation by 5-hydroxymethylcytosine remodeling of immunologic pathways . Hum. Mol. Genet . 30 , 1985 – 1995 ( 2021 ). OpenUrl CrossRef PubMed [65]. ↵ H Zhang , et al. , Targeting epithelial cell-derived TWIST1 alleviates allergic asthma . Cell. Signal . 102 , 110552 ( 2023 ). OpenUrl CrossRef PubMed [66]. ↵ A Tsherniak , et al. , Defining a Cancer Dependency Map . Cell 170 , 564 – 576 ( 2017 ). OpenUrl CrossRef PubMed [67]. ↵ PS Dischinger , et al. , NF1 deficiency correlates with estrogen receptor signaling and diminished survival in breast cancer . NPJ Breast Cancer 4 , 29 ( 2018 ). OpenUrl CrossRef PubMed [68]. ↵ J Hu , et al. , Sox5 contributes to prostate cancer metastasis and is a master regulator of TGFβ-induced epithelial mesenchymal transition through controlling Twist1 expression . Br. J. Cancer 118 , 88 – 97 ( 2018 ). OpenUrl CrossRef PubMed [69]. ↵ N Malik , et al. , The transcription factor CBFB suppresses breast cancer through orchestrating translation and transcription . Nat. Commun . 10 , 2071 ( 2019 ). OpenUrl CrossRef PubMed [70]. ↵ YL Chen , et al. , LncRNA SLCO4A1-AS1 suppresses lung cancer progression by sequestering the TOX4-NTSR1 signaling axis . J. Biomed. Sci . 30 , 80 ( 2023 ). OpenUrl CrossRef PubMed [71]. ↵ T Elsir , A Smits , M S Lindström , M Nistér , Transcription factor PROX1: its role in development and cancer . Cancer Metastasis Rev . 31 , 793 – 805 ( 2012 ). OpenUrl CrossRef PubMed [72]. ↵ Z Cheng , et al. , Knockdown of EHF inhibited the proliferation, invasion and tumorigenesis of ovarian cancer cells . Mol. Carcinog . 55 , 1048 – 1059 ( 2016 ). OpenUrl CrossRef PubMed [73]. ↵ MZ Kamran , P Patil , RP Gude , Role of STAT3 in cancer metastasis and translational advances . Biomed Res. Int . 2013 , 421821 ( 2013 ). OpenUrl PubMed [74]. ↵ L Zhao , et al. , CTCF promotes epithelial ovarian cancer metastasis by broadly controlling the expression of metastasis-associated genes . Oncotarget 8 , 62217 ( 2017 ). OpenUrl CrossRef PubMed [75]. ↵ K Takeda , et al. , Targeted disruption of the mouse Stat3 gene leads to early embryonic lethality . Proc. Natl. Acad. Sci. USA 94 , 3801 – 3804 ( 1997 ). OpenUrl Abstract / FREE Full Text [76]. ↵ AM Fedoriw , P Stein , P Svoboda , RM Schultz , MS Bartolomei , Transgenic RNAi reveals essential function for CTCF in H19 gene imprinting . Science 303 , 238 – 240 ( 2004 ). OpenUrl Abstract / FREE Full Text [77]. ↵ W Chen , X Zhang , K Birsoy , RG Roeder , A muscle-specific knockout implicates nuclear receptor coactivator MED1 in the regulation of glucose and energy metabolism . Proc. Natl. Acad. Sci. USA 107 , 10196 – 10201 ( 2010 ). OpenUrl Abstract / FREE Full Text [78]. ↵ D Liu , et al. , Evaluation of the oxidative stress–related genes ALOX5, ALOX5AP, GPX1, GPX3 and MPO for contribution to the risk of type 2 diabetes mellitus in the Han Chinese population . Diabetes Vasc. Dis. Res . 15 , 336 – 339 ( 2018 ). OpenUrl CrossRef [79]. ↵ MM Heemskerk , et al. , Increased PUFA content and 5-lipoxygenase pathway expression are associated with subcutaneous adipose tissue inflammation in obese women with type 2 diabetes . Nutrients 7 , 7676 – 7690 ( 2015 ). OpenUrl CrossRef PubMed [80]. ↵ C Argmann , et al. , Biopsy and blood-based molecular biomarker of inflammation in IBD . Gut 72 , 1271 – 1287 ( 2023 ). OpenUrl Abstract / FREE Full Text [81]. ↵ A Samal , S Jain , The regulatory network of E. coli metabolism as a boolean dynamical system exhibits both homeostasis and flexibility of response . BMC Syst. Biol . 2 , 1 – 18 ( 2008 ). OpenUrl CrossRef PubMed [82]. ↵ RS Wang , A Saadatpour , R Albert , Boolean modeling in systems biology: an overview of methodology and applications . Phys. Biol . 9 , 055001 ( 2012 ). OpenUrl CrossRef PubMed [83]. ↵ H Sizek , A Hamel , D Deritei , S Campbell , E Ravasz Regan , Boolean model of growth signaling, cell cycle and apoptosis predicts the molecular mechanism of aberrant cell cycle progression driven by hyperactive PI3K . PLoS Comput. Biol . 15 , e1006402 ( 2019 ). OpenUrl CrossRef PubMed [84]. ↵ JC Rozum , J Gómez Tejeda Zañudo , X Gan , D Deritei , R Albert , Parity and time reversal elucidate both decision-making in empirical models and attractor scaling in critical boolean networks . Sci. Adv . 7 , eabf8124 ( 2021 ). OpenUrl FREE Full Text [85]. ↵ E Newby , JG Tejeda Zañudo , R Albert , Structure-based approach to identifying small sets of driver nodes in biological networks . Chaos 32 ( 2022 ). [86]. ↵ E Sullivan , et al. , Boolean modeling of mechanosensitive epithelial to mesenchymal transition and its reversal . iScience 26 ( 2023 ). [87]. ↵ K Hari , et al. , Low dimensionality of phenotypic space as an emergent property of coordinated teams in biological regulatory networks . iScience 28 , 111730 ( 2025 ). OpenUrl CrossRef PubMed [88]. ↵ KF Mahin , et al. , PanClassif: Improving pan cancer classification of single cell RNA-Seq gene expression data using machine learning . Genomics 114 , 110264 ( 2022 ). OpenUrl CrossRef PubMed [89]. ↵ T Lee , H Lee , Prediction of Alzheimer’s disease using blood gene expression data . Sci. Rep . 10 , 3485 ( 2020 ). OpenUrl CrossRef PubMed [90]. ↵ B Kegerreis , et al. , Machine learning approaches to predict lupus disease activity from gene expression data . Sci. Rep . 9 , 9617 ( 2019 ). OpenUrl CrossRef PubMed [91]. ↵ F Alharbi , A Vakanski , Machine learning methods for cancer classification using gene expression data: A review . Bioengineering 10 , 173 ( 2023 ). OpenUrl CrossRef PubMed [92]. ↵ W Cookson , L Liang , G Abecasis , M Moffatt , M Lathrop , Mapping complex disease traits with global gene expression . Nat. Rev. Genet . 10 , 184 – 194 ( 2009 ). OpenUrl CrossRef PubMed Web of Science [93]. ↵ D Zhang , et al. , Global and gene-specific translational regulation in Escherichia coli across different conditions . PLoS Comput. Biol . 18 , e1010641 ( 2022 ). OpenUrl CrossRef PubMed [94]. ↵ S Albaradei , et al. , MetaCancer: a deep learning-based pan-cancer metastasis prediction model developed using multi-omics data . Comput. Struct. Biotechnol. J . 19 , 4404 – 4411 ( 2021 ). OpenUrl CrossRef PubMed [95]. ↵ A Ebrahim , et al. , Multi-omic data integration enables discovery of hidden biological regularities . Nat. Commun . 7 , 13091 ( 2016 ). OpenUrl CrossRef PubMed [96]. ↵ A Gayoso , et al. , Joint probabilistic modeling of single-cell multi-omic data with totalVI . Nat. Methods 18 , 272 – 282 ( 2021 ). OpenUrl CrossRef PubMed [97]. ↵ Y Lu , M Oliva , BL Pierce , J Liu , LS Chen , Integrative cross-omics and cross-context analysis elucidates molecular links underlying genetic effects on complex traits . Nat. Commun . 15 , 2383 ( 2024 ). OpenUrl CrossRef PubMed [98]. ↵ M Lotfollahi , et al. , Predicting cellular responses to complex perturbations in high-throughput screens . Mol. Syst. Biol . 19 , e11517 ( 2023 ). OpenUrl CrossRef PubMed [99]. ↵ X. Z, et al. , Gene Set Knowledge Discovery with Enrichr . Curr Protoc 1 , e90 ( 2021 ). OpenUrl CrossRef [100]. ↵ K Glass , M Girvan , Finding new order in biological functions from the network structure of gene annotations . PLoS Comput. Biol . 11 , e1004565 ( 2015 ). OpenUrl CrossRef PubMed [101]. ↵ M Lotfollahi , F Wolf , F Theis , scGen predicts single-cell perturbation responses . Nat. Methods . 16 , 715 – 721 ( 2019 ). OpenUrl CrossRef PubMed [102]. ↵ Y Zhao , TP Wytock , KA Reynolds , AE Motter , Irreversibility in Bacterial Regulatory Networks . Sci. Adv . 10 , eado3232 ( 2024 ). OpenUrl CrossRef PubMed [103]. ↵ R Zhu , JM del Rio-Salgado , J Garcia-Ojalvo , MB Elowitz , Synthetic multistability in mammalian cells . Science 375 , eabg9765 ( 2022 ). OpenUrl CrossRef PubMed [104]. ↵ V Thibeault , A Allard , P Desrosiers , The low-rank hypothesis of complex systems . Nat. Phys 20 , 294 – 302 ( 2024 ). OpenUrl CrossRef References [S1]. Y Chen , J Lei , S He , m6A modification mediates mucosal immune microenvironment and therapeutic response in inflammatory bowel disease . Front. Cell Dev. Biol . 9 , 692160 ( 2021 ). OpenUrl CrossRef PubMed [S2]. I Gadjalova , et al. , B cell-mediated CD4 T-cell costimulation via CD86 exacerbates proinflammatory cytokine production during autoimmune intestinal inflammation . Mucosal Immunol . 17 , 67 ( 2024 ). OpenUrl CrossRef PubMed [S3]. W Shen , et al. , Decreased expression of Prox1 is associated with postoperative recurrence in Crohn’s disease . J. Crohns Colitis 12 , 1210 ( 2018 ). OpenUrl CrossRef PubMed [S4]. KL Henry , et al. , CDK12-mediated transcriptional regulation of noncanonical NF-κB components is essential for signaling . Sci. Signal . 11 , eaam8216 ( 2018 ). OpenUrl Abstract / FREE Full Text [S5]. R Kratchmarov , et al. , TCF1–LEF1 co-expression identifies a multipotent progenitor cell (TH2-MPP) across human allergic diseases . Nat. Immunol . 25 , 902 ( 2024 ). OpenUrl CrossRef PubMed [S6]. D Martino , et al. , Epigenetic dysregulation of naive cd4+ t-cell activation genes in childhood food allergy . Nat. Commun . 9 , 3308 ( 2018 ). OpenUrl CrossRef PubMed [S7]. B Hamelin , et al. , Single-cell analysis reveals inter-and intratumour heterogeneity in metastatic breast cancer . J Mammary Gland Biol. Neoplasia 28 , 26 ( 2023 ). OpenUrl CrossRef PubMed [S8]. J Urosevic , et al. , Colon cancer cells colonize the lung from established liver metastases through p38 MAPK signalling and PTHLH . Nat. Cell Biol . 16 , 685 ( 2014 ). OpenUrl CrossRef PubMed [S9]. J Zhang , et al. , Overexpression of FAM83H-AS1 indicates poor patient survival and knockdown impairs cell proliferation and invasion via MET/EGFR signaling in lung cancer . Sci. Rep . 7 , 42819 ( 2017 ). OpenUrl CrossRef PubMed [S10]. LA Goodman , et al. , Modulation of N-myc expression alters the invasiveness of neuroblastoma . Clin. Exp. Metastasis 15 , 130 ( 1997 ). OpenUrl CrossRef PubMed [S11]. P Mitchell , G Liew , B Gopinath , TY Wong , Age-related macular degeneration . Lancet 392 , 1147 ( 2018 ). OpenUrl CrossRef PubMed [S12]. M Fleckenstein , et al. , Age-related macular degeneration . Nat. Rev. Dis. Primers 7 , 31 ( 2021 ). OpenUrl CrossRef PubMed [S13]. D Peng , et al. , Anti-angiogenic properties of microRNA-29a in preclinical ocular models . Proc. Natl. Acad. Sci. U.S.A . 145 , e2204795119 ( 2022 ). OpenUrl [S14]. CJ Chiu , et al. , Associations between genetic polymorphisms of insulin-like growth factor axis genes and risk for age-related macular degeneration . Invest. Ophthalmol. Vis. Sci . 52 , 9099 ( 2011 ). OpenUrl Abstract / FREE Full Text [S15]. PS Linsley , et al. , Germline-like TCR-α chains shared between autoreactive T cells in blood and pancreas . Nat. Commun . 15 , 4971 ( 2024 ). OpenUrl CrossRef PubMed [S16]. E Ahmad , S Lim , R Lamptey , DR Webb , MJ Davies , Type 2 diabetes . Lancet 400 , 1803 ( 2022 ). OpenUrl CrossRef PubMed [S17]. C Gridelli , et al. , Non-small-cell lung cancer . Nat. Rev. Dis. Primers 1 , 1 ( 2015 ). OpenUrl [S18]. F Hirsch , M Varella-Garcia , F Cappuzzo , Predictive value of EGFR and HER2 overexpression in advanced non-small-cell lung cancer . Oncogene 28 , S32 ( 2009 ). OpenUrl CrossRef PubMed Web of Science [S19]. K Kamimura , et al. , Haploinsufficiency of Bcl11b for suppression of lymphomagenesis and thymocyte development . Biochem. Biophys. Res. Co . 355 , 538 ( 2007 ). OpenUrl CrossRef PubMed Web of Science [S20]. C Argmann , et al. , Biopsy and blood-based molecular biomarker of inflammation in IBD . Gut 72 , 1271 – 1287 ( 2023 ). OpenUrl Abstract / FREE Full Text [S21]. P Shen , et al. , Therapeutic targeting of CPSF3-dependent transcriptional termination in ovarian cancer . Sci. Adv . 9 , eadj0123 ( 2023 ). OpenUrl CrossRef PubMed [S22]. AT Nguyen , et al. , UBE2O remodels the proteome during terminal erythroid differentiation . Science 357 , eaan0218 ( 2017 ). OpenUrl Abstract / FREE Full Text [S23]. P Jeannin , et al. , Human Effector Memory T Cells Express CD86: A Functional Role in Naive T Cell Priming . J. Immunol . 162 , 2044 ( 1999 ). OpenUrl Abstract / FREE Full Text [S24]. V Sundar , et al. , Transcriptional cyclin-dependent kinases as the mediators of inflammation-a review . Gene 769 , 145200 ( 2021 ). OpenUrl CrossRef PubMed [S25]. W Slotkin , K Nishikura , Adenosine-to-inosine RNA editing and human disease . Genome Med . 5 , 1 ( 2013 ). OpenUrl CrossRef PubMed [S26]. Z Gao , et al. , Crystal structure and function of rbj: A constitutively gtp-bound small g protein with an extra dnaj domain . Protein Cell 10 , 760 ( 2019 ). OpenUrl CrossRef PubMed [S27]. TCP Pham , et al. , The mitochondrial mRNA-stabilizing protein SLIRP regulates skeletal muscle mitochondrial structure and respiration by exercise-recoverable mechanisms . Nat. Commun . 15 , 9826 ( 2024 ). OpenUrl CrossRef PubMed [S28]. C Fazio , L Ricciardiello , Inflammation and notch signaling: a crosstalk with opposite effects on tumorigenesis . Cell Death Dis . 7 , e2515 ( 2016 ). OpenUrl CrossRef PubMed [S29]. P Xia , et al. , Sox2 functions as a sequence-specific DNA sensor in neutrophils to initiate innate immunity against microbial infection . Nat. Immunol . 16 , 366 ( 2015 ). OpenUrl CrossRef PubMed [S30]. B Holm , S Barsuhn , H Behrens , S Krüger, C Röcken, The tumor biological significance of RNF43 and LRP1B in gastric cancer is complex and context-dependent . Sci. Rep . 13 , 3191 ( 2023 ). OpenUrl CrossRef PubMed [S31]. M Lucerna , et al. , NAB2, a corepressor of EGR-1, inhibits vascular endothelial growth factormediated gene induction and angiogenic responses of endothelial cells . J. Biol. Chem . 278 , 11433 ( 2003 ). OpenUrl Abstract / FREE Full Text [S32]. L Olofsson , et al. , CCAAT/Enhancer Binding Protein α (C/EBPα) in Adipose Tissue Regulates Genes in Lipid and Glucose Metabolism and a Genetic Variation in C/EBPα Is Associated with Serum Levels of Triglycerides . J. Clin. Endocrinol. Metab . 93 , 4880 ( 2010 ). OpenUrl [S33]. R Avellino , R Delwel , Expression and regulation of C/EBPα in normal myelopoiesis and in malignant transformation . Blood 129 , 2083 ( 2017 ). OpenUrl Abstract / FREE Full Text [S34]. H Guan , et al. , Long noncoding RNA LINC00673-v4 promotes aggressiveness of lung adenocarcinoma via activating WNT/β-catenin signaling . Proc. Natl. Acad. Sci. U.S.A . 116 , 14019 ( 2019 ). OpenUrl Abstract / FREE Full Text [S35]. T Reya , et al. , Wnt signaling regulates B lymphocyte proliferation through a LEF-1 dependent mechanism . Immunity 13 , 15 ( 2000 ). OpenUrl CrossRef PubMed Web of Science [S36]. AP Ng , et al. , An Erg-driven transcriptional program controls B cell lymphopoiesis . Nat. Commun . 11 , 3013 ( 2020 ). OpenUrl CrossRef PubMed [S37]. YC Wen , et al. , TCF7L1 regulates cytokine response and neuroendocrine differentiation of prostate cancer . Oncogenesis 10 , 81 ( 2021 ). OpenUrl CrossRef PubMed [S38]. MV Luna Velez , et al. , ONECUT2 regulates RANKL-dependent enterocyte and microfold cell differentiation in the small intestine; a multi-omics study . Nucleic Acids Res . 51 , 1277 ( 2023 ). OpenUrl CrossRef PubMed [S39]. M Delacher , et al. , Rbpj expression in regulatory T cells is critical for restraining TH2 responses . Nat. Commun . 10 , 1621 ( 2019 ). OpenUrl CrossRef PubMed [S40]. X Shen , et al. , ETV1 positively correlated with immune infiltration and poor clinical prognosis in colorectal cancer . Front. Immunol . 13 , 939806 ( 2022 ). OpenUrl CrossRef PubMed [S41]. M Forouzanfar , et al. , Intracellular functions of RNA-binding protein, Musashi1, in stem and cancer cells . Stem Cell Res. Ther . 11 , 193 ( 2020 ). OpenUrl CrossRef PubMed [S42]. L Zhao , JL Cannons , LH Castilla , PL Schwartzberg , PP Liu , The Role of CBFβ in T Cell Development . Blood 104 , 3234 ( 2004 ). OpenUrl CrossRef [S43]. I Cuthbertson , N Morrell , P Caruso , BMPR2 mutation and metabolic reprogramming in pulmonary arterial hypertension . Circ. Res . 132 , 109 ( 2023 ). OpenUrl CrossRef PubMed [S44]. S Kuriyama , et al. , In vivo collective cell migration requires an LPAR2-dependent increase in tissue fluidity . J. Cell Biol . 206 , 113 ( 2014 ). OpenUrl Abstract / FREE Full Text [S45]. M Ruiz-Pérez , A Henley , M Arsenian-Henriksson , The MYCN Protein in Health and Disease . Genes 8 , 42819 ( 2017 ). OpenUrl [S46]. S Lee , et al. , The role of EZH1 and EZH2 in development and cancer . BMB Rep . 55 , 595 ( 2022 ). OpenUrl CrossRef PubMed [S47]. FJ Ren , XY Cai , Y Yao , GY Fang , JunB: a paradigm for Jun family in immune response and cancer . Front. Cell. Infect. Microbiol . 13 , 1222265 ( 2023 ). OpenUrl CrossRef PubMed [S48]. L Qi , et al. , HomeoboxC6 promotes metastasis by orchestrating the DKK1/Wnt/β-catenin axis in right-sided colon cancer . Cell Death Dis . 12 , 337 ( 2021 ). OpenUrl CrossRef PubMed [S49]. RN Joshi , et al. , Phosphatase inhibitor PPP1R11 modulates resistance of human t cells toward Treg-mediated suppression of cytokine expression . J. Leukoc. Biol . 106 , 413 ( 2019 ). OpenUrl CrossRef PubMed [S50]. A Evans , N Lennemann , C Coyne , BPIFB3 Regulates Endoplasmic Reticulum Morphology To Facilitate Flavivirus Replication . J Virol 94 ( 2020 ). [S51]. M Yao , et al. , Tumor signatures of PTHLH overexpression, high serum calcium, and poor prognosis were observed exclusively in clear cell but not non clear cell renal carcinomas . Cancer Med . 3 ( 2014 ). [S52]. ZQ Zheng , et al. , VIRMA promotes nasopharyngeal carcinoma, tumorigenesis, and metastasis by upregulation of E2F7 in an m6a-dependent manner . J. Biol. Chem . 299 , 104677 ( 2023 ). OpenUrl CrossRef PubMed [S53]. W Han , et al. , RNA-binding protein PCBP2 modulates glioma growth by regulating FHL3 . J. Clin. Invest . 123 , 2103 ( 2013 ). OpenUrl CrossRef PubMed [S54]. P Moustardas , D Aberdam , N Lagali , MAPK Pathways in Ocular Pathophysiology: Potential Therapeutic Drugs and Challenges . Cells 12 , 617 ( 2021 ). OpenUrl [S55]. K Morita , et al. , Common variants of HNF1A gene are associated with diabetic retinopathy and poor glycemic control in normal-weight Japanese subjects with type 2 diabetes mellitus . JDC 31 , 483 ( 2017 ). OpenUrl [S56]. M Zou , et al. , Inhibition of cGAS-STING by JQ1 alleviates oxidative stress-induced retina inflammation and degeneration . Cell Death Differ . 29 , 1816 ( 2022 ). OpenUrl CrossRef PubMed [S57]. M Loix , N Zelcer , JF Bogie , JJ Hendriks , The ubiquitous role of ubiquitination in lipid metabolism . Trends Cell Biol , 34 , 416 ( 2024 ). OpenUrl CrossRef PubMed [S58]. D Shackelford , R Shaw , The LKB1–AMPK pathway: metabolism and growth control in tumour suppression . Nat. Rev. Cancer 9 , 563 ( 2009 ). OpenUrl CrossRef PubMed Web of Science [S59]. J Wawrzynski , et al. , Spectrum of mutations in NDP resulting in ocular disease; a systematic review . Front. Genet . 13 , 884722 ( 2022 ). OpenUrl CrossRef PubMed [S60]. H An , N Williams , T Shelkovnikova , NEAT1 and paraspeckles in neurodegenerative diseases: A missing lnc found? Noncoding RNA Res . 3 , 243 ( 2018 ). OpenUrl CrossRef PubMed [S61]. P Sahoo , et al. , Disruption of G3BP1 granules promotes mammalian CNS and PNS axon regeneration . Proc. Natl. Acad. Sci. U.S.A . 122 , e2411811122 ( 2025 ). OpenUrl CrossRef PubMed [S62]. T Truong , RZ Silkiss , The Role of Insulin-like Growth Factor-1 and Its Receptor in the Eye: A Review and Implications for IGF-1R Inhibition . Ophthal. Plast. Reconstr. Surg . 39 , 4 ( 2023 ). OpenUrl CrossRef PubMed [S63]. L Yang , et al. , Temporal landscape and translational regulation of A-to-I RNA editing in mouse retina development . BMC Biol . 22 , 106 ( 2024 ). OpenUrl CrossRef PubMed [S64]. R Suriben , et al. , β-Cell Insulin Secretion Requires the Ubiquitin Ligase COP1 . Cell 163 , 1457 ( 2015 ). OpenUrl CrossRef PubMed [S65]. H Qi , M Sarkissian , G Hu , Histone H4K20/H3K9 demethylase PHF8 regulates zebrafish brain and craniofacial development . Nature 466 , 503 ( 2010 ). OpenUrl CrossRef PubMed Web of Science [S66]. YG Jeon , et al. , RNF20 Functions as a Transcriptional Coactivator for PPARγ by Promoting NCoR1 Degradation in Adipocytes . Diabetes 69 , 20 ( 2019 ). OpenUrl PubMed [S67]. A Zampetaki , et al. , Plasma microRNA profiling reveals loss of endothelial miR-126 and other microRNAs in type 2 diabetes . Circ. Res . 107 , 810 ( 2010 ). OpenUrl Abstract / FREE Full Text [S68]. T Wang , et al. , C9orf72 regulates energy homeostasis by stabilizing mitochondrial complex I assembly . Cell Metab . 33 , 531 ( 2021 ). OpenUrl CrossRef PubMed [S69]. A Nawaz , T Shilikbay , G Skariah , S Ceman , Unwinding the roles of RNA helicase MOV10 . Wiley Interdiscip. Rev. RNA 13 , e1682 ( 2022 ). OpenUrl CrossRef [S70]. J Dooley , et al. , The microRNA-29 Family Dictates the Balance Between Homeostatic and Pathological Glucose Handling in Diabetes and Obesity . Diabetes 65 , 53 ( 2015 ). OpenUrl [S71]. D Jensen , et al. , Broad-acting therapeutic effects of miR-29b-chitosan on hypertension and diabetic complications . Mol. Ther . 30 , 3462 ( 2022 ). OpenUrl CrossRef PubMed [S72]. A Fischer , et al. , ZAP70: A master regulator of adaptive immunity . Semin. Immunopathol . 32 , 107 ( 2010 ). OpenUrl CrossRef PubMed [S73]. D Yang , H Liu , Z Yang , D Fan , Q Tang , BMI1 in the heart: Novel functions beyond tumorigenesis . eBioMedicine 63 , 103193 ( 2021 ). OpenUrl CrossRef PubMed [S74]. RS Iyer , et al. , Drug-resistant EGFR mutations promote lung cancer by stabilizing interfaces in ligand-free kinase-active EGFR oligomers . Nat. Commun . 15 , 2130 ( 2024 ). OpenUrl CrossRef PubMed [S75]. T Litman , W Stein , Ancient lineages of the keratin-associated protein (KRTAP ) genes and their co-option in the evolution of the hair follicle . BMC Ecol. Evo . 23 ( 2023 ). [S76]. C Hu , et al. , A prediction model integrated genomic alterations and immune signatures of tumor immune microenvironment for early recurrence of stage I NSCLC after curative resection . Transl. Lung Cancer Res . 11 ( 2022 ). [S77]. CC Morosky S , Lennemann NJ , BPIFB6 regulates secretory pathway trafficking and enterovirus replication . J. Virol . 90 , 5098 ( 2016 ). OpenUrl Abstract / FREE Full Text [S78]. A Gutierrez , et al. , The BCL11B tumor suppressor is mutated across the major molecular subtypes of T-cell acute lymphoblastic leukemia . Blood 118 , 4169 – 4173 ( 2011 ). OpenUrl Abstract / FREE Full Text [S79]. R Dhanasekaran , et al. , The MYC oncogene - the grand orchestrator of cancer growth and immune evasion . Nat. Rev. Clin. Oncol . 19 , 23 ( 2022 ). OpenUrl CrossRef PubMed [S80]. A Yang , et al. , p63, a p53 Homolog at 3q27–29, Encodes Multiple Products with Transactivating, Death-Inducing, and Dominant-Negative Activities . Mol. Cell 2 , 305 ( 1998 ). OpenUrl CrossRef PubMed Web of Science [S81]. Q Wen , et al. , Inactivating Celsr2 promotes motor axon fasciculation and regeneration in mouse and human . Brain 145 , 670 ( 2022 ). OpenUrl CrossRef PubMed [S82]. H Roca , et al. , Transcription Factors OVOL1 and OVOL2 Induce the Mesenchymal to Epithelial Transition in Human Cancer . PLoS ONE 8 , e76773 ( 2013 ). OpenUrl CrossRef PubMed [S83]. L Deng , et al. , MYC-driven U2SURP regulates alternative splicing of SAT1 to promote triple-negative breast cancer progression . Cancer Lett . 560 , 216124 ( 2023 ). OpenUrl CrossRef PubMed [S84]. J Yang , W Sun , CG, Roles of the NR2F Family in the Development, Disease, and Cancer of the Lung . J. Dev. Biol . 12 , 24 ( 2024 ). OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted April 22, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Generative prediction of causal gene sets responsible for complex traits Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Generative prediction of causal gene sets responsible for complex traits Benjamin Kuznets-Speck , Buduka K. Ogonor , Thomas P. Wytock , Adilson E. Motter bioRxiv 2025.04.17.649405; doi: https://doi.org/10.1101/2025.04.17.649405 Share This Article: Copy Citation Tools Generative prediction of causal gene sets responsible for complex traits Benjamin Kuznets-Speck , Buduka K. Ogonor , Thomas P. Wytock , Adilson E. Motter bioRxiv 2025.04.17.649405; doi: https://doi.org/10.1101/2025.04.17.649405 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17690) Bioengineering (13892) Bioinformatics (41936) Biophysics (21451) Cancer Biology (18588) Cell Biology (25499) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88603) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15152) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00