Full text
39,379 characters
· extracted from
preprint-html
· click to expand
MOKA: A pipeline for multi-omics bridged SNP-set kernel association test | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search MOKA: A pipeline for multi-omics bridged SNP-set kernel association test David Enoma , View ORCID Profile Dinghao Wang , Ariel Ghislain Kemogne Kamdoum , View ORCID Profile Rodrigo Ortega Polo , Quan Long , View ORCID Profile Jingni He doi: https://doi.org/10.1101/2025.07.06.25330974 David Enoma 1 Department of Biochemistry and Molecular Biology, Cumming School of Medicine, University of Calgary AB , Canada 2 The Mathison Centre for Mental Health Research and Education, Hotchkiss Brain 
Institute, Cumming School of Medicine, University of Calgary , Calgary, AB, Canada 4 Alberta Children’s Hospital Research Institute , Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: David.enoma{at}ucalgary.ca jingni.he{at}monash.edu Dinghao Wang 3 Department of Mathematics and Statistics, Faculty of Science, University of Calgary , Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Dinghao Wang Ariel Ghislain Kemogne Kamdoum 3 Department of Mathematics and Statistics, Faculty of Science, University of Calgary , Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rodrigo Ortega Polo 6 Lethbridge Research and Development Centre, Agriculture and Agri-Food Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Rodrigo Ortega Polo Quan Long 1 Department of Biochemistry and Molecular Biology, Cumming School of Medicine, University of Calgary AB , Canada 2 The Mathison Centre for Mental Health Research and Education, Hotchkiss Brain Institute, Cumming School of Medicine, University of Calgary , Calgary, AB, Canada 3 Department of Mathematics and Statistics, Faculty of Science, University of Calgary , Canada 4 Alberta Children’s Hospital Research Institute , Canada 5 Department of Medical Genetics, University of Calgary , Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jingni He 1 Department of Biochemistry and Molecular Biology, Cumming School of Medicine, University of Calgary AB , Canada 7 Department of Neuroscience, School of Translational Medicine, Faculty of Medicine, Nursing and Health Sciences, Monash University , Melbourne, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record 
for Jingni He For correspondence: David.enoma{at}ucalgary.ca jingni.he{at}monash.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract The explosion of genomic and multi-omics data has created a need for scalable, reproducible tools that integrate functional annotations into genome-wide association studies (GWAS). We introduce multi-omics data bridged Kernel Association test (MOKA) pipeline, a Snakemake-based workflow that automates SNP-set kernel-based association testing by incorporating multi-omics data, including gene expression, transcription factor binding, evolutionary conservation scores and neural network-derived features. This data-bridged architecture enhances variant prioritization and aggregation, improving statistical power in GWAS. MOKA supports population structure correction via spectral decomposition, parallel computation, and post-GWAS analyses, including visualization, Gene Ontology annotation, pathway enrichment, and validation. As a use case, we applied MOKA to a schizophrenia GWAS cohort, identified 89 Bonferroni-significant genes, with a 15.7% validation rate in disease-specific DisGeNET database and enrichment in pathways relevant to neuropsychiatric disease. MOKA provides a robust, scalable, and extensible framework for functional multi-omics integration in genetic studies. It is open-source and available at https://github.com/davidenoma/moka . 1. Introduction Genome-wide association studies (GWAS) have been instrumental in identifying genetic variants associated with complex traits and diseases ( Wu et al. 2010 ; Dehghan 2018 ). However, there is still a large proportion of unexplained heritability, particularly due to limitations in detecting the cumulative effects of causal variants ( Manolio et al. 2009 ). Leveraging external and functional multi-omics data has shown promise in improving the power of GWAS by enabling more informed variant selection and aggregation ( Wu et al. 2010 ). 
Our previous research efforts have developed and applied the kernel-based association tests with the data-bridged architecture ( Cao et al. 2021 ), which integrates external multi-omics data to guide variant prioritization and aggregation for downstream association mapping. This framework has been successfully applied to various data modalities, including gene expression ( Cao et al. 2022a ), brain imaging-derived phenotypes ( He et al. 2024 ), transcription factor occupancy ( He et al. 2022 ), and transcription factor binding-informed trans-variants ( He et al. 2025 ). Additionally, a database resource has been developed to disseminate genome-wide analysis results ( Cao et al. 2022b ). As the scale and diversity of genomic and multi-omics datasets continue to grow, there is an increasing demand for robust, scalable, and reproducible tools to streamline data processing and analysis. In bioinformatics, workflow management systems such as Snakemake ( Koster and Rahmann 2018 ) have gained popularity due to their reproducibility, scalability, and ease of integration with high-performance computing environments. Despite the increasing complexity of the multi-omics data analysis ( Cooper and Shendure 2011 ), there is a lack of automated pipelines tailored for kernel-based association tests, a class of statistical methods that model the joint effects of multiple variants in genetic studies. To address this gap, we introduce the Multi-Omics Kernel-based Association (MOKA) pipeline, a fully automated analysis workflow built on the Snakemake workflow management system ( Koster and Rahmann 2018 ). MOKA enables researchers and scientists to efficiently leverage the data-bridged kernel-based association tests in their GWAS datasets, boosting statistical power through the integration of diverse multi-omics data. By leveraging Snakemake’s widespread adoption, extensive documentation, and active user community, MOKA ensures reproducibility, scalability, and ease of customization. 
This pipeline streamlines the entire analysis process, from multi-omics data integration and kernel-based association testing (with correction for population structure) to result visualization, biological annotation, disease database validation, Gene Ontology enrichment, and pathway analysis. Additionally, this Snakemake pipeline facilitates the harmonized use of tools from both Python and R, bridging distinct computational environments. Given the heterogeneity of data types and the complexity of parameter tuning in post-GWAS analysis, MOKA offers a robust and user-friendly solution for conducting comprehensive multi-omics association studies. 2. Methods 2.1. Design of MOKA The MOKA pipeline is implemented using the Snakemake workflow management system ( Koster and Rahmann 2018 ). Installation requires only a few simple steps, as outlined in the MOKA online documentation ( https://github.com/davidenoma/moka ). 2.2. Configuration, Data and software requirements MOKA requires input genotype data in PLINK binary format (bed, bim, fam)( Purcell et al. 2007 ). Configuration is handled through a YAML file (config/config.yaml, see Table 1 ), which specifies input files, parameters, and auxiliary scripts. To optimize computational efficiency, MOKA supports chromosome-level parallelization by invoking GNU Parallel ( Tange 2018 ) within a single Snakemake rule. Gene regions are defined as ±500 kb from gene boundaries based on the hg38 human genome reference ( Mudge et al. 2025 ). View this table: View inline View popup Download powerpoint Table 1. Configuration table for MOKA (YAML format). 2.3. Multi-omics data bridge Multi-omics data sources include user-defined functional genomic annotations at the nucleotide level, capturing biologically relevant features ( Figure 1A ), such as cis-regulatory variants, transcription factor binding ( He et al. 2022 ; He et al. 2025 ), evolutionary conservation scores ( Hubisz and Pollard 2014 ), gene expression changes ( Li et al. 
2024 ), imaging-derived weights, neural network-based ( Li et al. 2023 ) approaches ( Enoma et al. 2022 ), and others that promise to uncover disease-associated variants and genes ( Cooper and Shendure 2011 ). All input weights are uniformly formatted across SNPs for integration into kernel models, with weight types customizable by the user. Download figure Open in new tab Figure 1. MOKA pipeline outlining the processes, including A. Diverse multi-omics data sources include variant-specific neural network weights, gene expression, conservation scores, brain image weights and regulatory element weights. B. Multi-omics bridged association test on GWAS data. C. Gene Ontology and KEGG pathway enrichment analyses. D. Visualization of results in a Manhattan plot. E. DisGeNET disease database validation ratio of causal genes. 2.4. Kernel-based association testing with multi-omics weights Within the input GWAS dataset, each SNP set or gene region is tested using a kernel-based method, which flexibly models epistatic and nonlinear effects of SNPs ( Wu et al. 2010 ). For each individual, a weighted kernel is constructed using the SNPs and the corresponding data-bridged weights ( Figure 1B ): $$K_W = X W_{mo} X^{T}$$ where $X$ is the genotype matrix for the SNPs in each gene region and $W_{mo}$ is a diagonal matrix defined as $W_{mo} = \mathrm{diag}(w_1, \ldots, w_p)$, with $p$ being the number of selected variants. The weights $w_1, \ldots, w_p$ are derived from multi-omics data integration. This data-bridged weighted kernel ( $K_W$ ) is used in the association analysis implemented through a kernel-based test. The test statistic is defined as: $$Q = Y^{T} K_W Y$$ where $Y$ is the vector of phenotype values from the GWAS dataset, and $K_W$ is the weighted kernel matrix defined above. Under the null hypothesis, $Q$ follows a mixture of chi-squared distributions. The Q-score test statistic evaluates whether the SNPs within the gene regions contribute to the observed phenotypic variation. 
The significance of the association for each gene region is assessed by calculating the p-value at a 0.05 threshold (prior to multiple testing correction), based on the null distribution of Q. 2.5. Data transformation to control population structure Uneven genetic relatedness will cause population structure, leading to inflated test statistics ( Kang et al. 2010 ). Linear Mixed Models are usually used for controlling this in single-SNP analysis ( Lippert et al. 2011 ). However, here we are carrying out a gene-based set test that aggregates SNPs within the gene region. To handle this problem, we use a transformation based on the decomposition of the GRM. Previous work for both GWAS ( Long et al. 2013 ) and gene expression analysis ( Long et al. 2016 ) used the same decomposition to decorrelate artifacts caused by uneven genetic relatedness, and we implement it as a feature in the MOKA Snakemake pipeline ( https://github.com/davidenoma/moka ). Assume the linear mixed model without the fixed effect of a SNP takes the form: $$Y = z r + g + \varepsilon, \quad g \sim N(0, \sigma_g^2 G), \quad \varepsilon \sim N(0, \sigma_e^2 I)$$ where $Y$ is the phenotype vector with $n$ samples, $z$ is the covariate matrix, and $r$ is the vector of fixed effect sizes. $g$ represents the random genetic effects, with $\sigma_g^2$ denoting the genetic variance and $G$ being the genomic relationship matrix (GRM). $\varepsilon$ denotes the residual effects, and $\sigma_e^2$ is the residual variance. Then, the variance of $Y$ is given by $\mathrm{Var}(Y) = \sigma_g^2 G + \sigma_e^2 I$, and the unknown parameters can be efficiently estimated using methods such as Restricted Maximum Likelihood (REML) ( Lippert et al. 2011 ; Yang et al. 2011a ). Residual structure in $G$ can inflate the genome-wide test statistics, a phenomenon usually summarised by the genomic inflation factor $\lambda_{GC}$, defined as the ratio of the median observed chi-square statistic to its null expectation ( Devlin and Roeder 1999 ). We denote the eigendecomposition of the GRM as $G = U S U^{T}$, where $U$ is an orthogonal matrix whose columns are the eigenvectors of $G$, and $S$ is a diagonal matrix containing the corresponding eigenvalues. 
Substituting this into $\mathrm{Var}(Y)$, with $\sigma_g^2$ and $\sigma_e^2$ denoting the genetic and residual variances, we obtain: $$\mathrm{Var}(Y) = \sigma_g^2 U S U^{T} + \sigma_e^2 I = U (\sigma_g^2 S + \sigma_e^2 I) U^{T}$$ Next, we define a transformation matrix $T = (\sigma_g^2 S + \sigma_e^2 I)^{-1/2} U^{T}$. Subsequently, the transformed variance of $Y$ becomes: $$\mathrm{Var}(TY) = T \, \mathrm{Var}(Y) \, T^{T} = (\sigma_g^2 S + \sigma_e^2 I)^{-1/2} U^{T} U (\sigma_g^2 S + \sigma_e^2 I) U^{T} U (\sigma_g^2 S + \sigma_e^2 I)^{-1/2} = I$$ The last step holds because $U^{T} U = I$ and $\sigma_g^2 S + \sigma_e^2 I$ is a diagonal matrix. Now, we apply this transformation to both the phenotype $Y$ (with $n$ samples) and the genotype $X$ (with $n$ samples and $p$ variants), defining $\tilde{Y} = TY$ and $\tilde{X} = TX$. This transformation ensures that the covariance matrix of $\tilde{Y}$ is the identity matrix, i.e., $\mathrm{Cov}(\tilde{Y}) = I$. Applying the MOKA test to the spectrally transformed data, we compute the test statistic: $$\tilde{Q} = \tilde{Y}^{T} \tilde{K} \tilde{Y}$$ where $\tilde{K} = \tilde{X} W_{mo} \tilde{X}^{T}$ is the kernel matrix and $W_{mo}$ is a weight matrix that reflects the multi-omics weight-specific contribution of each variant used by MOKA. Spectral decomposition is available as a default functionality in MOKA, which can be toggled on or off in the configuration file of the pipeline (see Table 1 ). 2.6. Genomic inflation factor (λGC) calculation Following Devlin and Roeder (1999) , we first convert each gene-level p-value to its equivalent $\chi^2$ statistic via the inverse cumulative-distribution function (inverse CDF) of a $\chi^2$ distribution with one degree of freedom. $\mathrm{median}(\chi^2_{obs})$ denotes the median of these observed statistics. We then divide this value by the theoretical median of a $\chi^2$ distribution with one degree of freedom, $\mathrm{median}(\chi^2_{1})$: $$\lambda_{GC} = \frac{\mathrm{median}(\chi^2_{obs})}{\mathrm{median}(\chi^2_{1})}$$ Because the denominator is approximately 0.455 by theory, λGC ≈ 1 indicates well-calibrated test statistics, whereas λGC > 1.4 signals genomic inflation arising from unmodelled population structure or other confounding effects ( Yang et al. 2011b ); however, they show that even with perfect population structure matching, λGC rises roughly in proportion to polygenicity, so gene-level statistics with many contributing variants can have values well above 1 without implying confounding. 2.7. 
Post-GWAS annotation and Gene Set Enrichment The output structure is organized into easily navigable folders, including “output_plots” and “result_folder.” KEGG pathway enrichment (snakemake --cores 1 kegg_pathway_analysis) is performed using PathfindR ( Ulgen et al. 2019 ) package, and the Gene Ontology analysis (snakemake --cores 1 go_analysis) is performed with g:Profiler ( Reimand et al. 2007 ) package ( Figure 1C ). DisGeNET ( Pinero et al. 2017 ) is a database encompassing 1,134,942 gene-disease associations (GDAs) involving 21,671 genes and 30,170 traits. DisGeNET provides specific summaries for each disease within this platform, detailing gene associations and information on identified significant genes after multiple testing corrections and the proportions associations are in the database ( Figure 1E ) (snakemake --cores 1 disgenet_annotation_005). 3. Results 3.1. Configuration of MOKA and multi-omics data bridge In this demonstration, we used 17-way human and primates accelerated conservation scoring ( Siepel et al. 2005 ; Pollard et al. 2010 ) as weights ( W mo ) for each variant. The acceleration measures quantify the extent of human-specific sequence change, such as sites that have diverged more rapidly than expected under neutrality. Negative phyloP ( Pollard et al. 2010 ) scores already encode acceleration as −log 10 p from a likelihood-ratio test, so negative values (e.g., phyloP ≤ −3, corresponding to p ≤ 10 -3 ) signal human-specific evolution and associated disorder. These annotations are of particular interest in schizophrenia, a human-complex disease ( Doan et al. 2016 ; Levchenko et al. 2018 ), providing a biologically informed basis for SNP aggregation. In the neural-network weights’ data bridge, weights ( W mo ) were obtained by taking the element-wise sum of the variant-specific encoder and decoder weight matrices from a variational auto-encoder (VAE) ( Kingma and Welling 2013 ) model as previously described by (Enoma D). 
The network was trained on the input genotype data, so after tuning and convergence the neural network weights learnt in the training process of the latent representations for reconstruction are extracted ( Unterthiner et al. 2020 ; Herrmann et al. 2024 ). 3.2. Association results Figure 2 presents a Manhattan plot of association results using negative conservation score–based weights, with genes on chromosomes 1–22 plotted against $-\log_{10}(p)$. The Bonferroni-corrected significance threshold (p = 2.64e-06) is shown as a horizontal line. Using human accelerated conservation scores as weights, 89 genes were significantly associated with schizophrenia. The top-associated gene, TMEM17, encodes a transmembrane protein involved in ciliogenesis and neural signaling, and has previously been implicated in schizophrenia ( Bigdeli et al. 2021 ). Download figure Open in new tab Figure 2. Example Manhattan plot of Association results. 3.3. External Database Validation For validation, we used the DisGeNET disease-specific database as a reference database (Schizophrenia in this case). The validation ratio was calculated as the proportion of FDR-significant genes at p < 0.05 ( Benjamini and Hochberg 1995 ) overlapping with DisGeNET entries, i.e., the number of significant genes found in DisGeNET divided by the total number of significant genes. Applying the MOKA association test to the schizophrenia GWAS dataset resulted in a 15.7% validation ratio, with 351 associated genes present in DisGeNET. In comparison, SKAT( Wu et al. 2010 ) yielded a 13.3% validation ratio with 2,530 genes (genomic-inflation factor of λGC ≈ 113), and REGENIE( Mbatchou et al. 2021 ) yielded 12% with 115 genes validated. 3.4. Spectral decomposition and genomic inflation The MOKA pipeline incorporates spectral decomposition of the variance component to correct inflation in kernel-based tests, improving calibration without sacrificing biological relevance from functional weights (see Methods). 
In our schizophrenia GWAS, SKAT( Wu et al. 2010 ) yielded a highly inflated genomic inflation factor of λGC ≈ 113, indicating substantial confounding and false positives. REGENIE’s whole-genome mixed model reduced this to ≈ 4.7, but some stratification remains. Using phyloP acceleration scores in MOKA (MOKA-Ph) reduced SKAT’s inflation to 10.5. Applying MOKA’s spectral correction (MOKA-Ph-COR) further lowered λGC to ≈ 4.3. While still above the ideal benchmark of <1.2, this matches REGENIE’s calibration while retaining mechanistically informed weighting for improved power. The cohort-wide comparison ( Table 2 ) shows SKAT inflates the test statistics (λGC ≈ 6.9–112.8), while REGENIE reduced this to 3.3–4.7. Incorporating encoder–decoder VAE weights in MOKA (MOKA-ED) reduces the inflation to 4.0–10.1, but it remains above REGENIE. The decisive improvement comes from the spectral decomposition of the GRM: MOKA-ED-COR further reduces inflation to 2.5–4.6. In schizophrenia, λGC drops from 113 (SKAT) to 10.1 (MOKA-ED) and 4.6 (MOKA-ED-COR), comparable to REGENIE. These findings demonstrate that MOKA with spectral correction yields better-calibrated, biologically informed association tests, although additional adjustment for population structure may still be beneficial. View this table: View inline View popup Download powerpoint Table 2. Genomic-inflation factor (λGC) across five cohorts for four association methods (REGENIE( Mbatchou et al. 2021 ); SKAT( Wu et al. 2010 ); MOKA-ED = MOKA with encoder–decoder neural-network weights; MOKA-ED-COR = MOKA-ED with spectral (GRM) decomposition) 3.5. Gene Ontology analysis Gene Ontology analysis in MOKA ( Figure 3 ) of FDR-significant results (p < 0.05) identified top terms for cellular component (cytoplasm, p = 2.0e-31), molecular function (protein binding, p = 5.0e-28), and biological process (anatomical structure development, p = 1.6e-12) for schizophrenia, respectively. Download figure Open in new tab Figure 3. 
Example Gene Ontology Enrichment results. 3.6. KEGG Pathway enrichment KEGG pathway enrichment analysis in MOKA ( Figure 4 ) of FDR-significant results (p < 0.05) identified several significantly enriched pathways. The top five pathways were Proteasome (p = 1.0e-11), Human Cytomegalovirus Infection (p = 2.0e-10), Nucleocytoplasmic Transport (p = 4.0e-9), prion disease (p = 8.0e-9), and T Cell Receptor Signaling Pathway (p = 1.2e-8). Download figure Open in new tab Figure 4. Example KEGG pathway enrichment results 4. Conclusion MOKA is a scalable, automated Snakemake pipeline that enhances GWAS by integrating multi-omics data through SNP-set kernel association tests and applying spectral decomposition to control genomic inflation and population structure. It supports comprehensive post-GWAS analyses and visualization. In the schizophrenia GWAS, SKAT produced extreme inflation (λGC ≈ 113) with a 13.3% validation rate. MOKA using negative phyloP scores reduced λGC to 10.5 and improved validation to 15.7%. Activating the decorrelation step (“decor-phylo”) further reduced inflation to 4.3. Across WTCCC and schizophrenia datasets, SKAT yielded λGC values between 6.9 and 113, while MOKA with correction reduced this to 2.5–4.6, comparable to REGENIE. Liu et al.( Liu et al. 2013 ) demonstrated that gene-based tests are susceptible to inflation from gene length and allele-frequency heterogeneity, and that single-SNP genomic control can be overly conservative. MOKA addresses these challenges by providing a robust, flexible, and user-friendly framework for multi-omics integration and kernel-based association analysis, accessible to the broader research community. 5. 
Data and Code availability The input genotype data are from the Molecular Genetics of Schizophrenia - nonGAIN Sample (MGS_nonGAIN) (dbGaP Study Accession: phs000167.v1.p1), available at https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000167.v1.p1 , and from the Wellcome Trust Case Control Consortium (WTCCC GWAS)( Wellcome Trust Case Control 2007 ); these datasets are in the public domain. The code of the MOKA pipeline is freely available on GitHub under the MIT license and can be found at https://github.com/davidenoma/moka . The Conjoint Health Research Ethics Board (CHREB) at the University of Calgary approved this work with ID REB23-0045_REN. The PhyloP( Pollard et al. 2010 ) conservation scores for each variant position may be downloaded from the UCSC genome browser( Nassar et al. 2023 ). 6. Funding This work is partly supported by the Mathison Centre Graduate Recruitment Scholarship (D.E.), ACHRI Graduate Scholarship (D.E.), the Alberta Innovates Graduate Scholarship (A.K. and D.W.), the Indigenous and Black Momentum Scholarship in Science (A.K.), and the Agriculture Funding Consortium through the Western Grains Research Foundation (D.E. and R.O.P.). 7. Conflict of Interest The authors declare that they have no competing interests. 8. Acknowledgements We want to acknowledge group members of the Quan Long Lab (Cumming School of Medicine, University of Calgary) who developed models showing the utility of the data-bridged architecture, which is now automated in the Multi-omics kernel-based association testing (MOKA) pipeline to promote ease of use and reproducibility. 9. References ↵ Benjamini , Y. , and Y. Hochberg , 1995 Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing . Journal of the Royal Statistical Society: Series B (Methodological) 57 ( 1 ): 289 – 300 . OpenUrl CrossRef PubMed Web of Science ↵ Bigdeli , T.B. , A.H. Fanous , Y. Li , N. Rajeevan , F. Sayward et al. 
, 2021 Genome-Wide Association Studies of Schizophrenia and Bipolar Disorder in a Diverse Cohort of US Veterans . Schizophr Bull 47 ( 2 ): 517 – 529 . OpenUrl CrossRef PubMed ↵ Cao , C. , P. Kossinna , D. Kwok , Q. Li , J. He et al. , 2022a Disentangling genetic feature selection and aggregation in transcriptome-wide association studies . Genetics 220 ( 2 ). ↵ Cao , C. , D. Kwok , S. Edie , Q. Li , B. Ding et al. , 2021 kTWAS: integrating kernel machine with transcriptome-wide association studies improves statistical power and reveals novel genes . Brief Bioinform 22 ( 4 ). ↵ Cao , C. , J. Wang , D. Kwok , F. Cui , Z. Zhang et al. , 2022b webTWAS: a resource for disease candidate susceptibility genes identified by transcriptome-wide association study . Nucleic Acids Res 50 ( D1 ): D1123 – D1130 . OpenUrl PubMed ↵ Cooper , G.M. , and J. Shendure , 2011 Needles in stacks of needles: finding disease-causal variants in a wealth of genomic data . Nat Rev Genet 12 ( 9 ): 628 – 640 . OpenUrl CrossRef PubMed ↵ Dehghan , A. , 2018 Genome-Wide Association Studies . Methods Mol Biol 1793 : 37 – 49 . OpenUrl CrossRef PubMed ↵ Devlin , B. , and K. Roeder , 1999 Genomic control for association studies . Biometrics 55 ( 4 ): 997 – 1004 . OpenUrl CrossRef PubMed Web of Science ↵ Doan , R.N. , B.I. Bae , B. Cubelos , C. Chang , A.A. Hossain et al. , 2016 Mutations in Human Accelerated Regions Disrupt Cognition and Social Behavior . Cell 167 ( 2 ): 341 – 354 e312 . OpenUrl CrossRef PubMed Enoma D , K.K.A., Chernenkoff S and Long Q. , Representation learning bridged association mapping in complex disorders [version 1; not peer reviewed] . F1000Research 2024 , 13 : 1339 (poster) ( doi: 10.7490/f1000research.1119990.1 ). OpenUrl CrossRef ↵ Enoma , D.O. , J. Bishung , T. Abiodun , O. Ogunlana , and V.C. Osamor , 2022 Machine learning approaches to genome-wide association studies . Journal of King Saud University - Science 34 ( 4 ): 101847 . OpenUrl ↵ He , J. , L. Antonyan , H. Zhu , K. 
Ardila , Q. Li et al. , 2024 A statistical method for image-mediated association studies discovers genes and pathways associated with four brain disorders . Am J Hum Genet 111 ( 1 ): 48 – 69 . OpenUrl CrossRef PubMed ↵ He , J. , D. Perera , W. Wen , J. Ping , Q. Li et al. , 2025 Enhancing disease risk gene discovery by integrating transcription factor-linked trans-variants into transcriptome-wide association analyses . Nucleic Acids Res 53 ( 1 ). ↵ He , J. , W. Wen , A. Beeghly , Z. Chen , C. Cao et al. , 2022 Integrating transcription factor occupancy with transcriptome-wide association analysis identifies susceptibility genes in human cancers . Nat Commun 13 ( 1 ): 7118 . OpenUrl PubMed ↵ Herrmann , V. , F. Faccio , and J. Schmidhuber , 2024 Learning Useful Representations of Recurrent Neural Network Weight Matrices . ArXiv abs/2403.11998 . ↵ Hubisz , M.J. , and K.S. Pollard , 2014 Exploring the genesis and functions of Human Accelerated Regions sheds light on their role in human evolution . Curr Opin Genet Dev 29 : 15 – 21 . OpenUrl CrossRef PubMed ↵ Kang , H.M. , J.H. Sul , S.K. Service , N.A. Zaitlen , S.Y. Kong et al. , 2010 Variance component model to account for sample structure in genome-wide association studies . Nat Genet 42 ( 4 ): 348 – 354 . OpenUrl CrossRef PubMed Web of Science ↵ Kingma , D.P. , and M. Welling , 2013 Auto-Encoding Variational Bayes . CoRR abs/1312.6114 . ↵ Koster , J. , and S. Rahmann , 2018 Snakemake-a scalable bioinformatics workflow engine . Bioinformatics 34 ( 20 ): 3600 . OpenUrl CrossRef PubMed ↵ Levchenko , A. , A. Kanapin , A. Samsonova , and R.R. Gainetdinov , 2018 Human Accelerated Regions and Other Human-Specific Sequence Variations in the Context of Evolution and Their Relevance for Brain Development . Genome Biol Evol 10 ( 1 ): 166 – 188 . OpenUrl CrossRef PubMed ↵ Li , Q. , J. Bian , Y. Qian , P. Kossinna , C. Gau et al. , 2024 An expression-directed linear mixed model discovering low-effect genetic variants . 
Genetics 226 ( 4 ). ↵ Li , Q. , Y. Yu , P. Kossinna , T. Lun , W. Liao et al. , 2023 XA4C: eXplainable representation learning via Autoencoders revealing Critical genes . PLoS Computational Biology 19 ( 10 ): e1011476 . OpenUrl ↵ Lippert , C. , J. Listgarten , Y. Liu , C.M. Kadie , R.I. Davidson et al. , 2011 FaST linear mixed models for genome-wide association studies . Nat Methods 8 ( 10 ): 833 – 835 . OpenUrl CrossRef PubMed Web of Science ↵ Liu , Q. , D.L. Nicolae , and L.S. Chen , 2013 Marbled inflation from population structure in gene-based association studies with rare variants . Genet Epidemiol 37 ( 3 ): 286 – 292 . OpenUrl CrossRef PubMed ↵ Long , Q. , C. Argmann , S.M. Houten , T. Huang , S. Peng et al. , 2016 Inter-tissue coexpression network analysis reveals DPP4 as an important gene in heart to blood communication . Genome Med 8 ( 1 ): 15 . OpenUrl CrossRef PubMed ↵ Long , Q. , Q. Zhang , B.J. Vilhjalmsson , P. Forai , U. Seren et al. , 2013 JAWAMix5: an out-of-core HDF5-based java implementation of whole-genome association studies using mixed models . Bioinformatics 29 ( 9 ): 1220 – 1222 . OpenUrl CrossRef PubMed Web of Science ↵ Manolio , T.A. , F.S. Collins , N.J. Cox , D.B. Goldstein , L.A. Hindorff et al. , 2009 Finding the missing heritability of complex diseases . Nature 461 ( 7265 ): 747 – 753 . OpenUrl CrossRef PubMed Web of Science ↵ Mbatchou , J. , L. Barnard , J. Backman , A. Marcketta , J.A. Kosmicki et al. , 2021 Computationally efficient whole-genome regression for quantitative and binary traits . Nat Genet 53 ( 7 ): 1097 – 1103 . OpenUrl CrossRef PubMed ↵ Mudge , J.M. , S. Carbonell-Sala , M. Diekhans , J.G. Martinez , T. Hunt et al. , 2025 GENCODE 2025: reference gene annotation for human and mouse . Nucleic Acids Res 53 ( D1 ): D966 - D975 . OpenUrl CrossRef PubMed ↵ Nassar , L.R. , G.P. Barber , A. Benet-Pages , J. Casper , H. Clawson et al. , 2023 The UCSC Genome Browser database: 2023 update . 
Nucleic Acids Res 51 ( D1 ): D1188 - D1195 . OpenUrl CrossRef PubMed ↵ Pinero , J. , A. Bravo , N. Queralt-Rosinach , A. Gutierrez-Sacristan , J. Deu-Pons et al. , 2017 DisGeNET: a comprehensive platform integrating information on human disease-associated genes and variants . Nucleic Acids Res 45 ( D1 ): D833 - D839 . OpenUrl CrossRef PubMed ↵ Pollard , K.S. , M.J. Hubisz , K.R. Rosenbloom , and A. Siepel , 2010 Detection of nonneutral substitution rates on mammalian phylogenies . Genome Res 20 ( 1 ): 110 – 121 . OpenUrl Abstract / FREE Full Text ↵ Purcell , S. , B. Neale , K. Todd-Brown , L. Thomas , M.A. Ferreira et al. , 2007 PLINK: a tool set for whole-genome association and population-based linkage analyses . Am J Hum Genet 81 ( 3 ): 559 – 575 . OpenUrl CrossRef PubMed ↵ Reimand , J. , M. Kull , H. Peterson , J. Hansen , and J. Vilo , 2007 g:Profiler--a web-based toolset for functional profiling of gene lists from large-scale experiments . Nucleic Acids Res 35 ( Web Server issue ): W193 – 200 . OpenUrl CrossRef PubMed Web of Science ↵ Siepel , A. , G. Bejerano , J.S. Pedersen , A.S. Hinrichs , M. Hou et al. , 2005 Evolutionarily conserved elements in vertebrate, insect, worm, and yeast genomes . Genome Res 15 ( 8 ): 1034 – 1050 . OpenUrl Abstract / FREE Full Text ↵ Tange , O. , 2018 GNU Parallel 2018. In GNU Parallel 2018 (p. 112 ). ↵ Ulgen , E. , O. Ozisik , and O.U. Sezerman , 2019 pathfindR: An R Package for Comprehensive Identification of Enriched Pathways in Omics Data Through Active Subnetworks . Front Genet 10 : 858 . OpenUrl CrossRef PubMed ↵ Unterthiner , T. , D. Keysers , S. Gelly , O. Bousquet , and I.O. Tolstikhin , 2020 Predicting Neural Network Accuracy from Weights . ArXiv abs/2002.11448 . ↵ Wellcome Trust Case Control Consortium , 2007 Genome-wide association study of 14,000 cases of seven common diseases and 3,000 shared controls . Nature 447 ( 7145 ): 661 – 678 . OpenUrl CrossRef PubMed Web of Science ↵ Wu , M.C. , P. Kraft , M.P. Epstein , D.M.
Taylor , S.J. Chanock et al. , 2010 Powerful SNP-set analysis for case-control genome-wide association studies . Am J Hum Genet 86 ( 6 ): 929 – 942 . OpenUrl CrossRef PubMed Web of Science ↵ Yang , J. , S.H. Lee , M.E. Goddard , and P.M. Visscher , 2011a GCTA: a tool for genome-wide complex trait analysis . Am J Hum Genet 88 ( 1 ): 76 – 82 . OpenUrl CrossRef PubMed ↵ Yang , J. , M.N. Weedon , S. Purcell , G. Lettre , K. Estrada et al. , 2011b Genomic inflation factors under polygenic inheritance . Eur J Hum Genet 19 ( 7 ): 807 – 812 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted July 07, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following MOKA: A pipeline for multi-omics bridged SNP-set kernel association test Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share MOKA: A pipeline for multi-omics bridged SNP-set kernel association test David Enoma , Dinghao Wang , Ariel Ghislain Kemogne Kamdoum , Rodrigo Ortega Polo , Quan Long , Jingni He medRxiv 2025.07.06.25330974; doi: https://doi.org/10.1101/2025.07.06.25330974 Share This Article: Copy Citation Tools MOKA: A pipeline for multi-omics bridged SNP-set kernel association test David Enoma , Dinghao Wang , Ariel Ghislain Kemogne Kamdoum , Rodrigo Ortega Polo , Quan Long , Jingni He medRxiv 2025.07.06.25330974; doi: https://doi.org/10.1101/2025.07.06.25330974 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4411) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15205) Forensic Medicine (30) Gastroenterology (1119) Genetic and Genomic Medicine (6575) Geriatric Medicine (666) Health Economics (994) Health Informatics (4511) Health Policy (1365) Health Systems and Quality Improvement (1608) Hematology (537) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15903) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6573) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1139) Occupational and Environmental Health (954) Oncology (3319) Ophthalmology (968) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (662) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5423) Public and Global Health (9205) Radiology and Imaging (2191) Rehabilitation Medicine and Physical Therapy 
(1367) Respiratory Medicine (1191) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9feca8f01872ad07',t:'MTc3OTI5MzkzNA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.