Full text
42,408 characters
· extracted from
preprint-html
· click to expand
Data-Driven Symbolic Higher-Order Epistasis Discovery with Kolmogorov-Arnold Networks | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Data-Driven Symbolic Higher-Order Epistasis Discovery with Kolmogorov-Arnold Networks View ORCID Profile Oankar R. Patil , Kamran Shazand , Benoit Marteau , Yuxuan Shen , May D. Wang doi: https://doi.org/10.1101/2025.10.31.685894 Oankar R. Patil 1 Georgia Institute of Technology , Atlanta, GA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Oankar R. Patil For correspondence: opatil31{at}gatech.edu Kamran Shazand 2 Shriners Children’s Genomics, Research Institute , Tampa, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Benoit Marteau 1 Georgia Institute of Technology , Atlanta, GA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yuxuan Shen 1 Georgia Institute of Technology , Atlanta, GA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site May D. Wang 1 Georgia Institute of Technology , Atlanta, GA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Preview PDF Abstract Many human diseases are polygenic conditions that arise from a complex interplay of interactions between multiple genes at different loci, but currently most Genome-Wide Association Studies (GWAS) largely only consider the main additive effects of single nucleotide polymorphisms (SNPs), resulting in a missing heritability problem in some complex traits. Identifying non-additive interactions, or epistasis, at a higher-order could aid in filling this gap, but it is computationally difficult due to the massive search space involved. Current epistasis detection approaches struggle with noncartesian higher order interactions and lack inherent explainability. We present a novel deep learning (DL) approach, EPIstasis Discovery with Kolmogorov-Arnold Networks (EPIK), a data-driven, modular, and symbolically representable framework. We also introduce a novel approach for higher-order XOR (a non-Cartesian type) interaction detection, utilized in EPIK’s XOR detection module. EPIK slightly outperforms other DL approaches on simulated pure epistasis interactions benchmark in average F1 score. It outperforms other, general, traditional epistasis detection approaches on simulated mixed epistasis detection datasets and real-world GWAS datasets of Arabidopsis Thaliana. Finally, EPIK recovers a known gene interaction between MAPT and WNT3 for Parkinson’s Disease (PD) while also suggesting a more complex interaction between MAPT, WNT3, and another gene, KANSL1. 1 Introduction Many human diseases, including Parkinson’s disease, are polygenic and shaped by interactions between multiple genes across loci, known as epistasis [ 7 , 28 ]. Epistasis is a key candidate for the “missing heritability” problem in complex traits, yet most Genome-Wide Association Studies (GWAS) focus only on additive effects of single nucleotide polymorphisms (SNPs) [ 19 ]. Detecting higherorder, non-additive interactions is computationally challenging: for n SNPs, evaluating all possible interactions up to order m requires combinations, an o ( n m ) operation. The high dimensionality of GWAS data further compounds this issue by invoking the curse of dimensionality [ 6 ]. Recent work has pursued non-exhaustive search strategies, particularly deep learning (DL) methods, for efficient higher-order epistasis detection. These include multilayer perceptrons (MLPs) with biological priors, layer-wise relevance propagation (e.g., Deep-COMBI) [ 26 ] [ 17 ], convolutional and transformer-based models [ 14 ] [ 10 ], and relief-based algorithms [ 8 ]. While promising, these approaches remain black-box models and struggle with non-Cartesian interactions such as XOR, which can model more relevant interactions than additive or multiplicative models in some cases [ 2 ]. Kolmogorov-Arnold Networks (KANs) offer a potential solution. Unlike MLPs with fixed activations, KANs use learnable B-spline functions on edges, enabling compact parameterization, improved generalization, and intrinsic explainability [ 16 ]. Their design is inspired by the Kolmogorov-Arnold representation theorem, which decomposes multivariate functions into sums of univariate functions, aligning well with sparse compositional structures such as true causal SNPs embedded within noisy features. Importantly, KANs yield interpretable symbolic or graphical representations of detected interactions. We propose EPIstasis discovery with Kolmogorov-Arnold Networks (EPIK), a modular, data-driven framework for uncovering higher-order epistasis from SNP datasets with intrinsic symbolic explainability. We evaluate EPIK on simulated epistasis and real GWAS datasets, comparing it against state-of-the-art (SOTA) methods, and apply it to Parkinson’s disease. EPIK successfully recovers a known MAPT–WNT3 interaction in Parkinson’s Disease (PD) and suggests a novel three-way interaction that includes another gene, KANSL1. Our contributions are as follows: A novel polynomial-time approach for efficient higherorder (≥ 4) XOR epistasis detection, improving performance on non-Cartesian interactions. To the best of our knowledge, this is the first application of KANs to epistasis discovery, providing interpretable symbolic and graphical representations while outperforming or matching other non intrinsically-interpretable approaches. 2 Methodology 2.1 Datasets 2.1.1 Simulated Pure Epistasis Datasets We utilized a publicly available collection of datasets for higher-order pure epistasis detection, utilizing the synthetic test datasets seen in [ 10 ], which were generated utilizing PyToxo [ 9 ] for penetrance table generation and GAMETES [ 25 ] for simulating interacting SNP combinations that satisfy Minor Allele Frequency (MAF), heritability, number of SNPs, sample size, control vs case ratio parameters. Such simulated data produced falls under pure, strict epistasis models, being worst-case scenarios for disease association due to only being observable if all k-loci are included. There were a total of 25 datasets to evaluate performance in a wide variety of k-way epistasis detection scenarios, covering Additive, Multiplicative, Thresholding, XOR interactions. 2.1.2 Simulated Realistic Mixed Epistasis Datasets To evaluate performance on mixed epistasis interaction types/realistic genetic noise, we used the “msprime” population genetic simulator [ 3 ] and simulated the genetic ancestries of 1,600 individuals (800 AFR, 400 EUR, 400 ASN) with recombination/mutations. We implemented penetrance tables for two mixed epistasis scenarios and adding biallelic mutation, generating two datasets with varying MAFs, each containing ∼93,500 SNPS. The first contained 2-way Multiplicative (MAF: 0.17, 0.41), 2-way Recessive (MAF: 0.12, 0.31), 2-way XOR (MAF: 0.25, 0.23) with 780 Cases/820 Controls. The second contained 3-way Multiplicative (MAF: 0.05, 0.19, 0.18) and 5-way XOR (MAF: 0.31, 0.37, 0.41, 0.45, 0.40) with 757 Cases/843 Controls. 2.1.3 Arabidopsis Thaliana GWAS datasets We utilized GWAS data of Arabidopsis Thaliana [ 1 ] for additional experiments with traditional non-DL epistasis approaches [ 27 ] [ 8 ] [ 13 ] [ 22 ] [ 18 ]. The datasets consisted of SNPs (∼24,500) across all of its 5 chromosomes, aiming to identify genes involved in resistance of various bacterial effector phenotypes (avrRPM1, avrPphB). 2.1.4 Parkinson’s Disease Dataset For deeper insight into PD which has recently been heavily investigated for epistasis interactions [ 5 ] [ 21 ], we utilized genomic patient samples from the UKBioBank for SNPs across chromosome 17 [ 4 ]. Specifically, we selected 1790 patients afflicted with a PD diagnosis based on patient samples with ICD-10 codes of G20. We then selected 18159 controls patients, thus following a 10:1 control:case ratio. For quality control of our patient control population, we followed the same procedure as previous meta-analysis studies for epistasis in PD GWAS [ 5 ]. 2.2 Architecture Overview EPIK overall consists of four epistasis interaction type detection modules that each propose a candidate set of SNPs involved in epistasis interaction before being fed to their respective KANs. Each KAN attempts to learn the interaction from the candidate SNPs, prune SNPs it finds unhelpful, and produce a symbolic representation of the interaction. Our approach operates on the usage of machine learning for intrinsic explainability and modularity, aiming to work with algorithmic improvements as the field evolves. 2.3 Additive Detection Module For higher order additive detection, we utilize inspiration from approaches already established in literature that follow heuristic search and information gain for epistasis detection [ 23 ] [ 24 ]. Our approach works by doing a stepwise floating search strategy that combines spline-based nonlinear modeling with Bayesian Information Criterion (BIC) optimization [ 11 ]. We pre-screen SNPs by computing univariate BIC scores using spline-transformed genotypes to select top candidates, then perform iterative forward-backward search: adding SNPs that minimize BIC when their genotype sum is transformed via spline basis functions (capturing nonlinear interactions) and removing previously added SNPs if exclusion improves BIC. This approach models additive joint effects through the sum of genotypes while controlling complexity via BIC’s penalty term. 2.4 XOR Detection Module We present a novel approach for detecting multi-way XOR-type epistatic interactions in high-dimensional genotype-phenotype data. Due to the nature of an XOR interaction, no single SNP/subset of SNPs is guaranteed a large marginal correlation with the phenotype, but only when all causal SNPs are in combination with one another. As such, any algorithm for this task has two major goals: Filtering the initial high-dimensional dataset to a reduced subset of candidate SNPs that retain the causal SNPs. Recognizing combinations of SNPs as an XOR interaction that has high correlation with the phenotype. Before addressing the problem of a high-dimensional dataset, we first consider what would be a relevant and useful “mechanism” for detecting such an XOR interaction if we were somehow given the actual set of causal SNPs. If we take a moment to consider what an XOR interaction actually shows up as, we can see from the definition that it can be boiled down to the idea of, “If an odd number of the inputs are true, the output will be true”. Thus, the problem transforms into: if given some set, is the parity of this set even or odd? As such, we need some sort of approach that can compute the parity of a given set. The mechanism we use to detect this parity pattern is computing the Walsh Coefficient of a given set. For a set of SNP indices S = i 1 , i 2 , …, i m , the Walsh coefficient is: Here: x k,j ∈ +1, −1 is the coded genotype of individual k at SNP The product is +1 if an even number of carriers in S are present, and –1 if an odd number are present. We take the absolute value of β s . Now that we know we are trying to maximize the Walsh coefficient given various sets, the question becomes how can we effectively select those sets, especially given a high-dimensional dataset. If we were to try to compute the Walsh coefficients for all subsets of p SNPs, there would be 2 p subsets, which would be computationally intractable in a high-dimensional setting. Thus we now lead into discussion on how to address the first goal of effectively filtering SNPs and choosing relevant subsets of SNPs to find the true XOR interaction. We consider whether some sort of correlation based approach would be a cheaper yet effective initial screen. Based on the correlation derivation we did, shown in detail in the Appendix, we can see that for a k -way XOR with independent SNPs, with p being MAF: This effectively denotes the following relationship: Thus, we see that as MAF approaches 0.5, this term -> 0, and quickly as k grows (higher order). We plot this relationship in Figure 2 . Download figure Open in new tab Figure 1. The missing heritability problem in GWAS may stem from factors such as epistasis, gene–environment interactions, and rare variants. Within epistasis, higher-order interactions pose major computational and interpretability challenges. We propose EPIK, a framework combining optimized higher-order detection methods with Kolmogorov–Arnold Networks for efficient and interpretable epistasis discovery. Download figure Open in new tab Figure 2. We plot the region of final k-way XOR correlation for given MAF and interaction order k. We also plot specific cases for second order interactions and eighth order interactions at varying MAF values. As such, we consider each SNP with random sets of SNPs at a given order and compute the parity-transformed set k -way correlation with the phenotype. We then rank each SNP as candidates based on how well their sets generally correlated with the phenotype and take a user-defined number of initial candidates, hoping to capture small “spikes” in initial correlation. For high-dimensional datasets, scaling-wise we see this shifts us from the intrac table 2 p to effectively n * p , where n is the number of random sets for each SNP. From these indvidual SNP candidates, we then do a beam search up to the user-defined max order, computing the Walsh coefficient for each potential set. The final output is the set with the highest Walsh coefficient. SNPs with low MAF values in an XOR interaction are thus more likely to be captured, but it is still possible to capture SNPs with higher MAF values but is just more difficult. This can be mitigated by increasing the number of initial candidates parameter. We further detail our approach in Algorithm 1 and 2 in the Appendix. 2.5 Multiplicative Detection Module We detect higher-order multiplicative epistasis through a parallelized beam search guided by log-likelihood ratio (LLR) tests, beginning with mutual information-based pre-screening of top candidate SNPs to reduce combinatorial complexity, similar to other approaches in literature for lower-order interaction detection [ 12 ]. 2.6 Thresholding Detection Module Our approach detects higher-order threshold epistasis through an adaptive beam search guided by point-biserial correlation optimization, beginning with pre-screening SNPs based on their marginal correlation with binarized phenotypes. Our approach evaluates candidate SNP sets by computing minor allele counts across loci and dynamically determining optimal activation threshold ≥ k (within a proportional range) where the rule counts ≥ k maximizes absolute correlation with the phenotype. Using beam search with backtracking, it expands candidate sets order-by-order while retaining only top-performing combinations based on correlation strength. 2.7 KANs Hyperparameters, Pruning, Symbolification and Prediction For each candidate set of SNPs, a KAN is trained to simultaneously learn which SNPs from the candidate set are most relevant, whether the initial order of the candidate set can be reduced, and a symbolic representation (mathematical or graphical) of how these SNPs interact together in consideration of their proposed epistasis interaction type. We take a quick aside to clarify such symbolic representations are an inherent property of KANs and not something achieved through another method such as how LRP or Neural Interaction Detection (NID) might be applied to MLPs. All of the KANs have an initial first layer width equivalent to an initial proposed candidate set for each interaction type. We provide the exact networks depth/widths and other parameters in the Appendix. All of the KANs discussed used auto-symbolification/pruning parameters of 0.7 for the r2 threshold and a simplicity weight of 0.5. We utilize a stratified 80/20 split to train and test each KAN. We tuned each KAN for each epistasis interaction type considered in our framework. We did this by separately generating and extracting only the causal sets of SNPs for each interaction type using PyToxo and GAMETES, leaving the benchmark datasets untouched, and doing a hyperparameter search/different network sizes on these. The problem on how to optimally design KANs for different tasks is still open, but based on [ 16 ][ 15 ], we treat KANs like more interpretable MLPs and start as small as possible before first increasing width and then depth. We cap depth to a maximum of five, as [ 15 ] discusses how deep KANs roughly become an identity map in the later layers. 2.8 Experimental Design Our first experiment evaluates the performance of EPIK on simulated pure and mixed epistasis interaction scenarios. We compare (F1 Score) against other DL approaches [ 10 ] [ 17 ] [ 14 ] on the 25 simulated pure epistasis interactions datasets benchmark described in Section 2.1.1 . We also evaluate performance of EPIK against non deep-learning epistasis detection approaches [ 27 ] [ 8 ] [ 13 ] [ 22 ] on mixed epistasis interaction datasets described in Section 2.1.2 . Our second experiment evaluates the performance of EPIK with well-established epistasis detection approaches [ 27 ] [ 8 ] [ 13 ] [ 22 ] [ 18 ] on two real-world GWAS datasets described in Section 2.1.3 . Finally, our last experiment applies EPIK for PD in chromosome 17 with a magnitude of 10 4 input SNPs considered, using the dataset as described in Section 2.1.4 . We first do traditional LD-pruning, with a r 2 threshold of 0.7, a MAF filter of 0.01, and population covariate control through PCA before then feeding the dataset to EPIK. We evaluated EPIK on a single Nvidia A100 GPU with 4 CPU cores and 16 GB memory. We utilized up to 4 A100 GPUs as needed for the other approaches, dependent on their computational needs. 3 Results 3.1 Simulated Epistasis Detection We evaluate the performance of EPIK against SOTA DL methods on the simulated datasets described in Section 2.1.1 . We present performance against the current SOTA, a distributed transformer approach [ 10 ] with the remaining DL approaches in the Appendix. For the pure epistasis interaction types, we find that EPIK outperforms or is on par with the other SOTA DL method by average F1 score in the majority of k -way scenarios, shown in Table 1a . We also present the performance of EPIK against non deep-learning approaches on simulated mixed epistasis datasets described in Section 2.1.2 , where EPIK outperforms the other approaches in number of causal SNPs found, shown in Table 1 ’s mixed epistasis sub-tables Table 1b and Table 1c . View this table: View inline View popup Download powerpoint Table 1: Simulated Epistasis Interaction Results View this table: View inline View popup Download powerpoint Table 2: A. Thaliana GWAS Experiments 3.2 A. Thaliana GWAS Experiments Results We evaluate EPIK against well-established epistasis detection approaches as described in Section 2.1.1 in Table 2 . Out of these approaches, HOGIMine is the only one that uses biological priors. For resistance of avrRPM1, EPIK and HOGIMine recovered RPM1/RIN4, while MultiSURF, RelifF-10NN, Fast-LMM recovered only RPM1, with RPM1/RIN4 being involved [ 29 ]. For resistance of avrPphB, EPIK, MultiSURF, RelifF-10NN, w-Test, Fast-LMM, HOGIMine recovered RPS5, with RPS5/PBS1 being involved [ 20 ]. 3.3 Parkinson’s Disease Results We show EPIK’s findings for PD on Chromosome 17 in comparison to prior literature ( Table 3 ), a genome-wide non-exhaustive epistasis meta-analysis [ 5 ]. We show EPIK’s intrinsic symbolic representation, suggesting an XOR interaction ( Figure 3 ). View this table: View inline View popup Download powerpoint Table 3: Parkinson’s Disease Epistasis Findings (Chr 17) Download figure Open in new tab Figure 3. EPIK’s intrinsic symbolic representation suggests a potential 3-way XOR interaction between KANSL1, MAPT, and WNT3 for Parkinson’s Disease in Chromosome 17. 4 Discussion Overall, across simulated datasets with varying epistasis interaction types and orders, EPIK matched or outperformed other DL and traditional methods in most scenarios. A key strength of EPIK is its intrinsic symbolic representation, which enables pruning of unlikely SNPs. For example, in the pure 7-way additive scenario (where EPIK underperformed against Distributed Transformer), we recovered the following symbolic expression: Here, x 1 has a near-zero negative coefficient, correctly indicating it as a noisy non-epistasis SNP that was pruned. Not all interactions yield concise mathematical expressions. In such cases, EPIK’s symbolic graphical representation highlights pruned SNPs, noisy candidates, and potential interactions. For instance, in the 3-way Pure Threshold Interaction ( Fig. 4 ), one proposed SNP, highlighted in yellow, is faintly represented, signaling possible noise, which is consistent with the ground truth. The remaining SNPs in blue were correctly pruned and the ones in Red were the causal SNPs. Download figure Open in new tab Figure 4. EPIK’s intrinsic symbolic graphical representation of the 3-way Threshold Interaction scenario. Applying EPIK to chromosome 17 in Parkinson’s disease, we identified a 3-way XOR interaction between KANSL1, MAPT, and WNT3. The symbolic graph ( Fig. 3 ) emphasized MAPT and WNT3 as the primary interaction, with KANSL1 faint, suggesting noise or a regulatory role. Literature confirms MAPT and WNT3 as a significant non-additive epistasis pair [ 5 ], while additional EQTL/ E 2 QT L analysis implicates KANSL1 and nearby genes. Despite reduced sample size compared to prior studies, EPIK successfully recovered the main MAPT/WNT3 interaction. We note that the XOR module lacks global statistical guarantees due to its non-exhaustive search, tested subsets yield exact correlation estimates, with detection power depending on effect size, beam width, and candidate selection. For the full framework, statistical guarantees can be incorporated via candidate–test splits, likelihood ratio tests, and BH-FDR correction. We provide a detailed discussion of these procedures and considerations for higher-order interactions in the Appendix. EPIK’s XOR detection module design is biased in that it has a higher chance of finding SNPs in an interaction if they have lower MAF values, necessitating future work to mitigate this. 5 Conclusion EPIK presents a symbolically representative approach to epistasis discovery. KANs are still a very nascent class of neural networks and thus optimal architecture design for this task is still greatly open for future work. Further work could also investigate integration with functional annotations and other biological prior knowledge like known pathways, chromatin accessibility, or regulatory annotations, into the interpretation or prioritization steps which could potentially greatly improve the ability to find biologically relevant and robust higher order epistasis interactions. Overall, EPIK introduces a novel higher-order XOR epistasis detection algorithm and KAN-based approach for data-driven intrinsic symbolic discovery of higher-order epistasis, outperforming or matching other approaches on simulated and real GWAS datasets, and suggesting a more complex epistasis interaction in 17q21.31 for PD. Acknowledgments This research was supported by Shriners Children’s Hospital and the Georgia Institute of Technology through the Early Onset Scoliosis (EOS) Project. This research was supported in part through research cyberinfrastructure resources and services, including the AI Makerspace of the College of Engineering, provided by the Partnership for an Advanced Computing Environment (PACE) at the Georgia Institute of Technology, Atlanta, Georgia, USA. This research has been conducted using the UK Biobank Resource under Application 17984. A Appendix A.1 Marginal Correlation for XOR Epistasis Consider a k -way XOR model for the phenotype Y : where X i ∼ Bernoulli ( p ) are independent SNPs. The correlation ρ j between SNP X j and Y is: Covariance Decomposition: Expectation of Y : Y = 1 when an odd number of X i = 1, being a sum that follows binomial distribution: Expectation of X j Y : Conditional on X j = 1: Compute Covariance: Variance Components: Final Correlation: A.2 XOR Detection Module - Algorithms Shown in Algorithm 1 and 2 . A.3 Simulated Pure Epistasis Interaction Results We present the full comparison of EPIK against the DL approaches discussed in the main paper in Table 4 . F1 score for EPIK was calculated by taking the union of final proposed candidate SNP sets from each KAN module after their pruning process. View this table: View inline View popup Download powerpoint Table 4: Simulated Epistasis Interaction Results (F1 Score) A.4 Sensitivity Analysis of XOR Detection Module for Pure XOR under Moderate MAF We performed our method on 3-way XOR datasets with moderate MAF (0.3) (due to computational generation limitations with GAMETES we did not go higher) and had the initial candidate subset parameter vary from [33, 32, 31, 30], representing a reduction to only 0.033% to 0.03% of the original high-dimensional SNP dataset, shown in Table 5 . We see that the first causal SNP lost was the 32nd highest marginal correlation. Thus, we overall see the core mechanisms of the XOR modules performance lies in MAF values and initial candidate subset parameter. View this table: View inline View popup Download powerpoint Table 5: Moderate MAF impact on 3-Way XOR Detection A.5 KAN Models Details Additive KAN directly goes to a size [ 1 ] width output, number of grid intervals is set to 5, a piecewise polynomial order of 3, and the use of identity as the base function. XOR Kan has a width list of [ 20 , 1 ] after the first layer, number of grid intervals is set to 5, a piecewise polynomial order of 3, and the use of Sigmoid Linear Unit (SiLU) as the base function. Multiplicative KAN has a width list of [[0, 1 ], 1], number of grid intervals set to 20, a piecewise polynomial order of 3, the use of identity as the base function, and a multiplication arity of either 2 or the closest scalar of half of the candidate set. Multiplicative KAN utilizes the recent multiplication node architecture developed for KANs [ 15 ]. Finally, the Thresholding KAN directly goes to a size [ 1 ] width output, number of grid intervals is set to 5, a piecewise polynomial order of 3, and the use of identity as the base function. A.6 Multiplicative KAN We highlight EPIK’s usage of the multiplication node for KAN architectures, developed in [ 15 ], in its intrinsic symbolic graphical representations for Multiplicative epistasis scenarios in Figure 5 . Download figure Open in new tab Figure 5. EPIK’s learned intrinsic symbolic graphical representation of the 4-way Multiplicative Interaction scenario. Algorithm 1 Fixed Higher Order XOR Detection Download figure Open in new tab A.7 Statistical Guarantees discussion of XOR Detection Module and EPIK For statistical guarantees for the XOR detection module, there aren’t explicit global statistical guarantees as we do not do an exhaustive enumeration. However, we can state that for any of the subsets that are tested, the correlation is an exact estimate for that set. As such, the probability of finding the correct XOR set generally hinges on the actual effect size of the interaction, beam width, the number of initial candidates and their random sets considered, and any traditional quality control procedures, like MAF filtering and LD pruning, keeping the causal SNPs for consideration while reducing feature space. Maximizing beam width and initial candidates/their random sets are what increase the XOR detection module itself for better detecting the interaction as it increases the chance the true set can be found, once found the Walsh coefficient will be much higher than other non-causal sets by nature of XOR’s parity pattern. For EPIK as a general framework itself, statistical guarantees can be integrated into the framework through splitting the data into a set for selecting candidate SNPs involved in epistasis and using the remaining data to test each candidate SNP set (straifying as needed) with pre-specified generalized linear models dependent on interaction model type and nested likelihood ratio test (LRT) for each with PCs/covariates included as necessary and controlling for FDR. Each test’s p-value for that interaction model’s proposed set can be pooled together and standard Benjamini-Hochberg (BH) at target FDR alpha can be applied. By doing this pooling, it should handle the multiple hypothesis testing involved in epistasis detection. Algorithm 2 Generalized Higher Order XOR Detection Download figure Open in new tab For the higher-arity interactions, we note: Candidates increase, resulting in number of tests increasing, causing BH thresholds to decrease and finally resulting in decreased power. In general, but especially with higher arity, there should be at least 20-30 samples in this inference set as an initial rule for the framework’s approach. Funder Information Declared Shriners Hospitals for Children Partnership for Advanced Computing Environment at the Georgia Institute of Technology References [1]. ↵ Susanna Atwell , Yu S Huang , Bjarni J Vilhjálmsson , Glenda Willems , Matthew Horton , Yan Li , Dazhe Meng , Alexander Platt , Aaron M Tarone , Tina T Hu , et al. 2010 . Genome-wide association study of 107 phenotypes in Arabidopsis thaliana inbred lines . Nature 465 , 7298 (2010), 627 – 631 . OpenUrl CrossRef PubMed Web of Science [2]. ↵ Sandra Batista , Vered Senderovich Madar , Philip J Freda , Priyanka Bhandary , Attri Ghosh , Nicholas Matsumoto , Apurva S Chitre , Abraham A Palmer , and Jason H Moore . 2024 . Interaction models matter: an efficient, flexible computational framework for model-specific investigation of epistasis . BioData Mining 17 , 1 (2024), 7 . OpenUrl PubMed [3]. ↵ Franz Baumdicker , Gertjan Bisschop , Daniel Goldstein , Graham Gower , Aaron P Ragsdale , Georgia Tsambos , Sha Zhu , Bjarki Eldon , E Castedo Ellerman , Jared G Galloway , et al. 2022 . Efficient ancestry and mutation simulation with msprime 1.0 . Genetics 220 , 3 (2022), iyab229 . OpenUrl CrossRef PubMed [4]. ↵ Clare Bycroft , Colin Freeman , Desislava Petkova , Gavin Band , Lloyd T Elliott , Kevin Sharp , Allan Motyer , Damjan Vukcevic , Olivier Delaneau , Jared O’Connell , et al. 2018 . The UK Biobank resource with deep phenotyping and genomic data . Nature 562 , 7726 (2018), 203 – 209 . OpenUrl CrossRef PubMed [5]. ↵ Alejandro Cisterna-Garcia , Bernabe I Bustos , Sara Bandres-Ciga , Thiago P Leal , Elif I Sarihan , Christie Jok , Dimitri Krainc , Ignacio F Mata , Steven J Lubbe , and Juan A Botia . 2025 . Genome-wide epistasis analysis reveals significant epistatic signals associated with Parkinson’s disease risk . Brain 148 , 6 (2025), 2060 – 2074 . OpenUrl PubMed [6]. ↵ Adolfo Crespo Márquez . 2022 . The curse of dimensionality . In Digital Maintenance Management: Guiding Digital Transformation in Maintenance . Springer , 67 – 86 . [7]. ↵ Jacob Oliver Day and Stephen Mullin . 2021 . The genetics of Parkinson’s disease and implications for clinical practice . Genes 12 , 7 (2021), 1006 . OpenUrl CrossRef [8]. ↵ Philip J Freda , Suyu Ye , Robert Zhang , Jason H Moore , and Ryan J Urbanowicz . 2024 . Assessing the limitations of relief-based algorithms in detecting higherorder interactions . BioData Mining 17 , 1 (2024), 37 . OpenUrl PubMed [9]. ↵ Borja González-Seoane , Christian Ponte-Fernández , Jorge González-Domínguez , and María J Martín . 2022 . PyToxo: a Python tool for calculating penetrance tables of high-order epistasis models . BMC bioinformatics 23 , 1 (2022), 117 . OpenUrl PubMed [10]. ↵ Miguel Graça , Ricardo Nobre , Leonel Sousa , and Aleksandar Ilic . 2024 . Distributed transformer for high order epistasis detection in large-scale datasets . Scientific Reports 14 , 1 (2024), 14579 . OpenUrl PubMed [11]. ↵ Ning Hao and Hao Helen Zhang . 2014 . Interaction screening for ultrahigh-dimensional data . J. Amer. Statist. Assoc . 109 , 507 (2014), 1285 – 1301 . OpenUrl [12]. ↵ Felix Heinrich , Faisal Ramzan , Abirami Rajavel , Armin Otto Schmitt , and Mehmet Gültas . 2021 . MIDESP: mutual information-based detection of epistatic SNP pairs for qualitative and quantitative phenotypes . Biology 10 , 9 (2021), 921 . OpenUrl PubMed [13]. ↵ Christoph Lippert , Jennifer Listgarten , Ying Liu , Carl M Kadie , Robert I Davidson , and David Heckerman . 2011 . FaST linear mixed models for genome-wide association studies . Nature methods 8 , 10 (2011), 833 – 835 . OpenUrl PubMed [14]. ↵ Yang Liu , Duolin Wang , Fei He , Juexin Wang , Trupti Joshi , and Dong Xu . 2019 . Phenotype prediction and genome-wide association study using deep convolutional neural network of soybean . Frontiers in genetics 10 (2019), 1091 . [15]. ↵ Ziming Liu , Pingchuan Ma , Yixuan Wang , Wojciech Matusik , and Max Tegmark . 2024 . Kan 2.0: Kolmogorov-arnold networks meet science . arXiv preprint arxiv: 2408.10205 (2024). [16]. ↵ Ziming Liu , Yixuan Wang , Sachin Vaidya , Fabian Ruehle , James Halverson , Marin Soljačić , Thomas Y Hou , and Max Tegmark . 2024 . Kan: Kolmogorov-arnold networks . arXiv preprint arxiv: 2404.19756 (2024). [17]. ↵ Bettina Mieth , Alexandre Rozier , Juan Antonio Rodriguez , Marina MC Höhne Nico Görnitz , and Klaus-Robert Müller . 2021 . DeepCOMBI: explainable artificial intelligence for the analysis and discovery in genome-wide association studies . NAR genomics and bioinformatics 3 , 3 (2021), qab065 . OpenUrl [18]. ↵ Paolo Pellizzoni , Giulia Muzio , and Karsten Borgwardt . 2023 . Higher-order genetic interaction discovery with network-based biological priors . Bioinformatics 39 , Supplement_1 (2023), i523 – i533 . OpenUrl PubMed [19]. ↵ Wolfgang Sadee , Katherine Hartmann , Michał Seweryn Maciej Pietrzak , Samuel K Handelman , and Grzegorz A Rempala . 2014 . Missing heritability of common diseases and treatments outside the protein-coding exome . Human genetics 133 (2014), 1199 – 1215 . [20]. ↵ Feng Shao , Catherine Golstein , Jules Ade , Mark Stoutemyer , Jack E Dixon , and Roger W Innes . 2003 . Cleavage of Arabidopsis PBS1 by a bacterial type III effector . Science 301 , 5637 (2003), 1230 – 1233 . OpenUrl Abstract / FREE Full Text [21]. ↵ Pankhuri Singhal , Shefali Setia Verma , and Marylyn D Ritchie . 2023 . Gene interactions in human disease studies—evidence is mounting . Annual Review of Biomedical Data Science 6 , 1 (2023), 377 – 395 . OpenUrl PubMed [22]. ↵ Rui Sun , Xiaoxuan Xia , Ka Chun Chong , Benny Chung-Ying Zee , William Ka Kei Wu , and Maggie Haitian Wang . 2019 . wtest: an integrated R package for genetic epistasis testing . BMC Medical Genomics 12 , Suppl 9 (2019), 180 . OpenUrl PubMed [23]. ↵ Yan Sun , Yijun Gu , Qianqian Ren , Yiting Li , Junliang Shang , Jin-Xing Liu , and Boxin Guan . 2022 . MDSN: A module detection method for identifying high-order epistatic interactions . Genes 13 , 12 (2022), 2403 . OpenUrl [24]. ↵ Shouheng Tuo , Chao Li , Fan Liu , Aimin Li , Lang He , Zong Woo Geem , JunLiang Shang , Haiyan Liu , YanLing Zhu , ZengYu Feng , et al. 2023 . MTHSA-DHEI: multitasking harmony search algorithm for detecting high-order SNP epistatic interactions . Complex & Intelligent Systems 9 , 1 (2023), 637 – 658 . OpenUrl [25]. ↵ Ryan J Urbanowicz , Jeff Kiralis , Nicholas A Sinnott-Armstrong , Tamra Heberling , Jonathan M Fisher , and Jason H Moore . 2012 . GAMETES: a fast, direct algorithm for generating pure, strict, epistatic models with random architectures . BioData mining 5 (2012), 1 – 14 . [26]. ↵ Arno van Hilten , Federico Melograna , Bowen Fan , Wiro Niessen , Kristel Van Steen , and Gennady Roshchupkin . 2024 . Detecting genetic interactions with visible neural networks . bioRxiv (2024), 2024 – 02 . [27]. ↵ Xiang Wan , Can Yang , Qiang Yang , Hong Xue , Xiaodan Fan , Nelson LS Tang , and Weichuan Yu . 2010 . BOOST: A fast approach to detecting gene-gene interactions in genome-wide case-control studies . The American Journal of Human Genetics 87 , 3 (2010), 325 – 340 . OpenUrl CrossRef PubMed Web of Science [28]. ↵ Kenneth Ward , James Ogilvie , VeeAnn Argyle , Lesa Nelson , Mary Meade , John Braun , and Rakesh Chettier . 2010 . Polygenic inheritance of adolescent idiopathic scoliosis: a study of extended families in Utah . American journal of medical genetics Part A 152 , 5 (2010), 1178 – 1188 . OpenUrl CrossRef [29]. ↵ Ning Xu , Xuming Luo , Wen Li , Zongyi Wang , and Jun Liu . 2017 . The bacterial effector AvrB-induced RIN4 hyperphosphorylation is mediated by a receptor-like cytoplasmic kinase complex in Arabidopsis . Molecular Plant-Microbe Interactions 30 , 6 (2017), 502 – 512 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted November 04, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Data-Driven Symbolic Higher-Order Epistasis Discovery with Kolmogorov-Arnold Networks Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Data-Driven Symbolic Higher-Order Epistasis Discovery with Kolmogorov-Arnold Networks Oankar R. Patil , Kamran Shazand , Benoit Marteau , Yuxuan Shen , May D. Wang bioRxiv 2025.10.31.685894; doi: https://doi.org/10.1101/2025.10.31.685894 Share This Article: Copy Citation Tools Data-Driven Symbolic Higher-Order Epistasis Discovery with Kolmogorov-Arnold Networks Oankar R. Patil , Kamran Shazand , Benoit Marteau , Yuxuan Shen , May D. Wang bioRxiv 2025.10.31.685894; doi: https://doi.org/10.1101/2025.10.31.685894 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genomics Subject Areas All Articles Animal Behavior and Cognition (7642) Biochemistry (17715) Bioengineering (13907) Bioinformatics (42005) Biophysics (21472) Cancer Biology (18624) Cell Biology (25534) Clinical Trials (138) Developmental Biology (13391) Ecology (19935) Epidemiology (2067) Evolutionary Biology (24356) Genetics (15617) Genomics (22529) Immunology (17753) Microbiology (40437) Molecular Biology (17200) Neuroscience (88697) Paleontology (667) Pathology (2840) Pharmacology and Toxicology (4829) Physiology (7653) Plant Biology (15171) Scientific Communication and Education (2046) Synthetic Biology (4304) Systems Biology (9827) Zoology (2272)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.