Novel Epistatic Interaction Between RBMS3 and CDKN2B-AS1 in Coronary Artery Disease Risk Identified by Machine Learning Tool VariantSpark

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 66,351 characters · extracted from preprint-html · click to expand
Novel Epistatic Interaction Between RBMS3 and CDKN2B-AS1 in Coronary Artery Disease Risk Identified by Machine Learning Tool VariantSpark | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Novel Epistatic Interaction Between RBMS3 and CDKN2B-AS1 in Coronary Artery Disease Risk Identified by Machine Learning Tool VariantSpark View ORCID Profile Letitia M.F. Sng , Mitchell J. O’Brien , Brendan Hosking , Piotr Szul , Roc Reguant , Mythreye Venkatesan , View ORCID Profile Philip J. Freda , Zhiping Wang , Jason H. Moore , Anne H. Klein , Michael Kuiper , Angus Panagopoulos , Johan W. Verjans , Yatish Jain , View ORCID Profile Denis C. Bauer , Natalie A. 
Twine doi: https://doi.org/10.1101/2025.10.19.25338331 Letitia M.F. Sng 1 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Westmead, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Letitia M.F. Sng For correspondence: letitia.sng{at}csiro.au Mitchell J. O’Brien 1 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Westmead, Australia 2 Sydney Informatics Hub, The University of Sydney , NSW 2006, Australia 3 Australian BioCommons , Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Brendan Hosking 1 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Westmead, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Piotr Szul 4 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Herston, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Roc Reguant 1 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Westmead, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mythreye Venkatesan 5 Cedars-Sinai Medical Center, Department of Computational Biomedicine , Los Angeles, California, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Philip J. Freda 5 Cedars-Sinai Medical Center, Department of Computational Biomedicine , Los Angeles, California, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Philip J. 
Freda Zhiping Wang 5 Cedars-Sinai Medical Center, Department of Computational Biomedicine , Los Angeles, California, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jason H. Moore 5 Cedars-Sinai Medical Center, Department of Computational Biomedicine , Los Angeles, California, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Anne H. Klein 4 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Herston, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Michael Kuiper 6 Data61, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Black Mountain, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Angus Panagopoulos 1 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Westmead, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Johan W. Verjans 7 Australian Institute for Machine Learning, University of Adelaide , Adelaide, Australia 8 Lifelong Health, South Australian Health and Medical Research Institute , Adelaide, Australia 9 Royal Adelaide Hospital , Central Adelaide Health Network, Adelaide, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yatish Jain 1 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Westmead, Australia 10 Applied BioSciences, Faculty of Science and Engineering, Macquarie University , Macquarie Park, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Denis C. 
Bauer 7 Australian Institute for Machine Learning, University of Adelaide , Adelaide, Australia 10 Applied BioSciences, Faculty of Science and Engineering, Macquarie University , Macquarie Park, Australia 11 Australian e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Adelaide Australia 12 Department of Biomedical Informatics and Digital Health, School of Medical Science, University of Sydney , Sydney, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Denis C. Bauer For correspondence: letitia.sng{at}csiro.au Natalie A. Twine 1 Australia e-Health Research Centre, Commonwealth Scientific and Industrial Research Organisation (CSIRO) , Westmead, Australia 10 Applied BioSciences, Faculty of Science and Engineering, Macquarie University , Macquarie Park, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Background Genome-wide association studies (GWAS) of coronary artery disease (CAD), the leading cause of mortality and morbidity globally, have identified approximately 163 risk loci, yet only 40% of CAD genetic heritability can be explained. Non-additive genetic effects, like epistasis, likely contribute to CAD aetiology, but remain elusive due to limited data and insensitive algorithms. Results Using machine-learning (VariantSpark) followed by exhaustive epistasis search (BitEpi), we discovered an epistatic interaction for CAD between RBMS3 and CDKN2B-AS1 in the UK Biobank. We also observe this interaction in the independent All of Us cohort and provide a binding model using AlphaFold3. VariantSpark provides the needed sensitivity, identifying associated loci (e.g. PMAIP1-MC4R and AAK1) with 72% fewer samples than previous studies, to reduce the search space for systematic epistasis detection. 
Conclusions We provide in silico evidence of RBMS3 as a novel CAD risk gene, acting in epistasis with the established CDKN2B-AS1 9p21.3 risk locus. Background Coronary artery disease (CAD) is the leading cause of mortality and morbidity globally with a strong genetic component in its aetiology ( 1 , 2 ). Since the first genome-wide association studies (GWAS) in 2007 ( 3 , 4 ), approximately 897 independent loci have been associated to CAD, but these cumulatively explain only 35-40% of CAD heritability on the liability scale ( 5 , 6 ). Furthermore, one study estimated a 34.3% heritability even with the inclusion of ultra-rare variants ( 7 ), suggesting other sources of the missing heritability observed. Epistasis, the combinatorial effect of two or more genes which may act alongside or in absence of marginal effects, has been suggested to be part of the solution but has not yet been systematically studied. This is despite empirical evidence demonstrating its role in the genetic architecture of CAD ( 8 – 13 ). This absence of epistasis studies is because the combinatorial nature of epistasis analysis results in challenges for traditional parametric statistical methods such as logistic regression due to: (i) the curse of dimensionality where much larger sample sizes are required when searching for epistatic interactions than for additive effects ( 14 ); (ii) the need to specify interactions which is difficult due to the levels of k (i.e., the interaction order, $k = 2, 3, \ldots$) that need to be considered ( 15 ), and (iii) the computational demand and high multiple testing burden. In this study, we address these challenges with an innovative two-stepped approach combining two established algorithms in a complementary way that provides higher sensitivity than previous GWAS approaches and evaluates non-linear epistatic interactions ( 16 ). First, VariantSpark ( 17 ), a cloud-based, scalable, machine learning (ML) GWAS tool is applied to search for putative interactions genome-wide.
VariantSpark is based on a random forest framework using the Gini importance score to identify associations with both individual and interacting effects. This enables a purely data-driven and unbiased interaction discovery without requiring pre-specified models. VariantSpark effectively narrows the search space from the $\binom{n}{k}$ possible $k$-way interactions among $n$ variants to the set of likely ones that can be evaluated systematically in the second step: BitEpi ( 18 ), a fast, exhaustive epistasis detection algorithm, is then used to mathematically test and annotate the exact epistatic interaction partners in the dataset. By first filtering epistasis candidates with VariantSpark, we substantially reduce the multiple testing burden associated with such a search, making it feasible to analyse datasets large enough to observe the subtle effects of epistasis. We apply this approach to a CAD cohort within the UK Biobank (UKB) and TOPMed (TM) datasets to identify putative epistasis signals associated with CAD. Results VariantSpark Identifies Novel CAD Associations in a UK Biobank Cohort and Demonstrates Higher Sensitivity VariantSpark was run on the imputed array genotype data from a CAD cohort within the UK Biobank ( 19 ) (UKB, n = 51,107) and validated in the Trans-Omics for Precision Medicine (TOPMed) ( 20 ) dataset (TM, n = 11,326). 115 single nucleotide polymorphisms (SNPs) significantly associated to CAD were identified in the UKB dataset (Supplementary Table 1, see file ‘Supplementary_Tables’), of which 99 (86.09%) were previously known and annotated in the Cardiovascular Disease Knowledge Portal (CVD KP) ( 21 , 22 ). This included well established CAD risk loci, namely 6q25.3 (SLC22A3), 6q26 (LPA), and 9p21.3 (CDKN2B-AS1) ( Figure 1 ). Download figure Open in new tab Figure 1.
Manhattan plot of genome-wide association analysis using VariantSpark on UK Biobank CAD Cohort of 51,107 samples The plot displays $-\log_{10}(P)$ values calculated using VariantSpark’s RFlocalfdr implementation ( 23 ), with the horizontal line indicating the genome-wide significance threshold (localFDR < 5%). Notable loci are labelled, with established CAD-associated genes in black and novel associations highlighted in red (including RBMS3 ). The VariantSpark Manhattan plot appears sparser than traditional logistic regression plots because the RFlocalFDR approach assigns p-values to SNPs meeting specific criteria only. Key significant associations are visible at chromosomes 1 ( CELSR2 ), 6 ( LPA ), 9 ( CDKN2B-AS1 ), and 19 ( APOE ), with novel findings at chromosomes 3 ( RBMS3 ), 16 ( WWP2 , ZNF469 ), and others. SNPs have not been pruned for linkage disequilibrium. The 115 associations were clumped to 24 independent loci, and 8 loci were not previously associated to CAD in the CVD KP and/or GWAS Catalog ( Table 1 ). These 8 loci may not have been picked up by traditional GWAS approaches as their individual effects are not large. In contrast, VariantSpark also includes interaction effects and hence increases sensitivity, especially when boosted through its purpose-built local false discovery rate approach ( 23 ), increasing the chance of finding biologically meaningful effects. Indeed, two lead SNPs on chromosome 16 have evidence of association to CAD in previous studies: rs 10852491 ( WWP2 ) is in moderate LD ($R^2 = 0.397$) with CAD-associated SNPs ( 6 , 24 ) and rs 72803480 ( ZNF469 ) has been marginally associated with coronary atherosclerosis on the PheWeb ($P = 1.8 \times 10^{-6}$) ( 25 ). At the gene level, CDH13 ( 5 , 26 ) and AAK1 ( 5 , 6 ) have been significantly associated to CAD in previous GWAS although the identified SNPs in these studies are not in high LD ($R^2 > 0.2$) with those identified in our study.
View this table: View inline View popup Download powerpoint Table 1. Significant Lead SNPs from VariantSpark GWAS with UK Biobank Cohort Applying VariantSpark to the independent but smaller TM dataset replicated 14 SNPs at genome-wide significance ($P < 5 \times 10^{-8}$) (Supplementary Table 2, see file ‘Supplementary_Tables’), mapping to two established CAD risk loci: LPA and CDKN2B-AS1 . In comparison, a previous study replicated 14 SNPs between UKB and TM but below genome-wide significance levels ( P < 0.001) ( 27 ). At the gene-level, CDH13 replicated across both UKB and TM cohorts, however, the identified SNPs from each cohort were not in LD. This is likely due to the heterogeneity in the TM cohort, which was formed by harmonising multiple independent studies (e.g., Framingham and the Women’s Health Initiative). As a result, a clearly defined CAD phenotype was lacking, unlike the UKB dataset where the availability and use of ICD codes enabled a more precise CAD definition. To compare VariantSpark’s performance against a more traditional GWAS approach, we applied PLINK’s logistic regression ( 28 ) on the same UKB and TM cohorts. For UKB, PLINK identified 117 significant SNPs, clumping to 10 risk loci. VariantSpark found 9 of the 10 PLINK-identified loci, missing only TRIB1 on chromosome 8 (Supplementary Table 3, see file ‘Supplementary_Tables’). VariantSpark assigned a p -value of 0.30 to TRIB1 , likely because it is an independent association and does not reach significance amongst the other features. In the TM cohort, PLINK only identified 5 significant SNPs, all mapping to one locus, CDKN2B-AS1 (Supplementary Table 4, see file ‘Supplementary_Tables’). In contrast, VariantSpark also identified the well-known LPA locus, which in the PLINK analysis failed genome-wide significance for TM ($5 \times 10^{-8} < P < 1 \times 10^{-5}$), confirming VariantSpark’s higher sensitivity, also observed in Alzheimer’s disease ( 16 ).
The novel CAD-risk gene, RBMS3 , is likely involved in CAD pathogenesis through interaction with CDKN2B-AS1 Next, we conducted an epistasis search using BitEpi on the larger UKB cohort with the well-defined CAD phenotype. Specifically, BitEpi exhaustively searched for pairwise interactions (i.e., 276 tests) between the 24 lead SNPs prioritised by VariantSpark. BitEpi returned one significant interaction ($P_{\mathrm{FDR}} = 0.0105$) after multiple testing correction: rs 1970112 and rs 74467064 (Supplementary Table 5, see file ‘Supplementary_Tables’). Consistent with an epistatic interaction, rs 1970112 is in the intronic region of the long non-coding RNA ( CDKN2B-AS1 ), while rs 74467064 lies in the intronic region of RBMS3, a member of the RNA-binding motif (RBM) family, which plays roles in multiple biological activities including RNA stability and pre-mRNA splicing ( 29 ). While intronic, rs 1970112 has been associated to CDKN2B-AS1 expression levels in aortic tissue in the Stockholm-Tartu Atherosclerosis Reverse Networks Engineering Task (STARNET) ( 30 ). The CDKN2B-AS1 risk locus has been associated with CAD across multiple ancestries ( 6 ), independent of traditional CAD risk factors such as cholesterol and hypertension. On the other hand, RBMS3 has not been directly linked to CAD but it has been marginally associated to carotid intima-media thickness in sub-Saharan Africans ( 31 ) and is significantly differentially expressed between patients with CAD and matched controls in STARNET across multiple tissues, including the aorta and liver. Replication of the rs 1970112 ( CDKN2B-AS1 ) and rs 74467064 ( RBMS3 ) interaction in the independent All of Us dataset (AoU; n = 175,731) was tested using BitEpi only. However, this pair returned a non-significant result ( P = 0.14), likely due to variance in allele frequencies and resulting difference in genotype combinations between datasets.
We hence expanded the analysis to SNPs in moderate to high LD ($0.2 < R^2 < 0.5$) to rs 74467064 and rs 1970112. Amongst the interactions tested, rs 17665445- rs 1970112 returned a significant BitEpi result ( P = 0.048) but failed multiple testing correction (BitEpi $P_{\mathrm{FDR}} = 0.24$) (Supplementary Table 6, see file ‘Supplementary_Tables’). AlphaFold3 predicts physical interaction between Rbms3 protein and CDKN2B-AS1 RNA To investigate the epistasis hypothesis further, we modelled the potential physical binding between CDKN2B-AS1 and RBMS3 . From the literature, RBMS3 ’s known roles in cancer progression are primarily through interactions with lncRNAs ( 32 , 33 ) and RBMS3 has been implicated in epistatic interactions linked to other complex traits ( 34 , 35 ). Furthermore, CDKN2B-AS1 has been shown to physically interact with RBMS1 ( 36 ), a paralog of RBMS3 . Using AlphaFold3 ( 37 ), the physical interaction between CDKN2B-AS1 lncRNA and Rbms3 protein was modelled. Using a 3.5Å cutoff to define a contact, a threshold commonly used to capture atomic interactions such as hydrogen bonds and van der Waals contacts ( 38 ), there were 1,054 predicted contact points between the CDKN2B-AS1 lncRNA and the Rbms3 protein (Supplementary Table 7, see file ‘Supplementary_Tables’). A particularly extensive interaction surface was identified and mapped to exon 19 of the CDKN2B-AS1 transcript (NR003529.4). Interestingly, RBPmap ( 39 ) analysis significantly matched an experimentally defined RBMS3 motif to this CDKN2B-AS1 region (chr9:22120679-22120692; hg19) (Z = 2.28; $P < 1.12 \times 10^{-2}$; Supplementary Table 8, see file ‘Supplementary_Tables’). In the AlphaFold3 model, this region corresponds to a loop-like structure in CDKN2B-AS1 suggesting a strong binding affinity to the Rbms3 protein ( Figure 2 ). Download figure Open in new tab Figure 2.
Screenshot of AlphaFold3 Prediction of CDKN2B-AS1 (NR003529.4) and Rbms3 (Q6XE24) Annotated with RBPmap Motif Analysis Result AlphaFold3 was used to model CDKN2B-AS1 (coloured orange) and Rbms3 (coloured green) together. The loop-like structure in CDKN2B-AS1 mapped to a region in exon 19 of NR003529.4/CDKN2B-AS1. Motif analysis using RBPmap matched two experimentally defined RBMS3 motifs within this region in CDKN2B-AS1 spanning chr9:22120679-22120692 (hg19/GRCh37). As the SNPs identified in this epistatic interaction by VariantSpark-BitEpi (i.e., rs 1970112 in CDKN2B-AS1 and rs 74467064 in RBMS3 ) are in intronic regions, they are not represented in the AlphaFold3 prediction as their roles are likely regulatory. In fact, their FORGEdb ( 40 ) scores are 7 and 6 (where 10 indicates highest likelihood of regulatory function), respectively, with established regulatory roles of CDKN2B-AS1 and RBMS3 in other polygenic traits, like cancer ( 41 , 42 ). To evaluate the robustness of the AlphaFold3 prediction, we conducted additional modelling of Rbms1- CDKN2B-AS1 as a positive control, previously shown to physically interact, and Actin-CDKN2B-AS1 as a negative control, experimentally demonstrated not to interact ( 36 ). AlphaFold3 predicted 93 Rbms1 protein residues in close spatial proximity to CDKN2B-AS1 , compared with only 16 residues for Actin. This contrast can be visually discerned in Supplementary Figure 1 where the Rbms1 protein is entangled with CDKN2B-AS1 while Actin is clearly separated from the RNA structure. Furthermore, the positive control (Rbms1) showed highly conserved RNA-binding residues, which were not observed with the negative control (Supplementary Materials S1, see file ‘Supplementary_Materials’). Collectively, these findings suggest that AlphaFold3 does not generate spurious interactions when none exist, therefore supporting the biological plausibility of the predicted interaction between Rbms3 and CDKN2B-AS1 .
Discussion In this study, we used a two-step approach to identify epistasis associated to CAD. We first applied VariantSpark to identify associations linked to CAD based on individual and epistatic effects before using BitEpi to exhaustively search for interactions within this narrowed search space. With this approach, we identified a putative epistatic interaction associated with CAD between two SNPs, rs 1970112 ( CDKN2B-AS1 ) and rs 74467064 ( RBMS3 ). While there likely are other interactions, we set a very stringent significance cut-off to only focus on a straightforward case for this paper. CDKN2B-AS1 ( ANRIL ) is a well-studied lncRNA, having been linked to various diseases including multiple cancer types ( 41 , 43 ), type 2 diabetes ( 44 ), and was one of the first CAD-risk loci that has also been consistently identified since ( 26 , 45 ). However, despite plausible involvement, its molecular mechanisms in CAD pathogenesis remain unclear due to the highly complex nature of that region. This includes multiple splicing events that generate various isoforms with opposing effects on atherosclerosis ( 46 , 47 ) and interactions with genes both within the 9p21.3 region ( 48 ) as well as in trans ( 49 ). While RBMS3 has been primarily studied in cancer, where it exhibits both tumour-suppressive and oncogenic properties ( 42 , 50 ), it is also known to be involved in the Wnt/β-catenin signalling pathways ( 51 ) and angiogenesis ( 52 ), both of which have been implicated in CAD pathophysiology. RBMS3 is also differentially expressed in CAD aortic tissue on STARNET. Both CDKN2B-AS1 and RBMS3 have been linked to breast cancer, which itself has a genetic causal relationship with CAD ( 53 ). Furthermore, AlphaFold3 predicted an extensive interaction surface between RBMS3 and CDKN2B-AS1 . This interaction surface on CDKN2B-AS1 mapped to the region that was significantly matched to an experimentally defined RBMS3 motif by RBPmap. 
Moreover, RBMS3 ’s paralog, RBMS1 , has been experimentally shown to bind to CDKN2B-AS1 ( 36 ). At the cellular level, both the Rbms3 protein and CDKN2B-AS1 transcripts have been detected in the cytoplasm ( 54 , 55 ), further suggesting the potential of a physical interaction. Collectively, these findings provide compelling evidence that RBMS3 is a novel contributor to the genetic architecture of CAD but hasn’t been associated in previous GWAS studies as its effect is likely to be in epistasis with CDKN2B-AS1 . While replication of genetic associations across two or more independent datasets is expected in GWAS, for epistasis analysis, it is highly unlikely that the same combination of SNPs would be associated in the same model across two datasets due to variation in allele frequency and linkage disequilibrium (LD) patterns ( 56 ), and gene-based replication should be considered. This variation in MAF is something we observed in our study where rs 74467064 had a MAF = 5.6% in the UKB cohort (i.e., considered a common variant) but a MAF = 4.2% in the AoU cohort (i.e., considered a low-frequency variant and may be filtered out). We, hence, expanded our analysis to evaluate combinations of SNPs in LD to rs 74467064 and identified the pairwise interaction between rs 17665445 (in LD with rs 74467064, R 2 = 0.3645, p < 0.0001) and rs 1970112. However, this approach required multiple testing correction and rs 17665445- rs 1970112 did not stand up to that scrutiny (BitEpi P FDR = 0.24). This highlights the importance of reducing the epistasis search space to minimise the multiple testing burden for effective epistasis detection as discussed in a review by Balvert et al ( 57 ). A better approach would have been to run the VariantSpark and BitEpi pipeline on the AoU dataset to pinpoint the SNPs in RBMS3 and CDKN2B-AS1 that are working together in epistasis specific to the AoU dataset. 
However, we were unable to do so due to limited access to AoU, which forms a limitation of this study. Another interesting discovery is VariantSpark’s increased power in identifying associations, aligning with our previous findings in Alzheimer’s disease ( 16 ). Specifically, on the TM cohort, VariantSpark was able to identify both LPA and CDKN2B-AS1 risk loci while PLINK only identified CDKN2B-AS1 . This is even more pronounced on the larger UKB dataset (UKB cohort, n = 51,107) with a well-defined phenotype. Here, VariantSpark identified CAD risk loci that were previously only detected in substantially larger cohorts, including PMAIP1-MC4R identified in 2015 with 184,505 samples ( 58 ), AAK1 in 2022 with 1,559,665 samples ( 5 ), and WWP2 in 2022 with 773,238 samples ( 6 ). None of these were identified by PLINK’s logistic regression on the UKB cohort. This increase in power is likely due to VariantSpark’s ability to detect both individual and interactive effects as well as capture non-linear relationships. Conclusions In conclusion, we have demonstrated the effectiveness of a two-stepped approach for epistasis detection in CAD, combining ML-based genome-wide loci prioritisation with exhaustive search for interactions. The first step leverages the higher sensitivity of VariantSpark over logistic regression to identify genetic variants associated with CAD, reducing the cohort sizes needed for epistasis detection. In combination with BitEpi, we uncovered RBMS3 as a novel CAD-risk gene, working in epistasis with CDKN2B-AS1 , a well-established but mechanistically unclear CAD risk locus. We show strong in silico evidence in support of this epistatic interaction, including AlphaFold3 predictions of physical interaction between Rbms3 protein and a CDKN2B-AS1 isoform, warranting further experimental validation.
These findings illustrate the potential of our approach, which, with expanding cohort sizes, will enable the discovery of additional epistasis, addressing the missing heritability of CAD and other complex traits. Methods Datasets and Cohort Selection GWAS & Epistasis Discovery using UK Biobank Detailed information regarding the UK Biobank (UKBB) study is as described by Bycroft et al. ( 19 ); however, a summary is provided here. Between 2006 and 2010, the UK Biobank recruited approximately 500,000 participants aged between 40 and 69 from England, Wales, and Scotland. A variety of phenotypic information and biological samples were collected and analysed at recruitment. In addition, participants’ electronic health records, which included inpatient International Classification of Disease (ICD) codes and OPCS Classification of Interventions and Procedures (OPCS-4) codes, were linked and incorporated. Coronary artery disease patients were identified using the following ICD-9 and ICD-10 codes: 410, 411, 413, 414; I20-25, Z951, Z955, and the following OPCS-4 codes: K40-46, K49, K50, K75. The control group were non-smoking participants with a calculated BMI < 30 and without diagnoses of other cardiovascular diseases and known comorbidities, including epilepsy and heart arrhythmias. Samples that passed the following quality control criteria were kept: (i) not carriers of full or mosaic sex chromosome aneuploidies, (ii) matched reported and genetic sex, (iii) without genetic kinship to other UKBB participants, (iv) call rates > 98%, (v) within ±3 standard deviations of the mean calculated heterozygosity rate, (vi) identified as white British with similar genetic ancestry based on principal component analysis of UKBB genotypes. The resulting number of cases was 25,107 with 157,665 available controls. To keep the ratio of case to controls 1:1, 26,000 controls were chosen at random.
GWAS Validation using TOPMed The TOPMed program sits under the broader precision medicine initiative of the National Institutes of Health (NIH) aimed to tailor disease treatments to an individual’s unique genomics and environment. Detailed information regarding the TOPMed study is as described by ( 20 ). In brief, high-coverage whole-genome sequencing (WGS) was completed on 53,831 TOPMed samples across 32 participating studies with deep phenotyping to understand risk factors for complex disorders, including heart disease. As each participating study collected phenotypic data such as physical measurements, clinical chemistry, and clinical registries individually, the TOPMed Data Coordinating Center (DCC) has created harmonized phenotypes for combined analyses. Access to five TOPMed datasets and their parent study was approved (Project #29236) and downloaded through dbGaP, including the Atherosclerosis Risk in Communities (ARIC) study, the Coronary Artery Risk Development in Young Adults (CARDIA) study, the Cardiovascular Health Study (CHS), the Framingham Heart Study (FHS), and the Women’s Health Initiative (WHI) study. Coronary artery disease patients were identified using the following harmonised datasets: atherosclerosis events incidents and prior except for peripheral arterial disease indicators. Where available, ICD-9 and ICD-10 codes were also extracted from individual studies where CAD patients were identified using the same codes as with the UK Biobank. The control group were non-smoking participants with BMIs < 30 and without indications of other cardiovascular diseases including hypertension and hyperlipidaemia. 
Samples were passed through the following quality control measures: (i) no sex aneuploidy, (ii) matched reported and genetic sex, (iii) unrelated to the second degree to other participants, (iv) call rates > 98%, (v) within ±3 standard deviations of calculated heterozygosity rates, (vi) clustered tightly with the 1000 Genomes Project European population on a principal component plot (PC1 vs PC2). The final number of samples was 11,326 with 5,509 cases of CAD. Epistasis Validation using All of Us The All of Us Research Program (All of Us) is a longitudinal study aimed at enrolling a diverse cohort of at least one million individuals across the United States to accelerate biomedical research and enhance human health( 59 ). Since its launch in May 2018, All of Us has enrolled over 633,000 participants aged 18 and older from more than 340 recruitment sites nationwide, contributing to the Curated Data Repository (CDR, Controlled Tier Dataset v8) released in 2025. The program integrates multiple data modalities, including electronic health records, survey responses (participant-provided information), physical measurements, biospecimens, and data from digital health technologies such as wearables. Coronary artery disease patients were identified using the following ICD-10 codes: I20-25, Z951 and Z955. The control group were non-smoking participants with a calculated BMI < 30 and without diagnoses of other cardiovascular diseases and known comorbidities, such as hypertensive disorder and diabetes. Samples that passed the following quality control criteria were kept: (i) not carriers of full or mosaic sex chromosome aneuploidies, (ii) matched self-reported and genetic sex, (iii) without genetic kinship to other All of Us participants, (iv) call rates > 98%, and (v) allele balance (AB) ≥ 0.2 for heterozygotes. The resulting number of cases was 41,525 with 134,206 available controls. 
Genotyping and Quality Control GWAS & Epistasis Discovery using UK Biobank UKBB samples were genotyped using either the Affymetrix UK BiLEVE Axiom or the Affymetrix UK Biobank Axiom array and mapped to the GRCh37 reference. 806,466 directly genotyped DNA sequence variants passed variant quality control. The UKBB team then performed imputation from a combined 1000 Genomes and UK10K reference panel using SHAPEIT3 and IMPUTE3. The following variant quality control filters were applied to the imputed genotypes: (i) imputation quality < 0.5, (ii) Hardy-Weinberg equilibrium P < 1 × 10⁻⁶, (iii) MAF < [threshold lost in text extraction], (iv) [criterion lost in text extraction; presumably call rate] > 98%. Only bi-allelic single nucleotide polymorphisms (SNPs) were kept resulting in a total of 4,253,140 genetic variants. GWAS Validation using TOPMed Standardised laboratory methods, a single pipeline of mapping and processing of WGS data to the GRCh38 reference, and joint variant calling and genotyping across all participating studies, years, and sequencing centres were performed to minimise batch effects. Stringent quality filters were used to ensure that batch effects were minimised and that genotype calls were of high quality. On average, 99.65% of the reference genome was covered to a mean read depth of 38.2x resulting in 3,748,599 variants per individual. The freeze 8 genotype data of the five TOPMed datasets accessed were downloaded individually and merged into a single VCF. The following variant quality control filters were then applied to the merged VCF: (i) Hardy-Weinberg equilibrium P < 1 × 10⁻⁶, (ii) MAF < [threshold lost in text extraction], (iii) [criterion lost in text extraction; presumably call rate] > 98%, (iv) bi-allelic single nucleotide polymorphisms (SNPs). As the TOPMed cohort was used to validate the UK Biobank findings, the genotype calls were mapped to the GRCh37 reference using liftOver ( 60 ) and subset to include SNPs also found in the UK Biobank genotype data resulting in 3,996,295 genetic variants. 
Epistasis Validation using All of Us The All of Us dataset contains short-read whole genome sequencing (srWGS) data for 414,830 participants, including more than 1.2 billion single nucleotide variants (SNVs) and indels, with multiallelic sites split into separate records. The dataset includes three callsets, and the Allele Count/Allele Frequency (ACAF) threshold callset was selected for this analysis. This callset includes variants with a population-specific allele frequency (AF) greater than 1% or a population-specific allele count (AC) greater than 100 in any of the computed ancestry subpopulations. The prioritised pairwise interactions from the BitEpi analysis on the UK Biobank cohort were filtered and then used to construct contingency tables (3 x 3 for each genotype combination) for CAD cases and controls. GWAS using Logistic Regression and VariantSpark Discovery using UK Biobank A logistic regression model between the 25,107 CAD cases and 26,000 controls and the dosage of 4,253,140 genetic variants (nV) was performed. Age, sex, genotype array, BMI, and the first 10 principal components were included as covariates. The widely accepted genome-wide significance threshold (i.e., 5 × 10⁻⁸) was used to correct for multiple-testing. For the VariantSpark implementation, genotype hard calls were used instead of dosages. This resulted in some imputed genotype dosages being coded as missing hard call genotypes due to a hard call threshold of 0.1 (i.e., a hard call is kept only when $0.5 \times \sum_i |x_i - \lfloor x_i \rceil| \not> 0.1$, where the $x_i$ are the 0–2-scaled allele dosages and $\lfloor \cdot \rceil$ denotes rounding to the nearest integer). The resultant missing hard call genotypes were imputed using VariantSpark’s inbuilt ‘mode’ imputation which replaces missing genotypes with the most frequent call of non-missing genotypes. Following hyperparameter optimisation (Supplementary Material S2, see file ‘Supplementary_Materials’), the following parameters for the final VariantSpark model were used: nTree = 15,000, mTry = 0.1 x nV, maximum depth = 13, minimum node size = 10,000. 
P-values were calculated using the RFlocalfdr approach( 61 ) and a false discovery rate of 5% was used to denote significance. The significant SNPs were then clumped using PLINK’s --clump function with the following settings: --clump-kb 5000 --clump-r2 0.1 and the 1000 Genomes Project European cohort. A locus is therefore defined as a 1 Mb region about the most significant SNP within the haplotype. Validation using TOPMed A logistic regression model between the 5,509 cases and 5,817 controls and the genotype hard calls of the 3,996,295 SNPs that overlapped with the UK Biobank was performed. Age, sex, BMI, and the first 10 principal components were included as covariates. The widely accepted genome-wide significance threshold (i.e., 5 × 10⁻⁸) was used to correct for multiple-testing. The same dataset was used in VariantSpark and as none of the SNPs had missing data, no imputation was required unlike with the UK Biobank cohort. Following hyperparameter optimisation (Supplementary Material S3, see file ‘Supplementary_Materials’), the following parameters were used: nTree = 15,000, mTry = 0.1 x nV, maximum depth = 13, minimum node size = 1,000. Similarly, P-values were calculated using the method outlined in Dunne et al. (2022)( 61 ) and a false discovery rate of 5% was used to denote significance. The significant SNPs were then clumped using PLINK’s --clump function with the following settings: --clump-kb 5000 --clump-r2 0.1 and the 1000 Genomes Project European cohort, defining a locus as the 1 Mb region about the most significant SNP within the haplotype. Epistasis Analysis using BitEpi Discovery using UK Biobank After clumping the 115 significant VariantSpark SNPs, the resulting 24 independent SNPs were exhaustively analysed with BitEpi( 18 ) for pairwise interactions (i.e., 276 tests). 
Briefly, the β-value, an entropy metric based on the concept of set-purity, quantifies the marginal and interaction effect of the pair and is used to compute the α-value, the interaction effect size for each interaction. In this study, all component α-values of the interaction are subtracted, rather than the maximum combined association power of any subset only as in Bayat et al., 2021( 18 ), to calculate a more accurate measure of the interaction effect size. Individual genotype numbers in contingency tables were also altered to remove the correlation between SNPs while preserving individual allele frequencies and case-control ratios. A permutation approach where the case-control phenotype was randomly permuted 1,000,000 times and the α-value calculated for each permutation was used to estimate the P value. The P value is calculated as the number of permutations where the permuted α-value is larger than the true α-value divided by the total number of permutations (i.e., 1e6). Validation using All of Us BitEpi was run on the contingency tables of the following pairs which were generated from the All of Us dataset: (i) rs74467064-rs1970112, (ii) rs9864192-rs1970112, (iii) rs17665445-rs1970112, (iv) rs9310894-rs1970112, (v) rs74467064-rs1537370. All values (β, α, and P ) were calculated as described above with 1e6 permutations for P-value estimations. Additionally, the chi2_contingency function from the Python SciPy library was used to compute the chi-square statistic and p-value for the chi-square test of independence. Downstream Analysis Variant to Causal Gene Significantly associated SNPs were functionally annotated using two mapping approaches: positional and molecular phenotype quantitative trait loci datasets. Firstly, based on chromosomal position, the variants were mapped to dbSNP v. 150 for rsIDs, RefSeq genes, and functional predictions using ANNOVAR ( 62 ) (version April 2021). 
The Stockholm-Tartu Atherosclerosis Reverse Networks Engineering Task (STARNET; http://starnet.mssm.edu/ ) was used to determine the expression quantitative trait loci information of associated SNPs. Structure Predictions and Motif Analysis AlphaFold3 was used to predict three interactions with six seeds per interaction. The best model (seed) for each interaction was chosen through AlphaFold3’s ranking score which reflects the overall quality and physical plausibility of the predicted interactions. The three interactions were: 1) CDKN2B-AS1 NR003529.4 transcript sequence in FASTA format and Rbms3 protein sequence (UniProt ID = Q6XE24), 2) CDKN2B-AS1 NR003529.4 transcript sequence in FASTA format and Rbms1 protein (UniProt ID = P29558), and 3) CDKN2B-AS1 NR003529.4 transcript sequence in FASTA format and Actin protein (UniProt ID = P60709). The resulting .cif files were input into Mol* viewer( 63 ) to visualise, identify contact points, find conserved RNA-binding protein residues, and to identify the regions mapping to structural points of interest. The genomic coordinates (GRCh38/hg38) chr9:22120503-22120712:+ were used as the query coordinates for RBPmap( 39 ), and RBMS3 was selected from the RBPmap motifs database for motif selection. Default settings for all other options were kept. RBPmap results were visualised in the UCSC Genome Browser through the inbuilt option. The SNPs rs74467064 and rs1970112 were queried on FORGEdb’s web server to obtain their FORGE2 scores. Data Availability Access to the UK Biobank dataset is upon application and with permission of the UKB Research Ethics Committee. Access to the TOPMed dataset is through dbGaP. This study was conducted under the approved project 29236. Access to the All of Us cohort is upon application. This study was conducted through Cedars-Sinai Medical Center. 
http://www.ukbiobank.ac.uk/using-the-resource https://topmed.nhlbi.nih.gov/topmed-data-access-scientific-community https://www.researchallofus.org/register/ Data Availability Access to the UK Biobank dataset is upon application as described at http://www.ukbiobank.ac.uk/using-the-resource and with permission of UKB’s Research Ethics Committee. Access to the TOPMed dataset is through dbGaP and is upon application as described at https://topmed.nhlbi.nih.gov/topmed-data-access-scientific-community . This study was conducted under the approved project #29236. Access to the All of Us cohort is upon application as described at https://www.researchallofus.org/register/ . This study was conducted through Cedars-Sinai Medical Center. Code Availability VariantSpark (v0.5.3) is freely available on GitHub: https://github.com/aehrc/variantspark and on AWS and Azure marketplace. An example Jupyter notebook is included in the GitHub repository for reference and is reflective of the Jupyter notebook used for this study. Operating system: Linux. Programming language: Python, HAIL, Bash. Other requirements: Scala, Apache Spark, htsjdk, args4j, Joda-Time, fastutil, scala-csv. Licence: CSIRO Open Source Software Licence v1.0. BitEpi is available on GitHub: https://github.com/aehrc/bitepi . Operating system: Platform independent. Programming language: Python. Licence: CSIRO Open Source Software Licence v1.0 Competing Interests The authors declare no competing interests. Author Contribution J.W.V., D.C.B., & N.A.T. conceived of this study. B.H., P.S., R.R., & Y.J. developed VariantSpark and BitEpi. L.M.F.S. ran all analyses with contributions from M.O.B. for the TOPMed dataset, M.V., P.J.F., Z.W., & J.H.M. for the All of Us analysis, and A.H.K. & M.K. for AlphaFold3 analysis. L.M.F.S. wrote the manuscript with contributions from A.P. All authors reviewed and approved the final draft of the report. 
Additional files Description of data File name: Supplementary_Materials.docx Title of data: Supplementary Materials Description of data: Additional material in support of the findings in primary manuscript, including hyperparameter optimisation for VariantSpark and AlphaFold3 analysis. File Name: Supplementary_Tables.xlsx Title of data: Supplementary Tables Description of data: Additional material in support of the findings in the primary manuscript, including associated variants from VariantSpark and Logistic Regression analysis, BitEpi results and AlphaFold3 results. Acknowledgements This study was conducted using the UK Biobank Resource under Application Number 27483. References 1. ↵ Lloyd-Jones DM , Nam BH , D’Agostino , Sr RB , Levy D , Murabito JM , Wang TJ , et al. Parental Cardiovascular Disease as a Risk Factor for Cardiovascular Disease in Middle-aged Adults: A Prospective Study of Parents and Offspring . JAMA . 2004 May 12 ; 291 ( 18 ): 2204 . OpenUrl CrossRef PubMed Web of Science 2. ↵ Marenberg ME , Risch N , Berkman LF , Floderus B , de Faire U . Genetic Susceptibility to Death from Coronary Heart Disease in a Study of Twins . N Engl J Med . 1994 Apr 14 ; 330 ( 15 ): 1041 – 6 . OpenUrl CrossRef PubMed Web of Science 3. ↵ Samani NJ , Erdmann J , Hall AS , Hengstenberg C , Mangino M , Mayer B , et al. Genomewide Association Analysis of Coronary Artery Disease . N Engl J Med . 2007 Aug 2 ; 357 ( 5 ): 443 – 53 . OpenUrl CrossRef PubMed Web of Science 4. ↵ The Wellcome Trust Case Control Consortium . Genome-wide association study of 14,000 cases of seven common diseases and 3,000 shared controls . Nature . 2007 June ; 447 ( 7145 ): 661 – 78 . OpenUrl CrossRef PubMed Web of Science 5. ↵ Aragam KG , Jiang T , Goel A , Kanoni S , Wolford BN , Atri DS , et al. Discovery and systematic characterization of risk variants and genes for coronary artery disease in over a million participants . Nat Genet . 2022 Dec ; 54 ( 12 ): 1803 – 15 . OpenUrl CrossRef PubMed 6. 
↵ Tcheandjieu C , Zhu X , Hilliard AT , Clarke SL , Napolioni V , Ma S , et al. Large-scale genome-wide association study of coronary artery disease in genetically diverse populations . Nat Med . 2022 Aug ; 28 ( 8 ): 1679 – 92 . OpenUrl CrossRef PubMed 7. ↵ Rocheleau G , Clarke SL , Auguste G , Hasbani NR , Morrison AC , Heath AS , et al. Rare variant contribution to the heritability of coronary artery disease . Nat Commun . 2024 Oct 9 ; 15 ( 1 ): 8741 . OpenUrl CrossRef PubMed 8. ↵ Turner AW , Nikpay M , Silva A , Lau P , Martinuk A , Linseman TA , et al. Functional interaction between COL4A1 / COL4A2 and SMAD3 risk loci for coronary artery disease . Atherosclerosis . 2015 Oct 1 ; 242 ( 2 ): 543 – 52 . OpenUrl CrossRef PubMed 9. Nie SF , Zha LF , Fan Q , Liao YH , Zhang HS , Chen QW , et al. Genetic Regulation of the Thymic Stromal Lymphopoietin (TSLP)/TSLP Receptor (TSLPR) Gene Expression and Influence of Epistatic Interactions Between IL-33 and the TSLP/TSLPR Axis on Risk of Coronary Artery Disease . Front Immunol . 2018 Aug 3 ; 9 : 1775 . OpenUrl PubMed 10. Li Y , Cho H , Wang F , Canela-Xandri O , Luo C , Rawlik K , et al. Statistical and Functional Studies Identify Epistasis of Cardiovascular Risk Genomic Variants From Genome-Wide Association Studies . JAHA [Internet] . 2020 Apr 9 [cited 2021 July 9 ]; 9 ( 7 ). Available from: https://www.ahajournals.org/doi/10.1161/JAHA.119.014146 11. Zeng L , Moser S , Mirza-Schreiber N , Lamina C , Coassin S , Nelson CP , et al. Cis-epistasis at the LPA locus and risk of cardiovascular diseases . Cardiovascular Research . 2021 Apr 20 ; cvab136 . 12. Snaebjarnarson AS , Helgadottir A , Arnadottir GA , Ivarsdottir EV , Thorleifsson G , Ferkingstad E , et al. Complex effects of sequence variants on lipid levels and coronary artery disease . Cell . 2023 Sept 14 ; 186 ( 19 ): 4085 – 4099 .e15. OpenUrl PubMed 13. ↵ Tsai CT , Hwang JJ , Ritchie MD , Moore JH , Chiang FT , Lai LP , et al. 
Renin–angiotensin system gene polymorphisms and coronary artery disease in a large angiographic cohort: Detection of high order gene–gene interaction . Atherosclerosis . 2007 Nov 1 ; 195 ( 1 ): 172 – 80 . OpenUrl CrossRef PubMed Web of Science 14. ↵ Moore JH . The Challenges of Whole-Genome Approaches to Common Diseases . JAMA: The Journal of the American Medical Association . 2004 Apr 7 ; 291 ( 13 ): 1642 – 3 . OpenUrl CrossRef PubMed Web of Science 15. ↵ McKinney BA , Reif DM , Ritchie MD , Moore JH . Machine Learning for Detecting Gene-Gene Interactions: A Review . Applied Bioinformatics . 2006 ; 5 ( 2 ): 77 – 88 . OpenUrl CrossRef PubMed 16. ↵ Lundberg M , Sng LMF , Szul P , Dunne R , Bayat A , Burnham SC , et al. Novel Alzheimer’s disease genes and epistasis identified using machine learning GWAS platform . Sci Rep . 2023 Oct 17 ; 13 ( 1 ): 17662 . OpenUrl CrossRef PubMed 17. ↵ Bayat A , Szul P , O’Brien AR , Dunne R , Hosking B , Jain Y , et al. VariantSpark: Cloud-based machine learning for association study of complex phenotype and large-scale genomic data . GigaScience . 2020 Aug 1 ; 9 ( 8 ): giaa077 . OpenUrl PubMed 18. ↵ Bayat A , Hosking B , Jain Y , Hosking C , Kodikara M , Reti D , et al. Fast and accurate exhaustive higher-order epistasis search with BitEpi . Sci Rep . 2021 Aug 5 ; 11 : 15923 . 19. ↵ Bycroft C , Freeman C , Petkova D , Band G , Elliott LT , Sharp K , et al. The UK Biobank resource with deep phenotyping and genomic data . Nature . 2018 Oct ; 562 ( 7726 ): 203 – 9 . OpenUrl CrossRef PubMed 20. ↵ Taliun D , Harris DN , Kessler MD , Carlson J , Szpiech ZA , Torres R , et al. Sequencing of 53,831 diverse genomes from the NHLBI TOPMed Program . Nature . 2021 Feb ; 590 ( 7845 ): 290 – 9 . OpenUrl CrossRef PubMed 21. ↵ Costanzo MC , Roselli C , Brandes M , Duby M , Hoang Q , Jang D , et al. Cardiovascular Disease Knowledge Portal: A Community Resource for Cardiovascular Disease Research . Circulation: Genomic and Precision Medicine . 
2023 Dec ; 16 ( 6 ): e004181 . OpenUrl 22. ↵ Cardiovascular Disease Portal (cvd.hugeamp.org) . Coronary Artery Disease Phenotype page . [Internet] . 2025 [cited 2025 Feb 24 ]. Available from: https://cvd.hugeamp.org/phenotype.html?phenotype=CAD 23. ↵ Dunne R , Reguant R , Ramarao-Milne P , Szul P , Sng LMF , Lundberg M , et al. Thresholding Gini variable importance with a single-trained random forest: An empirical Bayes approach . Computational and Structural Biotechnology Journal . 2023 Jan 1 ; 21 : 4354 – 60 . OpenUrl 24. ↵ Verma A , Huffman JE , Rodriguez A , Conery M , Liu M , Ho YL , et al. Diversity and scale: Genetic architecture of 2068 traits in the VA Million Veteran Program . Science . 2024 July 19 ; 385 ( 6706 ): eadj1182 . OpenUrl CrossRef PubMed 25. ↵ Gagliano Taliun SA , VandeHaar P , Boughton AP , Welch RP , Taliun D , Schmidt EM , et al. Exploring and visualizing large-scale genetic associations by using PheWeb . Nat Genet . 2020 June ; 52 ( 6 ): 550 – 2 . OpenUrl CrossRef PubMed 26. ↵ van der Harst P , Verweij N . Identification of 64 Novel Genetic Loci Provides an Expanded View on the Genetic Architecture of Coronary Artery Disease . Circ Res . 2018 Feb 2 ; 122 ( 3 ): 433 – 43 . OpenUrl Abstract / FREE Full Text 27. ↵ Sarnowski C , Chen H , Biggs ML , Wassertheil-Smoller S , Bressler J , Irvin MR , et al. Identification of novel and rare variants associated with handgrip strength using whole genome sequence data from the NHLBI Trans-Omics in Precision Medicine (TOPMed) Program . PLOS ONE . 2021 July 2 ; 16 ( 7 ): e0253611 . OpenUrl PubMed 28. ↵ Chang CC , Chow CC , Tellier LC , Vattikuti S , Purcell SM , Lee JJ . Second-generation PLINK: rising to the challenge of larger and richer datasets . GigaScience . 2015 Dec 1 ; 4 ( 1 ): s13742-015-0047–8 . OpenUrl CrossRef 29. ↵ Li Z , Guo Q , Zhang J , Fu Z , Wang Y , Wang T , et al. The RNA-Binding Motif Protein Family in Cancer: Friend or Foe? Frontiers in Oncology . 2021 Nov 4 ; 11 : 757135 . 30. 
↵ Koplev S , Seldin M , Sukhavasi K , Ermel R , Pang S , Zeng L , et al. A mechanistic framework for cardiometabolic and coronary artery diseases . Nat Cardiovasc Res . 2022 Jan ; 1 ( 1 ): 85 – 100 . OpenUrl PubMed 31. ↵ Boua PR , Brandenburg JT , Choudhury A , Sorgho H , Nonterah EA , Agongo G , et al. Genetic associations with carotid intima-media thickness link to atherosclerosis with sex-specific effects in sub-Saharan Africans . Nat Commun . 2022 Feb 14 ; 13 ( 1 ): 855 . OpenUrl CrossRef PubMed 32. ↵ Dong S , Ma M , Li M , Guo Y , Zuo X , Gu X , et al. LncRNA MEG3 regulates breast cancer proliferation and apoptosis through miR-141-3p/RBMS3 axis . Genomics . 2021 July 1 ; 113 ( 4 ): 1689 – 704 . OpenUrl PubMed 33. ↵ Yang M , Lu H , Liu J , Wu S , Kim P , Zhou X . lncRNAfunc: a knowledgebase of lncRNA function in human cancer . Nucleic Acids Research . 2022 Jan 7 ; 50 ( D1 ): D1295 – 306 . OpenUrl CrossRef PubMed 34. ↵ Tyler A , Mahoney JM , Carter GW . Genetic Interactions Affect Lung Function in Patients with Systemic Sclerosis . G3 Genes|Genomes|Genetics . 2020 Jan 1 ; 10 ( 1 ): 151 – 63 . OpenUrl 35. ↵ Yang T , Guo Y , Li J , Zhang L , Shen H , Li SM , et al. Gene-gene interaction between RBMS3 and ZNF516 influences bone mineral density . Journal of Bone and Mineral Research . 2013 Apr 1 ; 28 ( 4 ): 828 – 37 . OpenUrl PubMed 36. ↵ Hubberten M , Bochenek G , Chen H , Häsler R , Wiehe R , Rosenstiel P , et al. Linear isoforms of the long noncoding RNA CDKN2B-AS1 regulate the c-myc-enhancer binding factor RBMS1 . Eur J Hum Genet . 2019 Jan ; 27 ( 1 ): 80 – 9 . OpenUrl PubMed 37. ↵ Abramson J , Adler J , Dunger J , Evans R , Green T , Pritzel A , et al. Accurate structure prediction of biomolecular interactions with AlphaFold 3 . Nature . 2024 June ; 630 ( 8016 ): 493 – 500 . OpenUrl CrossRef PubMed 38. ↵ Khazanov NA , Carlson HA . Exploring the Composition of Protein-Ligand Binding Sites on a Large Scale . PLoS Comput Biol . 2013 Nov 21 ; 9 ( 11 ): e1003321 . 
OpenUrl CrossRef PubMed 39. ↵ Paz I , Kosti I , Ares M , Cline M , Mandel-Gutfreund Y . RBPmap: a web server for mapping binding sites of RNA-binding proteins . Nucleic Acids Research . 2014 July 1 ; 42 ( W1 ): W361 – 7 . OpenUrl CrossRef PubMed Web of Science 40. ↵ Breeze CE , Haugen E , Gutierrez-Arcelus M , Yao X , Teschendorff A , Beck S , et al. FORGEdb: a tool for identifying candidate functional variants and uncovering target genes and mechanisms for complex diseases . Genome Biology . 2024 Jan 2 ; 25 ( 1 ): 3 . OpenUrl PubMed 41. ↵ Shu X , Long J , Cai Q , Kweon SS , Choi JY , Kubo M , et al. Identification of novel breast cancer susceptibility loci in meta-analyses conducted among Asian and European descendants . Nat Commun . 2020 Mar 5 ; 11 ( 1 ): 1217 . OpenUrl PubMed 42. ↵ Zhou Y , Liang Z , Xia Y , Li S , Liang J , Hu Z , et al. Disruption of RBMS3 suppresses PD-L1 and enhances antitumor immune activities and therapeutic effects of auranofin against triple-negative breast cancer . Chemico-Biological Interactions . 2023 Jan 5 ; 369 : 110260 . 43. ↵ Gorman BR , Ji SG , Francis M , Sendamarai AK , Shi Y , Devineni P , et al. Multi-ancestry GWAS meta-analyses of lung cancer reveal susceptibility loci and elucidate smoking-independent genetic risk . Nat Commun . 2024 Oct 4 ; 15 ( 1 ): 8629 . OpenUrl PubMed 44. ↵ Mahajan A , Spracklen CN , Zhang W , Ng MCY , Petty LE , Kitajima H , et al. Multi-ancestry genetic study of type 2 diabetes highlights the power of diverse populations for discovery and translation . Nat Genet . 2022 May ; 54 ( 5 ): 560 – 72 . OpenUrl CrossRef PubMed 45. ↵ McPherson R , Pertsemlidis A , Kavaslar N , Stewart A , Roberts R , Cox DR , et al. A Common Allele on Chromosome 9 Associated with Coronary Heart Disease . Science . 2007 June 8 ; 316 ( 5830 ): 1488 – 91 . OpenUrl Abstract / FREE Full Text 46. ↵ Cho H , Li Y , Archacki S , Wang F , Yu G , Chakrabarti S , et al. 
Splice variants of lncRNA RNA ANRIL exert opposing effects on endothelial cell activities associated with coronary artery disease . RNA Biology . 2020 Oct 2 ; 17 ( 10 ): 1391 – 401 . OpenUrl PubMed 47. ↵ Holdt LM , Stahringer A , Sass K , Pichler G , Kulak NA , Wilfert W , et al. Circular non-coding RNA ANRIL modulates ribosomal RNA maturation and atherosclerosis in humans . Nat Commun . 2016 Aug 19 ; 7 ( 1 ): 12429 . OpenUrl CrossRef PubMed 48. ↵ Burd CE , Jeck WR , Liu Y , Sanoff HK , Wang Z , Sharpless NE . Expression of Linear and Novel Circular Forms of an INK4/ARF-Associated Non-Coding RNA Correlates with Atherosclerosis Risk . PLOS Genetics . 2010 Dec 2 ; 6 ( 12 ): e1001233 . OpenUrl 49. ↵ Li H , Han S , Sun Q , Yao Y , Li S , Yuan C , et al. Long non-coding RNA CDKN2B-AS1 reduces inflammatory response and promotes cholesterol efflux in atherosclerosis by inhibiting ADAM10 expression . Aging (Albany NY ). 2019 Mar 29 ; 11 ( 6 ): 1695 . OpenUrl PubMed 50. ↵ Block CJ , Mitchell AV , Wu L , Glassbrook J , Craig D , Chen W , et al. RNA binding protein RBMS3 is a common EMT effector that modulates triple-negative breast cancer progression via stabilizing PRRX1 mRNA . Oncogene . 2021 Nov ; 40 ( 46 ): 6430 – 42 . OpenUrl CrossRef PubMed 51. ↵ Yang Y , Quan L , Ling Y . RBMS3 Inhibits the Proliferation and Metastasis of Breast Cancer Cells . Oncol Res . 2018 Jan 19 ; 26 ( 1 ): 9 – 15 . OpenUrl PubMed 52. ↵ Chen J , Kwong DLW , Zhu CL , Chen LL , Dong SS , Zhang LY , et al. RBMS3 at 3p24 Inhibits Nasopharyngeal Carcinoma Development via Inhibiting Cell Proliferation, Angiogenesis, and Inducing Apoptosis . PLOS ONE . 2012 Sept 5 ; 7 ( 9 ): e44636 . OpenUrl CrossRef PubMed 53. ↵ Xiao B , Velez Edwards DR , Lucas A , Drivas T , Gray K , Keating B , et al. Inference of Causal Relationships Between Genetic Risk Factors for Cardiometabolic Phenotypes and Female-Specific Health Conditions . Journal of the American Heart Association . 2023 Mar 7 ; 12 ( 5 ): e026561 . 
OpenUrl CrossRef PubMed 54. ↵ Penkov D , Ni R , Else C , Piñol-Roma S , Ramirez F , Tanaka S . Cloning of a human gene closely related to the genes coding for the c-myc single-strand binding proteins . Gene . 2000 Feb 8 ; 243 ( 1 ): 27 – 36 . OpenUrl CrossRef PubMed 55. ↵ Li J , Chen J , Zhang F , Li J , An S , Cheng M , et al. LncRNA CDKN2B-AS1 hinders the proliferation and facilitates apoptosis of ox-LDL-induced vascular smooth muscle cells via the ceRNA network of CDKN2B-AS1/miR-126-5p/PTPN7 . International Journal of Cardiology . 2021 Oct 1 ; 340 : 79 – 87 . OpenUrl PubMed 56. ↵ Ritchie MD , Steen KV . The search for gene-gene interactions in genome-wide association studies: challenges in abundance of methods, practical considerations, and biological interpretation . Annals of Translational Medicine . 2018 Apr ; 6 ( 8 ): 157 – 157 . OpenUrl PubMed 57. ↵ Balvert M , Cooper-Knock J , Stamp J , Byrne RP , Mourragui S , van Gils J , et al. Considerations in the search for epistasis . Genome Biology . 2024 Nov 19 ; 25 ( 1 ): 296 . OpenUrl CrossRef PubMed 58. ↵ Nikpay M , Goel A , Won HH , Hall LM , Willenborg C , Kanoni S , et al. A comprehensive 1000 Genomes–based genome-wide association meta-analysis of coronary artery disease . Nat Genet . 2015 Oct ; 47 ( 10 ): 1121 – 30 . OpenUrl CrossRef PubMed 59. ↵ The All of Us Research Program Investigators . The “All of Us” Research Program . N Engl J Med . 2019 Aug 15 ; 381 ( 7 ): 668 – 76 . OpenUrl CrossRef PubMed 60. ↵ Nassar LR , Barber GP , Benet-Pagès A , Casper J , Clawson H , Diekhans M , et al. The UCSC Genome Browser database: 2023 update . Nucleic Acids Research . 2023 Jan 6 ; 51 ( D1 ): D1188 – 95 . OpenUrl CrossRef PubMed 61. ↵ Dunne R , Reguant R , Ramarao-Milne P , Szul P , Sng L , Lundberg M , et al. Thresholding Gini Variable Importance with a single trained Random Forest: An Empirical Bayes Approach [Internet] . bioRxiv ; 2022 [cited 2023 May 31 ]. p. 2022.04.06.487300. 
Available from: https://www.biorxiv.org/content/10.1101/2022.04.06.487300v2 62. ↵ Wang K , Li M , Hakonarson H . ANNOVAR: functional annotation of genetic variants from high-throughput sequencing data . Nucleic Acids Research . 2010 Sept 1 ; 38 ( 16 ): e164 – e164 . OpenUrl CrossRef PubMed 63. ↵ Sehnal D , Bittrich S , Deshpande M , Svobodová R , Berka K , Bazgier V , et al. Mol* Viewer: modern web app for 3D visualization and analysis of large biomolecular structures . Nucleic Acids Research . 2021 July 2 ; 49 ( W1 ): W431 – 7 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 21, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Novel Epistatic Interaction Between RBMS3 and CDKN2B-AS1 in Coronary Artery Disease Risk Identified by Machine Learning Tool VariantSpark Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Novel Epistatic Interaction Between RBMS3 and CDKN2B-AS1 in Coronary Artery Disease Risk Identified by Machine Learning Tool VariantSpark Letitia M.F. Sng , Mitchell J. O’Brien , Brendan Hosking , Piotr Szul , Roc Reguant , Mythreye Venkatesan , Philip J. Freda , Zhiping Wang , Jason H. Moore , Anne H. Klein , Michael Kuiper , Angus Panagopoulos , Johan W. Verjans , Yatish Jain , Denis C. Bauer , Natalie A. 
Twine medRxiv 2025.10.19.25338331; doi: https://doi.org/10.1101/2025.10.19.25338331 Share This Article: Copy Citation Tools Novel Epistatic Interaction Between RBMS3 and CDKN2B-AS1 in Coronary Artery Disease Risk Identified by Machine Learning Tool VariantSpark Letitia M.F. Sng , Mitchell J. O’Brien , Brendan Hosking , Piotr Szul , Roc Reguant , Mythreye Venkatesan , Philip J. Freda , Zhiping Wang , Jason H. Moore , Anne H. Klein , Michael Kuiper , Angus Panagopoulos , Johan W. Verjans , Yatish Jain , Denis C. Bauer , Natalie A. Twine medRxiv 2025.10.19.25338331; doi: https://doi.org/10.1101/2025.10.19.25338331 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9219) Radiology and 
Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffba9780ad9dfa9',t:'MTc3OTQ1MTI0Mg=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00