Multimodal AI/ML for discovering novel biomarkers and predicting disease using multi-omics profiles of patients with cardiovascular diseases

doi:10.1101/2024.08.07.607041

Multimodal AI/ML for discovering novel biomarkers and predicting disease using multi-omics profiles of patients with cardiovascular diseases

2024 · doi:10.1101/2024.08.07.607041

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 82,080 characters · extracted from preprint-html · click to expand

Multimodal AI/ML for discovering novel biomarkers and predicting disease using multi-omics profiles of patients with cardiovascular diseases | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Multimodal AI/ML for discovering novel biomarkers and predicting disease using multi-omics profiles of patients with cardiovascular diseases William DeGroat , Habiba Abdelhalim , Elizabeth Peker , Neev Sheth , Rishabh Narayanan , Saman Zeeshan , Bruce T. 
Liang , View ORCID Profile Zeeshan Ahmed doi: https://doi.org/10.1101/2024.08.07.607041 William DeGroat 1 Rutgers Institute for Health, Health Care Policy and Aging Research , Rutgers, The State University of New Jersey , 112 Paterson St, New Brunswick, 08901, NJ, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Habiba Abdelhalim 1 Rutgers Institute for Health, Health Care Policy and Aging Research , Rutgers, The State University of New Jersey , 112 Paterson St, New Brunswick, 08901, NJ, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Elizabeth Peker 1 Rutgers Institute for Health, Health Care Policy and Aging Research , Rutgers, The State University of New Jersey , 112 Paterson St, New Brunswick, 08901, NJ, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Neev Sheth 1 Rutgers Institute for Health, Health Care Policy and Aging Research , Rutgers, The State University of New Jersey , 112 Paterson St, New Brunswick, 08901, NJ, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rishabh Narayanan 1 Rutgers Institute for Health, Health Care Policy and Aging Research , Rutgers, The State University of New Jersey , 112 Paterson St, New Brunswick, 08901, NJ, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Saman Zeeshan 2 Department of Biomedical and Health Informatics, UMKC School of Medicine , 2411 Holmes Street, Kansas City, MO, 64108, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Bruce T. 
Liang 3 Pat and Jim Calhoun Cardiology Center , UConn Health, 263 Farmington Ave, Farmington, CT, USA 4 UConn School of Medicine, University of Connecticut , 263 Farmington Ave, Farmington, CT, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zeeshan Ahmed 1 Rutgers Institute for Health, Health Care Policy and Aging Research , Rutgers, The State University of New Jersey , 112 Paterson St, New Brunswick, 08901, NJ, USA 4 UConn School of Medicine, University of Connecticut , 263 Farmington Ave, Farmington, CT, USA 5 Department of Medicine, Division of Cardiovascular Disease and Hypertension, Robert Wood Johnson Medical School , Rutgers Health, 125 Paterson St, New Brunswick, NJ, 08901, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zeeshan Ahmed For correspondence: zahmed{at}ifh.rutgers.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Cardiovascular diseases (CVDs) are multifactorial diseases, requiring personalized assessment and treatment. The advancements in multi-omics technologies, namely RNA-seq and whole genome sequencing, have offered translational researchers a comprehensive view of the human genome; utilizing this data, we can reveal novel biomarkers and segment patient populations based on personalized risk factors. Limitations in these technologies in failing to capture disease complexity can be accounted for by using an integrated approach, characterizing variants alongside expression related to emerging phenotypes. Designed and implemented data analytics methodology is based on a nexus of orthodox bioinformatics, classical statistics, and multimodal artificial intelligence and machine learning techniques. Our approach has the potential to reveal the intricate mechanisms of CVD that can facilitate patient-specific disease risk and response profiling. 
We sourced transcriptomic expression and variants from CVD and control subjects. By integrating these multi-omics datasets with clinical demographics, we generated patient-specific profiles. Utilizing a robust feature selection approach, we reported a signature of 27 transcripts and variants efficient at predicting CVD. Here, differential expression analysis and minimum redundancy maximum relevance feature selection elucidated biomarkers explanatory of the disease phenotype. We used Combined Annotation Dependent Depletion and allele frequencies to identify variants with pathogenic characteristics in CVD patients. Classification models trained on this signature demonstrated high-accuracy predictions for CVDs. Overall, we observed an XGBoost model hyperparameterized using Bayesian optimization perform the best (AUC 1.0). Using SHapley Additive exPlanations, we compiled risk assessments for patients capable of further contextualizing these predictions in a clinical setting. We discovered a 27-component signature explanatory of phenotypic differences in CVD patients and healthy controls using a feature selection approach prioritizing both biological relevance and efficiency in machine learning. Literature review revealed previous CVD associations in a majority of these diagnostic biomarkers. Classification models trained on this signature were able to predict CVD in patients with high accuracy. Here, we propose a framework generalizable to other diseases and disorders. 1. Introduction Cardiovascular diseases (CVDs) are recognized as the primary cause of mortality among men and women in the United States [ 1 , 2 ]. Given the complex nature, risk factors, inherent genetic makeup, and trajectory of CVD, personalized management is essential for effective treatment [ 2 ]. Advancements in genomics and bioinformatics have significantly enhanced our understanding of the intricate origins of CVDs [ 3 , 4 ]. 
Gaining insights into disease implications by utilizing transcriptomic expression and variant profiles holds the promise of revolutionizing diagnostic capabilities, treatment strategies, and prognostic assessments across various CVDs including but not limited to heart failure (HF) and atrial fibrillation (AF) [ 5 , 6 ]. These advancements stem from next-generation sequencing (NGS) technologies, which have facilitated the identification of novel heritable links and the exploration of genetic diversity among patients [ 7 ]. Gene expression analysis through RNA-seq data has aided in uncovering disease associated biomarkers and categorizing patient groups according to their risk profiles [ 8 ]. Analyzing RNA-seq data for differential expression allows for the exploration of genome-wide biological disparities, leading to enriched functional pathways and gene ontologies [ 9 , 10 ]. RNA-seq data provides valuable biological insights into gene expression, RNA processing, and molecular pathways underlying disease states [ 11 , 12 ]. While gene expression analysis allows for enhancements in diagnostic capabilities and precise treatment plans, multiple studies have established that RNA-seq provides limited coverage of non-coding regions, and that transcriptomics cannot detect genomic variants [ 11 , 12 , 13 ]. The onset of multifactorial diseases is shaped by an interplay of environmental and genetic factors, affecting various biological processes such as gene regulation [ 14 ]. Previous studies utilizing whole genome and exome sequencing (WGS/WES) have demonstrated their efficacy in accurately revealing the effects of non-coding variants on CVDs [ 15 , 16 ] and other complex diseases [ 17 ], as well as in capturing all genetic variation, thus providing comprehensive information about an individual’s entire genome [ 16 , 17 ]. 
Although sequencing technology aids in identifying genetic variations linked to diseases, accurately linking specific genomic variations to disease phenotypes remains challenging [ 3 , 18 ]. Deciphering the pathogenic and biological function of genes may require additional information beyond what one type of data can offer [ 18 ]. Data integration is vital in managing the escalating volume of data and obtaining comprehensive interdisciplinary insights into extensive genomic datasets [ 19 ]. Additionally, due to the heterogenous nature of genomic, transcriptomic, and clinical data, there is a lack of standardization creating a persistent limitation in data integration [ 18 ]. These challenges are being addressed with the integration of precision medicine and artificial intelligence (AI)/machine learning (ML) approaches where phenotypic, clinical, transcriptomic, and genomic data can be subjected to classification and selection to facilitate the identification of high-risk patients [ 18 , 20 ]. Utilizing cutting edge AI/ML technology can aid in the analysis and interpretation of gene expression and variant data, providing more accurate diagnosis and improving our understanding of the mechanisms behind complex diseases including but not limited to CVDs [ 21 , 22 ], lupus [ 23 ], and colon cancer [ 24 ]. Previously, we have performed traditional bioinformatic analyses including an in-depth gene expression and enrichment analysis of RNA-seq data from patients with mostly HF and other CVDs. We identified differentially expressed genes (DEGs) that are well-documented to be associated with CVDs and other enriched pathways [ 25 ]. However, we were unable to detect any CVD drivers utilizing RNA-seq data. To address this limitation, we employed an integrative, multi-omics approach of gene expression, disease-causing gene variants and associated phenotypes among CVD populations [ 6 ]. 
In this study we combined specific mutations for the DEGs we had previously reported allowing for a better understanding of CVD progression [ 6 ]. Extending our research and expanding beyond orthodox bioinformatic techniques, we implemented AI/ML techniques on RNA-seq driven gene expression data to study biomarkers associated with HF, AF, and other CVDs [ 26 ]. Our AI/ML analysis supported our initial gene expression study as we were able to identify common genes that have a high impact on CVD diagnosis [ 25 , 26 ]. Additionally, this AI/ML framework aided in establishing Hygieia , a portable pipeline which integrates genomics and healthcare data to explore genes linked to specific disorders and predict disease [ 27 ]. While we were able to predict CVDs with high accuracy using this methodology, we were only focused on CVD driver genes with genetic alterations that can culminate in CVD [ 26 ]. We overcame this challenge by using whole transcriptome-based gene expression data and further enhanced our AI/ML model to adapt a novel nexus of algorithms to predict CVDs based on crucial transcriptomic biomarkers [ 28 ]. We utilized this approach and proposed IntelliGenes , a novel AI/ML pipeline for the identification of novel biomarkers and predict disease in individuals [ 29 ]. In this study, we leverage our previous work and present a new AI/ML approach that uses multi-omics data integrating RNA-seq driven gene expression, whole genome based single nucleotide variant (SNVs), and demographic and clinical data ( Figure 1 ). Novel biomarkers based on differential expression associated with CVDs were investigated for pathogenic SNVs to identify variations within the genes and their regulatory elements. A clinically integrated genomic and transcriptomic (CIGT) dataset was leveraged by two classifiers and three ML algorithms to accurately predict CVDs. 
Through the identification of genetic biomarkers and their relative SNPs, we have highlighted potential indicators for the early detection of CVDs. These biomarkers aid in identifying individuals at risk pre-diagnosis, enabling prompt intervention, and enhancing patient outcomes. With its implementation in healthcare, our predictive model can identify patients at risk of CVDs and may be extrapolated to perform other single disease predictions. Download figure Open in new tab Figure 1. Study design and workflow. This figure represents a summary of our study design: I) Transcriptomic expression, II) Pathogenic Variant, III) Multimodal Machine Learning, and IV) Results. Various inputs and their implementation are also included (RNA-sequencing, Clinical Records, Whole Genome Sequencing, Annotation databases and Biomarkers). 2. Methods The overall methodology is divided into three steps ( Figure 2 ), which include I) transcriptomic/gene expression analysis, II) pathogenic variant analysis, and III) Multimodal AI/ML analysis. Download figure Open in new tab Figure 2. Methodology. This figure presents the k-Nearest Neighbors (k-NN) imputation to address missing values present in our RNA-seq expression data. DESeq2 was utilized for normalization and gene differential expression on four clinical sub cohorts to reduce the effect of confounding variables. Next, minimum redundancy – maximum relevance (MRMR) was performed to identify biomarkers proficient in predicting CVDs. Simultaneously, significant single nucleotide variants (SNVs) were annotated, and their pathogenicity determined for downstream analysis. Utilizing the clinically integrated transcriptomics and genomics dataset (CIGT) of significant biomarkers and their variants, machine learning algorithms (Random Forest, Logistic Regression, and Xtreme Gradient Boosting) were applied to predict CVDs. 
Boxes highlighted in yellow refer to input data, blue refers to machine learning approaches, orange highlights clinical records, red refers to statistical analyses, while purple refers to bioinformatic analyses, and green highlights results. Transcriptomic/Gene Expression Analysis Previously, we performed RNA-seq on a 71-participant cohort of CVD patients and healthy controls [ 6 , 25 ]. Samples were collected from the individual’s peripheral blood mononuclear cells (PBMCs). All procedures performed in studies involving human participants were in accordance with the ethical standards of the institution and with the 1964 Helsinki Declaration and its later amendments or comparable ethical standards. All human samples were used in accordance with relevant guidelines and regulations, and all experimental protocols were approved by the Institutional Review Board. This cohort had 61 CVD patients: 40 male and 21 female individuals aged between 45 and 92, from diverse ethnic groups (56 non-Hispanic, four Hispanic, and one declined to answer) and self-described race (42 Whites, seven Blacks or African Americans, one Asian and 11 of unknown race). Ten control patients rounded out the cohort: five males and five females, (out of which three were self-described Hispanics and seven non-Hispanic; nine were White race and one unknown race) aged between 28 and 78 with no clinical manifestation of CVD. From our 71-participant cohort, we extracted transcriptomic expression for each individual. Counts and TPM values were retrieved from our RNA-seq. TPM values guided preprocessing. Transcripts with a median TPM below 0.5 across participants were removed from our dataset, and transcripts without significant expression (TPM > 1) in at least one patient were also removed. Additionally, non-ubiquitous transcripts, or transcripts expressed in less than 80% of the cohort’s participants’ RNA-seq, were excluded from downstream analysis. 
The filtered transcripts were configured into a CIGT-formatted dataset based on raw count values; this dataset served as the basis of our series of analyses. We applied k-nearest neighbors (k-NN) imputation paired with artificial missingness to predict missing count values within our CIGT-formatted dataset (e.g., NaN). Here, we replaced 10% of the known portion of our dataset with missing values, maintaining the true values for comparison. We simulated imputations using distinct ‘n_neighbors’ values, ranging from 1 to 20. Optimizing this parameter assists in reducing noise in high-dimensional datasets, fine-tuning the imputer’s sensitivity; essentially, this parameter tells the algorithm the number of most similar data points to consult when estimating a missing value. With each simulation, we calculated the root mean squared error (RMSE) by comparing the predicted values with the previously withheld portion of the dataset; RMSE was minimized in our choice of optimal ‘n_neighbors’ values. We filled in missing values using optimal ‘n_neighbors’ and reintegrated the artificially excluded values into our dataset. We conducted dataset normalization using DESeq2’s median of ratios on our continuous imputed dataset [ 30 , 31 ]. This normalization method, which utilizes RNA-seq counts, adjusts for cross-sample comparability better than TPM values [ 32 ]. We calculated the coefficient of variation and intraclass correlation for our normalized dataset. Because median of ratios is a dataset-dependent normalization algorithm, sub-setted matrices from our expression data must be renormalized from raw counts prior to further analysis. Next, we performed differential expression analysis using DESeq2. To minimize the effects of confounders, the cohort’s CVD patients and control participants were stratified into subcohorts based on their demographic features. We assigned groups using the individual’s sex, racial background, and age. 
Four groups were created: white males ages 45 - 64, white males ages 65+, white females ages 45 - 64, and white females 65 plus. Differential expression using DESeq2 was then performed independently on each subcohort, employing a negative binomial distribution-based model to locate DEGs [ 30 ]. DE on each subcohort yielded separate p-values and log fold changes (LFC) per feature, respectively indicating its probability of being differentially expressed and the directionality of expression. A positive LFC value indicates up regulation (i.e. over-expression), and negative indicates down regulation (i.e. under-expression). The results from our four subcohorts are merged: p-values were combined with Fisher’s method and LFCs were averaged. Only DEGs with an adjusted p-value less than 0.05 were considered to be significant and included in downstream analysis. To determine biologically relevant DEGs for use in ML single-disease predictions [ 33 , 34 ], and to select genes explanatory of a generalized CVD phenotype, we utilized minimum redundancy – maximum relevance (MRMR). In MRMR, an arbitrary ‘k’ parameter is chosen indicating the number of biomarkers returned. This is the best set of ‘k’ size for explaining the difference between patients and controls. Here, MRMR minimizes the effects of co-expressed DEGs, and other patterns of redundant information contained in expression datasets. This approach is preferable to other ML selectors with arbitrary cutoffs, such as recursive feature elimination models; in such models, highly correlated biomarkers might aggregate atop rankings, leaving classifiers with less useful or more redundant information to make predictions. Here, we performed MRMR exclusively on the DEGs in our training dataset, with ‘k’ set as 10. Using Gene Set Enrichment Analysis (GSEA), we examined Gene Ontology (GO) and Human Phenotype Ontology (HPO) enrichment in our 10 MRMR-selected biomarkers [ 35 , 36 , 37 ]. 
GO was utilized to investigate the biological processes, cellular components, and molecular functions of our biomarkers. HPO assisted in searching for disease implications. Pathogenic Variant Analysis WGS from our 71-participant cohort was investigated alongside RNA-seq. We processed WGS-derived single nucleotide polymorphisms (SNPs) from each patient into VCF format. SNPs with low quality scores (< 50) within the VCF files were discarded. Using the Ensembl Variant Effect Predictor (VEP), we annotated these genomes with ClinVar, Combined Annotation Dependent Depletion (CADD), and Genome Aggregation Database (gnomAD). Here, we exclusively examined the WGS datasets of CVD patients in our training cohort for these SNPs. Only SNPs associated with our MRMR-selected DEGs and their known regulatory elements (i.e., promoters and enhancers), sourced from GeneHancer [ 38 ], were included in downstream analyses. This methodology allowed us to focus on genomic regions we had previously been able to implicate with CVDs. By excluding SNPs outside these key areas, our study minimizes confounding data, thereby improving the likelihood of identifying significant genetic contributors to CVD. Next, we utilized CADD to measure the deleteriousness of SNPs in our regions of interest. CADD is a computational tool for understanding the pathogenicity of SNPs [ 39 ]. CADD utilizes machine learning to integrate multiple genomic annotations, predicting the perniciousness of genetic variants. This tool leverages data from diverse sources, including evolutionary conservation and functional annotations, to generate a comprehensive score that assesses variant impact. We used a CADD Phred score greater than 10, or the top 10% of harmful SNPs, as our threshold. gnomAD was utilized to discover rare variants within this highly deleterious set. 
gnomAD is a comprehensive public resource that aggregates exome and genome sequencing data to provide insights into genetic variants across diverse populations. It offers critical information on the frequency and potential impact of genetic mutations. gnomAD provided allele frequencies for SNPs; those with an allele frequency < 0.1% were included in our analyses [ 40 ]. We searched for the presence of SNPs from our training dataset in the rest of our 71-patient cohort. The presence/absence of these SNPs was detailed in a binary matrix in CIGT format. This matrix was merged with our RNA-seq counts matrix by a common ID identifying each patient. To examine if our selected SNPs showed any direct linear relation with transcriptomic expression, we performed a point-biserial correlation. Multimodal AI/ML Analysis Our selected DEGs and SNPs were integrated into an AI/ML-ready CIGT-formatted dataset and used with an 80/20 train-test split by three ML classifiers to predict CVD risk. Using Bayesian optimization, we found optimal hyperparameters for a random forest (RF), Xtreme Gradient Boosting (XGBoost), and logistic regression (LR) models. For RF, ‘max_depth’, ‘min_samples_split’, ‘min_samples_leaf’, ‘n_estimators’, ‘max_features’, ‘bootstrap’, ‘criterion’, ‘max_leaf_nodes’, and ‘max_samples’ were optimized. Our XGBoost classifier was optimized for ‘max_depth’, ‘min_child_weight’, ‘gamma’, ‘subsample’, ‘colsample_bytree’, ‘scale_pos_weight’, ‘n_estimators’, and ‘learning_rate’. The parameters ‘C’, ‘penalty’, ‘solver’, and ‘max_iter’ were included in the Bayesian optimization for LR. Optimizing these hyperparameters assists in maximizing the performance of each classifier on the dataset. Bayesian optimization is a specific sequential optimization technique enabling faster convergence in parameter over typical brute force algorithms such as Grid Search. 
RF and XGBoost, two tree-based models, were chosen as they have previously proven powerful in single-disease prediction [ 28 ]. LR, a linear model, was chosen for comparison. These classifiers performed patient single-disease prediction on our testing dataset. Metrics detailing the classifier’s performance were computed: accuracy, AUC, probabilities, sensitivity, specificity, F1, and Brier score. To investigate the importance and directionality of each feature in predicting the CVD phenotype for each ML model, SHapley Additive exPlanations (SHAP) were computed for each feature. SHAP scores offer insights into patient-specific CVD manifestations using a game theoretic approach [ 41 ]. We combined the SHAP profiles with prediction probabilities to investigate which biomarkers were the most important contributors to each patient’s CVD prediction. Additionally, we conducted an extensive literature review to examine which of these biomarkers have previously been implicated in manifestations of CVD. 3. Results We designed comparable, RNA-seq data driven patient-specific expression profiles efficient in DE and ML analyses [ 29 ]. We found 56,681 transcripts relevant to the DEGs in our cohort. Initial filtration revealed only 748 transcripts for follow up differential expression analysis. Missing expression values in individual patients’ profiles necessitated imputation before DEA. Using a robust, empirical approach to parameterizing an imputer, we corrected dataset absence. Artificial missingness facilitated RMSE calculations across various simulations of ‘n_neighbors’. K-NN Imputation was performed using our optimal ‘n_neighbors’ of 11, having the lowest RMSE across simulations. With a completed dataset, DESeq2 normalization was performed for improved cross-sample comparability. Utilizing a demographic-based segmentation approach, we performed DEA to investigate potential CVD-associated transcripts in four subcohorts, minimizing noise from confounding variables. 
Subcohort-specific results from DEA were then merged, detecting 28 DEGs, detailed in Table 1 , across our cohort. LFCs characterized each DEG’s direction of regulation in CVD patients. Here, we demonstrate 20 upregulated DEGs and 8 downregulated DEGs. View this table: View inline View popup Download powerpoint Table 1. Analysis and Associations of differentially expressed genes. This table includes differentially expressed genes, their p-values to determine significance, log fold change (LFC) to determine regulation (up/down), direct cardiovascular and non-cardiovascular disease (CVD) associations based on current and extensive literature review. Genes highlighted in green correspond to a direct CVD association, blue represents a direct CVD and non-CVD relationship, and yellow corresponds to a non-CVD association. We implemented MRMR feature selection to capture the phenotypic profile of CVD in 10 DEGs: ITGB2 , CD37 , RPL36AP37 , PSAP , ACTB , SELL , NCF2 , HBA1 , ICAM3 , and BBLN . This excludes transcriptomic features with non-informative and redundant information to ML classifiers (e.g., co-expression). As demonstrated, these 10 DEGs successfully explain the differences between CVD patients and healthy controls. Figures 3A and 3B detail each DEG’s LFC and adjusted p-value. Additionally, we examined GO and HPO enrichment across the transcript set to examine their implicated pathways and clinical relevance ( Figure 3C ). Using Fisher’s exact test, we conclude that downregulated DEGs are enriched in our MRMR-selected DEGs, with a p-value of 0.022. Previously, seven MRMR-selected transcripts were associated with CVDs. Loss of HBA1 function is linked with coronary artery disease (CAD) [ 42 ]. Hypomethylation of ITGB2 in PBMCs is linked to HF and CAD [ 43 , 44 ]. SNPs affecting SELL are associated with an increased risk of acute coronary syndromes (ACSs) [ 45 ]. 
Blood-based hypermethylation of ACTB is associated with the development of coronary heart disease (CHD) [ 46 ]. Conversely, hypomethylation increases the risk of stroke [ 47 ]. NCF2 has been used as a diagnostic biomarker for obstructive CAD in PBMCs [ 48 ]. Additionally, NCF2 has been associated with AF [ 49 ]. ICAM3 has been identified as a prognostic biomarker for acute ischemic stroke [ 50 ]. BBLN has been found DE in damaged hearts [ 51 , 52 ]. Additionally, we have reported associated CVD risk factors and diseases implicating shared pathways. Download figure Open in new tab Figure 3. Differentially expressed genes and their expression plots. This figure presents the results of gene expression analysis and that includes, A) Fold change in expression level based on differential expression (DE) analysis and minimum redundancy – maximum relevance (MRMR) feature selection; B) Significance levels of genes based on DE and MRMR; C) Gene annotations for cellular component, molecular function, biological processes, and phenotypic abnormalities; and D) RNA-seq expression plots for the ten most significant biomarkers. We integrated rare, deleterious WGS based SNPs into our patient-specific, ML-efficient profiling. We performed a rigorous search of DEGs and their regulators for SNPs with pathogenic characteristics. Analysis revealed 17 SNPs matching these criteria in our cohort. Figures 4A and 4B demonstrate the CADD Phred scores and gnomAD-sourced allele frequency of these SNPs. Additionally, in Table 2 , we report the location, transcripts, and consequences of the 17 SNPs. The distribution of consequences is shown in Figure 4C . The vast majority of our reported SNPs were not reported or scored with uncertain or conflicting pathogenicity in ClinVar. Only rs115891972 and rs751011909 were scored benign and likely benign, respectively. 
Next, using an uncontaminated training dataset, isolated from the testing dataset during feature selection and hyperparameter tuning, we trained three distinct ML classifiers. Our features consisted of 10 MRMR-selected DEGs and 17 SNPs. Our decision tree (DT) classifiers, RF and XGBoost, were able to perfectly differentiate CVD patients from control individuals. Both classifiers produced 100% accuracy and a 1.0 AUC-ROC score. Overall, XGBoost performed the best, considering the Brier score. Our LR model performed worse, scoring 93% accuracy and 1.0 AUC-ROC, but failed to detect the sole healthy individual in our testing dataset. Here, we conclude that DTs are more suitable for single-disease predictions. DTs provide interpretable ML models, capable of handling non-linear relationships and synthesizing various variable types, two strengths necessary for high-dimensional multi-omics datasets. Figures 5A and 5B display our classifiers’ predictions and AUC-ROC. The integrated, multi-omics patient-specific profiles containing SNPs and expression outperformed non-integrated RNA-seq datasets. Previously, we demonstrated 91% accuracy using a comparable dataset with RF and XGBoost classifiers trained on RNA-seq expression [ 28 ]. Download figure Open in new tab Figure 4. Variant feature selection. This figure presents the rare, deleterious variants affecting our CVD-associated biomarkers based on: A) Combined Annotation Dependent Depletion (CADD) Score; B) Allele frequency obtained from the Genome Aggregation Database (gnomAD); and C) annotations of pathogenic single nucleotide variants (SNVs). Download figure Open in new tab Figure 5. Predictive analysis. This figure presents the predictive confidence of our ML models, including: A) Predictive certainty of three ML algorithms (Random Forest, Logistic Regression and Xtreme Gradient Boosting) on the testing dataset; and B) Receiver operating characteristic (ROC) curve denoting the sensitivity and specificity of the classifiers. 
View this table: View inline View popup Table 2. Pathogenic single nucleotide variants (SNPs). This table includes the position of the SNP, its allele, transcripts, consequences (3’ UTR, missense, non-coding (NC) transcript exon, nonsense mediated mRNA decay (NMD) transcript, upstream gene, downstream gene, intron, NC transcript, 5’ UTR, splice region, synonymous), allele frequency, CADD Phred score and corresponding gene. Finally, we created single-patient profiles of CVD phenotypes using SHAP importances and prediction metrics from our best-performing XGBoost model. Examination of these profiles revealed that RPL36AP37 (mean absolute SHAP value = 0.764) and HBA1 (mean absolute SHAP value = 0.522) were more useful than the other MRMR-selected biomarkers in training the classifier, with BBLN (mean absolute SHAP value = 0.218) the next highest result. ICAM3 was not utilized by our XGBoost algorithm. Biomarkers contributing more significantly to predictions could indicate disease involvement, leading toward more efficient diagnoses and treatment of CVD. 4. Discussion This study explores the functional impact of AI-based multi-omics interactions in CVD. Our DEA disclosed twenty-eight DEGs ( Table 1 ), twelve of which were associated with a phenotypic variation of CVDs. Upon further investigation to uncover CVD and non-CVD associations, we found that 65% of the total diseases reported for all genes were related to CVDs. Two genes ( HBM and GUK1 ) had only CVD associations. HBM has previously been identified as one of the ten most differentially expressed genes in hypertrophic cardiomyopathy (HCM) patients [ 53 ]. Additionally, it has been linked to CVD risk factors such as pulmonary arterial hypertension [ 54 ] and alpha thalassemia [ 55 ]. Upregulation of GUK1 has been implicated in CHD and HCM [ 56 ]. Ten of the twenty-eight DEGs ( HBA1, GPX1, SELENBP1, LGALS3, ND1, ITGB2, ACTB, NCF2, SELL , and ICAM3 ) were linked to both CVDs and non-CVDs. 
HBA1 has been documented to be highly associated with CVDs such as ischemic heart disease [ 57 ] and CAD [ 58 ]. It has also been linked to other CVD risk factors such as hypertension [ 59 ] and alpha thalassemia [ 59 ] as well as other non-CVDs such as chronic kidney disease, sickle cell disease [ 60 ], and nonalcoholic fatty liver disease [ 61 ]. LGALS3 [ 62 ], ITGB2 [ 44 ], and ICAM3 [ 50 ] were all found to be correlated with ischemic stroke and CHD. Upregulation of GPX1 has been implicated in various complex disorders including but not limited to cardiomyopathy [ 63 ], acute myeloid leukemia (AML) [ 63 ], and endometrial cancer [ 64 ]. Additionally, upregulation of SELENBP1 has been associated with acute coronary syndrome [ 65 ]. It has also been implicated in other phenotypes of CVDs such as myocardial infarction and cardiac arrest [ 65 ] as well as non-CVDs including breast cancer and lung adenocarcinoma [ 66 ]. Downregulation of ND1 [ 67 ] and upregulation of ACTB [ 46 , 68 ] in the inflammatory pathways are known to be associated with CHD and cardiomyopathy. ND1 and ACTB are also linked to other chronic and heritable diseases such as mitochondrial encephalomyopathy [ 69 ] and Parkinson’s disease [ 70 ], respectively. Additionally, NCF2 and SELL are reported to be potential diagnostic biomarkers for CAD [ 71 , 72 ] as well as cancers such as hepatocellular carcinoma [ 72 ] and leukemia [ 73 ], respectively. While the direct correlation between other complex diseases and CVDs remains unknown, state-of-the-art literature supports the implication of these genes in the inflammatory and immunological pathways shared between these diseases [ 59 , 60 , 66 , 73 ]. Future genomic and translational studies are required to understand these relationships. Sixteen of the twenty-eight DEGs were found to be only associated with non-CVDs based on existing literature ( Table 1 ). 
Genes such as HBQ1 [ 74 ] , HBA2 [ 75 ] , CD37 [ 76 ], and LILRA2 [ 77 ] are all associated with different types of cancer and other immunological diseases that are documented to have a direct impact on CVD pathophysiology. While these genes are not directly linked to CVDs, further research is required to understand their effects on regulatory elements that might trigger CVD development. We could not find evidence associating genes such as BBLN , AHSP , and HBB with a CVD or other diseases. However, we reported their implications in CVD risk factors including, but not limited to, Tetralogy of Fallot [ 51 ] and beta thalassemia [ 78 , 79 ]. Other genes such as RPL36AP37 [ 80 ] , LYL1 [ 81 ] , HBD [ 82 ], FKBP8 [ 83 ], CDC34 [ 84 ], SLC25A39 [ 85 ], UQCR11 [ 86 ], RPS27 [ 87 ], and PSAP [ 88 ] are all implicated in cancerous and neurological diseases that are not directly associated with CVDs. A detailed list of documented CVD and non-CVD phenotypes associated with our DEGs is available in Table 1 . We validated our findings with peer-reviewed studies ( Table 3 ). Eleven of the twenty-eight genes were found to be upregulated in our DEA as well as existing literature. HBM [ 53 ], GUK1 [ 56 ], HBA1 [ 58 ], GPX1 [ 63 ], SELENBP1 [ 65 ], and LGALS3 [ 62 ] were all upregulated in different phenotypes associated with CVDs. Other genes such as HBQ1 [ 74 ], BBLN [ 51 ], LYL1 [ 81 ], CDC34 [ 84 ], and UQCR11 [ 86 ] were implicated in other diseases but their regulation levels also matched existing studies. Additionally, two genes, SELL [ 72 ] and CD37 [ 76 ] were observed to be downregulated in our DEA and in current studies. Ten of these genes ( HBA1, RPL36AP37, BBLN, ITGB2, ACTB, NCF2, SELL, ICAM3, CD37 , and PSAP ) were selected using MRMR and later utilized by our AI/ML model to predict CVDs. 
Upregulation of HBA1 has been extensively reported to exhibit a strong correlation with ischemic heart disease [ 57 ] while loss of HBA1 function is associated with CAD [ 55 ]. In our previous studies, we identified protein coding HBA1 to be upregulated in CVD patients and significantly expressed in HF patients [ 14 , 25 , 26 ]. Pseudogene RPL36AP37 assists in regulating DNA replication within eukaryotic cells and in producing ribosomal proteins [ 89 ]. Little to no information connecting RPL36AP37 to CVDs has been reported. However, non-CVD phenotypes associated with altered or lost RPL36AP37 function include primary angle closure glaucoma [ 89 ], Parkinson’s disease, and certain cancers [ 80 ]. Protein-coding BBLN , also referred to as bublin coiled-coil protein, serves as a vital regulator of intestinal intermediate filaments, crucial for normal intestinal function, while also playing a role in maintaining cellular organelle architecture and serving as molecular spacers [ 90 ]. Induced downregulation of BBLN in mice with congenital heart defects leads to further cardiovascular dysfunction and necroptosis via activation of the CAMK2D pathway [ 51 ]. ITGB2 , a protein coding gene, encodes a cell-associated signaling molecule particularly involved in leukocyte adhesion and migration of T-cells and neutrophils [ 91 ]. ITGB2 is profoundly expressed within AML patients [ 43 ], and hypomethylation of this gene in peripheral blood was linked to HF and CAD [ 44 ]. Blood-based hypermethylation of ACTB , another protein coding gene, was significantly associated with the development of CHD [ 46 ]. In contrast, hypomethylation of ACTB was found to increase the risk of stroke [ 92 ]. Protein coding NCF2 was found to be significantly upregulated in AF [ 93 ] and significantly expressed in CAD patients [ 48 ]. 
SELL encodes selectin, a protein essential for binding and rolling leukocytes on endothelial cells, and acts as a primary downstream target for DYSF , a protein that, if upregulated, contributes to the pathogenesis of atherosclerotic CVDs [ 94 ]. Upregulation of protein coding ICAM3 is identified as a prognostic biomarker for acute ischemic stroke [ 95 ]. CD37 , a protein coding gene, regulates immune response and prevents tumor formation, and upregulation of its mRNA expression levels is observed in AML patients [ 76 ]. PSAP , a protein coding gene, plays a significant role in the pathogenesis of atherosclerosis, a key risk factor for CVDs [ 96 ]. In particular, elevated PSAP expression in plaque macrophages was related to atherosclerosis-linked inflammation in humans [ 96 ]. Additionally, these seven genes are linked to other multi-factorial diseases. Further studies are needed to understand non-CVD implications on CVD prognosis and diagnosis. View this table: View inline View popup Table 3. Regulation of differentially expressed genes based on literature review. This table includes gene names, their regulation based on the log fold change (LFC), and their regulation based on literature review (up/down). Red-colored text in the regulation based on literature (up/down) refers to cardiovascular diseases while those in blue refer to non-cardiovascular diseases. The integration of multi-omics data, coupled with the multimodal advancements in AI/ML, has the potential to enhance diagnostic and predictive analyses of leading causes of mortality, modifiable risk factors, and other medical insights. State-of-the-art literature has supported the implementation of a genomic language model trained on millions of metagenomic scaffolds to uncover hidden functional and regulatory connections among genes. Additionally, this process also unveils intricate relationships between genes within a genomic region [ 97 ]. 
Another recent study utilized a deep-learning, integrative mass spectrometry framework on metabolomics, with a focus on lipid profiles, to detect region-specific lipid content and localize lipids to individual cells, depending on both cell subpopulations and the anatomical origins of the cells [ 98 ]. In epigenetics, recent literature introduced DeepMod2, a comprehensive deep-learning framework designed for methylation detection utilizing the ionic current signal obtained from Nanopore sequencing [ 99 ]. It incorporates both a bidirectional long short-term memory model and a transformer model to facilitate rapid and precise detection of DNA methylation from a variety of flow cell types using whole-genome or adaptive sequencing data [ 99 ]. Another study showcased a multi-omics analytic platform leveraging genomic, transcriptomic, proteomic, and lipid data to accurately predict adenocarcinoma patient survival [ 100 ]. This platform employs an ensemble of algorithms, including support vector machine, random forest, and neural network, to identify disease-associated biomarker panels for downstream predictive analyses [ 100 ]. Furthermore, researchers have combined omics data with demographic and clinical information to offer a comprehensive view of cancer prognosis [ 101 ]. They created and validated a deep learning framework capable of extracting insights from complex gene and miRNA expression data, enabling accurate prognosis predictions for breast and ovarian cancer patients [ 101 ]. All of these approaches offer potential advancements in understanding disease biology and could assist in developing more targeted treatments. Author contributions Z.A. proposed, led, and supervised this study. Z.A. participated in conceptualization, project administration, funding acquisition, methodology, investigation, resource allocation, data curation, RNA-seq and WGS data processing, quality checking, and downstream analysis. W.D. executed formal analysis, and R.N. 
tested and reproduced results. H.A., E.P., and N.S. participated in research, investigation, and validation of AI/ML results using state-of-the-art literature. S.Z. guided post bioinformatics and AI/ML analysis and evaluated results. B.L. supported overall study including multi-omics data generation. All authors have participated in writing - original draft, while Z.A. performed review & editing. All authors have approved it for publication. Biographical Note H.A., W.D., E.P., N.S., and R.N. are the Research Assistants/Students at the Ahmed lab, Rutgers IFH/RWJMS. S.Z. is the Assistant Professor at the Department of Biomedical and Health Informatics, UMKC School of Medicine. B.L. is the Interim Chief Executive Officer (CEO), UConn Health; Executive Vice President for Health Affairs; Dean, UConn School of Medicine; Director, Pat and Jim Calhoun Cardiology Center; and Ray Neag Distinguished Professor of Cardiovascular Biology and Medicine. B.L. is an internationally recognized cardiovascular physician-scientist and national leader in academic medicine. Z.A. is the Assistant Professor at the Department of Medicine / Division of Cardiovascular Diseases and Hypertension, Rutgers Robert Wood Johnson Medical School, and Rutgers Health. Z.A. is Core Faculty Member at the Rutgers Institute for Health, Health Care Policy and Aging Research, at Rutgers, The State University of New Jersey. Furthermore, Z.A. is the Adjunct Assistant Professor at the Department of Genetics and Genome Sciences, School of Medicine, UConn Health, CT. Declarations Ethical Approval and Consent to participate Informed consent was obtained from all subjects. All human samples were used in accordance with relevant guidelines and regulations, and all experimental protocols were approved by the Institutional Review Board. 
Consent for publication Not applicable Availability of data and material All the source code reproducing the experiments of this study is available on GitHub at the web link given in the Footnotes. Competing interests The Authors declare no Competing Financial or Non-Financial Interests. Funding No funding received. Acknowledgments We appreciate the great support of the Institute for Health, Health Care Policy and Aging Research (IFH), and Robert Wood Johnson Medical School (RWJMS), at Rutgers Health, and the Pat and Jim Calhoun Cardiology Center, and Department of Genetics and Genome Sciences, at the UConn School of Medicine, UConn Health. We thank members and collaborators of Ahmed Lab at Rutgers (IFH, RWJMS, RBHS) for their support, participation, and contribution to this study. We appreciate all colleagues and institutions who provided direct and indirect insight and expertise that greatly assisted the research and development of this project. We acknowledge Rutgers Office of Advanced Research Computing (OARC) for providing access to the Amarel cluster and associated research computing resources. 
Footnotes https://github.com/drzeeshanahmed/intelligenes_multi-omics_cvd_analysis List of Abbreviations (ACS) Acute coronary syndrome (AI) Artificial intelligence (AF) Atrial fibrillation (AML) Acute myeloid leukemia (AUC) Area under the curve (CVD) Cardiovascular disease (CAD) Coronary artery disease (CHD) Coronary heart disease (CADD) Combined Annotation Dependent Depletion (CIGT) Clinically integrated genomic and transcriptomic (DT) Decision tree (DEGs) Differentially expressed genes (GO) Gene Ontology (GSEA) Gene Set Enrichment Analysis (gnomAD) Genome Aggregation Database (HF) Heart failure (HPO) Human Phenotype Ontology (HCM) Hypertrophic cardiomyopathy (k-NN) K-nearest neighbors (LR) Logistic regression (LFC) Logarithmic fold change (ML) Machine learning (MRMR) Minimum redundancy - maximum relevance (NGS) Next-generation sequencing (NaN) Not a number (PBMC) Peripheral blood mononuclear cell (RMSE) Root mean squared error (RF) Random forest (SNPs) Single nucleotide polymorphisms (SHAP) SHapley Additive exPlanations (TPM) Transcripts per million (VEP) Variant Effect Predictor (WES) Whole exome sequencing (WGS) Whole genome sequencing (XGBoost) Xtreme Gradient Boosting References 1. ↵ Tsao , C. W. , Aday , A. W. , Almarzooq , Z. I. , Alonso , A. , Beaton , A. Z. , Bittencourt , M. S. , Boehme , A. K. , Buxton , A. E. , Carson , A. P. , Commodore-Mensah , Y. , Elkind , M. S. V. , Evenson , K. R. , Eze-Nliam , C. , Ferguson , J. F. , Generoso , G. , Ho , J. E. , Kalani , R. , Khan , S. S. , Kissela , B. M. , Knutson , K. L. , … Martin , S. S. ( 2022 ). Heart Disease and Stroke Statistics-2022 Update: A Report From the American Heart Association . Circulation , 145 ( 8 ), e153 – e639 . OpenUrl CrossRef PubMed 2. ↵ Krittanawong , C. , Johnson , K. W. , Choi , E. , Kaplin , S. , Venner , E. , Murugan , M. , Wang , Z. , Glicksberg , B. S. , Amos , C. I. , Schatz , M. C. , & Tang , W. H. W. ( 2022 ). Artificial Intelligence and Cardiovascular Genetics. 
Life (Basel , Switzerland ), 12 ( 2 ), 279 . OpenUrl 3. ↵ Wung , S. F. , Hickey , K. T. , Taylor , J. Y. , & Gallek , M. J. ( 2013 ). Cardiovascular genomics . Journal of nursing scholarship: an official publication of Sigma Theta Tau International Honor Society of Nursing , 45 ( 1 ), 60 – 68 . OpenUrl 4. ↵ Patel , K. K. , Venkatesan , C. , Abdelhalim , H. , Zeeshan , S. , Arima , Y. , Linna-Kuosmanen , S. , & Ahmed , Z. ( 2023 ). Genomic approaches to identify and investigate genes associated with atrial fibrillation and heart failure susceptibility . Human genomics , 17 ( 1 ), 47 . OpenUrl CrossRef 5. ↵ Ahmed Z. ( 2024 ). Deciphering expression and variants in cardiovascular disease genes among heart failure population for precision medicine . ESC heart failure , 11 ( 1 ), 606 – 609 . OpenUrl 6. ↵ Ahmed , Z. , Zeeshan , S. , Persaud , N. , Degroat , W. , Abdelhalim , H. , & Liang , B. T. ( 2023 ). Investigating genes associated with cardiovascular disease among heart failure patients for translational research and precision medicine . Clinical and translational discovery , 3 ( 3 ), e206 . OpenUrl 7. ↵ Zech , M. , & Winkelmann , J. ( 2024 ). Next-generation sequencing and bioinformatics in rare movement disorders . Nature reviews. Neurology , 20 ( 2 ), 114 – 126 . OpenUrl 8. ↵ Byron , S. A. , Van Keuren-Jensen , K. R. , Engelthaler , D. M. , Carpten , J. D. , & Craig , D. W. ( 2016 ). Translating RNA sequencing into clinical diagnostics: opportunities and challenges . Nature reviews. Genetics , 17 ( 5 ), 257 – 271 . OpenUrl CrossRef PubMed 9. ↵ Abbas , M. , & El-Manzalawy , Y. ( 2020 ). Machine learning based refined differential gene expression analysis of pediatric sepsis . BMC medical genomics , 13 ( 1 ), 122 . OpenUrl 10. ↵ Hamaguchi , Y. , Zeng , C. , & Hamada , M. ( 2021 ). Impact of human gene annotations on RNA-seq differential expression analysis . BMC genomics , 22 ( 1 ), 730 . OpenUrl CrossRef 11. ↵ Kaya , C. , Dorsaint , P. , Mercurio , S. 
, Campbell , A. M. , Eng , K. W. , Nikiforova , M. N. , Elemento , O. , Nikiforov , Y. E. , & Sboner , A. ( 2021 ). Limitations of Detecting Genetic Variants from the RNA Sequencing Data in Tissue and Fine-Needle Aspiration Samples . Thyroid : official journal of the American Thyroid Association , 31 ( 4 ), 589 – 595 . OpenUrl 12. ↵ Ellingford , J. M. , Ahn , J. W. , Bagnall , R. D. , Baralle , D. , Barton , S. , Campbell , C. , Downes , K. , Ellard , S. , Duff-Farrier , C. , FitzPatrick , D. R. , Greally , J. M. , Ingles , J. , Krishnan , N. , Lord , J. , Martin , H. C. , Newman , W. G. , O’Donnell-Luria , A. , Ramsden , S. C. , Rehm , H. L. , Richardson , E. , … Whiffin , N. ( 2022 ). Recommendations for clinical interpretation of variants found in non-coding regions of the genome . Genome medicine , 14 ( 1 ), 73 . OpenUrl 13. ↵ Gonorazky , H. D. , Naumenko , S. , Ramani , A. K. , Nelakuditi , V. , Mashouri , P. , Wang , P. , Kao , D. , Ohri , K. , Viththiyapaskaran , S. , Tarnopolsky , M. A. , Mathews , K. D. , Moore , S. A. , Osorio , A. N. , Villanova , D. , Kemaladewi , D. U. , Cohn , R. D. , Brudno , M. , & Dowling , J. J. ( 2019 ). Expanding the Boundaries of RNA Sequencing as a Diagnostic Tool for Rare Mendelian Disease . American journal of human genetics , 104 ( 3 ), 466 – 483 . OpenUrl CrossRef PubMed 14. ↵ Mhatre , I. , Abdelhalim , H. , Degroat , W. , Ashok , S. , Liang , B. T. , & Ahmed , Z. ( 2023 ). Functional mutation, splice, distribution, and divergence analysis of impactful genes associated with heart failure and other cardiovascular diseases . Scientific reports , 13 ( 1 ), 16769 . OpenUrl 15. ↵ Aung , N. , Vargas , J. D. , Yang , C. , Cabrera , C. P. , Warren , H. R. , Fung , K. , Tzanis , E. , Barnes , M. R. , Rotter , J. I. , Taylor , K. D. , Manichaikul , A. W. , Lima , J. A. C. , Bluemke , D. A. , Piechnik , S. K. , Neubauer , S. , Munroe , P. B. , & Petersen , S. E. ( 2019 ). 
Genome-Wide Analysis of Left Ventricular Image-Derived Phenotypes Identifies Fourteen Loci Associated With Cardiac Morphogenesis and Heart Failure Development . Circulation , 140 ( 16 ), 1318 – 1330 . OpenUrl CrossRef 16. ↵ Bomba , L. , Walter , K. , Guo , Q. , Surendran , P. , Kundu , K. , Nongmaithem , S. , Karim , M. A. , Stewart , I. D. , Langenberg , C. , Danesh , J. , Di Angelantonio , E. , Roberts , D. J. , Ouwehand , W. H ., INTERVAL study, Dunham, I. , Butterworth , A. S. , & Soranzo , N. ( 2022 ). Whole-exome sequencing identifies rare genetic variants associated with human plasma metabolites . American journal of human genetics , 109 ( 6 ), 1038 – 1054 . OpenUrl 17. ↵ Xiao , W. , Ren , L. , Chen , Z. , Fang , L. T. , Zhao , Y. , Lack , J. , Guan , M. , Zhu , B. , Jaeger , E. , Kerrigan , L. , Blomquist , T. M. , Hung , T. , Sultan , M. , Idler , K. , Lu , C. , Scherer , A. , Kusko , R. , Moos , M. , Xiao , C. , Sherry , S. T. , … Shi , L. ( 2021 ). Toward best practice in cancer mutation detection with whole-genome and whole-exome sequencing . Nature biotechnology , 39 ( 9 ), 1141 – 1150 . OpenUrl CrossRef 18. ↵ Vadapalli , S. , Abdelhalim , H. , Zeeshan , S. , & Ahmed , Z. ( 2022 ). Artificial intelligence and machine learning approaches using gene expression and variant data for personalized medicine . Briefings in bioinformatics , 23 ( 5 ), bbac191 . OpenUrl CrossRef 19. ↵ Weintraub W. S. ( 2019 ). Role of Big Data in Cardiovascular Research . Journal of the American Heart Association , 8 ( 14 ), e012791 . OpenUrl PubMed 20. ↵ Armoundas , A. A. , Narayan , S. M. , Arnett , D. K. , Spector-Bagdady , K. , Bennett , D. A. , Celi , L. A. , Friedman , P. A. , Gollob , M. H. , Hall , J. L. , Kwitek , A. E. , Lett , E. , Menon , B. K. , Sheehan , K. A. , Al-Zaiti , S. S. 
, & American Heart Association Institute for Precision Cardiovascular Medicine; Council on Cardiovascular and Stroke Nursing; Council on Lifelong Congenital Heart Disease and Heart Health in the Young; Council on Cardiovascular Radiology and Intervention; Council on Hypertension; Council on the Kidney in Cardiovascular Disease; and Stroke Council ( 2024 ). Use of Artificial Intelligence in Improving Outcomes in Heart Disease: A Scientific Statement From the American Heart Association. Circulation , doi: 10.1161/CIR.0000000000001201 . Advance online publication. OpenUrl CrossRef 21. ↵ Muse , E. D. , & Topol , E. J. ( 2024 ). Transforming the cardiometabolic disease landscape: Multimodal AI-powered approaches in prevention and management . Cell metabolism , S1550 – 4131 ( 24 )00048-2. Advance online publication. 22. ↵ Nagarajan , V. D. , Lee , S. L. , Robertus , J. L. , Nienaber , C. A. , Trayanova , N. A. , & Ernst , S. ( 2021 ). Artificial intelligence in the diagnosis and management of arrhythmias . European heart journal , 42 ( 38 ), 3904 – 3916 . OpenUrl 23. ↵ Kegerreis , B. , Catalina , M. D. , Bachali , P. , Geraci , N. S. , Labonte , A. C. , Zeng , C. , Stearrett , N. , Crandall , K. A. , Lipsky , P. E. , & Grammer , A. C. ( 2019 ). Machine learning approaches to predict lupus disease activity from gene expression data . Scientific reports , 9 ( 1 ), 9617 . OpenUrl 24. ↵ Maniruzzaman , M. , Jahanur Rahman , M. , Ahammed , B. , Abedin , M. M. , Suri , H. S. , Biswas , M. , El-Baz , A. , Bangeas , P. , Tsoulfas , G. , & Suri , J. S. ( 2019 ). Statistical characterization and classification of colon microarray gene expression data using multiple machine learning paradigms . Computer methods and programs in biomedicine , 176 , 173 – 193 . OpenUrl 25. ↵ Ahmed , Z. , Zeeshan , S. , & Liang , B. T. ( 2021 ). RNA-seq driven expression and enrichment analysis to investigate CVD genes with associated phenotypes among high-risk heart failure patients . 
Human genomics , 15 ( 1 ), 67 . OpenUrl CrossRef 26. ↵ Venkat , V. , Abdelhalim , H. , DeGroat , W. , Zeeshan , S. , & Ahmed , Z. ( 2023 ). Investigating genes associated with heart failure, atrial fibrillation, and other cardiovascular diseases, and predicting disease using machine learning techniques for translational research and precision medicine . Genomics , 115 ( 2 ), 110584 . OpenUrl CrossRef 27. ↵ DeGroat , W. , Venkat , V. , Pierre-Louis , W. , Abdelhalim , H. , & Ahmed , Z. ( 2023 ). Hygieia: AI/ML pipeline integrating healthcare and genomics data to investigate genes associated with targeted disorders and predict disease . Software Impacts , 16 , 100493 . OpenUrl 28. ↵ DeGroat , W. , Abdelhalim , H. , Patel , K. , Mendhe , D. , Zeeshan , S. , & Ahmed , Z. ( 2024 ). Discovering biomarkers associated and predicting cardiovascular disease with high accuracy using a novel nexus of machine learning techniques for precision medicine . Scientific reports , 14 ( 1 ), 1 . OpenUrl 29. ↵ DeGroat , W. , Mendhe , D. , Bhusari , A. , Abdelhalim , H. , Zeeshan , S. , & Ahmed , Z. ( 2023 ). IntelliGenes : a novel machine learning pipeline for biomarker discovery and predictive analysis using multi-genomic profiles. Bioinformatics (Oxford , England ), 39 ( 12 ), btad755 . OpenUrl 30. ↵ Love , M. I. , Huber , W. , & Anders , S. ( 2014 ). Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2 . Genome biology , 15 ( 12 ), 550 . OpenUrl CrossRef PubMed 31. ↵ Anders , S. , & Huber , W. ( 2010 ). Differential expression analysis for sequence count data . Genome biology , 11 ( 10 ), R106 . OpenUrl CrossRef PubMed 32. ↵ Zhao , Y. , Li , M. C. , Konaté , M. M. , Chen , L. , Das , B. , Karlovich , C. , Williams , P. M. , Evrard , Y. A. , Doroshow , J. H. , & McShane , L. M. ( 2021 ). TPM, FPKM, or Normalized Counts? A Comparative Study of Quantification Measures for the Analysis of RNA-seq Data from the NCI Patient-Derived Models Repository . 
Journal of translational medicine , 19 ( 1 ), 269 . OpenUrl 33. ↵ Radovic , M. , Ghalwash , M. , Filipovic , N. , & Obradovic , Z. ( 2017 ). Minimum redundancy maximum relevance feature selection approach for temporal gene expression data . BMC bioinformatics , 18 ( 1 ), 9 . OpenUrl CrossRef 34. ↵ Ding , C. , & Peng , H. ( 2005 ). Minimum redundancy feature selection from microarray gene expression data . Journal of bioinformatics and computational biology , 3 ( 2 ), 185 – 205 . OpenUrl CrossRef PubMed 35. ↵ Subramanian , A. et al. Gene set enrichment analysis: a knowledge-based approach for interpreting genome-wide expression profiles . Proc Natl Acad Sci U S A 102 , 15545 – 15550 ( 2005 ). OpenUrl Abstract / FREE Full Text 36. ↵ The Gene Ontology (GO) database and informatics resource. Nucleic Acids Res 32 , D258 – D261 ( 2004 ). OpenUrl CrossRef PubMed Web of Science 37. ↵ Gargano , M. A. et al. The Human Phenotype Ontology in 2024: phenotypes around the world . Nucleic Acids Res 52 , D1333 – D1346 ( 2024 ). OpenUrl CrossRef 38. ↵ Fishilevich , S. , Nudel , R. , Rappaport , N. , Hadar , R. , Plaschkes , I. , Iny Stein , T. , Rosen , N. , Kohn , A. , Twik , M. , Safran , M. , Lancet , D. , & Cohen , D. ( 2017 ). GeneHancer: genome-wide integration of enhancers and target genes in GeneCards . Database : the journal of biological databases and curation , 2017 , bax028 . OpenUrl 39. ↵ Schubach , M. , Maass , T. , Nazaretyan , L. , Röner , S. , & Kircher , M. ( 2024 ). CADD v1.7: using protein language models, regulatory CNNs and other nucleotide-level scores to improve genome-wide variant predictions . Nucleic acids research , 52 ( D1 ), D1143 – D1154 . OpenUrl CrossRef 40. ↵ Chen , S. , Francioli , L. C. , Goodrich , J. K. , Collins , R. L. , Kanai , M. , Wang , Q. , Alföldi , J. , Watts , N. A. , Vittal , C. , Gauthier , L. D. , Poterba , T. , Wilson , M. W. , Tarasova , Y. , Phu , W. , Grant , R. , Yohannes , M. T. , Koenig , Z. , Farjoun , Y. , Banks , E. 
, Donnelly , S. , … Karczewski , K. J. ( 2024 ). A genomic mutational constraint map using variation in 76,156 human genomes . Nature , 625 ( 7993 ), 92 – 100 . OpenUrl 41. ↵ Lundberg , S. M. , & Lee , S. I. ( 2017 ). A unified approach to interpreting model predictions . Advances in neural information processing systems , 30 . 42. ↵ Chonchol , M. , & Nielson , C. ( 2008 ). Hemoglobin levels and coronary artery disease . American heart journal , 155 ( 3 ), 494 – 498 . OpenUrl PubMed Web of Science 43. ↵ Wei , J. , Huang , X. J. , Huang , Y. , Xiong , M. Y. , Yao , X. Y. , Huang , Z. N. , Li , S. N. , Zhou , W. J. , Fang , D. L. , Deng , D. H. , & Cheng , P. ( 2021 ). Key immune-related gene ITGB2 as a prognostic signature for acute myeloid leukemia . Annals of translational medicine , 9 ( 17 ), 1386 . OpenUrl 44. ↵ Zhu , L. , Zhu , C. , Jin , J. , Wang , J. , Zhao , X. , & Yang , R. ( 2024 ). Identification of an association between coronary heart disease and ITGB2 methylation in peripheral blood by a case-control study . Clinica chimica acta; international journal of clinical chemistry , 552 , 117627 . OpenUrl 45. ↵ Malinowski , D. , Zawadzka , M. , Safranow , K. , Droździk , M. , & Pawlik , A. ( 2022 ). SELL and GUCY1A1 Gene Polymorphisms in Patients with Unstable Angina . Biomedicines , 10 ( 10 ), 2494 . OpenUrl 46. ↵ Jin , J. , Zhu , C. , Wang , J. , Zhao , X. , & Yang , R. ( 2022 ). The association between ACTB methylation in peripheral blood and coronary heart disease in a case-control study . Frontiers in cardiovascular medicine , 9 , 972566 . OpenUrl 47. ↵ Liu , C. , Yin , Q. , Li , M. , Fan , Y. , Shen , C. , & Yang , R. ( 2021 ). ACTB Methylation in Blood as a Potential Marker for the Pre-clinical Detection of Stroke: A Prospective Nested Case-Control Study . Frontiers in neuroscience , 15 , 644943 . OpenUrl 48. ↵ Mo , X. G. , Liu , W. , Yang , Y. , Imani , S. , Lu , S. , Dan , G. , Nie , X. , Yan , J. , Zhan , R. , Li , X. , Deng , Y. , Chen , B. 
, & Cai , Y. ( 2019 ). NCF2, MYO1F, S1PR4, and FCN1 as potential noninvasive diagnostic biomarkers in patients with obstructive coronary artery: A weighted gene co-expression network analysis . Journal of cellular biochemistry , 120 ( 10 ), 18219 – 18235 . OpenUrl CrossRef 49. ↵ Chu , Y. , Yu , F. , Wu , Y. , Yang , J. , Shi , J. , Ye , T. , Han , D. , & Wang , X. ( 2022 ). Identification of genes and key pathways underlying the pathophysiological association between nonalcoholic fatty liver disease and atrial fibrillation . BMC medical genomics , 15 ( 1 ), 150 . OpenUrl 50. ↵ Zicheng , H. , Xiao , Y. , Rongzhong , H. , Yongyong , L. , Haitao , R. , & Tingting , S. ( 2021 ). Association of Circulating ICAM3 Concentrations with Severity and Short-term Outcomes of Acute Ischemic Stroke . Neurotoxicity research , 39 ( 4 ), 1293 – 1299 . OpenUrl 51. ↵ Abd Alla , J. , Langer , A. , Wolf , S. , Fu , X. , Rageh , M. A. , & Quitterer , U. ( 2023 ). BBLN triggers CAMK2D pathology in mice under cardiac pressure overload and potentially in unrepaired hearts with tetralogy of Fallot . Nature Cardiovascular Research , 2 ( 11 ), 1044 – 1059 . OpenUrl 52. ↵ Abd Alla , J. , & Quitterer , U. ( 2023 ). Elevated BBLN levels as a cause of heart defects in tetralogy of Fallot . Nature Cardiovascular Research , 2 ( 11 ), 970 – 971 . OpenUrl 53. ↵ Cao , J. , & Yuan , L. ( 2022 ). Identification of key genes for hypertrophic cardiomyopathy using integrated network analysis of differential lncRNA and gene expression . Frontiers in cardiovascular medicine , 9 , 946229 . OpenUrl 54. ↵ Tang , S. , Liu , Y. , & Liu , B. ( 2022 ). Integrated bioinformatics analysis reveals marker genes and immune infiltration for pulmonary arterial hypertension . Scientific reports , 12 ( 1 ), 10154 . OpenUrl 55. ↵ Ren , Z. , Sun , G. , Zhang , Q. , Zou , S. , Chen , J. , Zhao , W. , Hou , G. , Zhong , Z. , Li , J. , Ye , Y. , Xu , X. , & Lin , L. ( 2023 ). 
LC-MS/MS-Based Absolute Quantitation of Hemoglobin Subunits from Dried Blood Spots Reveals Novel Biomarkers for α-Thalassemia Silent Carriers . Analytical chemistry , 95 ( 24 ), 9244 – 9251 . OpenUrl 56. ↵ Joehanes , R. , Ying , S. , Huan , T. , Johnson , A. D. , Raghavachari , N. , Wang , R. , Liu , P. , Woodhouse , K. A. , Sen , S. K. , Tanriverdi , K. , Courchesne , P. , Freedman , J. E. , O’Donnell , C. J. , Levy , D. , & Munson , P. J. ( 2013 ). Gene expression signatures of coronary heart disease . Arteriosclerosis, thrombosis, and vascular biology , 33 ( 6 ), 1418 – 1426 . OpenUrl Abstract / FREE Full Text 57. ↵ Prasad K. ( 2018 ). Does HbA1cc Play a Role in the Development of Cardiovascular Diseases? . Current pharmaceutical design , 24 ( 24 ), 2876 – 2882 . OpenUrl 58. ↵ Ghaffari , S. , Niafar , F. , Separham , A. , Niafar , M. , Pourafkari , L. , & Nader , N. D. ( 2015 ). Association between HbA1c levels with severity of coronary artery disease and short-term outcomes of acute ST-elevation myocardial infarction in nondiabetic patients . Therapeutic advances in cardiovascular disease , 9 ( 5 ), 305 – 313 . OpenUrl CrossRef PubMed 59. ↵ Singer , D. E. , Nathan , D. M. , Anderson , K. M. , Wilson , P. W. , & Evans , J. C. ( 1992 ). Association of HbA1c with prevalent cardiovascular disease in the original cohort of the Framingham Heart Study . Diabetes , 41 ( 2 ), 202 – 208 . OpenUrl Abstract / FREE Full Text 60. ↵ Ataga , K. I. , Saraf , S. L. , & Derebail , V. K. ( 2022 ). The nephropathy of sickle cell trait and sickle cell disease . Nature reviews. Nephrology , 18 ( 6 ), 361 – 377 . OpenUrl 61. ↵ Liu , W. , Baker , S. S. , Baker , R. D. , Nowak , N. J. , & Zhu , L. ( 2011 ). Upregulation of hemoglobin expression by oxidative stress in hepatocytes and its implication in nonalcoholic steatohepatitis . PloS one , 6 ( 9 ), e24363 . OpenUrl CrossRef PubMed 62. ↵ Aguilar , D. , Sun , C. , Hoogeveen , R. C. , Nambi , V. , Selvin , E. , Matsushita , K. 
, Saeed , A. , McEvoy , J. W. , Shah , A. M. , Solomon , S. D. , Boerwinkle , E. , & Ballantyne , C. M. ( 2020 ). Levels and Change in Galectin-3 and Association With Cardiovascular Events: The ARIC Study . Journal of the American Heart Association , 9 ( 13 ), e015405 . OpenUrl 63. ↵ Lubos , E. , Loscalzo , J. , & Handy , D. E. ( 2011 ). Glutathione peroxidase-1 in health and disease: from molecular mechanisms to therapeutic opportunities . Antioxidants & redox signaling , 15 ( 7 ), 1957 – 1997 . OpenUrl CrossRef PubMed Web of Science 64. ↵ Zhao , Y. , Wang , H. , Zhou , J. , & Shao , Q. ( 2022 ). Glutathione Peroxidase GPX1 and Its Dichotomous Roles in Cancer . Cancers , 14 ( 10 ), 2560 . OpenUrl 65. ↵ Kühn-Heid , E. C. D. , Kühn , E. C. , Ney , J. , Wendt , S. , Seelig , J. , Schwiebert , C. , Minich , W. B. , Stoppe , C. , & Schomburg , L. ( 2019 ). Selenium-Binding Protein 1 Indicates Myocardial Stress and Risk for Adverse Outcome in Cardiac Surgery . Nutrients , 11 ( 9 ), 2005 . OpenUrl 66. ↵ Chen , G. , Wang , H. , Miller , C. T. , Thomas , D. G. , Gharib , T. G. , Misek , D. E. , Giordano , T. J. , Orringer , M. B. , Hanash , S. M. , & Beer , D. G. ( 2004 ). Reduced selenium-binding protein 1 expression is associated with poor outcome in lung adenocarcinomas . The Journal of pathology , 202 ( 3 ), 321 – 329 . OpenUrl CrossRef PubMed Web of Science 67. ↵ Zhang , Z. , Liu , M. , He , J. , Zhang , X. , Chen , Y. , & Li , H. ( 2019 ). Maternally inherited coronary heart disease is associated with a novel mitochondrial tRNA mutation . BMC cardiovascular disorders , 19 ( 1 ), 293 . OpenUrl 68. ↵ Feng , W. , & Han , S. ( 2022 ). lncRNA ADAMTS9-AS1/circFN1 Competitively Binds to miR-206 to Elevate the Expression of ACTB, Thus Inducing Hypertrophic Cardiomyopathy . Oxidative medicine and cellular longevity , 2022 , 1450610 . OpenUrl 69. ↵ Lin , X. , Zhou , Y. , & Xue , L. ( 2024 ). Mitochondrial complex I subunit MT-ND1 mutations affect disease progression . 
Heliyon , 10 ( 7 ), e28808 . OpenUrl 70. ↵ Straccia , G. , Reale , C. , Castellani , M. , Colangelo , I. , Orunesu , E. , Meoni , S. , Moro , E. , Krack , P. , Prokisch , H. , Zech , M. , Romito , L. M. , & Garavaglia , B. ( 2022 ). ACTB gene mutation in combined Dystonia-Deafness syndrome with parkinsonism: Expanding the phenotype and highlighting the long-term GPi DBS outcome . Parkinsonism & related disorders , 104 , 3 – 6 . OpenUrl 71. ↵ Mo , X. G. , Liu , W. , Yang , Y. , Imani , S. , Lu , S. , Dan , G. , Nie , X. , Yan , J. , Zhan , R. , Li , X. , Deng , Y. , Chen , B. , & Cai , Y. ( 2019 ). NCF2, MYO1F, S1PR4, and FCN1 as potential noninvasive diagnostic biomarkers in patients with obstructive coronary artery: A weighted gene co-expression network analysis . Journal of cellular biochemistry , 120 ( 10 ), 18219 – 18235 . OpenUrl CrossRef 72. ↵ Huang , N. , Zhang , J. , Kuang , S. , Li , Z. , Zhao , H. , Wu , J. , Liu , M. , & Wang , L. ( 2023 ). Role of NCF2 as a potential prognostic factor and immune infiltration indicator in hepatocellular carcinoma . Cancer medicine , 12 ( 7 ), 8991 – 9004 . OpenUrl 73. ↵ Tatewaki , M. , Yamaguchi , K. , Matsuoka , M. , Ishii , T. , Miyasaka , M. , Mori , S. , Takatsuki , K. , & Watanabe , T. ( 1995 ). Constitutive overexpression of the L-selectin gene in fresh leukemic cells of adult T-cell leukemia that can be transactivated by human T-cell lymphotropic virus type 1 Tax . Blood , 86 ( 8 ), 3109 – 3117 . OpenUrl Abstract / FREE Full Text 74. ↵ Kim , K. , Choi , E. Y. , Ahn , H. M. , Kim , D. G. , & Kim , Y. J. ( 2023 ). Hemoglobin Subunit Theta 1 Promotes Proliferation by Reducing Reactive Oxygen Species in Lung Adenocarcinoma . Cancers , 15 ( 23 ), 5504 . OpenUrl 75. ↵ Alperin , J. B. , Dow , P. A. , & Petteway , M. B. ( 1977 ). Hemoglobin A2 levels in health and various hematologic disorders . American journal of clinical pathology , 67 ( 3 ), 219 – 226 . OpenUrl CrossRef PubMed 76. ↵ Zhang , Q. , Han , Q. , Zi , J. 
, Song , C. , & Ge , Z. ( 2020 ). CD37 high expression as a potential biomarker and association with poor outcome in acute myeloid leukemia . Bioscience reports , 40 ( 5 ), BSR20200008 . OpenUrl 77. ↵ Mamegano , K. , Kuroki , K. , Miyashita , R. , Kusaoi , M. , Kobayashi , S. , Matsuta , K. , Maenaka , K. , Colonna , M. , Ozaki , S. , Hashimoto , H. , Takasaki , Y. , Tokunaga , K. , & Tsuchiya , N. ( 2008 ). Association of LILRA2 (ILT1, LIR7) splice site polymorphism with systemic lupus erythematosus and microscopic polyangiitis . Genes and immunity , 9 ( 3 ), 214 – 223 . OpenUrl CrossRef PubMed Web of Science 78. ↵ Che Yaacob , N.S. , Islam , M.A. , Alsaleh , H. , et al. ( 2020 ). Alpha-hemoglobin-stabilizing protein (AHSP): a modulatory factor in β-thalassemia . International Journal of Hematology 111 , 352 – 359 . OpenUrl 79. ↵ Jaing , T. H. , Chang , T. Y. , Chen , S. H. , Lin , C. W. , Wen , Y. C. , & Chiu , C. C. ( 2021 ). Molecular genetics of β-thalassemia: A narrative review . Medicine , 100 ( 45 ), e27522 . OpenUrl 80. ↵ Verma , G. , Rebholz-Schuhmann , D. , & Madden , M. G. ( 2024 ). Enabling personalised disease diagnosis by combining a patient’s time-specific gene expression profile with a biomedical knowledge base . BMC bioinformatics , 25 ( 1 ), 62 . OpenUrl 81. ↵ Kim , S. I. , Lee , J. W. , Lee , N. , Lee , M. , Kim , H. S. , Chung , H. H. , Kim , J. W. , Park , N. H. , Song , Y. S. , & Seo , J. S. ( 2018 ). LYL1 gene amplification predicts poor survival of patients with uterine corpus endometrial carcinoma: analysis of the Cancer genome atlas data . BMC cancer , 18 ( 1 ), 494 . OpenUrl 82. ↵ Kamino , Y. , Kurashige , Y. , Uehara , O. , Sato , J. , Nishimura , M. , Yoshida , K. , Arakawa , T. , Nagayasu , H. , Saitoh , M. , & Abiko , Y. ( 2014 ). HBD-2 is downregulated in oral carcinoma cells by DNA hypermethylation, and increased expression of hBD-2 by DNA demethylation and gene transfection inhibits cell proliferation and invasion . 
Oncology reports , 32 ( 2 ), 462 – 468 . OpenUrl 83. ↵ Tian , T. , Cao , X. , Kim , S. E. , Lin , Y. L. , Steele , J. W. , Cabrera , R. M. , Karki , M. , Yang , W. , Marini , N. J. , Hoffman , E. N. , Han , X. , Hu , C. , Wang , L. , Wlodarczyk , B. J. , Shaw , G. M. , Ren , A. , Finnell , R. H. , & Lei , Y. ( 2020 ). FKBP8 variants are risk factors for spina bifida . Human molecular genetics , 29 ( 18 ), 3132 – 3144 . OpenUrl 84. ↵ Tanaka , K. , Kondoh , N. , Shuda , M. , Matsubara , O. , Imazeki , N. , Ryo , A. , Wakatsuki , T. , Hada , A. , Goseki , N. , Igari , T. , Hatsuse , K. , Aihara , T. , Horiuchi , S. , Yamamoto , N. , & Yamamoto , M. ( 2001 ). Enhanced expression of mRNAs of antisecretory factor-1, gp96, DAD1 and CDC34 in human hepatocellular carcinomas . Biochimica et biophysica acta , 1536 ( 1 ), 1 – 12 . OpenUrl PubMed 85. ↵ Shi , X. , Reinstadler , B. , Shah , H. , To , T. L. , Byrne , K. , Summer , L. , Calvo , S. E. , Goldberger , O. , Doench , J. G. , Mootha , V. K. , & Shen , H. ( 2022 ). Combinatorial GxGxE CRISPR screen identifies SLC25A39 in mitochondrial glutathione transport linking iron homeostasis to OXPHOS . Nature communications , 13 ( 1 ), 2483 . OpenUrl 86. ↵ Wang , X. , Huang , K. , Yang , F. , Chen , D. , Cai , S. , & Huang , L. ( 2021 ). Association between structural brain features and gene expression by weighted gene co-expression network analysis in conversion from MCI to AD . Behavioural brain research , 410 , 113330 . OpenUrl 87. ↵ Wan , J. , Lv , J. , Wang , C. , & Zhang , L. ( 2022 ). RPS27 selectively regulates the expression and alternative splicing of inflammatory and immune response genes in thyroid cancer cells . Advances in clinical and experimental medicine : official organ Wroclaw Medical University , 31 ( 8 ), 889 – 901 . OpenUrl 88. ↵ Quan , W. , Li , J. , Jin , X. , Liu , L. , Zhang , Q. , Qin , Y. , Pei , X. , & Chen , J. ( 2021 ). 
Identification of Potential Core Genes in Parkinson’s Disease Using Bioinformatics Analysis . Parkinson’s disease , 2021 , 1690341 . OpenUrl 89. ↵ Wright , C. , Tawfik , M. A. , Waisbourd , M. , & Katz , L. J. ( 2016 ). Primary angle-closure glaucoma: an update . Acta ophthalmologica , 94 ( 3 ), 217 – 225 . OpenUrl CrossRef PubMed 90. ↵ Truebestein , L. , & Leonard , T. A. ( 2016 ). Coiled-coils: The long and short of it . BioEssays : news and reviews in molecular, cellular and developmental biology , 38 ( 9 ), 903 – 916 . OpenUrl 91. ↵ Ostermann , G. , Weber , K. S. , Zernecke , A. , Schröder , A. , & Weber , C. ( 2002 ). JAM-1 is a ligand of the beta(2) integrin LFA-1 involved in transendothelial migration of leukocytes . Nature immunology , 3 ( 2 ), 151 – 158 . OpenUrl CrossRef PubMed Web of Science 92. ↵ Liu , C. , Yin , Q. , Li , M. , Fan , Y. , Shen , C. , & Yang , R. ( 2021 ). ACTB methylation in blood as a potential marker for the pre-clinical detection of stroke: a prospective nested case-control study . Frontiers in Neuroscience , 15 , 644943 . 93. ↵ Chu , Y. , Yu , F. , Wu , Y. et al. ( 2022 ). Identification of genes and key pathways underlying the pathophysiological association between nonalcoholic fatty liver disease and atrial fibrillation . BMC Med Genomics 15 , 150 . OpenUrl 94. ↵ Zhang , X. , He , D. , Xiang , Y. , Wang , C. , Liang , B. , Li , B. , Qi , D. , Deng , Q. , Yu , H. , Lu , Z. , & Zheng , F. ( 2022 ). DYSF promotes monocyte activation in atherosclerotic cardiovascular disease as a DNA methylation-driven gene . Translational Research , 247 , 19 – 38 . OpenUrl 95. ↵ Zicheng , H. , Xiao , Y. , Rongzhong , H. et al. ( 2021 ). Association of circulating ICAM3 concentrations with severity and short-term outcomes of acute ischemic stroke . Neurotox Res , 39 , 1293 – 1299 . OpenUrl 96. ↵ van Leent , M. M. T. , Beldman , T. J. , Toner , Y. C. , Lameijer , M. A. , Rother , N. , Bekkering , S. , Teunissen , A. J. P. , Zhou , X. , van der Meel , R. 
, Malkus , J. , Nauta , S. A. , Klein , E. D. , Fay , F. , Sanchez-Gaytan , B. L. , Pérez-Medina , C. , Kluza , E. , Ye , Y. X. , Wojtkiewicz , G. , Fisher , E. A. , Swirski , F. K. , … Duivenvoorden , R. ( 2021 ). Prosaposin mediates inflammation in atherosclerosis . Science translational medicine , 13 ( 584 ), eabe1433 . OpenUrl Abstract / FREE Full Text 97. ↵ Hwang , Y. , Cornman , A. L. , Kellogg , E. H. , Ovchinnikov , S. , & Girguis , P. R. ( 2024 ). Genomic language model predicts protein co-regulation and function . Nature communications , 15 ( 1 ), 2880 . OpenUrl 98. ↵ Xie , Y. R. , Castro , D. C. , Rubakhin , S. S. , Trinklein , T. J. , Sweedler , J. V. , & Lam , F. ( 2024 ). Multiscale biochemical mapping of the brain through deep-learning-enhanced high-throughput mass spectrometry . Nature methods , 21 ( 3 ), 521 – 530 . OpenUrl 99. ↵ Ahsan , M. U. , Gouru , A. , Chan , J. , Zhou , W. , & Wang , K. ( 2024 ). A signal processing and deep learning framework for methylation detection using Oxford Nanopore sequencing . Nature communications , 15 ( 1 ), 1448 . OpenUrl 100. ↵ Osipov , A. , Nikolic , O. , Gertych , A. , Parker , S. , Hendifar , A. , Singh , P. , Filippova , D. , Dagliyan , G. , Ferrone , C. R. , Zheng , L. , Moore , J. H. , Tourtellotte , W. , Van Eyk , J. E. , & Theodorescu , D. ( 2024 ). The Molecular Twin artificial-intelligence platform integrates multi-omic data to predict outcomes for pancreatic adenocarcinoma patients . Nature cancer , 5 ( 2 ), 299 – 314 . OpenUrl 101. ↵ Jiang , L. , Xu , C. , Bai , Y. , Liu , A. , Gong , Y. , Wang , Y. P. , & Deng , H. W. ( 2024 ). Autosurv: interpretable deep learning framework […] View the discussion thread. Back to top Previous Next Posted August 09, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. 
Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Multimodal AI/ML for discovering novel biomarkers and predicting disease using multi-omics profiles of patients with cardiovascular diseases Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Multimodal AI/ML for discovering novel biomarkers and predicting disease using multi-omics profiles of patients with cardiovascular diseases William DeGroat , Habiba Abdelhalim , Elizabeth Peker , Neev Sheth , Rishabh Narayanan , Saman Zeeshan , Bruce T. Liang , Zeeshan Ahmed bioRxiv 2024.08.07.607041; doi: https://doi.org/10.1101/2024.08.07.607041 Share This Article: Copy Citation Tools Multimodal AI/ML for discovering novel biomarkers and predicting disease using multi-omics profiles of patients with cardiovascular diseases William DeGroat , Habiba Abdelhalim , Elizabeth Peker , Neev Sheth , Rishabh Narayanan , Saman Zeeshan , Bruce T. 
Liang , Zeeshan Ahmed bioRxiv 2024.08.07.607041; doi: https://doi.org/10.1101/2024.08.07.607041 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genomics Subject Areas All Articles Animal Behavior and Cognition (7644) Biochemistry (17726) Bioengineering (13916) Bioinformatics (42033) Biophysics (21486) Cancer Biology (18635) Cell Biology (25549) Clinical Trials (138) Developmental Biology (13397) Ecology (19940) Epidemiology (2067) Evolutionary Biology (24361) Genetics (15620) Genomics (22541) Immunology (17763) Microbiology (40468) Molecular Biology (17207) Neuroscience (88739) Paleontology (667) Pathology (2842) Pharmacology and Toxicology (4834) Physiology (7659) Plant Biology (15175) Scientific Communication and Education (2047) Synthetic Biology (4304) Systems Biology (9834) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper — AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citing papers typically take a year or two to appear, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00