SimMapNet: A Bayesian Framework for Gene Regulatory Network Inference Using Gene Ontology Similarities as External Hint

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 47,244 characters · extracted from preprint-html · click to expand
SimMapNet: A Bayesian Framework for Gene Regulatory Network Inference Using Gene Ontology Similarities as External Hint | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results SimMapNet: A Bayesian Framework for Gene Regulatory Network Inference Using Gene Ontology Similarities as External Hint View ORCID Profile Maryam Shahdoust , View ORCID Profile Rosa Aghdam , View ORCID Profile Mehdi Sadeghi doi: https://doi.org/10.1101/2025.04.09.647936 Maryam Shahdoust 1 School of Biological Sciences, Institute For Research In Fundamental Sciences(IPM) , 19395-5746,Tehran, Iran Find this author on Google Scholar Find this author on PubMed Search for this 
author on this site ORCID record for Maryam Shahdoust For correspondence: m.shahdoost{at}ipm.ir Rosa Aghdam 1 School of Biological Sciences, Institute For Research In Fundamental Sciences(IPM) , 19395-5746,Tehran, Iran Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Rosa Aghdam Mehdi Sadeghi 2 Department of Medical Genetics, National Institute for Genetic Engineering and Biotechnology , 1497716316, Tehran, Iran 1 School of Biological Sciences, Institute For Research In Fundamental Sciences(IPM) , 19395-5746,Tehran, Iran Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mehdi Sadeghi Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Motivation Gene regulatory network (GRN) reconstruction is a fundamental challenge in computational biology, and is crucial for understanding gene interactions. In this study, we aim to incorporate Gene Ontology (GO) similarities into the construction of GRNs. Our key assumption is that genes with higher similarity in Molecular Function, Biological Process, or Cellular Component categories are more likely to be functionally related and, therefore, more likely to be connected in the network. We introduce SimMapNet , a Bayesian framework that estimates the precision matrix, which serves as the adjacency matrix in a Gaussian graphical model (GGM) for GRN inference. SimMapNet enhances network inference by integrating GO similarities, which inform the hyperparameters of the prior distribution through a kernel function, incorporating biological prior knowledge in a principled manner. Results We evaluate SimMapNet on three datasets: two datasets from the SOS DNA-repair response pathway in Escherichia coli and one dataset from Drosophila melanogaster . 
The results demonstrate the algorithm’s superior performance compared to state-of-the-art methods such as GLASSO, GENIE3, and KBOOST in terms of F1-score. SimMapNet has low time complexity, making it suitable for constructing large networks. Our simulation results confirm that SimMapNet is particularly well-suited for scenarios with limited sample sizes, where traditional methods often struggle. Availability and implementation The datasets and R package of SimMapNet are available in the github repository, https://github.com/maryam-shahdoust/SimMapNet . Introduction Developing an integrative framework for Gene Regulatory Network (GRN) that combines multiple data sources, machine learning algorithms, and network-based computational strategies remains a crucial challenge in the field, requiring continuous innovation and interdisciplinary collaboration [ 1 ]. Decades of biomedical research have led to the accumulation of extensive biological knowledge, including pathway information, transcription factor-target interactions, gene ontology (GO) annotations and other functional information, which are now available in public databases [ 2 , 3 ]. Leveraging prior biological data can significantly enhance the statistical power and interpretability of GRN reconstruction, particularly in relation to complex phenotypes. However, traditional network inference methods rely solely on gene expression data [ 4 , 5 ], often resulting in high false positive and false negative rates in the inferred network due to noise and limited coverage [ 6 ]. Studies have demonstrated that incorporating prior biological knowledge can refine network structure, reduce uncertainty, and improve predictive power [ 7 , 8 , 9 , 10 ]. For instance, the Prior Lasso (pLasso) method partitions gene interactions based on pathway knowledge [ 10 ] and PriorPC that uses soft priors that assign to edges a probability of existence [ 8 ]. 
More recent approaches, such as GRNPT, integrate transformer-based embeddings from large-scale biological data to capture regulatory patterns from single-cell RNA sequencing [ 11 ]. In this study, we incorporate GO similarities into the construction of GRNs. Our key assumption is that genes with higher similarity in the Molecular Function (MF), Biological Process (BP) categories are more likely to be functionally related and, therefore, more likely to be connected in the network. This assumption is biologically motivated, as genes involved in similar biological processes or sharing molecular functions often participate in the same regulatory pathways [ 12 , 13 , 14 ]. Additionally, we also utilize Cellular Component (CC) similarities to assess the impact of different types of external information in improving the accuracy and robustness of GRN construction. We introduce SimMapNet , a Bayesian framework that estimates the precision matrix, which serves as the adjacency matrix in a Gaussian graphical model (GGM) for GRN inference. In this approach, the precision matrix—i.e., the inverse of the covariance matrix— represents the network structure, where non-zero entries indicate regulatory relationships between genes [ 15 ]. To estimate the precision matrix, we adopt a Bayesian approach, incorporating GO similarities into the estimation of the hyperparameters of the prior distribution. Specifically, GO similarities help to define the prior covariance structure, which guides the Bayesian inference process. Previously, we developed F-MAP [ 16 ], which combined Gaussian graphical models with a Bayesian framework to infer the GRN of one species while incorporating gene expression data from related species. In F-MAP, external information was integrated by estimating hyperparameters in the prior distribution through factor analysis of the covariance matrix of related species. 
Following a similar conceptual framework, SimMapNet introduces a distinct methodology by incorporating GO-based functional relationships through kernel functions. Unlike F-MAP, which relies on cross-species covariance structures, SimMapNet directly integrates functional similarity measures into the prior distribution, enabling GO similarities to systematically refine the inferred network structure. We evaluate SimMapNet using three different microarray gene expression datasets. Two datasets belong to the SOS DNA-repair response pathway in Escherichia coli , one with 9 samples and the other with 466 samples, representing different sample sizes of a small-scale GRN [ 17 , 18 , 19 ]. The third dataset, from Drosophila melanogaster [ 20 ], is considered due to its high dimensionality, which allows us to assess SimMapNet ’s performance on large-scale data. The results demonstrate SimMapNet ’s ability to more accurately identify functionally related genes. This novel approach results in a more robust and biologically interpretable network by incorporating functional relationships. Compared to established methods, including Graphical Lasso (GLASSO) [ 21 ], KBOOST [ 22 ], and GENIE3 [ 23 ], SimMapNet exhibits superior performance in ensuring biologically relevant connections among genes. Methods SimMapNet constructs the GRN within the Gaussian Graphical Models (GGM) framework [ 15 ], assuming gene relationships follow a multivariate normal distribution, and estimates the precision matrix. The algorithm integrates Bayesian inference and kernel methods to estimate the precision matrix, enforce sparsity and then build adjacency matrices representing regulatory relationships. The Bayesian estimation, GO similarities, and the stepwise implementation of SimMapNet are detailed in the following subsections, with additional information on the Bayesian framework and GGM in the supplementary file. 
Bayesian Inference of Precision Matrix Let Y = ( Y 1 , …, Y n ) T be an n × p matrix, where each Y i , for i ∈ { 1, …, n} , is an independent and identically distributed multivariate normal observation, Y i ∼ 𝒩 (0, Θ − 1 ). Here, Θ − 1 is an unknown p×p covariance matrix, and Θ is the corresponding precision matrix, which is assumed to be positive definite. Wishart Prior Distribution The Inverse Wishart distribution is a commonly used prior for Θ − 1 [ 24 , 25 ]. Based on the relationship between the Wishart and inverse Wishart distributions, the prior distribution on Θ is the Wishart distribution: W ( ν , ( ν Ω) − 1 ), where ν is the degrees of freedom, which must be greater than ( p − 1), and Ω is a p × p positive definite matrix [ 25 ]. The Wishart distribution is the conjugate prior for the population precision matrix of a multivariate normal distribution. Therefore, the posterior distribution of Θ follows the Wishart distribution, W ( ν ′ , ( ν ′ Ω ′ ) − 1 ), where: $\nu' = \nu + n, \qquad \Omega' = \dfrac{\nu\,\Omega + n\,S}{\nu + n} \qquad (1)$ and $S = \frac{1}{n} Y^{T} Y$ is the sample covariance matrix of the centered data. The mode of the posterior distribution, known as the maximum a posteriori (MAP) estimate, serves as an estimator for Θ: $\hat{\Theta}_{\mathrm{MAP}} = (\nu' - p - 1)\,(\nu'\,\Omega')^{-1} = (\nu + n - p - 1)\,(\nu\,\Omega + n\,S)^{-1} \qquad (2)$ Hyperparameter Estimation The parameter ν is empirically determined. It could be learned from the dataset, starting with an initial value greater than p − 1, where p is the number of variables (genes) [ 25 , 26 ]. We set the parameter ν equal to 2 p , as recommended by the study of Zhang et al. [ 27 ]. To estimate the hyperparameter Ω, GO similarities are transformed into a covariance structure using kernel functions, incorporating gene relationship data. GO similarities range from 0 to 1, and we calculate distances between genes as d ( x, x ′ ) = 1 − similarity, which are then input into the kernel function. Finally, the hyperparameter Ω is estimated as: $\Omega = V\,(K + \omega I)\,V \qquad (3)$ where V = Diag( σ 1 , σ 2 , …, σ p ) is a p × p diagonal matrix, whose diagonal entries represent the standard deviations of the corresponding genes. 
The p × p matrix K encodes prior information about gene correlations obtained from the kernel function. The matrix ωI is a diagonal matrix, where ω is a positive parameter that ensures the estimated Ω is positive definite and possesses desirable algebraic properties. In this study, we focus on two stationary isotropic kernels, which depend only on the distance d ( x, x ′ ) between inputs [ 28 , 29 ]: the squared exponential (SE) kernel [ 9 , 28 , 30 ] (also known as Gaussian or RBF) and the Ornstein-Uhlenbeck (OU) kernel, part of the Matérn group [ 30 ]. The formulas and parameter definitions for both kernels are provided in the supplementary file (Section 2, subsection “Kernel functions”). Gene Ontology (GO) Similarities GO classifies gene functions into Biological Process (BP), Molecular Function (MF), and Cellular Component (CC) [ 12 , 31 ]. GO similarities are computed using semantic similarity measures such as Resnik’s [ 32 ] and Wang’s similarity [ 33 ] (see the supplementary file, Section 3, for more details). These similarities are mapped to kernel functions to define distances between genes. SimMapNet steps The steps of the algorithm are presented below. The algorithm is implemented as an R package, available at GitHub. Figure 1 displays a graphical abstract of the SimMapNet algorithm. Download figure Open in new tab Fig. 1. Graphical abstract of the SimMapNet algorithm. The “ths” in the Sparsification section denotes the threshold applied to induce sparsity in the MAP estimation of the precision matrix. Step 1: Data Pre-processing The input to the algorithm consists of gene expression data which is an n × p gene expression matrix, where n and p represent the number of samples and the number of genes, respectively. This matrix can be obtained from various transcriptomic platforms. To ensure the data conforms to a multivariate normal distribution, a normalization transformation is applied. 
One common transformation is log-transformation, where the log 2 of the data is computed [ 34 ]. The gene expression vectors for each sample are then centered so that each gene has zero mean across samples. Step 2: Similarities calculation and Kernel computation Calculate the similarities (or differences) between variables. We calculate the GO similarities between genes using the GOSemSim R package [ 31 ]. These similarities are then converted into distances and passed to a kernel function to construct the kernel matrix ( K ). Step 3: Bayesian inference of the precision matrix This step includes multiple stages: 3.1 Set a Wishart prior for the precision matrix: W ( ν , ( ν Ω) − 1 ). 3.2 Choose a value equal to (or greater than) 2 p as the prior degree of freedom ν . 3.3 Estimate the hyperparameter Ω using Equation (3). 3.4 Estimate the posterior distribution: W ( ν ′ , ( ν ′ Ω ′ ) − 1 ) using Equation (1). 3.5 Estimate the MAP of the precision matrix using Equation (2). Step 4: Make Sparse Precision Matrix To induce sparsity in the MAP estimate, we apply a hard-thresholding method. In this process, different percentiles of the absolute values of the estimated partial correlations are used as thresholds. Step 5: Make Adjacency Matrix The final binary adjacency matrix is derived from the sparse precision matrix, where non-zero elements (set to one) indicate the presence of an edge between the corresponding gene pairs. Results The results of implementing SimMapNet on different datasets are evaluated using the following performance metrics: True Positive rate (TPR), False Positive rate (FPR), precision (PPV), accuracy (ACC), and F1-score [ 35 ]. The mathematical definitions of these metrics are provided in the supplementary file, Section 4. TPR and FPR are also used to plot the receiver operating characteristic (ROC) curve, and the area under the ROC curve (AUC) is calculated. 
Similarly, the Precision-Recall (PR) curve is plotted using PPV and TPR, and the area under this curve (PRAUC) is calculated [ 36 , 37 ]. The performance of SimMapNet is benchmarked against three well-known methods: GENIE3 [ 23 ], KBOOST [ 22 ], and GLASSO [ 21 ]. In addition, to more precisely evaluate the role of GO similarities, Euclidean distances have been used as prior information for calculating the kernel functions, following the approach described in the SimMapNet methodology. The parameters of the algorithm and other methods are trained based on their corresponding diagnostic measures, specifically the F1-score. The optimal parameter set is selected by maximizing the F1-score. To ensure the reliability and robustness of SimMapNet , we systematically evaluated its performance using bootstrap datasets and computed the 95% confidence interval (CI) for the F1-score obtained through bootstrap sampling [ 38 ]. The CIs are calculated according to the percentile method, i.e., the 95% CI is $[\hat{\theta}^{*}_{(0.025)},\ \hat{\theta}^{*}_{(0.975)}]$, where $\hat{\theta}^{*}_{(q)}$ denotes the $q$-quantile of the bootstrap distribution of the F1-score. These evaluations allowed us to assess the stability of SimMapNet in reconstructing GRNs and its sensitivity to variations in input data. SOS DNA repair network dataset The SOS DNA repair network is a small-scale, experimentally validated gene expression dataset consisting of nine genes. There are two real microarray SOS datasets, varying in terms of their sample size (referred to here as SOS1 and SOS2). SOS1 contains nine samples [ 17 , 18 ], while SOS2 (version 4, build 6) contains 466 samples for the same nine genes [ 19 ]. The reference network includes 24 regulatory interactions (edges). GO similarities for MF and BP were mapped for eight genes. As a result, the edges associated with umuDC were removed from the reference network, yielding a final network with 22 edges. 
Figure 2 displays the SOS reference networks, labeled “Reference Network”, along with the reconstructed networks using GO similarities MF (MF-GO), GO similarities BP (BP-GO) for SOS1 and SOS2. Due to the absence of CC similarity data for some genes, the network construction based on CC similarity was performed separately. In the reference network, edges lacking CC information were excluded, resulting in the reference network ( SimMapNetCC ), which contains only eight edges (see Figure S1 for the Reference Network). The constructed GRN using CC similarities are in Figure S1. In addition, the performance metrics for both datasets are in Table 1 and Table 2 . Figure 3 displays box plots of the F1-scores achieved from 100 bootstrap sampling for different methods. The results show that SimMapNet , particularly with molecular function ( SimMapNet MF ) and biological process ( SimMapNet BP ) GO similarities, achieves the highest F1-scores across both SOS1 and SOS2 datasets, outperforming GLASSO, GENIE3, and KBOOST. The boxplots of bootstrap sampling for both SOS1 and SOS2 confirm SimMapNet robustness, as it maintains a high median F1-score with reasonable variability. Download figure Open in new tab Fig. 2. SOS Gene regulatory networks based on MF and BP similarities. The Reference Network represents the true network where each node shows a gene and each edge shows the true relationship between genes. True positive edges are in black and false positives are in red. MF-GO (SOS1) and BP-GO (SOS1) are the reconstructed networks for SOS1 using SimMapNet with MF and BP GO similarities, respectively. Similarly, MF-GO (SOS2) and BP-GO (SOS2) are for SOS2. View this table: View inline View popup Download powerpoint Table 1. Performance Metrics of Different Methods for SOS1 Constructed Networks View this table: View inline View popup Download powerpoint Table 2. Performance Metrics of Different Methods for SOS2 Constructed Networks Download figure Open in new tab Fig. 3. 
The box plots show the F1-scores obtained from 100 bootstrap samples for different methods, where (a) corresponds to SOS1 and (b) to SOS2. SimMapNet is evaluated using GO similarities in Molecular Function (MF), Biological Process (BP), and Cellular Component (CC), denoted as SimMapNet MF, SimMapNet BP , and SimMapNet CC , respectively. SimMapNet dist refers to the version of SimMapNet that incorporates Euclidean distances between gene expressions. For comparison, the results of three other methods—GLASSO, KBOOST, and GENIE3—are also included. Comparing the results of SOS1 and SOS2 reveals that the number of false positive edges in SOS2 is higher than in SOS1, possibly due to greater expression variability resulting from its larger sample size. To assess the impact of sample size on the performance of the constructed networks, we conducted a benchmark on SOS2 using different sample sizes (20, 50, 100, and 200). The samples were selected based on their variability in gene expression. Specifically, we sorted the samples according to their standard deviation across all genes and selected the top n samples with the highest standard deviations. Figure 4 displays the changes in F1-score and AUC for different sample sizes across various methods. The complete results are in Table S1. Download figure Open in new tab Fig. 4. (a) The trend of changes in F1-score of the SOS network across different sample sizes for various methods. (b) The trend of changes in AUC of the SOS network across different sample sizes for various methods. SimMapNet using GO similarities MF, BP, and CC are represented as SimMapNet MF, SimMapNet BP, and SimMapNet CC , respectively. SimMapNet dist represents the SimMapNet method using Euclidean distances between gene expressions. Other methods are indicated by their names: GLASSO, KBOOST, and GENIE3. A sample size of 9 corresponds to the results from SOS1 data, while the other sample sizes reflect results from sample selections in SOS2. 
The sample size of 466 represents the results of the main SOS2 dataset (whole information). Drosophila melanogaster Dataset To assess SimMapNet ’s performance on high-dimensional datasets, we applied it to gene expression data from D. melanogaster (amel), obtained from Kalinka et al. [ 20 ] (ArrayExpress, E-MTAB-404). The dataset includes over 3,000 genes, but we focus on those in the reference network, which involves 12 transcription factors (TFs) and their target genes. The reference network is derived from ChIP-chip data from MacArthur et al. [ 39 ], which includes relationships between 12 TFs and their 2,049 target genes. On the other hand, since GO similarities are not available for every gene, we limited the study to the 1,441 genes for which all GO similarity information was available. Genes without GO similarity were excluded from both the study and the reference network. The best performances for the Drosophila melanogaster datasets are shown in Table S2. Networks constructed using SimMapNet with GO similarities consistently achieved higher F1-scores than those built by other methods. Each network’s parameters were selected based on the configuration yielding the maximum F1-score, resulting in networks with varying levels of sparsity. To ensure a fair comparison, we searched each method’s results to find the performance corresponding to networks with a similar number of edges to the reference network (about 4,967 edges). Table 3 presents these results. Even under matched conditions, SimMapNet using GO similarities clearly outperforms the competing methods. View this table: View inline View popup Download powerpoint Table 3. Performance Metrics of Different methods for Drosophila melanogaster Constructed Networks with similar number of edges To investigate whether combining networks enhances performance, we integrated the networks constructed using different GO similarity measures by identifying common edges among them ( Figure 5 , see also Table S3). 
The bar plot illustrates performance across various GO combinations. The MF&CC network achieved the highest TPR, while MF&BP showed superior Precision, and BP&CC offered a balanced trade-off between the two. Combining all three GO types (MF&BP&CC) resulted in a lower FPR and improvements in both Precision and Accuracy, but with a reduction in TPR. This indicates a trade-off: enhancing precision may come at the expense of detecting fewer true positive edges. Download figure Open in new tab Fig. 5. Performance comparison of gene regulatory networks constructed using SimMapNet with different GO similarity measures in Drosophila melanogaster . The bar plot presents True Positive Rate (TPR), False Positive Rate (FPR), Precision, Accuracy, and F-score for networks based on pairwise similarities (MF&BP, MF&CC, BP&CC) and the combined network (MF&BP&CC). The MF&BP&CC network, which integrates all three GO similarities, achieves the highest accuracy but a lower TPR compared to pairwise networks. Discussion This study introduces SimMapNet , a GRN reconstruction approach that uses GO similarity to improve gene relationship inference. Three datasets were used to evaluate SimMapNet : two small SOS networks [ 17 , 19 ] and a large-scale Drosophila melanogaster dataset [ 20 ]. We found that GO similarities greatly increase network inference accuracy. Further, the results of bootstrap sampling confirm that SimMapNet balances both high accuracy and stability, making it a more reliable approach for network inference. The inferred networks were validated against reference networks and benchmarked against GENIE3, KBOOST, and GLASSO. SimMapNet typically outperformed various algorithms, with improvements varying by similarity metric. GO-based similarities were more accurate than Euclidean distances, showing the value of biological context over statistics. 
Our key assumption is that genes sharing functional similarity are likely to participate in related biological processes and be regulated by similar mechanisms. This often results in potential interactions within a gene regulatory network, where one gene can directly influence the expression of another, enabling coordinated control over a common function [ 12 ]. Incorporating such similarities allows the algorithm to better capture the underlying biological structure and mitigate noise in gene expression data, particularly in cases with limited sample sizes. We also incorporate CC similarities, as they may provide complementary insights into gene interactions, despite their implications differing from those of functional similarity. Our results suggest that functional similarity, particularly utilizing MF and BP, generally improves network inference performance. In the construction of SOS GRNs using both SOS1 and SOS2 datasets, networks built with BP and MF similarities outperform those based on CC similarity. In the Drosophila melanogaster dataset, the differences are less pronounced, although the constructed network utilizing MF and BP still achieves slightly higher AUC scores than CC ( Figure 2 and Figure S1). To ensure a fair comparison, we selected parameter sets that generated a comparable number of network edges, approximately matching the reference network’s edge count ( Table 3 ). Under these conditions, SimMapNet using MF remains the strongest predictor, while CC outperforms BP. These findings suggest that while functional similarity is generally more informative for GRN reconstruction, CC similarity may still provide valuable insights depending on the dataset and network structure. Further investigation is needed to fully understand the implications of this result. Assessment of SimMapNet ’s performance across varying sample sizes for the SOS dataset (Table S1) reveals distinct performance patterns across GO categories. 
SimMapNet using MF similarities demonstrates the most stable performance, particularly excelling in small sample sizes where functional constraints are more informative. The GRN constructed by SimMapNet applying BP performs comparably but shows slight variations due to the complexity of process-level interactions. SimMapNet using CC exhibits robust performance but is more sensitive to increasing sample sizes, likely due to structural dependencies captured in cellular localization data. Despite these variations, all GO-constrained versions of SimMapNet outperform traditional methods across the tested sample sizes. As the sample sizes increase, we observe that SimMapNet , like other GRN inference methods, produces slightly denser networks, which may introduce additional false positives. This effect is common in network reconstruction due to the increasing number of detectable correlations in large datasets. However, SimMapNet ’s Bayesian framework mitigates this issue better than alternative approaches by integrating biological priors. The results indicate that SimMapNet does not require a large sample size to perform well, which makes it particularly effective when data are limited. GENIE3 performs well at smaller sample sizes but fails to show consistent improvement as sample sizes increase, suggesting that its tree-based structure may be more sensitive to data variability. While GLASSO maintains relatively stable, though suboptimal, performance across larger sample sizes, it is worth mentioning that GLASSO often struggles with overfitting or unstable covariance estimates in low-data scenarios [ 40 , 41 ]. However, SimMapNet outperforms KBOOST not only when applied to SOS data with varying sample sizes but also in constructing gene regulatory networks for Drosophila flies, particularly benefiting from GO-based constraints that improve stability when data is limited. 
SimMapNet employs a Bayesian framework to estimate the precision matrix of gene expression data, which serves as the foundation for inferring a GGM of the gene regulatory network. While Bayesian precision matrix estimation for multivariate normal data is a well-established and widely used technique [ 42 , 43 , 44 , 11 ], most existing approaches are not specifically tailored for GRN construction and lack the capacity to incorporate external biological information. (The supplementary file, section 6 and Table S4, includes the results of implementing one of these Bayesian methods—Ledoit and Wolf [ 42 ]—on SOS and Drosophila melanogaster datasets.) Integrating this information into the prior distribution not only introduces an adaptive regularization mechanism that enhances the interpretability of inferred networks but also reduces computational complexity. As a result, SimMapNet achieves remarkably short computation times, making it a highly efficient tool for GRN inference, capable of constructing even large networks within a minute, provided that gene similarities are available. Future studies could explore hybrid approaches that combine Bayesian shrinkage techniques with biological constraints to further enhance GRN inference accuracy. To incorporate GO similarities, we employ a kernel function that transforms similarity values into a structured prior covariance matrix. This kernel-based approach ensures that prior biological knowledge is treated as a probabilistic influence rather than a strict constraint, making the framework more flexible and biologically realistic. We utilize two isotropic kernel functions: the squared exponential (SE) kernel, which allows smooth transitions in similarity contributions and demonstrated robust performance [ 45 , 9 ], and the Ornstein-Uhlenbeck (OU) kernel, which enforces a stronger locality constraint to reduce false positive edges [ 28 ]. 
The choice of the kernel function is left to the user, who can select the one that yields the best-performing network. In this study, we evaluate both kernels and determine which one provides the most biologically meaningful results. The other parameters of SimMapNet, such as the degrees of freedom of the prior distribution (ν), must be specified by the user. Here, we set ν = 2p since varying it did not strongly affect the results. In this paper, our primary goal is to present SimMapNet as an approach that benefits from incorporating external biological information. Therefore, we optimize the parameters to achieve the best performance, particularly focusing on the F1-score using the available reference network. We select the level of sparsity that maximizes network accuracy rather than using predefined statistical criteria. However, users can apply alternative sparsity selection methods, such as the Bayesian Information Criterion (BIC) [ 46 ] or Extended BIC (EBIC) [ 47 , 43 ], depending on their specific dataset and application. Additionally, we are working on deriving a closed-form solution for certain parameters, particularly leveraging the algebraic properties of the estimations. This effort aims to reduce user dependency on manual parameter selection, thereby improving the usability and robustness of the framework. While this study focuses on GRN inference, the underlying framework of SimMapNet is highly adaptable to other biological datasets, such as microbiome networks. In addition, by leveraging different types of domain-specific similarities, SimMapNet has the potential to improve network inference across diverse biological systems. Competing interests No competing interest is declared. Author contributions statement MSH and MS developed the idea. MSH collected the data. MSH and RA led all statistical analyses, from data preprocessing to model fitting, and summarized the results in figures. 
MSH wrote the initial complete draft of the manuscript. MSH, RA, and MS contributed to the interpretation, editing, and revision of the manuscript. All authors read and approved the final manuscript. Acknowledgments This research was supported by the Iran National Science Foundation (INSF) under project number 4027812. Footnotes https://github.com/maryam-shahdoust/SimMapNet References 1. ↵ Vân Anh Huynh-Thu and Guido Sanguinetti . Gene regulatory network inference: an introductory survey . Gene regulatory networks: Methods and protocols , pages 1 – 23 , 2019 . 2. ↵ Gene Ontology Consortium et al. Creating the gene ontology resource: design and implementation . Genome research , 11 ( 8 ): 1425 – 1433 , 2001 . OpenUrl Abstract / FREE Full Text 3. ↵ Gene Ontology Consortium . The gene ontology (go) database and informatics resource . Nucleic acids research , 32 ( Suppl 1 ): D258 – D261 , 2004 . OpenUrl CrossRef PubMed Web of Science 4. ↵ Rosa Aghdam , Mojtaba Ganjali , Xiujun Zhang , and Changiz Eslahchi . Cn: a consensus algorithm for inferring gene regulatory networks using the sorder algorithm and conditional mutual information test . Molecular BioSystems , 11 ( 3 ): 942 – 949 , 2015 . OpenUrl CrossRef PubMed 5. ↵ Seyed Amir Malekpour , Maryam Shahdoust , Rosa Aghdam , and Mehdi Sadeghi . wplogicnet: logic gate and structure inference in gene regulatory networks . Bioinformatics , 39 ( 2 ): btad072 , 2023 . OpenUrl CrossRef PubMed 6. ↵ Rasmus Magnusson and Mika Gustafsson . Liplike: towards gene regulatory network predictions of high certainty . Bioinformatics , 36 ( 8 ): 2522 – 2529 , 2020 . OpenUrl CrossRef PubMed 7. ↵ Elisa Benedetti , Maja Pučić-Baković , Toma Keser , Nathalie Gerstner , Mustafa Büyüközkan , Tamara Štambuk , Maurice HJ Selman , Igor Rudan , Ozren Polašek , Caroline Hayward , et al. A strategy to incorporate prior knowledge into correlation network cutoff selection . Nature communications , 11 ( 1 ): 5153 , 2020 . OpenUrl CrossRef PubMed 8. 
↵ Mahsa Ghanbari , Julia Lasserre , and Martin Vingron . Reconstruction of gene networks using prior knowledge . BMC systems biology , 9 : 1 – 11 , 2015 . OpenUrl CrossRef PubMed 9. ↵ Eyal Krupka and Naftali Tishby . Incorporating prior knowledge on features into learning . In Artificial Intelligence and Statistics , pages 227 – 234 . PMLR , 2007 . 10. ↵ Zixing Wang , Wenlong Xu , F Anthony San Lucas , and Yin Liu . Incorporating prior knowledge into gene network study . Bioinformatics , 29 ( 20 ): 2633 – 2640 , 2013 . OpenUrl CrossRef PubMed 11. ↵ Guangzheng Weng , Patrick Martin , Hyobin Kim , and Kyoung Jae Won . Integrating prior knowledge using transformer for gene regulatory network inference . Advanced Science , 12 ( 3 ): 2409990 , 2025 . OpenUrl CrossRef PubMed 12. ↵ Kimberly Glass , Edward Ott , Wolfgang Losert , and Michelle Girvan . Implications of functional similarity for gene regulatory interactions . Journal of The Royal Society Interface , 9 ( 72 ): 1625 – 1636 , 2012 . OpenUrl CrossRef PubMed 13. ↵ Wenting Liu , Kuiyu Chang , Jie Zheng , Jain Divya , Jung-Jae Kim , and Jagath C Rajapakse . Gene regulatory networks from gene ontology . In Bioinformatics Research and Applications: 9th International Symposium, ISBRA 2013, Charlotte, NC, USA, May 20-22, 2013. Proceedings 9 , pages 87 – 98 . Springer , 2013 . 14. ↵ Wenting Liu , Jianjun Liu , and Jagath C Rajapakse . Gene ontology enrichment improves performances of functional similarity of genes . Scientific reports , 8 ( 1 ): 12100 , 2018 . OpenUrl CrossRef PubMed 15. ↵ Daphne Koller . Probabilistic graphical models: Principles and techniques , 2009 . 16. ↵ Maryam Shahdoust , Hamid Pezeshk , Hossein Mahjub , and Mehdi Sadeghi . F-map: a bayesian approach to infer the gene regulatory network using external hints . Plos one , 12 ( 9 ): e0184795 , 2017 . OpenUrl CrossRef PubMed 17. ↵ Timothy S Gardner , Diego Di Bernardo , David Lorenz , and James J Collins . 
Inferring genetic networks and identifying compound mode of action via expression profiling . Science , 301 ( 5629 ): 102 – 105 , 2003 . OpenUrl Abstract / FREE Full Text 18. ↵ Michal Ronen , Revital Rosenberg , Boris I Shraiman , and Uri Alon . Assigning numbers to the arrows: parameterizing a gene regulation network by using accurate expression kinetics . Proceedings of the national academy of sciences , 99 ( 16 ): 10555 – 10560 , 2002 . OpenUrl Abstract / FREE Full Text 19. ↵ Stephen Kotiang and Ali Eslami . A probabilistic graphical model for system-wide analysis of gene regulatory networks . Bioinformatics , 36 ( 10 ): 3192 – 3199 , 2020 . OpenUrl CrossRef PubMed 20. ↵ Alex T Kalinka , Karolina M Varga , Dave T Gerrard , Stephan Preibisch , David L Corcoran , Julia Jarrells , Uwe Ohler , Casey M Bergman , and Pavel Tomancak . Gene expression divergence recapitulates the developmental hourglass model . Nature , 468 ( 7325 ): 811 – 814 , 2010 . OpenUrl CrossRef PubMed Web of Science 21. ↵ Jerome Friedman , Trevor Hastie , and Robert Tibshirani . Sparse inverse covariance estimation with the graphical lasso . Biostatistics , 9 ( 3 ): 432 – 441 , 2008 . OpenUrl CrossRef PubMed Web of Science 22. ↵ Luis F Iglesias-Martinez , Barbara De Kegel , and Walter Kolch . Kboost: a new method to infer gene regulatory networks from gene expression data . Scientific Reports , 11 ( 1 ): 15461 , 2021 . OpenUrl CrossRef PubMed 23. ↵ Vân Anh Huynh-Thu , Alexandre Irrthum , Louis Wehenkel , and Pierre Geurts . Inferring regulatory networks from expression data using tree-based methods . PloS one , 5 ( 9 ): e12776 , 2010 . OpenUrl CrossRef PubMed 24. ↵ Morris L Eaton and ML Eaton . Multivariate statistics: a vector space approach , volume 512 . Wiley New York , 1983 . 25. ↵ Andrew Gelman , John B Carlin , Hal S Stern , and Donald B Rubin . Bayesian data analysis . Chapman and Hall/CRC , 1995 . 26. ↵ S James Press . 
Applied multivariate analysis: using Bayesian and frequentist methods of inference. Courier Corporation , 2005 . 27. ↵ Yi Zhang. Smart pca . In Twenty-First International Joint Conference on Artificial Intelligence , 2009 . 28. ↵ Andrew Gordon Wilson . Covariance kernels for fast automatic pattern discovery and extrapolation with Gaussian processes. PhD thesis, University of Cambridge Cambridge, UK , 2014 . 29. ↵ Ethan Anderes , Jesper Møller , and Jakob G Rasmussen . Isotropic covariance functions on graphs and their edges . 2020 . 30. ↵ Moreno Bevilacqua , Christian Caamaño-Carrillo , and Emilio Porcu . Unifying compactly supported and matérn covariance functions in spatial statistics . Journal of Multivariate Analysis , 189 : 104949 , 2022 . OpenUrl CrossRef 31. ↵ Guangchuang Yu , Fei Li , Yide Qin , Xiaochen Bo , Yibo Wu , and Shengqi Wang . Gosemsim: an r package for measuring semantic similarity among go terms and gene products . Bioinformatics , 26 ( 7 ): 976 – 978 , 2010 . OpenUrl CrossRef PubMed Web of Science 32. ↵ Philip Resnik . Semantic similarity in a taxonomy: An information-based measure and its application to problems of ambiguity in natural language . Journal of artificial intelligence research , 11 : 95 – 130 , 1999 . OpenUrl CrossRef 33. ↵ James Z Wang , Zhidian Du , Rapeeporn Payattakool , Philip S Yu , and Chin-Fu Chen . A new method to measure the semantic similarity of go terms . Bioinformatics , 23 ( 10 ): 1274 – 1281 , 2007 . OpenUrl CrossRef PubMed Web of Science 34. ↵ John Quackenbush . Microarray data normalization and transformation . Nature genetics , 32 ( 4 ): 496 – 501 , 2002 . OpenUrl CrossRef PubMed Web of Science 35. ↵ Douglas G Altman and J Martin Bland . Diagnostic tests. 1: Sensitivity and specificity . BMJ: British Medical Journal , 308 ( 6943 ): 1552 , 1994 . OpenUrl CrossRef PubMed 36. ↵ Kendrick Boyd , Kevin H Eng , and C David Page . Area under the precision-recall curve: point estimates and confidence intervals . 
In Machine Learning and Knowledge Discovery in Databases: European Conference, ECML PKDD 2013, Prague, Czech Republic, September 23-27, 2013, Proceedings, Part III 13 , pages 451 – 466 . Springer , 2013 . 37. ↵ David Faraggi and Benjamin Reiser . Estimation of the area under the roc curve . Statistics in medicine , 21 ( 20 ): 3093 – 3106 , 2002 . OpenUrl CrossRef PubMed Web of Science 38. ↵ Robert J Tibshirani and Bradley Efron . An introduction to the bootstrap . Monographs on statistics and applied probability , 57 ( 1 ): 1 – 436 , 1993 . OpenUrl 39. ↵ Stewart MacArthur , Xiao-Yong Li , Jingyi Li , James B Brown , Hou Cheng Chu , Lucy Zeng , Brandi P Grondona , Aaron Hechmer , Lisa Simirenko , Soile VE Keränen, et al. Developmental roles of 21 drosophila transcription factors are determined by quantitative differences in binding to an overlapping set of thousands of genomic regions . Genome biology , 10 : 1 – 26 , 2009 . OpenUrl CrossRef 40. ↵ Peter Bühlmann and Sara Van De Geer . Statistics for high-dimensional data: methods, theory and applications . Springer Science & Business Media , 2011 . 41. ↵ Adam J Rothman , Peter J Bickel , Elizaveta Levina , and Ji Zhu . Sparse permutation invariant covariance estimation . 2008 . 42. ↵ Olivier Ledoit and Michael Wolf . A well-conditioned estimator for large-dimensional covariance matrices . Journal of multivariate analysis , 88 ( 2 ): 365 – 411 , 2004 . OpenUrl CrossRef 43. ↵ Markku Kuismin and Mikko J Sillanpää. Use of wishart prior and simple extensions for sparse precision matrix estimation . PloS one , 11 ( 2 ): e0148171 , 2016 . OpenUrl CrossRef PubMed 44. ↵ Willem van den Boom , Alexandros Beskos , and Maria De Iorio . The g-wishart weighted proposal algorithm: Efficient posterior computation for gaussian graphical models . Journal of Computational and Graphical Statistics , 31 ( 4 ): 1215 – 1224 , 2022 . OpenUrl CrossRef 45. ↵ Carl Edward Rasmussen and Hannes Nickisch . 
Gaussian processes for machine learning (gpml) toolbox . The Journal of Machine Learning Research , 11 : 3011 – 3015 , 2010 . OpenUrl 46. ↵ Andrew A Neath and Joseph E Cavanaugh . The bayesian information criterion: background, derivation, and applications . Wiley Interdisciplinary Reviews: Computational Statistics , 4 ( 2 ): 199 – 203 , 2012 . OpenUrl CrossRef 47. ↵ Rina Foygel and Mathias Drton . Extended bayesian information criteria for gaussian graphical models . Advances in neural information processing systems , 23 , 2010 . View the discussion thread. Back to top Previous Next Posted April 15, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following SimMapNet: A Bayesian Framework for Gene Regulatory Network Inference Using Gene Ontology Similarities as External Hint Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share SimMapNet: A Bayesian Framework for Gene Regulatory Network Inference Using Gene Ontology Similarities as External Hint Maryam Shahdoust , Rosa Aghdam , Mehdi Sadeghi bioRxiv 2025.04.09.647936; doi: https://doi.org/10.1101/2025.04.09.647936 Share This Article: Copy Citation Tools SimMapNet: A Bayesian Framework for Gene Regulatory Network Inference Using Gene Ontology Similarities as External Hint Maryam Shahdoust , Rosa Aghdam , Mehdi Sadeghi bioRxiv 2025.04.09.647936; doi: https://doi.org/10.1101/2025.04.09.647936 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7620) Biochemistry (17642) Bioengineering (13865) Bioinformatics (41863) Biophysics (21410) Cancer Biology (18548) Cell Biology (25437) Clinical Trials (138) Developmental Biology (13359) Ecology (19865) Epidemiology (2067) Evolutionary Biology (24288) Genetics (15587) Genomics (22468) Immunology (17704) Microbiology (40301) Molecular Biology (17142) Neuroscience (88448) Paleontology (666) Pathology (2825) Pharmacology and Toxicology (4815) Physiology (7634) Plant Biology (15109) Scientific Communication and Education (2042) Synthetic Biology (4285) Systems Biology (9812) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00