AdmirePred: A method for predicting abundant miRNAs in Exosomes

preprint OA: closed CC-BY-4.0
📄 Open PDF Full text JSON View at publisher
Full text 48,080 characters · extracted from preprint-html · click to expand
AdmirePred: A method for predicting abundant miRNAs in Exosomes | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results AdmirePred: A method for predicting abundant miRNAs in Exosomes View ORCID Profile Akanksha Arora , View ORCID Profile Gajendra Pal Singh Raghava doi: https://doi.org/10.1101/2025.03.19.644072 Akanksha Arora 1 Department of Computational Biology, Indraprastha Institute of Information Technology , Okhla Phase 3, New Delhi-110020, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Akanksha Arora Gajendra Pal Singh Raghava 1 Department of Computational Biology, Indraprastha Institute of Information Technology , Okhla Phase 3, New Delhi-110020, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Gajendra Pal Singh Raghava For correspondence: raghava{at}iiitd.ac.in Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Non-invasive disease diagnosis is a key application of blood exosomes in liquid biopsy, as they carry diverse biological molecules, including microRNAs (miRNAs) derived from their parent cells. Developing miRNA-based disease biomarkers requires prediction of highly abundant miRNAs in exosomes under normal conditions for establishing a baseline for understanding their physiological roles and disease-specific variations. In this study, we present models for predicting highly abundant miRNAs in exosomes from their nucleotide sequences. The models were trained, tested, and evaluated on a dataset comprising 348 abundant and 349 non-abundant miRNAs. Initially, we applied alignment-based approaches, such as motif and similarity searches, but these methods yielded poor coverage. We then explored alignment-free approaches, particularly machine learning models leveraging a broad range of features. Our Extra Trees classifier, developed using binary profiles and TF-IDF features, achieved the highest performance with an AUC of 0.77. To further enhance predictive accuracy, we developed a hybrid method that combines machine learning models with alignment-based approaches, achieving an AUC of 0.854 on an independent dataset. To support research in non-invasive diagnostics and therapeutics, we have developed a web server, standalone tool, and Python package for AdmirePred, available at https://webs.iiitd.edu.in/raghava/admirepred/ . Key points miRNA abundant in blood exosomes are promising biomarkers for liquid biopsy Classification of abundant and non-abundant miRNA in healthy individuals A hybrid method that combine alignment based and alignment free approach Prediction of miRNAs that are highly expressed in blood exosomes A web server, a python package, and a standalone tool have been created Author’s Biography Akanksha Arora is currently pursuing a Ph.D. in Computational Biology at Department of Computational Biology, Indraprastha Institute of Information Technology, New Delhi, India. Gajendra P. S. Raghava is currently working as a Professor and Head of Department of Computational Biology, Indraprastha Institute of Information Technology, New Delhi, India. 1. Introduction Exosomes are small extracellular vesicles, typically 30-150 nm in size, are secreted by nearly all cell types and carry a cargo of proteins, lipids, and nucleic acids that reflect the physiological and pathological state of their parent cells. Their stability in bodily fluids such as blood, urine, and saliva makes them ideal candidates for minimally invasive diagnostic and prognostic applications [ 1 – 4 ]. Exosomes are gaining significant attention in the field of liquid biopsy as promising non-invasive biomarkers for various diseases, including cancer, cardiovascular conditions, and neurodegenerative disorders [ 5 – 8 ]. Recent advancements in isolation and analytical techniques have enhanced the ability to harness exosomes for early disease detection, monitoring treatment response, and understanding disease mechanisms. As research progresses, exosomes are increasingly recognized for their potential to revolutionize personalized medicine by providing real-time insights into disease progression and therapeutic efficacy [ 9 , 10 ]. These vesicles carry a diverse array of molecular cargo, including mRNA, miRNA, lipids, proteins, and DNA. Among these, microRNAs (miRNAs) have garnered significant attention due to their regulatory influence on gene expression and their potential as diagnostic and prognostic biomarkers for various diseases [ 11 – 14 ]. These small non-coding RNAs (typically 20–22 nucleotides in length) modulate post-transcriptional gene expression by binding to target mRNAs, leading to their degradation or translational repression. MiRNAs abundant in exosomes are particularly significant as they regulate crucial biological pathways involved in cell proliferation, differentiation, apoptosis, and immune response modulation. Their stability in bodily fluids and ability to reflect disease-specific alterations make them valuable biomarkers for liquid biopsies, offering diagnostic, prognostic, and therapeutic insights into conditions such as cancer, cardiovascular diseases, and neurodegenerative disorders. While exosomes have emerged as a focal point in liquid biopsy research, significant efforts have also been directed toward understanding the molecules that are localized in exosomes as their presence in exosomes or other cellular compartments can influence their biological roles and utility as biomarkers [ 15 – 18 ]. Earlier we developed a method – EmiRPred, that classifies exosomal and non-exosomal miRNA based on the dataset collected from RNALocate, where subcellular locations for miRNA are given [ 18 , 19 ]. However, earlier we did not take miRNA expression into account and classified them as exosomal and non-exosomal. miRNAs exist in multiple subcellular locations, including the cytoplasm, nucleus, mitochondria, extracellular space (both vesicular and non-vesicular forms), and ribonucleoprotein complexes, but their expression levels vary amongst the locations [ 20 , 21 ]. Instead of using the binary approach of whether an miRNA is observed in exosome or not, we used expression-based approach that identifies miRNA that are abundant in exosomes. In this study, we have made a systematic effort to identify and predict the miRNA that are abundantly present in exosomes of normal subjects by taking their expression values from various studies. We believe that discovering which miRNA are present inside exosomes under normal conditions will help the scientists discover new biomarkers for a number of diseases, as well as enabling more precise and non-invasive disease monitoring [ 22 , 23 ]. Here, we present AdmirePred, a computational method designed to predict abundant miRNAs in exosomes. In this method, we have integrated two types of techniques including alignment-based approaches and AI-based approaches developed on a set of features. The method is trained and validated on experimentally verified miRNA datasets extracted from EVmiRNA and GEO datasets [ 24 , 25 ]. The architecture of this study is shown in Figure 1 . Download figure Open in new tab Figure 1: Architecture of AdmirePred 2. Methods 2.1. Data Collection and Preprocessing We collected the data from EVmiRNA for blood exosomes (serum and plasma) for normal human subjects (n=60), and average expressions were taken into account. The miRNAs having average expression in Reads Per Kilobase Million (RPKM) > 2 were assigned as abundant miRNA in exosomes whereas miRNA having average expression in RPKM < 1 were assigned as non-abundant miRNA. In this study, we refer abundant miRNAs in exosomes as exosomal miRNAs and non-abundant miRNAs as non-exosomal miRNAs. This data was then validated with another GEO study GSE270497 where the small RNA sequencing of plasma exosomes was done for both breast cancer and normal human subjects. We took the expression data for normal subjects into account to verify our data. Altogether, after obtaining the data for miRNA sequences, and removing the duplicates, we were able to get about 348 abundant and 349 non-abundant sequences, where the sequences ranged from 16 to 25 nucleotides. 2.2. Motif-Search We utilized the MERCI (Motif Emerging with Classes Identification) tool to identify motifs within miRNA sequences that are abundantly present in exosomes [ 26 ]. The motifs were discovered using the training dataset and then those motifs were searched within an independent validation set. The MERCI software allows for the selection of motifs that are exclusively present in the positive class (miRNA abundantly present in exosomes) within the training data. Furthermore, the software provides several customization options, including the ability to analyze motif frequency and detect motifs with or without gaps [ 16 ]. 2.3. Similarity Search In this study, we employed the blastn-short tool to annotate miRNA sequences by comparing their similarity to known abundant and non-abundant miRNA sequences in exosomes [ 27 ]. Initially, a database was created using the “makeblastdb” command, incorporating miRNA sequences from the training dataset. To generate results for the training set, self-hits were excluded, and only the top hit, after removing self-hits, was considered. For the independent validation set, the first hit was used to evaluate results across different e-value thresholds. This approach has been previously utilized in earlier studies [ 28 – 30 ]. 2.4. ML-based classification methods 2.4.1. Feature Generation To build a prediction model for distinguishing between abundant and non-abundant miRNA in exosomes, we employed various techniques to extract relevant features from the miRNA sequences. These techniques are outlined below: a) Nucleotide Composition: We employed Nfeature to calculate a diverse set of composition-based features from miRNA sequences, such as nucleotide composition of a sequence as well as its reverse complementary sequence for k-mers = 1,2,3, and 4 (nucleotide sequences of length k) [ 31 ]. b) Nucleotide-weighted frequency: We extracted weighted frequency of nucleotides using Term Frequency-Inverse Document Frequency (TF-IDF), a statistical technique that determines the significance of a term within a document in relation to a larger collection of documents. In our context, “terms” correspond to k-mers , and a “document” represents an miRNA sequence [ 32 ]. The Term Frequency (TF) measures how often a specific k-mer appears within a sequence, normalized by the total number of k-mers in that sequence. The Inverse Document Frequency (IDF) quantifies the uniqueness of a k-mer by calculating the logarithmic inverse of the fraction of sequences in the dataset that contain the k-mer. By combining these values, TF-IDF assigns higher importance to k-mers that frequently appear in specific sequences but are relatively rare across the dataset, making them more informative and representative of the sequence’s characteristics. These metrics are further explained in Equations 1 - 3 . In these equations, k represents the k-mer, s denotes a sequence, and D stands for the entire dataset of sequences. TF-IDF features were computed for both the original and reverse complementary sequences, covering k-mer ranges from (1,1) to (1,7). c) Distance Distribution of Nucleotides: We calculated distance distribution of different nucleotide types in miRNA sequences using Nfeature tool. Distance-based features help uncover hidden sequence patterns that might not be evident from simple nucleotide composition (like k-mer frequencies) consider direct compositions but often ignore spatial relationships between nucleotides. However, in biological sequences like miRNAs, the relative positioning of nucleotides might play a crucial role in their structural stability, functional interactions, and regulatory activity. d) Nucleotide Repeat Index: The Nucleotide Repeats Index (NRI) concisely captures the extent of consecutive runs of each nucleotide (A, C, G, U) in a sequence, turning those repeats into a simple numeric descriptor. By highlighting how nucleotides cluster or disperse, NRI aids in detecting patterns which can influence RNA folding and gene regulation. e) Entropy: The Shannon Entropy feature gauges the randomness or unpredictability of nucleotides in a given sequence. It reflects how uniformly the four nucleotides (A, C, G, U) are distributed—sequences dominated by one or two nucleotides have lower entropy, while more balanced sequences show higher entropy. This measure helps distinguish sequences based on complexity and can highlight potential regulatory or structurally significant regions where nucleotide diversity (or lack thereof) matters. We calculated both sequence level and nucleotide level Shannon Entropy for our dataset. The sequence level Shannon Entropy produces one overall entropy value for the entire sequence, whereas the nucleotide level Shannon entropy breaks the calculation down to yield a 4-value vector, capturing how each nucleotide’s distribution contributes to the entropy. f) Correlation: We computed correlation-based descriptors to capture positional relationships and physicochemical similarities within miRNA sequences. We computed a) Autocorrelation which measures how the same property (e.g., a certain physicochemical index) at different positions correlates over a given “lag” (distance); b) Cross-correlation which evaluates how two different properties correlate at positions separated by a specific lag; c) Auto-cross correlation: A hybrid approach that merges the ideas of both autocorrelation (same property) and cross-correlation (different properties) in a single descriptor. g) Pseudo Composition: Compared with the conventional nucleotide composition, pseudo nucleotide compositions have the advantage of converting RNA sequences of various lengths to a fixed-length digital vector to enable sequence comparison, while at the same time keeping the long-range sequence order information. Along with pseudo composition, we also computed two subtypes namely serial correlation pseudo composition which captures the dependency between consecutive dinucleotides in a sequence, and parallel correlation pseudo composition which computes multiple correlations simultaneously across different sequence positions. h) Binary Profiles: We generated binary profiles, also known as one-hot encoding features, to represent nucleotide sequences as binary vectors. To accommodate sequences of varying lengths, we applied a padding technique to standardize their length. As the longest miRNA sequence in our study is 25 nucleotides, all sequences were extended to this length by appending ‘X’ to the sequences until they have a length of 25 nucleotides. For instance, a sequence such as “AGTAGCATCGUUAAGTUCCAAU” which has a length of 22, was padded to “AGTAGCATCGUUAAGTUCCAAUXXX” to reach the required length of 25. Once padding was applied, the sequences were transformed into binary profiles using one-hot encoding. 2.4.2. Prediction Models We employed multiple machine learning (ML) algorithms from Scikit-learn to classify abundant and non-abundant miRNA sequences in exosomes. Decision Tree (DT) splits data based on feature thresholds to form decision rules, Random Forest (RF) builds multiple decision trees and averages their predictions for better accuracy, Logistic Regression (LR) estimates probabilities using a logistic function, K-Nearest Neighbors (KNN) classifies based on the majority class of k nearest data points, Gaussian Naïve Bayes (GNB) applies Bayes’ theorem with a Gaussian distribution, Support Vector Classifier (SVC) finds the optimal hyperplane to separate classes, Extreme Gradient Boosting (XGB) improves predictions through iterative gradient-boosted decision trees, and Extra Trees Classifier (ET) enhances RF by adding randomness to feature selection and splits, improving robustness [ 33 – 38 ]. Prediction models were developed using these algorithms based on various feature sets, as described in Section 2.3.1. 2.4.3. Cross-validation and performance metrics The dataset, comprising 697 sequences, was split in an 80:20 ratio, with 80% allocated for training and 20% for validation. To assess the performance of the ML models, a five-fold cross-validation approach was applied to the training set, while the remaining 20% was kept separate and unseen during training the models. In five-fold cross-validation, the training data is divided into five sets, with four sets used for training and one for internal validation, rotating this process five times so that each set is used as a validation set once. The models were evaluated using both threshold-dependent and threshold-independent performance metrics. The evaluation criteria included the calculation of specificity, sensitivity, accuracy Matthews correlation coefficient (MCC), and the Area Under the Receiver Operating Characteristic Curve (AUROC). AUROC is independent of thresholds, whereas the other metrics rely on threshold values. Sensitivity, specificity, and MCC were optimized to identify the best threshold. These evaluation metrics have been widely utilized in previous research to assess model performance and are shown in Equations 4 -7 [ 39 – 43 ]. Where FP, TP, FN, and TN are false positive, true positive, false negative, and true negative, respectively. 2.5. Ensemble method To enhance the predictive performance of our top-performing model, developed using the optimal feature set, we developed a method that integrates different approaches used in this study. This method employs a weighted scoring system that combines three techniques: (i) a motif-based analysis, (ii) similarity search via BLAST, and (iii) ML predictions. In this scoring scheme, an miRNA sequence is awarded +0.5 points if an exosomal motif is detected and 0 points if no such motif is found and added to the predicted probabilities generated by the best-performing ML model using the predict_proba() function from scikit-learn, which provides the likelihood of a sequence belonging to a particular class rather than a simple binary classification [ 44 ]. Similarly +0.5 points are assigned if the sequence is identified as similar to an exosomal (abundant in exosomes) miRNA through BLAST analysis and added to the probabilities generated by best performing ML model. The combined score of ML and motif-search varies from 0 to 1.5, and the combined score for ML and blast-search varies from 0 to 1.5 as well. These are defined in equations 8 and 9 . Based on these cumulative scores, sequences were classified as either exosomal or non-exosomal. This hybrid or ensemble approach has been previously applied in various studies [ 16 , 28 ]. Here, E = Prediction probability score acquired from best performing ML model and E m = Score obtained after the scores from the motif-based approach are added Here, E b = Final score obtained by adding scores from the best performing ML model and BLAST-based approach ranging from 0 to 1.5 3. Results In this study, we used various techniques to predict abundant miRNA in exosomes, which can be broadly classified into three categories: (i) Alignment-based methods, (ii) AI-based methods, and (iii) ensemble methods. The alignment-based methods include motif identification using the MERCI tool and similarity search through BLAST. The AI-based methods involve the application of ML models on a diverse range of features and their combinations. To maximize the advantages of both alignment-based and AI-based approaches, we developed an ensemble strategy that effectively integrates their strengths. 3.1 Motif-Search In motif-search using MERCI, it was observed that there was low coverage and a high error rate with a coverage of only 8 sequences in the independent validation set with 3 wrong predictions for gap = 0. Similarly for gap=1, about 15 sequences were covered with 6 wrong predictions. 3.2. Similarity-search using BLAST In this study, we utilized the blastn-short tool to implement similarity searches against a training dataset comprising abundant and non-abundant miRNA sequences in exosomes, exploring e-values from 10 −6 to 10 6 [ 27 ]. The optimal performance was observed at e-value 10 −2 , with 24 correct hits and 0 incorrect hits for exosomal (abundant in exosomes) miRNA sequences out of the total of 69 exosomal sequences in the validation set. The e-values below 10 −2 resulted in insufficient sequence coverage, while values exceeding 10 −2 led to an increased error rate. The complete results for BLAST for the e-values 10 −6 to 10 6 are described in Table 1 . View this table: View inline View popup Table 1: Number of correct, incorrect, and total hits for training and independent validation sets comprising exosomal (abundant in exosomes) miRNA sequences for e-values ranging from 10 −6 to 10 6 3.3. ML-based classification methods First, we extracted a comprehensive set of sequence features from miRNA sequences, as described in the Materials & Methods section. Subsequently, we employed various machine learning algorithms to build prediction models, including (LR), Gaussian Naïve Bayes (GNB), K-Nearest Neighbors (KNN), Decision Tree (DT), Support Vector Classifier (SVC), Extreme Gradient Boosting (XGB), Random Forest (RF), and Extra Trees Classifier (ET) models. The results are shown in Supplementary Table S1. The performances of ML-based models developed using various types of features are given below: a) Nucleotide Composition: We calculated nucleotide composition of miRNA sequences as well as their reverse complementary sequences for k-mers 1 to 4. The best performing feature among this category was the composition of reverse complementary miRNA sequence for k-mer =3, where the highest AUC of 0.738 on independent validation set was achieved on an SVC model. b) Nucleotide-weighted frequency: The best performing composition-based features were TF-IDF for reverse complementary sequences for k-mers 1 and 2 leading to an AUC of 0.744 in the independent validation set for ET model. c) Distance Distribution of Nucleotides: We computed distance distribution of nucleotides using Nfeature, and got the highest performance of 0.609 AUC for training set and 0.621 on independent validation set. d) Nucleotide Repeat Index: The nucleotide repeat index was calculated as described in the Methods section. The highest AUC observed for these set of features was 0.593 and 0.594 for training and independent validation sets respectively. e) Entropy: We computed Shannon Entropy on two levels including sequence level and nucleotide level entropy. The highest AUC achieved for sequence level was 0.561 on independent validation set, whereas for nucleotide level it was 0.594 for independent validation set. f) Correlation: We calculated three types of correlations in this study including autocorrelation, auto-cross correlation, and cross correlation. For this, the best performing features were autocorrelation of dinucleotides with AUC of 0.62 for independent validation set. g) Pseudo Composition: We computed pseudo composition, serial correlation pseudo composition, parallel correlation pseudo compositions for dipeptides. The highest performing feature was serial correlation pseudo composition with an AUC of 0.708 for training and 0.707 for independent validation set. h) Binary Profiles: We calculated binary profiles or one hot encoding features for the miRNA sequences to represent these sequences as binary vectors. These features were one of the best performing features with the highest AUC of 0.75 in the independent validation set. i) Best features: We developed the machine learning models by combining the best features in composition-based features and binary profile-based features. The best performing model was Extra Tree classifier resulting in an AUC of 0.763 in training and 0.769 in independent validation set. The results for best features are given in Table 2 and the results for combined best features are shown in Table 3 . We also performed a Mann-Whitney test on these features and it was found that 31 features were significantly different (p-value<0.05) from a total of 145 features. The results for Mann-Whitney Test are given in Supplementary Table S2. j) Feature Importance We selected the best-performing 20 features using the feature importance in the ET model for alternate dataset as well. About 15 of the 20 were found to be significantly different (p-value<0.05) between the exosomal and non-exosomal classes. In binary profile features, G at 15 th , G at 19 th , A at 16 th , G at 10 th and positions were found to be significantly increased in exosomal sequences, whereas, U at 15 th position, A at 20 th , and A at 11 th were found to be significantly decreased in exosomal sequences as compared to non-exosomal sequences. In TFIDF features from the range of (1,2) in reverse complement miRNA sequences, it is seen that the occurrence of C, AC, CG, and GA are significantly increased in reverse complementary exosomal sequences, and UU, U, AU, and UA are significantly decreased in reverse complementary exosomal sequences as compared to non-exosomal sequences. The features with their importance are given in Supplementary Table S3, and the comparison between the two classes for top 20 important features are shown in Figure 2 . View this table: View inline View popup Table 2: The performance of machine learning-based methods developed using binary profile View this table: View inline View popup Download powerpoint Table 3: The performance of machine learning-based models developed using combination of binary profile and reverse complement TFIDF View this table: View inline View popup Table 4: Results for hybrid model: best-performing features (AI-based methods) combined with similarity search (Alignment-based methods) Download figure Open in new tab Figure 2: The top 20 most important features for the classification of exosomal (abundant in exosomes) and non-exosomal (non-abundant in exosomes) miRNA Download figure Open in new tab Figure 3: AUROC graphs for the best performing ML model, and hybrid models (ML + MERCI, ML + BLAST), for A) training set and B) validation set 3.4. Comparison with existing methods We compared our prediction to the existing method – EmiRPred, which predicts exosomal miRNA and non-exosomal miRNA without taking expressions of different miRNA in exosomes into account [ 45 ]. In addition to this, we also compared our method against miRNALoc, and EL-RMLocnet that predict the subcellular localization of miRNA, with “exosome” being one of the possible locations [ 17 , 46 ]. To evaluate and compare the results, we input our independent validation set into the prediction server of miRNALoc that achieved an AUC of 0.422, compared to our server AdmirePred demonstrated a significantly higher AUC of 0.854. However, obtaining predictions from EL-RMLocNet was not feasible, as it processes only one sequence at a time, making it impractical to analyze the entire validation set of 139 sequences. As EmiRPred’s training dataset had a lot of the same miRNA used in AdmirePred, we found the miRNA sequences that were exclusively present in admirepred dataset and put them as input in EmiRPred server, it was able to predict the miRNA as exosomal and non-exosomal with an AUC of 0.623. 3.5. Webserver and Standalone Software We wish to promote the field of non-invasive diagnostics and prognostics, focusing on miRNA based biomarkers and help the researchers worldwide working in this area. The developed web server offers three key modules: Predict, Design, and BLAST-search. The Predict module allows users to input query sequences and obtain predictions for abundant miRNA in exosomes. The Design module enables users to generate all possible mutant miRNA sequences and predict how likely a mutant miRNA is to be abundantly present in exosome. The BLAST-search module provides an option of similarity search against a database of known abundant and non-abundant miRNA sequences in exosomes. This web server is designed to be accessible across multiple devices, including smartphones, PCs, iMacs, and tablets. Additionally, we have developed a Python package, a standalone tool, and a GitHub repository, which are available at the following links: https://webs.iiitd.edu.in/raghava/admirepred/ and https://github.com/raghavagps/admirepred . 4. Discussions Exosomal microRNAs (miRNAs) have emerged as critical players in intercellular communication and disease pathogenesis, offering promising potential as diagnostic and therapeutic biomarkers [ 5 , 47 ]. These small non-coding RNAs are selectively packaged into exosomes, which protect them from degradation and facilitate their transport to target cells, where they modulate gene expression [ 48 ]. Studies have demonstrated the utility of exosomal miRNAs in a variety of diseases, including cancer, neurodegenerative disorders, and cardiovascular diseases. For instance, in cancer, exosomal miR-21 and miR-155 have been identified as key regulators of tumor progression and metastasis, with elevated levels observed in the exosomes of patients with breast and pancreatic cancers [ 49 , 50 ]. In neurodegenerative diseases such as Alzheimer’s, exosomal miR-125b and miR-146a have been implicated in neuroinflammation and neuronal dysfunction [ 51 ]. Similarly, in cardiovascular diseases, exosomal miR-1 and miR-133a have been associated with myocardial injury and heart failure [ 52 ]. The ability of exosomal miRNAs to reflect disease states and their stability in bodily fluids make them attractive candidates for non-invasive diagnostics and targeted therapies. To help accelerate exosome-based diagnostics and therapies, we conducted this research in predicting the miRNA present in abundance in exosomes during normal conditions. We believe this will help the scientists worldwide to compare the expression levels of miRNA inside exosomes during various disease conditions. To achieve this, we firstly employed alignment-based approaches including motif-search and similarity-search. We performed a motif-search to find motifs that are exclusively present in positive dataset (exosome abundant miRNA) for various conditions such as Gap = 0, 1, and 2. However, it was observed that this approach did not work well for the dataset as it had low coverage of about 7.6% and high error rate. For similarity-search approach, we used blastn-short. This method showed about 0% error but low coverage accounting to only about 34.7% of the total independent validation set sequences at e-value 10 −2 . To overcome the limitations of alignment-based methods, we performed built machine learning-based models to predict highly abundant miRNA in exosomes. We calculated a number of features of the miRNA sequences including binary-profile based features and composition-based features like nucleotide composition, entropy, correlation, term-frequency-inverse document frequency (tfidf) etc. The best performing based features were binary-profile based features and tfidf features for reverse complementary sequences for k-mers (1,2). After combining these best-performing features, we achieved a maximum AUC of 0.769 and MCC of 0.410 on independent validation dataset. Next, we employed a hybrid method to harness the predicting power of both alignment-based and machine-learning based classification methods. After combining these techniques, we obtained the highest AUC of 0.854 and MCC of 0.559. We compared our method against existing computational approaches designed to predict the subcellular localization of miRNAs, including those that specifically account for exosomal miRNAs, such as EmiRPred, EL-RMLocnet and miRNALoc [ 17 , 45 , 46 ]. These methods have been widely used to infer the localization of miRNAs within cellular compartment. However, our method demonstrated superior performance in accurately predicting miRNA localization, particularly in identifying abundant miRNAs in exosomes. This improvement can be attributed to our incorporation of miRNA expression profiles. We believe that the expression levels of miRNAs play a critical role in determining their abundance. For instance, miRNAs that are highly expressed in specific cell types or under certain disease states are more likely to be selectively packaged into exosomes, as exosomal sorting mechanisms often prioritize miRNAs that are functionally relevant to intercellular communication. By integrating expression data into our predictive framework, we were able to capture these nuances, leading to more accurate predictions. Conflict of interest The authors declare no competing financial and non-financial interests. Author’s contributions AA collected and processed the data, implemented the algorithms, developed the prediction models, and built the front end and back end of the web server. AA, and GPSR prepared the manuscript. GPSR conceived and coordinated the project. All authors have read and approved the final manuscript. Data Availability Statement All the datasets generated in this study are available at https://webs.iiitd.edu.in/raghava/admirepred/dataset.php and codes are available at https://github.com/raghavagps/admirepred . Acknowledgements The authors are thankful to the Council of Scientific and Industrial Research (CSIR) for providing the fellowship. The authors are also thankful to the Department of Computational Biology, IIITD, New Delhi for its infrastructure and facilities. We thank the Department of Biotechnology (DBT) for providing an infrastructure grant to the institute (Grant BT/PR40158/BTIS/137/24/2021). We would like to acknowledge that figures were created using BioRender, and English was corrected using Grammarly. Footnotes Authors’ Information Akanksha Arora: akankshaar{at}iiitd.ac.in , Prof. Gajendra P. S. Raghava: raghava{at}iiitd.ac.in References 1. ↵ Lin Y , Anderson JD , Rahnama LMA , et al. Exosomes in disease and regeneration: biological functions, diagnostics, and beneficial effects . Am J Physiol Heart Circ Physiol 2020 ; 319 : H1162 – H1180 OpenUrl CrossRef PubMed 2. Sonar S. Decoding the mysteries of prostate cancer via cutting-edge liquid biopsy-based urine exosomes profiling . The journal of liquid biopsy 2024 ; 6 : 100162 OpenUrl CrossRef PubMed 3. Wang H , Matsumoto Y , Maiyulan A , et al. Circulating exosome-derived miR-191-5p is a novel therapeutic biomarker for radiotherapy in esophageal squamous cell carcinoma patients . Esophagus 2025 ; 4. ↵ Arora A , Kaur D , Patiyal S , et al. SalivaDB-a comprehensive database for salivary biomarkers in humans . Database (Oxford) 2023 ; 2023 : 5. ↵ Mosquera-Heredia MI , Morales LC , Vidal OM , et al. Exosomes: Potential Disease Biomarkers and New Therapeutic Targets . Biomedicines 2021 ; 9 : 6. Zhou B , Xu K , Zheng X , et al. Application of exosomes as liquid biopsy in clinical diagnosis . Signal Transduct Target Ther 2020 ; 5 : 144 OpenUrl CrossRef PubMed 7. Danielson KM , Shah R , Yeri A , et al. Plasma Circulating Extracellular RNAs in Left Ventricular Remodeling Post-Myocardial Infarction . EBioMedicine 2018 ; 32 : 172 – 181 OpenUrl CrossRef PubMed 8. ↵ Yu W , Hurley J , Roberts D , et al. Exosome-based liquid biopsies in cancer: opportunities and challenges . Ann Oncol 2021 ; 32 : 466 – 477 OpenUrl CrossRef PubMed 9. ↵ Kalluri R , LeBleu VS . The biology, function, and biomedical applications of exosomes . Science 2020 ; 367 : 10. ↵ Théry C , Witwer KW , Aikawa E , et al. Minimal information for studies of extracellular vesicles 2018 (MISEV2018): a position statement of the International Society for Extracellular Vesicles and update of the MISEV2014 guidelines . J Extracell Vesicles 2018 ; 7 : 1535750 OpenUrl CrossRef PubMed 11. ↵ Ratti M , Lampis A , Ghidini M , et al. MicroRNAs (miRNAs) and Long Non-Coding RNAs (lncRNAs) as New Tools for Cancer Therapy: First Steps from Bench to Bedside . Target Oncol 2020 ; 15 : 261 – 278 OpenUrl CrossRef PubMed 12. Wang J , Chen J , Sen S. MicroRNA as Biomarkers and Diagnostics . J Cell Physiol 2016 ; 231 : 25 – 30 OpenUrl CrossRef PubMed 13. Catalanotto C , Cogoni C , Zardo G. MicroRNA in Control of Gene Expression: An Overview of Nuclear Functions . Int J Mol Sci 2016 ; 17 : 14. ↵ Kim DH , Park H , Choi YJ , et al. Identification of exosomal microRNA panel as diagnostic and prognostic biomarker for small cell lung cancer . Biomark Res 2023 ; 11 : 80 OpenUrl CrossRef PubMed 15. ↵ Ras-Carmona A , Gomez-Perosanz M , Reche PA . Prediction of unconventional protein secretion by exosomes . BMC Bioinformatics 2021 ; 22 : 333 OpenUrl CrossRef PubMed 16. ↵ Arora A , Patiyal S , Sharma N , et al. A random forest model for predicting exosomal proteins using evolutionary information and motifs . Proteomics 2023 ; e2300231 17. ↵ Meher PK , Satpathy S , Rao AR . Publisher Correction: miRNALoc: predicting miRNA subcellular localizations based on principal component scores of physico-chemical properties and pseudo compositions of di-nucleotides . Sci Rep 2021 ; 11 : 3287 OpenUrl CrossRef PubMed 18. ↵ Arora A , Raghava GPS . Prediction of exosomal miRNA-based biomarkers for liquid biopsy . 2024 ; 19. ↵ Cui T , Dou Y , Tan P , et al. RNALocate v2.0: an updated resource for RNA subcellular localization with increased coverage and annotation . Nucleic Acids Res 2022 ; 50 : D333 – D339 OpenUrl CrossRef PubMed 20. ↵ van Wijk N , Zohar K , Linial M. Challenging Cellular Homeostasis: Spatial and Temporal Regulation of miRNAs . Int J Mol Sci 2022 ; 23 : 21. ↵ Leung AKL . The Whereabouts of microRNA Actions: Cytoplasm and Beyond . Trends Cell Biol 2015 ; 25 : 601 – 610 OpenUrl CrossRef PubMed 22. ↵ Bartel DP . Metazoan MicroRNAs . Cell 2018 ; 173 : 20 – 51 OpenUrl CrossRef PubMed 23. ↵ Weber JA , Baxter DH , Zhang S , et al. The microRNA spectrum in 12 body fluids . Clin Chem 2010 ; 56 : 1733 – 41 OpenUrl Abstract / FREE Full Text 24. ↵ Barrett T , Troup DB , Wilhite SE , et al. NCBI GEO: Mining tens of millions of expression profiles - Database and tools update . Nucleic Acids Res 2007 ; 25. ↵ Liu T , Zhang Q , Zhang J , et al. EVmiRNA: a database of miRNA profiling in extracellular vesicles . Nucleic Acids Res 2019 ; 47 : D89 – D93 OpenUrl CrossRef PubMed 26. ↵ Vens C , Rosso M-N , Danchin EGJ . Identifying discriminative classification-based motifs in biological sequences . Bioinformatics 2011 ; 27 : 1231 – 8 OpenUrl CrossRef PubMed 27. ↵ Altschul SF , Gish W , Miller W , et al. Basic local alignment search tool . J Mol Biol 1990 ; 215 : 403 – 10 OpenUrl CrossRef PubMed Web of Science 28. ↵ Kaur D , Arora A , Vigneshwar P , et al. Prediction of peptide hormones using an ensemble of machine learning and similarity-based methods . Proteomics 2024 ; 29. Sharma N , Naorem LD , Jain S , et al. ToxinPred2: an improved method for predicting toxicity of proteins . Brief Bioinform 2022 ; 23 : 30. ↵ Sharma N , Patiyal S , Dhall A , et al. AlgPred 2.0: an improved method for predicting allergenic proteins and mapping of IgE epitopes . Brief Bioinform 2021 ; 22 : 31. ↵ Mathur M , Patiyal S , Dhall A , et al. Nfeature: A platform for computing features of nucleotide sequences . bioRxiv 2021 ; 2021.12.14.472723 32. ↵ . TF–IDF . Encyclopedia of Machine Learning 2011 ; 986 – 987 33. ↵ Stoltzfus JC . Logistic regression: a brief primer . Acad Emerg Med 2011 ; 18 : 1099 – 104 OpenUrl CrossRef PubMed Web of Science 34. Wu Y , Ianakiev K , Govindaraju V. Improved k-nearest neighbor classification . Pattern Recognit 2002 ; 35. Cristianini N , Ricci E. Support Vector Machines . Encyclopedia of Algorithms 2008 ; 928 – 932 36. Chen T , Guestrin C. XGBoost: A scalable tree boosting system . Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining 2016 ; 37. Joshi D , Mishra A , Anand S. A naïve Gaussian Bayes classifier for detection of mental activity in gait signature . Comput Methods Biomech Biomed Engin 2012 ; 15 : 411 – 6 OpenUrl CrossRef PubMed 38. ↵ Geurts P , Ernst D , Wehenkel L. Extremely randomized trees . Mach Learn 2006 ; 39. ↵ Aggarwal S , Dhall A , Patiyal S , et al. An ensemble method for prediction of phage-based therapy against bacterial infections . Front Microbiol 2023 ; 14 : 1148579 OpenUrl CrossRef PubMed 40. Jarwal A , Dhall A , Arora A , et al. A deep learning method for classification of HNSCC and HPV patients using single-cell transcriptomics . Front Mol Biosci 2024 ; 11 : 41. Srivastava A , Dhall A , Patiyal S , et al. Prediction of Alzheimer’s Disease from Single Cell Transcriptomics Using Deep Learning . 42. Rainio O , Teuho J , Klén R. Evaluation metrics and statistical tests for machine learning . Sci Rep 2024 ; 14 : 6086 OpenUrl CrossRef PubMed 43. ↵ Bhalla S , Chaudhary K , Kumar R , et al. Gene expression-based biomarkers for discriminating early and late stage of clear cell renal cancer . Sci Rep 2017 ; 7 : 44997 OpenUrl CrossRef PubMed 44. ↵ Pedregosa F , Varoquaux G , Gramfort A , et al. Scikit-learn: Machine learning in Python . Journal of Machine Learning Research 2011 ; 45. ↵ Arora A , Raghava GPS . Prediction of exosomal miRNA-based biomarkers for liquid biopsy . 2024 ; 46. ↵ Asim MN , Ibrahim MA , Malik MI , et al. EL-RMLocNet: An explainable LSTM network for RNA-associated multi-compartment localization prediction . Comput Struct Biotechnol J 2022 ; 20 : 3986 – 4002 OpenUrl CrossRef PubMed 47. ↵ Tastan B , Tarakcioglu E , Birinci Y , et al. Role of Exosomal MicroRNAs in Cell-to-Cell Communication . Methods Mol Biol 2022 ; 2257 : 269 – 292 OpenUrl CrossRef PubMed 48. ↵ Zhang J , Li S , Li L , et al. Exosome and exosomal microRNA: trafficking, sorting, and function . Genomics Proteomics Bioinformatics 2015 ; 13 : 17 – 24 OpenUrl CrossRef PubMed 49. ↵ Melo SA , Luecke LB , Kahlert C , et al. Glypican-1 identifies cancer exosomes and detects early pancreatic cancer . Nature 2015 ; 523 : 177 – 82 OpenUrl CrossRef PubMed 50. ↵ Lai X , Wang M , McElyea SD , et al. A microRNA signature in circulating exosomes is superior to exosomal glypican-1 levels for diagnosing pancreatic cancer . Cancer Lett 2017 ; 393 : 86 – 93 OpenUrl CrossRef PubMed 51. ↵ Cheng L , Doecke JD , Sharples RA , et al. Prognostic serum miRNA biomarkers associated with Alzheimer’s disease shows concordance with neuropsychological and neuroimaging assessment . Mol Psychiatry 2015 ; 20 : 1188 – 96 OpenUrl CrossRef PubMed 52. ↵ Kuwabara Y , Ono K , Horie T , et al. Increased microRNA-1 and microRNA-133a levels in serum of patients with cardiovascular disease indicate myocardial damage . Circ Cardiovasc Genet 2011 ; 4 : 446 – 54 OpenUrl Abstract / FREE Full Text View the discussion thread. Back to top Previous Next Posted March 19, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following AdmirePred: A method for predicting abundant miRNAs in Exosomes Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share AdmirePred: A method for predicting abundant miRNAs in Exosomes Akanksha Arora , Gajendra Pal Singh Raghava bioRxiv 2025.03.19.644072; doi: https://doi.org/10.1101/2025.03.19.644072 Share This Article: Copy Citation Tools AdmirePred: A method for predicting abundant miRNAs in Exosomes Akanksha Arora , Gajendra Pal Singh Raghava bioRxiv 2025.03.19.644072; doi: https://doi.org/10.1101/2025.03.19.644072 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7643) Biochemistry (17717) Bioengineering (13910) Bioinformatics (42016) Biophysics (21477) Cancer Biology (18628) Cell Biology (25536) Clinical Trials (138) Developmental Biology (13392) Ecology (19935) Epidemiology (2067) Evolutionary Biology (24356) Genetics (15617) Genomics (22530) Immunology (17755) Microbiology (40437) Molecular Biology (17200) Neuroscience (88704) Paleontology (667) Pathology (2840) Pharmacology and Toxicology (4832) Physiology (7657) Plant Biology (15171) Scientific Communication and Education (2046) Synthetic Biology (4304) Systems Biology (9828) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-24T02:00:01.246996+00:00
License: CC-BY-4.0