Automated Navigation of the lncRNA Transcriptome: A comprehensive SnakeMake based computational Pipeline for robust Identification of lncRNAs and their putative targets

doi:10.1101/2024.08.18.608522

Automated Navigation of the lncRNA Transcriptome: A comprehensive SnakeMake based computational Pipeline for robust Identification of lncRNAs and their putative targets

2024 · doi:10.1101/2024.08.18.608522

preprint OA: closed CC-BY-NC-ND-4.0

📄 Open PDF Full text JSON View at publisher

Full text 53,836 characters · extracted from preprint-html · click to expand

Automated Navigation of the lncRNA Transcriptome: A comprehensive SnakeMake based computational Pipeline for robust Identification of lncRNAs and their putative targets | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Automated Navigation of the lncRNA Transcriptome: A comprehensive SnakeMake based computational Pipeline for robust Identification of lncRNAs and their putative targets Manu Kandpal , Chitranjan Mukherjee , Bhadresh Rami doi: https://doi.org/10.1101/2024.08.18.608522 Manu Kandpal 1 Deep Biological Intelligence, Artificial Intelligence Centre of Excellence, Jio Platforms Ltd. 
, NHQ Delhi, Gurugram, Haryana, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Chitranjan Mukherjee 2 Deep Biological Intelligence, Artificial Intelligence Centre of Excellence, Jio Platforms Ltd. , NHQ, Bengaluru, Karnataka, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site Bhadresh Rami 2 Deep Biological Intelligence, Artificial Intelligence Centre of Excellence, Jio Platforms Ltd. , NHQ, Bengaluru, Karnataka, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: Bhadresh.Rami{at}ril.com Abstract Full Text Info/History Metrics Supplementary material Preview PDF ABSTRACT Background Long non-coding RNAs (lncRNAs) have emerged as potent regulatory elements in cellular processes. The substantial increase in transcriptomic data resulting from high-throughput RNA sequencing necessitates effective approaches for the identification and functional annotation of lncRNAs. Method To address this need, we have developed a SnakeMake-based pipeline. Our pipeline automates and integrates several key steps: 1) RNA-seq analysis using Hisat2 and stringTie, (2) lncRNA identification using inhouse python scripts and tools CPC2 and BLASTX, (3) prediction of cis- and trans- gene targets of lncRNAs, and (4) KEGG pathway enrichment to obtain biological insights. Importantly, the pipeline allows users to customize parameters for each step through a user-friendly configuration file (config.yaml), enhancing flexibility and ease of use. One of the distinctive features of our approach is its single command execution, facilitating multiple runs without the need for extensive user intervention. This not only enhances user convenience but also ensures reproducibility of analyses across different studies. 
Result We applied our pipeline to rice, sorghum, and human RNA-seq data to obtain (1) a list of all differentially expressed transcripts, (2) a list of differentially expressed lncRNAs, (3) lncRNA target genes, (4) the enriched pathways to which the target genes belong, and (5) a visualization output in the form of a bubble plot that depicts the enriched pathways. Our approach can help researchers obtain valuable biological insights into how lncRNAs contribute to various biological functions. Conclusion The distinctive features of our SnakeMake-based automation pipeline position it as a versatile asset for researchers seeking a user-friendly, adaptable, robust, and reproducible solution for pan-species lncRNA analysis. By efficiently uncovering the regulatory roles of lncRNAs in cellular processes, this pipeline has the potential to shed light on various biological phenomena, such as developmental biology, disease progression, and cellular response to external stimuli. GRAPHICAL ABSTRACT This study presents a SnakeMake-based pipeline for identifying and annotating long non-coding RNAs (lncRNAs) from RNA sequencing data. It integrates RNA-seq analysis, lncRNA identification, gene target prediction, and pathway enrichment, with customizable parameters through a user-friendly configuration file. The pipeline’s single command execution enhances convenience and reproducibility. (The bubble chart in the figure is a representative chart and provided as an example.) Download figure Open in new tab 1. INTRODUCTION Long noncoding RNAs (lncRNAs) constitute a diverse category of endogenous transcribed RNA that are characterized by transcript length of ≥ 200 nucleotides ( Lan et al. 2021 ; Zhou et al. 2022 ). In contrast to protein-coding genes, lncRNAs do not code for proteins. However, their significance in the regulatory landscape of cellular processes has been increasingly recognized ( Oo et al. 2022 ). 
lncRNAs can sequester miRNAs thus disrupting the binding on the target coding genes which ultimately results in indirect deregulation of the target genes of those miRNAs. lncRNAs can interact with transcription factors ( Liu et al. 2018 ; Ai et al. 2019 ; Zhang et al. 2019 ), and chromatin modifier proteins ( Chu et al. 2011 ; Li et al. 2017 ; Bell et al. 2018 ; Bonetti et al. 2020 ), thus regulating gene expression ( Kitagawa et al. 2013 ; Xiang et al. 2014 ; Yang et al. 2015 ; Isoda et al. 2017 ; Saldaña-Meyer et al. 2019 ; Statello et al. 2021 ), apoptosis ( Ma et al. 2014 ; Li et al. 2018 ; Tao et al. 2019 ), survival, cancer migration ( Jiang et al. 2019 ; Wang et al. 2020 ; Ahmad et al. 2023 ), and metabolism ( Lin et al. 2020 ; Liu et al. 2021 ; Agostini et al. 2022 ; Duan et al. 2023 ). Their involvement in these crucial pathways emphasizes the significance of lncRNAs in the overall framework of cellular regulation and disease ( Schmitt and Chang 2016 ; Bartonicek et al. 2016 ; Bhan et al. 2017 ; Wang et al. 2023 ; Kohlmaier et al. 2023 ; Ahmad et al. 2023 ). In plants, lncRNAs have been recognized for their roles in regulating development, stress responses, and other critical processes, and similarly, in humans they are involved in crucial biological processes and disease mechanisms. In plants, lncRNAs such as COOLAIR ( Hawkes et al. 2016 ; Nguyen and Searle 2023 ) and COLDAIR ( Kim et al. 2017 ) regulate flowering time through vernalization in Arabidopsis thaliana, while LDMAR influences male sterility in rice under long-day conditions ( Ding et al. 2012a ; Zhou et al. 2012 ; Ding et al. 2012b ), and enod40 is essential for nodule formation during symbiotic nitrogen fixation in legumes ( Gultyaev et al. 2023 ). MIEL1 and lncRNA39026 modulate plant immunity ( Marino et al. 2013 ; Aznaourova et al. 2020 ) and stress responses ( Hou et al. 2020 ; Zhang et al. 2023 ), respectively. 
In humans, well-characterized lncRNAs include HOTAIR ( Hajjari and Salavaty 2015 ), which is involved in gene silencing and cancer metastasis; MALAT1 ( Arun et al. 2020 ), which regulates gene expression and alternative splicing; XIST ( Li et al. 2022a ), crucial for X-chromosome inactivation; and H19 ( Liao et al. 2023 ), which plays roles in development and cancer. Other significant lncRNAs such as ANRIL ( Aguilo et al. 2016 ), TERRA ( Chebly et al. 2022 ), PVT1 ( Li et al. 2022b ), and NEAT1 ( Dong et al. 2018 ) are associated with cancer, telomere maintenance, MYC ( Shtivelman and Bishop 1989 ; Wu et al. 2023 ) regulation, and paraspeckle formation ( Hirose et al. 2014 ), respectively. These examples illustrate the diverse and critical roles lncRNAs play in both plant and human biology, highlighting their importance in development, stress responses, defense mechanisms, and disease pathogenesis. Despite their prevalence across all eukaryotes and their key roles in regulating cellular processes, precious little is known about lncRNAs, and many unanswered questions remain regarding their biology, including their origin, genomic organization, evolution, and specific roles. Although there have been attempts to unravel the complex functions of lncRNAs, a large fraction of these molecules remains unidentified, unclassifiable and poorly characterized. Thus, there is a vital need in the field to develop robust analysis pipelines to address this key unmet need. Researchers have proposed multi-step procedures for identifying lncRNA that necessitate a comprehensive understanding of each stage. However, this leads to a decrease in the reproducibility of the process as different users may implement each stage differently. Therefore, it is critical to maximize the consistency and efficiency of these analysis methods, particularly when dealing with large datasets, and to make sure that all necessary files are in place to support reproducibility and transparency. 
Few tools have been developed to speed up the multi- step process, such as Linc2function ( Ramakrishnaiah et al. 2023 ), ICAnnoLncRNA ( Pronozin and Afonnikov 2023 ), and lncRNADetector ( Shukla et al. 2021 ), to identify lncRNAs from the fasta sequences. However, there aren’t many pipelines that can identify lncRNAs from raw fastq files, which contain unprocessed raw data. It is important and preferable to begin data analysis with unprocessed raw files to minimize data loss. A few pipelines, such as CALINCA ( Talyan et al. 2021 ) and UClncR ( Sun et al. 2017 ), are available but they only analyze human data and cannot be used for any other species. Our pipeline, built using the Snakemake workflow management system ( Köster and Rahmann 2012 ), explicitly defines the individual steps of the analysis workflow by specifying software environments and their corresponding dependencies in terms of libraries required for the installation. This ensures that every time data is analyzed, it is done using the identical software versions and dependencies, thereby ensuring reproducibility and reducing human intervention. Our pipeline encompasses multiple stages of analysis, starting from the handling of raw fastq data to analyzing transcriptomics data, identifying lncRNAs, classifying them, performing differential expression (DE) analysis between test vs control samples, and pinpointing the trans and cis target genes regulated by the DE-lncRNAs. Furthermore, our pipeline conducts KEGG enrichment analysis to uncover the enriched pathways governed by the genes targeted by the DE-lncRNAs. An important feature of our pipeline is its versatility, as it can simultaneously process multiple samples across different species. This capability is achieved by providing the reference genome fasta and gtf/gff files specific to each species, enabling comprehensive analysis with just one command. 2. 
MATERIAL AND METHODS Our pipeline takes raw, unprocessed, paired-ended sequence files (fastq.gz/.fq.gz) as input. Fig. 1 shows the files and folder structure in a manner in which the raw files (from either in-house experiments or databases like GEO or SRA), genome file and the annotation files need to be stored in the folder. The run folder structure is crucial, emphasizing a main folder containing Snakemake scripts and config files. Within this run folder, individual dataset folders containing raw paired-end sequence files are organized, along with the index files and gtf of the genome. There is no limit on the number of datasets that can be analysed simultaneously using our pipeline, provided suitable computational resources are available. Prerequisites include pre-run configuration of the user- edited config file, indexed reference genome (by hisat2-build) for each dataset, installation of Linux-based tools (fastQC, multiQC, fastp, bbmap, hisat2, stringtie2, gffread, gffcompare, CPC2, BLASTX, subread, and bedtools), and R packages (stringR, tidyverse, dplyr, tidyr, DESeq2, data.table, optparse, gtools, and ashr). Download figure Open in new tab Fig 1. Structure of the working directory. Use this directory structure as a template to organize raw, genome, and metadata files in their respective folders. All result files generated during script execution will be saved in the dataset folder. We have also provided a config.yaml file that can be edited by the users according to their requirements. 
For example, the following parameters can be modified by the user by easily editing the “config.yaml” file – (1) minimum length cut-off for trimming, (2) minimum phred score cut-off for trimming, (3) mapping percentage cut-off, (4) p-value and absolute log2FoldChange cut-off for differential expression analysis, (5) BLASTX E-value cut-off, (6) flanking length in BP upstream and downstream to the DE-lncRNA’s location, (7) p-value cut-off and Pearson correlation cut-off for identifying trans targets. The ability to modify the parameters by editing the config.yaml file provides significant flexibility to the individual user to fine-tune the pipeline to meet their study requirements as per experiment design. The details of each parameter are explained in the following sections. Once the files are arranged in the proper folders, the user is ready to run the pipeline for lncRNA identification developed in our study as shown in Fig 2 . The pipeline is divided into two scripts. The first script performs quality filtering, transcript mapping to genome and transcriptome assembly, while the second script performs lncRNA identification, origin-based classification, DE analysis, identification of cis- and trans-target genes of the DE-lncRNAs, and KEGG pathway enrichment of these target genes. The stepwise details of the pipeline, as outlined below, provide user-friendly guidelines for execution. Download figure Open in new tab Fig 2. The workflow depicts the process for identification and characterization of lncRNAs. Green boxes represent the individual steps performed. Brown boxes indicate the corresponding tools used. The parameters / filters applicable are listed next to the arrows. Dashed boxes indicate the steps included in script 1 & 2. All the steps have been automated using Snakemake and can be executed with just one click. 
2.1 Script 1- Pre-processing of the raw data and Alignment In the initial step, all fastq files undergo quality checks using fastqc ( https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ ) and multiqc ( Ewels et al. 2016 ). Subsequently, the fastp tool (version 0.22.0) ( Chen et al. 2018 ) is used for quality trimming, which includes automatic detection and removal of adapters. Bases with a phred score <30 and reads shorter than 100 nucleotides are trimmed by default to enhance data quality. Following this, the pipeline filters out any rRNA reads from the dataset. This step, achieved using bbduk (version 38.18) ( https://sourceforge.net/projects/bbmap/ ), employs an rRNA database provided with the sortmeRNA tool ( Kopylova et al. 2012 ). This database includes sequences from Rfam and SILVA databases. bbduk identifies and removes rRNA reads with a k-mer length of 27, retaining only those reads that do not match the rRNA database. The preprocessed reads are then aligned to the indexed reference genome using hisat2 (version 2.2.1) ( Kim et al. 2019 ) with the option --dta-cufflinks. This step aligns reads to the genome, generating alignment files in BAM format. Additionally, a stats file is generated that contains information about the alignment percentage. Samples with alignment percentages ≥ 70% are selected for further analysis. This ensures that only samples with high- quality alignment to the reference genome are selected for further analysis, helping to improve the accuracy and reliability of downstream analyses. 2.2 Script 2 - Transcriptome Assembly and lncRNA identification The aligned BAM files, meeting the mapping cut-off criteria, are assembled using StringTie2 (version 2.2.1) ( Kovaka et al. 2019 ). The assembly files from all samples are then merged into a single ".gtf" file using Gffutils ( Pertea and Pertea 2020 ). This merged file includes class code information, defining the transcript’s position in the genome. 
Class codes i, u, and x (i = intronic, u = intergenic, and x = antisense) are utilized to filter transcripts from the intergenic region of the genome or those not overlapping with the exonic sequence on the same strand as a pre-annotated gene. This ensures focus on the non-coding transcripts. Using the filtered merged GTF file, a FASTA file is generated using gffread module of GFF Utilities tool ( Pertea and Pertea 2020 ). A Perl script then filters sequences with a length ≥ 200 nucleotides, to align with the definition of lncRNA ( Kapranov et al. 2007 ; Wang and Chekanova 2017 ). Each sequence then undergoes a coding potential test using CPC2 (version 1.0.1). Sequences with a coding potential ≥0.5 are eliminated as they are predicted to be protein-coding ( Kang et al. 2017 ). Finally, a BLASTX ( Altschul et al. 1990 ) analysis is performed against the Swiss-Prot database (version 2.5.0) to filter out sequences similar to protein-coding genes, with the E-value parameter set to <= 0.001 by default. CPC2 calculates the coding potential based on features like ORF length, ORF integrity, isoelectric point and Fickett TESTCODE score, while BLASTX filters by cross-referencing known coding genes in the database. Together, these two complementary checks for coding genes help to avoid false positives. 2.3 Script 2 - DE analysis, identification of cis and trans targets and KEGG enrichment analysis The FeatureCounts module of the Subread tool (version 2.0.6) ( Liao et al. 2014 ) counts the number of reads mapped to the specific co-ordinates in the chromosome and this is utilized to generate an expression count file from the BAM files. DESeq2 (version 1.40.2) ( Love et al. 2014 ) is then employed to perform differential expression (DE) analysis, comparing the defined stages or sample types in the metadata file. The default criteria for DE analysis include a p-value cutoff of ≤ 0.05 and an absolute log2FoldChange of ≥ 1. 
This ensures that only genes with statistically significant and biologically meaningful changes in expression levels are considered DE, providing reliable insights into the biological processes under study. LncRNAs regulate genes locally ( cis ) or at a distance ( trans ). To identify cis target genes for the differentially expressed long non-coding RNAs (DE-lncRNAs), DE genes within a 100 kb region upstream and downstream from the DE-lncRNAs are identified. This approach is based on previous studies ( Tian et al. 2020 ; Sun et al. 2023 ) and is implemented using Bedtools (version 2.26.0). For identifying trans targets, the DE-lncRNAs and DE genes undergo Pearson correlation analysis, with the default threshold of r2 ≥ 0.9 and a p-value ≤ 0.05. This identifies highly correlated expression patterns between DE-lncRNAs and DE-genes, helping in the identification of potential trans targets and offering insights into the regulatory roles of lncRNAs in gene expression. 3. RESULTS 3.1 LncRNA Identification and Classification To demonstrate the functioning of our pipeline across multiple species we analyzed datasets from three studies associated with PRJNA657713 (Rice) ( Zhang et al. 2020 ), GSE141035 (Sorghum) ( Dhaka et al. 2020 ), and PRJNA1031181 (Human) ( https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246050 ). The details of the samples taken from these studies are provided in Table 1 . View this table: View inline View popup Download powerpoint Table 1. Details of different datasets used in the study The pipeline was run just with a single command, which parallelizes the process and optimizes it depending upon available computational resources and/or user input. Table 2 , Table 3 and Table 4 summarize the overall results obtained by analyzing the three datasets. In the rice dataset (PRJNA657713), 90 DE-lncRNAs were detected. Our pipeline categorized lncRNAs into three distinct classes: intergenic, intronic, and anti-sense. 
Among the 90 DE-lncRNAs, 74 were intergenic, 3 were intronic, and 13 were anti-sense. Subsequently, DE-genes and DE-lncRNAs underwent analysis for cis and trans target identification. Cis targets were identified as DE-genes located within 100kb upstream and downstream regions of DE lncRNAs, leading to the identification of 53 cis targets. Pearson correlation scores were computed between the normalized read counts of DE-lncRNAs and DE-genes, resulting in the identification of 846 trans targets with a correlation coefficient (r^2) ≥ 0.9 and p-value ≤ 0.05. Out of these 53 cis and 846 trans targets, 39 were found to be both cis and trans, resulting in a total of 860 unique targets. Among these 860 targets, 110 were upregulated, and 750 were downregulated. View this table: View inline View popup Download powerpoint Table 2. Summary of DE-lncRNA classes identified for the analyzed datasets: View this table: View inline View popup Download powerpoint Table 3. Summary of DE-lncRNAs and their associated cis and trans targets, along with enriched KEGG terms identified through the pipeline. View this table: View inline View popup Download powerpoint Table 4. Enriched KEGG terms identified separately for up- and down-regulated targets using the pipeline Similarly, in the Sorghum dataset (GSE141035), 1161 DE-lncRNAs were identified. Among these, 1074 were intergenic, 33 were intronic, and 54 were anti-sense. The analysis revealed 1941 cis and 6187 trans targets. Among these cis and trans targets, 1925 were identified as both cis and trans, leading to a total of 6203 unique targets. Out of these 6203 targets, 2886 were upregulated, and 3317 were downregulated. In the human dataset (PRJNA1031181), 725 DE-lncRNAs were identified, along with 253 cis targets and 2197 trans targets. Among the DE-lncRNAs, 315 were intergenic, 396 were intronic, and 14 were anti-sense. A total of 2281 unique targets were identified, out of which 1163 were upregulated, and 1118 were downregulated. 
3.2 KEGG Pathway Enrichment In addition, our pipeline performs KEGG pathway enrichment analysis by applying the hypergeometric test to the up- and down-regulated target genes in each species separately. Supplementary Table 1, obtained from enrichment analysis of rice, sorghum and human lncRNA target data, displays the pathway name, gene IDs associated with each pathway, and the number of genes mapped to each pathway, using a p-value cut-off of < 0.05. Using this data, the pipeline produces bubble plots in Supplementary Fig. 1 that display the statistical significance (p-value) of pathways. The plots use a colour gradient from blue to red, with blue indicating high significance and red indicating lower significance. The size of each bubble represents the number of genes associated with the pathway. Download figure Open in new tab Supplementary Fig. 1. The figure displays bubble plots illustrating the enriched KEGG pathways in: a) Rice, b) Sorghum, and c) Human. 4. DISCUSSION This pipeline offers the crucial benefit of processing samples simultaneously, unlike traditional methods. Our pipeline utilizes the Snakemake workflow management system to automate and run multiple tasks concurrently, streamlining the analysis workflow, reducing manual intervention, and optimizing resource utilization. It covers tasks from managing raw fastq data to conducting transcriptomics analysis, detecting and classifying lncRNAs, performing DE analysis, and identifying the cis- and trans-target genes of DE-lncRNAs. Additionally, it performs KEGG pathway enrichment analysis, providing researchers with a comprehensive understanding of lncRNA regulatory functions in cellular processes. The detailed biological application of this pipeline has been covered in one of our research manuscripts, which is currently under review. 
To demonstrate resource optimization and a significant reduction in processing time, we conducted a comparative analysis of manual and automated runs using Sorghum and Rice samples. Manual processing of the Rice study (6 samples) required 8 hours, while the Sorghum study (6 samples) took 9 hours. Utilizing our automated pipeline, the processing time decreased to 5.5 and 4.5 hours for rice and sorghum samples, respectively. This not only accelerates analysis and saves time but also enhances consistency while minimizing the risk of human error. Significantly, in our analysis, the BLASTX step stood out as the most time-consuming process, and its completion time increased in direct proportion to the sample depth. We evaluated the pipeline’s reproducibility by conducting independent automated runs on the rice and sorghum datasets and comparing the predicted lncRNAs. We observed a high reproducibility rate of ∼ 95%, indicating that the majority of predicted lncRNAs were consistent between the runs. The remaining 5% of minor variability can be attributed to the inherent randomness in the StringTie algorithm. During the execution of these tasks, the pipeline automatically saves the generated files and final results in the same raw data folder of the study. This approach facilitates easy data traceability, allowing users to crosscheck data from each step and aid in the identification and troubleshooting of any issues that might arise during data analysis. Based on our literature survey, we came across a few tools that were designed to study lncRNAs either in very specific contexts or to be used as analysis pipelines. We have made a list of key features needed for effective identification of lncRNAs and other aspects highlighted earlier. The availability of these features was then compared for our pipeline and the other tools. 
This comparison in no way undermines the utility of those tools and their scientific rigor, as we acknowledge that some of these tools are designed for very unique scenarios and have their benefits in those scenarios. However, from the specific point of view of a comprehensive, automated, lncRNA analysis pipeline, the relevant features of these tools have been compared with our pipeline in figure 3 . MechRNA ( Gawronski et al. 2018 ) focuses solely on predicting RNA-RNA and RNA-protein interactions, requiring additional chipseq data and lacking all other features like lncRNA and its target prediction. ICAnnoLncRNA ( Pronozin and Afonnikov 2023 ), a Snakemake pipeline, searches and classifies lncRNAs from fasta sequences specifically in plants, but does not handle raw fastq data, does not predict lncRNA targets, nor perform functional analysis. Linc2function ( Ramakrishnaiah et al. 2023 ) offers functional annotation from lncRNA fasta sequences but does not predict lncRNAs, and it is trained on human data so it is best suited for human libraries. CALINCA ( Talyan et al. 2021 ) specializes in identifying lncRNAs in podocyte disease, providing a podocyte-centric analysis without covering features such as target prediction and functional prediction. UClncR ( Sun et al. 2017 ) detects lncRNAs from RNA-seq data and quantifies them but is restricted to the human genome and lacks classification, target detection, and functional prediction capabilities. lncRNADetector ( Shukla et al. 2021 ) offers web server accessibility and user-friendliness but is specific to medicinal plant lncRNA identification without providing target and functional information. Our pipeline, focuses on lncRNA identification and analysis, operating on RNA-seq alignment files, predicting candidates, quantifying and classifying lncRNAs, predicting targets, and working across organisms. It currently cannot predict RNA- RNA or RNA-protein interactions, with efforts underway to develop this capability. 
Additionally, we plan to create a web-based platform for our automated pipeline for a wider reach and to enhance user-friendliness. Download figure Open in new tab Fig 3. A comparative table of available tools and their features that attempt to obtain insights into lncRNA identification, classification, automation and functional aspects. 5. CONCLUSION In conclusion, our SnakeMake-based pipeline provides a versatile, user-friendly, and reproducible solution for researchers seeking a comprehensive analysis of the lncRNA transcriptome. This tool will significantly contribute to advancing the field of lncRNA research by providing a standardized and efficient approach to uncovering the regulatory landscape of these intriguing molecules across different species and biological systems. FUNDING MK, CM, and BRR acknowledge Jio Platforms Limited for providing the necessary facilities and financial support through salaries. No additional funding was received for this project. COMPETING INTERESTS The authors declare no conflict of interest. AUTHOR CONTRIBUTIONS MK conceived the study, CM wrote the scripts. MK and CM wrote the manuscript. BRR supervised the study, edited the manuscript for content and flow and provided critical suggestions. ACKNOWLEDGEMENTS MK, CM and BRR acknowledge Jio Platforms Limited for providing the necessary facilities required and financial assistance through the salaries provided. We also acknowledge intern, Bommineni Sai, for his contributions in the script writing. REFERENCES ↵ Agostini M , Mancini M , Candi E ( 2022 ) Long non-coding RNAs affecting cell metabolism in cancer . Biol Direct 17 : 26 . doi: 10.1186/s13062-022-00341-x OpenUrl CrossRef ↵ Aguilo F , Di Cecilia S , Walsh MJ ( 2016 ) Long Non-coding RNA ANRIL and Polycomb in Human Cancers and Cardiovascular Disease . Curr Top Microbiol Immunol 394 : 29 – 39 . 
doi: 10.1007/82_2015_455 OpenUrl CrossRef ↵ Ahmad M , Weiswald L-B , Poulain L , Denoyelle C , Meryet-Figuiere M ( 2023 ) Involvement of lncRNAs in cancer cells migration, invasion and metastasis: cytoskeleton and ECM crosstalk . Journal of Experimental & Clinical Cancer Research 42 : 173 . doi: 10.1186/s13046-023-02741-x OpenUrl CrossRef ↵ Ai B , Kong X , Wang X , Zhang K , Yang X , Zhai J , Gao R , Qi Y , Wang J , Wang Z , Fang Y ( 2019 ) LINC01355 suppresses breast cancer growth through FOXO3-mediated transcriptional repression of CCND1 . Cell Death Dis 10 : 502 . doi: 10.1038/s41419-019-1741-8 OpenUrl CrossRef ↵ Altschul SF , Gish W , Miller W , Myers EW , Lipman DJ ( 1990 ) Basic local alignment search tool . Journal of Molecular Biology 215 : 403 – 410 . doi: 10.1016/S0022-2836(05)80360-2 OpenUrl CrossRef PubMed Web of Science ↵ Arun G , Aggarwal D , Spector DL ( 2020 ) MALAT1 Long Non-Coding RNA: Functional Implications . Noncoding RNA 6 : 22 . doi: 10.3390/ncrna6020022 OpenUrl CrossRef PubMed ↵ Aznaourova M , Janga H , Sefried S , Kaufmann A , Dorna J , Volkers SM , Georg P , Lechner M , Hoppe J , Dökel S , Schmerer N , Gruber AD , Linne U , Bauer S , Sander LE , Schmeck B , Schulte LN ( 2020 ) Noncoding RNA MaIL1 is an integral component of the TLR4-TRIF pathway . Proc Natl Acad Sci U S A 117 : 9042 – 9053 . doi: 10.1073/pnas.1920393117 OpenUrl Abstract / FREE Full Text ↵ Bartonicek N , Maag JLV , Dinger ME ( 2016 ) Long noncoding RNAs in cancer: mechanisms of action and technological advancements . Mol Cancer 15 : 43 . doi: 10.1186/s12943-016-0530-6 OpenUrl CrossRef ↵ Bell JC , Jukam D , Teran NA , Risca VI , Smith OK , Johnson WL , Skotheim JM , Greenleaf WJ , Straight AF ( 2018 ) Chromatin-associated RNA sequencing (ChAR-seq) maps genome-wide RNA-to-DNA contacts . eLife 7 : e27024 . doi: 10.7554/eLife.27024 OpenUrl CrossRef ↵ Bhan A , Soleimani M , Mandal SS ( 2017 ) Long Noncoding RNA and Cancer: A New Paradigm . Cancer Res 77 : 3965 – 3981 . 
doi: 10.1158/0008-5472.CAN-16-2634 OpenUrl Abstract / FREE Full Text ↵ Bonetti A , Agostini F , Suzuki AM , Hashimoto K , Pascarella G , Gimenez J , Roos L , Nash AJ , Ghilotti M , Cameron CJF , Valentine M , Medvedeva YA , Noguchi S , Agirre E , Kashi K , Samudyata , Luginbühl J , Cazzoli R , Agrawal S , Luscombe NM , Blanchette M , Kasukawa T , Hoon MD , Arner E , Lenhard B , Plessy C , Castelo-Branco G , Orlando V , Carninci P ( 2020 ) RADICL-seq identifies general and cell type–specific principles of genome-wide RNA-chromatin interactions . Nat Commun 11 : 1018 . doi: 10.1038/s41467-020-14337-6 OpenUrl CrossRef ↵ Chebly A , Ropio J , Baldasseroni L , Prochazkova-Carlotti M , Idrissi Y , Ferrer J , Farra C , Beylot-Barry M , Merlio J-P , Chevret E ( 2022 ) Telomeric Repeat-Containing RNA (TERRA): A Review of the Literature and First Assessment in Cutaneous T-Cell Lymphomas . Genes (Basel ) 13 : 539 . doi: 10.3390/genes13030539 OpenUrl CrossRef ↵ Chen S , Zhou Y , Chen Y , Gu J ( 2018 ) fastp: an ultra-fast all-in-one FASTQ preprocessor . Bioinformatics 34 : i884 – i890 . doi: 10.1093/bioinformatics/bty560 OpenUrl CrossRef PubMed ↵ Chu C , Qu K , Zhong FL , Artandi SE , Chang HY ( 2011 ) Genomic Maps of Long Noncoding RNA Occupancy Reveal Principles of RNA-Chromatin Interactions . Molecular Cell 44 : 667 – 678 . doi: 10.1016/j.molcel.2011.08.027 OpenUrl CrossRef PubMed Web of Science ↵ Dhaka N , Krishnan K , Kandpal M , Vashisht I , Pal M , Sharma MK , Sharma R ( 2020 ) Transcriptional trajectories of anther development provide candidates for engineering male fertility in sorghum . Sci Rep 10 : 897 . doi: 10.1038/s41598-020-57717-0 OpenUrl CrossRef ↵ Ding J , Lu Q , Ouyang Y , Mao H , Zhang P , Yao J , Xu C , Li X , Xiao J , Zhang Q ( 2012a ) A long noncoding RNA regulates photoperiod-sensitive male sterility, an essential component of hybrid rice . Proc Natl Acad Sci U S A 109 : 2654 – 2659 . 
doi: 10.1073/pnas.1121374109 OpenUrl Abstract / FREE Full Text ↵ Ding J , Shen J , Mao H , Xie W , Li X , Zhang Q ( 2012b ) RNA-directed DNA methylation is involved in regulating photoperiod-sensitive male sterility in rice . Mol Plant 5 : 1210 – 1216 . doi: 10.1093/mp/sss095 OpenUrl CrossRef PubMed Web of Science ↵ Dong P , Xiong Y , Yue J , Hanley SJB , Kobayashi N , Todo Y , Watari H ( 2018 ) Long Non-coding RNA NEAT1: A Novel Target for Diagnosis and Therapy in Human Tumors . Front Genet 9 : 471 . doi: 10.3389/fgene.2018.00471 OpenUrl CrossRef ↵ Duan J , Huang Z , Nice EC , Xie N , Chen M , Huang C ( 2023 ) Current advancements and future perspectives of long noncoding RNAs in lipid metabolism and signaling . Journal of Advanced Research 48 : 105 – 123 . doi: 10.1016/j.jare.2022.08.007 OpenUrl CrossRef ↵ Ewels P , Magnusson M , Lundin S , Käller M ( 2016 ) MultiQC: summarize analysis results for multiple tools and samples in a single report . Bioinformatics 32 : 3047 – 3048 . doi: 10.1093/bioinformatics/btw354 OpenUrl CrossRef PubMed ↵ Gawronski AR , Uhl M , Zhang Y , Lin Y-Y , Niknafs YS , Ramnarine VR , Malik R , Feng F , Chinnaiyan AM , Collins CC , Sahinalp SC , Backofen R ( 2018 ) MechRNA: prediction of lncRNA mechanisms from RNA–RNA and RNA–protein interactions . Bioinformatics 34 : 3101 – 3110 . doi: 10.1093/bioinformatics/bty208 OpenUrl CrossRef ↵ Gultyaev AP , Koster C , van Batenburg DC , Sistermans T , van Belle N , Vijfvinkel D , Roussis A ( 2023 ) Conserved structured domains in plant non-coding RNA enod40, their evolution and recruitment of sequences from transposable elements . NAR Genom Bioinform 5 :lqad091. doi: 10.1093/nargab/lqad091 OpenUrl CrossRef ↵ Hajjari M , Salavaty A ( 2015 ) HOTAIR: an oncogenic long non-coding RNA in different cancers . Cancer Biol Med 12 : 1 – 9 . 
doi: 10.7497/j.issn.2095-3941.2015.0006 OpenUrl Abstract / FREE Full Text ↵ Hawkes EJ , Hennelly SP , Novikova IV , Irwin JA , Dean C , Sanbonmatsu KY ( 2016 ) COOLAIR Antisense RNAs Form Evolutionarily Conserved Elaborate Secondary Structures . Cell Rep 16 : 3087 – 3096 . doi: 10.1016/j.celrep.2016.08.045 OpenUrl CrossRef ↵ Hirose T , Virnicchi G , Tanigawa A , Naganuma T , Li R , Kimura H , Yokoi T , Nakagawa S , Bénard M , Fox AH , Pierron G ( 2014 ) NEAT1 long noncoding RNA regulates transcription via protein sequestration within subnuclear bodies . Mol Biol Cell 25 : 169 – 183 . doi: 10.1091/mbc.E13-09-0558 OpenUrl Abstract / FREE Full Text ↵ Hou X , Cui J , Liu W , Jiang N , Zhou X , Qi H , Meng J , Luan Y ( 2020 ) LncRNA39026 Enhances Tomato Resistance to Phytophthora infestans by Decoying miR168a and Inducing PR Gene Expression . Phytopathology 110 : 873 – 880 . doi: 10.1094/PHYTO-12-19-0445-R OpenUrl CrossRef ↵ Isoda T , Moore AJ , He Z , Chandra V , Aida M , Denholtz M , Piet Van Hamburg J , Fisch KM , Chang AN , Fahl SP , Wiest DL , Murre C ( 2017 ) Non-coding Transcription Instructs Chromatin Folding and Compartmentalization to Dictate Enhancer-Promoter Communication and T Cell Fate . Cell 171 : 103 – 119 .e18. doi: 10.1016/j.cell.2017.09.001 OpenUrl CrossRef ↵ Jiang M-C , Ni J-J , Cui W-Y , Wang B-Y , Zhuo W ( 2019 ) Emerging roles of lncRNA in cancer and therapeutic opportunities . Am J Cancer Res 9 : 1354 – 1366 OpenUrl ↵ Kang Y-J , Yang D-C , Kong L , Hou M , Meng Y-Q , Wei L , Gao G ( 2017 ) CPC2: a fast and accurate coding potential calculator based on sequence intrinsic features . Nucleic Acids Research 45 : W12 – W16 . 
doi: 10.1093/nar/gkx428 OpenUrl CrossRef PubMed ↵ Kapranov P , Cheng J , Dike S , Nix DA , Duttagupta R , Willingham AT , Stadler PF , Hertel J , Hackermüller J , Hofacker IL , Bell I , Cheung E , Drenkow J , Dumais E , Patel S , Helt G , Ganesh M , Ghosh S , Piccolboni A , Sementchenko V , Tammana H , Gingeras TR ( 2007 ) RNA Maps Reveal New RNA Classes and a Possible Function for Pervasive Transcription . Science 316 : 1484 – 1488 . doi: 10.1126/science.1138341 OpenUrl Abstract / FREE Full Text ↵ Kim D , Paggi JM , Park C , Bennett C , Salzberg SL ( 2019 ) Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype . Nat Biotechnol 37 : 907 – 915 . doi: 10.1038/s41587-019-0201-4 OpenUrl CrossRef PubMed ↵ Kim D-H , Xi Y , Sung S ( 2017 ) Modular function of long noncoding RNA, COLDAIR, in the vernalization response . PLoS Genet 13 : e1006939 . doi: 10.1371/journal.pgen.1006939 OpenUrl CrossRef ↵ Kitagawa M , Kitagawa K , Kotake Y , Niida H , Ohhata T ( 2013 ) Cell cycle regulation by long non-coding RNAs . Cell Mol Life Sci 70 : 4785 – 4794 . doi: 10.1007/s00018-013-1423-0 OpenUrl CrossRef PubMed ↵ Kohlmaier A , Holdt LM , Teupser D ( 2023 ) Long noncoding RNAs in cardiovascular disease . Curr Opin Cardiol 38 : 179 – 192 . doi: 10.1097/HCO.0000000000001041 OpenUrl CrossRef ↵ Kopylova E , Noé L , Touzet H ( 2012 ) SortMeRNA: fast and accurate filtering of ribosomal RNAs in metatranscriptomic data . Bioinformatics 28 : 3211 – 3217 . doi: 10.1093/bioinformatics/bts611 OpenUrl CrossRef PubMed Web of Science ↵ Köster J , Rahmann S ( 2012 ) Snakemake—a scalable bioinformatics workflow engine . Bioinformatics 28 : 2520 – 2522 . doi: 10.1093/bioinformatics/bts480 OpenUrl CrossRef PubMed Web of Science ↵ Kovaka S , Zimin AV , Pertea GM , Razaghi R , Salzberg SL , Pertea M ( 2019 ) Transcriptome assembly from long-read RNA-seq alignments with StringTie2 . Genome Biol 20 : 278 . 
doi: 10.1186/s13059-019-1910-1 OpenUrl CrossRef PubMed ↵ Lan H , Wang H , Gao M , Luo G , Zhang J , Yi E , Liang C , Xiong X , Chen X , Wu Q , Chen R , Lin B , Qian D , Hong W ( 2021 ) Analysis and Construction of a Competitive Endogenous RNA Regulatory Network of Baicalin-Induced Apoptosis in Human Osteosarcoma Cells . Biomed Res Int 2021 : 9984112 . doi: 10.1155/2021/9984112 OpenUrl CrossRef ↵ Li G , Liu K , Du X ( 2018 ) Long Non-Coding RNA TUG1 Promotes Proliferation and Inhibits Apoptosis of Osteosarcoma Cells by Sponging miR-132-3p and Upregulating SOX4 Expression . Yonsei Med J 59:226–235. doi: 10.3349/ymj.2018.59.2.226 OpenUrl CrossRef ↵ Li J , Ming Z , Yang L , Wang T , Liu G , Ma Q ( 2022a ) Long noncoding RNA XIST: Mechanisms for X chromosome inactivation, roles in sex-biased diseases, and therapeutic opportunities . Genes Dis 9 : 1478 – 1492 . doi: 10.1016/j.gendis.2022.04.007 OpenUrl CrossRef ↵ Li R , Wang X , Zhu C , Wang K ( 2022b ) lncRNA PVT1: a novel oncogene in multiple cancers . Cell Mol Biol Lett 27 : 84 . doi: 10.1186/s11658-022-00385-x OpenUrl CrossRef ↵ Li X , Zhou B , Chen L , Gou L-T , Li H , Fu X-D ( 2017 ) GRID-seq reveals the global RNA– chromatin interactome . Nat Biotechnol 35 : 940 – 950 . doi: 10.1038/nbt.3968 OpenUrl CrossRef ↵ Liao J , Chen B , Zhu Z , Du C , Gao S , Zhao G , Zhao P , Wang Y , Wang A , Schwartz Z , Song L , Hong J , Wagstaff W , Haydon RC , Luu HH , Fan J , Reid RR , He T-C , Shi L , Hu N , Huang W ( 2023 ) Long noncoding RNA (lncRNA) H19: An essential developmental regulator with expanding roles in cancer, stem cell differentiation, and metabolic diseases . Genes Dis 10 : 1351 – 1366 . doi: 10.1016/j.gendis.2023.02.008 OpenUrl CrossRef ↵ Liao Y , Smyth GK , Shi W ( 2014 ) featureCounts: an efficient general purpose program for assigning sequence reads to genomic features . Bioinformatics 30 : 923 – 930 . 
doi: 10.1093/bioinformatics/btt656 OpenUrl CrossRef PubMed Web of Science ↵ Lin W , Zhou Q , Wang C-Q , Zhu L , Bi C , Zhang S , Wang X , Jin H ( 2020 ) LncRNAs regulate metabolism in cancer . Int J Biol Sci 16 : 1194 – 1206 . doi: 10.7150/ijbs.40769 OpenUrl CrossRef ↵ Liu C , Li H , Chu F , Zhou X , Xie R , Wei Q , Yang S , Li T , Liang S , Lü M ( 2021 ) Long non-coding RNAs: Key regulators involved in metabolic reprogramming in cancer (Review) . Oncol Rep 45 : 54 . doi: 10.3892/or.2021.8005 OpenUrl CrossRef ↵ Liu L , Yan X , Wu D , Yang Y , Li M , Su Y , Yang W , Shan Z , Gao Y , Jin Z ( 2018 ) High expression of Ras-related protein 1A promotes an aggressive phenotype in colorectal cancer via PTEN/FOXO3/CCND1 pathway . J Exp Clin Cancer Res 37 : 178 . doi: 10.1186/s13046-018-0827-y OpenUrl CrossRef ↵ Love M , Anders S , Huber W ( 2014 ) Differential analysis of count data–the DESeq2 package . Genome Biol 15 : 10 – 1186 OpenUrl CrossRef ↵ Ma C , Nong K , Zhu H , Wang W , Huang X , Yuan Z , Ai K ( 2014 ) H19 promotes pancreatic cancer metastasis by derepressing let-7’s suppression on its target HMGA2-mediated EMT . Tumour Biol 35 : 9163 – 9169 . doi: 10.1007/s13277-014-2185-5 OpenUrl CrossRef PubMed ↵ Marino D , Froidure S , Canonne J , Ben Khaled S , Khafif M , Pouzet C , Jauneau A , Roby D , Rivas S ( 2013 ) Arabidopsis ubiquitin ligase MIEL1 mediates degradation of the transcription factor MYB30 weakening plant defence . Nat Commun 4 : 1476 . doi: 10.1038/ncomms2479 OpenUrl CrossRef PubMed ↵ Nguyen V , Searle I ( 2023 ) Keeping it cool . Elife 12 : e86885 . doi: 10.7554/eLife.86885 OpenUrl CrossRef ↵ Oo JA , Brandes RP , Leisegang MS ( 2022 ) Long non-coding RNAs: novel regulators of cellular physiology and function . Pflugers Arch 474 : 191 – 204 . doi: 10.1007/s00424-021-02641-z OpenUrl CrossRef ↵ Pertea G , Pertea M ( 2020 ) GFF Utilities: GffRead and GffCompare . F1000Res 9:304. 
doi: 10.12688/f1000research.23297.2 OpenUrl CrossRef ↵ Pronozin AYu , Afonnikov DA ( 2023 ) ICAnnoLncRNA: A Snakemake Pipeline for a Long Non-Coding-RNA Search and Annotation in Transcriptomic Sequences . Genes 14 : 1331 . doi: 10.3390/genes14071331 OpenUrl CrossRef ↵ Ramakrishnaiah Y , Morris AP , Dhaliwal J , Philip M , Kuhlmann L , Tyagi S ( 2023 ) Linc2function: A Comprehensive Pipeline and Webserver for Long Non-Coding RNA (lncRNA) Identification and Functional Predictions Using Deep Learning Approaches . Epigenomes 7 : 22 . doi: 10.3390/epigenomes7030022 OpenUrl CrossRef ↵ Saldaña-Meyer R , Rodriguez-Hernaez J , Escobar T , Nishana M , Jácome-López K , Nora EP , Bruneau BG , Tsirigos A , Furlan-Magaril M , Skok J , Reinberg D ( 2019 ) RNA Interactions Are Essential for CTCF-Mediated Genome Organization . Molecular Cell 76 : 412 – 422 .e5. doi: 10.1016/j.molcel.2019.08.015 OpenUrl CrossRef ↵ Schmitt AM , Chang HY ( 2016 ) Long Noncoding RNAs in Cancer Pathways . Cancer Cell 29 : 452 – 463 . doi: 10.1016/j.ccell.2016.03.010 OpenUrl CrossRef PubMed ↵ Shtivelman E , Bishop JM ( 1989 ) The PVT gene frequently amplifies with MYC in tumor cells . Mol Cell Biol 9 : 1148 – 1154 . doi: 10.1128/mcb.9.3.1148-1154.1989 OpenUrl Abstract / FREE Full Text ↵ Shukla B , Gupta S , Srivastava G , Sharma A , Shukla AK , Shasany AK ( 2021 ) lncRNADetector: a bioinformatics pipeline for long non-coding RNA identification and MAPslnc: a repository of medicinal and aromatic plant lncRNAs . RNA Biology 18 : 2290 – 2295 . doi: 10.1080/15476286.2021.1899673 OpenUrl CrossRef ↵ Statello L , Guo C-J , Chen L-L , Huarte M ( 2021 ) Gene regulation by long non-coding RNAs and its biological functions . Nat Rev Mol Cell Biol 22 : 96 – 118 . 
doi: 10.1038/s41580-020-00315-9 OpenUrl CrossRef PubMed ↵ Sun X , Tang M , Xu L , Luo X , Shang Y , Duan W , Huang Z , Jin C , Chen G ( 2023 ) Genome- wide identification of long non-coding RNAs and their potential functions in radish response to salt stress . Front Genet 14 : 1232363 . doi: 10.3389/fgene.2023.1232363 OpenUrl CrossRef ↵ Sun Z , Nair A , Chen X , Prodduturi N , Wang J , Kocher J-P ( 2017 ) UClncR: Ultrafast and comprehensive long non-coding RNA detection from RNA-seq . Sci Rep 7 : 14196 . doi: 10.1038/s41598-017-14595-3 OpenUrl CrossRef ↵ Talyan S , Filipów S , Ignarski M , Smieszek M , Chen H , Kühne L , Butt L , Göbel H , Hoyer-Allo KJR , Koehler FC , Altmüller J , Brinkkötter P , Schermer B , Benzing T , Kann M , Müller R-U , Dieterich C ( 2021 ) CALINCA—A Novel Pipeline for the Identification of lncRNAs in Podocyte Disease . Cells 10 : 692 . doi: 10.3390/cells10030692 OpenUrl CrossRef ↵ Tao H , Liu X , Liu X , Liu W , Wu D , Wang R , Lv G ( 2019 ) LncRNA MEG3 inhibits trophoblast invasion and trophoblast-mediated VSMC loss in uterine spiral artery remodeling . Mol Reprod Dev 86 : 686 – 695 . doi: 10.1002/mrd.23147 OpenUrl CrossRef ↵ Tian H , Guo F , Zhang Z , Ding H , Meng J , Li X , Peng Z , Wan S ( 2020 ) Discovery, identification, and functional characterization of long noncoding RNAs in Arachis hypogaea L . BMC Plant Biol 20 : 308 . doi: 10.1186/s12870-020-02510-4 OpenUrl CrossRef ↵ Wang H-LV , Chekanova JA ( 2017 ) Long Noncoding RNAs in Plants . In: Rao MRS (ed) Long Non Coding RNA Biology . Springer Singapore, Singapore , pp 133–154 ↵ Wang J , Zhang X , Chen W , Hu X , Li J , Liu C ( 2020 ) Regulatory roles of long noncoding RNAs implicated in cancer hallmarks . Int J Cancer 146 : 906 – 916 . doi: 10.1002/ijc.32277 OpenUrl CrossRef ↵ Wang X , Fan H , Wang B , Yuan F ( 2023 ) Research progress on the roles of lncRNAs in plant development and stress responses . 
Frontiers in Plant Science 14 ↵ Wu F , Zhu Y , Zhou C , Gui W , Li H , Lin X ( 2023 ) Regulation mechanism and pathogenic role of lncRNA plasmacytoma variant translocation 1 (PVT1) in human diseases . Genes Dis 10 : 901 – 914 . doi: 10.1016/j.gendis.2022.05.037 OpenUrl CrossRef ↵ Xiang J-F , Yin Q-F , Chen T , Zhang Y , Zhang X-O , Wu Z , Zhang S , Wang H-B , Ge J , Lu X , Yang L , Chen L-L ( 2014 ) Human colorectal cancer-specific CCAT1-L lncRNA regulates long-range chromatin interactions at the MYC locus . Cell Res 24 : 513 – 531 . doi: 10.1038/cr.2014.35 OpenUrl CrossRef PubMed Web of Science ↵ Yang F , Deng X , Ma W , Berletch JB , Rabaia N , Wei G , Moore JM , Filippova GN , Xu J , Liu Y , Noble WS , Shendure J , Disteche CM ( 2015 ) The lncRNA Firre anchors the inactive X chromosome to the nucleolus by binding CTCF and maintains H3K27me3 methylation . Genome Biol 16 : 52 . doi: 10.1186/s13059-015-0618-0 OpenUrl CrossRef PubMed ↵ Zhang L , Lin T , Zhu G , Wu B , Zhang C , Zhu H ( 2023 ) LncRNAs exert indispensable roles in orchestrating the interaction among diverse noncoding RNAs and enrich the regulatory network of plant growth and its adaptive environmental stress response . Hortic Res 10 :uhad234. doi: 10.1093/hr/uhad234 OpenUrl CrossRef ↵ Zhang T , Liang Q , Li C , Fu S , Kundu JK , Zhou X , Wu J ( 2020 ) Transcriptome Analysis of Rice Reveals the lncRNA–mRNA Regulatory Network in Response to Rice Black- Streaked Dwarf Virus Infection . Viruses 12 : 951 . doi: 10.3390/v12090951 OpenUrl CrossRef ↵ Zhang X , Wang W , Zhu W , Dong J , Cheng Y , Yin Z , Shen F ( 2019 ) Mechanisms and Functions of Long Non-Coding RNAs at Multiple Regulatory Levels . IJMS 20 : 5573 . 
doi: 10.3390/ijms20225573 OpenUrl CrossRef ↵ Zhou H , Liu Q , Li J , Jiang D , Zhou L , Wu P , Lu S , Li F , Zhu L , Liu Z , Chen L , Liu Y-G , Zhuang C ( 2012 ) Photoperiod- and thermo-sensitive genic male sterility in rice are caused by a point mutation in a novel noncoding RNA that produces a small RNA . Cell Res 22 : 649 – 660 . doi: 10.1038/cr.2012.28 OpenUrl CrossRef PubMed Web of Science ↵ Zhou H , Ren F , Wang X , Qiu K , Sheng Y , Xie Q , Shi P , Zhang J , Pan H ( 2022 ) Genome- wide identification and characterization of long noncoding RNAs during peach (Prunus persica) fruit development and ripening . Sci Rep 12 : 11044 . doi: 10.1038/s41598-022-15330-3 OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted August 19, 2024. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Automated Navigation of the lncRNA Transcriptome: A comprehensive SnakeMake based computational Pipeline for robust Identification of lncRNAs and their putative targets Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Automated Navigation of the lncRNA Transcriptome: A comprehensive SnakeMake based computational Pipeline for robust Identification of lncRNAs and their putative targets Manu Kandpal , Chitranjan Mukherjee , Bhadresh Rami bioRxiv 2024.08.18.608522; doi: https://doi.org/10.1101/2024.08.18.608522 Share This Article: Copy Citation Tools Automated Navigation of the lncRNA Transcriptome: A comprehensive SnakeMake based computational Pipeline for robust Identification of lncRNAs and their putative targets Manu Kandpal , Chitranjan Mukherjee , Bhadresh Rami bioRxiv 2024.08.18.608522; doi: https://doi.org/10.1101/2024.08.18.608522 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7651) Biochemistry (17746) Bioengineering (13928) Bioinformatics (42066) Biophysics (21499) Cancer Biology (18650) Cell Biology (25579) Clinical Trials (138) Developmental Biology (13409) Ecology (19947) Epidemiology (2067) Evolutionary Biology (24374) Genetics (15633) Genomics (22557) Immunology (17775) Microbiology (40505) Molecular Biology (17217) Neuroscience (88796) Paleontology (667) Pathology (2845) Pharmacology and Toxicology (4836) Physiology (7664) Plant Biology (15179) Scientific Communication and Education (2047) Synthetic Biology (4304) Systems Biology (9839) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-24T02:00:01.246996+00:00

License: CC-BY-NC-ND-4.0