Full text
17,280 characters
· extracted from
preprint-html
· click to expand
AutoGDC: A Python Package for DNA Methylation and Transcription Meta-Analyses | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results AutoGDC: A Python Package for DNA Methylation and Transcription Meta-Analyses Chase Alan Brown , Jonathan D. Wren doi: https://doi.org/10.1101/2024.04.14.589445 Chase Alan Brown 1 University of Oklahoma Health Science Center, Oklahoma Medical Research Foundation Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: chase-brown{at}omrf.org Jonathan D. 
Wren 2 Oklahoma Medical Research Foundation Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Preview PDF Abstract Motivation The Genomic Data Commons is a powerful resource which facilitates the exploration of molecular alterations across various diseases. However, utilizing this resource for meta-analysis requires many different tools to query, download, organize, and analyze the data. In order to facilitate a more rapid, simple means of analyzing DNA methylation and RNA sequencing datasets from the GDC, we developed autogdc, a Python package that integrates data curation and preprocessing with meta-analysis functionality into one simplified bioinformatic pipeline. Availability and Implementation The autogdc Python package is available under the GPLv3 license (the repository link was lost in this text extraction; see the publisher copy), along with several examples of typical use-case scenarios in the form of a Jupyter notebook. The data is all originally provided by the GDC, and is therefore available under the NIH Genomic Data Sharing (GDS) and NCI GDS policies. 1 Introduction The Genomic Data Commons (GDC) 1 is a rich and well-annotated human genomic data repository, containing many large studies on cancer with a focus on enabling discovery in precision medicine. The GDC has created several bioinformatic tools and web applications, which mostly operate via a representational state transfer (REST) application programming interface (API), in order to quickly query and analyze data. However, despite GDC’s work towards intuitive web apps and additional command line interface (CLI) tools, the Python ecosystem for querying, downloading, processing, and analyzing GDC data in one Python package is less developed. The autogdc package provides this access to the data repository, which contains more than 230,000 open-access data files and more than 350,000 controlled-access data files. 
The autogdc infrastructure focuses upon transcription and DNA methylation profiling data, as these two assay data types make up around 100,000 of the open-access files. The open-access paired DNA methylation and transcriptional profiling data provide unique opportunities for understanding molecular changes in cancer as well as in a broader biological context. 2 Similar work 2.1 GDC API and gdc-client The Genomic Data Commons has produced several useful tools for increasing the accessibility of their data to bioinformaticians. One particular CLI tool of interest is the gdc-client executable, which downloads data directly from the GDC servers given a manifest file. Due to the direct GDC support for this tool, which helps to standardize data transfer protocols from their servers, the autogdc package utilizes the gdc-client for constructing the large matrices that serve as the local backend for the different analyses. Additionally, the GDC has an API which is useful for gathering clinical annotations of the data, which is also wrapped within the autogdc package, therefore reducing much of the tedious data wrangling to match annotations and data. 2.2 TCGAbiolinks The most functionally similar package to autogdc is TCGAbiolinks. 2 However, TCGAbiolinks is implemented in R and does not include the sequence modeling tools for meta-analysis on the regulatory effect of DNA methylation on transcription. In contrast, autogdc is written for Python developers and contains tools for answering various bioinformatic questions, including sequence modeling and elastic net aging models. 3 Implementation The general architecture of the autogdc package is a ‘Dataset’ object, containing several methods to query, retrieve, and transform data from the GDC repository. This main object contains multiple data frames of genomic data, along with the corresponding annotation metadata for each sample. 
Additionally, in order to facilitate multi-omic studies, a ‘frame’ property can be called, which constructs a multi-indexed data frame of transcript and DNA methylation data, with information about DNA methylation loci, such as position relative to the transcription start site, beta value, and its associated transcript. This multi-indexed dataframe facilitates the rapid construction of summary statistics and tensors for building machine learning models to assess the epigenetic regulatory relationship with transcription. 3.1 DNA methylation and RNAseq data All data is gathered automatically via autogdc from the Genomic Data Commons using the GDC API or the gdc-client command line interface executable. The compressed text files are then combined into dataframes of either DNA methylation or RNA sequencing count values, alongside the corresponding metadata from GDC. Additionally, preprocessing steps can be given as parameters, such that imputation of missing values and normalization will be automatically performed on the resulting matrices. Studies on regulation of transcription by DNA methylation were restricted to paired samples (within patient and tissue) with both 450k chip DNA methylation and RNA sequencing data. These restrictions yield a DNA methylation data frame consisting of 9472 samples and 396065 features and an RNA sequencing data frame consisting of 9415 samples and 59016 features. These matrices were processed by autogdc with mean imputation and quantile normalization for preprocessing. 3.2 Feature metadata Infinium HumanMethylation450K loci metadata was retrieved from Illumina’s product file website and used to filter CpG sites via various genomic features. Additionally, BioMart 3 was used to annotate gene symbols for RNA sequencing data. 
3.3 Machine Learning Models A long short-term memory recurrent neural network 4 (tensorflow LSTM default implementation) with a latent state dimension of 32 units was used to encode the sequence of DNA methylation in order to predict RNA expression, as seen in Figure 8. Several other models can be easily constructed in the package, such as transformer models to encode both RNA and DNA methylation states and predict tissues, age, etc. 4 Example Case Studies and Discussion Several different autogdc case studies are presented in the jupyter notebook supplementary information, with a broader focus on the relation between transcription and DNA methylation. 4.1 Differential Gene Expression One of the most common tasks for a meta-analysis is the determination of a set of differentially expressed genes between two contrasted groups. The autogdc package provides a simple interface for performing differential gene expression (see the example cluster map of significantly altered genes in Figure 3) via the function study.ddx(contrast = "sample_type", method = "chdir"), which provides options for the method of analysis (characteristic direction 5 or DESeq2 6 ) or contrast variable. Simple gene set enrichments on top of the differential gene analysis are also provided via gseapy 7 , 8 . Download figure Open in new tab Figure 1. Summary Statistics of autogdc data Download figure Open in new tab Figure 2. DNA Methylation loci distribution Download figure Open in new tab Figure 3. DNA Methylation loci distribution 4.2 DNA methylation-RNA expression relationship Many studies have shown an inverse correlation between the average gene promoter DNA methylation and that gene’s level of mRNA expression 9 ; however, the study of genome-wide patterns has been less explored, most likely because of the lack of paired DNA and RNA datasets easily available to the public. 
AutoGDC enables this type of analysis to be done easily and quickly, as can be seen in the joint density plot of median promoter DNA methylation (loci within the 1500 base pairs upstream of the transcription start site) and gene expression for all genes ( Figure 4 ). Interestingly, however, when assessing the Pearson r correlation value between median DNA promoter methylation and RNA expression (natural logarithm of RNA sequencing counts) within each gene, the distribution of Pearson r correlation values contains a considerable number of positive correlations ( Figure 5 ). In other words, the paired data from the GDC can be used to determine genes which have ‘non-canonical’ correlations between DNA methylation and transcription (high methylation and high expression or low methylation and low expression). A quick analysis of DNA methylation loci within promoters and the corresponding gene expression level shows several genes which have ‘non-canonical’ correlations ( Figure 5 ). In addition to including functions to facilitate discovery with standard statistical techniques, autogdc contains several machine learning methods for studying the relationship between transcription and DNA methylation ( Figure 8 ). These methods can be used to determine more detailed descriptions of genome-wide regulatory mechanisms and their dependence on sequence information. Download figure Open in new tab Figure 4. DNAm vs. RNA exp. Joint kernel density Download figure Open in new tab Figure 5. Pearson r correlations of DNAm & RNA expression Download figure Open in new tab Figure 6. DNA Methylation & RNA expression Download figure Open in new tab Figure 7. DNAm predicts RNA exp via Lin. Reg. Download figure Open in new tab Figure 8. DNAm predicts RNA exp via LSTM 5 Conclusion The autogdc package improves developer workflows by integrating the querying, downloading, organization, and analysis functions of a bioinformatics study into one simple package. 
Furthermore, the inclusion of several machine learning tools for assessing the relationship between DNA methylation and transcription will facilitate research into epigenetic regulatory mechanisms. 5.1.1 Funding This work was supported by the National Institutes of Health (P20-GM103636 and P30-AG050911 to J.D.W.). 5.1.2 Contributions C.A.B. is the developer and writer; J.D.W. is the PI. Footnotes jonathan-wren{at}omrf.org References 1. ↵ Grossman , R. L. et al. Toward a shared vision for cancer genomic data . New England Journal of Medicine 375 , 1109 – 1112 ( 2016 ) OpenUrl CrossRef PubMed 2. ↵ Colaprico , A. et al. TCGAbiolinks: an R/Bioconductor package for integrative analysis of TCGA data . Nucleic acids research 44 , e71 – e71 ( 2016 ) OpenUrl CrossRef PubMed 3. ↵ Smedley , D. et al. BioMart–biological queries made easy . BMC genomics 10 , 1 – 12 ( 2009 ) OpenUrl CrossRef PubMed 4. ↵ Hochreiter , S. & Schmidhuber , J. Long short-term memory . Neural computation 9 , 1735 – 1780 ( 1997 ) OpenUrl CrossRef PubMed Web of Science 5. ↵ Clark , N. R. et al. The characteristic direction: a geometrical approach to identify differentially expressed genes . BMC bioinformatics 15 , 1 – 16 ( 2014 ) OpenUrl CrossRef PubMed 6. ↵ Love , M. I. , Huber , W. & Anders , S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2 . Genome biology 15 , 1 – 21 ( 2014 ) OpenUrl CrossRef PubMed 7. ↵ Subramanian , A. et al. Gene set enrichment analysis: a knowledge-based approach for interpreting genome-wide expression profiles . Proceedings of the National Academy of Sciences 102 , 15545 – 15550 ( 2005 ) OpenUrl Abstract / FREE Full Text 8. ↵ Fang , Z. GSEApy: gene set enrichment analysis in Python . GitHub https://github.com/zqfang/GSEApy ( 2020 ) 9. ↵ Compere , S. J. & Palmiter , R. D. DNA methylation controls the inducibility of the mouse metallothionein-I gene in lymphoid cells . 
Cell 25 , 233 – 240 ( 1981 ) OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted April 17, 2024. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following AutoGDC: A Python Package for DNA Methylation and Transcription Meta-Analyses Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share AutoGDC: A Python Package for DNA Methylation and Transcription Meta-Analyses Chase Alan Brown , Jonathan D. Wren bioRxiv 2024.04.14.589445; doi: https://doi.org/10.1101/2024.04.14.589445 Share This Article: Copy Citation Tools AutoGDC: A Python Package for DNA Methylation and Transcription Meta-Analyses Chase Alan Brown , Jonathan D. 
Wren bioRxiv 2024.04.14.589445; doi: https://doi.org/10.1101/2024.04.14.589445 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7644) Biochemistry (17728) Bioengineering (13916) Bioinformatics (42037) Biophysics (21488) Cancer Biology (18636) Cell Biology (25552) Clinical Trials (138) Developmental Biology (13401) Ecology (19940) Epidemiology (2067) Evolutionary Biology (24367) Genetics (15621) Genomics (22545) Immunology (17764) Microbiology (40475) Molecular Biology (17208) Neuroscience (88744) Paleontology (667) Pathology (2842) Pharmacology and Toxicology (4834) Physiology (7659) Plant Biology (15175) Scientific Communication and Education (2047) Synthetic Biology (4304) Systems Biology (9834) Zoology (2272)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.