peakScout – a user-friendly and reversible peak-to-gene translator for genomic peak calling results

doi:10.1101/2025.09.07.671934

peakScout – a user-friendly and reversible peak-to-gene translator for genomic peak calling results

2025 · doi:10.1101/2025.09.07.671934

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 24,270 characters · extracted from preprint-html · click to expand

peakScout – a user-friendly and reversible peak-to-gene translator for genomic peak calling results | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results peakScout – a user-friendly and reversible peak-to-gene translator for genomic peak calling results View ORCID Profile Alexander L. Lin , View ORCID Profile Lana A. Cartailler , View ORCID Profile Jean-Philippe Cartailler doi: https://doi.org/10.1101/2025.09.07.671934 Alexander L. 
Lin 1 Creative Data Solutions, Vanderbilt University , Nashville TN 37027 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alexander L. Lin Lana A. Cartailler 1 Creative Data Solutions, Vanderbilt University , Nashville TN 37027 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lana A. Cartailler Jean-Philippe Cartailler 1 Creative Data Solutions, Vanderbilt University , Nashville TN 37027 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jean-Philippe Cartailler For correspondence: jp.cartailler{at}vanderbilt.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Summary peakScout is a command line and web-based bioinformatics tool designed to quickly and easily bridge the gap between genomic peak data and gene annotations, enabling researchers to understand the relationship between measurements of regulatory elements and their target genes. At its core, peakScout processes genomic peak files obtained through various means of chromatin profiling and maps them to nearby genes using reference genome annotations. The workflow begins with input processing, where peak files are standardized and reference GTF files are decomposed into chromosome-specific feature collections. The core analysis modules then perform bidirectional mapping: peak-to-gene identifies which genes are potentially regulated by specific genomic regions, while gene-to-peak reveals which regulatory elements might influence particular genes of interest. Throughout this process, nearest-feature detection algorithms handle the complex spatial relationships between genomic elements, considering factors like distance constraints and feature overlaps.
Finally, the results are formatted into researcher-friendly CSV and Excel outputs, providing a comprehensive view of the genomic landscape that connects regulatory elements to their potential gene targets. Availability and implementation The web version of peakScout is available at https://vandydata.github.io/peakScout/ . The command line version is available at https://github.com/vandydata/peakScout and archived on Zenodo (URL to be provided upon version 1.0 release) under the GNU Affero General Public License v3.0. Installation instructions, example datasets, and detailed usage examples are provided in the GitHub repository README file. peakScout is implemented in Python and is platform independent, but the web version is implemented in Amazon Web Services and thus uses proprietary infrastructure. 1 Introduction High-throughput genomic techniques have revolutionized our understanding of protein-DNA interactions, chromatin modifications, and gene regulation. Methods such as ChIP-seq, CUT&RUN, CUT&TAG, ATAC-seq, DNase-seq, and FAIRE-seq typically generate thousands of genomic regions of interest (peaks) that represent binding sites, accessible chromatin regions, or enriched chromatin marks. However, translating these genomic coordinates into biologically meaningful insights remains challenging, particularly for researchers without extensive bioinformatics expertise. The critical step of associating genomic peaks with nearby genes is essential for understanding regulatory networks and interpreting experimental results. While several sophisticated tools exist for genomic data analysis, they often require significant computational skills, command-line proficiency, and parameter optimization. BEDTools ( Quinlan and Hall, 2010 ) offers comprehensive functionality for manipulating genomic intervals but demands familiarity with command-line operations and complex parameters. Other tools like HOMER ( Heinz et al ., 2010 ), GREAT ( McLean et al ., 2010 ),
and ChIPseeker ( Yu et al ., 2015 ) provide peak annotation capabilities but either require programming knowledge or offer limited flexibility in defining proximity relationships. peakScout addresses this gap by providing a straightforward, accessible solution for bidirectional mapping between genomic peaks and genes. Unlike existing tools that may overwhelm users with complex options or require extensive bioinformatics training, peakScout offers an intuitive approach that accepts standard output from popular peak callers like MACS2 ( Zhang et al ., 2008 ) and SEACR ( Meers et al ., 2019 ) and produces researcher-friendly results in familiar formats (CSV, Excel). The tool supports both peak-to-gene and gene-to-peak analyses, allowing researchers to either identify potential target genes of regulatory elements or find regulatory elements that might influence specific genes of interest. By simplifying this critical analytical step, peakScout enables bench scientists and bioinformatics novices to quickly interpret their genomic data without extensive computational training. The tool’s straightforward implementation and accessible output formats facilitate rapid integration with downstream analyses and visualization tools, accelerating the path from raw peak data to biological insights. 2 Language and algorithmic approach peakScout is implemented in Python, leveraging the high-performance Polars data processing library for efficient manipulation of tabular genomic data. This choice of technology enables rapid processing of large genomic datasets while maintaining a user-friendly interface. The core algorithmic approach in peakScout centers around efficient chromosome-specific feature decomposition and nearest-feature detection. For reference data management, peakScout employs a hierarchical decomposition strategy that organizes genomic features by type (gene, exon, CDS, etc.)
and chromosome. This approach, implemented in decompose_ref.py , significantly reduces the search space when identifying nearest features, as only features on the same chromosome need to be considered. Each feature collection is further organized into separate files sorted by start and end positions, enabling efficient binary search operations when determining proximity relationships. The nearest-feature detection algorithm employs a bidirectional search strategy that simultaneously evaluates upstream and downstream features while accounting for potential overlaps. This approach is particularly efficient as it leverages the pre-sorted nature of both the decomposed reference data and the input features, allowing for rapid identification of nearest features even for datasets containing thousands of peaks and reference features. 2.1 Supported operations Figure 1 illustrates the general workflow of peakScout. Prior to conducting peak or gene analyses, peakScout implements a decomposition operation that preprocesses reference annotation files – such as GTFs – into a format optimized for downstream analysis. This preprocessing step significantly improves the efficiency of nearest feature identification by partitioning the reference data based on both chromosome and feature type. This decomposition is a one-time operation per reference file, and the resulting data are stored in a user-defined directory for future analyses. Download figure Open in new tab Figure 1. peakScout analysis workflow. The peakScout tool employs a two-step process for bidirectional mapping between genomic peaks and genes. (a) Reference preparation: GTF annotation files containing genomic features (start, end, metadata) are decomposed into chromosome- and feature-specific collections to optimize search efficiency.
(b) Peak-to-gene analysis: Input peak files are processed alongside the decomposed reference to identify the k-nearest genes for each peak, generating a comprehensive output table with peak names, associated genes, and distance measurements. (c) Gene-to-peak analysis: A user-provided gene list is analyzed against the peak file and decomposed reference to identify the k-nearest peaks for each gene of interest, producing an output table with gene names, associated peaks, and distance measurements. Both analytical pathways support flexible distance constraints and multiple output formats (CSV, Excel). Following the decomposition step, users may proceed with one of two analytical functions: peak2gene or gene2peak . The peak2gene function identifies the nearest annotated genes from the reference dataset to a user-defined set of genomic peaks. Conversely, the gene2peak function determines the nearest peaks – also provided by the user – relative to a specified list of genes. Both functions provide comprehensive options for defining peak boundaries and proximity constraints. Users can choose between native peak boundaries (as defined by the peak caller), peak summits, or artificial boundaries with user-defined extensions. Additionally, users can specify maximum distance constraints for upstream and downstream features, ensuring that only biologically relevant associations are reported. For input flexibility, peakScout supports multiple peak file formats through specialized processing functions. Internally defined operations automatically detect and handle MACS2 output (both Excel and BED formats) and SEACR output, standardizing these diverse formats into a consistent internal representation. BED6 is also supported, which provides wide support for most peak-calling outputs. This enables seamless integration with various peak calling workflows without requiring users to perform format conversions. 
When users want to obtain a list of peaks nearest to their genes of interest, a single-column gene list is all that is required. To facilitate ease of use, peakScout includes pre-generated reference files for commonly studied organisms: C. elegans (WBcel235), Drosophila (BDGP6.54), zebrafish (GRCz11), S. cerevisiae (R64-1-1), X. tropicalis (v10.1), mouse (mm39 and mm10), human (hg38 and hg19), and pig (Sscrofa11.1). The reference GTF files were downloaded from Ensembl ( Dyer et al ., 2025 ) and Gencode ( Mudge et al ., 2025 ), and the resulting decomposed references are available via a public S3 bucket, eliminating the need for users to manually process GTF files for these organisms. Instructions are provided as to how to download and use these pre-generated references. 2.4 Feature detection and proximity analysis The core of peakScout’s analytical capability lies in its feature detection and proximity analysis algorithms. Unlike some tools that simply report the nearest TSS for each peak, peakScout implements a more sophisticated approach that considers the full genomic context. The get_nearest_features function in process_features.py employs a bidirectional search strategy that simultaneously evaluates upstream and downstream features. This approach allows peakScout to identify the k-nearest features in both directions, providing a more comprehensive view of the genomic neighborhood around each peak. The function also handles feature overlaps, ensuring that genes directly overlapping with peaks are prioritized in the results. For proximity constraints, peakScout allows users to specify maximum distance thresholds for upstream and downstream features through the up_bound and down_bound parameters. This functionality enables researchers to focus on biologically relevant associations based on their understanding of regulatory element behavior in their specific experimental context.
The constrain_features function further optimizes the search process by filtering reference features based on these distance constraints before performing the detailed proximity analysis. Through binary search operations, it also ensures that, after accounting for potential overlaps, searches for upstream and downstream nearest features begin at the closest possible location. These pre-filtering steps significantly reduce the computational burden for large datasets, enabling rapid analysis even on standard desktop computers. 2.5 Output generation and visualization peakScout provides flexible output options through its write_output.py module. Results can be exported in both CSV and Excel formats, with the Excel output including additional formatting for improved readability. As shown in Figure 2 , the output includes comprehensive information about each peak or gene, including 1) genomic coordinates (chromosome, start, end), 2) feature identifiers (peak name or gene name), 3) k-nearest features (as specified by the user), 4) distance to each nearest feature (with negative values indicating upstream features), and 5) a pre-formed URL to the UCSC genome browser (for supported genomes) that includes a “highlight” of their peak in a rich and interactive genomic context. This rich output format enables researchers to quickly identify potential regulatory relationships and prioritize candidates for further experimental validation. Download figure Open in new tab Figure 2. peakScout in practice. A) Users and their peak data can interact with peakScout either via the web version of peakScout at https://vandydata.github.io/peakScout/ or B) via the command line. The user will upload a peak file and select basic parameter settings. The result is provided as tabular data, which includes a gene name and genomic distance from the peak, for as many nearby genes selected by the user. For example, in C), the user would have selected 2 nearby genes.
For the first peak on chr1:4344147-4344187, the nearest gene is Rp1 (retinitis pigmentosa 1), which is essential for retinal photoreceptor cell function. The second nearest gene is Gm37483. To help with accessing additional gene annotations in this genomic context, a link to the relevant region at the UCSC genome browser is provided, which provides a context specific “highlight” (as per UCSC specifications) of the peak, as shown in D. In this case, the peak is present in the promoter region of Rp1 and most likely plays a role in transcriptional control. 2.6 Interfaces For use as imported functions or at the command line, peakScout can be installed as per the instructions on the GitHub website and dependencies installed using pip or uv, commonly used package managers. 2.6.1. Use as a python library To use peakScout functions within a typical Python script, one can import the peakScout functions ( peak2gene, gene2peak , or decompose_ref ) and use as desired in their program or workflow. 2.6.2. Use via a website and serverless cloud computing We have also deployed peakScout as a serverless application on Amazon Web Services (AWS), where the analysis environment is containerized and executed via AWS Lambda. Pre-decomposed reference genomes are stored in Amazon S3, retrieved on demand, and decompressed within the execution environment, eliminating the need for persistent infrastructure while supporting multiple genome assemblies. As shown in Figure 2 , users interact through a lightweight web client to submit peak files (e.g., BED, narrowPeak) and analysis parameters directly to the AWS Lambda endpoint. Input files are processed against the selected reference genome, and results are returned as compressed artifacts for immediate preview or download. All intermediate files are discarded after execution, ensuring a stateless and reproducible workflow.
This cloud-based design enables analyses to complete within seconds of file submission, providing a technically robust yet accessible platform where users simply upload a file, wait several seconds and then retrieve results. The web version of peakScout is available at https://vandydata.github.io/peakScout/ . 2.6.3. Use as a command line tool Several examples for peakScout utilization at the command line are provided in the GitHub README file. In short, one would simply type peakScout , followed by the desired function ( peak2gene, gene2peak , or decompose_ref ), and finally the required and optional parameters for that analysis. Execution is typically less than 10 seconds when run locally for a peak file containing 50000 peaks. Example output is shown in Figure 2C . Conclusion While originally designed for CUT&RUN, CUT&TAG and ChIP-seq, peakScout’s flexible input requirements and support for popular peak caller outputs (MACS2, SEACR) make it broadly applicable to other genomic methods that generate coordinate-based regions of interest, including ATAC-seq, DNase-seq, and FAIRE-seq experiments. The lack of advanced parameters is intentional to make things easy and accessible for all non-technical users, or those who desire a first pass at results before engaging in deep bioinformatics analysis. Conflict of interest None declared. Data Availability We have made available pre-processed gene annotation references for common organisms, publicly available and cloud-ready at s3://cds-peakscout-public/. Example peakScout outputs are available as supplementary data 1 (XLSX) and supplementary data 2 (CSV). Acknowledgements None at this time. Footnotes https://github.com/vandydata/peakScout https://vandydata.github.io/peakScout/ References ↵ Dyer , S.C. et al. ( 2025 ) Ensembl 2025 . Nucleic Acids Res , 53 , D948 – D957 . OpenUrl CrossRef PubMed ↵ Heinz , S. et al.
( 2010 ) Simple combinations of lineage-determining transcription factors prime cis-regulatory elements required for macrophage and B cell identities . Mol Cell , 38 , 576 – 589 . OpenUrl CrossRef PubMed Web of Science McLean , C.Y. et al. ( 2010 ) GREAT improves functional interpretation of cis-regulatory regions . Nat Biotechnol , 28 , 495 – 501 . ↵ Meers , M.P. et al. ( 2019 ) Peak calling by Sparse Enrichment Analysis for CUT&RUN chromatin profiling . Epigenetics & Chromatin , 12 , 42 . OpenUrl CrossRef PubMed ↵ Mudge , J.M. et al. ( 2025 ) GENCODE 2025: reference gene annotation for human and mouse . Nucleic Acids Res , 53 , D966 – D975 . OpenUrl CrossRef PubMed ↵ Quinlan , A.R. and Hall , I.M. ( 2010 ) BEDTools: a flexible suite of utilities for comparing genomic features . Bioinformatics , 26 , 841 – 842 . OpenUrl CrossRef PubMed Web of Science ↵ Yu , G. et al. ( 2015 ) ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization . Bioinformatics , 31 , 2382 – 2383 . OpenUrl CrossRef PubMed ↵ Zhang , Y. et al. ( 2008 ) Model-based Analysis of ChIP-Seq (MACS) . Genome Biology , 9 , R137 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted September 08, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following peakScout – a user-friendly and reversible peak-to-gene translator for genomic peak calling results Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions.
Share peakScout – a user-friendly and reversible peak-to-gene translator for genomic peak calling results Alexander L. Lin , Lana A. Cartailler , Jean-Philippe Cartailler bioRxiv 2025.09.07.671934; doi: https://doi.org/10.1101/2025.09.07.671934 Share This Article: Copy Citation Tools peakScout – a user-friendly and reversible peak-to-gene translator for genomic peak calling results Alexander L. Lin , Lana A. Cartailler , Jean-Philippe Cartailler bioRxiv 2025.09.07.671934; doi: https://doi.org/10.1101/2025.09.07.671934 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7642) Biochemistry (17715) Bioengineering (13907) Bioinformatics (42005) Biophysics (21472) Cancer Biology (18624) Cell Biology (25534) Clinical Trials (138) Developmental Biology (13391) Ecology (19935) Epidemiology (2067) Evolutionary Biology (24356) Genetics (15617) Genomics (22529) Immunology (17753) Microbiology (40437) Molecular Biology (17200) Neuroscience (88697) Paleontology (667) Pathology (2840) Pharmacology and Toxicology (4829) Physiology (7653) Plant Biology (15171) Scientific Communication and Education (2046) Synthetic Biology (4304) Systems Biology (9827) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00