XtractPAV: An Automated Pipeline for Identifying Presence–Absence Variations Across Multiple Genomes

doi:10.1101/2025.06.27.661953

XtractPAV: An Automated Pipeline for Identifying Presence–Absence Variations Across Multiple Genomes

2025 · doi:10.1101/2025.06.27.661953

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 21,176 characters · extracted from preprint-html · click to expand

XtractPAV: An Automated Pipeline for Identifying Presence–Absence Variations Across Multiple Genomes | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results XtractPAV: An Automated Pipeline for Identifying Presence–Absence Variations Across Multiple Genomes Rana Sheraz Ahmad , Muhammad Sadaqat , View ORCID Profile Muhammad Tahir ul Qamar doi: https://doi.org/10.1101/2025.06.27.661953 Rana Sheraz Ahmad 1 Integrative Omics and Molecular Modeling Laboratory, Department of Bioinformatics and Biotechnology, Government College University Faisalabad (GCUF) , Faisalabad, 38000, Pakistan Find this author on Google Scholar Find this 
author on PubMed Search for this author on this site Muhammad Sadaqat 2 UMR CNRS 6553 Ecosystèmes, Biodiversité, Evolution (ECOBIO), Université de Rennes 1 , Rennes, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Muhammad Tahir ul Qamar 1 Integrative Omics and Molecular Modeling Laboratory, Department of Bioinformatics and Biotechnology, Government College University Faisalabad (GCUF) , Faisalabad, 38000, Pakistan Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Muhammad Tahir ul Qamar For correspondence: tahirulqamar{at}gcuf.edu.pk Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Motivation Presence-absence variations (PAVs) significantly influence phenotypic diversity across and within species by modulating functional modules involved in stress responsiveness, adaptation, and developmental processes. This modulation ultimately contributes to genetic diversity at both inter- and intra-species levels. However, existing tools for detecting PAVs offer limitations in achieving optimal analysis because they lack scalable workflows for multi-genome comparisons and frequently necessitate manual integration. To address these challenges, we developed XtractPAV, an end-to-end pipeline that automates the extraction, annotation, and interactive visualization of PAVs across large-scale genomic datasets. Results XtractPAV was evaluated using assembled genomes of both eukaryotic and prokaryotic organisms, including Pyrus communis, Arabidopsis thaliana, Mus musculus , and Salmonella enterica , to assess its ability to detect the genomic variations across diverse species. The performance of XtractPAV was benchmarked against other established pipelines, demonstrating superior precision and a more comprehensive extraction of PAV segments. 
Notably, our pipeline not only identified the known PAVs from the reference set but also revealed novel variations in genes associated with various functions such as flowering time regulation and disease resistance. Furthermore, the automated report generation feature of XtractPAV produces publication-ready summaries of PAV distributions and related metrics. Availability XtractPAV is freely accessible at https://github.com/SherazAhmadd/XtractPAV and on the XtractPAV webpage. The package includes all requisite files, a user manual, test data, and a license permitting non-commercial use. Supplementary material Supplementary data are accessible online at Bioinformatics . 1. Introduction Presence-absence variations (PAVs) represent a significant class of genomic structural variations (SVs), characterized by the presence of specific genomic segments in some individuals and their absence in others. These regions of variation are often associated with key functions or traits, thereby facilitating an understanding of phenotypic differences across species ( Wang, et al., 2023 ). Thus, PAVs contribute to the sequence diversity among individuals, leading to intraspecific variability and interspecific genomic divergence ( Gerdol, et al., 2025 ). The identification of large SVs is considerably more complex than that of small-scale genomic variations ( Jiao, et al., 2025 ), which often leads to their being overlooked in genomics studies. Various tools have been developed for the detection of these large-scale SVs, particularly PAVs, such as scanPAV ( Giordano, et al., 2018 ) and ppsPCP ( Tahir Ul Qamar, et al., 2019 ). However, these tools impose strict limitations and are unable to provide comprehensive annotation of PAVs, including those intersecting coding regions. To address these limitations, this study introduces XtractPAV, a novel pipeline designed to efficiently and accurately trace PAVs. 
XtractPAV has been evaluated across five complex pear genomes, eighteen ecotypes of Arabidopsis, and six mouse genome assemblies to demonstrate its broad applicability and accuracy. 2. Materials and Methods To identify the genomic regions present in the query genome but absent in the reference genome, a comprehensive whole-genome comparison is first performed. The NUCmer program from the MUMmer4 ( Marçais, et al., 2018 ) is employed to compute the delta file of the alignment, which is subsequently analysed for coordinate information. In-house Python scripts are utilized to identify the unmapped regions in the genome, with the minimum length parameter adjusted by the user, and to retrieve the corresponding sequences. To address larger segments of ambiguous bases (N’s), PAVs are segmented into smaller fragments prior to the downstream filtering. To ensure the authenticity of the PAVs, all-versus-all alignment with the reference genome is performed using BLASTn ( Camacho, et al., 2009 ); fragments with no hits, less than 10% coverage, and under 5% sequence identity are considered authentic PAVs and retained for further analysis. Those PAVs located within the coding regions are designated as the genic PAVs. Additional in-house Python scripts are employed to refine the boundaries of genic-PAVs and annotate them using available annotation file of the query genome. In XtractPAV, we extend the PAV sequence when it intersects with the coding region to complete the gene sequence, subsequently storing this information in a FASTA file. However, the coordinates in the summary file retain the original PAV designation. Ultimately, the pipeline produces a comprehensive final analysis report along with various interactive plotting graphs ( Fig. 1 ). XtractPAV enables the simultaneous analysis of multiple query genomes against the reference genome in a single execution. 
When running XtractPAV, users can specify parameters such as minimum length, coverage, similarity, thread count, and the number of genomes for PAV extraction. Download figure Open in new tab Fig. 1. Workflow diagram of the XtractPAV pipeline. This pipeline accepts two genome assemblies in FASTA format, along with their corresponding annotation files in General Feature Format version 3 (GFF3), for the analysis of PAVs. 3. Results and Discussion XtractPAV was developed to accommodate both eukaryotic and prokaryotic organisms based on user requirements. The scripts for XtractPAV are written in Python and Shell scripting and are implemented in a Linux environment. It accepts genomic sequences in *.fa, *.fna, *.faa, and other *.fasta formats, alongside their annotation file in GFF3, which is recommended; however, *.gff and *.gtf formats are also acceptable. All PAVs are recorded in FASTA format, accompanied by their annotations in GFF3 format and a local HTML file for analysis reports and graphical representations. The performance of XtractPAV was evaluated for both eukaryotic and prokaryotic organisms. For a complex plant genome analysis, five Pyrus communis (pear) genomes were sourced from PGDB ( Chen, et al., 2023 ), with Dangshansuli serving as the reference and Bartlett, Cuiguan, Shanxiduli, and ZhongaiNO.1 as the query genomes, to investigate interspecies differences; this analysis revealed a significant number of insertions and deletions among the species. Furthermore, we selected 19 ecotypes of Arabidopsis from the 19 genomes of the Arabidopsis thaliana project (Gan, et al., 2011) to conduct PAV analysis, aiming to elucidate their genetic divergences and quantify the number of PAVs (Supplementary Table S1). 
The mouse ( Mus musculus ) is a widely utilized model organism for the investigation of human diseases and biological processes; however, the presence of multiple genome assemblies and substantial intraspecies genetic variation contributes to phenotypic diversity, which may influence empirical findings. To investigate SVs across various mouse assemblies, we conducted the PAVs analysis using XtractPAV on six mouse assemblies, AKR_J, 129S1_SvImJ, C3H_HeJ, C57BL_6NJ, and DBA_2J, compared to the reference genome GRCm39 ( mm39 ). To test the performance of XtractPAV on prokaryotic genomes, we selected 42 serovars of Salmonella enterica , which were previously employed to construct the pan-genome ( Jacobsen, et al., 2011 ). S. enterica typhimurium strain LT2 served as the reference, while the other 41 strains were utilized as target genomes. The results of the S. enterica PAV analysis are presented in Supplementary Table S2. 3.1 Benchmarking To assess the functional capabilities of XtractPAV, we conducted a comparative analysis with other PAV extraction pipelines, specifically ScanPAV and ppsPCP. While these tools can be effective in certain contexts, they also demonstrate limitations in managing multi-genomic data, annotating coding-regions, adjusting parameter flexibility, and accurately identifying PAVs. All tools were evaluated using the same dataset of P. communis (Pear) genomes to ensure consistent processing ( Table 1 ). The discrepancies in results when comparing different pipelines can be attributed to the differences in their stringent identification and filtering criteria. View this table: View inline View popup Download powerpoint Table 1. Number of presence-absence variations (PAVs) identified with different pipelines to assess the performance of XtractPAV The scanPAV pipeline is specifically designed for the extraction of PAVs. However, it presents several limitations. 
Notably, it cannot process multi-genomic inputs, and the size of the PAV is fixed at 1 kilobase (kb), whereas PAVs in plants have been documented to be a minimum of 100 base pairs (bp) ( Shen, et al., 2015 ). Because users cannot adjust the length of PAVs according to the specific genome and analysis requirements, scanPAV is predominantly suitable for mammalian genomes. Furthermore, scanPAV does not explicitly extract coding regions along with their annotations. Additionally, the pipeline has not incorporated validation based on coverage and similarity, which raises concerns regarding its precision. XtractPAV represents an advanced iteration of our previous pan-genome construction pipeline, ppsPCP (Tahir Ul Qamar, et al., 2019), and is specifically designed to identify the PAVs with improved features and performance. The ppsPCP tool was initially developed for constructing the pangenomes, but it is often utilized to screen PAVs ( Chen, et al., 2025 ; Lan, et al., 2024 ). However, it imposes a strict minimum length of 100 bp for PAVs, which cannot be modified by the user depending on specific genome and analysis needs. Additionally, ppsPCP is primarily tailored for pangenome construction, which contributes to longer processing times for PAV extraction, and it utilizes the primary PAV output only as an intermediary file; consequently, it provides neither the PAVs as an explicit output nor the annotation of genic PAVs. In contrast, XtractPAV addresses these limitations by offering a more comprehensive solution for PAV extraction. It accepts multi-genomic input, allows for fully customizable parameters, and provides direct outputs including PAVs in FASTA format, accompanied by an annotation GFF file, and a graphical report that summarizes the PAV distribution across genomes. 5. Conclusion The consideration of complete diversity and genetic composition, without accounting for PAV regions, presents an insufficient representation of an organism’s genetic architecture. 
The challenge of identifying large SVs remains significant. XtractPAV addresses this issue by facilitating the tracking of larger-scale insertions and deletions, particularly in PAV regions, across multiple genomes, utilizing customizable user-defined parameters. Furthermore, by quantifying both genic and non-genic PAVs, XtractPAV also offers a robust metric to evaluate genome assembly quality across various sequencing technologies. Funding Not applicable. Author contributions R.S.A.: logical programming, coding, code testing, visualization, and writing the manuscript. M.S.: code testing, validation, and editing of the manuscript. M.T.Q.: conception and study design, code testing, resources, supervision, and editing the manuscript. Supplementary data Supplementary data are available at Bioinformatics online. Conflict of interest None. Ethical Statement This study did not involve any experiments with animal subjects or human participants. The authors utilized ChatGPT v2 to enhance the clarity and readability of the code. All content generated through this service was carefully reviewed and edited by the authors, who take full responsibility for the final content of the publication. Acknowledgment Not applicable. Footnotes ranasheraz.202101902{at}gcuf.edu.pk (R.S.A.); tahirulqamar{at}gcuf.edu.pk (M.T.Q.) muhammad.sadaqat{at}univ-rennes.fr (M.S.) References 1. ↵ Camacho , C. , et al. BLAST+: architecture and applications . BMC bioinformatics 2009 ; 10 : 1 – 9 . OpenUrl CrossRef PubMed 2. ↵ Chen , C. , et al. A comprehensive omics resource and genetic tools for functional genomics research and genetic improvement of sorghum . Molecular Plant 2025 ; 18 ( 4 ): 703 – 719 . OpenUrl CrossRef PubMed 3. ↵ Chen , S. , et al. The pear genomics database (PGDB): a comprehensive multi-omics research platform for Pyrus spp . BMC Plant Biology 2023 ; 23 ( 1 ): 430 . OpenUrl CrossRef PubMed 4. Gan , X. , et al. Multiple reference genomes and transcriptomes for Arabidopsis thaliana . 
Nature 2011 ; 477 ( 7365 ): 419 – 423 . OpenUrl CrossRef PubMed Web of Science 5. ↵ Gerdol , M. , et al. The mytilin gene cluster: Shedding light on the enigmatic origin of mussel dispensable genes . Fish & Shellfish Immunology 2025 ; 161 : 110286 . OpenUrl CrossRef PubMed 6. ↵ Giordano , F. , et al. scanPAV: a pipeline for extracting presence–absence variations in genome pairs . Bioinformatics 2018 ; 34 ( 17 ): 3022 – 3024 . OpenUrl CrossRef PubMed 7. ↵ Jacobsen , A. , et al. The Salmonella enterica pan-genome . Microbial ecology 2011 ; 62 : 487 – 504 . OpenUrl CrossRef PubMed Web of Science 8. ↵ Jiao , C. , et al. Pan-genome bridges wheat structural variations with habitat and breeding . Nature 2025 ; 637 ( 8045 ): 384 – 393 . OpenUrl CrossRef PubMed 9. ↵ Lan , D. , et al. Pangenome and multi-tissue gene atlas provide new insights into the domestication and highland adaptation of yaks . Journal of animal science and biotechnology 2024 ; 15 ( 1 ): 64 . OpenUrl CrossRef 10. ↵ Marçais , G. , et al. MUMmer4: A fast and versatile genome alignment system . PLoS computational biology 2018 ; 14 ( 1 ): e1005944 . OpenUrl CrossRef 11. ↵ Shen , X. , et al. PAV markers in Sorghum bicolour: genome pattern, affected genes and pathways, and genetic linkage map construction . Theoretical and Applied Genetics 2015 ; 128 : 623 – 637 . OpenUrl CrossRef PubMed 12. ↵ Tahir Ul Qamar , M. , et al. ppsPCP: a plant presence/absence variants scanner and pan-genome construction pipeline . Bioinformatics 2019 ; 35 ( 20 ): 4156 – 4158 . OpenUrl CrossRef PubMed 13. ↵ Wang , D. , et al. Two complementary genes in a presence-absence variation contribute to indica-japonica reproductive isolation in rice . Nature communications 2023 ; 14 ( 1 ): 4531 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted July 02, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. 
NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following XtractPAV: An Automated Pipeline for Identifying Presence–Absence Variations Across Multiple Genomes Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share XtractPAV: An Automated Pipeline for Identifying Presence–Absence Variations Across Multiple Genomes Rana Sheraz Ahmad , Muhammad Sadaqat , Muhammad Tahir ul Qamar bioRxiv 2025.06.27.661953; doi: https://doi.org/10.1101/2025.06.27.661953 Share This Article: Copy Citation Tools XtractPAV: An Automated Pipeline for Identifying Presence–Absence Variations Across Multiple Genomes Rana Sheraz Ahmad , Muhammad Sadaqat , Muhammad Tahir ul Qamar bioRxiv 2025.06.27.661953; doi: https://doi.org/10.1101/2025.06.27.661953 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41910) Biophysics (21436) Cancer Biology (18576) Cell Biology (25480) Clinical Trials (138) Developmental Biology (13368) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40360) Molecular Biology (17163) Neuroscience (88534) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems 
Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00