Full text
22,310 characters
· extracted from
preprint-html
· click to expand
PyOrthoANI, PyFastANI, and Pyskani: a suite of Python libraries for computation of average nucleotide identity | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results PyOrthoANI, PyFastANI, and Pyskani: a suite of Python libraries for computation of average nucleotide identity Martin Larralde , View ORCID Profile Georg Zeller , View ORCID Profile Laura M. 
Carroll doi: https://doi.org/10.1101/2025.02.13.638148 Martin Larralde 1 Structural and Computational Biology Unit , EMBL, 69117 Heidelberg, Germany 2 Leiden University Center for Infectious Diseases (LUCID), Leiden University Medical Center , 2333ZA Leiden, Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: martin.larralde{at}embl.de laura.carroll{at}umu.se Georg Zeller 1 Structural and Computational Biology Unit , EMBL, 69117 Heidelberg, Germany 2 Leiden University Center for Infectious Diseases (LUCID), Leiden University Medical Center , 2333ZA Leiden, Netherlands 3 Center for Microbiome Analyses and Therapeutics, Leiden University Medical Center , 2333ZA Leiden, Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Georg Zeller Laura M. Carroll 4 Department of Clinical Microbiology, SciLifeLab, Umeå University , 90187 Umeå, Sweden 5 Laboratory for Molecular Infection Medicine Sweden (MIMS), Umeå University , 90187 Umeå, Sweden 6 Umeå Centre for Microbial Research (UCMR), Umeå University , 90187 Umeå, Sweden 7 Integrated Science Lab (IceLab), Umeå University , 90187 Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Laura M. Carroll For correspondence: martin.larralde{at}embl.de laura.carroll{at}umu.se Abstract Full Text Info/History Metrics Supplementary material Preview PDF ABSTRACT Summary The average nucleotide identity (ANI) metric has become the gold standard for prokaryotic species delineation in the genomics era. The most popular ANI algorithms are available as command-line tools and/or web applications, making it inconvenient or impossible to incorporate them into bioinformatic workflows, which utilize the popular Python programming language. 
Here, we present PyOrthoANI, PyFastANI, and Pyskani, Python libraries for three popular ANI computation methods. ANI values produced by PyOrthoANI, PyFastANI, and Pyskani are virtually identical to those produced by OrthoANI, FastANI, and skani, respectively. All three libraries integrate seamlessly with BioPython, making it easy and convenient to use, compare, and benchmark popular ANI algorithms within Python-based workflows. Availability and Implementation Source code is open-source and available via GitHub (PyOrthoANI, https://github.com/althonos/orthoani ; PyFastANI, https://github.com/althonos/pyfastani ; Pyskani, https://github.com/althonos/pyskani ). Supplementary Information Supplementary data are available on bioRxiv . Introduction The average nucleotide identity (ANI) metric of genomic similarity is arguably the most popular method for prokaryotic species delineation in the genomics era ( Jain et al ., 2018 ; Richter and Rosselló-Móra, 2009 ). The calculation of ANI values shared between two genomes is a crucial step in many bioinformatic pipelines, including popular methods/workflows for prokaryotic species identification ( Parks et al ., 2018 ; Chaumeil et al ., 2019 ), within-species lineage/strain delineation ( Rodriguez-R et al ., 2024 ; Raghuram et al ., 2024 ), and general prokaryotic (meta)genomic data analysis ( Olm et al ., 2017 ; Petit and Read, 2020 ). While numerous ANI algorithm implementations have been developed, nucleotide BLAST-based ANI (ANIb) algorithms are considered to be the gold standard ( Jain et al ., 2018 ; Shaw and Yu, 2023 ). ANIb algorithms are accurate in the sense that they share a strong correlation with experimentally determined DNA-DNA hybridization (DDH) values ( Konstantinidis and Tiedje, 2005 ; Goris et al ., 2007 ; Lee et al ., 2016 ; Richter and Rosselló-Móra, 2009 ). 
However, due to the high time complexity of BLAST and similar alignment-based algorithms, ANIb algorithms are notoriously slow ( Jain et al ., 2018 ) and thus most appropriate for users with smaller datasets (e.g., up to ≈$10^{3}$ genomes/$10^{6}$ pairwise comparisons), who prioritize accuracy over speed. To overcome the computational limitations of ANIb, alignment-free ANI algorithms have been developed, most notably FastANI ( Jain et al ., 2018 ) and skani ( Shaw and Yu, 2023 ). Both FastANI and skani forgo some accuracy in favor of speed (i.e., they produce ANI values that correlate with, but are not necessarily equivalent to, ANIb), and as such, they can readily scale to massive genomic datasets (e.g., ≥$10^{4}$ genomes/$10^{8}$ pairwise comparisons) ( Jain et al ., 2018 ; Shaw and Yu, 2023 ). However, identifying the optimal alignment-free ANI algorithm for a given dataset is not always straightforward. FastANI is ≥50x faster than ANIb methods and is more accurate than skani on reference-quality genomes ( Shaw and Yu, 2023 ; Jain et al ., 2018 ). skani, on the other hand, is >20x faster than FastANI and is more accurate on fragmented, incomplete metagenome-assembled genomes (MAGs) ( Shaw and Yu, 2023 ). Thus, in addition to considering dataset size and algorithm speed-accuracy tradeoff, users may want to consider dataset composition (e.g., isolate genomes versus MAGs) and quality when selecting the optimal ANI algorithm for their dataset. Regardless of whether they prioritize accuracy or speed, the most popular ANI algorithms/methods (e.g., FastANI, skani, ANI by Orthology [OrthoANI], JSpeciesWS, PyANI) are available as command-line tools and/or web applications ( Jain et al ., 2018 ; Shaw and Yu, 2023 ; Lee et al ., 2016 ; Richter et al ., 2016 ; Pritchard et al ., 2015 ). 
This makes it inconvenient—and sometimes impossible—for bioinformaticians to incorporate ANI algorithms into bioinformatic workflows that utilize the popular Python programming language ( Van Rossum and Drake, 2009 ). Here, we present a suite of Python libraries for popular ANI algorithms, specifically: (i) PyOrthoANI, a Python-based implementation of the OrthoANI algorithm (a highly accurate ANIb method) ( Lee et al ., 2016 ); (ii) PyFastANI and (iii) Pyskani, Python bindings for the FastANI and skani algorithms, respectively (fast, alignment-free methods) ( Jain et al ., 2018 ; Shaw and Yu, 2023 ). Each Python library integrates seamlessly with BioPython ( Cock et al ., 2009 ), making it simple and convenient to perform ANI computations within Python-based bioinformatic workflows, software programs, and notebooks (e.g., Jupyter) ( Kluyver et al ., 2016 ). By providing a unified Python interface, our suite allows users to easily swap out different ANI algorithms, making it simple and convenient to test, compare, and benchmark methods. Implementation The PyOrthoANI algorithm ( https://github.com/althonos/orthoani ) was implemented in the same manner as the original OrthoANI Java implementation ( Lee et al ., 2016 ). Briefly, to calculate ANI values between a query and reference genome, both genomes are partitioned into 1,020 bp-long fragments. Fragments that are 80% ambiguous (N) nucleotides are discarded. Nucleotide BLAST (blastn) ( Camacho et al ., 2009 ) values are then calculated between the set of query and reference genome fragments using the following blastn parameters (all other parameters are set to their respective defaults): -evalue 1e-15, -xdrop_gap 150, -dust no, -penalty -1, -reward 1, -num_alignments 1, -outfmt ‘6 qseqid sseqid length pident’. The resulting fragments are considered to be orthologous if they produce reciprocal best hits that cover at least 35% of the total length of the fragment. 
Final ANI values are calculated by averaging the nucleotide identity values for all reciprocal blastn hits. For PyFastANI ( https://github.com/althonos/pyfastani ), the original FastANI code (written in C++) ( Jain et al ., 2018 ) was wrapped into a Python extension module using the Cython language (v3.0) ( Behnel et al ., 2011 ). While PyFastANI uses the original FastANI code for hashing and core-genome identity computations, we reimplemented the sketching to support passing plain Python strings as input sequences. In addition, we implemented serialization/deserialization support to allow querying a reference database several times. To speed up the querying of individual sequences, we parallelized the fragment sketching step using Python thread pools and re-entrant code. For Pyskani ( https://github.com/althonos/pyskani ), the original skani code (written in Rust) ( Shaw and Yu, 2023 ) was wrapped into a Python extension module using the PyO3 library (v0.22.5; https://pyo3.rs ) for bindings generation. To accelerate querying, we implemented a more generic strategy for the storage of reference markers, allowing the markers to be either loaded from a file iteratively (as in the original skani) or pre-loaded into memory to reduce I/O costs for successive querying. Results Using each of the five data sets used to validate and benchmark FastANI ( n = 14,952 total [meta]genomes) ( Jain et al ., 2018 ), we compared ANI values produced by PyOrthoANI, PyFastANI, and Pyskani to those produced by OrthoANI, FastANI, and skani, respectively. We additionally benchmarked the speed of all six methods on each genome individually using 1, 8, and/or 16 CPUs in triplicate ( n = 717,651 total ANI computations; Figure 1 , Supplementary Figures S1-S7, Supplementary Tables S1-S7, Supplementary Text). Download figure Open in new tab Figure 1. 
(A-C) Correlation between average nucleotide identity (ANI) values produced by (A) PyOrthoANI, (B) PyFastANI, and (C) Pyskani (Y-axes) with ANI values produced by OrthoANI, FastANI, and skani, respectively (X-axes) for genomes used in the FastANI validation/benchmarking data sets (black dots; Supplementary Tables S1-S6). Dashed lines denote the best-fitting linear model for each method pair, with adjusted $R^{2}$ and $P$-values reported in the upper left corner of each subplot. Pyskani values were multiplied by 100. (D-F) Per-genome real (wall clock) time in seconds (Y-axes, log-scale) for (D) OrthoANI/PyOrthoANI, (E) FastANI/PyFastANI, and (F) skani/Pyskani (X-axes), using 1, 8, and/or 16 CPUs on the FastANI validation/benchmarking data sets (violin plots; Supplementary Table S7). For fairness, PyFastANI and Pyskani times include the time it took to load Python modules and parse genomes using BioPython (performed for every genome/computation). For extended versions of this figure, see Supplementary Figures S1-S7. Raw data used to construct all plots are available in Supplementary Tables S1-S7. ANI values calculated by PyOrthoANI, PyFastANI, and Pyskani were virtually identical to those produced by OrthoANI, FastANI, and skani, respectively (adjusted $R^{2}$ > 0.999 and $P$ < 2.2e-16 for all methods; Figure 1a-c ). Compared to OrthoANI, PyOrthoANI was, on average, 3x faster per genome ( Figure 1d ). PyFastANI and Pyskani performed similarly to FastANI and skani, respectively, even when Python module load times and genome parsing (via BioPython) were included in the PyFastANI/Pyskani runtime; however, differences in FastANI/PyFastANI and skani/Pyskani runtime and memory usage varied by dataset ( Figure 1e-f , Supplementary Figures S1-S7). Overall, PyOrthoANI, PyFastANI, and Pyskani enable users to perform ANI computations within Python-based software, workflows, and notebooks. 
Because each Python library integrates with BioPython and is easily interchangeable, we anticipate that our Python suite will be particularly useful for comparing/benchmarking ANI algorithms, and for developers/users who frequently encounter highly heterogeneous datasets (e.g., genomic datasets varying in size, quality, and isolate/MAG composition) which require flexibility in ANI computation algorithms. Conflict of interest The authors declare no conflicts of interest. Funding This work was supported by the SciLifeLab & Wallenberg Data Driven Life Science Program [grant number KAW 2020.0239 to L.M.C.], the Swedish Research Council [grant number 2023-05212 to L.M.C.], the European Molecular Biology Laboratory (EMBL); the SFB 1371 of the German Research Foundation (Deutsche Forschungsgemeinschaft, DFG) [395357507 to G.Z.], and a LUMC Fellowship [to G.Z.]. Data availability PyOrthoANI, PyFastANI, and Pyskani are available: (i) as part of the Python Package Index (PyPI) repository under the open-source MIT license at https://pypi.org/project/orthoani/ , https://pypi.org/project/pyfastani/ , and https://pypi.org/project/pyskani/ , respectively; (ii) via GitHub (source code) at https://github.com/althonos/orthoani , https://github.com/althonos/pyfastani , and https://github.com/althonos/pyskani , respectively; (iii) as Singularity containers (used for benchmarking) at https://cloud.sylabs.io/library/lmc297/pyorthoani/pyorthoani , https://cloud.sylabs.io/library/lmc297/pyfastani/pyfastani , and https://cloud.sylabs.io/library/lmc297/pyskani/pyskani , respectively. Acknowledgements This research was conducted using the resources of High Performance Computing Center North (HPC2N; Umeå University, Umeå, Sweden). References ↵ Behnel , S. et al. ( 2011 ) Cython: The Best of Both Worlds . Computing in Science & Engineering , 13 , 31 – 39 . OpenUrl ↵ Camacho , C. et al. ( 2009 ) BLAST+: architecture and applications . BMC Bioinformatics , 10 , 1 – 9 . 
OpenUrl CrossRef PubMed ↵ Chaumeil , P.-A. et al. ( 2019 ) GTDB-Tk: a toolkit to classify genomes with the Genome Taxonomy Database . Bioinformatics , 36 , 1925 – 1927 . OpenUrl CrossRef PubMed ↵ Cock , P.J.A. et al. ( 2009 ) Biopython: freely available Python tools for computational molecular biology and bioinformatics . Bioinformatics , 25 , 1422 – 1423 . OpenUrl CrossRef PubMed Web of Science ↵ Goris , J. et al. ( 2007 ) DNA-DNA hybridization values and their relationship to whole-genome sequence similarities . Int J Syst Evol Microbiol , 57 , 81 – 91 . OpenUrl CrossRef PubMed Web of Science ↵ Jain , C. et al. ( 2018 ) High throughput ANI analysis of 90K prokaryotic genomes reveals clear species boundaries . Nat. Commun ., 9 , 1 – 8 . OpenUrl CrossRef PubMed ↵ Kluyver , T. et al. ( 2016 ) Jupyter Notebooks – a publishing format for reproducible computational workflows . In, Positioning and Power in Academic Publishing: Players, Agents and Agendas . IOS Press , pp. 87 – 90 . ↵ Konstantinidis , K.T. and Tiedje , J.M. ( 2005 ) Genomic insights that advance the species definition for prokaryotes . Proc Natl Acad Sci U S A , 102 , 2567 – 2572 . OpenUrl Abstract / FREE Full Text ↵ Lee , I. et al. ( 2016 ) OrthoANI: An improved algorithm and software for calculating average nucleotide identity . Int. J. Syst. Evol. Microbiol ., 66 , 1100 – 1103 . OpenUrl CrossRef PubMed ↵ Olm , M.R. et al. ( 2017 ) dRep: a tool for fast and accurate genomic comparisons that enables improved genome recovery from metagenomes through de-replication . The ISME Journal , 11 , 2864 . OpenUrl CrossRef PubMed ↵ Parks , D.H. et al. ( 2018 ) A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life . Nature Biotechnology , 36 , 996 – 1004 . OpenUrl CrossRef PubMed ↵ Petit , R.A. , 3rd . and Read , T.D. ( 2020 ) Bactopia: a Flexible Pipeline for Complete Analysis of Bacterial Genomes . mSystems , 5 , e00190 – 20 . OpenUrl PubMed ↵ Pritchard , L. et al. 
( 2015 ) Genomics and taxonomy in diagnostics for food security: soft-rotting enterobacterial plant pathogens . Anal. Methods , 8 , 12 – 24 . OpenUrl CrossRef ↵ Raghuram , V. et al. ( 2024 ) Average nucleotide identity-based strain grouping allows identification of strain-specific genes in the pangenome . mSystems , 9 , e0014324 . OpenUrl CrossRef PubMed ↵ Richter , M. et al. ( 2016 ) JSpeciesWS: a web server for prokaryotic species circumscription based on pairwise genome comparison . Bioinformatics , 32 , 929 – 931 . OpenUrl CrossRef PubMed ↵ Richter , M. and Rosselló-Móra , R. ( 2009 ) Shifting the genomic gold standard for the prokaryotic species definition . Proc. Natl. Acad. Sci. U. S. A ., 106 , 19126 – 19131 . OpenUrl Abstract / FREE Full Text ↵ Rodriguez-R , L.M. et al. ( 2024 ) An ANI gap within bacterial species that advances the definitions of intra-species units . mBio , 15 , e0269623 . OpenUrl CrossRef PubMed ↵ Shaw , J. and Yu , Y.W. ( 2023 ) Fast and robust metagenomic sequence comparison through sparse chaining with skani . Nature Methods , 20 , 1661 – 1665 . OpenUrl CrossRef PubMed ↵ Van Rossum , G. and Drake , F.L. ( 2009 ) Python 3 Reference Manual . Scotts Valley, CA : CreateSpace . View the discussion thread. Back to top Previous Next Posted February 17, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following PyOrthoANI, PyFastANI, and Pyskani: a suite of Python libraries for computation of average nucleotide identity Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share PyOrthoANI, PyFastANI, and Pyskani: a suite of Python libraries for computation of average nucleotide identity Martin Larralde , Georg Zeller , Laura M. Carroll bioRxiv 2025.02.13.638148; doi: https://doi.org/10.1101/2025.02.13.638148 Share This Article: Copy Citation Tools PyOrthoANI, PyFastANI, and Pyskani: a suite of Python libraries for computation of average nucleotide identity Martin Larralde , Georg Zeller , Laura M. Carroll bioRxiv 2025.02.13.638148; doi: https://doi.org/10.1101/2025.02.13.638148 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7624) Biochemistry (17651) Bioengineering (13871) Bioinformatics (41884) Biophysics (21424) Cancer Biology (18566) Cell Biology (25463) Clinical Trials (138) Developmental Biology (13365) Ecology (19867) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15590) Genomics (22477) Immunology (17714) Microbiology (40331) Molecular Biology (17148) Neuroscience (88487) Paleontology (666) Pathology (2828) Pharmacology and Toxicology (4817) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.