A comparison of two universal angiosperm bait sets and the phylogenomics of Alismatales

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 79,910 characters · extracted from preprint-html · click to expand
A comparison of two universal angiosperm bait sets and the phylogenomics of Alismatales | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results A comparison of two universal angiosperm bait sets and the phylogenomics of Alismatales View ORCID Profile Ed Biffin , View ORCID Profile Michelle Waycott , View ORCID Profile Timothy A. 
Hammer , View ORCID Profile Kor-jent van Dijk doi: https://doi.org/10.1101/2025.09.15.676180 Ed Biffin 1 State Herbarium of South Australia, Botanic Gardens and State Herbarium , Hackney Road, Adelaide, SA 5000, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ed Biffin For correspondence: ed.biffin{at}adelaide.edu.au Michelle Waycott 1 State Herbarium of South Australia, Botanic Gardens and State Herbarium , Hackney Road, Adelaide, SA 5000, Australia 2 School of Biological Sciences, The University of Adelaide , Adelaide, SA 5005, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Michelle Waycott Timothy A. Hammer 1 State Herbarium of South Australia, Botanic Gardens and State Herbarium , Hackney Road, Adelaide, SA 5000, Australia 2 School of Biological Sciences, The University of Adelaide , Adelaide, SA 5005, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Timothy A. Hammer Kor-jent van Dijk 2 School of Biological Sciences, The University of Adelaide , Adelaide, SA 5005, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Kor-jent van Dijk Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract High throughput sequencing of hybridisation capture libraries provides an efficient approach for assembling large scale phylogenomic data. These include ‘universal’ bait sets that aim to generate comparable data from any lineage within the taxon of interest. Here, we present the OzBaits v2 bait set, which targets a set of low copy nuclear loci for angiosperms. Using published genomic data, we design a set of RNA baits targeting a single exon in each of 98 putatively orthologous nuclear protein coding genes. 
We tested the efficiency of this bait set for a diverse range of angiosperms and recovered, on average, 93 (95%) genes per sample. We compared a common set of samples for the monocot order Alismatales enriched using OzBaits and the Angiosperms353 (A353) bait set, a widely used universal probe set targeting up to 353 nuclear genes in angiosperms. Gene recovery was, on average, c. 1.7 times higher for OzBaits relative to A353. Using proxies for signal and bias to rank gene alignments by their phylogenetic usefulness, we found that on average, the OzBaits data had higher phylogenetic utility. Both data sets resolved largely congruent, well-supported phylogenies for Alismatales although measures of internal discordance were higher for the A353 data. We discuss the implications of these findings for the design of universal bait sets. Introduction With the advent of efficient sequencing technologies and the development of large-scale (genomic) data sets, reduced representation sequencing approaches have become a mainstay of evolutionary biology research ( McKain et al. 2018 ). Reduced representation methods including transcriptomics, restriction-site associated DNA sequencing (e.g. RAD-seq), targeted amplicon sequencing and hybridisation-capture, allow the sequencing of a reduced portion of the genome across many samples for the equivalent effort of a single whole genome. Hybridisation-capture experiments target genetic loci of interest, such as phylogenetically informative low-copy nuclear genes (LCNG), using specifically designed RNA bait sequences that hybridise in solution with targeted regions and are enriched prior to high-throughput sequencing ( Weitemier et al. 2014 ; Johnson et al. 2019 ). Hybridisation bait sets for evolutionary studies include those that target major lineages spanning many millions of years (so-called ‘universal’ bait sets, e.g. Buddenhagen et al. 2016 ; Wolf et al. 2018 ; Johnson et al. 2019 ; Liu et al. 2019 ; Breinholt et al. 
2021; Hutter et al. 2021; Khan et al. 2024 ) through to bait sets that target lower taxa ranging from family (e.g. Choi et al. 2020; Fonseca et al. 2023 ), generic (e.g. Michel et al. 2022 ) and species-specific approaches (e.g. Hill et al. 2019 ). On the one hand, a universal bait set facilitates the recovery of phylogenetically useful data from divergent evolutionary lineages enabling the development of large-scale comparable sequence data sets broadly across the taxon of interest (e.g. Zuntini et al. 2024 ). On the other hand, bait sets with a narrower taxonomic focus require fewer baits per gene of interest and tend to show higher enrichment efficiency (i.e. the number of ‘on target’ reads as a proportion of the total) because of lower divergence between bait and target sequences ( Liu et al. 2019 ). Several studies to date have compared the relative performance of ‘universal’ versus ‘taxon specific’ bait sets (e.g. Larridon et al. 2020 ; Shah et al. 2021 ; Yardeni et al. 2022 ) although few have contrasted the performance of ‘universal’ bait sets that employ different strategies for bait design. A key issue for bait set design is efficiency in terms of cost per base pair of sequence data generated and this is a function of the number of loci that are targeted and the level of genetic divergence between the bait and target sequences. Thus, for a given unit cost, increasing the number of target loci (or number of base pairs targeted) can be achieved by reducing the diversity and specificity of probe sequences. Conversely, increasing probe specificity could be achieved by reducing the number of genetic loci included in the bait set. While maximising the amount of data generated is a key consideration in universal bait set design, trade-offs in terms of probe specificity and enrichment efficiency could potentially influence the quality of the data including reduced matrix occupancy, lower base call accuracy and increased proportions of missing data ( Hutter et al. 
2022 ). Each of these aspects has potential to impact the phylogenetic utility of the data sets and for instance, fragmentary sequences have been found to increase noise (non-phylogenetic signal) in large phylogenomic data sets ( Sayyari et al. 2017 ). We present OzBaits v2.0, a universal probe set that is designed to recover c. 100 LCNG from any angiosperm lineage, building on the OzBaits v1.0 bait kit, which targets fewer genes ( Waycott et al. 2021 ). In the present study, we focus on the OzBaits v2.0 nuclear component and we compare the performance of these with a widely used alternative, the Angiosperms353 baits (A353; Johnson et al. 2019 ), which target up to 353 LCNG in any angiosperm. The former targets a single (or partial) exon per genetic locus and, relative to A353, has higher probe specificity, while the latter attempts to recover entire genes by tiling probes for each target locus on transcript sequences. As part of the Genomics for Australian Plants (GAP; https://www.genomicsforaustralianplants.com/ ) initiative, we have generated sequence data for a broad range of angiosperm species to assess the efficiency of gene recovery using the OzBaits v2.0 probe set. In addition, we compare the performance of OzBaits and A353, focussing on Australian representatives of the monocot lineage Alismatales and in particular, ‘core Alismatales’, a diverse predominantly hydrophytic lineage that includes all known fully submerged marine angiosperms (seagrasses). For each bait set, we have sequenced the same set of samples, employed equivalent library preparation steps and used a common bioinformatics pipeline, enabling meaningful comparisons between OzBaits and A353 in terms of the differences in bait design and how these influence the phylogenetic utility of the data sets generated. Methods Bait Design and Synthesis OzBaits V2.0 We designed target enrichment probes to cover all angiosperms using published transcriptomes and genomic sequences. 
Target genes were selected from Duarte et al. (2010) who report a set of putatively orthologous low copy nuclear genes shared in Arabidopsis, Populus, Vitis and Oryza (APVO SSC genes sensu Duarte et al., 2010 ). We first examined the corresponding genomic data (coding sequences: CDS) available on Phytozome (v. 12, including 48 species of angiosperms encompassing all major flowering plant lineages; https://phytozome.jgi.doe.gov ) to identify a subset of genes based upon several criteria including: copy number (genes are single copy in the majority of diploid genomes); sequence conservation (c. 70% average pairwise identity) and representation across the sequenced genomes (locus represented in > c. 70% of genomes). For the selected genes, we used the CDS for Arabidopsis thaliana (Araport11; Cheng et al. 2017 ) to retrieve putatively homologous transcript sequences from the 1000 Plants Project (1KP; https://www.onekp.com ; Leebens-Mack et al . 2019 ) using the China National Genebank ( https://db.cngb.org ) BLAST portal and the following settings: Discontiguous Mega-Blast, expect value=10, maximum target sequences=1000, selected organisms=Magnoliophyta (taxid:3398). Additional sequences were sourced from SeagrassDB ( Sablok et al. 2018 ), which includes transcriptome data for marine and aquatic angiosperms from the Alismatales. The sequences retrieved from the above sources were combined with the Phytozome data for each gene and made into a BLAST database in Geneious Prime 2022.0.1 ( https://www.geneious.com ). We queried each BLAST database using the A. thaliana gene family member with exon annotations manually added, and the following settings: Discontiguous Mega-Blast, expect value=10, maximum target sequences=1200, results=Hit Table, retrieve=Matching Region with Annotation. 
We then extracted sequences matching the exon annotations and selected a single exon per gene with the caveat that exon size was >180 and less than 800 bp, to facilitate probe tiling while reducing the number of required probes per target locus. The extracted sequences were clustered using CD-HIT-EST ( Li and Godzik, 2006 ; http://weizhong-lab.ucsd.edu/cdhit_suite ) with a sequence identity cut-off fraction of 0.85 (the approximate limit for genetic divergence between probe and target sequence for efficient hybridisation, Hancock-Hanser et al. 2013 ) and a length similarity fraction of 0.2, and one representative sequence (the longest) per cluster was selected. A total of 19,652 representative sequences were used for bait design, ranging in length from 180-660 bp, with a mean length of c. 319 bp. Bait design (myBaits) and synthesis was performed by Daicel-Arbor Biosciences (formerly Mycroarray; Ann Arbor, Michigan, USA; Cat. # 300196R.v5 ) with 100-nucleotide baits and ∼2X flexible tiling density for a total of 101,467 baits (Supplementary material-S1). Taxon sampling We explored the performance of the Ozbaits bait set using a broad sample of 19 angiosperm taxa including monocots (Asphodelaceae: Caesia, Xanthorrhoea; Araceae: Lemna, Typhonium; Maundiaceae: Maundia; Ruppiaceae: Ruppia ), eudicots (Ceratophyllaceae: Ceratophyllum ), core eudicots (Dilleniaceae: Dillenia, Hibbertia ), rosids (Malvaceae: Androcalva, Lasiopetalum, Thomasia; Myrtaceae: Darwinia, Leptospermum ), Caryophyllales (Droseraceae: Drosera ) and asterids (Plantaginaceae: Plantago ). We also developed a dataset for Alismatales comprising 96 samples including 11 of the 14 currently accepted families (Stevens, 2001 onwards; http://www.mobot.org/MOBOT/research/APweb/ , Butomaceae, Scheuchzeriaceae and Tofieldiaceae are not naturally present in Australia). We included 31 of the 38 genera with Australian representation from the ‘core alismatids’ ( Wilson et al. 
2011 ) along with a single representative for each of three aroid genera. Ceratophyllum and Plantago were included as outgroups ( Table 1 ). View this table: View inline View popup Table 1: Details of samples included in this study. The data set ‘both’ includes sequence data generated using the OzBaits and A353 bait sets. DNA extraction, library preparation, and sequencing DNA extractions and library preparations were done at two separate institutions. Initial OzBaits performance testing and non-GAP sample processing was done at the University of Adelaide ADIFF (Advanced DNA, Identification and Forensic Facility) and the GAP initiative samples were processed at the Australian Genome Research Facility (AGRF, Melbourne, Australia) as part of Bioplatforms Australia (Sydney, Australia). At the ADIFF DNA was extracted using the DNeasy® Plant mini kit (Qiagen) as per the manufacturer’s instructions using 20-30 mg of dry tissue. Samples were ground in 2.0 mL screw cap tubes using a Bead Ruptor 24 (Omni International, Kennesaw Georgia, USA) with zirconia beads. DNAs were then converted into genomic libraries using a customized and miniaturized library preparation process that was developed at the University of Adelaide for cost reduction. Detailed laboratory protocols and sequencing adapter designs can be found at DOI: dx.doi.org/10.17504/protocols.io.dm6gpbzm8lzp/v1 (Private link for reviewers: https://www.protocols.io/private/1FB56482A98611EC922E0A58A9FEAC02 to be removed before publication.). In brief, library preparations were done with the NEBNext® Ultra™ II FS DNA Library Prep kit with Fragmentase and Sample Purification Beads (New England Biolabs, Ipswich, MA, USA). Neat DNA extracts were used as the starting material and reactions were done in 1/3 volumes. 
To enable bioinformatics processing following hybrid capture, custom stubby Y-adaptors with synthetic “barcodes” were annealed to the ends of the DNA fragments and amplified to create half-completed libraries (index and p5 and p7 grafting sites missing). Samples were pooled 16-plex and hybrid capture performed with the myBaits custom OzBaits_NR set (Cat. # 300496R.V5) probes following the myBaits manufacturer’s manual with V5 chemistry. Hybridization was done at 65°C and incubated for 24 h. Post capture PCR was performed on the half-built libraries by fusing the i5 and i7 indexes and grafting sites to the ends of the DNA fragments. Libraries were pooled in equimolar concentrations and size selected to 350–600 bp, quantified on a 2100 Bioanalyzer (Agilent) and sent for sequencing. For the GAP initiative samples dried plant tissue (20–30 mg) was provided to AGRF and ground using a TissueLyser II (Qiagen) with tungsten carbide beads. Genomic DNA was extracted using the DNeasy® Plant mini kit (Qiagen) as per the manufacturer’s instructions on a QIAcube Connect (Qiagen). DNA quantity and quality was assessed using 1% E-gel with Sybr Safe dye (Thermo Fisher) and concentrations assessed using Quantifluor dsDNA assay (Promega). Libraries were prepared using the NEBNext Ultra II FS Library Prep Kit (New England Biolabs, Ipswich, MA, USA), following the manufacturer’s instructions targeting inserts of approximately 350 bp. The libraries were enriched using the myBaits custom OzBaits_NR (Cat. # 300496R.v5) and myBaits Expert Plant Angiosperms353 v1 (Cat. # 308108.v5) ( Johnson et al. 2019 ) bait sets with V5 chemistry. Pooled libraries (12–16 plex) were enriched by hybridising at 65°C (A353) and 64°C (OzBaits_NR) respectively. All sequencing (ADIFF and AGRF) was done on a NovaSeq 6000 (Illumina Inc., San Diego, USA) with v1.5 chemistry and 150 bp paired-end reads. 
Bioinformatics processing High-throughput 150 bp paired-end reads were imported into CLC Genomics Workbench v20.0.2 ( https://digitalinsights.qiagen.com/ ) for demultiplexing and trimming using a quality score limit of 0.05 (Phred score c. 13). Reads for each individual were randomly sampled to 4 million reads. The Captus pipeline (v1.0.1; Ortiz et al. 2023) was used to assemble the cleaned sequence reads, then extract and align the target regions. Relative to other commonly used tools for assembling hybrid-capture data sets (e.g. SECAPR, Andermann et al. 2018 ; HybPiper, Johnson et al. 2016 ), Captus performs well across a range of data types (Ortiz et al. 2023), making it valuable in comparing bait sets that use different design strategies. Reads for each individual were de novo assembled using Megahit (v1.2.9; Li et al . 2015 ) (Captus ‘assemble’ function) with the CAPSKIM default --k_list ( https://edgardomortiz.github.io/captus.docs/assembly/assemble/ ) and with --min_count and --prune_level both set to 3. For the extraction step, we used --nuc_min_identity 70 and --nuc_min_score 2.0 to match the targeted nuclear regions with Scipio (v1.4; Keller et al. 2008 ). The target file for the A353 was sourced from the Kew Tree of Life Explorer ( Baker et al. 2022 ) and comprised sequences for core Alismatales including representatives of Alismataceae, Aponogetonaceae, Cymodoceaceae, Hydrocharitaceae, Juncaginaceae, Maundiaceae, Posidoniaceae, Potamogetonaceae and Zosteraceae (Supplementary material-S2). For completeness, we also compared the recovery of A353 target genes using the Mega353 (McLay et al. 2023) target file, for which a modified version has been included in Captus (Ortiz et al. 2023). To extract the OzBaits data, we used the 18,663 representative sequences used for bait design as the reference file (Supplementary material-S2). 
The Captus ‘align’ function was used to generate multiple sequence alignments (MSAs) for the extracted nuclear and plastid gene data using the -f (format) flag to generate separate nucleotide MSAs for each target region comprising the coding sequence (exons; NT), the coding region(s) and introns (genes; GE), and GE plus flanking regions (genes flanked; GF). All extracted markers were aligned with MAFFT ( Katoh and Standley 2013 ) using the mafft_auto algorithm. We used the ‘informed’ paralogy filter in Captus, retaining a maximum of 3 paralogs per sample and using ‘--tolerance 2.0’ to remove putative paralogs. Alignments were trimmed with ClipKIT v1.3.0 ( Steenwyk et al . 2020 ) using default settings in Captus. Alignments with fewer than 20 sequences were removed from phylogenetic analyses. As well as comparing locus recovery from the two bait sets, we also estimated average read depth per target locus and per sample. Read depth estimates were generated using the SECAPR reference_assembly function ( Andermann et al. 2018 ), which uses the BWA mapper ( Li and Durbin, 2010 ) for reference-based mapping and Picard (broadinstitute.github.io/picard/) for removing duplicate reads. For the mapping step we used the sampled FASTQ reads and activated the --reference_type sample-specific flag, which uses the consensus sequence for each sample and locus as output by the Captus pipeline. We used a Mann-Whitney U test, as implemented in the Past software ( v. 4; Hammer et al. 2001 ) to test the statistical significance of coverage estimates between bait sets. Phylogenomic analysis For the nuclear data, we first generated a concatenated alignment comprising the three alignment formats output by Captus for the A353 and OzBaits loci. From this, we generated a locus specific tree (hereafter, gene tree) for each partition using IQ-TREE v2.2.3 ( Nguyen et al . 2015 ; Chernomor et al . 2016 ) using 1000 ultrafast bootstrap replicates (UFBS; Minh et al. 2013) to assess branch support. 
We used the partitioned ‘GE’ alignments to generate a maximum likelihood (ML) species tree estimate using IQ-TREE with the MFP+MERGE flag activated, which seeks to identify the best model for each partition and then merge like partitions ( Kalyaanamoorthy et al. 2017 ). Only the top 10% of partition merging schemes were examined by using the relaxed hierarchical clustering algorithm (--rclust 10; Lanfear et al . 2014 ). The partitioned alignment, species and gene tree estimates were used as input to genesortR ( Mongiardino Koch 2021 ; https://github.com/mongiardino/genesortR ), an R script that sorts and subsamples phylogenomic datasets based on properties that quantify phylogenetic usefulness. For nucleotide data, these include 3 proxies for bias (average pairwise patristic distance, saturation, and root-to-tip variance) and 3 proxies for signal (Robinson-Foulds similarity to the IQ-TREE ‘GE’ topology, average bootstrap support, proportion of variable sites) (see Mongiardino Koch 2021 , and references therein). From the sorted gene alignments, we generated 3 final datasets for phylogenetic inference: for each of the OzBaits and A353 data, we selected the highest-ranking alignment format for each gene, along with a ‘100-best’ alignment data set comprising the highest-ranking alignment format for the 100 genes top ranked genes, including both OzBaits and A353 targets. We explored the sensitivity of our results to species tree resolution by collapsing poorly supported nodes (UFBS<90) and rerunning genesortR . We generated two additional data sets from the original Captus alignments to test for the effect of low-quality data on phylogenetic usefulness. First, we used the Paragone v1.0.0 ( https://github.com/chrisjackson-pellicle/ParaGone ; Jackson et al. 2023 ) pipeline, which is based upon the orthology resolution approach developed by Yang and Smith (2014) , to refine quality of each of the original alignments output by Captus. 
Paragone uses HmmCleaner ( Di Franco et al . 2019 ), TreeShrink ( Mai et al . 2018 ) and TrimAl ( Capella-Gutiérrez et al . 2009 ) to remove sequencing and alignment errors and rogue taxa from MSAs. Secondly, we used TreeShrink to remove potentially spurious long-branched terminals from the alignments. For each of these data sets, we generated gene trees using IQ-TREE (as above) and we compared phylogenetic usefulness rankings for each against the original Captus output using genesortR . For each of our final datasets (‘A353-best’, ‘OzBaits-best’ and ‘100-best’), we used IQ-TREE to generate a concatenated ML estimate of the species tree. We first estimated the best fitting partitioned model scheme, as outlined above, and estimated branch support using 1000 UFBS replicates along with the IQ-TREE implementation of the approximate likelihood ratio test (aLRT;; Guindon et al . 2010 ), with interpretation of support following Minh et al . (2013) and recommendations in the IQ-TREE manual ( http://www.iqtree.org/doc/Frequently-Asked-Questions ). We generated a gene tree for each partition in our datasets using IQ-TREE, with branch support estimated using the approximate Bayes test (aBayes; Anisimova et al . 2011 ). For each data set, the gene trees were used to generate a coalescent-based species tree estimate using Weighted ASTRAL (wASTRAL; Zhang and Mirarab, 2022 ) as implemented in Accurate Species Tree EstimatoR (ASTER) ( https://github.com/chaoszhang/ASTER ). wASTRAL uses weighting to reduce the impact of quartets with low support and/or long branches, thereby providing more accurate species tree estimates compared to unweighted inference ( Zhang and Mirarab, 2022 ). 
We followed the authors recommendations ( https://github.com/chaoszhang/ASTER/blob/master/tutorial/astral-hybrid.md ) for maximum and minimum values for aBayes supports (1.0 and 0.33, respectively) and used local posterior probability (LPP) to measure internal branch support for the coalescent species tree. Both the concatenated and coalescent based species tree estimates were further interrogated using quartet sampling (QS; Pease et al. 2018 ). The QS approach subsamples quartets from the target tree and alignment to produce 4 metrics, Quartet Concordance (QC), Quartet Differential (QD), Quartet Informativeness (QI) and Quartet Fidelity (QF). Taken together, QS scores describe the degree of topological variation in the data ( Pease et al. 2018 ), providing a valuable alternative to measures such as the bootstrap, which may have limited utility among large data sets ( Lanfear and Hahn, 2024 ). We ran QS analyses with 500 replicates and a log-likelihood cut-off of 2. Results Gene recovery The OzBaits bait set performed well across a diverse set of angiosperm lineages including monocots and eudicots ( Table 1 ). Average gene recovery ranged from 81 ( Maundia ) to 97 ( Caesia, Plantago, Dilleniaceae, Malvaceae) with an average of 93 genes (c. 95%) recovered across all samples (Supplementary material-S3). For the Alismatales OzBaits data, no sequence data were recovered from two samples and less than 20 genes were recovered for each of 3 samples (Supplementary material-S4). This appears to reflect poor sample quality and/or library preparation given that recovery was high for congeners, and these samples also showed poor recovery from the A353 bait set (below). Excluding these, we recovered on average c. 85 (87%) genes per sample across 91 samples with average values per family ranging from 74 (c. 76%) in Hydrocharitaceae to 95 (c. 97%) in Posidoniaceae. 
By way of comparison, the per sample gene recovery for the A353 data using Alismatales specific references (Supplementary material-S5) averaged 208 (c. 66%) with minimum and maximum average per family values ranging from 147 (47%; Hydrocharitaceae) to 245 (78%; Zosteraceae). Recovery using the mega353 references was substantially lower (Supplementary material-S6) and is not discussed further. In Figure 1 (Supplementary material-S7), we compare proportion of genes recovered per sample from the OzBaits versus the A353 bait sets, by family. The proportion of loci recovered from OzBaits was in all cases higher by a factor of c. 20% (Araceae, Juncaginaceae) and up to c. 80% in Alismataceae, Hydrocharitaceae and Posidoniaceae. Download figure Open in new tab Figure 1: Box plots showing the relative proportion of genes recovered per sample for OzBaits and A353, ordered by family. Shown are the median (horizontal line), 25-75% quartiles (box), maximal and minimal values (whiskers) and outliers (open circle, values 1.5X IQR; asterisk, values 3X IQR). We assessed read depth across targeted loci - for the OzBaits data the median read depth was 120.7 ( n =98; interquartile range, IQR=81.3) while for the A353 the median was 35.9 ( n =314; IQR=22.5). The Mann-Whitney U test indicates read depth was significantly higher for gene regions recovered by the OzBaits bait set ( z =13.2; p <0.001). Similarly, the median read depth across samples was higher for OzBaits ( n =91; median=94.9, IQR=155.5) when compared to the A353 bait set ( n= 91; median=23.3; IQR=27.4) and this comparison was also statistically significant ( z =7.029; p <0.001) (Supplementary material-S8). We found that, on average, the OzBaits bait set recovered a significantly higher proportion of the reference sequence in the ‘best hit’ contig(s) (Mann-Whitney U: z= 54.935, p <.001). We also assessed the best hit LG50 and LG90 scores (i.e. 
the least number of contigs in the best hit that contain 50% or 90%, respectively, of the reference locus length). When considering the proportion of best hit contigs that exceed LG50 and LG90 thresholds, values for the OzBaits bait set were significantly higher for both metrics (LG50: χ 2 =505.27, p < .00001; LG90: χ 2 =2845.36, p < .00001). For both LG50 and LG90 ≥1, the average number of contigs in the best hit was higher for A353 ( Table 2 ). View this table: View inline View popup Download powerpoint Table 2: Contig recovery for the Alismatales data set from the OzBaits and A353 bait sets. The LG50 and LG90 values indicate the least number of contigs in the best hit that contain 50% or 90%, respectively, of the reference locus length. Phylogenetic usefulness We used the subsampling approach described by Mongiardino Koch (2021 ; see also Mongiardino Koch and Thompson, 2021 ) to order genes by their phylogenetic usefulness as means of comparing between OzBaits and A353, but also exploring the utility of the three alignment formats output by Captus. With respect to the top-ranking alignments, we found the OzBaits data to be dominated by GF format (94%) with both NT and GE each comprising 3%. In contrast, the GF, GE and NT formats comprised 56% (175), 35% (109) and 9% (29) of the best ranked A353 alignments. In line with this, we found that phylogenetic usefulness for OzBaits was highest for GF (median rank = 70, IQR = 117) followed by GE (median = 170, IQR = 128) and NT (median = 182, IQR = 133). For the A353, both GF (median = 402, IQR = 433) and GE (median = 441, IQR = 510) ranked higher, on average, than the NT alignment format (median = 547, IQR = 427). Figure 2 show the cumulative percentage of the 407 ‘best’ ranked alignment formats for OzBaits and A353. The median ranking for OzBaits was 130 (IQR = 140) while for A353, the median was 237 (IQR = 208). 
Approximately 38% (37 loci) of the OzBaits and 20% (63 loci) of the A353 loci fell within the ‘best 100’ genes and this association was statistically significant (χ 2 = 11.33, p = 0.00076). While the A353 data included some of the top ranked genes, approximately 60% had a rank of 200 or lower and approximately 1/3 (34%) had a rank below 300. In comparison, the OzBaits data included approximately 25% of alignments ranking below 200 (χ 2 = 30.627, p < 0.00001) and just 2% of alignments below 300 (χ 2 = 37.989, p < 0.00001) ( Figure 2 ). Download figure Open in new tab Figure 2: Percentile plots showing the percentage of genes with a phylogenetic usefulness ranked value lower than x, for ‘best’ OzBaits and A353 data sets, along with rankings for each data set estimated using a species tree with poorly supported nodes collapsed (UFBS < 90%). We conducted three additional analyses using genesortR to assess the impact of (1), species tree topology (poorly supported branches collapsed in the species tree), (2) poorly aligned regions and spurious sequences (alignments cleaned using ParaGone) and (3), spurious sequences only (alignments processed using TreeShrink). With respect to (1), we found no substantive difference in the ranking of genes by phylogenetic usefulness relative to the fully resolved species tree ( Figure 2 ). For (2) and (3), based upon Mann-Whitney U tests, we found no significant difference in phylogenetic usefulness rankings for comparisons between alignments output by Captus versus the cleaned alignments for each of OzBaits and A353, although the differences in ranking between each bait set were significant for each treatment (Supplementary material S9; Figures S4 and S5). Table 3 shows the distribution of the 6 gene properties that were used to derive a phylogenetic usefulness axis along with an additional 6 gene properties estimated as part of the genesortR routine. 
With respect to the former, the OzBaits data show high values for the 3 proxies for signal (proportion of variable sites, average bootstrap and RF similarity) and these values are significantly different from the equivalent measures for the A353 data. In contrast, the A353 data show lower, and significantly different values for 2 proxies of bias viz. saturation and patristic distance. The difference in root to tip variation, the third proxy for bias, was not statistically significant for the OzBaits-A353 comparison. Across all 6 gene properties, we found that the A353 dataset is characterised by high variability relative to the OzBaits and ‘best 100’ datasets. View this table: View inline View popup Download powerpoint Table 3: Gene properties estimated using the GenesortR script for the A353, OzBaits and ‘best 100’ data sets. Gene properties in bold typeface are used for ranking genes along an axis of phylogenetic usefulness, while properties that are estimated but not used for ranking are also included. Values are median (interquartile range) estimates from n genes included in each data set. The Mann-Whitney U test was used to assess the significance of differences between gene properties for each pairwise data set comparison. For each comparison, the z score and its corresponding p value are indicated. A z score > 1.96 and a p value < 0.05 are taken to indicate a significant difference (bold). As suggested by the LG50 and LG90 statistics, above, the proportion of missing data (i.e. the proportion of missing/ambiguous cells in alignments [ Mongiardino Koch, 2021 ]) was significantly higher for the A353 dataset relative to OzBaits and the ‘best 100’ alignments ( Table 3 ) and shows a significant negative correlation with mean read depth (Spearman's correlation: rs (403) =.389, p <0.001). 
In turn, we found that missingness shows a significant negative correlation with measures of signal including RF similarity ( rs (403) =.369, p < 0.001) and average bootstrap support of gene trees ( rs (403) =.166, p < 0.001) and positive correlations with measures of bias including variance of root-to-tip distances ( rs (403) =.337, p <0.001) and mean patristic distance ( rs (403) =.298, p < 0.001). Phylogenetic relationships of Alismatales We constructed 3 alignments for a common set of 85 taxa comprising ‘OzBaits best’, ‘A353 best’ and ‘best 100’, with a concatenated alignment length of 78,244, 299,665 and 116,204 bp, respectively (Supplementary material S10). Across all three data sets, phylogenetic estimates were by-and-large well-supported and congruent particularly as they relate to relationships among families and higher clades, irrespective of the inference method used. For the OzBaits data, the position of Maundia is incongruent relative to both the A353 and ‘best 100’ phylogenies, but with weak statistical support. Other areas of disagreement are largely confined to intra-familial and infra-generic relationships and are mostly associated with short internal branch lengths ( Figure 3 , Figures S1-S3). Download figure Open in new tab Figure 3: Maximum likelihood phylogeny for Alismatales and outgroups estimated from the ‘best’ 100 locus alignments, ranked according to phylogenetic usefulness, from both the OzBaits and A353 data sets. Values on the branches are support values estimated using the approximate likelihood ratio test, ultrafast bootstrap (both estimated using IQ-TREE) and quartet concordance (estimated using Quartet Sampling). Values of 100/100/1.0, respectively, are indicated by an asterisk. The GAP sample ID is indicated in brackets following the taxon label. Quartet sampling was used to explore patterns of support and conflict in the data. 
As with the ML trees, evidence for discordance is largely associated with relationships within genera and above family level. With respect to the former, relationships within densely sampled lineages such as Potamogeton , Posidonia , Triglochin and Cycnogeton show high levels of discordance. Several backbone relationships, including the position of Maundia , are also discordant. In general, QC scores, averaged across all internal nodes, increase from A353 (0.65) to OzBaits (0.71) to the ‘best 100’ dataset (0.79) ( Figure 3 , Figure S1). Discussion A number of ‘universal’ bait kits have been designed to target low-copy nuclear genes among vascular plants including flagellate land plants (GoFlag; Breinholt, 2021 ), ferns ( Wolf et al. 2018 ), conifers (REMcon; Khan et al. 2024 ) and angiosperms (e.g. Buddenhagen et al. 2016 ; Johnson et al. 2019 , A353). Of these, A353 provides a valuable comparison with OzBaits v.2, given the taxonomic scope of interest, they target a common set of genes and use much of the same genomic resources in design. However, these two differ in the approach for selecting representative sequences for probe design, including the use of different clustering algorithms and thresholds and the proportion of a gene (transcript) targeted for each low copy locus. In the present study, we have generated data sets using OzBaits and A353 bait sets from a common set of samples, employed consistent library preparation and sequencing steps and used a common phylogenomics pipeline. By removing these confounding factors our comparisons can offer insight into how factors relating to bait design might impact on the generation of phylogenomic data using universal probe kits. Gene recovery: OzBaits versus A353 While the application of the A353 bait kit across angiosperms is well-established ( Zuntini et al. 2024 ), we explored, in a limited sense, the consistency of gene recovery using the OzBaits bait set. 
For the angiosperm-wide assessment, gene recovery was high and extremely consistent across lineages spanning much of the extant diversity within angiosperms (Supplementary material S3) suggesting that the OzBaits target genes should be efficiently recovered from any flowering plant lineage. In a more focused assessment, we compared locus recovery for the two bait sets across the angiosperm order Alismatales, with an estimated crown group age extending to the lower Cretaceous ( Chen et al. 2022 ). Differences in recovery success (i.e. the proportion of genes recovered relative to the number of genes in each bait set; Fig 1 , Supplementary material S7) could have several underlying causes including in vitro (i.e. the sequence divergence between baits and target sequences) or in silico (e.g. the distance between reference and target sequences during bioinformatics processing) explanations ( McLay et al. 2021 ). We suggest the former is more likely given that for taxa with the lowest relative recovery success from A353 (e.g. Alismataceae, Hydrocharitaceae, Posidoniaceae; Figure 1 ), family specific reference sequences were included in the bioinformatics processing steps (Supplementary material S2). This is further supported by significant differences in average read depth between the bait sets (Supplementary material S8) – read depth (capture efficiency) is predicted to decrease as divergence increases due to lower probe specificity ( Grover et al. 2012 ; Portik et al. 2016 ; Hutter et al. 2022 ). According to Liu et al. (2019) , the capture efficiency of low-copy nuclear genes in mosses declined sharply when the average pair-wise distance between probe and target sequences exceeded 30%. For the A353 bait set, Johnson et al. (2019) use a 30% threshold of pair-wise sequence divergence in their probe design and at this value, 95% of the angiosperm diversity was captured by 15 or fewer representative sequences per gene. 
In contrast, the OzBaits target sequences were clustered at 85% sequence similarity, which results in a higher diversity of probe sequences and presumably, largely accounts for the high consistency of target gene recovery across taxa (Supplementary material S3 and S4). On the one hand, the A353 bait set recovered an average of more than 200 genes per sample for Alismatales but uses approximately half the number of probes (75,151 120-mer probes; Johnson et al. 2019 ) relative to OzBaits and is therefore a highly efficient approach. On the other hand, the OzBaits probes recover fewer and (on average) shorter loci, but also have higher matrix occupancy and fewer missing values ( Table 3 ). Phylogenetic usefulness: OzBaits versus A353 We used the approach developed by Mongiardino Koch (2021) to explore the properties of our data and to rank genes by their phylogenetic usefulness. While the best performing genes overall were captured by A353, a large proportion of A353 alignments also ranked poorly, including virtually all the lowest ranked 100 genes ( Figure 2 ). We found that on average, loci recovered by the OzBaits bait set had higher phylogenetic usefulness than A353. Furthermore, a significantly higher proportion of the OzBaits targets were represented in the 100 top ranking genes relative to A353. These patterns hold for genes that were recovered by both bait sets: of the 27 recovered genes in common, c. 63% of those recovered using the OzBaits bait set ranked higher than the equivalent A353 locus (Supplementary material-S11) suggesting that the contrasting patterns of usefulness are not solely a consequence of target gene selection (e.g. selecting genes from different functional groups). In addition to lower overall recovery of sequences per gene (i.e. matrix occupancy) ( Table 3 ), the A353 genes have a significantly higher proportion of missing data (i.e. 
the individual is present in the alignment but is represented by a fragmentary sequence, type 2 missing data sensu Hosner et al. 2016 ). This is indicated by a lower proportion of the targeted regions recovered for the A353 data, and of these, fewer sequences exceed LG50 and LG90 (i.e. the best hit contig(s) exceed 50% or 90% of the target length, respectively) thresholds ( Table 2 ). Furthermore, the A353 LG50 and LG90 best hits are more likely to include multiple contigs relative to the OzBaits data. In previous studies, it has been found that gene tree accuracy can be negatively impacted by fragmentary sequences ( Hosner et al. 2016 ; Sayyari et al. 2017 ; Smith et al. 2020 ). Here, we found that the proportion of missing data shows significant correlations with gene tree accuracy including measures of signal (a negative correlation with RF similarity and average bootstrap support) and bias (a positive correlation with variance of root-to-tip distances and mean patristic distance) ( Table 3 ). Fragmentary sequences can behave as ‘rogue’ taxa in gene tree estimates, and this has been attributed to lower signal because of fewer informative sites in short contigs relative to full length sequences ( Goloboff and Szumik, 2015 ; Hosner et al. 2016 ). We found that read depth is a significant correlate of the proportion of missing data in alignments although factors relating to the strategy used to select the target sequences used for probe design are also likely to affect the patterns of missingness in our data. For the OzBaits bait set, the target sequences comprise a single (or partial) exon per targeted gene region, while the A353 bait set uses probes that are tiled across an entire transcript ( Johnson et al. 2019 ). In the first instance, the recovered locus includes the targeted region and generally the upstream and downstream flanking regions comprising introns, UTRs and/or additional exon sequence. 
In contrast, the strategy adopted for the A353 bait set is to recover full genes including exons as well as introns/UTRs to generate ‘supercontig’ sequences ( Johnson et al. 2016 ). One implication of the above is that, while patterns of missingness should be reasonably predictable for the OzBaits loci (i.e. a ‘core’ region corresponding to the target sequence, with the proportion of missing data increasing towards the flanking regions, see for example Streicher et al. 2016 ), this will be less the case for the A353 alignments. For the latter, patterns of missingness will be heterogeneous given that the majority of best hit contigs recovered a fraction of the target sequence, there was more likely to be multiple contigs in the best hit, and the target sequences commonly span multiple exons. One possible scenario is that patterns of missingness generate biased gene trees – for example, alignment portions uniquely shared by a set of distantly related terminals could artifactually inflate shared signal and lead to erroneous phylogenetic inference ( Smith et al. 2020 , 2023 ; Uribe et al. 2022 ). Conversely, closely related terminals could share no unique site patterns due to low alignment overlap. Taken together, our results suggest that increasing proportions of missing data in alignments lead to decreased gene tree accuracy and increased bias, and that heterogeneity in these factors largely drives the ranking of genes by phylogenetic usefulness. While ranking using RF similarity suffers potential circularity ( Mongiardino Koch, 2021 ), we found that species trees recovered from each of our data sets show broadly similar topologies ( Figure 3 and Figures S1-S3) with respect to each other and to the target tree and poorly supported branches (largely associated with congeneric species groups) are generally held in common. Furthermore, collapsing poorly supported branches in the target tree did not substantially influence the ranking of genes ( Figure 2 ; Supplementary material S9). 
In addition, we found that gene rankings correlate with alternative measures of signal (average bootstrap support) that are independent of the species tree estimate. For the OzBaits data, higher average bootstrap support relative to A353 reflects the fact that the ‘best’ loci from the former data set are overwhelmingly ‘genes-flanked’ alignments, which supports the efficiency of this bait set in capturing high coverage and largely complete contigs. In contrast, unpredictable patterns of missing data recovered by the A353 bait set appear to drive variation in gene properties and as a consequence, large variation in phylogenetic usefulness. We note that to date, the processing of A353 sequencing data has predominantly used the HybPiper pipeline ( Johnson et al. 2016 ) and by comparison, Captus is recently developed and has yet to be fully evaluated. However, in a recent study using sequence data simulated on the Arabidopsis thaliana genome, it was found that Captus recovered substantially more loci than HybPiper at low read depths (sequencing depth<10X). On the other hand, at very low sequencing depths (<5X) Captus recovered a higher proportion of shorter and less accurate contig sequences relative to higher sequencing depths ( Raza et al. 2023 ). The recovery of short and inaccurate sequences as a consequence of low capture efficiency could potentially contribute to alignment and gene tree inaccuracy in our present data set. In order to explore this possibility, we ran the genesortR script on alignments that had been trimmed using the ParaGone pipeline, and separately, using TreeShrink to remove unexpectedly long branched terminals. In the case of the former, the ‘cleaned’ alignments showed lower bias and improved RF similarity but lower average values for measures of signal (proportion of variable sites and average bootstrap support). Similarly, using TreeShrink to refine alignments reduced bias but also signal. 
In general, we found no significant change in phylogenetic usefulness rankings for either the OzBaits or A353 data sets, although we found a small (but not statistically significant) decrease in average phylogenetic usefulness ranking for the OzBaits genes following both treatments (Supplementary material S9). Thus, while alignment trimming appears to reduce bias and gene tree estimation error (the objective of the ParaGone workflow is to generate high-quality gene trees for paralogy resolution) this comes at the expense of signal and in terms of the phylogenetic usefulness of genes, improvements in bias are presumably countered by lower signal. As both data sets were more-or-less equally impacted, we suspect that issues relating to the Captus pipeline and low read-depth sequencing data have little bearing on our main findings. Phylogenetic relationships of Alismatales We found that, by-and-large, species tree estimates for Alismatales were congruent across each of our data sets irrespective of the method of inference (i.e. concatenation versus a coalescent approach) with areas of disagreement largely restricted to a few poorly supported branches (e.g. the placement of Maundia triglochinoides in the OzBaits tree) ( Figure 3 ; Figures S1 and S2). While near complete support from bootstrapping or marginal posterior probabilities is expected from large scale data sets such as ours (Thompson and Brown, 2022) we found that on average, QC increased from A353 to OzBaits to our ‘best 100’ data set. Both biological and technical issues could potentially drive discordance amongst quartet topologies ( Lanfear and Hahn, 2024 ) although here, the level of agreement between concatenation and coalescent analyses suggests that incomplete lineage sorting (ILS) is not the key factor ( Hosner et al. 2016 ; Mirarab et al. 2016 ). 
Rather, we suggest that higher discordance on the A353 species tree topology largely mirrors the variation in gene properties noted above, with a high proportion of genes ranking poorly in terms of phylogenetic usefulness. More generally, these findings support the value of subsampling phylogenomic data given that our ‘best 100’ dataset produced a similarly well-resolved species tree relative to the more gene rich A353 data but shows lower internal discordance. Among recent phylogenetic analyses for Alismatales, our study is perhaps most readily comparable to that of Chen et al. (2022) who use a nuclear data set comprising c. 1000 low copy nuclear orthologs to develop a hypothesis of relationships. To the extent that our sampling overlaps (Chen et al. have a high representation of Northern Hemisphere lineages versus our Australian focus) we recovered relationships amongst families and genera that are generally congruent on our ‘best 100’ tree (compare our Figure 3 and Figure 1 of Chen et al.). Conclusion While there are several comparisons of hybrid-capture baits sets in the literature (e.g. ‘universal’ versus ‘taxon specific’, Larridon et al. 2020 ; Shah et al. 2021 ; Yardeni et al. 2022 ), we are not aware of many that have compared the design of ‘universal’ baits and how this impacts the generation of phylogenomic data (but see, for example, Hutter et al. 2022 ). Here, we formally introduce OzBaits v. 2, and contrast this with the widely used A353 panel. Both bait sets target angiosperms but adopt different approaches to bait design - the approach adopted for A353 enables the user to recover more loci and potentially full genes at a lower cost per base relative to OzBaits. On the other hand, we found that OzBaits can recover high quality data across a broad range of angiosperms and these data have high phylogenetic usefulness for Alismatales. The present study has focussed upon a single, albeit a highly diverse order of angiosperms. In recent studies (e.g. 
Zuntini et al. 2024 ) Alismatales is placed on a long branch as sister to the remaining monocots (excluding Acorales) and may therefore be considered a phylogenetically isolated taxon. In such specific cases, it has been found that the A353 bait set shows low capture efficiency for some genes likely because the distance to the probe sequences is too great ( Johnson et al. 2019 ). More generally, the utility of the A353 bait set is well-established and is not called into question here. Rather, the results of our study provide insights into possible pathways by which to increase the quality of data obtained using universal hybrid capture bait sets. Cost is a key consideration in the design of a bait set, and this is largely contingent on the number of probes required to fulfill that design. This, in turn, depends upon two (not mutually exclusive) factors: the total size (in base pairs) of the regions that are targeted, and the distance between the probes and the target DNA. Increasing the number of targeted base pairs while maintaining cost requires reducing the specificity of probes. This may come with diminishing returns as we found here for the A353 data, which included a substantial proportion of low read depth and missing data. Related to this, reducing the number of base pairs targeted could be achieved by enriching for shorter loci, such as a single exon per gene, the approach adopted for OzBaits. Significantly, we found that proxies for signal were in fact higher for the OzBaits Alismatales data ( Table 3 ), reflecting a combination of relatively high enrichment efficiency and the consistent recovery of a high proportion of the target region including flanking introns/UTRs. More studies are needed to determine whether these issues are relevant for ‘core’ angiosperm lineages that are better represented in the A353 baits design where we would expect higher capture efficiency. 
Irrespective of the above, the A353 data produced a credible estimate of Alismatales phylogeny indicating that sufficient high-quality data was obtained. However, the Alismatales includes a number of well-separated lineages that are characterised by long internal branches, a pattern that may be less challenging compared to rapid radiations where incomplete lineage sorting, gene flow and gene tree estimation errors can be conflated ( Whitfield and Lockhart, 2007 ; Cai et al. 2021 ; Morales-Briones et al. 2021 ). The OzBaits trees showed an uncertain placement for Maundia, presumably reflecting insufficient signal and/or conflict in these data. However, we found that internal discordance decreased from A353 to OzBaits to our ‘best 100’ data set, suggesting that for the latter, incongruence will more likely reflect biological processes rather than gene tree estimation error. As modern phylogenomic studies seek to tackle large scale data spanning a range of divergences, we suggest that the trade-off between more data and more accurate data will become increasingly relevant. Conflicts of interest The authors declare no conflicts of interest. Declaration of funding Cofunding for generation of the nuclear data was provided by Bioplatforms Australia (see https://bioplatforms.com/ ) as part of Genomics for Australian Plants Framework Initiative (GAP) Stage II (see https://www.genomicsforaustralianplants.com/ ). Data availability statement Sequence data are made available via the Bioplatforms Australia web portal ( https://data.bioplatforms.com/ ). Associated sequence IDs are listed in Table 1 . Supplementary Material Captions Supplementary material S1 : List of nuclear gene targets included in the OzBaits v.2 bait set. Gene names and descriptions are based upon the Arabidopsis thaliana ortholog. The targeted exon ( A. thaliana gene model), mean target length and the number of unique sequences included in the design are indicated. 
Supplementary material S2 : Reference target files used to extract target genes from each data set with the CAPTUS ‘extract’ function. Supplementary material S3-S6 : CAPTUS gene extraction heat maps and statistics for OzBaits ‘universal’ (S3), OzBaits Alismatales (S4) and A353 using Alismatales specific references (S5) and CAPTUS inbuilt mega353 (S6) references. Supplementary material S7 : Gene recovery for the Alismatales data using the OzBaits and A353 bait sets (see Figure 1 ). Supplementary material S8 : Average read depth (coverage) estimated by mapping cleaned reads back to target contigs for each sample and gene recovered by CAPTUS for the OzBaits and A353 data sets. Supplementary material S9 : Phylogenetic usefulness rankings (median[IQR]) for alignments output by CAPTUS using a fully resolved species tree; the same alignments following alignment cleaning using the ParaGone routine; and using Treeshrink to remove long terminal branches. Statistical tests using the Mann-Whitney U compare the CAPTUS output with each of the ParaGone and Treeshrink data sets separately for each of OzBaits and A353, as well as OzBaits versus A353 for the ParaGone and Treeshrink data. For each analysis, the input files (alignment, partition file, gene tree file and species tree) for GenesortR are included. Supplementary material S10 : Alignment and partition files used for phylogenetic inference for OzBaits ‘best’, A353 ‘best’ and 100 ‘best’ datasets. Supplementary material S11 : Phylogenetic usefulness rankings for genes that are shared between OzBaits and A353 bait sets. For each gene, the A353 and OzBaits identifier are used in that order, and the higher ranked version is highlighted with bold typeface. Values for the 6 properties used to derive a phylogenetic usefulness axis are also included. Figure S1 : Maximum-likelihood topologies (IQTREE) inferred from concatenated alignments for (A) OzBaits best, and (B) A353 best data sets. 
Values on the branches are support values estimated using the approximate likelihood ratio test, ultrafast bootstrap (both estimated using IQ-TREE) and quartet concordance (estimated using Quartet Sampling). Values of 100/100/1.0, respectively, are indicated by an asterisk. The GAP sample ID is indicated in brackets following the taxon label. Figure S2 : Species tree topologies inferred using ASTRAL for (A) OzBaits best, and (B) A353 best data sets. Values on branches are local posterior probabilities (LPP). The GAP sample ID is indicated in brackets following the taxon label. Figure S3 : Species tree topologies inferred using ASTRAL for the 100 best data set. Values on branches are local posterior probabilities (LPP). The GAP sample ID is indicated in brackets following the taxon label. Figure S4 : Percentile plots showing the percentage of genes with a phylogenetic usefulness ranked value lower than x, for ‘best’ OzBaits and A353 data sets, along with rankings for each data set estimated following the alignment cleaning routines implemented by the ParaGone pipeline. Figure S5 : Percentile plots showing the percentage of genes with a phylogenetic usefulness ranked value lower than x, for ‘best’ OzBaits and A353 data sets, along with rankings for each data set estimated following the removal of abnormally long branches among gene trees using TreeShrink. Funder Information Declared Cofunding for generation of the DNA sequence data was provided by Bioplatforms Australia (see https://bioplatforms.com/) as part of Genomics for Australian Plants Framework Initiative (GAP) Stage II (see https://www.genomicsforaustralianplants.com/). References ↵ Andermann T , Cano Á , Zizka A , Bacon C , Antonelli A ( 2018 ) SECAPR-A bioinformatics pipeline for the rapid and user-friendly processing of targeted enriched Illumina sequences, from raw reads to alignments . PeerJ 2018 , e5175 . 
doi: 10.7717/peerj.5175 OpenUrl CrossRef ↵ Anisimova M , Gil M , Dufayard J-F , Dessimoz C , Gascuel O ( 2011 ) Survey of branch support methods demonstrates accuracy, power, and robustness of fast likelihood-based approximation schemes . Systematic Biology 60 , 685 – 699 . OpenUrl CrossRef PubMed ↵ Baker WJ , Bailey P , Barber V , Barker A , Bellot S , Bishop D , Botigue LR , Brewer G , Carruthers T , Clarkson JJ , Cook J , Cowan RS , Dodsworth S , Epitawalage N , Francoso E , Gallego B , Johnson M , Kim JT , Leempoel K , Maurin O , McGinnie C , Pokorny L , Roy S , Stone M , Toledo E , Wickett NJ , Zuntini AR , Eiserhardt WL , Kersey PJ , Leitch IJ , Forest F ( 2022 ) A comprehensive phylogenomic platform for exploring the angiosperm tree of life . Systematic Biology 71 , 301 – 319 . OpenUrl CrossRef PubMed ↵ Breinholt JW , Carey SB , Tiley GP , Davis EC , Endara L , McDaniel SF , Neves LG , Sessa EB , von Konrat M , Chantanaorrapint S , Fawcett S ( 2021 ) A target enrichment probe set for resolving the flagellate land plant tree of life . Applications in Plant Sciences 9 : e11406 . OpenUrl ↵ Buddenhagen C , Lemmon AR , Lemmon EM , Bruhl J , Cappa J , Clement WL , Donoghue MJ , Edwards EJ , Hipp AL , Kortyna M , Mitchell N ( 2016 ) Anchored phylogenomics of angiosperms I: Assessing the robustness of phylogenetic estimates . bioRxiv 086298 [Preprint]. Posted 28 November 2016 ↵ Cai L , Xi Z , Lemmon EM , Lemmon AR , Mast A , Buddenhagen CE , Liu L , Davis CC ( 2021 ) The perfect storm: gene tree estimation error, incomplete lineage sorting, and ancient gene flow explain the most recalcitrant ancient angiosperm clade, Malpighiales . Systematic Biology 70 , 491 – 507 . OpenUrl CrossRef PubMed ↵ Capella-Gutiérrez S , Silla-Martínez JM , Gabaldón T ( 2009 ) trimAl: a tool for automated alignment trimming in large-scale phylogenetic analyses . Bioinformatics 25 , 1972 – 1973 . 
OpenUrl CrossRef PubMed Web of Science ↵ Chen L-Y , Lu B , Morales-Briones DF , Moody ML , Liu F , Hu GW , Huang C-H , Chen J-M , Wang QF ( 2022 ) Phylogenomic analyses of Alismatales shed light into adaptations to aquatic environments . Molecular Biology and Evolution 39 , msac079 , doi: 10.1093/molbev/msac079 OpenUrl CrossRef PubMed ↵ Cheng C , Krishnakumar V , Chan AP , Thibaud-Nissen F , Schobel S , Town CD ( 2017 ) Araport11: a complete reannotation of the Arabidopsis thaliana reference genome . The Plant Journal , 89 , 789 – 804 . doi: 10.1111/tpj.13415 OpenUrl CrossRef PubMed Choi B , Crisp MD , Cook LG , Edwards RD , Toon A , Külheim C ( 2019 ) Identifying genetic markers for a range of phylogenetic levels–from species to family level . PLoS One 14 , e0218995 . OpenUrl PubMed ↵ Chernomor O , von Haeseler A , Minh BQ ( 2016 ) Terrace aware data structure for phylogenomic inference from supermatrices . Systematic Biology 65 : 997 – 1008 . OpenUrl CrossRef PubMed ↵ Di Franco A , Poujol R , Baur D , Philippe H ( 2019 ) Evaluating the usefulness of alignment filtering methods to reduce the impact of errors on evolutionary inferences . BMC Evolutionary Biology 19 , 21 . OpenUrl PubMed ↵ Duarte JM , Wall PK , Edger PP , Landherr LL , Ma H , Pires PK , Leebens-Mack J , Claude WD ( 2010 ) Identification of shared single copy nuclear genes in Arabidopsis, Populus, Vitis and Oryza and their phylogenetic utility across various taxonomic levels . BMC Evolutionary Biology 10 , 61 . OpenUrl PubMed ↵ Fonseca LH , Carlsen MM , Fine PV , Lohmann LG ( 2023 ) A nuclear target sequence capture probe set for phylogeny reconstruction of the charismatic plant family Bignoniaceae . Frontiers in Genetics 9 , 1085692 . OpenUrl ↵ Goloboff PA , Szumik CA ( 2015 ) Identifying unstable taxa: efficient implementation of triplet-based measures of stability, and comparison with Phyutility and RogueNaRok . Molecular Phylogenetics and Evolution 88 , 93 – 104 . 
OpenUrl CrossRef PubMed ↵ Grover CE , Salmon A , Wendel JF ( 2012 ) Targeted sequence capture as a powerful tool for evolutionary analysis . American Journal of Botany 99 , 312 – 319 . OpenUrl Abstract / FREE Full Text ↵ Guindon S , Dufayard J-F , Lefort V , Anisimova M , Hordijk W , Gascuel O ( 2010 ) New algorithms and methods to estimate maximum-likelihood phylogenies: Assessing the performance of PhyML 3.0 . Systematic Biology 59 , 307 – 321 . OpenUrl CrossRef PubMed Web of Science ↵ Hammer Ø , Harper DA ( 2001 ) Past: paleontological statistics software package for education and data analysis . Palaeontologia Electronica 4 , p. 1 . OpenUrl ↵ Hancock-Hanser BL , Frey A , Leslie MS , Dutton PH , Archer FI , Morin PA ( 2013 ) Targeted multiplex next-generation sequencing: advances in techniques of mitochondrial and nuclear DNA sequencing for population genomics . Molecular Ecology Resources 13 , 254 – 268 . OpenUrl PubMed ↵ Hill CB , Wong D , Tibbits J , Forrest K , Hayden M , Zhang XQ , Westcott S , Angessa TT , Li C ( 2019 ) Targeted enrichment by solution-based hybrid capture to identify genetic sequence variants in barley . Scientific Data 6 , 12 . OpenUrl PubMed ↵ Hosner PA , Faircloth BC , Glenn TC , Braun EL , Kimball RT ( 2016 ) Avoiding Missing Data Biases in Phylogenomic Inference: An Empirical Study in the Landfowl (Aves: Galliformes) . Molecular Biology and Evolution 33 , 1110 – 1125 . OpenUrl CrossRef PubMed ↵ Hutter CR , Cobb KA , Portik DM , Travers SL , Wood Jr PL , Brown RM ( 2022 ) FrogCap: A modular sequence capture probe-set for phylogenomics and population genetics for all frogs, assessed across multiple phylogenetic scales . Molecular Ecology Resources 22 , 1100 – 1119 . OpenUrl PubMed ↵ Jackson C , McLay T , Schmidt-Lebuhn AN ( 2023 ) hybpiper-nf and paragone-nf: Containerization and additional options for target capture assembly and paralog resolution . Applications in Plant Sciences 11 , e11532 . 
OpenUrl PubMed ↵ Johnson MG , Gardner EM , Liu Y , Medina R , Goffinet B , Shaw AJ , Zerega NJ , Wickett NJ ( 2016 ) HybPiper: extracting coding sequence and introns for phylogenetics from high-throughput sequencing reads using target enrichment . Applications in Plant Sciences 4 , 1600016 . OpenUrl ↵ Johnson MG , Pokorny L , Dodsworth S , Botigué LR , Cowan RS , Devault A , Eiserhardt WL , Epitawalage N , Forest F , Kim JT , Leebens-Mack JH ( 2019 ) A universal probe set for targeted sequencing of 353 nuclear genes from any flowering plant designed using k-medoids clustering . Systematic Biology 68 , 594 – 606 . OpenUrl CrossRef PubMed ↵ Katoh K , Standley DM ( 2013 ) MAFFT multiple sequence alignment software version 7: improvements in performance and usability . Molecular Biology and Evolution 30 , 772 – 780 . OpenUrl CrossRef PubMed Web of Science ↵ Kalyaanamoorthy S , Minh BQ , Wong TKF , von Haeseler A , Jermiin LS ( 2017 ) ModelFinder: fast model selection for accurate phylogenetic estimates . Nature Methods 14 , 587 – 589 . OpenUrl PubMed ↵ Keller O , Odronitz F , Stanke M , Kollmar M , Waack S ( 2008 ) Scipio: Using protein sequences to determine the precise exon/intron structures of genes and their orthologs in closely related species . BMC Bioinformatics 9 , 278 . OpenUrl CrossRef PubMed ↵ Khan R , Biffin E , van Dijk KJ , Hill RS , Liu J , Waycott M ( 2024 ) Development of a target enrichment probe set for conifer (REMcon) . Biology , 13 ( 6 ), p. 361 . OpenUrl PubMed Grover CE , Salmon A , Wendel JF ( 2012 ) Targeted sequence capture as a powerful tool for evolutionary analysis . American Journal of Botany 99 , 312 – 319 . OpenUrl Abstract / FREE Full Text ↵ Lanfear R , Calcott B , Kainer D , Mayer C , Stamatakis A ( 2014 ) Selecting optimal partitioning schemes for phylogenomic datasets . BMC Ecology and Evolution 14 , 82 . OpenUrl ↵ Lanfear R , Hahn M ( 2024 ) The meaning and measure of concordance factors in phylogenomics . EcoEvoRxiv . 
↵ Larridon I , Villaverde T , Zuntini AR , Pokorny L , Brewer GE , Epitawalage N , Fairlie I , Hahn M , Kim J , Maguilla E , Maurin O ( 2020 ) Tackling rapid radiations with targeted sequencing . Frontiers in Plant Science 10 , 1655 . OpenUrl PubMed ↵ Leebens-Mack JH , Barker MS , Carpenter EJ , Deyholos MK , Gitzendanner MA , Graham SW , Grosse I , Li Z , Melkonian M , Mirarab S , et al. ( 2019 ) One thousand plant transcriptomes and the phylogenomics of green plants . Nature 574 , 679 – 685 . OpenUrl CrossRef PubMed ↵ Li D , Liu CM , Luo R , Sadakane K , Lam TW ( 2015 ) MEGAHIT: an ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph . Bioinformatics 31 , 1674 – 1676 . OpenUrl CrossRef PubMed ↵ Li H , Durbin R ( 2010 ) Fast and accurate long-read alignment with Burrows–Wheeler transform . Bioinformatics 26 , 589 – 595 . OpenUrl CrossRef PubMed Web of Science Li H , Handsaker B , Wysoker A , Fennell T , Ruan J , Homer N , Marth G , Abecasis G , Durbin R ( 2009 ) The sequence alignment/map format and SAMtools . Bioinformatics 25 , 2078 – 2079 . OpenUrl CrossRef PubMed Web of Science ↵ Li W , Godzik A ( 2006 ) Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences . Bioinformatics 22 , 1658 – 1659 . OpenUrl CrossRef PubMed Web of Science ↵ Liu Y , Johnson MG , Cox CJ , Medina R , Devos N , Vanderpoorten A , Hedenäs L , Bell NE , Shevock JR , Aguero B , Quandt D ( 2019 ) Resolution of the ordinal phylogeny of mosses using targeted exons from organellar and nuclear genomes . Nature Communications 10 , 1485 . OpenUrl PubMed ↵ Mai U , Mirarab S ( 2018 ) TreeShrink: fast and accurate detection of outlier long branches in collections of phylogenetic trees . BMC Genomics 19 ( Suppl 5 ), 272 . OpenUrl CrossRef PubMed ↵ McKain MR , Johnson MG , Uribe-Convers S , Eaton D , Yang Y ( 2018 ) Practical considerations for plant phylogenomics . 
Applications in Plant Sciences 6 , e1038 . OpenUrl PubMed ↵ McLay TGB , Birch L , Gunn BF , Ning W , Tate JA , Nauheimer L , Joyce EM , Simpson L , Schmidt-Lebuhn AN , Baker WJ , Forest F , Jackson CJ ( 2021 ) New targets acquired: Improving locus recovery from the Angiosperms353 probe set . Applications in Plant Sciences 9 , aps3.11420 . OpenUrl ↵ Michel T , Tseng YH , Wilson H , Chung KF , Thomas DC , Kidner C ( 2022 ) A hybrid capture bait set for Begonia . Edinburgh Journal of Botany 79 , 1 – 33 . OpenUrl ↵ Minh BQ , Nguyen MAT , von Haeseler A ( 2013 ) Ultrafast approximation for phylogenetic bootstrap . Molecular Biology and Evolution 30 , 1188 – 1195 . OpenUrl CrossRef PubMed Web of Science ↵ Mirarab S , Bayzid MS , Warnow T ( 2016 ) Evaluating summary methods for multilocus species tree estimation in the presence of incomplete lineage sorting . Systematic Biology 65 , 366 – 380 . OpenUrl CrossRef PubMed ↵ Mongiardino Koch N ( 2021 ) Phylogenomic subsampling and the search for phylogenetically reliable loci . Molecular Biology and Evolution 38 , 4025 – 4038 . OpenUrl CrossRef PubMed ↵ Mongiardino Koch N , Thompson JR ( 2021 ) A total-evidence dated phylogeny of Echinoidea combining phylogenomic and paleontological data . Systematic Biology , 70 : 421 – 439 . OpenUrl CrossRef PubMed ↵ Morales-Briones DF , Kadereit G , Tefarikis DT , Moore MJ , Smith SA , Brockington SF , Timoneda A , Yim WC , Cushman JC , Yang Y ( 2021 ) Disentangling sources of gene tree discordance in phylogenomic data sets: testing ancient hybridizations in Amaranthaceae s.l . Systematic Biology 70 , 219 – 235 . OpenUrl CrossRef PubMed ↵ Nguyen L-T , Schmidt HA , von Haeseler A , Minh BQ ( 2015 ) IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies . Molecular Biology and Evolution 32 , 268 – 274 . 
OpenUrl CrossRef PubMed Ortiz EM , Hoewener A , Shigita G , Raza M , Maurin O , Zuntini A , Forest F , Baker WJ , Schaefer H ( 2023 ) A novel phylogenomics pipeline reveals complex pattern of reticulate evolution in Cucurbitales . BioRxiv , 2023 – 10 . ↵ Pease JB , Brown JW , Walker JF , Hinchliff CE , Smith SA ( 2018 ) Quartet Sampling distinguishes lack of support from conflicting support in the green plant tree of life . American Journal of Botany 105 , 385 – 403 . OpenUrl CrossRef PubMed ↵ Portik DM , Smith LL , Bi K ( 2016 ) An evaluation of transcriptome-based exon capture for frog phylogenomics across multiple scales of divergence (Class: Amphibia, Order: Anura) . Molecular Ecology Resources , 16 , 1069 – 1083 . OpenUrl PubMed ↵ Raza M , Ortiz EM , Schwung L , Shigita G , Schaefer H ( 2023 ) Resolving the phylogeny of Thladiantha (Cucurbitaceae) with three different target capture pipelines . BMC Ecology and Evolution 23 , 75 . OpenUrl ↵ Sablok G , Hayward R , Davey PA , Santos RP , Schliep M , Larkum A , Pernice M , Dolferus R , Ralph PJ ( 2018 ) SeagrassDB: An open-source transcriptomics landscape for phylogenetically profiled seagrasses and aquatic plants . Scientific Reports 8 , 2749 . OpenUrl PubMed ↵ Sayyari E , Whitfield JB , Mirarab S ( 2017 ) Fragmentary gene sequences negatively impact gene tree and species tree reconstruction . Molecular Biology and Evolution 34 , 3279 – 3291 . OpenUrl CrossRef PubMed ↵ Shah T , Schneider JV , Zizka G , Maurin O , Baker W , Forest F , Brewer GE , Savolainen V , Darbyshire I , Larridon I ( 2021 ) Joining forces in Ochnaceae phylogenomics: a tale of two targeted sequencing probe kits . American Journal of Botany 108 , 1201 – 1216 . OpenUrl CrossRef PubMed ↵ Smith BT , Mauck WM , Benz BW , Andersen MJ ( 2020 ) Uneven missing data skew phylogenomic relationships within the lories and lorikeets . Genome Biology and Evolution 12 , 1131 – 1147 . 
OpenUrl CrossRef PubMed ↵ Smith BT , Merwin J , Provost KL , Thom G , Brumfield RT , Ferreira M , Mauck III WM , Moyle RG , Wright TF , Joseph L ( 2023 ) Phylogenomic analysis of the parrots of the world distinguishes artifactual from biological sources of gene tree discordance . Systematic Biology 72 , 228 – 241 . OpenUrl CrossRef PubMed ↵ Steenwyk JL , Buida TJ , Li Y , Shen X-X , Rokas A ( 2020 ) ClipKIT: a multiple sequence alignment trimming software for accurate phylogenomic inference . PLoS Biology 18 , e3001007 . OpenUrl CrossRef PubMed ↵ Streicher JW , Schulte JA , Wiens JJ ( 2016 ) How should genes and taxa be sampled for phylogenomic analyses with missing data? An empirical study in iguanian lizards . Systematic Biology 65 , 128 – 145 . OpenUrl CrossRef PubMed Thomson RC , Brown JM ( 2022 ) On the need for new measures of phylogenomic support . Systematic Biology 71 , 917 – 920 . OpenUrl PubMed ↵ Uribe JE , González VL , Irisarri I , Kano Y , Herbert DG , Strong EE , Harasewych MG ( 2022 ) A phylogenomic backbone for gastropod molluscs . Systematic Biology 71 , 1271 – 1280 . OpenUrl PubMed ↵ Waycott M , van Dijk KJ , Biffin E ( 2021 ) A hybrid capture RNA bait set for resolving genetic and evolutionary relationships in angiosperms from deep phylogeny to intraspecific lineage hybridization . BioRxiv . 7 : 2021 – 09 . OpenUrl ↵ Weitemier K , Straub SC , Cronn RC , Fishbein M , Schmickl R , McDonnell A , Liston A ( 2014 ) Hyb-Seq: Combining target enrichment and genome skimming for plant phylogenomics . Applications in Plant Sciences 2 , 1400042 . OpenUrl ↵ Whitfield JB , Lockhart PJ ( 2007 ) Deciphering ancient rapid radiations . Trends in Ecology and Evolution 22 , 258 – 265 . OpenUrl ↵ Wilson A ( 2011 ) Flora of Australia: Volume 39 Alismatales to Arales . CSIRO Publishing/ABRS, Australian Government Department of Sustainability, Environment, Water, Population and Communities . 
↵ Wolf PG , Robison TA , Johnson MG , Sundue MA , Testo WL , Rothfels CJ ( 2018 ) Target sequence capture of nuclear-encoded genes for phylogenetic analysis of ferns . Applications in Plant Sciences 6 , e01148 . OpenUrl PubMed ↵ Yang Y , Smith SA ( 2014 ) Orthology inference in nonmodel organisms using transcriptomes and low-coverage genomes: improving accuracy and matrix occupancy for phylogenomics . Molecular Biology and Evolution 31 , 3081 – 3092 . OpenUrl CrossRef PubMed ↵ Yardeni G , Viruel J , Paris M , Hess J , Crego CG , de La Harpe M , Rivera N , Barfuss MH , Till W , Guzmán-Jacob V , Krömer T ( 2022 ) Taxon-specific or universal? Using target capture to study the evolutionary history of a rapid radiation . Molecular Ecology Resources 22 , 927 – 945 . OpenUrl PubMed ↵ Zhang C , Mirarab S ( 2022 ) Weighting by gene tree uncertainty improves accuracy of quartet-based species trees . Molecular Biology and Evolution 39 , msac215 . OpenUrl CrossRef PubMed ↵ Zuntini AR , Carruthers T , Maurin O , Bailey PC , Leempoel K , Brewer GE , Epitawalage N , Françoso E , Gallego-Paramo B , McGinnie C , Negrão R ( 2024 ) Phylogenomics and the rise of the angiosperms . Nature 629 , 843 – 850 . OpenUrl View the discussion thread. Back to top Previous Next Posted September 16, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A comparison of two universal angiosperm bait sets and the phylogenomics of Alismatales Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A comparison of two universal angiosperm bait sets and the phylogenomics of Alismatales Ed Biffin , Michelle Waycott , Timothy A. Hammer , Kor-jent van Dijk bioRxiv 2025.09.15.676180; doi: https://doi.org/10.1101/2025.09.15.676180 Share This Article: Copy Citation Tools A comparison of two universal angiosperm bait sets and the phylogenomics of Alismatales Ed Biffin , Michelle Waycott , Timothy A. Hammer , Kor-jent van Dijk bioRxiv 2025.09.15.676180; doi: https://doi.org/10.1101/2025.09.15.676180 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Evolutionary Biology Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41936) Biophysics (21452) Cancer Biology (18588) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15153) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00