Full text
92,806 characters
· extracted from
preprint-html
· click to expand
Scalable single-cell metagenomic analysis with Bascet and Zorn | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Scalable single-cell metagenomic analysis with Bascet and Zorn View ORCID Profile Hadrien Gourlé , View ORCID Profile Iryna Yakovenko , View ORCID Profile Jyoti Verma , View ORCID Profile Julian Dicken , View ORCID Profile Florian Albrecht , View ORCID Profile Linas Mažutis , View ORCID Profile Johan Normark , View ORCID Profile Nongfei Sheng , View ORCID Profile Nicklas Strömberg , View ORCID Profile Tommy Löfstedt , View ORCID Profile Laura M. Carroll , View ORCID Profile Johan Henriksson doi: https://doi.org/10.1101/2025.06.20.660799 Hadrien Gourlé 1 Department of Clinical Microbiology, SciLifeLab, Umeå University , Umeå, Sweden 2 Laboratory for Molecular Infection Medicine Sweden (MIMS), Umeå University , Umeå, Sweden 3 Umeå Centre for Microbial Research (UCMR), Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hadrien Gourlé Iryna Yakovenko 1 Department of Clinical Microbiology, SciLifeLab, Umeå University , Umeå, Sweden 2 Laboratory for Molecular Infection Medicine Sweden (MIMS), Umeå University , Umeå, Sweden 3 Umeå Centre for Microbial Research (UCMR), Umeå University , Umeå, Sweden 4 Department of Molecular Biology, SciLifeLab, Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Iryna Yakovenko Jyoti Verma 1 Department of Clinical Microbiology, SciLifeLab, Umeå University , Umeå, Sweden 2 Laboratory for Molecular Infection Medicine Sweden (MIMS), Umeå University , Umeå, Sweden 3 Umeå Centre for Microbial Research (UCMR), Umeå University , Umeå, Sweden 4 Department of Molecular Biology, SciLifeLab, Umeå University , Umeå, Sweden 5 Integrated Science Lab (IceLab), Umeå University , Umeå, Sweden 6 Department of Odontology, Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jyoti Verma Julian Dicken 1 Department of Clinical Microbiology, SciLifeLab, Umeå University , Umeå, Sweden 4 Department of Molecular Biology, SciLifeLab, Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Julian Dicken Florian Albrecht 2 Laboratory for Molecular Infection Medicine Sweden (MIMS), Umeå University , Umeå, Sweden 3 Umeå Centre for Microbial Research (UCMR), Umeå University , Umeå, Sweden 4 Department of Molecular Biology, SciLifeLab, Umeå University , Umeå, Sweden 5 Integrated Science Lab (IceLab), Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Florian Albrecht Linas Mažutis 4 Department of Molecular Biology, SciLifeLab, Umeå University , Umeå, Sweden 7 Institute of Biotechnology, Life Sciences Centre, Vilnius University , 7 Sauletekio av., Vilnius, LT-10257, Lithuania Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Linas Mažutis Johan Normark 1 Department of Clinical Microbiology, SciLifeLab, Umeå University , Umeå, Sweden 2 Laboratory for Molecular Infection Medicine Sweden (MIMS), Umeå University , Umeå, Sweden 3 Umeå Centre for Microbial Research (UCMR), Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Johan Normark Nongfei Sheng 6 Department of Odontology, Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nongfei Sheng Nicklas Strömberg 3 Umeå Centre for Microbial Research (UCMR), Umeå University , Umeå, Sweden 6 Department of Odontology, Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nicklas Strömberg Tommy Löfstedt 8 Department of Computing Science, Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tommy Löfstedt Laura M. Carroll 1 Department of Clinical Microbiology, SciLifeLab, Umeå University , Umeå, Sweden 2 Laboratory for Molecular Infection Medicine Sweden (MIMS), Umeå University , Umeå, Sweden 3 Umeå Centre for Microbial Research (UCMR), Umeå University , Umeå, Sweden 5 Integrated Science Lab (IceLab), Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Laura M. Carroll For correspondence: laura.carroll{at}umu.se johan.henriksson{at}umu.se Johan Henriksson 3 Umeå Centre for Microbial Research (UCMR), Umeå University , Umeå, Sweden 4 Department of Molecular Biology, SciLifeLab, Umeå University , Umeå, Sweden 5 Integrated Science Lab (IceLab), Umeå University , Umeå, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Johan Henriksson For correspondence: laura.carroll{at}umu.se johan.henriksson{at}umu.se Abstract Full Text Info/History Metrics Supplementary material Preview PDF ABSTRACT Single-cell metagenomic sequencing (scMetaG) can provide maximum-resolution insights into complex microbial communities. However, existing bioinformatic tools are not equipped to handle the massive amounts of data generated by novel high-throughput scMetaG methods. Here, we present a bioinformatic toolkit for complete, end-to-end scMetaG analysis: (i) Bascet, a command-line suite designed to scale to massive scMetaG datasets (≥1 million cells); (ii) Zorn, an R package/workflow manager that enables reproducible scMetaG data analysis, exploration, and visualization ( http://zorn.henlab.org/ ). Enabled by recent advances in droplet microfluidics, we use Bascet and Zorn to develop and optimize a high-throughput scMetaG method on a ten-species mock community. To showcase their utility on a real-world sample, we use Bascet and Zorn to characterize a human saliva sample, generating single-amplified genomes (SAGs) from >10k prokaryotic cells. Overall, Bascet and Zorn enable reproducible scMetaG analysis, allowing users to query microbiomes at unprecedented resolution and scale. INTRODUCTION Determining “who’s there” in a microbial community (e.g., which species/strains are present; which genes they carry) remains a fundamental, yet non-trivial question in microbiome research. 1 , 2 To this end, single-cell metagenomic sequencing (scMetaG) methods can provide maximum-resolution insights into complex microbial communities. 1 , 3 , 4 Also known as single-cell whole-genome sequencing (scWGS) or single-cell DNA-sequencing (scDNA-seq), scMetaG generates sequencing reads, which are linked to their microbial cell of origin. Because the single-cell origin of each sequencing read is known, scMetaG can uncover heterogeneity, which bulk methods like shotgun metagenomics lack the resolution to capture (e.g., by differentiating closely related strains; by identifying pan-genomic differences among cells). 1 , 3 Until recently, scMetaG has been carried out in a multiwell (e.g., 96-well) plate format, 5 making it low-throughput and prohibitively expensive. However, droplet scMetaG methods, with the potential to query millions of cells per sample, have recently emerged. 3 , 6 , 7 Droplet scMetaG methods encapsulate single bacterial cells in e.g., oil-in-water emulsion droplets (e.g., 10x Genomics), or more recently, semi-permeable capsules (SPCs; Atrandi Biosciences), aiming to make scMetaG scalable and economically feasible. 3 , 6 , 7 Should it be possible to scale existing droplet scMetaG methods to their theoretical limits, microbiologists would face substantial challenges analyzing the resulting data. Currently, there is no dedicated software available for scMetaG analysis, and existing bioinformatic tools are not equipped to handle this novel data modality. For example, all software for eukaryotic single-cell analysis (e.g., scRNA-seq, spatial transcriptomics) rely on a single reference genome and thus cannot be used for scMetaG. Likewise, methods for bulk metaG/bacterial WGS analysis cannot scale to the massive amounts of data that droplet scMetaG methods have the potential to generate without major modifications. For example, in the scRNA-seq space, commercial kits have the capacity to generate 10 6 single-cell transcriptomes per run; for comparison, there are 10 6 total assembled bacterial genomes in the NCBI Assembly database ( https://www.ncbi.nlm.nih.gov/assembly/ , accessed 2 May 2025). 8 Thus, existing single-cell and bulk analysis software are not designed for the throughput enabled by droplet scMetaG methods. Here, we present Bascet and Zorn, a bioinformatic toolkit for scalable, reproducible, end-to-end scMetaG analysis. We describe the overall design of Bascet and Zorn, and we outline major steps in a typical analysis workflow. Enabled by recent advances in droplet microfluidics, we use Bascet and Zorn to optimize a novel, high-throughput, SPC-based scMetaG method on a mock community. Finally, to showcase their utility on a real-world sample, we use Bascet and Zorn to analyze scMetaG data from a human saliva sample, generating single-amplified genomes (SAGs) from >10k prokaryotic cells. Overall, Bascet and Zorn enable reproducible scMetaG analysis, allowing users to query microbiomes at unprecedented resolution and scale. RESULTS Bascet and Zorn enable scalable processing of single-cell data Recent single-cell and droplet microfluidics advances indicate that it is now realistic to generate scMetaG libraries for up to 10 6 prokaryotic cells in a single run ( Fig 1a ). Download figure Open in new tab Figure 1. Bascet and Zorn enable large-scale scMetaG analysis. (a) Expected average coverage as a function of the number of cells and genome size, filling a Novaseq X 25B flow cell. (b) Reads are processed and sorted into separate shards for fast parallel processing. (c) Typical overall scMetaG workflow, where the heavy preprocessing is performed by Bascet/Zorn, and most high-level processing by Seurat/Signac. The output is typically clustered and dimensionally reduced cells, where metadata and specialized analysis of each cell can be plotted to draw high-level conclusions. (d) Bascet introduces several interchangeable file formats under a common API. (e) Bascet/Zorn support a general map-reduce procedure for applying a function (calculation) to each cell, both functioning as a minibatching workflow manager, as well as avoiding the creation of large numbers of files. (f) To integrate with popular single-cell analysis tools, Bascet/Zorn generate features out of reads or contigs. Features are aggregated in a count matrix, handled similar to scRNA-seq or ATAC-seq data. However, existing bioinformatic tools for bacterial WGS analysis are not designed to scale to massive amounts of this novel type of data. For example, cluster job managers do not cope well with many small jobs (e.g., SLURM is frequently configured to disallow large job arrays), and small files incur excessive random I/O that filesystems are not designed for. Naively partitioning paired-end reads from each cell in one scMetaG sample with 10 6 cells would produce 2·10 6 files for paired-end read storage alone; if one desires to perform additional analyses (e.g., SAG assembly, quality control [QC], annotation) and/or analyze >1 sample, the number of files required to store this data increases rapidly (e.g., ≥10 7 files), hindering the performance of most modern filesystems and software. To overcome these challenges, we developed Bascet, a highly scalable, open-source single-cell analysis pipeline, written in Rust. Conceptually similar to Cell Ranger (10x Genomics), Bascet is designed to perform common single-cell analysis tasks (e.g., barcode identification, read trimming/filtering, QC), using massive amounts of raw single-cell data in (gzipped) FASTQ format as input ( Fig 1b ). However, unlike Cell Ranger, Bascet is agnostic to sequencing platform/chemistry, allowing it to accommodate diverse scMetaG protocols and the increasing number of new single-cell products coming to market. Further, additional scMetaG-specific methods/tools are incorporated into Bascet (for e.g., taxonomic classification, de novo assembly, genome annotation), including novel methods for dimensionality reduction and data visualization (detailed in subsequent sections; Fig 1c ). To make Bascet accessible and provide users with a familiar interface, we developed Zorn, a companion R package ( Fig 1c ). In addition to providing an interface between Bascet and popular single-cell analysis methods in R (detailed below), Zorn is a full workflow manager, similar to Nextflow, 9 enabling reproducible analysis. Because Bascet and Zorn are designed to be used together, we will refer to their combination hereafter as “Bascet/Zorn” when discussing methods that involve both tools. Bascet/Zorn utilize several state-of-the-art strategies to enable efficient scMetaG analysis ( Fig 1bde ). First, we developed a novel family of file formats (i.e., Bascet-formatted files), all with a similar structure and unified under the common Bascet API, including: (i) Bascet-ZIP, a general-purpose format, which enables rapid storage/access to any set of files, using the fast Facebook Zstandard compression algorithm; (ii) Bascet-TIRP (Tabix-Indexed Read Pairs), a BED-like format for storing debarcoded sequencing reads; (iii) Bascet-FASTQ, designed to simplify interactions with tools that use FASTQ files as input (e.g., BWA, Kraken2); 10 , 11 (iv) Bascet-BAM, a BAM/CRAM-like format 12 produced by aligners wrapped into Bascet (detailed below; Fig 1d ). Together, Bascet-formatted files limit data storage to a small number of files, thereby reducing small I/O operations and massively improving compute performance. Building upon these novel file formats, we have further implemented several strategies to enable efficient computation ( Fig 1e ). Unlike typical scRNA-seq protocols, most operations in the prokaryotic WGS space (e.g., de novo assembly, genome annotation) are performed on a “genome-by-genome” basis, in which sequencing reads, contigs, or scaffolds from a single prokaryotic isolate (or here, cell) are queried individually, one at a time. To enable analogous “cell-by-cell” operations here, we utilized the map-reduce concept from large-scale computing and functional programming ( Fig 1e ). Specifically, the Bascet map-operation applies a given function (a shell script or Rust function) over each cell, hiding how the data is actually stored from the user. This enables future addition of specialized file formats for increased performance and compression. Finally, to maximize computational efficiency and enable scalable computing, Bascet/Zorn borrow the concept of “sharding” (horizontal partitioning) from databases ( Fig 1be ). Within this framework, data for a given cell is confined to a single “shard”. With each shard holding data for 1–10k cells, millions of cells can be managed without creating too many files for the filesystem to manage. This also reduces scheduling overhead for cluster job managers (e.g., SLURM), as each shard is a natural minibatch. Altogether, Bascet/Zorn enable efficient, flexible scMetaG analysis via novel high-performance file formats (to reduce the number of files and heavy I/O), map-reduce operations (to enable efficient cell-by-cell operations), sharding (to maximize computational resource usage), and chemistry/sequencing platform-agnostic design (to accommodate the increasing number of sequencing methods being developed). Bascet and Zorn enable flexible microbial single-cell analysis and integrate with popular eukaryotic single-cell tools A typical user will interact with Bascet/Zorn via Zorn in R, with minimal need for the command-line. While we envision that most users will follow a standard workflow ( Fig 1c ), the processing steps described below are modular and can be added/omitted as needed. Briefly, users supply raw reads in FASTQ format to Bascet/Zorn as input. Raw reads are then debarcoded and trimmed in an instrument-specific manner. At this step, users pick a suitable number of shards depending on how many computers are available for parallel processing ( Fig 1b ). This is followed by optional reference genome alignment, enabling e.g., host DNA removal ( Fig 1c ). After debarcoding and trimming, we recommend users perform several QC and filtration steps via Bascet/Zorn, specifically (i) taxonomic classification and (ii) doublet/contaminant detection and removal ( Fig 1c ). To (i) perform a rough taxonomic classification of their cells, users can employ either (a) an “alignment-based” classification step, in which reads are aligned to user-selected reference genomes via BWA (if sample composition is known), or (b) a “ de novo ” classification step, in which reads are assigned to taxa via Kraken2 (if sample composition is unknown). 11 Both taxonomic classification steps allow users to identify potential biases (e.g., due to poor lysis of some taxa, poor sequencing coverage, contamination). Following taxonomic classification, users can (ii) detect and remove doublets (droplets/SPCs with >1 cell, from droplet instrument overloading or cell aggregation) and/or cells contaminated with free-floating DNA, using existing R packages (e.g., scDblFinder). 13 Low-quality libraries can then be omitted from further analysis ( Fig 1c ). After QC and filtration, users may wish to cluster and visualize cells/samples (e.g., via a UMAP). 14 To construct features for clustering analysis, users can choose from four workflows: (i) alignment , (ii) k-mer database , (iii) informative k-mer , and/or (iv) k-mer count sketch ( Fig 1f ). The (i) alignment workflow allows users to map reads and/or call variants relative to user-specified reference genome(s) via BWA and CellSNP-lite. 15 The resulting read counts and/or SNPs serve as features for clustering and UMAP construction. Because the alignment workflow relies on reference-based mapping approaches, we anticipate that it will be most useful for comparing highly similar cells (e.g., members of the same species). For users interested in comparing more dissimilar cells (e.g., members of different species) or samples of unknown composition, we developed three k -mer-based workflows: (ii) a Kraken2-based k-mer database workflow, (iii) the MinHash 16 -inspired informative k-mer workflow, and (iv) a novel k-mer count sketch workflow (each detailed in subsequent sections; Fig 1f ). All workflows enable UMAP construction and related clustering analyses, as Bascet/Zorn interoperate seamlessly with popular single-cell tools (e.g., Seurat, Signac). 17 , 18 In addition to read-based analyses, users may also assemble SAGs de novo using single-cell SPAdes (scSPAdes) 19 or SKESA 20 ( Fig 1c ). For cells with low coverage, users may additionally want to co-assemble highly similar genomes, as defined by clustering analysis. In this scenario, Bascet/Zorn can be used to output the reads of chosen cells for bespoke analysis. Users may additionally choose to run popular tools for microbial (meta)genomic data analysis, which are integrated into Bascet/Zorn (for e.g., read/SAG QC, genome annotation; Fig 1c ). 21 – 25 Results from these tools can be overlaid onto a UMAP ( Fig 1c ), alongside other metadata (e.g., taxonomic metadata, user-provided sample metadata), allowing users to interpret and explore their dataset. Finally, the map mechanism implemented in Bascet/Zorn ( Fig 1e ) allows developers to add methods/tools beyond those already included. We anticipate that this feature will promote collaboration between members of the nascent scMetaG community and foster community-driven tool development. Altogether, Bascet/Zorn enable reproducible scMetaG data analysis, from start to end. Semi-permeable capsules enable high-throughput scMetaG To showcase their utility, we used Bascet/Zorn to optimize a semi-permeable capsule (SPC)-based scMetaG method developed by Atrandi Biosciences, using an ATCC 10-strain (equal abundance) mock community ( Fig 2a-d ). 3 , 26 Our scMetaG protocol consists of 6 major steps: (a) encapsulating cells in SPCs, diluted to ensure singlets according to a Poisson distribution; (b) lysis; (c) multiple displacement amplification (MDA); (d) debranching; (e) combinatorial barcoding; (f) SPC release, fragmentation, and index PCR. As mechanical lysis methods cannot be used with SPCs, we benchmarked several enzymatic lysis protocols (see Methods), and the final protocol is able to capture cells of all strains ( Fig 2d ). After lysis, we proceed with MDA and debranching. This generates overlapping amplicons, and unlike protocols that merely tagment DNA, 27 enables de novo assembly of SAGs. Finally, dsDNA is barcoded using split-pool ligation, 28 and index PCR is performed for 10k SPCs at a time until sufficient cells have been collected ( Fig 2b ). Download figure Open in new tab Figure 2. Semi-permeable capsules (SPCs) enable droplet-scale scMetaG of complex samples. (a) Our scMetaG protocol, adapted from Atrandi. SPCs allow buffers, small proteins, and DNA/RNA to be exchanged. After harsh lysis, genomic DNA is isothermally amplified using multiple displacement amplification (MDA). After debranching to create simple double-stranded DNA (dsDNA), DNA is combinatorially barcoded. The SPCs are opened to release DNA, which is then fragmented to suitable size for sequencing. A final adapter is added, and after PCR, the library is sequenced. (b) Final barcoded dsDNA, ready for sequencing. (c) Confirmation of cells in SPCs and Poisson loading, using microscopy. Left: encapsulated bacteria; right: stained DNA after MDA. (d) Verification of lysis for different species using qPCR (see Methods for lysis protocol descriptions). (e) Saturation plots of randomly selected cells, showing the difference in complexity between libraries. (f) Typical knee plot, showing read count per SPC for lysis protocols R1-3. (g) Knee plots for each species separately for lysis protocols R1-3. (h) Barnyard plot of R3 sample (i.e., the final protocol; 13,717 cells after filtering), indicating the level of mixing between SPCs (computed by alignment to a reference consisting of all strains). (i) Number of SPCs per species for lysis protocols R1-3. 10k reads/cell was used as a cutoff. For panels e, g, h, and i, species were assigned by aligning to reference genomes via BWA. Benchmarking scMetaG of a mock community using Bascet and Zorn To assess mock community composition, we used Bascet/Zorn’s alignment workflow to align reads to mock community reference genomes. While some cells achieve >90% coverage ( Fig 2e ), cell libraries saturate at different coverages, suggesting variable complexity. MDA is known to cause coverage biases due to its locally exponential amplification. 19 We thus recommend users sequence more cells, rather than sequence deeper. From a standard knee plot analysis, using total read counts per SPC, we identify 1k to 5k cell-containing SPCs per sample ( Fig 2f ). However, knee plots for individual species indicate strain-specific lysis patterns, which are not captured in a standard knee plot ( Fig 2g ). We thus recommend low read count cutoffs and filtering cells after initial clustering. Using majority-class species assignments for SPCs, we find that ∼4% of reads are contamination from other cells ( Fig 2h ). While our final selected lysis protocol (R3) was able to detect DNA from all species, strong lysis biases were present ( Fig 2i ), highlighting the need for future optimization. Curiously, a large number of cells were assigned to Bacillus pacificus , a difficult-to-lyse Gram-positive sporeformer. However, as most B. pacificus reads are in low-coverage SPCs, we believe this is free-floating DNA (“soup”) 29 originating from early lysis. Zorn’s k -mer database workflow leverages species assignments for clustering Beyond the alignment workflow, we developed three k -mer-based workflows ( k-mer database, informative k-mer, k-mer count sketch ) for samples of unknown composition. We then applied each k -mer-based workflow to our mock community scMetaG data and compared them to the alignment workflow. The first k -mer-based workflow, Bascet/Zorn’s k-mer database workflow, uses Kraken2 11 to obtain a table of read counts for each taxonomyID and cell, which can then be used for comparison. The quality of the resulting UMAP was comparable to that of the alignment workflow, despite Kraken2 assigning reads to taxa that were not present in the mock community ( Fig 3a ). Altogether, the speed of the k -mer database workflow makes it particularly suitable for initial QC and data reduction. Download figure Open in new tab Figure 3. Bascet/Zorn workflow comparison. (a) UMAP of simulated and mock (sample R3) communities, based on Bacet/Zorn’s four feature construction workflows. (b) The informative k-mer workflow. (c) Histogram of min-hash k-mer frequency. (d) Count of k-mer AGATCGGAAGAGCACACGTCTGAACTCCAGT on UMAP. (e) The count sketch algorithm, including quantization steps. (f) Feature matrix, before and after count sketch projection. (g) Count sketch reduction randomly adds/subtracts k-mer counts from random columns in the final count matrix. This linear projection, which approximately conserves distances, can be performed without storing the vast amount of individual k-mer counts in memory. Features are then further decomposed to account for sequencing depth differences. (h) Cosine distances are distributed around zero for random zero-mean vectors. Binarizing vectors has little impact on angle. (i) Number of dimensions needed to conserve cell-cell distance with less than distortion ε. (j) Dot products of binary vectors are homomorphic to XOR functions, which are fundamental circuits, enabling the design of efficient computing hardware. Informative k -mers enable scalable and interpretable de novo comparison Bascet/Zorn’s alignment and k-mer database workflows rely on reference genome(s) and a database of known taxa, respectively. As such, species/strains that are not represented in the reference set/database cannot be detected. We thus developed the informative k-mer workflow ( Fig 3b ), which uses a subset of k -mers detected across cells as features ( Fig 3cd ). k -mer subsets are selected (i) from the most common MinHashed k -mers, or (ii) by users (e.g., from genes of interest). Users can then proceed with e.g., UMAP construction, or use k -mer frequencies as input for Seurat differential abundance analysis. To select an adequate k -mer frequency range, we tested an array of ranges and constructed UMAPs via the Seurat workflows for both ATAC-seq 30 (TF-IDF, SVD, UMAP) and RNA-seq (size-factor normalization, PCA, UMAP). We found that using the 100k most common min- k -mers produced a UMAP that was similar to those produced via Bascet/Zorn’s alignment and k -mer database workflows ( Fig 3a ). Bascet/Zorn’s informative k -mer workflow enables interpretable de novo scMetaG analysis, without the need for reference genomes/databases. We anticipate this workflow will be particularly useful for users who lack prior knowledge of their samples, who prioritize the use of interpretable features (e.g., to assess which k -mers drive clustering; to identify differentially abundant k -mers via Seurat). k -mer count sketching leverages all k -mers for scalable de novo comparison Because each k -mer sequence is known, Bascet/Zorn’s informative k-mer workflow enables interpretable de novo scMetaG analysis. However, the workflow is limited by the number of k -mers that can fit into memory. To enable use of the full k -mer composition space, we introduce a novel technique, namely a randomized affine projection to a subspace, implemented in Bascet/Zorn’s k-mer count sketch workflow ( Fig 3e-j ). This method is analogous to count sketches , which enable fast computation of the k -mer count projection, without intermediate storage of the full k -mer count matrix (which, for 1 million cells and 10 species, would occupy up to 2 petabytes). Bascet computes an initial randomized projection down to 5,000 dimensions (default), which Zorn can further dynamically reduce to trade precision for speed. The Johnson-Lindenstrauss lemma 31 tells us that 5,000 dimensions are enough to roughly retain cell-cell distances (within 20%) for up to 10 6 cells ( Fig 3i ). Since this is still a large amount of data for comparison, we sought fast approximations. We found that binarizing vector components to be ±1 both improved performance and data reduction ( Fig 3h ). A plausible explanation is as follows: (i) in high-dimensional spaces, randomly chosen vectors are highly likely to be orthogonal, and discretization affects the angle minimally; (ii) discretization also normalizes vector length, in a manner that appears numerically more robust than naive vector normalization. The discretized genome vectors are then compared using a fast approximate nearest neighbour algorithm, accessed via Seurat. When applied to simulated and mock community data, the k -mer count sketch workflow produced exceptionally well-separated clusters, without prior knowledge of species or choice of k -mers ( Fig 3a ). We anticipate that this workflow will be useful for users with minimal prior knowledge of their sample and those interested in novelty detection/discovery. scMetaG provides maximum-resolution insights into the human oral microbiome To test our full experimental and computational workflow on a real-world sample, we used our optimized scMetaG method to characterize saliva from a healthy human donor, and analyzed it with Bascet/Zorn ( Fig 4a ). After filtering ( Fig 4bcd ), SAGs were assembled de novo , 10k of which were selected for subsequent analysis ( Fig 4e ). Download figure Open in new tab Figure 4. Single-cell-resolution genomics of 10k cells from the human oral microbiome. (a) Saliva sample processing, prior to loading in SPCs. (b) Kneeplot for all cells. (c) Kneeplot per genus, assigned via Kraken. (d) Fraction of saliva sample scMetaG reads from human, Xanthomonas spp. (byproduct of xanthan gum used in the optimized protocol, see Methods), and other microbes. (e) SAG quality per QUAST, as a function of single-cell sequencing depth. (f) Human saliva UMAPs from the k-mer based workflows. (g) UMAP constructed using SNPs detected in Streptococcus salivarius (“REF” or “ALT”), as compared to a high-quality S. salivarius reference SAG (“Reference”). (h) Selected putative bacteriocin gene clusters detected in human saliva microbiome SAGs. Using Kraken, nearly two-thirds of 10k cells were assigned to Streptococcus (6,034 cells, 60.3%), with Streptococcus salivarius among the most abundant (30 cells; Fig 4f ). On a core genome level, SNPs were identified among S. salivarius ( Fig 4g ). At the pan-genome level, bacteriocin gene clusters were identified in several SAGs, including those of S. salivarius ( Fig 4h ). Although future validation efforts are needed, our workflow showcases the potential of scMetaG for unravelling genomic diversity at the level of single microbial cells. DISCUSSION Single-cell (multi-)omic methods have revolutionized eukaryotic biology. However, the single-cell ‘omics revolution has all but evaded microbiology, with only a handful of high-throughput methods in existence, including protocols for bacterial scRNA-seq 32 – 35 and, more recently, scMetaG. 3 , 6 Single-cell sequencing technologies have the potential to usher in a new era of microbiology research, and with that comes new challenges and opportunities. On the experimental side, we developed a high-throughput scMetaG method using novel microfluidics that enables aggressive lysis and genome amplification. Our method is overwhelmingly more cost efficient than multiwell plate-based scMetaG methods (i.e., >150x cheaper, 0.24 EUR/cell; Supplementary File S1 ). While our optimal lysis protocol captures every microbe in a mock community, lysis biases are still present. Thus, novel lysis methods optimized for droplets/SPCs are needed, as mechanical lysis methods, which work well for bulk metaG (e.g., bead beating) 36 cannot be used. Further, our scMetaG method relies on MDA, which is known to amplify unevenly. 19 Thus, our scMetaG method will likely benefit from MDA alternatives in the future. Beyond high-throughput data generation, a lack of bioinformatic software for analyzing, storing, and disseminating microbial single-cell data further prevents microbiologists from making the single-cell leap. We provide Bascet/Zorn, the first bioinformatic toolkit for end-to-end analysis of microbial single-cell data, including novel file formats for efficient storage and data sharing. Unlike existing tools for single-cell analysis, Bascet/Zorn is designed to compare cells without a common reference. To handle the sparsity of single-cell data, we developed three novel k -mer-based methods. Most notably, we implement an approach based on randomized projections, utilizing all k -mers in the input data (the k-mer count sketch workflow). Interestingly, binarizing projected vectors improves species-species separation. Highly quantised representations have recently gained traction in deep learning, given their space efficiency and suitability for fast dedicated hardware (e.g., XOR operations; Fig 3j ). 37 Our approach translates cell comparison into a problem familiar within machine learning, and is analogous to how alignment can be bypassed for the analysis of RNA-seq data; 38 our approach, however, is reference-free. This suggests that further theoretic unification of k- mer analysis may be fruitful. We anticipate that additional single-cell analysis methods can be incorporated into the Bascet/Zorn toolkit as they are developed, including additional QC steps, improved SAG (co-)assembly strategies, and incorporation of additional ‘omics layers (e.g., single-organism scRNA-seq, single-cell metatranscriptomics). Overall, our novel scMetaG method queries microbiomes at maximum possible resolution (single cells) and boasts unprecedented throughput; notably, we have the capacity to generate roughly as many microbial genomes in a single experiment as there currently are in existence in public databases. In the future, as data generation costs decrease, we anticipate that our scMetaG method and analysis toolkit will be particularly useful within the realm of diagnostics, pathogen surveillance, and precision health. 39 – 42 Altogether, recent developments here and elsewhere 3 , 32 – 35 indicate that microbiology is on the cusp of a single-cell revolution. Code availability Bascet and Zorn are both open-source and freely available via GitHub under the MIT license ( https://github.com/henriksson-lab/bascet and https://github.com/henriksson-lab/zorn , respectively). Code specific for this study has been deposited at GitHub ( https://github.com/henriksson-lab/scwgs1 ). AUTHOR CONTRIBUTIONS Software/computational method development and computational analyses were performed by HG, JD, LMC, and JH, with input from FA and TL. Experimental method development was performed by IY and JV, with assistance from FA and input from LM and JH. Clinical samples were provided by JN, N Sheng, and N Strömberg. LMC and JH conceived the study, which was funded by LMC, JH, TL, JN, and N Strömberg. All authors contributed to manuscript preparation. COMPETING INTERESTS The authors declare no competing interests. SUPPLEMENTARY FILES Supplementary File S1: Cost calculations Supplementary File S2: qPCR primer sequences Online Methods Overall design of Bascet To ensure speed and allow for the implementation of bespoke algorithms, Bascet was implemented in Rust (v4.4.0). All operations were implemented in a multithreaded fashion. To maximize inlining and reduce cache misses, the use of dynamic pointers was avoided. Robustness was maximized by avoiding external software dependencies; when needed, dependencies were deployed in a prebuilt Singularity container. 43 To enable future improvements of the file formats, read and write operations are, when possible, performed through one of the internal APIs that hide file format details. The main three APIs are for (i) reading the data for a given cell, (ii) iterating across cells and retrieving data in linear order, and (iii) appending data for a cell. Because data is typically processed on a cell-by-cell basis, the data for one cell is read or written as a whole. This approach minimizes the number of function calls and is built on the assumption that a thread will read the data for one cell, finish all processing, then write the output in its entirety. The writing blocks other threads from sending data, and thus benefits from the data being as ready-to-write as possible. A caveat of this approach is that the memory consumption is driven by the amount of data per cell. However, Bascet is designed for the use case of rather shallow sequencing of microbes. Assuming 10x coverage of a 5 Mbp genome, including quality scores, across 20 threads, this amounts to 1 GB active memory (ignoring overhead; i.e., a very small amount for a typical compute node). Overall design of Zorn Zorn was implemented in R (v4.4.0) to provide easy access to Seurat and Signac, with which it integrates, as well as to provide a familiar interface for bioinformaticians and biologists alike. The concept of shards was implemented in Zorn, where each file is stored as [name.##.ext]. The ## denotes the shard number, and each function call referring to “name” will automatically detect which shards belong together. Calls to Bascet give explicit paths to individual shards, meaning that Bascet need not be aware of the Shard concept. This implies that users are discouraged from using Bascet directly, and command-line invocations should rather be through R files and R scripts. Computationally heavy Zorn function calls result in Jobs , which are executed using a Runner . There are presently three runners: (i) an immediate local runner, pausing R until the job is executed; (ii) an asynchronous local runner, returning immediately but expecting the user to check when the job is completed; (iii) a SLURM runner, which is inherently asynchronous. The local runner is only meant for developers of Bascet/Zorn, or new users who wish to test the workflow without committing to a full server farm deployment. Routine users are rather expected to set up SLURM and provide suitable parameters (e.g., account, number of cores), obtaining the full benefits of parallelization. To run all the steps in a workflow, R should run on e.g., a login node using screen or tmux to keep the session alive. By default, all jobs are blocking calls (i.e., R is paused until the job has been executed). Optionally, a handle can be returned, which can be polled to see if the job is complete or not. This design is to benefit users who run RStudio via remote desktop (e.g., Open OnDemand) on the server farm, where one R session can be used to both quickly launch jobs while performing interactive data analysis. Once Bascet has produced count data, Zorn can load it and provide it either as a Signac ChromatinAssay (if a reference genome is associated), or Seurat Assay for other workflows (e.g., k -mer or taxonomy counts). These objects can be combined into SeuratObjects to enable “multimodal” analysis of the same data, in the sense that different means of summarizing the cell data can easily be compared. The typical commands for e.g., plotting, differential expression, data integration, doublet detection, and imputation are then available. Detection of Atrandi MDA cell barcodes Bascet has a modular system for handling different types of single-cell data. In the case of Atrandi MDA (see section “Bacterial lysis, whole genome amplification, and cell labelling” below), the cells are first identified by a combination of four barcodes, each from a list of possible sequences (“whitelist”). This enables correction of the barcodes in case of sequence errors. The location of the barcodes within the read are first detected by analyzing the first 10,000 reads and scanning for whitelist entries. This step also prepares alternative positions to scan in case of a shift (e.g., +-1bp, tunable). For performance, a naive algorithm first simply attempts to look up the barcode in the whitelist, from the expected position, using a hashmap. If this fails, the most similar whitelist entry is looked up. For performance, if any barcode fails, the algorithm stops. Barcode A is looked up first, as it is the most likely one to fail in case of over-fragmentation. Trimming of Atrandi MDA reads Bascet performs initial trimming in parallel to barcode detection (see section “Detection of Atrandi MDA cell barcodes” above). Read 2 (R2) contains the cell barcode, and the initial portion is trivially removed based on the expected barcode size, plus 2bp to account for variation in dA-tailing. Read 1 (R1) is the genomic read, and must be trimmed if the insert is small and the read continues into the barcode. This analysis is complicated by the fact that there is no fixed adapter sequence, but rather, different types of Barcode A may appear. Because most users will obtain 2×150bp reads, an approximation algorithm is used to ensure speed: the last 10bp from R2 are reverse complemented and scanned for in R1. The chance of finding this sequence randomly is low (0.25 10 ), while the sequence is short enough to avoid sequencing errors. If the sequence is found, the reads are lined up, assuming no shifts have been introduced by sequencing errors. R1 and R2 are then trimmed if their respective 3’ is expected to overlap each other’s adapters. Analysis of the resulting trimmed reads using FastQC v0.12.1 (default settings) 44 showed that this trimming was not sufficient. Qualitative analysis of suspicious reads (e.g., reads misclassified as human origin by Kraken2, 11 in mock samples that are not supposed to have human DNA) showed non-canonical library fragments (e.g., reads where no adapter had been ligated, which contained the P5 adapter that is added through PCR toward the adapter). We can only explain this as mispriming, and conclude that it is important to trim adapters also where they might not be expected. We thus added a fastp-based 45 trimming step (v0.23.4, default settings, paired mode), which can be invoked through Zorn in a shard-compatible manner. However, the Bascet-FASTQ format enables any trimmer to be used after barcode identification. Sharding and the design of TIRP file format To make the remaining steps agnostic to the choice of library preparation method (i.e., to support non-Atrandi protocols), the debarcoded and trimmed reads are stored in a new TIRP (Tabix-indexed read pairs) file format. This is a modified BED file, enabling the use of Tabix 46 to retrieve all reads for a given cell barcode. The file has 8 columns (chrom, from, to, r1, r2, q1, q2, UMI), where r# and q# represent read and quality score. The first three columns are mandated by the BED standard. By setting chrom=cellbarcode, from=1, to=1, and sorting the file, Tabix can extract reads for a cell by asking for all reads from the given “chromosome” (i.e., the name of the cell is used). While the from–to columns are redundant, compression of the file will largely remove them. For Atrandi, UMI=””, as the field is primarily intended for scRNA-seq protocols (to be supported in the future). Alternatives to TIRP were considered and may be implemented in the future, as the Bascet API enables easy addition of new file formats. TIRP however has two advantages: (i) files can be sorted using the Unix sort utility, which typically is extremely optimized. Keeping read pairs on the same line ensures that the two reads are kept together in the process; (ii) as a Tabix BED file, users can inspect the content using existing command-line tools. To maximize multiprocessing, each raw FASTQ file will be debarcoded, trimmed, and outputted as a TIRP file that is sorted. As sequencing may be performed on multiple lanes, one library may result in multiple FASTQ files. To shard the files, these FASTQ files must be merged, and simultaneously rewritten but split on a cell-by-cell basis. This has to be performed on a single compute node, making it a workflow bottleneck. By pre-sorting the TIRP-files, sharding becomes linear in time, making it primarily bound by disk I/O speed. Design of the Bascet-ZIP file format As we expect to wrap many existing tools, there is a need for a structured approach to handling their input and output. One approach is to force the user to standardize the output from each software; however, it is near impossible to find a one-size-fits-all solution (e.g., conversion to JSON or XML), and this would incur a heavy burden on the developers. We thus instead aim to store the raw output data as-is. The current exceptions are (i) contigs and (ii) raw reads and quality scores, as these files will commonly be piped across tools, calling for a standard format. Several storage solutions were considered, and through the Bascet API, improved file formats can easily be added. The use of a separate object store (e.g., S3, MongoDB) was ruled out, as such solutions require the maintenance of separate persistent processes, and many compute facilities do not support them. Furthermore, for most of our envisioned users, object stores are difficult to copy between machines or make backups of. We instead opted for a single container file. Under no normal circumstances is the content of these files modified, but rather they are written in their entirety once only. However, we expect the user to want to (i) list all files related to a cell, (ii) extract a specific file, and (iii) for advanced applications, jump to and read a specific coordinate within a file for a cell. All of these requirements are met by basic ZIP files, as they contain an index to all files within. Experiments showed that writing a ZIP-file having 100,000 (trivial) files can be performed in seconds, while a regular filesystem would struggle to cope with this quantity. As an example of the latter, remote copying using scp may take nearly a second to set up the transfer of a single file, no matter the size of the content. However, by keeping all small files together in a ZIP, only one such handshake is needed, and disk prefetch results in maximum I/O performance. The current ZIP format is rather trivial, in that files are stored as #/randomfile.xyz where # is the cell barcode. For FASTQ-type content, the data is stored as #/r1.fq and r2.fq. Contigs are stored as #/contig.fa. A special file, #/_mapcell.log, contains the stdout of the tool that was used to generate the content. Other tool-specific log files may also be stored, normally under their original names. To enable random I/O to files within the ZIP-file, the files can be stored in uncompressed format. It is the responsibility of the storing process (e.g., the map-reduce system) to ensure that this happens. The Bascet API can then extract the start and end coordinates of the file within the ZIP-file, enabling advanced tools to directly look up needed data. An example use case is to store per-cell k -mer databases, and enable the lookup of individual k -mer counts without the need to first extract the entire k -mer database. This is thus crucial for the performance of some envisioned future tasks. Design of the map-reduce system To apply functions to each cell’s data, we designed a system around constraints imposed by existing bioinformatic software and the file formats supported by such software. This has resulted in a general API, provided as several Rust traits, each available if the underlying file format enables the implementation. Two modes of data reading are supported: (i) streaming, which reads all objects in the file, and (ii) random, which specifically reads requested objects. Since random access requires seeking based on an index, not all file formats support this (e.g., Bascet-FASTQ has no means of seeking to a particular cell). All file formats support streamed access. Random access is only beneficial in a few scenarios: (a) when extracting a filtered list of objects, (b) when extracting a single object, e.g., the assembled genome of a particular cell of interest. The function to be applied is on the highest level a function in Rust. Functions written in this format have the highest level of possible performance, and can e.g., read specific parts of objects in the container without extracting the full file. This is useful when embedding per-cell databases in the container file. As this type of function requires knowledge of Rust, we expect only a handful of power users to make use of it. The most commonly used Rust map function takes a Bash script as input and runs the script as the map function. This approach is designed to be suitable for a general user who wishes to wrap existing software into Bascet/Zorn. For this approach to work, the Bash script must provide a list of files that the software needs. The underlying API will then create these files based on how the container file format is designed; e.g., a requested r1.fq will simply be unzipped from a Bascet-ZIP, but created de novo if the input is a Bascet-BAM. The downside of this approach is that an additional file creation step is needed; however, not all software can pipe two read files, and this approach is more amenable to parallelized decompression. The Bash script also specifies the preferred number of threads; if this is more than one thread, priority is given to distribute as many threads as possible to a smaller number of map-scripts. This turns out to (i) reduce peak main memory usage, and (ii) solve a bottleneck where a map-reduce operation waits for a single cell with large amounts of data to be processed to completion. The reduce step is performed using the Zorn aggregate function. Two key assumptions went into this design: (i) reduction is computationally cheap, within the capacity of a single computer; (ii) reductions may have rich structure, such as being a tree or R S4 class. Performing the reduction largely using R code fits these assumptions, where Zorn calls Bascet merely for extracting files from the container object. To simplify the build process for users, Bascet is invoked via the command line rather than using the C-language bindings for R. However, there is a great cost to invoking the command line when Bascet is embedded in a Singularity or Docker container. To remove this overhead, a crucial optimization is the addition of the Bascet console command. This console (not designed for human operators, although possible) is opened up as an asynchronous process using the R processx library (v3.8.6), enabling a stream of commands to be sent to a Bascet process that need only be started once. Furthermore, container files need only be opened once, to then serve a stream of objects to Zorn. De novo assembly Bascet wraps both SPAdes 19 and SKESA 20 via the MAP system, where each de novo assembler outputs a FASTA file containing contigs. These FASTA files are stored in Bascet-ZIP as described above (see section “Design of the Bascet-ZIP file format”). SPAdes supports assembly of single genomes, amplified using MDA, taking into account the local-exponential nature of the reaction (i.e., via its single-cell assembly mode; option ‘--sc’). However, we find that, for the massive number of libraries that our scMetaG method generates, single-cell SPAdes is too slow to be practical. We thus used SKESA v2.5.1 for analyses conducted here (’--min_contig 50--min_count 1--fraction 0.01’). Alignment workflow Bascet wraps BWA 10 for reference-based alignment using the BWA MEM algorithm. To keep the read-cell barcode linkage, a custom FASTQ file is written, where the name of the cell is encoded as the name of the read. This implies that certain characters are not allowed in cell names (e.g., whitespace). After BWA has produced a BAM file, Bascet wraps SAMtools 47 to optionally sort and index the file. Reads are otherwise ordered such that all reads for one particular cell belong together. Bascet then provides the means of counting the reads mapping to a reference genome, storing the counts in an HDF5 file format using sparse encoding. As one count table is produced per shard, Zorn takes care of merging count tables upon loading. Within Bascet’s alignment workflow, users additionally have the option of performing variant calling using CellSNP-lite. 15 k -mer database workflow Bascet wraps Kraken2 11 for read-based taxonomic classification. The standard database is standard-8, as we envision that this workflow will primarily be used for rough classification, and thus the smallest database is the best fit. Because Kraken2 spends significant time loading the k -mer database, the reads for all cells have to be processed in a single invocation. This influenced the creation of Bascet-FASTQ and Bascet-BAM as intermediate formats to support such tools. Reads can be piped and written in a suitable format on-the-fly. The Kraken2 output file is then parsed by another command, resulting in a sparse matrix anndata 48 -like HDF5 file, containing counts of taxonomyID vs cell name. After loading this matrix using Zorn, cells can be assigned a consensus taxonomy ID (i.e., assigned to the most common ID among reads for the cell). To aid visualization, Zorn can further map consensus taxonomy ID to the name of phylum, class, order, family, genus, and species (internally using taxonomizr v0.10.7; https://CRAN.R-project.org/package=taxonomizr ). Informative k -mer workflow Informative k -mers can be extracted both from raw reads and contigs produced via de novo assembly (see section “ De novo assembly” above). As contigs cannot currently be produced for cells with extremely low coverage, we have prioritized the read-based workflow. In early versions of Bascet, we first deduplicated k -mers using KMC3. 49 We decided against this approach, as it was slow, and we encountered random crashes in the KMC3 ops-step (where we merged k -mers across cells). Furthermore, the count of k -mers may also be informative (e.g., possibly reflecting plasmid copy number). Thus, the current version counts k -mer (and k -mer hashes) directly from reads without any attempt at pre-sorting them. As the hash function, we chose SeaHash v.4.1.0 ( https://docs.rs/seahash/latest/seahash/ ), based on its speed and availability for Rust. After the user has selected k -mers of interest, all reads are scanned again. While looking up k -mers in a k -mer database may theoretically be faster, it is likely that the large number of queried k-mers will be randomly scattered over the database. Thus, due to cache misses and how the disk is read in blocks, any theoretical speed gain would likely not pan out in practice. To select an adequate k -mer frequency range, we empirically tested k -mer frequency ranges and constructed UMAPs via the Seurat workflows for both ATAC-seq 30 (TF-IDF, SVD, UMAP) and RNA-seq (size-factor normalization, PCA, UMAP). We initially expected the most abundant k -mers to be of low utility; however, we do not appear to find any such k -mers, most likely because the k -mer length (31bp) results in near species-specific k -mers. For the mock community, at our sequencing depth, at least 10k of the most common MinHash k -mers are needed to ensure that cells cluster together. However, with about 100k k -mers, we observe a drastic improvement in clustering, and thus used the top 100k most common MinHash k -mers in our analysis. k -mer count sketch workflow We aim to be able to model the distribution of counts X for k -mer k , from the genome z of cell i , as The latent space, Z , can be equipped with a normal prior to make this a typical generative process. We will however not observe X directly (unless we assemble the genome), but rather sampled k -mer counts, Q k . Ignoring the correlation between k -mers and local exponential amplification biases of multiple displacement amplification (MDA), 19 a simple model is In practice, we simply use the depth estimate To compute a lower dimensional representation, Z , we decided to use UMAP. 14 This algorithm takes as input the k -nearest neighbour (kNN) graph (i.e., most similar cells). Given above, we can compute this graph over the estimated k -mer distribution, where d i takes the role of a size factor for cell i . As the distance between points, we use the cosine distance, as the probability of random vectors being orthogonal (high distance) increases with dimensionality. This has previously been shown to benefit single-cell analysis. 50 Computing the kNN from Equation [4] is, however, intractable as Q has 4 31 features (or about 5×10 7 for a mix of 10 different bacterial species) and cannot be kept in memory (merely keeping the features sorted on disk is a challenge). We instead compute a randomized projection, SQ , to a lower-dimensional zero-centered space, 51 where SQ largely keeps the structure of Q ; e.g. || SQx || 2 = (1±ε)|| Qx || 2 , and low rank approximations Q=LDW can be found (where if Q is n × n, L is n × k, D is diagonal k × k, and W is n × k). Such approximations also open up for future alternative dimensional reduction methods, as well as generative process formulations. In the explicit algorithm, which turns out equivalent to the count sketch algorithm, 52 S is defined through hash functions h(x) → {1,…,n} and g(x) → {-1,+1} as For h , we used SeaHash v.4.1.0 ( https://docs.rs/seahash/latest/seahash/ ). For g , we used Murmur3 from fasthash v0.4.0 ( https://docs.rs/fasthash/latest/fasthash/ ). Due to the linearity of the mapping, the entries SQ = S ( Q 1 + Q 2 +…) can be computed in parallel using multiple threads. Furthermore, there is no need to ever retain Q in memory, meaning that the memory requirement is determined by the size of SQ only. To help fine-tune the size of the final projection, without recomputing SQ from all raw data, Zorn may further apply a second randomized zero-centered projection as TSQ . T is defined as or equivalently, for each output vector, an equal number of input feature vectors are added and subtracted. S and T both transform the features such that they are zero-centered, motivating the choice of cosine distance. Before computing the kNN-graph from SQ , we tested different methods of normalization, where division by d in Equation [4] is the natural approach. While the normalizing factor, d , can be computed from SQ , computing it with good precision appeared difficult (using standard double floating point), and the resulting UMAPs were of poor quality. Inspired by product quantization (which can be used to speed up computation while also reducing memory footprint) 53 and binarized neural networks, 37 we instead investigated the quantized This method results in trivially normalized vectors as || X ’’ .i || 2 = 1. For the cosine distance, it retains the key property of large vectors, namely their low probability of randomly being aligned. To finally compute the kNN graph, an approximation was needed, as computing all cell pairwise distances was prohibitively slow (even for 5,000 dense dimensions). We thus approximated the kNN graph using the library annoy ( https://github.com/spotify/annoy ), as Seurat has already wrapped this library. Reference genome-based analysis of mock community data A reference genome was constructed by concatenating the 10 reference genomes of the ATCC mock community strains (MSA-2003). BWA mem v.0.7.18-r1243-dirty was used for alignment via Zorn/Bascet (default settings). To generate a Barnyard-like plot and assess mixing cells, we compared the counts of reads for the dominant species for a cell vs the total count of reads of other species (as assigned via Kraken2 v2.1.3 and the standard-8 database). Generation of simulated mock community data An R script was written to simulate reads from cells, immediately creating a TIRP file (i.e., the barcoding step was skipped, as this benchmark focused on assessing the performance of Bascet/Zorn’s dimensionality reduction methods). For genomes consisting of multiple contigs, an equal ratio was assumed. The MDA amplification and library preparation was assumed to be ideal, such that reads were simply uniformly and independently sampled across the genome of each cell. All cells were singlets. The insert size of all fragments was assumed to be 800 bp, resulting in 2×150bp reads. The number of reads per cell was sampled uniformly (log scale), from 10 3 to 10 4 . 5 reads. All species were assumed to have an equal number of cells (50). Collection and processing of the saliva sample Fresh saliva was obtained from a healthy (non-patient) human donor (ethical permit # 08-047M). The saliva (5mL) was collected in a 50mL falcon tube and vortexed for 3 min with glass beads (Fisherbrand, catalog #41401001). The sample was subsequently filtered using a 35μm filter (Cat no. 352235, Corning). The filtrate was further centrifuged at 5,000g for 10 minutes. The supernatant was discarded and the pellet was washed with 1mL of PBS three times. Finally, the pellet was resuspended in 10% glycerol and stored at −80°C. Single-cell bacterial encapsulation in semi-permeable capsules Samples (i.e., ten-strain mock community ATCC MSA-2003 and the processed saliva sample; see section “Collection and processing of the saliva sample” above) were diluted before starting the single-cell encapsulation using phosphate-buffered saline (PBS). The dilution was made based on the cell count as observed by counting the cells on a hemocytometer to get lambda of 0.1 or lower for final droplet loading. The ONYX platform (Atrandi Biosciences, cat no. MHN-ONYX1) was used for SPC production using the SPCs innovator kit (Cat no. CKN-G-12, Atrandi Biosciences). The procedure was followed as per the instructions in the manual for the preparation of the core and shell solutions used for SPC generation. The saliva sample was pretreated with DNase I (0.1U, NEB) for the removal of any host or free-floating DNA, followed by the addition of 0.05% Xanthan gum (Cat no. G1253, Sigma) to the final core solution, which helps to prevent any aggregation of bacteria together or to the channel walls, thus helping to maintain a smooth flow. 54 The encapsulation was performed for 3h, and the excess oil was removed from the emulsion collected in the collection tube. The shell was crosslinked using a UV light source of 405nm (Atrandi Biosciences Light Exposure Device) for 30–60s, followed by emulsion breaking and washing the capsules with Capsule Wash Buffer (CWB). The occupancy of the capsules was further checked by fluorescence imaging. Initial lysis optimization For initial lysis optimization, we attempted 6 protocols. These were further compared by qPCR using PCRBIO HS Taq Mix Red (Catalogue no. PB10.23-02; primer sequences in Supplementary File S2 ), inside SPCs as well as in free solution. M1 and M6 were found to give better lysis efficiency as depicted in Fig.2d . M1: Based on a previous protocol 55 with custom modifications. Resuspend the SPCs in a PBS buffer with 50U/ul Lysozyme (Cat no. E-0057-D2, Biosearch Technologies), 22U/ml Lysostaphin (Cat no. L9043, Sigma), 250U/ml Mutanolysin (Cat no. SRE0007, Sigma) and incubate at 37°C overnight. The next day add 0.5mg/ml Achromopeptidase (Cat no. A3547, Sigma) followed by incubation at 37°C for 6-8 hours. Spin down and wash SPCs with 1X WB. Resuspend the SPCs in 1 mg/ml Proteinase K (P8107AA, NEB), 0.5% SDS, Incubate overnight at 40°C. M2: Based on a previous protocol. 56 Resuspend the SPCs in a TE buffer containing: 2.5mM EDTA, 10mM NaCl, 5U Lysostaphin (Cat no. L9043, Sigma), 50U Mutanolysin (Cat no. SRE0007, Sigma), 20mg Lysozyme (Cat no. E-0057-D2, Biosearch Technologies) and incubate at 37°C overnight. The next day treat with TE buffer containing 4U Proteinase K (P8107AA, NEB), 1% Triton X 100 (Cat no. X100, Sigma), 100mM NaCl. Incubate at 55 °C for 30 minutes. M3: Based on a previous protocol. 57 Resuspend the SPCs in Buffer 1: 20mM Tris HCl pH 8, 10mM EDTA, 100mM NaCl, 1% Triton X 100 (Cat no. X100, Sigma), 20mg/ml Lysozyme (Cat no. E-0057-D2, Biosearch Technologies) and incubate at 37°C for 1 hour. Treat with Buffer 2: Tris pH8, 20mM EDTA, 100mM NaCl, 1% SDS, 200ug/ml Proteinase K (P8107AA, NEB) and incubate at 55°C for 30 minutes. M4: Based on a suggestion from Atrandi. Resuspend the SPCs in an Enzyme cocktail: 100U/ul Lysozyme (Cat no. E-0057-D2, Biosearch Technologies), 500U/ml Mutanolysin (Cat no. SRE0007, Sigma), 22U/ml Lysostaphin (Cat no. L9043, Sigma), 0.5mg/ml Achromopeptodase (Cat no. A3547, Sigma), and incubate at 37°C for 1 hour. M5: Based on a suggestion from Atrandi. Resuspend the SPCs in the enzyme cocktail from M4 and incubate at 37°C for 1 hour. Spin down and wash SPCs with 1X WB. Resuspend the SPCs in 200ug/ml Proteinase K (Cat no. P8107AA, NEB), 1% SDS, 10mM EDTA, 10mM Tris-HCl pH7.5 and incubate at 55°C for 30 minutes. Next the SPCs were treated with the following lysis buffer: 0.8M KOH, 20mM EDTA, 200mM DTT (Cat no. P2325, Thermo Fisher Scientific) and incubated at RT for 15 minutes. M6: Modification of M2. TE buffer containing: 2.5mM EDTA, 10mM NaCl, 5U Lysostaphin (Cat no. L9043, Sigma), 50U Mutanolysin (Cat no. SRE0007, Sigma), 20mg Lysozyme (Cat no. E-0057-D2, Biosearch Technologies) + 0.5mg/ml Achromopeptidase (Cat no. A3547, Sigma) at 37°C overnight. TE buffer with 4U Proteinase K (P8107AA, NEB), 1% Triton X 100 (Cat no. X100, Sigma), 100mM NaCl at 55 °C for 30 minutes. Lysis optimization, validated by sequencing The SPCs were fixed in chilled 100% methanol before the initiation of the lysis. SPCs were divided into different tubes and treated with different lysis methods as follows: R1 : Adapted from Atrandi with custom modifications. SPCs were resuspended in buffer containing 100 U/μL Lysozyme, 0.2% Pluronic F-68, 200 mM NaCl, 2 mM EDTA, 20 mM Tris-HCl pH7.5 and incubated at 37°C for 30 min, 900 rpm shaking. SPCs were spun down and resuspended in 1mL of 200μg/mL Proteinase K, 1% SDS, 10 mM EDTA, 10 mM Tris-HCl pH7.5 and incubated at 50 °C for 30 minutes. SPSs were pelleted and resuspended in 500 µL of 1X Wash Buffer (WB (10 mM Tris-HCl (pH 7.5) with 0.1% Triton X-100 (Cat no. X100, Sigma) and mixed with 500 µL of 2X Atrandi lysis buffer consisting of 0.8 M KOH, 20 mM EDTA and 200 mM DTT (total reaction volume 1 mL) and incubated at RT for 15 minutes. R2: Adapted from Atrandi with custom modifications. SPCs were resuspended in buffer containing 100 U/μL Lysozyme, 500U/ml Mutanolysin, 0.2% Pluronic F-68, 200 mM NaCl, 2 mM EDTA, 20 mM Tris-HCl pH7.5 and incubated at 37°C for 1 hour 800 rpm shaking. SPCs were pelleted and resuspended in 1mL of 200μg/mL Proteinase K, 1% SDS, 10 mM EDTA, 10 mM Tris-HCl pH7.5 and incubated at 55°C for 30 minutes. SPCs were pelleted and resuspended in 500µL of 1X WB and mixed with 500 µL of 2X Atrandi lysis buffer and incubated at RT for 15 minutes. R3: Based on optimized method M6 . SPCs were resuspended in TE buffer containing: 2.5mM EDTA, 10mM NaCl, 5U Lysostaphin (Cat no. L9043, Sigma), 50U Mutanolysin (Cat no. SRE0007, Sigma), 20mg Lysozyme (Cat no. E-0057-D2, Biosearch Technologies) and 0.5mg/ml Achromopeptidase (Cat no. A3547, Sigma) and incubated at 37°C overnight at 800 rpm shaking. SPCs were pelleted and resuspended, washed in 1X WB 3 times and resuspended in 1 mL of 4U Proteinase K, 1%Triton X 100 and 100mM NaCl and incubated at 55°C for 30 minutes–1h at 800 rpm shaking. Bacterial lysis, whole genome amplification, and cell labelling The SPCs were fixed in chilled 100% methanol before the initiation of the lysis. SPCs were divided into different tubes and incubated in enzyme cocktail solution with TE buffer (pH 8) containing 5U Lysostaphin (Cat no. L9043, Sigma), 50U Mutanolysin (Cat no. SRE0007, Sigma), 50U Lysozyme (Cat no. E-0057-D2, Biosearch Technologies), 0.5mg/ml Achromopeptidase (Cat no. A3547, Sigma) with 2.5mM EDTA and 10mM NaCl at 37°C with shaking at 800rpm overnight. The next day, the SPCs were washed three times with WB and further resuspended in the Proteinase K lysis solution in TE buffer containing 4U Proteinase K (P8107AA, NEB), 1% Triton X (Cat no. X100, Sigma) and 100mM NaCl. The incubation was done at 55°C for 1 hour with shaking at 800rpm. After the incubation, the SPCs were spun down and washed three times with 1ml of Neutralization buffer (NB - 1M Tris-HCl (pH 7.5), three times with WB and two times with nuclease-free water. Whole genome amplification and cell labelling was done using Single-Microbe DNA Barcoding Kit (Cat no. CKP-BARK1, Atrandi Biosciences). Briefly, SPCs were mixed with all the reaction components for WGA and incubated at 45°C for 15 minutes, followed by the inactivation at 65°C for 10 minutes. SPCs were then washed with WB three times and to verify the occupancy of amplified genomes, SPCs were stained with 10x SYTO9 nucleic acid dye to observe them under a fluorescence microscope. Further, SPCs were processed for DNA debranching and DNA end preparation as per the instructions in the manual. For the cell labelling, a four-step combinatorial split-and-pool barcoding was performed to label each SPC with a unique variant. We used 24×24×24×24 barcode combinations for the ATCC-mock community library, while 24×24×24×5 barcode combinations were used for the saliva library. Library preparation for sequencing The barcoded SPCs were further treated with the Release reagent (Atrandi Biosciences) and the released DNA was purified using 0.8X AMPure XP paramagnetic beads (Cat no. A63881, Beckman Coulter). The purified DNA was used for the final library preparation using NEBNext Ultra II FS DNA Library Prep Kit for Illumina (Cat no. E7805S, NEB) and custom PCR indexing primers. The library was cleaned using double-sided AMPure beads purification and library concentration was measured by Qubit using a dsDNA HS kit (Cat no. Q32854, Thermo Fisher Scientific), followed by checking the library QC on Agilent Bioanalyzer. The libraries were further diluted as per the manual specifications and sequenced on Illumina NovaSeq X platform. Analysis of the saliva sample After debarcoding and trimming (using fastp v0.23.4; paired end mode), reads were aligned (using BWA mem v.0.7.18-r1243-dirty, default settings) to a concatenated reference genome consisting of human (GRCh38) and Xanthomonas campestris B-1459 (per the addition of xanthan gum in the scMetaG protocol; NCBI Assembly accession GCF_001372255.1). Reads that mapped to either genome were conservatively removed for raw data deposition and further analysis. To analyze the fraction of reads mapping to either reference genome, the CIGAR string was analyzed and only reads having >=50 matches (CIGAR M) were considered to be mapping. Bascet/Zorn’s k -mer database workflow was used to obtain an overview of the sample using Kraken2 v2.1.3 and the standard-8 database. Low-count filtering was performed, resulting in 10k cells used in subsequent analyses. The following tools were run using the Zorn map system: (i) SKESA v2.5.1 (to assemble SAGs de novo ;--min_contig 50 --min_count 1 --fraction 0.01); (ii) QUAST v5.3.0; (iii) ABRicate v1.0.1 (--minid 80 --mincov 80). 21 To conduct bespoke analyses, the SAG for each cell was extracted via Zorn ( n = 9,999 final SAGs). The ‘lineage_wf’ workflow in CheckM v1.2.2 58 was used to assess SAG completeness/contamination using 28 CPUs. GTDB-Tk v2.3.2 59 was used to assign each SAG to a species within the Genome Taxonomy Database (GTDB) taxonomy (release214), 60 using the ‘classify_wf’ workflow and 28 CPUs. The following tools were applied to each SAG, using Nextflow v24.04.2 build 5914 9 and 1 CPU: (i) AMRFinderPlus v3.12.8, 23 with database version 2024-01-31.1, each SAG FASTA file supplied as input to the ‘-n’ option, and the ‘--plus’ option enabled; (ii) GECCO v0.9.10, 25 with each SAG supplied as input to ‘-g’ and parameters ‘--mask --merge-gbk -m 0.3’; (iii) antiSMASH v6.1.1, 61 with ‘--taxon bacterià and ‘--genefinding-tool prodigal-m’. Biosynthetic gene clusters (BGCs) detected by GECCO and antiSMASH were concatenated with experimentally validated BGCs from the MIBiG database (v4.0) 62 and clustered into Gene Cluster Families (GCFs) using IGUA v0.1.0, 63 with ‘--clustering-distance 0.80’, ‘--clustering-method “average”’, and 8 CPUs. AMR determinant and GCF presence/absence matrices were loaded into Zorn for UMAP visualization ( Supplementary Figures S1 and S2 ). Streptococcus salivarius SNP analysis was performed as follows: the reads from all cells were aligned to a reference SAG (i.e., from the cell H1_E5_F9_E11, a S. salivarius SAG of relatively high quality) using BWA mem v.0.7.18-r1243-dirty 10 (default settings). CellSNP-lite v1.2.3 15 was called (default settings, i.e., no cutoffs), wrapped via Zorn on each shard, to generate SNP count matrices. These matrices were loaded and merged via Zorn. A matrix of cell counts for each ALT base and genome coordinate was created. Supplementary Figures Download figure Open in new tab Supplementary Figure S1. Antimicrobial resistance (AMR) determinants detected in SAGs from human saliva using (a) AMRFinderPlus and (b) ABRicate. UMAPs were constructed using Bascet/Zorn’s k-mer database workflow. Download figure Open in new tab Supplementary Figure S2. Biosynthetic gene clusters (BGCs) detected in SAGs from human saliva. UMAPs were constructed using Bascet/Zorn’s k-mer database workflow. BGCs were detected in each SAG using GECCO and antiSMASH and subsequently clustered into gene cluster families (GCFs) using IGUA. Red dots denote cells in which a given GCF was detected. ACKNOWLEDGMENTS This work was supported by: the SciLifeLab & Wallenberg Data Driven Life Science Program (grant KAW 2020.0239 to LMC); the Swedish Research Council (VR; grant 2023-05212 to LMC; grant 2024-03952 to JH; grant 2024-06085 to JH/LMC/JN; grant 2019-00453 to N Strömberg); Kempestiftelserna (IceLab Multidisciplinary Postdoctoral Fellowship Grant CSMK23-0129.3 to LMC/JH/N Strömberg; Kempe Postdoctoral Fellowship Grant JCK-0055 to JH); Cancerfonden (grant #23 3102 Pj to JH); the Umeå University Faculty of Medicine (grant FS 506-21 to JH/TL); TUA #RV-977985 to N Strömberg. Computation was enabled by resources provided by (i) High Performance Computing Center North (HPC2N; Umeå University, Umeå, Sweden), and (ii) the National Academic Infrastructure for Supercomputing in Sweden (NAISS), partially funded by the Swedish Research Council through grant agreement no. 2022-06725. Experimental work was enabled by the μNisch Single-Cell Facility (Umeå University, Umeå, Sweden); the Umeå University Flow Cytometry Facility (Umeå, Sweden); Genomic Medicine Sweden Center North/Clinical Genomics Umeå, funded by a regional agreement between Umeå University and Region Västerbotten (ALF), Kempestiftelserna, Science for Life Laboratory, Umeå University and Genomic Medicine Center (GMC) North; the National Genomics Infrastructure (NGI), funded by VR and Science for Life Laboratory, Sweden. Funder Information Declared Knut and Alice Wallenberg Foundation , KAW 2020.0239 Swedish Research Council, https://ror.org/03zttf063 , 2023-05212 , 2024-03952 , 2024-06085 , 2019-00453 Kempestiftelserna , CSMK23-0129.3 , JCK-0055 Cancerfonden , #23 3102 Pj Umeå University Faculty of Medicine , FS 506-21 TUA , #RV-977985 REFERENCES 1. ↵ Pinto , Y. & Bhatt , A. S . Sequencing-based analysis of microbiomes . Nat. Rev. Genet . 1 – 17 ( 2024 ). 2. ↵ Ye , S. H. , Siddle , K. J. , Park , D. J. & Sabeti , P. C . Benchmarking Metagenomics Tools for Taxonomic Classification . Cell 178 , 779 – 794 ( 2019 ). OpenUrl CrossRef PubMed 3. ↵ Ling , M. et al. High throughput single cell metagenomic sequencing with semi-permeable capsules: unraveling microbial diversity at the single-cell level in sewage and fecal microbiomes . Front. Microbiol . 15 , 1516656 ( 2025 ). 4. ↵ Shao , D. D. et al. Advances in single-cell DNA sequencing enable insights into human somatic mosaicism . Nature Reviews Genetics 1 – 14 ( 2025 ). 5. ↵ Macaulay , I. C. et al. G& T-seq: parallel sequencing of single-cell genomes and transcriptomes . Nat. Methods 12 , 519 – 522 ( 2015 ). OpenUrl CrossRef PubMed 6. ↵ Li , X. , et al. Main Manuscript for Microbiome single cell atlases generated with a benchtop instrument . bioRxiv 2023.08.08.551713 ( 2024 ) doi: 10.1101/2023.08.08.551713 . OpenUrl Abstract / FREE Full Text 7. ↵ Shang , Y. et al. Droplet-based single-cell sequencing: Strategies and applications . Biotechnol. Adv . 77 , 108454 ( 2024 ). 8. ↵ Kitts , P. A. et al. Assembly: a resource for assembled genomes at NCBI . Nucleic Acids Res 44 , D73 – 80 ( 2016 ). OpenUrl CrossRef PubMed 9. ↵ Di Tommaso , P. et al. Nextflow enables reproducible computational workflows . Nat. Biotechnol . 35 , 316 – 319 ( 2017 ). OpenUrl CrossRef PubMed 10. ↵ Li , H. & Durbin , R . Fast and accurate short read alignment with Burrows-Wheeler transform . Bioinformatics 25 , 1754 – 1760 ( 2009 ). OpenUrl CrossRef PubMed Web of Science 11. ↵ Wood , D. E. , Lu , J. & Langmead , B . Improved metagenomic analysis with Kraken 2 . Genome Biol . 20 , 257 ( 2019 ). 12. ↵ Bonfield , J. K . CRAM 3.1: advances in the CRAM file format . Bioinformatics 38 , 1497 – 1503 ( 2022 ). OpenUrl CrossRef PubMed 13. ↵ Germain , P.-L. , Lun , A. , Garcia Meixide , C. , Macnair , W. & Robinson , M. D . Doublet identification in single-cell sequencing data using scDblFinder . F1000Res . 10 , 979 ( 2021 ). OpenUrl 14. ↵ McInnes , L. , Healy , J. & Melville , J . UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction . arXiv [stat.ML] ( 2018 ). 15. ↵ Huang , X. & Huang , Y . Cellsnp-lite: an efficient tool for genotyping single cells . Bioinformatics 37 , 4569 – 4571 ( 2021 ). OpenUrl CrossRef PubMed 16. ↵ Ondov , B. D. et al. Mash: fast genome and metagenome distance estimation using MinHash . Genome Biol . 17 , 132 ( 2016 ). 17. ↵ Hao , Y. et al. Dictionary learning for integrative, multimodal and scalable single-cell analysis . Nature Biotechnology 42 , 293 – 304 ( 2023 ). OpenUrl PubMed 18. ↵ Stuart , T. , Srivastava , A. , Madad , S. , Lareau , C. A. & Satija , R . Single-cell chromatin state analysis with Signac . Nat. Methods 18 , 1333 – 1341 ( 2021 ). OpenUrl CrossRef PubMed 19. ↵ Bankevich , A. et al. SPAdes: A New Genome Assembly Algorithm and Its Applications to Single-Cell Sequencing . ( 2012 ) doi: 10.1089/cmb.2012.0021 . OpenUrl CrossRef PubMed 20. ↵ Souvorov , A. , Agarwala , R. & Lipman , D. J . SKESA: strategic k-mer extension for scrupulous assemblies . Genome Biol . 19 , 1 – 13 ( 2018 ). OpenUrl CrossRef PubMed 21. ↵ GitHub - tseemann/abricate:: mag_right: Mass screening of contigs for antimicrobial and virulence genes . GitHub https://github.com/tseemann/abricate . 22. Hunt , M. et al. ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads . Microb Genom 3 , e000131 ( 2017 ). OpenUrl 23. ↵ Feldgarden , M. et al. Validating the AMRFinder Tool and Resistance Gene Database by Using Antimicrobial Resistance Genotype-Phenotype Correlations in a Collection of Isolates . Antimicrobial Agents and Chemotherapy ( 2019 ) doi: 10.1128/aac.00483-19 . OpenUrl CrossRef 24. Seemann , T . Prokka: rapid prokaryotic genome annotation . Bioinformatics 30 , 2068 – 2069 ( 2014 ). OpenUrl CrossRef PubMed Web of Science 25. ↵ Carroll , L. M. et al. Accurate de novo identification of biosynthetic gene clusters with GECCO . bioRxiv 2021.05.03.442509 ( 2021 ) doi: 10.1101/2021.05.03.442509 . OpenUrl Abstract / FREE Full Text 26. ↵ Leonaviciene , G. & Mazutis , L . RNA cytometry of single-cells using semi-permeable microcapsules . Nucleic Acids Res . 51 , e2 ( 2023 ). OpenUrl CrossRef PubMed 27. ↵ Olsen , T. R. et al. Scalable co-sequencing of RNA and DNA from individual nuclei . Nat. Methods 22 , 477 – 487 ( 2025 ). OpenUrl CrossRef PubMed 28. ↵ Rosenberg , A. B. et al. Single-cell profiling of the developing mouse brain and spinal cord with split-pool barcoding . Science 360 , 176 – 182 ( 2018 ). OpenUrl Abstract / FREE Full Text 29. ↵ Young , M. D. & Behjati , S . SoupX removes ambient RNA contamination from droplet-based single-cell RNA sequencing data . Gigascience 9 , ( 2020 ). 30. ↵ Cusanovich , D. A. et al. Multiplex single cell profiling of chromatin accessibility by combinatorial cellular indexing . Science 348 , 910 – 914 ( 2015 ). OpenUrl Abstract / FREE Full Text 31. ↵ Johnson , W. B. & Lindenstrauss , J. Extensions of Lipschitz mappings into a Hilbert space . in Conference in modern analysis and probability (eds. Beals , R. , Beck , A. & Bellow , A. ) vol. 26 189 – 206 ( American Mathematical Society, New Haven, Conn ; Providence, RI , 1982 ). OpenUrl 32. ↵ Kuchina , A. et al. Microbial single-cell RNA sequencing by split-pool barcoding . Science 371 , ( 2021 ). 33. Blattman , S. B. , Jiang , W. , Oikonomou , P. & Tavazoie , S . Prokaryotic single-cell RNA sequencing by in situ combinatorial indexing . Nat Microbiol 5 , 1192 – 1201 ( 2020 ). OpenUrl CrossRef PubMed 34. Wang , B. et al. Single-cell massively-parallel multiplexed microbial sequencing (M3-seq) identifies rare bacterial populations and profiles phage infection . Nat Microbiol 8 , 1846 – 1862 ( 2023 ). OpenUrl CrossRef PubMed 35. ↵ Pountain , A. W. et al. Transcription-replication interactions reveal bacterial genome regulation . Nature 626 , 661 – 669 ( 2024 ). OpenUrl CrossRef PubMed 36. ↵ Tourlousse , D. M. et al. Validation and standardization of DNA extraction and library construction methods for metagenomics-based human fecal microbiome measurements . Microbiome 9 , 1 – 19 ( 2021 ). OpenUrl CrossRef PubMed 37. ↵ Ma , S. , et al. The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits . arXiv [cs.CL] ( 2024 ). 38. ↵ Bray , N. L. , Pimentel , H. , Melsted , P. & Pachter , L . Near-optimal probabilistic RNA-seq quantification . Nat. Biotechnol . 34 , 525 – 527 ( 2016 ). OpenUrl CrossRef PubMed 39. ↵ Ko , K. K. K. , Chng , K. R. & Nagarajan , N. Metagenomics-enabled microbial surveillance . Nature Microbiology 7 , 486 – 496 ( 2022 ). OpenUrl CrossRef PubMed 40. Hendriksen , R. S. et al. Global monitoring of antimicrobial resistance based on metagenomics analyses of urban sewage . Nature Communications 10 , 1 – 12 ( 2019 ). OpenUrl CrossRef PubMed 41. Chiu , C. Y. & Miller , S. A. Clinical metagenomics . Nature Reviews Genetics 20 , 341 – 355 ( 2019 ). OpenUrl CrossRef PubMed 42. ↵ Benoit , P. et al. Seven-year performance of a clinical metagenomic next-generation sequencing test for diagnosis of central nervous system infections . Nature Medicine 30 , 3522 – 3533 ( 2024 ). OpenUrl CrossRef PubMed 43. ↵ Kurtzer , G. M. , Sochat , V. & Bauer , M. W . Singularity: Scientific containers for mobility of compute . PLOS ONE 12 , e0177459 ( 2017 ). OpenUrl CrossRef PubMed 44. ↵ Andrews , S. , et al. FastQC . Preprint at ( 2012 ). 45. ↵ Chen , S. , Zhou , Y. , Chen , Y. & Gu , J. fastp: an ultra-fast all-in-one FASTQ preprocessor . Bioinformatics 34 , i884 – i890 ( 2018 ). OpenUrl CrossRef PubMed 46. ↵ Li , H . Tabix: fast retrieval of sequence features from generic TAB-delimited files . Bioinformatics 27 , 718 – 719 ( 2011 ). OpenUrl CrossRef PubMed Web of Science 47. ↵ Li , H. et al. The Sequence Alignment/Map format and SAMtools . Bioinformatics 25 , 2078 – 2079 ( 2009 ). OpenUrl CrossRef PubMed Web of Science 48. ↵ Virshup , I. , Rybakov , S. , Theis , F. J. , Angerer , P. & Wolf , F . A. anndata: Access and store annotated data matrices . J. Open Source Softw . 9 , 4371 ( 2024 ). OpenUrl CrossRef 49. ↵ Kokot , M. , Dlugosz , M. & Deorowicz , S . KMC 3: counting and manipulating k-mer statistics . Bioinformatics 33 , 2759 – 2761 ( 2017 ). OpenUrl CrossRef PubMed 50. ↵ Haghverdi , L. , Lun , A. T. L. , Morgan , M. D. & Marioni , J. C . Batch effects in single-cell RNA-sequencing data are corrected by matching mutual nearest neighbors . Nat. Biotechnol . 36 , 421 – 427 ( 2018 ). OpenUrl CrossRef PubMed 51. ↵ Clarkson , K. L. & Woodruff , D. P . Low rank approximation and regression in input sparsity time . arXiv [cs.DS] ( 2012 ). 52. ↵ Charikar , M. , Chen , K. & Farach-Colton , M . Finding frequent items in data streams . Theor. Comput. Sci . 312 , 3 – 15 ( 2004 ). OpenUrl CrossRef 53. ↵ André , F. , Kermarrec , A.-M. & Scouarnec , N. L . Quicker ADC: Unlocking the hidden potential of Product Quantization with SIMD . arXiv [cs.CV] ( 2018 ). 54. ↵ Pranauskaite , E. , Milkus , V. , Ritmejeris , J. , Zilionis , R. & Mazutis , L . Increasing fluid viscosity ensures consistent single-cell encapsulation . Anal. Chem . 96 , 6898 – 6905 ( 2024 ). OpenUrl CrossRef 55. ↵ Kawano-Sugaya , T. , et al. Single amplified genome catalog reveals the dynamics of mobilome and resistome in the human microbiome . bioRxiv ( 2023 ) doi: 10.1101/2023.12.06.570492 . OpenUrl Abstract / FREE Full Text 56. ↵ Li , X. et al. Microbiome single cell atlases generated with a commercial instrument . bioRxivorg ( 2024 ) doi: 10.1101/2023.08.08.551713 . OpenUrl Abstract / FREE Full Text 57. ↵ Lan , F. et al. Massively parallel single-cell sequencing of diverse microbial populations . Nat. Methods 21 , 228 – 235 ( 2024 ). OpenUrl CrossRef PubMed 58. ↵ Parks , D. H. , Imelfort , M. , Skennerton , C. T. , Hugenholtz , P. & Tyson , G. W . CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes . Genome Res 25 , 1043 – 1055 ( 2015 ). OpenUrl Abstract / FREE Full Text 59. ↵ Chaumeil , P.-A. , Mussig , A. J. , Hugenholtz , P. & Parks , D. H . GTDB-Tk v2: memory friendly classification with the genome taxonomy database . Bioinformatics 38 , 5315 – 5316 ( 2022 ). OpenUrl CrossRef PubMed 60. ↵ Parks , D. H. et al. GTDB: an ongoing census of bacterial and archaeal diversity through a phylogenetically consistent, rank normalized and complete genome-based taxonomy . Nucleic Acids Res 50 , D785 – D794 ( 2022 ). OpenUrl CrossRef PubMed 61. ↵ Blin , K. et al. antiSMASH 6.0: improving cluster detection and comparison capabilities . Nucleic Acids Res 49 , W29 – W35 ( 2021 ). OpenUrl CrossRef PubMed 62. ↵ Zdouc , M. M. et al. MIBiG 4.0: advancing biosynthetic gene cluster curation through global collaboration . Nucleic Acids Res 53 , D678 – D690 ( 2025 ). OpenUrl CrossRef PubMed 63. ↵ Larralde , M. , Blom , J. , Gourlé , H. , Carroll , L. M. & Zeller , G. Fast, flexible gene cluster family delineation with IGUA . bioRxiv https://www.biorxiv.org/content/10.1101/2025.05.15.654203v1 ( 2025 ). View the discussion thread. Back to top Previous Next Posted June 21, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Scalable single-cell metagenomic analysis with Bascet and Zorn Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Scalable single-cell metagenomic analysis with Bascet and Zorn Hadrien Gourlé , Iryna Yakovenko , Jyoti Verma , Julian Dicken , Florian Albrecht , Linas Mažutis , Johan Normark , Nongfei Sheng , Nicklas Strömberg , Tommy Löfstedt , Laura M. Carroll , Johan Henriksson bioRxiv 2025.06.20.660799; doi: https://doi.org/10.1101/2025.06.20.660799 Share This Article: Copy Citation Tools Scalable single-cell metagenomic analysis with Bascet and Zorn Hadrien Gourlé , Iryna Yakovenko , Jyoti Verma , Julian Dicken , Florian Albrecht , Linas Mažutis , Johan Normark , Nongfei Sheng , Nicklas Strömberg , Tommy Löfstedt , Laura M. Carroll , Johan Henriksson bioRxiv 2025.06.20.660799; doi: https://doi.org/10.1101/2025.06.20.660799 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Microbiology Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17637) Bioengineering (13864) Bioinformatics (41853) Biophysics (21403) Cancer Biology (18540) Cell Biology (25429) Clinical Trials (138) Developmental Biology (13356) Ecology (19862) Epidemiology (2067) Evolutionary Biology (24287) Genetics (15585) Genomics (22464) Immunology (17701) Microbiology (40300) Molecular Biology (17142) Neuroscience (88440) Paleontology (666) Pathology (2825) Pharmacology and Toxicology (4814) Physiology (7633) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4285) Systems Biology (9809) Zoology (2268)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.