Unique molecular identifiers don’t need to be unique: a collision-aware estimator for RNA-seq quantification

doi:10.1101/2025.09.08.674884

Unique molecular identifiers don’t need to be unique: a collision-aware estimator for RNA-seq quantification

2025 · doi:10.1101/2025.09.08.674884

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 56,656 characters · extracted from preprint-html · click to expand

Unique molecular identifiers don’t need to be unique: a collision-aware estimator for RNA-seq quantification | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Unique molecular identifiers don’t need to be unique: a collision-aware estimator for RNA-seq quantification Dylan Agyemang , View ORCID Profile Rafael A. Irizarry , View ORCID Profile Tavor Z. Baharav doi: https://doi.org/10.1101/2025.09.08.674884 Dylan Agyemang 1 Department of Statistics, University of North Carolina at Chapel Hill , Chapel Hill, NC, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rafael A. 
Irizarry 2 Department of Biostatistics, Harvard T.H. Chan School of Public Health , Boston, MA, USA 3 Department of Data Science, Dana-Farber Cancer Institute , Boston, MA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Rafael A. Irizarry Tavor Z. Baharav 3 Department of Data Science, Dana-Farber Cancer Institute , Boston, MA, USA 4 Eric and Wendy Schmidt Center, Broad Institute , Cambridge, MA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tavor Z. Baharav For correspondence: baharav{at}broadinstitute.org Abstract Full Text Info/History Metrics Preview PDF Abstract RNA-sequencing (RNA-seq) relies on Unique Molecular Identifiers (UMIs) to accurately quantify gene expression after PCR amplification. Longer UMIs minimize collisions—where two distinct transcripts are assigned the same UMI—at the cost of increased sequencing and synthesis costs. However, it is not clear how long UMIs need to be in practice, especially given the nonuniformity of the empirical UMI distribution. In this work, we develop a method-of-moments estimator that accounts for UMI collisions, accurately quantifying gene expression and preserving downstream biological insights. We show that UMIs need not be unique: shorter UMIs can be used with a more sophisticated estimator. RNA-sequencing is widely used to study the sequence and abundance of mRNA molecules within cells [ 1 , 2 ]. In order to generate sufficient genomic material for sequencing, especially at single-cell resolution [ 3 ], mRNA molecules must be PCR amplified. However, the nonuniform amplification of PCR makes UMIs essential for accurate quantification [ 4 ]. UMIs are randomly generated nucleotide sequences of a specific length that are attached to the reverse-transcribed cDNA molecules in the sequencing process [ 5 ]. 
After PCR amplification, reads that originate from the same molecule will carry the same UMI ( Figure 1a ). Genomic analysis pipelines [ 3 ] identify reads with the same sequence and same UMI as duplicates, and collapse them into a single count. Reads with distinct UMIs but identical sequences are counted separately ( Figure 1b ). However, the reliability of UMIs to correctly make this distinction depends on the length of the UMI used. At each position of a UMI, we have a choice of 4 nucleotides, yielding a total of $4^k$ distinct length-$k$ UMIs. With shorter UMI lengths we observe collisions, where multiple identical mRNA transcripts are assigned the same UMI sequence [ 6 ] ( Figure 1c ). Consequently, deduplication leads to an underestimation of the true transcript abundance, as it misattributes the collision to a PCR duplicate rather than a distinct additional transcript ( Figure 1d ). With longer UMIs we avoid these collisions, at the expense of additional sequencing cycles and increased synthesis difficulty. As it stands, there is no established tradeoff between UMI length and quantification accuracy, so varied lengths are used in practice. For example, the UMI length was notably increased from 10-bp in 10x Chromium v2 to 12-bp in 10x Chromium v3, while 8-bp was originally used in scRNA-seq [ 7 ] and 6-bp or 8-bp is often still used in miRNA experiments [ 8 ]. In this work we derive a method-of-moments estimator for the true transcript abundance, characterize its near-optimality, and show its improved performance in reconstructing the true counts ( Figure 1e ). Download figure Open in new tab Fig. 1. Comparison between the standard pipeline and our collision-aware estimator (synthetic data). a) With long UMIs ( k = 12) collisions are rare. All 3 transcripts are assigned a different UMI, and so deduplication yields 3 counts. b) Deduplicated UMI counts accurately predict the true number of transcripts for long UMIs. 
c) With short UMIs ( k = 5) collisions will occur. Here, 3 transcripts are sequenced, but 2 are assigned the same UMI, leading to a deduplicated count of 2. d) Deduplicated UMI counts fall far below the y = x line due to collisions. Our statistical model (red line) accurately predicts this observed relationship. e) Our method-of-moments estimator applied to d corrects for these collisions. Previous works have studied the unequal effects of PCR amplification [ 9 ]. Some works have focused on the collision model we study [ 10 ], and heuristically derived similar estimators in the uniform UMI case [ 6 ], but these works show limited theoretical analysis and empirical validation on real data, and often fail to account for the nonuniformity of the UMI distribution. Significant work has been devoted to UMI error correction, spanning graph-based approaches [ 11 , 12 ], alignment-free and Phred quality-based methods [ 13 ], and Hamming distance-based methods [ 5 , 7 ]. Some works have proposed structured UMIs for easier error correction, for example repeated homotrimeric (or homodimeric) nucleotide blocks [ 14 , 15 ]. Collision-adjustment methods work in concert with these UMI error correction methods, where UMI error correction serves as a preprocessing step for deduplication and collision adjustment, which yields counts for downstream analysis. Some works have tackled the problem of UMI error correction, deduplication, and collision-adjustment jointly, extending to nonuniform UMI distributions. However, due to the complex model, the estimator derived is computationally intensive, requiring quantization and approximate computation, and does not admit a rigorous theoretical analysis [ 16 ]. Our algorithm is designed as a modular plug-in, compatible with conventional deduplication and data processing methods, and we envision it to be particularly useful for aiding the design of new lab-developed sequencing protocols. 
Methodologically, in this work we study, for a given gene, how to estimate the number of unique pre-amplification mRNA molecules. Specifying to one particular mRNA molecule, we make the assumption that the UMI sequence does not impact its amplification. This assumption is reasonable, as the UMI is a short sequence (up to 12 bp) attached to a much longer cDNA transcript (several hundred base pairs), and is therefore unlikely to impact amplification. This motivates our statistical model where each mRNA molecule is tagged with a UMI randomly drawn from some distribution over the $4^k$ possible UMIs, and that each pre-amplification mRNA molecule has an equal probability of appearing in the set of sequenced reads. Under these assumptions, we model our problem on a gene-by-gene basis using the classical balls and bins framework. Consider N identical balls randomly assigned into K bins, with the observation Y denoting the number of bins with at least one ball. In our sequencing model, this translates to N mRNA transcripts for a given gene before PCR amplification, $K = 4^k$ possible UMIs, and Y observed unique UMIs. Given the UMI distribution p over the K UMIs, we want to estimate the number of transcripts N after observing Y = y unique UMIs. When all UMIs are equally likely, a simple closed-form method-of-moments estimator exists [ 6 ]. This estimation question relates to the classical occupancy problem [ 17 ]. Naively, one would estimate $\hat{N} = Y$, which is suboptimal as for $N \approx \sqrt{K}$ (the birthday paradox [ 18 ]) collisions occur with probability greater than 1/2. More importantly, estimation is possible up to the coupon collector threshold of N = K log K , the expected number of original mRNA transcripts that need to be sequenced in order for us to observe all K UMIs [ 18 ]. 
We show that N cannot be consistently estimated above this threshold, and conversely that below this threshold our estimator provides excellent performance, theoretically matching the Cramér–Rao lower bound in a simplified binomial setting [ 19 ] (Section S5). These results simplify nicely when all UMIs are equally likely, but in practice UMIs do not follow a uniform distribution, which biases the estimator if unaccounted for. We adapt our estimator to this nonuniform case, yielding an extremely computationally efficient estimator detailed in Section S3. As identified by previous works, UMIs in practice follow a nonuniform distribution [ 16 ]. This distribution is T-biased (elevated T frequency), and is relatively well approximated by a marginal per-base model, where the probability of the UMI depends only on its aggregate nucleotide composition ( Figure S4 ). Based on the UMI nucleotide composition in 10x’s PBMC 1k dataset, we approximated the marginal frequencies as [0.23, 0.24, 0.21, 0.32] for A,C,G,T (shown in Figure S2a , discussed in Section S2). The only exception is UMIs with poly(dT) tails, which are significantly more likely ( Figure S2c ). Recent work has shown that this is due to truncated UMI synthesis [ 20 ]; while all UMIs are supposed to be the same length (e.g. k = 12 bp), some miss several rounds of synthesis and end up as length 8. Then, due to the structure of 10x’s beads, the sequencer will keep reading into the 30-bp poly(dT) tail [ 21 ], mistakenly reading 4 Ts as the last 4 bases of the UMI. All genes follow the same trend, except for MALAT1, which we are able to identify and filter out (Section S2.2). We empirically evaluated our collision-aware estimator on three publicly available 10x Genomics Peripheral Blood Mononuclear Cells (PBMC) datasets, of 1k ( Figure 2 ) and 10k ( Figure S6 ) cells sequenced using Chromium Single Cell 3 ′ v3 chemistry, and of 5k ( Figure S7 ) cells sequenced using v4 chemistry. 
For each dataset, we artificially shortened the error-corrected length 12 UMIs to length k = 1, …, 12. For each UMI length, we utilized our estimator to predict the true collision-free counts from the number of unique length k UMIs observed (details in Section S1). To evaluate the performance of our estimator and its collision-oblivious counterpart (which estimates $\hat{N} = Y$), we measured their raw gene expression estimation accuracy, as well as their performance in downstream differential expression (DE) analysis. Download figure Open in new tab Fig. 2. Improved performance of method-of-moments estimator on 10x’s PBMC 1k dataset. Panels a-c demonstrate enhanced accuracy in raw expression estimation, while panels d-f show the downstream improvement for log-fold change (LFC) estimation of differentially expressed genes. Processing details in Section S1. a) Concordance between our statistical model of collisions and the observed data for a UMI length of k = 5. b) Same as a but using our collision-aware estimator. c) Per-cell MSE between normalized gene expression vectors estimated from shortened UMIs and the ground truth (k=12), shown as a function of UMI length (median and 95% CIs). d) Difference between predicted (using k = 5, naive estimator) and true LFC (predicted - ground truth). e) Same as d but using our collision-aware estimator. f) MSE for estimating LFC (as in d , e ), evaluated across UMI lengths (95% CIs). We see the dramatic impact of collisions at shorter UMI lengths like k = 5 ( Figure 2a ), which our method-of-moments estimator is able to correct for ( Figure 2b ). To aggregate this metric across genes within a cell, we normalize the gene expression vectors and compute the MSE between our estimates and the ground truth k = 12 gene expression. Computed across all cells, our estimator reduced the mean squared error (MSE) by 95% over the naive estimator at a UMI length of 5 ( Figure 2c ). 
These performance improvements generalize to both additional datasets (Section S4). Our more accurate quantification translates to improved retention of biological insights in downstream tasks. For the prototypical task of cell type prediction, we observe that even with very short UMIs (e.g. k = 4), the naive estimator still allows accurate CellTypist classification [ 22 ] (over 90% accuracy, Figure S8 ). To generate a more meaningful comparison, we focus on differential gene expression analysis. For low-expression marker genes, presence/absence of this gene is often sufficient, and so UMI length has little effect ( Figure S9 ). Studying more highly expressed genes, we show that our improved quantification ensures accurate estimation of their fold changes ( Figure 2d,e ). Aggregating the error in fold change estimation across genes highlights the improved performance of our estimator below k = 8; at k = 5, our estimator reduces MSE by 96% ( Figure 2f ). In this work we showed that the ubiquitous approach of collapsing reads with identical UMIs as PCR duplicates is statistically biased. While conventionally the solution to this is to use longer UMIs to minimize the number of collisions, we enable the use of shorter and cheaper UMIs via a method-of-moments estimator that adjusts for these UMI collisions. We show that our estimator is near-optimal theoretically, and that in practice it performs extremely well up until the threshold of saturation. This allows for robust biological and statistical conclusions to be drawn from UMIs as short as 5 bp, closely matching the results from the standard 12-bp UMIs. This has the potential to further reduce sequencing costs and simplify manual UMI synthesis, enabling more cost-effective and scalable sequencing. There are many exciting directions of future work, including incorporating error correction into our statistical model, and leveraging per-UMI counts as opposed to just presence/absence. 
Supplementary Information S1 Datasets and processing details We processed all datasets using Cell Ranger. According to Cell Ranger, the 1K PBMC (v3) dataset contains 1222 cells sequenced at an average depth of 54k reads/cell, the 10K PBMC dataset (v3) contains 11485 cells sequenced at an average depth of 30k reads/cell, and the 5k PBMC dataset (v4) contains 4782 cells sequenced at an average depth of 39k reads/cell. To synthetically generate shortened UMIs, we processed the output BAM file from Cell Ranger, and shortened the UB field (corrected UMI, discussed in Section S1.1) to the first k base pairs for UMI length k (removing the last 12 − k base pairs). For downstream analyses, for each dataset and UMI length we performed the standard pipeline of library-size (total count) normalization, scaling to 10,000 counts per cell followed by a log1p transformation. We use CellTypist [ 22 ] to annotate each cell, using their Healthy COVID19 PBMC model, which is trained on “peripheral blood mononuclear cell types from healthy and COVID-19 individuals”, as this is the best match for our datasets. S1.1 Sequencing error As noted, our model does not account for UMI sequencing errors. Errors introduced during PCR amplification or during sequencing impact the actual UMIs we observe, leading to incorrectly undeduplicated counts: UMIs that should have been deduplicated but are not due to errors. Consider for concreteness the case where a gene has 100 reads associated with it, all stemming from 2 unique UMIs. However, during sequencing, one of these reads suffers a sequencing error at the last base pair of its UMI. Then, naively, the number of unique UMIs recorded as the ground truth for this gene is 3. However, for a UMI length of 11 or shorter, if we only observe the first k base pairs of the UMI, our count will now be only 2, as the errored base pair will be deleted from the UMI and there will only be the true 2 unique UMIs. 
For simplicity, in this work we analyze Cell Ranger’s error corrected UMIs (UB instead of UR field in BAM file), noting that these are functions of the full length UMI which we would in practice not observe. Algorithmically, UMI error correction methods utilize the fact that neighboring UMIs in Hamming space (i.e. UMIs one base pair apart) are very rare naturally for long UMIs, and so these are collapsed and considered sequencing errors. However, as the UMI length shortens, the number of possible UMIs decreases exponentially, and so the probability of two UMIs being one base pair apart increases. In fact, a naive use of existing error correction methods, e.g. collapsing UMIs which are Hamming distance 1 and one has at least a factor of 2 plus 1 more counts than the other, may yield a non-monotonic relationship between the observed UMI counts and the true UMI counts, as the UMI space becomes so saturated that many valid UMIs are incorrectly collapsed together. This issue of UMI error correction for short UMIs was briefly studied in [ 16 ], with a Bayesian method that worked to jointly model collisions and sequencing error, but due to the computationally intensive nature of their proposed estimator their approximate solution had to be further simplified by quantizing a dynamic programming problem. Note that any UMI error correction methods can be used as preprocessing for our algorithm, which can work to avoid such issues by utilizing information such as Phred scores, counts, and graph-based rules [ 11 – 13 ]. An empirical analysis of the 1k PBMC dataset reveals that many more uncorrected UMIs are 1 base pair apart than would be expected by chance ( Figure S1 ). We study the top m cells, and their expression of the top m most expressed genes ( m = 5). We compute, across all pairs of UMIs, what fraction are 1 base pair apart. 
In addition to the uncorrected (UR) and corrected (UB) UMIs, we also generate a null distribution by randomly sampling a matching number of UMIs from the empirical error corrected UMI distribution (UB) across all cells and all genes. Plotting histograms of these three, we see that the uncorrected UMIs have a significantly higher fraction of pairs that are 1 base pair apart compared to the corrected UMIs and the null distribution, separated by an order of magnitude ( Figure S1a ). Zooming in on the corrected UMIs and the null samples, these are statistically very similar, validating our theoretical model that UMIs within a given cell, for a given gene, are drawn i.i.d. from some common distribution across cells / genes ( Figure S1b ). S1.2 Differential Expression processing details We perform differential expression analysis to assess our estimator’s ability to recover biological insights. To this end, we select cell types with over 100 cells in the 1k dataset (CD14 monocytes, Naive B cells, and Naive CD4+ T cells) and run the DE analysis pipeline as described below on each of them, and aggregate the results. For each counts matrix, after normalization and log1p transformation, we run scanpy’s rank_genes_groups function with the Wilcoxon rank sum test to compute differential expression statistics [ 23 ]. For each gene we compute the difference between the estimated and ground truth log-fold changes, using either the naive estimator or the collision-aware one. An example of this is shown in Figure 2d-f . Download figure Open in new tab Fig. S1. Pairwise Hamming distance between UMIs shows errors in uncorrected UMIs. We compute pairwise Hamming distances between all UMIs for a given gene in a given cell. Null samples generated by sampling n UMIs without replacement from the empirical UMI distribution across all cells and all genes. We plot the top 5 genes over the top 5 cells, and choose n as the mean UB counts over these. 
a) shows all three methods, and b) zooms in on the corrected UMIs and null samples. In our analysis, we observe several marker genes for which UMIs are unnecessary for detecting differential expression. For example, we observe that FCN1 is a marker gene for Naive CD4+ T cells, which displays essentially a binary expression pattern ( Figure S9 , additional discussion in Section S4.1). Simply the presence or absence of this gene (UMI length of 0) is sufficient. Such genes are not of interest to study here, as UMIs are not even needed in the first place. To this end, we filter for genes with mean expression greater than 10 across all cells in the dataset. Additionally, similar to a volcano plot, we filter for genes with at least a 25% fold change in either direction, with an adjusted p-value less than 0.05. S2 Nonuniform UMIs Naively, one would assume that all UMIs were equally likely. However, this is not the case in practice, as previously identified [ 10 , 16 , 20 ]. Examining our 1k PBMC dataset, we compute the following position weight matrix characterizing the marginal probability distribution at each base pair of the UMI ( Figure S2a ). Download figure Open in new tab Fig. S2. Analysis of empirical UMI nucleotide frequencies in 1k PBMC dataset. a) Position weight matrix (PWM) for the empirical UMI distribution across all genes except MALAT1. b) PWM for UMIs corresponding to MALAT1. Less of a trailing T effect, but higher T fraction overall. c) Mean probability for all UMIs given a certain number of trailing Ts. This is shown for the empirical frequencies of MALAT1 (blue) and all other genes (orange), as well as the theoretical probabilities based on the PWM (subfigure a) , green) and the constant PWM (Equation (1), red). The UMI frequencies can be well modeled by an independent, per-base probability distribution, which improves for shorter UMI lengths ( Figure S4(a-c) ). 
For generating synthetic data with shortened UMI lengths, we truncate and only look at the first k base pairs of the UMI, but a similar result holds when instead truncating from the front of the UMI, i.e. keeping only the last k base pairs ( Figure S4(d-f) ). We model this probability with [ℙ(A), ℙ(C), ℙ(G), ℙ(T)] = [0.23, 0.24, 0.21, 0.32], from the 1k PBMC dataset. Probabilistically, this independent model states that, for a length-$k$ UMI $u = u_1 u_2 \cdots u_k$: $\mathbb{P}(u) = \prod_{i=1}^{k} \mathbb{P}(u_i)$. (1) Download figure Open in new tab Fig. S3. Prediction of empirical 12-bp UMI probabilities using different methods. a) Prediction using PWM in Figure S2a b) Prediction using a constant PWM as in Equation (1) (same plot as in Figure S4 ). c) Prediction (of counts) using a Poisson generalized linear model. Defining T as the number of trailing Ts of a UMI, we include $T$ and $T^2$ as features, in addition to nucleotide frequencies. Download figure Open in new tab Fig. S4. Accuracy of UMI frequency model in Equation (1). Columns correspond to UMI lengths of 5 (left), 9 (center), and 12 (right). Top row corresponds to truncating UMIs from the rear (removing the last 12 − k base pairs, as we use for all datasets), and the bottom row to truncating UMIs from the front (to show our model’s robustness). Note that the last column is the same for both rows, as the UMIs are full length. Both settings are fit by our model reasonably well, with an improved fit when k is short. S2.1 Trailing Ts The only UMIs that significantly deviate from the model in Equation (1) are those that end with a string of Ts, as shown in Figure S2c . This is due to truncation during UMI synthesis, and with the 10x chemistry of 3 ′ sequencing, this leads us to read into the poly(dT) tail [ 20 ]. This is because of the specifics of 10x’s beads: to capture the mRNA transcript, each bead contains (in order) a TruSeq Read 1, the cell barcode (16-bp), the UMI (supposed to be 12-bp), poly(dT) sequence (30-bp), and finally VN (1-bp), an anchor to indicate the end of the poly(dT) stretch. 
However, in the sequential synthesis of UMIs—which has approximately 99% efficiency—not all will make it to the full length 12. In fact, the authors claim that under 50% of 10x Chromium beads have the claimed length of 28 (16-bp barcode plus 12-bp UMI). From Figure S2c , we see a quadratic relationship in log scale as a function of number of trailing Ts, so we use as features not only nucleotide frequencies of A,C,G (removing T to avoid linear dependence), but also $T$ and $T^2$. Training this model ( Figure S3c ), we weight UMIs proportional to T to address the imbalance in number of UMIs with varying T, as there are $3 \times 4^{11}$ UMIs with T = 0 but only 3 UMIs with T = 11. We observe that, as expected, UMIs with larger T are now better fit. However, due to the relatively small fraction of UMIs with many trailing Ts, we retain the simple generalizable model in Equation (1) for our estimation of the UMI distribution p . S2.2 Poor fit of MALAT1 Analyzing the observed counts for shortened UMIs, our nonuniform model is generally accurate. However, one gene, MALAT1, is a consistent outlier. Studying the 1k PBMC dataset ( Figure S5a ), we observe that almost all counts fall within 3 σ confidence intervals of their expectation under our model, except for MALAT1 (colored in red). As can be seen, these points follow a fundamentally different trend, and are negatively biased (i.e. experience more collisions). This same behavior was identified across all datasets studied. MALAT1 is notorious in the literature for its high expression, high rate of internal priming, and lack of polyadenylation [ 24 , 25 ]. MALAT1 expression is commonly used for quality control [ 24 ], as it is retained within the cell nucleus, and so if high MALAT1 expression is not detected within a droplet, then the droplet is likely either empty or contains a poor-quality cell. 
MALAT1 does not have a poly(A) tail, but rather folds onto itself to form a unique and highly stable triple-helical A-rich structure [ 25 ]. Download figure Open in new tab Fig. S5. Outlier detection of MALAT1. a) 10x 1k PBMC dataset, corrected and deduplicated UMI counts. Nonuniform model (Equation (5)) with UMI frequencies generated by the marginal per-base pair model (Equation (1)). 3 σ confidence intervals constructed from Equation (6). MALAT1 constitutes a clear outlier, with all other genes falling well within the 3 σ confidence intervals. b) Per-gene analysis of TV distance between UMI nucleotide frequencies and mean nucleotide frequencies. Observed data is overdispersed relative to the multinomial model, but MALAT1 is a clear outlier. c) Z-score computed for each gene’s TV distance, with variance approximation from Equation (2). Sorted z-scores reveal that MALAT1 is a clear outlier. We show that this deviance of MALAT1 can be detected de novo, directly from its UMI distribution ( Figure S5b ). Concretely, in these plots we compute for each gene its empirical UMI distribution across all cells, and collapse this to a per-nucleotide distribution summed across all 12 positions. This yields counts for 12-bp UMIs observed across all cells for this gene. After normalization, we compute the total variation (TV) distance between each gene’s per-nucleotide UMI distribution and the expected distribution (summing counts across all cells and all genes). Plotting this for all genes reveals MALAT1 as a clear outlier ( Figure S5b ). We can approximate the variance of the TV distance for m counts by noting that under the model that the nucleotides of a UMI are independent and identically distributed, we are computing the variance of the TV distance between a sample of size m from a multinomial and its expectation. For large m , the entries of the multinomial are approximately independent. 
Concretely, denoting p as the nucleotide distribution, and X ∼ Multinomial( m, p ), we have: While reality is overdispersed relative to this, the general approach provides a useful framework for understanding the behavior of UMI counts. Computing the z-score of each gene’s TV distance immediately identifies MALAT1 as the only gene that deviates significantly from the expected distribution ( Figure S5c ). S3 Analyzing the statistical model Recall our statistical model, restated here for completeness: N identical balls are randomly assigned into K bins, with the observation Y denoting the number of bins with at least one ball. In our sequencing model, this translates to N mRNA transcripts before PCR amplification, K = 4 k possible UMIs, and Y unique UMIs observed. The balls (UMIs) follow some distribution p over the set [ K ] ={ 1, 2, … K }, and are independent and identically distributed. Then: Mathematically, Y =|{ X 1 , X 2 , …, X N }|, where | S | denotes the cardinality of a set S , in our case the number of distinct UMIs observed. Given p and Y , we want to estimate N . By representing Y as a sum of K indicators, we can compute its mean and variance. A second order Taylor approximation is necessary to simplify the crossterms in the variance. Observe that 𝔼 [ Y ] is a concave function of p for N ≥ 1, as can be verified by taking the second derivative. By Jensen’s inequality, this implies that the expected number of unique UMIs observed is maximized when p is uniform. This assumption that the X i are independent and identically distributed need not perfectly hold in practice. For example, sequencing errors may introduce dependencies between the X i . As we show for MALAT1, X i appear to follow a subtly different distribution, potentially due to internal priming biasing for UMIs with a higher concentration of Ts. However, this model fits the data well enough as shown in e.g. Figure S5a . 
S3.1 Simplification in uniform setting Simplifying Equations (5) and (6) in the case when p is uniform (all UMIs are equally likely) we recover classical results [ 6 ]: $\mathbb{E}[Y] = K\left(1 - (1 - 1/K)^{N}\right)$ and $\mathrm{Var}(Y) = K(1 - 1/K)^{N} + K(K-1)(1 - 2/K)^{N} - K^{2}(1 - 1/K)^{2N}$. This balls and bins setting is known to have several different interesting thresholds, considering the case of large K [ 18 ]. First is the classical birthday paradox, which occurs at $N \approx \sqrt{K}$. At this point, collisions occur with probability greater than 1/2, indicating that the naive estimator will underestimate the true counts over half the time. Next is the value of N = K . This is the maximum count that the naive estimator will output, where we note that at this value of N , $\mathbb{E}[Y] \approx (1 - e^{-1}) N \approx 0.63 N$, indicating the dramatic bias of the collision-oblivious naive estimator. Finally, we have the coupon collector threshold of N = K log K , the expected number of unique mRNA transcripts that need to be sequenced in order for our observations to saturate with Y = K , where all K UMIs are observed. We show that this is the true threshold for consistent estimation, where for N > K log K we have Y = K with high probability, meaning that all K UMIs are observed, beyond which larger N cannot be estimated. Surprisingly, we show that for N ≤ cK log K for c < 1, our estimator performs near optimally, with MSE essentially matching the Cramér–Rao lower bound in a simplified binomial setting (Theorem 1). S3.2 Method-of-moments estimator (uniform) Rearranging the expectation of Y , we obtain our method-of-moments estimator in the uniform UMI setting: $\hat{N}(Y) = \frac{\log(1 - Y/K)}{\log(1 - 1/K)}$ for $Y < K$, and $\hat{N}(K) = \hat{N}(K-1) + K$. This choice of $\hat{N}(K)$ is motivated by the fact that the expected number of additional transcripts required to go from observing K − 1 unique UMIs to K unique UMIs is K . This extension of $\hat{N}$ retains the convexity of the estimator. Proposition 1 $\hat{N}(Y)$ is convex . Proof For $Y < K$, $\hat{N}(Y)$ is a convex function of Y , as it is − log( · ) composed with an affine function (scaled by the positive constant $-1/\log(1 - 1/K)$). 
Examining the edge case of Y = K , we show that the derivative is strictly increasing, in that: $\hat{N}(K) - \hat{N}(K-1) = K > \frac{-\log 2}{\log(1 - 1/K)} = \hat{N}(K-1) - \hat{N}(K-2)$. This holds true for all K > 1, and so the estimator is convex. This implies that defining $\hat{N}(K) = \hat{N}(K-1) + \gamma K$ would retain the convexity of the estimator for any γ > ln(2). S3.3 Method-of-moments estimator (nonuniform) In the nonuniform case with probabilities $\{p_j\}_{j=1}^{K}$, we have that the expected number of unique UMIs of length k observed, given that there are N transcripts before PCR amplification, is $Y_N = \mathbb{E}[Y \mid N] = \sum_{j=1}^{K} \left(1 - (1 - p_j)^{N}\right)$. Inverting this expression to solve for N as a function of Y does not yield a closed form solution. However, given that Y N is monotonically increasing in N (and concave), we can define $\hat{N}(y)$ by identifying n such that Y n ≤ y ≤ Y n +1 , and linearly interpolating. Concretely, for $y < K$: $\hat{N}(y) = n + \frac{y - Y_n}{Y_{n+1} - Y_n}$, where $Y_n \le y < Y_{n+1}$. When $p_j > 0$ for all j, Y N is concave, strictly increasing in N , and twice differentiable. Thus, $\hat{N}(y)$ is convex in y for y < K , as it is the linear interpolation between sampled points of a convex function. As before, the case of y = K is a priori undefined, as Y n < K for all n . Here, we use a quadratic extrapolation, to yield a simple estimate that retains the convexity of our estimator. Quadratic extrapolation here studies finite differences, i.e. the behavior of $\Delta(y) = \hat{N}(y) - \hat{N}(y-1)$. To get to the quadratic extrapolation, we can consider the second finite difference $\Delta^{2}(y) = \Delta(y) - \Delta(y-1)$. To perform quadratic extrapolation, we want $\Delta^{2}(K) = \Delta^{2}(K-1)$. This simplifies as: $\hat{N}(K) = 3\hat{N}(K-1) - 3\hat{N}(K-2) + \hat{N}(K-3)$. This retains the convexity of our estimator, even in the nonuniform p setting, as can be verified by computing that $\Delta^{2}(K) = \Delta^{2}(K-1) \ge 0$. Algorithmically, our estimator can be expressed as follows: (1) Compute the maximum observed counts as Y max . (2) For n = 1, 2, … compute Y n . Stop when Y n > min( Y max , K − 1). (3) For every value y = 0, 1, …, Y max , compute $\hat{N}(y)$ by iterating over the values Y n (already in sorted order), identifying the largest n such that Y n ≤ y , and computing the interpolated estimator as in Equation (12). If y = K , use the extrapolated estimator in Equation (13). (4) For each observed count y , use the precomputed estimator $\hat{N}(y)$. 
Steps 2 and 3 constitute a one-time cost, constructing the array of estimates from 0 to Y max observed counts. Then, each observed count y can be mapped to its corresponding estimate in constant time in step 4 (as in practice counts do not often exceed 1E4). S3.4 Variance of estimator We compute the variance of our estimator using the delta method, a first order Taylor Expansion. Let f ( N ) = Y N be the function relating the true number of transcripts N to the observed count Y N (Equation (11)). Recall that where in the last line we used that log(1 + x ) ≈ x for small x , (1 − x ) N ≈ e −xN for small x , and that N is large. Using the delta method, we evaluate , the derivative of the inverse function, evaluated at the observed count y . Noting that , up to the linear interpolation, we have , where N = f − 1 ( Y ). In the uniform case, where for all j, f ′ ( N ) simplifies as The variance of Y is given in Equation (6), and evaluating for general p yields: S4 Validation on additional datasets Throughout, we discussed the application of our method to 10x’s PBMC 1k dataset. Here, we show that our method’s performance improvements hold in general. We recapitulate our analyses from Figure 2 for the 10x Genomics 10k PBMC dataset with v3 chemistry ( Figure S6 ), and for the 10x Genomics 5k PBMC dataset with v4 chemistry ( Figure S7 ). Download figure Open in new tab Fig. S6. Performance improvement afforded by our method-of-moments estimator on 10x’s PBMC 10K dataset (v3 chemistry). a-c show improvement in raw expression estimation, and d-f show improvement for log-fold change (LFC) estimation of differentially expressed genes, replication of Figure 2 . Download figure Open in new tab Fig. S7. Performance improvement afforded by our method-of-moments estimator on 10x’s PBMC 5K dataset (v4 chemistry). a-c show improvement in raw expression estimation, and d-f show improvement for log-fold change (LFC) estimation of differentially expressed genes, replication of Figure 2 . 
S4.1 DE supplemental figures To begin, in Figure S8 , we highlight the ease of cell type annotation as a computational task. Even for very short UMI lengths like k = 4, we still retain essentially the same accuracy with the naive estimator as for k = 9 (showing the brittleness of cell type annotation). Download figure Open in new tab Fig. S8. Fraction of errors in cell type prediction using CellTypist as a function of UMI length. As discussed in Section S1.2, we filter out low-count differentially expressed genes from our analysis. This is because, for certain marker genes that exhibit a binary-like expression (0 in certain cells, and nonzero in others), we don’t actually need UMIs at all to detect that this gene is differentially expressed. FCN1, a marker gene for CD14 Monocytes, overwhelmingly displays 0 expression for other cell types, with only 12% of CD14 monocytes having 0 expression, while over 93% of other cells have 0 counts ( Figure S9 ). With a UMI length of 12, the LFC is 5.9 with a p-value of $1.1 \times 10^{-118}$, while with a UMI length of 0 the naive estimator provides an LFC of 4.2 with a p-value of $2.5 \times 10^{-110}$. Clearly, an improved estimator is unnecessary in this situation, and so we filter out genes with low average expression to ensure that we are only comparing those where UMI length will play a role. Download figure Open in new tab Fig. S9. Marker genes do not require UMIs to be called as DE. Marker genes that display a binary expression pattern are called as differentially expressed even by the naive estimator for a UMI length of 1. Shown is FCN1, a marker gene for CD14 monocytes. a) Distribution of raw UMI counts for the gene FCN1 analyzing the full length 12 UMIs. b) Same as a but using the raw length 1 UMI counts (the naive estimator). Download figure Open in new tab Fig. S10. Mean expression versus LFC for all DE genes (no average expression filter) for 1k PBMC dataset. 
Difference between true LFC and predicted LFC using naive estimator and method-of-moments estimator for a 5-bp UMI. We filter for genes with significant p-value and moderate fold change. Performance is similar for the two methods at low expression levels, as these genes often do not even require the use of UMIs ( Figure S9 ). As mean expression increases however, our estimator shows its improved performance (highlighted in Figure 2d,e ). a) Log-fold changes between ground truth and naive estimator for all genes. b) Same as a but for our collision-aware estimator. S5 Optimality of method-of-moments estimator for uniform UMIs On its surface, our proposed estimator seems quite simplistic. It only matches the first moment of Y , and fails to take into account any higher order moments of Y . Additionally, since the estimator is a convex function of y , by Jensen’s inequality $\mathbb{E}[\hat{N}(Y)] \ge \hat{N}(\mathbb{E}[Y]) = N$, implying that $\hat{N}$ will overestimate N . However, as we show, this estimator yields good estimation up until the threshold of impossibility. S5.1 Impossibility beyond N > K log K In this section, we use standard Bachmann–Landau asymptotic notation. From the classical coupon collector problem, it is known that the expected number of balls required until all bins are filled is N = K log K + O ( K ). This threshold is tight: taking N to be larger than K log K by any multiplicative constant yields vanishing (with K ) probability of observing $Y < K$. Lemma. For $N = cK \log K$ with $c > 1$, $\mathbb{P}(Y = K) \ge 1 - K^{1-c}$. Proof Define the indicator random variables $Z_j = \mathbb{1}\{\text{bin } j \text{ is filled}\}$, whether bin j is filled, for j ∈ [ K ]. Then, $Y = \sum_j Z_j$, and $\mathbb{P}(Y < K) = \mathbb{P}(\exists j : Z_j = 0) \le \sum_{j=1}^{K} \mathbb{P}(Z_j = 0) = K(1 - 1/K)^{N} \le K e^{-N/K} = K^{1-c}$, where a union bound is used, followed by the inequality that 1 − x ≤ e −x . □ Since Y will be equal to K with high probability, we cannot distinguish between N = 2 K log K and N ′ = K 3 , for example, and so N cannot be estimated to any nontrivial accuracy. Extending this to the nonuniform p setting is difficult, without a closed form solution. 
Defining p min = min j p j , we see that N = Ω(1 /p min ) is necessary, as otherwise the UMI corresponding to p min will not have been observed with high probability. Conversely, N = O (log( K ) /p min ) is sufficient, by a similar union bounding argument. Again, by concavity, N = Ω( K log K ) is necessary. S5.1.1 Extending saturation threshold The above observations regarding the saturation threshold scaling as 1 /p min imply that we can increase our threshold for feasible estimation by decreasing the minimum probability. A natural question is then: how far can we extend our nontrivial estimation? As we drop p min → 0, Y will not saturate until N ≈ 1 /p min , but will have increased variance before that point. Theoretically, given a known distribution of true UMI counts ( N ), or a range of feasible N , one could compute the best UMI distribution with respect to this Bayes Risk or Minimax Risk. Even by taking a simple setting, where per-nucleotide probabilities are [1 − x , 1 − x , 1 − x , 3 x ] for [A,C,G,T], with x ∈ [0, 1 / 3], by taking x to 0 we can guarantee that Y will not saturate, and that via the asymptotic normality argument below, we can compute the MSE for any fixed x and N . S5.2 Asymptotic normality analysis Prior work has shown that Y is asymptotically normally distributed, so long as Var( Y ) → ∞ [ 17 , 26 ]. Evaluating our method-of-moments estimator in the uniform UMI setting using this normal approximation yields that, for Z ∼ 𝒩 (0, 1), defining µ = 𝔼 [ Y ] and σ 2 = Var[ Y ] (Equations (5) and (6)): $\hat{N}(Y) = \frac{\log(1 - Y/K)}{\log(1 - 1/K)} \approx -K \log\left(1 - \frac{\mu + \sigma Z}{K}\right) \approx -K \log\left(e^{-N/K} - \frac{\sigma Z}{K}\right) = N - K \log\left(1 - \frac{e^{N/K} \sigma Z}{K}\right) \approx N + e^{N/K} \sigma Z$. In the second line we used the normal approximation, and that log(1 − 1 /K ) ≈ −1 /K for 1 /K ≪ 1. In the third, the asymptotic approximation for µ ≈ K (1 − e −N/K ). In the final line we used the approximation of log(1 − x )≈ − x , when e N/K σ/K ≪ 1. This holds when e N/K ≪ K , i.e. when N ≤ cK log K for c < 1. 
The MSE of our estimator is thus: S5.3 Discussion of Excess Variance Following this asymptotic analysis, we can say that Var , which clearly shows the growth in variance of our estimator. This indicates that, first, the variance of our estimator is always larger than the variance of Y , the naive estimator. However, our estimator retains its near unbiasedness, and even for large N approaching K , the variance of our estimator is only a constant factor larger than the variance of Y . Concretely, for N = K , we have that Var ( Y ) → e − 2 ( e − 2) N , growing linearly with N . Here, the MSE of is asymptotically e 2 Var ( Y ) = Θ( N ), while the MSE of the naive estimator is Var ( Y ) +(1 − 1 /e ) 2 N 2 = Θ( N 2 ). It is not until N approaches K log K that the variance of our estimator becomes significantly larger than the variance of Y , where in the regime of N = cK log K for c < 1, the variance of our estimator is approximately K 2 c Var ( Y ). Using our asymptotic approximations, we have that (for N < K ): To the first order, the variance is Var ( Y ) = Ke −N/K (1 − e −N/K ). Splitting this into regimes (tabulated in Table S1), we have that: . Here, both estimators are essentially unbiased, and Var ( Y ) = O ( N ). N = O ( N ), .Here, Var ( Y ) = O ( N ), and e 2 N/K = Θ(1), and so our estimator still has MSE on the order of N . However, the squared bias of the naive estimator is growing, and is on the order of which dominates the MSE beyond N = Ω( K 2 / 3 ). Thus, the total MSE of the naive estimator is for , the squared bias is still dominated by the variance, but for N = Ω( K 2 / 3 ), the bias begins to dominate. N = Ω( K ). Here, Var ( Y ) = Ke −N/K is decaying, but our estimator now has inflated variance, leading to an MSE of order Ke N/K . The naive estimator is dominated by the squared bias, yielding an MSE of order N 2 . N ≥ 2 K log K . Here, Y = K with probability at least 1 − 1 /K , and so both estimators will have bias growing linearly in N . 
View this table: View inline View popup Download powerpoint Table S1: Comparison of Bias, Variance, and MSE for naive and collision-aware estimators across different regimes of N . This highlights that when $N = O(K^{2/3})$ both estimators attain essentially optimal performance, then the naive estimator has increasing bias which dominates the MSE. When N falls between K and K log K , the MSE of the naive estimator already scales as $N^{2}$, dominated by the bias, while our estimator performs slightly worse than the earlier MSE of Θ( N ), but still sub-quadratic. The high level takeaway is that in order to minimize the MSE, K is currently taken so that $N \le K^{2/3}$ to avoid the large bias of the naive estimator. However, K can in fact be selected such that N is a constant multiple of K , in which case the MSE is still O ( N ). S5.4 Cramér–Rao lower bound in binomial setting To begin, N is a discrete parameter, and so we technically cannot directly apply the Cramér–Rao lower bound. However, since the likelihood is well-defined as a continuous function of N , we can instead perform inference when N ∈ ℝ + and then apply the Cramér–Rao lower bound to the continuous function. Recall the indicator random variable based definition of Y , with $Z_j = \mathbb{1}\{\text{bin } j \text{ is filled}\}$, whether bin j is filled, for j ∈ [ K ], and Y = ∑ j Z j . Here, $\mathbb{P}(Z_j = 1) = q_N := 1 - (1 - 1/K)^{N}$. In our setting these Z j are correlated, making the analysis difficult, so we approximate Y by $\tilde{Y}$, a binomial where the Z j are independent: $\tilde{Y} \sim \mathrm{Binomial}(K, q_N)$. The mean of $\tilde{Y}$ matches Y , as: $\mathbb{E}[\tilde{Y}] = K q_N = \mathbb{E}[Y]$, and its variance is (in this binomial approximation) $\mathrm{Var}(\tilde{Y}) = K q_N (1 - q_N)$, which is approximately the variance of Y . The log-likelihood for $\tilde{Y}$ is then $\ell(N) = \log\binom{K}{\tilde{Y}} + \tilde{Y} \log q_N + (K - \tilde{Y}) \log(1 - q_N)$. The Fisher information is defined as $I(N) = \mathbb{E}\left[\left(\frac{\partial \ell}{\partial N}\right)^{2}\right] = \frac{(q_N')^{2}\, \mathbb{E}[(\tilde{Y} - K q_N)^{2}]}{q_N^{2} (1 - q_N)^{2}}$. Plugging in for the variance of $\tilde{Y}$ in the numerator, it follows that $I(N) = \frac{K (q_N')^{2}}{q_N (1 - q_N)}$. For large K , we may use the approximations $q_N \approx 1 - e^{-N/K}$ and $q_N' \approx e^{-N/K}/K$, so that $q_N (1 - q_N) \approx e^{-N/K}(1 - e^{-N/K})$. Then, the Fisher information becomes $I(N) \approx \frac{e^{-N/K}}{K (1 - e^{-N/K})} = \frac{1}{K (e^{N/K} - 1)}$. This allows us to state our Cramér–Rao lower bound: Theorem 1. Any unbiased estimator $\hat{N}$ of N , given observation $\tilde{Y}$, satisfies $\mathrm{Var}(\hat{N}) \ge K (e^{N/K} - 1)$. 
This approximate lower bound matches the first order approximation of the variance of Y (Equation (16)), highlighting the near-optimality of our estimator. Evaluating this expression, we have that as N approaches K ln K , we have e N/K ≈ K , so that , indicating a dramatic increase in estimation error. In this regime, where nearly all bins are occupied, even small differences in Y lead to large differences in , and no estimator can achieve significantly lower variance than the approximate lower bound. Our method-of-moments estimator achieves this approximate lower bound up to a constant factor, and so is near-optimal in this regime. Declarations S5.6 Conflict of interest/Competing interests The authors declare no conflicts of interest. S5.7 Data availability The 1k, 5k, and 10k PBMC datasets are publicly available from 10x Genomics: 1k PBMC: https://www.10xgenomics.com/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0 10k PBMC: https://www.10xgenomics.com/datasets/10k-human-pbmcs-3-v3-1-chromium-controller-3-1-high 5k PBMC: https://www.10xgenomics.com/datasets/5k_Human_Donor1_PBMC_3p_gem-x S5.8 Code availability Our code is publicly available at https://github.com/agdylan/minimal_UMIs . S5.5 Acknowledgments We thank Elvira Forte for her careful review of the manuscript and for insightful suggestions that improved its clarity. T.Z.B. was supported by the Eric and Wendy Schmidt Center at the Broad Institute. D.A. was supported by R25HG006682. RAI was supported in part by funding from NIH Grants R35GM131802, and R01HG005220. Funder Information Declared Eric and Wendy Schmidt Center at the Broad Institute National Human Genome Research Institute, https://ror.org/00baak391 , R01HG005220 , R25HG006682 National Institute of General Medical Sciences, https://ror.org/04q48ey07 , R35GM131802 References 1. ↵ Conesa , A. et al. A survey of best practices for rna-seq data analysis . Genome biology 17 , 1 – 19 ( 2016 ). OpenUrl CrossRef PubMed 2. 
↵ Kukurba , K. R. & Montgomery , S. B. Rna sequencing and analysis . Cold Spring Harbor Protocols 2015 , pdb – top084970 ( 2015 ). OpenUrl 3. ↵ Zheng , G. X. et al. Massively parallel digital transcriptional profiling of single cells . Nature communications 8 , 14049 ( 2017 ). OpenUrl PubMed 4. ↵ Parekh , S. , Ziegenhain , C. , Vieth , B. , Enard , W. & Hellmann, The impact of amplification on differential expression analyses by rna-seq . Scientific reports 6 , 25533 ( 2016 ). OpenUrl PubMed 5. ↵ Kivioja , T. et al. Counting absolute numbers of molecules using unique molecular identifiers . Nature methods 9 , 72 – 74 ( 2012 ). OpenUrl 6. ↵ Fu , G. K. , Hu , J. Wang , P.-H. & Fodor , S. P. Counting individual dna molecules by the stochastic attachment of diverse labels . Proceedings of the National Academy of Sciences 108 , 9026 – 9031 ( 2011 ). OpenUrl Abstract / FREE Full Text 7. ↵ Bose , S. et al. Scalable microfluidics for single-cell rna printing and sequencing . Genome biology 16 , 1 – 16 ( 2015 ). OpenUrl CrossRef PubMed 8. ↵ Hücker , S. M. et al. Single-cell microrna sequencing method comparison and application to cell lines and circulating lung tumor cells . Nature communications 12 , 4316 ( 2021 ). OpenUrl PubMed 9. ↵ Gustafsson , J. , Robinson , J. , Nielsen , J. & Pachter , L. Butterfly: addressing the pooled amplification paradox with unique molecular identifiers in single-cell rna-seq . Genome Biology 22 , 174 ( 2021 ). OpenUrl PubMed 10. ↵ Clement , K. , Farouni , R. , Bauer , D. E. & Pinello , L. Ampumi: design and analysis of unique molecular identifiers for deep amplicon sequencing . Bioinformatics 34 , i202 – i210 ( 2018 ). OpenUrl CrossRef PubMed 11. ↵ Smith , T. , Heger , A. & Sudbery , I. Umi-tools: modeling sequencing errors in unique molecular identifiers to improve quantification accuracy . Genome research 27 , 491 – 499 ( 2017 ). OpenUrl Abstract / FREE Full Text 12. ↵ He , D. et al. 
Alevin-fry unlocks rapid, accurate and memory-frugal quantification of single-cell rna-seq data . Nature Methods 19 , 316 – 322 ( 2022 ). OpenUrl PubMed 13. ↵ Tsagiopoulou , M. et al. Umic: a preprocessing method for umi deduplication and reads correction . Frontiers in Genetics 12 , 660366 ( 2021 ). OpenUrl PubMed 14. ↵ Philpott , M. et al. Nanopore sequencing of single-cell transcriptomes with sccolor-seq . Nature biotechnology 39 , 1517 – 1520 ( 2021 ). OpenUrl CrossRef PubMed 15. ↵ Sun , J. et al. Correcting pcr amplification errors in unique molecular identifiers to generate accurate numbers of sequencing molecules . Nature Methods 21 , 401 – 405 ( 2024 ). OpenUrl PubMed 16. ↵ Petukhov , V. et al. dropest: pipeline for accurate estimation of molecular counts in droplet-based single-cell rna-seq experiments . Genome biology 19 , 1 – 16 ( 2018 ). OpenUrl CrossRef PubMed 17. ↵ O’Neill , B. The classical occupancy distribution: computation and approximation . The American Statistician 75 , 364 – 375 ( 2021 ). OpenUrl 18. ↵ Feller , W. An introduction to probability theory and its applications, Volume 2 Vol. 2 ( John Wiley & Sons , 1991 ). 19. ↵ Cramér , H. Mathematical methods of statistics Vol. 9 ( Princeton university press , 1999 ). 20. ↵ Sun , J. et al. Enhancing single-cell transcriptomics using interposed anchor oligonucleotide sequences . Communications Biology 8 , 67 ( 2025 ). OpenUrl PubMed 21. ↵ 10x Genomics . Chromium next gem single cell 3’ reagent kits v3.1 (dual index) user guide ( 2022 ). User Guide CG000315, Rev E. 22. ↵ Domínguez Conde , C. et al. Cross-tissue immune cell analysis reveals tissue-specific features in humans . Science 376 , eabl5197 ( 2022 ). OpenUrl CrossRef PubMed 23. ↵ Wolf , F. A. , Angerer , P. & Theis , F. J. Scanpy: large-scale single-cell gene expression data analysis . Genome biology 19 , 15 ( 2018 ). OpenUrl CrossRef PubMed 24. ↵ Clarke , Z. A. & Bader , G. D. 
Malat1 expression indicates cell quality in single-cell rna sequencing data . BioRxiv 2024 – 07 ( 2024 ). 25. ↵ Wilusz , J. E. et al. A triple helix stabilizes the 3’ ends of long noncoding rnas that lack poly (a) tails . Genes & development 26 , 2392 – 2407 ( 2012 ). OpenUrl Abstract / FREE Full Text 26. ↵ Hwang , H.-K. & Janson , S. Local limit theorems for finite and infinite urn models . Annals of Probability 36 , 992 – 1022 ( 2008 ). OpenUrl View the discussion thread. Back to top Previous Next Posted September 12, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Unique molecular identifiers don’t need to be unique: a collision-aware estimator for RNA-seq quantification Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Unique molecular identifiers don’t need to be unique: a collision-aware estimator for RNA-seq quantification Dylan Agyemang , Rafael A. Irizarry , Tavor Z. Baharav bioRxiv 2025.09.08.674884; doi: https://doi.org/10.1101/2025.09.08.674884 Share This Article: Copy Citation Tools Unique molecular identifiers don’t need to be unique: a collision-aware estimator for RNA-seq quantification Dylan Agyemang , Rafael A. Irizarry , Tavor Z. 
Baharav bioRxiv 2025.09.08.674884; doi: https://doi.org/10.1101/2025.09.08.674884 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17635) Bioengineering (13859) Bioinformatics (41846) Biophysics (21401) Cancer Biology (18534) Cell Biology (25422) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24285) Genetics (15582) Genomics (22463) Immunology (17700) Microbiology (40298) Molecular Biology (17141) Neuroscience (88424) Paleontology (666) Pathology (2825) Pharmacology and Toxicology (4813) Physiology (7633) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00