Full text
112,659 characters
· extracted from
preprint-html
· click to expand
Improving RNA-seq protocols | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Improving RNA-seq protocols View ORCID Profile Felix Pförtner , View ORCID Profile Eva Briem , View ORCID Profile Wolfgang Enard , View ORCID Profile Daniel Richter doi: https://doi.org/10.1101/2025.08.20.671269 Felix Pförtner 1 Anthropology and Human Genomics, Faculty of Biology, Ludwig-Maximilians Universität in Munich , Großhaderner Str. 2, 82152 Planegg-Martinsried, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Felix Pförtner Eva Briem 1 Anthropology and Human Genomics, Faculty of Biology, Ludwig-Maximilians Universität in Munich , Großhaderner Str. 2, 82152 Planegg-Martinsried, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Eva Briem Wolfgang Enard 1 Anthropology and Human Genomics, Faculty of Biology, Ludwig-Maximilians Universität in Munich , Großhaderner Str. 2, 82152 Planegg-Martinsried, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Wolfgang Enard For correspondence: richter{at}bio.lmu.de enard{at}bio.lmu.de Daniel Richter 1 Anthropology and Human Genomics, Faculty of Biology, Ludwig-Maximilians Universität in Munich , Großhaderner Str. 2, 82152 Planegg-Martinsried, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Daniel Richter For correspondence: richter{at}bio.lmu.de enard{at}bio.lmu.de Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Download figure Open in new tab Bulk and single-cell RNA-seq are powerful tools for transcriptomic analysis, providing insights into many aspects of molecular and cellular phenotypes. Costs constrain the amount of biological insight obtainable within a given budget, and as sequencing prices decline, efficient library protocols have become a decisive factor. In this study, we introduce an approach to systematically optimize the number of usable reads that RNA-seq protocols generate. We applied this ”funnel strategy” to prime-seq, an early-barcoding bulk RNA-seq protocol, by systematically testing critical protocol steps totaling 1080 samples in 49 libraries. This resulted in the optimized prime-seq2 protocol that increases the number of usable reads by 60 % and improves one of the most cost-efficient bulk RNA-seq protocols available. Our study also suggests that monitoring the filtering of usable reads can serve as a valuable quality control for many RNA-seq protocols and sheds light on the complexity of the conditions and interactions that shape RNA-seq library composition and their interpretation. Introduction RNA sequencing (RNA-seq) has revolutionized molecular biology by enabling comprehensive analyses of transcriptomes [ 1 ]. This includes developments for genome annotation and isoform detection that were boosted by long-read sequencing technologies [ 2 ] and comprehensive characterization of cell types, enabled by single-cell RNA-seq and spatial transcriptomics [ 3 ]. While these exciting developments have justifiably garnered much attention, bulk RNA-seq, in which average expression levels of many cells are measured, remains an essential and complementary tool. Bulk RNA-seq is valued especially for its cost-efficiency and its easy and unbiased sampling procedure. This is of particular advantage when the cellular composition of samples is sufficiently homogeneous. Notably, the increasing availability of single-cell RNA-seq data, together with recent advances in deconvolution methods [ 4 , 5 , 6 ], is poised to significantly improve the interpretation—and thus the utility—of bulk RNA-seq data in the future. Many different bulk RNA-seq protocols have been developed [ 7 , 8 ]. They differ in aspects like the type of RNAs (e.g., polyadenylated RNAs, short RNAs, rRNA-depleted RNAs, newly synthesized RNAs) and the parts of RNAs (5’ ends, 3’ ends, full-length) that get incorporated in sequencing libraries. They differ in their sensitivity, i.e. how many cells are needed as input, their accuracy, i.e. how well cDNA molecules in the library correlate with RNA abundance, their complexity, i.e. how many different RNAs are detected at a given sequencing depth and their technical noise, i.e. the variation with which the detected RNAs are measured in technical replicates. Obviously, different questions and samples require different protocols. Here, we focus on probably the most frequent application of bulk RNA-seq, in which genome-wide expression levels of polyadenylated RNAs are correlated with variables of interest, such as drug treatment, genetic variants, or disease states. Data is usually interpreted by identifying differently expressed genes and their enrichment in molecular pathways and biological processes. This type of bulk RNA-seq application has been called RNA-seq for differential gene expression [ 1 ], mRNA Sequencing [ 9 ] or quantitative RNA-seq [ 10 ]. It is conceptually similar to a quantitative Reverse Transcription PCR (RT-qPCR) for many genes and is becoming as routine, as important, and as affordable as RT-qPCR [ 11 ]. As with many methods, costs are a crucial factor, as they limit the amount of biological insight within a given budget. Costs not only constrain the number of replicates and hence the statistical power to detect expression differences [ 12 , 13 , 14 ], but they also limit the number of variables (e.g., tissues, time points, treatments) that can be investigated. For RNA-seq, sequencing costs have long limited the number of samples to the notorious N=3 per condition. Ten years ago, that could be justified by budget limitations when costs for generating and sequencing a bulk RNA-seq sample were maybe around $200. But sequencing costs have dropped ten-fold in the last ten years [ 15 ] and one million reads can cost less than one dollar on the latest Illumina NovaSeq X machines. Hence, costs for sequencing an RNA-seq sample fell to $10-20, making costs of $60-164 per sample for commercial library generation kits the dominant cost factor [ 16 ]. As a consequence, improvements such as miniaturization and early barcoding, originally developed to improve scRNA-seq cost-efficiency [ 17 ], have become increasingly important to leverage the full potential of bulk RNA-seq. Early barcoding protocols (also called Tag-Seq, 3’ RNA-seq, or QuantSeq), in which a sample-specific barcode is introduced either at the 5’ or at the 3’ end of the polyT primed cDNA, allow for pooling cDNA before amplification and library generation. This decreases library reagent costs for non-commercial protocols to $2.53, $4.05, or $6.58 for prime-seq [ 16 ], BRB-seq [ 11 ], or DECODE-seq [ 18 ], respectively. In addition to the reagent costs per library, other factors like necessary equipment, hands-on time, or failure rates contribute to cost-efficiency. Of note, the low costs per library of these protocols make kit-based RNA isolations of ¼ $5 per sample a relevant factor, and protocols have established different methods to incorporate direct lysis or RNA purification steps [ 16 , 19 , 20 , 21 ]. Importantly, costs are directly comparable only at the same efficiency, i.e., at the same statistical power to detect differently expressed genes [ 22 ]. At a given number of replicates and sequencing reads, this depends mainly on the library complexity and the technical noise of the protocol [ 23 , 24 ]. While comprehensive benchmarking for early barcoding protocols has not been done, they generally perform fairly similarly to each other and fairly similarly to commercial gold-standards like TruSeq with respect to technical noise and complexity [ 16 , 11 ]. A potential drawback of early barcoding protocols is that mainly 5’ or 3’ ends of cDNAs are sequenced, and hence that expression levels are quantified per gene and not per transcript isoform. But as the available functional annotation is almost always gene-specific and not isoform-specific, most projects interpret the RNA-seq data anyway only at the level of differentially expressed genes. Additionally, quantifying expression levels of isoforms requires either long-read sequencing or deep short-read sequencing, which increases the sequencing costs per library considerably. Hence, the most cost-efficient way to quantify gene expression levels by RNA-seq is currently early barcoding RNA-seq. So while the cost-efficiency of bulk RNA-seq has made tremendous progress by reducing reagent costs per library, relatively little attention has been devoted to quantifying and optimizing how many usable reads, i.e., reads that are used to quantify gene expression levels, are actually produced per library. In general, RNA-seq libraries can differ enormously in that respect, as e.g., seen by an analysis of > 2000 tumor bulk RNA-seq datasets across > 40 studies in which the fraction of usable reads ranged from 0 % to 79 % at a median of 50 % [ 25 ]. Early barcoding protocols show similar patterns with substantial variation among protocols and among samples. For instance, increasing the number of usable reads was a main motivation to develop BRB-seq from the previous SCRB-seq protocol [ 11 ] , and when using prime-seq (another SCRB-seq derivative), we also observed substantial variations in the number of usable reads among samples and especially among sample types. In particular, we observed that some sequencing runs on patterned flow cells of Illumina’s NextSeq generated fewer usable reads and especially fewer total reads than expected. We had not observed this before on non-patterned flow cells such as Illumina’s Hi-Seq, probably because on those flow cells, one can ignore clustering molecules that do not generate sequencing reads when overloading flow cells. To optimize the number of usable reads for prime-seq we systematically varied different steps in the protocol. In this way, we could increase the number of usable reads by 60 %, importantly, without compromising the complexity and noise of the prime-seq libraries. This new version - prime-seq2 - is a considerable improvement in cost-efficiency, reducing sequencing costs per sample, e.g., from $53.1 to $33.2, when generating 10 million usable UMIs at $1.50 per million reads, at an unchanged cost of ∼$3.5 per library including RNA isolation. As we think that many commercial or home-brew early barcoding protocols could profit from such an optimization, we present our approach as a generally applicable optimization strategy that visualizes the amount of usable reads at different processing steps as a funnel ( Fig. 1A ). Download figure Open in new tab Figure 1. The funnel strategy and its application to prime-seq. A , Funnel strategy scheme. The total reads from the sequencer - having per definition at least a Read1 sequence - are filtered for the presence of both Illumina indices, for the presence of long high quality reads, for the presence of a barcode introduced with the RT-primers, for reads mapping to the coding strand of annotated exons and introns and for reads that differ in their UMI sequence and hence are not PCR duplicates. These UMI counts per gene and per sample constitute the count matrix that is used for all subsequent analysis and hence is the number of usable reads. B , Overview of the prime-seq workflow and the parameters that have been tested in this study. For a more detailed graphical version, see Figure S7. C , DNA amount measured after the original DNase treatment under high salt conditions on beads and under the optimized conditions in solution (n=64; Two independent replicates of 16 samples each per condition.) D , The fraction of reads mapping to exons and introns (left) and intergenic regions (right) indicate that the optimized DNA digest removes remaining genomic DNA (n=32; 16 samples per condition in one library.) E, The fraction of reads mapping to exons and introns is lower in the original RT condition (Maxima H- at 42 °C) than in the other tested conditions (lines are medians of n=4 per condition). F, The number of detected genes, downsampled to 7 million reads per sample is higher in the original RT condition (Maxima H- at 42 °C) than in the other tested conditions (dots are averages of n=4 per condition). G , The fraction of reads mapping to exons and introns decreases strongly with increasing amounts of samples pooled after reverse transcription. Samples à 10 ng of total RNA were reverse transcribed, pooled, exonuclease digested, and then added to the cDNA amplification PCR in amounts corresponding to 8 (80 ng), 32 (320 ng), and 92 (920 ng) samples (total RNA), respectively. This was done in a total of 40 replicates (n=8 for 80 ng and n=16 for 320 ng and 920 ng) from which one library per condition was generated P-values were calculated in unpaired two-sided t-tests (ns: p > 0.05; *: p ≤ 0.05; **: p ≤ 0.01; ***: p ≤ 0.001; ****: p ≤ 0.0001). Materials and Methods Preparation of SPRI Beads Homemade solid-phase reversible immobilization (SPRI) beads are used throughout the paper. They are produced as follows: the 22 % PEG solution (22 % (w/v) PEG 8000, 1 M NaCl, 10 mM Tris-HCl pH 8, 1 mM EDTA. 0.01 % IGEPAL, 0.05 % sodium azide, to 49 ml with UltraPure (UP) H 2 O) was heated at 40 °C to dissolve PEG. Sera-Mag Speed Beads (1 ml) were separated from their solution using magnets. The supernatant was removed. The resulting pellet was washed twice in TE buffer (10 mM Tris-HCl, pH 8, 1 mM EDTA) and eluted in 0.9 ml TE buffer. This bead suspension was mixed with the 22 % PEG solution. SPRI beads were stored at 4 °C and equilibrated to room temperature before usage. Nucleic Acid Purification using SPRI Beads Home-made SPRI beads were used for nucleic acid purification at multiple steps during the RNA-seq protocol. In the ensuing procedures, ”cleanup” always followed this protocol: Bead suspension was added in the specified ratio, incubated for 5 min, and the beads were anchored using magnets. The supernatant was discarded, and the beads washed twice with 50-1000 µ l 80 % ethanol in UP H 2 O. Beads were air-dried, removed from the magnet and eluted in UP H 2 O. After 5 min incubation, beads were anchored again using magnets and the supernatant was transferred to a new well. Sample Preparation To compare different RNA-seq protocols on the same source material, an abundant and representative tissue or cell lysate was needed. Brain tissue from a 250-day-old male Foxp2 hum/hum mouse (5H11 line [ 26 ]) was used. The tissue was homogenized in RLT Plus with 1 % β -mercaptoethanol and stored at -70 °C. For validation, a second, more challenging sample was used, which delivered worse results with the original protocol. Striatal punches of C57BL/6J mice were lysed in RLT Plus with 1 % β -mercaptoethanol and stored at -70 °C. Total RNA concentrations were determined using the QuantiFluor RNA System after Proteinase K treatment, DNA digest and purifications using SPRI beads. Standard Prime-seq Workflow The RNA-seq workflow used as a starting point is prime-seq [ 16 ]. It is termed as the “original” prime-seq method and used in this way if not mentioned otherwise. It is described in the following section and visualized in Fig. S7. The full list of reagents and oligonucleotides can be found in Tab. 1 & 2 . View this table: View inline View popup Table 1. Enzymes & Chemicals. View this table: View inline View popup Table 2. Oligonucleotides used in this study. ”Phos” signifies a 5’ phosphorylation. ”*” signifies a PTO bond. E3V7NEXT contains a region (UMI) and an anchor of random bases denoted by ”N”. Regions in square brackets are barcodes (BC) or indices (i5 & i7) with varying sequences. cDNA Preparation Once the RNA concentration of the standard lysate sample had been established, the standard input of the samples was adjusted to target approximately 10 ng of total RNA per sample. First, proteins were digested using 20 µ g Proteinase K in 0.5 mM EDTA at 50 °C for 15 min, followed by heat inactivation at 75 °C for 10 min. SPRI beads were used in a 2× ratio (given as bead volume to sample volume) to purify the samples as described above. In this step, samples were resuspended in 5 µ l UP H 2 O without removing the beads. DNA was digested on beads using 1 unit of DNase I at 20 °C for 10 min in 1× DNase I buffer and 1× bead binding buffer (11 % (w/v) PEG 8000, 0.5 M NaCl, 5 mM Tris-HCl pH 8, 0.005 % IGEPAL, 0.025 % sodium azide). EDTA was added to 4.5 mM before heat-inactivation at 65 °C for 5 min. The samples were cleaned up again and eluted in 4 µ l UP H 2 O. RT was performed in 1× Maxima H Minus buffer using 30 units Maxima H Minus, 1 mM each dNTP, 1 µ M template-switching oligo (TSO), and 1 µ M barcoded oligo (dT) primers (E3V7NEXT). The RT reaction was incubated at 42 °C for 90 min. The samples were combined, purified using SPRI beads at a 1× ratio and eluted in 17 µ l in UP H 2 O. This “pooled” sample was termed the “preAmp pool”. Residual primers were digested using 20 units Exonuclease I in 2 µ l 10× Exonuclease I buffer at 37 °C for 20 min with ensuing heat inactivation at 80 °C for 10 min. The samples were cleaned using SPRI beads at a 0.8× ratio, and eluted in 20 µ l of UP H 2 O. cDNA was amplified in 1× KAPA HiFi Ready Mix and 0.6 µ M SINGV6 primer, according to this PCR protocol: initial denaturation at 98 °C for 3 min, denaturation at 98 °C for 15 s, annealing at 65 °C for 30 s, elongation at 68 °C for 4 min, and final elongation at 72 °C for 10 min. Denaturation, annealing, and elongation were repeated for 10–23 cycles, depending on the initial amount of total RNA. The DNA was cleaned with SPRI beads in a 0.8× ratio and eluted with 10 µ l of UP H 2 O. The cDNA concentration was measured using the QuantiFluor dsDNA System and the quality of the cDNA was assessed using a Bioanalyzer. Library Preparation Libraries were prepared using the NEBNext Ultra II FS Library Preparation Kit. 1.2-4 µ l cDNA (usually 4–20 ng) were fragmented using the FS Enzyme Mix and FS Reaction Buffer in a 6 µ l reaction by incubating at 37 °C for 5 min, then at 65 °C for 30 min. 0.5 µ l ligation adapter (1.5 µ M) was ligated using the Ligation Master Mix and Ligation Enhancer in a reaction volume of 12.7 µ l by incubating at 20 °C for 15 min. Double-sided size selection was performed using SPRIselect beads according to the manufacturer, with a bead ratio of 0.52× for the removal of long DNA fragments and a ratio of 0.72× for the removal of short fragments. Samples were eluted in 11 µ l 0.1× TE buffer. 10.5 µ l DNA sample was amplified using 12.5 µ l Q5 Master Mix, 1 µ L Nextera P7 index primer (5 µ M), and 1 µ L TruSeq P5 index primer (5 µ M) following this PCR protocol: initial denaturation at 98 °C for 30 s, denaturation at 98 °C for 10 s, annealing and elongation at 65 °C for 1 min 15 s, and final elongation at 65 °C for 5 min. Denaturation and annealing/elongation were repeated for 10–13 cycles, depending on the initial amount of cDNA. Double-sided size selection was performed as above. The initial DNA concentration was measured using the QuantiFluor dsDNA System and, more precisely, using a Bioanalyzer, which also allowed for quality assessment. Prime-seq Interventions The following experiments are all modifications of the original prime-seq protocol described above. Cleanup Resuspension The cleanup steps following Proteinase K and DNase I digest were altered to include resuspension in ethanol. After adding 80 % ethanol, the plate was taken from the magnet, beads were resuspended in ethanol by pipetting, beads were reanchored using the magnet and ethanol was removed. This procedure was repeated in the second wash step. DNase I Digest In order to decrease the NaCl concentration during DNase I digest, bead binding buffer was removed from the reaction, and UP H 2 O was added to keep the reaction volume constant. Consequently, 18 µ l bead binding buffer was added after heat-inactivation to obtain the same PEG concentration as with the initially added bead binding buffer. Reverse Transcription The incubation temperature during reverse transcription (RT) was increased from 42 °C to 50 °C. The number of PCR cycles during preAmp PCR was increased by six cycles at 50 °C to obtain similar cDNA yields. Additionally, Superscript IV was tested against the standard Maxima H Minus as an alternative reverse transcriptase (RTase). In this reaction setup, Maxima buffer was swapped for Superscript IV buffer, and DTT was added to 5 mM, while keeping all other reagents and the total volume constant. cDNA amplification pool size To test the effect of combined RNA input in the cDNA amplification, pools of 8, 32, and 92 samples were compared. Sequencing data was downsampled to equal read numbers for analysis. For length binning, each gene was represented by a single transcript selected in the following hierarchical order: (1) the consensus coding sequence (CCDS), (2) the transcript with the highest support level, and (3) the longest transcript. To determine anti-sense intergenic reads, all intergenic reads were recounted using zUMIs (v2.9.7 [ 27 ]) with library strandedness set to reverse. Fragmentation To determine the optimal fragmentation time, cDNA from standard brain lysate was processed in the standard prime-seq library preparation without size selection. 5, 15, and 80 ng cDNA were fragmented for 0, 2, 3.5, 4, 4.5, 5, 6, 7, 8, or 15 min. Fragmentation was compared using a Bioanalyzer. Ligation Adapters and Index primers Altered ligation adapters and index primers were compared. The mirrored ligation adapter contains swapped strands to have a 3’ overhang instead of a 5’ overhang. The blocked prime-seq ligation contains an added dideoxycytidine at its 3’ end. The P5+RS and P7-RS primers correspond to the primers used in the original prime-seq protocol. The P5-RS primer was shortened by 8 nt from the 3’ end, which decreases its melting temperature from 75 °C to 71 °C. The P7+RS primer was extended by 19 nt at the 3’ end to its maximum length, which increases its melting temperature from 65 °C to 74 °C. Only the P5/7+RS primers contain the complete start sites for priming of the read 1 and read 2 sequencing reads, respectively. In combinations of P5+RS and P7+RS, Annealing/Elongation temperature during library PCR was increased from 65 °C to 72 °C, as calculated using the NEB Tm calculator. PTO bonds In the cDNA PCR and the library PCR, SINGV6 and index primers (P5+RS and P7+RS) with two 3’-terminal phosphorothioate (PTO) bonds were used. Sequencing RNA-seq libraries were sequenced by the Blum Lab as part of the Laboratory for Functional Genome Analysis (LAFUGA) of the Gene Center Munich. Sequencing was performed on an Illumina NextSeq 1000/2000 instrument with the following setup: Read 1: 28 bp, Index 1: 8 bp, Index 2: 8 bp, Read 2: 93 bp. DNase digest test DNase digest was compared using the original protocol, the improved protocol, and a negative control lacking DNase I. 8 samples per condition of mouse liver lysate containing 50 ng total RNA each were used. After Proteinase K treatment, the DNase I digest was done as in the original protocol, without DNase I and using the improved low-salt conditions. RNA was quantified using the QuantiFluor RNA System. RNA was digested using 1 µ l RNase A per sample at room temperature for 90 min. DNA was quantified using the QuantiFluor dsDNA system after a bead clean-up. The experiment was independently repeated. Data Analysis Prime-seq libraries were demultiplexed with deML (v1.1.1 [ 28 ]) using the indices in the index primers. Terminal poly(A)s were trimmed from read 2 and read pairs were filtered to minimum lengths of 28 nt read 1 and 40 nt read 2 using cutadapt (v3.2 [ 29 ]): ”cutadapt -A A{30} –discard-casava –minimum- length 28:40”. zUMIs (v2.9.7 [ 27 ]) was used for further processing. Reads were filtered if more than 2 barcode bases had Q-scores below 20 or if more than 3 UMI bases had Q-scores below 20. Barcodes (read 1, 1-12 nt) were demultiplexed and binned within a Hamming distance of 2. Remaining reads were mapped to the genome GRCm39. Exonic and intronic reads and UMIs (read 1, 13-28 nt) were counted per gene according to Gencode version 34. Power simulations were performed with powsimR (v1.2.5 [ 30 ]). For each protocol and sample size setup (3 vs. 3, 6 vs. 6, 12-20 vs. 12-20, 24 vs. 24, and 48 vs. 48), 25 simulations were performed with a negative binomial distribution, trimmed mean of M values (TMM) normalization, limma-voom DE-method, and 10 % differentially expressed genes. The log fold change was modeled by a gamma distribution with shape = 1 and rate = 2. The data were stratified by absolute log fold change or mean expression. Further analysis was done in R (v4.2.3 [ 31 ]) using tidyverse (v2.0.0 [ 32 ]), data.table (v1.14.8 [ 33 ]), ggbeeswarm (v0.7.2 [ 34 ]), purrr (v1.0.2 [ 35 ]), ggpubr (v0.6.0[36]), patchwork (v1.1.3 [ 37 ]), cowplot (v1.1.1[38]), ShortRead (v1.56.1 [ 39 ]) and bioanalyzeR (v0.10.1 [ 40 ]). Reagents Results A funnel strategy to systematically optimize the number of usable reads for bulk RNA-seq protocols When sequencing RNA-seq libraries, a certain number of reads per sample are targeted, often between 5 and 20 million reads, depending on the number and types of replicates and the intended analyses. After pooling and quantification, one loads the required amount of library molecules on the flow cells of short-read sequencers such as Illumina’s NextSeq or NovaSeq. This sets the price tag, but how many reads one gets from the flow cell and how many of those can be used for quantifying expression levels depends largely on the molecules in the library and, hence, largely on the RNA-seq protocol. For example, reads that cluster on the flow cell but lack a functional read primer binding site will lead to fewer reads than expected. Similarly, molecules lacking indices or barcodes, having inserts not derived from mRNAs or PCR duplicates (identified by having the same Unique Molecular identifier - UMI), will be filtered out and reduce the number of usable reads that go into the count matrix, i.e. the table in which reads are counted per sample (columns) and gene (rows). Note that we use reads mapping to exons and introns of genes, as it has been shown that intronic reads are generally derived from pre-mRNA and help to quantify gene expression levels [ 41 , 16 , 27 ]. Our goal was to increase the cost-efficiency of our prime-seq protocol by increasing the number of usable reads without compromising the complexity of the libraries. To visualize bottlenecks, we found it helpful to use a ”read funnel” displaying the number of usable reads retained after each sequential filtering step ( Fig. 1A ), a strategy allowing a better visualization and systematic optimization of RNA-seq protocols in general. Identifying potential optimizations for prime-seq We had previously developed and benchmarked prime-seq and could show that its power is comparable to TruSeq, making it fourfold more cost-efficient due to almost 50-fold cheaper library costs [ 16 ]. It uses the principles of poly(A) priming, template switching, early barcoding, and UMIs to generate 3’ tagged RNA-seq libraries ( Fig. 1B and Fig. S7) and hence is similar to 3’ early barcoding bulk RNA-seq protocols like BRB-seq [ 11 ] or DRUG-seq [ 20 ] and to the 3’ early barcoding scRNA-seq protocol of 10x Genomics. Prime-seq has been used extensively in our lab over the past years. During that time, and especially with the switch from non-patterned flow cells of Illumina’s HiSeq to the patterned flow cells of the NextSeq, we noticed that, for some samples, sequencing yields were 20–30 % lower than expected based on Illumina’s nominal flow cell output specifications. We also noticed that we often lost a considerable (10-30 %) amount of reads for which we couldn’t assign the i7 index, and that some sample types had a very high proportion of intergenic reads ( > 30 %). These patterns were not immediately apparent, as they had always been present to some degree and varied substantially across samples. Furthermore, it is not obvious how these patterns are caused and how they could be fixed, given the complexity of interactions among the different protocol steps and sample types. As even small improvements could provide considerable benefits for a highly used protocol and as testing variations might reveal general mechanistic insights into the generation of complex sequencing libraries, we tested nine protocol steps that we identified as promising for increasing the number of usable reads ( Fig. 1B ). We iteratively tested these protocol modifications, amounting to a total of 1080 RNA-seq samples in 49 libraries for a total of 131 million reads that we structured and visualized using our funnel strategy. For initial testing, a mouse whole-brain lysate was used as an abundant, homogeneous sample. Later, lysates from mouse striatal punch biopsies were used as more challenging samples for which we had previously seen a very low fraction of usable reads with the old protocol. In the following, we discuss the tested modifications in the order they appear within the protocol. Additional RNA washing increases exonic reads but lowers complexity RNA extraction from lysates comprises the digestion of proteins using Proteinase K, followed by a DNase treatment. After each step, samples are purified using SPRI beads to remove substances such as salts, small nucleic acids, proteins, lipids, or polysaccharides that can negatively impact subsequent steps in the protocol. This is typically done by adding ethanol for a few seconds to the immobilized beads on the magnet. We reasoned that a resuspension of the beads in ethanol at the washing steps might enhance the purification process by removing more contaminants and, in this way, increase the number of usable reads. This additional resuspension increased the mean proportion of exonic reads from 68 % to 74 % while reducing intronic, intergenic and unmapped reads (Fig. S1A). However, additional resuspension also led to a reduction in the mean number of UMIs per read by 2 % (Fig. S1B). We also observed a non-significant 7 % decrease in the library complexity, i.e. in the number of detected genes at 1.25 million reads per sample with a larger spread of detected genes among samples (Fig. S1C). Hence, the additional RNA washing increased exonic reads, but led to fewer usable reads in the end and to less complex and potentially more variable libraries. We do not know the reasons for these changes, but as the additional washing does not improve the number of usable reads and as it increases hands-on time, we decided not to incorporate this additional washing step into an improved prime-seq protocol. However, it might remain an option worth testing for lysates with more challenging compositions than whole brain lysates, such as samples with a very high lipid content or chlorophyll-containing samples. Improved DNase treatment reduces intergenic reads Funnels from previous prime-seq projects indicated that a considerable fraction of reads is lost at exon and intron assignment due to 10-20 % of reads mapping to intergenic regions within the genome, i.e. all reads not mapping to the expected strand of the current version of GENCODE annotated exons and introns [ 27 ]. Reverse transcriptases used for cDNA generation, including the engineered MMLV-type RTs such as Maxima and Superscript, can generally also use single-stranded DNA as a template [ 42 ] and hence these reads can in principle be derived from contaminating genomic DNA. However, our previous experiments with mixing human total RNA and mouse genomic DNA had shown that the fraction of UMIs derived from mouse genomic DNA is below 5 %, even if more than half of the input is mouse DNA and no DNase I treatment is done [ 16 ]. Furthermore, the fraction of UMIs derived from mouse genomic DNA is ∼ 0 %, even if more than half of the input is mouse DNA and a DNA digest is done, although this digest is done under suboptimal high salt conditions on beads [ 16 ]. These findings are difficult to reconcile with a high fraction of intergenic reads derived from genomic DNA. However, one potential explanation could be the different behavior of an intact, high molecular weight DNA present in the mouse-human test samples and a more degraded, low molecular weight DNA in problematic samples that show a high number of intergenic reads. Larger DNA fragments would be expected to stick more firmly to the beads and, consequently, could be less accessible for reverse transcription than smaller fragments. Consequently, samples with fragmented genomic DNA in combination with a partial DNA digest due to suboptimal conditions on the beads could lead to more accessible DNA and hence to more intergenic reads. In contrast, under optimal conditions, DNase I should be able to digest the entire genomic DNA to di-, tri-, and short oligonucleotides [ 43 ], which are then removed during bead clean-up. Hence, we tested whether an optimized DNA digest would lead to less DNA and to fewer intergenic reads. For this, we compared conditions without DNase I treatment, with the original DNase I treatment under suboptimal (high-salt) conditions on beads, and with the optimized DNase I treatment under low-salt conditions off beads. Indeed, when performing the prime-seq RNA isolation protocol on the mouse whole brain samples, the DNA content in the purified lysate was reduced from 1.7 ± 0.4 ng DNA (mean ± SD) for the original DNase I treatment to 0.0 ± 0.2 ng DNA for the optimized treatment ( Fig. 1C ). This indicates strongly that the original protocol indeed left residual DNA after DNA digestion, which was eliminated in the optimized digest that lowered RNA content only minimally (Fig. S2A). Interestingly, samples without any DNase I treatment contained 0.3 ± 0.2 ng of DNA, i.e., considerably less than for the original DNase I treatment. This supports that high molecular weight DNA indeed binds strongly to beads and hence cannot be eluted very well. To assess the impact of the improved DNase conditions on the final data, we produced libraries from the mouse brain lysates with and without this optimization. As hypothesized, this reduced the fraction of intergenic reads compared to the original protocol from 16 % to 13 % and accordingly increased the fraction of exonic and intronic reads from 80 % to 83 % ( Fig. 1D ). Assuming that the optimized DNA digest is complete, as supported by the absence of DNA after elution and as expected from the DNase I activity under these conditions, the remaining intergenic reads must be derived from reverse transcribed RNA (see discussion below). In summary, we were able to optimize the genomic DNA digestion within the protocol without additional costs. For our mouse brain samples, this increased the number of usable reads by 3 %, but we expect that the improvement can also be considerably bigger for samples that contain more DNA and more partially degraded DNA. Hence, we integrated this optimized DNA digest into our protocol. Reverse transcription as a trade-off between complexity and specificity Reverse transcription (RT) is the crucial step in any RNA-seq workflow. To prime reverse transcription of mRNAs, prime-seq currently uses an oligo(dT) 30 primer with a VN-anchor and the RTase Maxima H- at 42 °C. However, the manufacturer suggests a reaction temperature of 50 °C for this enzyme when utilizing oligo(dT) 18 primers. Increasing the temperature during reverse transcription is expected to enhance priming specificity and to facilitate the denaturation of RNA secondary structures, thereby improving access to more mRNAs [ 44 ]. Additionally, other RTases could improve specificity and complexity of the generated cDNAs, such as the generally high-performing Superscript IV [ 45 , 46 , 47 , 48 ]. To test the influence of the RTase as well as the RT temperature, we tested both RTases, Maxima H- and Superscript IV, at 42 °C and 50 °C with the mouse brain lysate. This resulted in 6 % and 4 % more exonic and intronic reads when using Maxima H- and Superscript IV, respectively. Similarly, substituting Maxima H- with Superscript IV resulted in 4 % and 2 % more exonic and intronic reads at 42 °C and 50 °C, respectively. In total, switching from Maxima H- at 42 °C to Superscript IV at 50 °C elevated the mean exonic and intronic read fraction from 82 % to 89 % ( Fig. 1E ). However, these enhancements also led to a decreased number of detected genes. At a sequencing depth of 3 million reads per sample, increasing the temperature alone reduced the mean number of detected genes by 32 %. Switching the enzyme to Superscript IV further decreased the mean number of detected genes by 42 % ( Fig. 1F ). So while we see an improvement in the number of usable reads, this increase in specificity comes at the cost of library complexity. This trade-off illuminates the importance of considering not only one quality metric when optimizing a protocol. As we think that a gain of 8 % in usable reads when increasing temperature and switching RTase is not worth a loss of 60 % of detected genes, we decided to stick with using Maxima H- at 42 °C. Pooling fewer samples increases the number of usable reads The key advantage of early barcoding methods is the pooling of cDNA after Reverse Transcription for PCR amplification and library construction, which reduces handling time, reagent costs, and technical variation. However, multi-template PCRs are known to be prone to technical artifacts such as chimera formation and biased amplification [ 49 ] and the type and number of artifacts are expected to depend on the number of PCR cycles, the composition of the template pool, and the amount of template. The prime-seq protocol already minimizes the number of PCR cycles, and the composition of the template pool is given by the analysed samples and hence also not optimizable. But as we occasionally observed lower fractions of usable reads when pooling more samples, we tested whether the amount of template in the cDNA, i.e., the number of pooled samples, affects the number of usable reads. To investigate this effect, we isolated RNA and set up reverse transcription reactions from 132 mouse brain lysate aliquots that contain ∼ 10 ng of total RNA each. We pooled 8, 32, and 92 of them for the subsequent exonuclease treatment, preamplification, and library generation, following the standard prime-seq protocol for these sample sizes. Note that this includes minimizing the number of PCR cycles, which in this case corresponds to 13, 10, and 7 cycles for the pools of 8, 32, and 92 samples, respectively. Strikingly, the increase in input led to a decrease in the mean proportion of exonic and intronic sequences from 81.0 % to 70.1 % ( Fig. 1G ). In other words, using more cDNA as input in the preamplification PCR leads to fewer usable reads, despite a lower number of PCR cycles, which is considered to reduce technical artifacts in multi-template PCRs. The magnitude of this effect was surprising, also because we are not aware of any findings that report an effect of the amount of cDNA input on the quality of bulk or single-cell RNA-seq libraries. Analysing the effect in more detail, we found that the decrease in usable reads was caused almost entirely by an increase in intergenic reads and this increase was caused almost entirely by reads mapping to the opposite, i.e. antisense, strand of exons and introns, increasing from 3 % in the 80 ng condition to 11 % in the 920 ng condition (Fig. S3A). In general, antisense reads have been attributed to off-target priming, i.e., to priming of the oligodT primer and the TSO primer to other sites than the polyA tail and the 5’ end of the cDNA, respectively [ 50 ]. Off-target priming is prevalent in essentially all bulk and single-cell/single-nucleus RNA-seq libraries and a recent analysis found that on average 18 % (max. 30 %) of UMIs are antisense among the seven analyzed 10X Genomics datasets [ 51 ]. So off-target priming in general and antisense reads in particular are a common source of non-usable reads in early barcoding protocols like prime-seq. We discuss the causes and consequences of off-target priming in more detail below. Of relevance here is that there is no obvious mechanistic explanation why antisense reads are more prevalent when pooling more samples, although an increased amplification bias for shorter molecules could play a role (Fig. S3B). Independent of the mechanism, our findings indicate that pooling fewer samples is better for prime-seq and potentially also for other early-barcoding protocols. Hence, our previous recommendation of pooling up to 384 samples of 4-40 ng RNA each [ 16 ] should not be followed. Rather, we recommend that pooling should be limited to ∼ 48-96 samples with ∼ 500 ng of total RNA as input, as this seems a reasonable ad hoc compromise between the time and cost savings of pooling and the disadvantage of less usable reads. Fragmentation time is robust to input amounts After its amplification, full-length cDNA needs to be fragmented to lengths compatible with short-read sequencing technologies such as Illumina. If the fragments are too long, the sequencing quality degrades [ 52 ] and clustering efficiency on the flow cell decreases [ 53 ]. Conversely, if the fragments are too short, the sequencing cycles may exceed the insert length, leading to unwanted sequencing of adapter sequences [ 54 ]. The prime-seq adapter sequences are 197 bp (including 12 bp barcode and 16 bp UMI), and we sequence the cDNA insert with only one read of 93 bp, so we aim for a total fragment size of 290-650 bp (Fig. S7). Prime-seq uses enzymatic fragmentation (NEBNext Ultra II FS) and double-sided SPRI bead purification to obtain this range of fragments. It is desirable to retain as much of the complexity present in the amplified full-length cDNA also within the fragmented cDNA, and hence to adapt fragmentation time to maximize DNA yield in the size range of 250-650 bp. We tested 5, 15, and 80 ng of full-length cDNA for fragmentation times of 2-8 minutes and determined the DNA yield in the desired size range on a Bioanalyzer (Fig. S4). We found that the 5 minutes of fragmentation time used for our original protocol works well for 5 ng and the recommended 15 ng, with a maximum yield of 66 % DNA mass at 4.5 minutes. At an input of 80 ng cDNA, fragmentation for 5 minutes yielded 60 % DNA yield and a maximum of 67 % was obtained when fragmenting for 7 minutes. Hence, the originally recommended 5-minute fragmentation time for 15 ng cDNA works well and is reasonably robust even when 3-fold higher or lower cDNA amounts are used. Slightly longer fragmentation times are optimal for higher cDNA amounts, but as cDNA concentration is measured before fragmentation, we keep the recommendation of using 15 ng cDNA and 5-minute fragmentation also in prime-seq2. Improved adapter ligation and library PCR increase the number of usable reads To generate an Illumina sequencing library from these cDNA fragments, it is necessary to add indices (i5 and i7) and the complete flow cell adapter sequences (P5 and P7) that are necessary for Illumina sequencing. As prime-seq is a 3’-tagged early barcoding method, only cDNA fragments containing the 3’ end can be utilized, as only these contain the oligodT primer with the sample-specific barcode, the UMI, and the partial P5 sequence for priming. To generate an Illumina sequencing library from these cDNA fragments, one performs end repair and dA-tailing before ligating an adapter to the 5’ end ( Fig. 2A , S7). This adapter ligation generates DNA molecules that are amplified in the following “library PCR” that adds the indices (i5 and i7) and the complete flow cell adapter sequences (P5 and P7) necessary for Illumina sequencing ( Fig. 2A ). We suspected that the adapters and the indexing primers are suboptimal and contribute to unusable reads in the library. As interactions among adapter ligation and indexing primers are likely, we also tested the combination of these modifications. To minimize batch effects and biases, we started from one homogeneous cDNA pool and constructed three independent libraries per condition with 24 replicates each for all 12 combinations of adapters and primers, totaling 864 samples. For the indexing primers used in the original prime-seq protocol, we noticed that they might be suboptimal as their annealing sites differ considerably in length, resulting in a melting temperature (Tm) difference of approximately 6 °C. These primers were retained from prime-seq’s predecessor protocol, SCRB-seq [ 55 ], and while we are not aware of any positive consequences of this Tm difference, it is known that it can lead to inefficient and biased amplification [ 56 ]. To test whether an optimization of the library PCR could improve the number of usable reads, we tested four indexing primer combinations that differ by the presence of a read start site and hence in their length and Tm ( Fig. 2A ): The original protocol combines a long P5 primer that contains the read start site of Illumina’s Read 1 sequencing primer (”P5+RS”) and a short P7 primer that does not contain the read start site of Illumina’s Read 2 sequencing primer (”P7-RS”). We designed the two additional versions, P5-RS and P7+RS, resulting in the four possible primer combinations. We adjusted the annealing and elongation temperature from 65 °C to 72 °C for the P5+RS and P7+RS primer combination, and generated libraries from the same mouse brain cDNA. Download figure Open in new tab Figure 2. Possible improvements during adapter ligation and library PCR. A , Principle of adapter ligation and library PCR. Adapters are ligated to the cDNA fragments after fragmentation using a 3’ A-overhang. Note that the biotinylated SINGV6 primer prevents ligation on the P5 site (B - Biotin). Alternative mirrored and blocked ligation adapters were designed to prevent the formation of P7-P7 fragments in the following library PCR that amplifies fragments and adds indices (i5 and i7) as well as the complete flow cell adapter sequences (FC binding regions P5 and P7). In the original protocol, a P5 primer with and a P7 primer without a read start (P5+RS & P7-RS) were used, and we tested all four primer combinations with and without a read start for all three adapters with 24 samples per condition and three independent library preparations each (n= 864). B , Median ratio of read numbers after index assignment with the new P7+RS relative to the original P7-RS. (n=18 pairs). C , Median ratio of read numbers after index assignment with each new ligation adapter relative to the original ligation adapter. (n=24 pairs each). D , Mean fraction of assigned barcodes per combination of index primers with the original ligation adapter (n=3 independent library replicates per condition) E , Mean fraction of mapped exonic and intronic reads per combination of index primers with the original ligation adapter (n=72 samples per condition) F , Mean fraction of assigned barcodes per combination of index primers with the new ligation adapters (n=3 independent library replicates per condition) G , Mean fraction of mapped exonic and intronic reads per combination of index primers with the new ligation adapters (n=72 samples per condition) H , Total reads relative to the improved protocol without PTO bonds (n=2 independent library replicates per condition. Bar shows the mean; error bars show maximal/minimal values.). I , Fraction of assigned barcodes with and without PTO bonds (n=42; Two independent library replicates of 10/11 samples per condition.). P-values were calculated in paired two-sided t-tests (ns: p > 0.05; *: p ≤ 0.05; **: p ≤ 0.01; ***: p ≤ 0.001; ****: p ≤ 0.0001). We find that libraries generated with P7+RS instead of the original P7-RS have 37 % more reads with a correct i5 and i7 index (index assignment step), and libraries generated with P5-RS instead of P5+RS have 43.2 % fewer ( Fig. 2B , S5). Hence, the longer primers generate more sequenceable library molecules, confirming that our previous conditions were suboptimal. Importantly, these additional reads generated by P7+RS and P5+RS library PCRs had the same fraction of reads with assigned barcodes ( Fig. 2D ) and even a 2 % higher fraction of reads in exons and introns ( Fig. 2E ), indicating that not only more, but also more usable reads are produced when using P7+RS and P5+RS indexing primers. The ligation adapter used in the original prime-seq protocol has a 15-base 5’-overhang, so that the P7 index primer can anneal only after the P5 index primer has elongated a fragment ( Fig. 2A ). This setup is intended to prevent the amplification of P7-P7 fragments that are generated from internal cDNA fragments that have ligation adapters on both ends. However, this design overlooked that the polymerase can synthesize the complementary overhang of the ligation adapter, enabling the priming of P7 index primers without the prior elongation of the P5 index primer. To address this issue, we developed two variants of ligation adapters ( Fig. 2A ). The first variant is a mirrored adapter, which has a 3’-overhang instead of a 5’-overhang. This variant leads to complete amplicons only when molecules are initially primed and elongated from the P7 site. Initial priming and elongation from the P5 site, as well as P7-P7 fragments, generate ”dead-end” amplicons lacking an overhang on the opposite side for P5 binding. The second variant is a blocked adapter, which includes a dideoxycytidine at its 3’-end. This prevents the synthesis of the complementary overhang and hence should lead to the preferential amplification of fragments with a P5 and a P7 end. The mirrored and the blocked adapters increased the number of reads after index assignment by 5 % and 16 %, respectively ( Fig. 2C ). As for the increased number of reads produced by P7+RS compared to P7-RS, this improvement is likely due to a reduction in irregular fragments, which are quantified but fail to produce valid reads. Importantly, these additional reads with ∼2 % also had a slightly higher fraction of reads with barcodes ( Fig. 2F ) and an up to 5 % increase in exonic and intronic reads ( Fig. 2G ), indicating that not only more, but also more usable sequences are produced. In summary, both new ligation adapters perform better than the original ligation adapter regarding the number of sequencable reads, fraction of assigned barcodes, and fraction of mapped exons and introns. Both perform best when combined with the P7+RS & P5+RS index primers. As the blocked adapter performed slightly better with respect to the number of sequenceable reads and because it prevents, by design, more mispriming than the mirrored adapter, we integrated the blocked adapter and the P7+RS & P5+RS index primers into the prime-seq2 protocol. PTO bonds prevent primer editing Proofreading polymerases are commonly used in RNA-seq protocols due to their 3’-5’ exonuclease activity, which reduces PCR errors [ 57 , 58 , 59 ]. However, when mismatches occur between the primer and template, these polymerases can modify primer sequences, leading to unspecific amplification [ 60 , 61 ]. Previous studies have shown that incorporating phosphorothioate (PTO) bonds into the primers’ 3’ ends can block this primer editing [ 62 , 60 ]. We incorporated PTO bonds into the last two 3’ bonds of all PCR primers (SINGV6, P5+RS, P7+RS) and tested their effects using the lysates from mouse striatal punch biopsies. We detected a mean increase in total reads of 11.7 % when using primers with PTO bonds ( Fig. 2H ). This could be attributed to a decrease in irregular fragments resulting from mispriming, which distort quantification but do not produce valid reads as described above. We also noted a 1.4 % higher fraction of mean assigned barcodes with significantly less variation among samples (Levene’s Test, p < 0.05; Fig. 2I ). Hence, the incorporation of PTO bonds improved the number of usable reads, and we integrated primers with PTO bonds in the prime-seq2 protocol. Validation of the prime-seq2 protocol Finally, we compared the combination of improvements in the protocol with the original prime-seq protocol using lysates from mouse striatal punch biopsies, generating four libraries of 32 samples per protocol. The improved protocol (prime-seq2) includes the optimized DNA digest, the 3’ blocked ligation adapter, a library PCR with P5+RS and P7+RS primers, and their higher annealing temperature and PTO bonds in SINGV6, P5+RS, and P7+RS primers. Note that for both protocols, 32 samples are pooled, and potential additional improvements from pooling fewer samples ( Fig. 1G ) are not included in this comparison. For both protocols, we visualize the fraction of reads remaining after each step of the read funnel, arbitrarily setting the average number of total reads obtained from the prime-seq2 libraries as 100 % ( Fig. 3A ). What matters most is that the number of usable reads at the end of the funnel, i.e., the fraction of UMIs, increases from 28 % to 45 %, i.e., by 60 % ( Fig. 3A, E ). Hence, the prime-seq2 protocol represents a considerable improvement in the number of usable reads. Going through each step of the funnel for the total read fraction ( Fig. 3A ) and for the relative change ( Fig. 3B ) helps to explain how this improvement occurs. Download figure Open in new tab Figure 3. Overall comparison of the improved protocol. A , Funnel plot showing the read number at each step relative to the total reads with the improved protocol. The first four steps use one data point per library, while the last two show one data point per sample, increasing the variation. Lines show means; error bars show SE. (n=64; Four libraries of 32 samples (8 barcodes) per condition.) B , Change in total reads per step relative to the previous step. Error bars show SE. C , Total reads per flow cell relative to the nominal flow cell output according to the manufacturer. (n=17) D , Fraction of barcode-assigned reads mapped to exons, introns, or intergenic regions and unmapped reads. (n=64; Four libraries of 32 samples (eight barcodes) each per condition.) E , Number of UMIs relative to the original protocol. (n=64; Four libraries of 32 samples (eight barcodes) each per condition.) F , Power analysis of both protocols at equal flow cell share across replicate numbers. Points and lines show means; error bars show SE. P-values were calculated in unpaired two-sided t-tests (ns: p > 0.05; *: p ≤ 0.05; **: p ≤ 0.01; ***: p ≤ 0.001; ****: p ≤ 0.0001). The new protocol prime-seq2 increases the total reads by a mean of 27 %, i.e., for a given amount of library molecules, 27 % more reads are obtained from a flow cell. As explained above, we think that this is caused by fewer molecules that get quantified but fail to cluster or - more importantly - cluster, but fail to produce an R1 sequence read and hence do not get detected by the sequencer. Of note, we observed this effect not only in the direct comparison of the two protocol versions done here, but also across 17 unrelated projects from a variety of species and cell types that we sequenced on Illumina’s NextSeq1000/2000 using P2, P3, and P4 flow cells (Tab. S1). The 10 and 7 libraries prepared with the old and the new protocol had 97 % and 118 % of the expected flow cell output, respectively (MWU test, p < 0.05; Fig. 3C ). In the next processing step - index assignment - 19 % of the reads get filtered out in the original protocol, but only 2.5 % in prime-seq2. This indicates that more molecules contain a proper i5 and i7 index in prime-seq2. Both protocols then filter a similarly small percentage of reads during trimming & filtering (3-4 %) and barcode assignment (4 %). This is partly expected, as neither the RT-primer nor the fragmentation and size-selection were changed in prime-seq2, and partly indicates that the PTO bonds do not improve the assigned barcode fraction decisively. The generally small percentage of filtered reads shows that these two steps are not a major bottleneck for the number of usable reads in prime-seq. A considerable fraction of barcode-assigned reads, 26 % and 21 % for the old and the new version, respectively, is filtered because they do not map to an annotated exon or intron. 4 % and 3 % cannot be mapped at all and 21 % and 17 % map to intergenic regions ( Fig. 3D ). Hence, prime-seq2 does not only produce 60 % more barcode assigned reads ( Fig. 3A ), but among them is a significantly higher fraction of exonic and intronic reads. This is probably caused largely by the improved DNA digest ( Fig. 1D ) and the new index primers ( Fig. 2E ). Why, despite a presumably complete DNA digest, the fraction of intergenic reads is still at 17 % is interesting and discussed in more detail below. In the final UMI collapsing processing step, 26 % and 29 % of the exonic and intronic reads are filtered out. Hence, there is a slightly higher fraction of duplicated reads in the prime-seq2 protocol for unclear reasons. Importantly and as mentioned in the outset, the final number of usable reads, i.e. the total number of exonic and intronic UMIs increases significantly from 28 % to 45 %, i.e. by 60 % ( Fig. 3A, E ). To quantify how this improvement translates into performance, we used power simulations as implemented in powsimR [ 30 ]. PowsimR estimates the mean-variance distribution of the data, in our case, of the two protocol versions, and then simulates differently expressed genes between two conditions to obtain a ground truth against which simulations for a certain number of replicates per condition can be compared. We estimated the mean variance distributions using equal flow cell shares and hence took the increased number of total reads of prime-seq2 into account. We simulated that 10 % of genes were differentially expressed, with log-fold changes following a gamma distribution, and varied the number of replicates per condition from three to 48. The prime-seq2 protocol consistently had a higher true positive rate (TPR) in detecting differentially expressed genes across all numbers of replicates. prime-seq2 achieves an 80 % TPR with only 13 samples per group, whereas the original prime-seq protocol requires 17 samples to reach the same threshold. Importantly, the false discovery rate (FDR) was not compromised and was actually lower than that of the original protocol for low replicate numbers and similar at higher replicate numbers ( Fig. 3F , S6). Therefore, the protocol improvements increase the power to detect differentially expressed genes without additional sequencing costs. Discussion Bulk RNA-seq is a powerful method for many biological questions, and its cost-efficiency is a crucial factor, limiting biological insights at a given budget. Early barcoding protocols have reduced library costs per sample by 10-50-fold by minimizing reagent costs and increasing throughput [ 16 ]. What has been largely neglected in protocol developments is understanding and improving the number of usable reads that are obtained from RNA-seq libraries. Here, we systematically optimized the 3’ tagged early barcoding protocol prime-seq. This results in the improved protocol prime-seq2 that generates 60 % more usable reads and presents one of the most widely used and cost-efficient home-brew bulk RNA-seq protocols available. Further, we think our optimization approach reveals insights and approaches how to optimize other bulk, single-cell, and single-nucleus RNA-seq protocols. Finally, our results provide insights into the types of molecules that get amplified in such protocols and how this affects the interpretation of RNA-seq profiles. We discuss these issues in turn. Powerful and cost-efficient RNA-seq with prime-seq2 While using the original prime-seq protocol [ 16 ] in various projects, we noticed that some samples generated fewer usable reads than expected, especially after switching to sequencing on Illumina’s patterned flow cells. We identified three main issues: (i) lower-than-expected total sequencing reads per flow cell, (ii) loss of reads due to missing index assignments, and (iii) a high proportion of intergenic reads. By systematically testing adjustments of different protocol steps as presented above, the resulting prime-seq2 protocol led to 27 % more reads per flow cell, 21 % more index assignments and 7.6 % more exonic and intronic reads ( Fig. 3A-C ). While it is not possible to unambiguously attribute each improvement to specific adjustments, we propose the following plausible explanations: (i): Lower-than-expected total sequencing reads can be obtained when library molecules are quantified, but fail to amplify on the flow cell, and when library molecules get amplified, but lack a functional Read 1 sequencing primer binding site ( Fig. 2A , S7). Our improvement of 27 % ( Fig. 3A-C ) is likely caused by blocked ligation adapters, longer P7 primers and the primers with PTO bonds as these modifications are expected to increase the specificity of the library PCR and hence produce more sequenceable molecules. This is supported by our findings that these modifications lead to 16 % ( Fig. 2C ), 37 % ( Fig. 2B ), and 12 % ( Fig. 2H ) more index-assigned reads, respectively. (Note that these numbers reflect both the increased total reads and the improved index assignment. Disentangling these two factors would require separate flow cells, which was only done during validation of the full protocol ( Fig. 3A-C )). (ii): Clusters that get filtered out at the indx assignment steps have either no proper i5 or no proper i7 index. Similar to library molecules that lack a functional Read 1 primer binding site, this can be caused by molecules lacking a functional Read 2 (i7) sequencing primer binding site or lacking a functional Read 3 sequencing primer binding site (i5). As above, our improvement of 21 % ( Fig. 3A-B ) is likely caused by our blocked ligation adapters ( Fig. 2C ), longer P7 primers ( Fig. 2B ), and the incorporation of PTO bonds ( Fig. 2H ), as these should increase the specificity of the library PCR and hence produce more sequenceable molecules. Importantly, this would help little if the additional reads did not contain usable reads mapping to exons and introns. Fortunately, the improved library PCR generates even 3 % more of those reads ( Fig. 2G ) and hence really does increase the number of usable reads. (iii) Intergenic reads, i.e., reads mapping to the genome, but not to the sense strand of annotated exons and introns, were reduced from 21 % to 17 % in the improved protocol ( Fig. 3D ). Intergenic reads were reduced by optimizing DNA digest conditions, which seems especially relevant if degraded genomic DNA is present (see Fig. 1C-D , 2). As no genomic DNA is detectable after the optimized digest ( Fig. 1C ), essentially all intergenic reads in the improved protocol must be derived from RNA. Remarkably, the fraction of intergenic reads can vary considerably and is influenced by the conditions of reverse transcription ( Fig. 1E ), the amount of samples pooled for cDNA amplification ( Fig. 1G ), and by the library PCR conditions ( Fig. 2G ). The causes and consequences of intergenic reads are discussed in more detail below. Here, it is worth mentioning that the biggest reduction of over 10 % of intergenic reads is achieved when pooling 8 lysates à 10 ng total RNA instead of 92 ( Fig. 1G ). Note that this effect is masked in our final protocol comparison, as we pooled 32 samples both for the original as well as the prime-seq2 protocol. We currently do not know how this pooling effect is caused, but the increased amount of antisense reads and the reduced representation of long transcripts (Fig. S3) indicate that pooling more cDNA could increase the amplification bias towards shorter molecules in the preamplification and/or the library PCR. This likely plays a role also in other early barcoding bulk and scRNA-seq protocols and hence might be a relevant parameter that can improve early barcoding protocols in general. For the current version of prime-seq2, we recommend limiting the libraries to pools of ∼500 ng of total RNA, which is a reasonable compromise for most of our projects with respect to intergenic reads, reagent costs, and hands-on time. In summary, prime-seq2 delivers on average 60 % more UMIs than prime-seq and hence reduces sequencing costs by an average of 38 %. For example, this allows for the generation of bulk RNA-seq libraries of 96 samples at 2.5€ per sample, including RNA isolation (Tab. S2) and sequencing data for another 20-40€ per sample (assuming 5-10 million usable reads per sample and 2€ per million paired-end reads). To our knowledge, this makes it the most cost-efficient bulk RNA-seq protocol [ 16 ], although other early-barcoding bulk RNA-seq protocols are in the same range, and proper benchmarking would be required to compare cost-efficiency quantitatively. Prime-seq is well used by us and others [ 63 , 64 , 65 , 66 , 67 , 68 , 69 ] and due to its low set-up costs, which essentially comprises the 96 custom oligos for ∼1250 € (Tab. S2), it is an attractive alternative to commercial kits for core facilities and labs that frequently perform bulk RNA-seq. The detailed step-by-step protocol of prime-seq2 is accessible on protocols.io: doi.org/10.17504/protocols.io.14egn97kpl5d/v1. Monitoring and optimizing RNA-seq protocols using the funnel Our strategy to monitor and optimize prime-seq could be useful also for other RNA-seq protocols, as the number of usable reads varies widely within and among other bulk and single-cell RNA-seq protocols [ 25 , 70 ] and very similar principles like poly(A) priming, early barcoding, and cDNA and library amplification apply to most of them. We think that just monitoring the different filter steps from total reads to usable reads in more detail, i.e., by applying the funnel strategy, could be a useful QC metric that can identify issues for particular samples, protocols, and/or sequencing platforms. For example, lower rates of index assignment can pinpoint non-sequenceable library molecules, or a high fraction of intergenic reads could pinpoint suboptimal library amplification, as discussed above for prime-seq. If appropriate, identified bottlenecks could then be addressed by adjusting individual protocol steps. When optimizing such steps, it is important to also track other performance parameters such as complexity or technical noise as was e.g. relevant when we tried optimizing RT-conditions or for prime-seq ( Fig. 1F ). It can also be important to consider non-obvious interactions of sample types and protocol steps as we found for the DNA digest that was apparently working well for samples with high molecular weight genomic DNA contamination [ 16 ], but turned out to be optimizable for samples with degraded genomic DNA modifications ( Fig. 1D ). Another illustration of the usefulness of the funnel is that for prime-seq2, still about 29 % of the reads are lost in the UMI-collapsing step ( Fig. 3A ). This indicates that measures to increase the complexity of the library, such as increasing the input further, optimizing cycle number, and amplification conditions, might be a bottleneck that is worth addressing next. In summary, we think that the funnel can be a useful QC metric to monitor and optimize bulk and scRNA-seq protocols. Priming, mispriming, and library complexity Our optimizations also shed light on another aspect of polyA-primed RNA-seq protocols that we think is worth discussing, which is the relation between the specificity of the oligo-dT priming and the complexity of the resulting libraries. The idealized view is that oligodT priming on the polyA tail of an mRNA leads to reverse transcription of the (entire) mRNA and is followed by template switching and second-strand synthesis. The following cDNA amplification and library PCR leads to library molecules in which - for 3’ tagged protocols -the 3’-end of the cDNA is read, starting ∼200bp downstream of the polyadenylation site ( Fig. 1B and 7 ). Due to only one priming site - the polyA tail - the number of reads is independent of the length of the mRNA. However, stretches of adenine also occur within transcripts (”internal polyA”), which inevitably leads to internal oligo-dT priming, for which already six or seven consecutive adenines seem sufficient [ 71 ]. This off-target priming is prevalent in bulk and single-cell RNA-seq data and ranges from 95 % for intronic reads, dependent on the protocol and the sample (Table S1 in [ 71 ]). As intronic reads are derived largely from nascent, i.e., (partially) unspliced transcripts, and those are confined to the nucleus, the use of intronic reads is especially relevant for single-nucleus RNA-seq [ 72 , 27 ] ). But also for single-cell data, intronic reads can account for over 30 % or over 50 % of all reads from mouse embryonic brain samples or human PBMCs, respectively [ 50 ]. This creates a dependency between the number of reads and the occurrence of internal poly(A) stretches that in turn leads to the observed gene length bias, i.e., a correlation of number of reads and transcript length [ 73 , 74 , 71 ]. This reduces the correlation of read counts and the number of RNA molecules in the samples, i.e., reduces the accuracy of the RNA quantification. It has been proposed to filter or correct these off-target reads [ 71 ], but this does not work well for real data [ 74 ]. However, most bulk and single-cell analyses do not profit from a slightly higher accuracy, as they focus on comparing relative RNA levels among samples or cells to identify differentially expressed genes or distinguish different cell types. For those purposes, the complexity of a library is much more relevant, i.e., how many genes are detected at a given sequencing depth or - more precisely - for how many genes one has sufficient reads, i.e. statistical power, to detect differential expression. As it turns out that thousands of genes are largely or even exclusively detected by intronic reads [ 75 ], off-target reads are now considered more as a feature than as a bug, exemplified by intronic reads being now the default for single-cell and single-nucleus data in the 10x CellRanger software [ 76 ]. In the context of this study, this matters, as we not only assumed that intronic reads are usable, but we also observe a relationship of priming specificity and library complexity: When increasing the temperature for the reverse transcription reaction, we do increase the number of usable, i.e. exonic and intronic reads by 6 % ( Fig. 1E ), but we also decrease the complexity by detecting only 2/3 of the genes ( Fig. 1F ). As alluded to above, this can be explained if the higher temperature reduces off-target priming for many genes that are not well detected by on-target oligo-dT priming. Also, other protocols probably differ considerably in off-target priming rates as seen by the differences in library complexity and fraction of intronic reads among commercial scRNA-seq protocols [ 70 ]. For example, the 5’ expression kit from 10x has three times fewer intronic reads than the 3’ expression kit [ 70 , 50 ]. This is likely caused by the lower off-target priming rate of the TSO compared to the oligodT [ 50 ] and this likely leads to the lower number of detected genes per cell (∼2000 and ∼3000 in 5’ and 3’ PBMC data, respectively [ 70 ]), as a similar reduction in gene number is seen when counting solely exons in 3’ expression data [ 76 ]. While the balance between off- and on-target priming seems important for the performance of RNA-seq protocols, it is currently unclear where an optimal balance might lie, how it could best be measured, and whether it should and could be optimized. In any case, it is likely that it is a major factor contributing to differences in gene expression levels among protocols [ 22 ], among RNA isolation methods [ 16 ] , or usable reads as discussed below. In summary, we find for prime-seq that more stringent reverse transcription leads to slightly more usable reads, but less complex libraries. This stresses the general importance of optimizing not one parameter in isolation and likely reflects the balance between off- and on-target priming, which is an important but largely unexplored parameter for RNA-seq libraries. Usable and unusable reads in RNA-seq Barcode assigned reads (”reads” in the following) are mapped to the genome and are used further if they map to the sense strand of an annotated exon or intron. For the most recent mouse gene annotation GENCODE34 [ 77 ] that is used here, these are 21,414 annotated protein coding genes, 15,131 long non-coding RNA genes, 6,105 small non-coding RNA genes or 13,754 pseudogenes that amount to 19.5 % of the mouse genome. A considerable fraction of reads - 17 % for the prime-seq2 validation data ( Fig. 3D ) - map to intergenic regions and their origin and information content seem worth discussing. Several possible sources for these intergenic reads can be distinguished: 1) Genomic DNA: This is a potential source of intergenic reads as the reverse transcriptase can use DNA as template. However, as discussed above, this source is probably negligible for prime-seq2 due to its optimized DNA digest. 2) Mismapping: A substantial fraction of reads in RNA-seq - ∼10 % of reads for our data - map equally well to 2-50 different regions of the genome ( > 50 are filtered out). There are different possibilities to handle such multimapped reads [ 78 ] and STAR randomly assigns them to one region. Among the intergenic reads, 2.9 %pt. have an equally good match in an exonic or intronic region (data not shown). Assuming that such reads actually do originate from an annotated gene - as currently assumed, e.g., by STARsolo - this would reduce the fraction of intergenic reads to 14.1 %. 3) Antisense: For the 3’tagged libraries of 10x, it has been convincingly argued that antisense reads arise when the second cDNA strand is not generated by the TSO, but instead by the oligodT primer hybridizing to internal polyA stretches on the first strand cDNA [ 50 ]. This generates one sense and one antisense molecule in the library, indeed leading to a correlation of sense and antisense reads that is even higher than the exon-intron correlation [ 50 ]. For 10x scRNA-seq data, on average 14 % of all UMIs mapping to exons/introns are antisense [ 50 ]. For prime-seq2, we find comparable numbers, with ∼5 % of all reads being antisense (Fig. S3A), which corresponds to ∼9 % of all UMIs mapping to exons/introns. To our knowledge, it has not been investigated whether using antisense reads would increase or decrease the power to detect differentially expressed genes in bulk or scRNA-seq data, but as introns do increase this power, the even more correlated antisense reads are likely to do so as well. However, the current practice, as evident by the default settings of CellRanger and STARsolo, is not to use them. Of note, RNA-seq protocols based on random priming are not strand-aware and hence necessarily use sense and antisense reads to quantify expression levels. 4) Non-annotated read-through isoforms: It has been shown that thousands of genes that are detected by in situ hybridization in a mouse brain region have reads from a 10x experiment of the same region mapping within 10 kb downstream of their annotated 3’ end [ 75 ]. These reads make up 25 % of all intergenic reads in this dataset and ∼2000 genes are only detected when using these reads [ 75 ]. A good proportion of these reads might be derived from short-lived read-through transcripts that arise especially during stress [ 79 ], but read-through transcripts are also found for 1/3 of all protein-coding genes in ”normal” human tissues [ 80 , 81 , 82 ]. Either way, they can be regarded as non-annotated isoforms of the upstream genes and, similar to using introns, increase the effective complexity of the library by allowing more genes to be tested for differential expression. Also in our data, we find ∼25 % of all intergenic reads to map in sense orientation to regions 10 kb downstream of annotated genes (data not shown), suggesting that a good fraction of intergenic reads might be derived from non-annotated read-through transcripts. While it has been suggested to incorporate such reads for quantification [ 75 ] and recent ENCODE versions have taken this up and integrated options for extending human and mouse gene annotation [ 83 ], it is not the current practice to use those reads. 6) Pervasive transcription: After microarrays and Next-Generation Sequencing had made deep transcriptome profiling possible, it became clear that a large fraction of the genome is transcribed to some extent [ 84 ]. This includes the above read-through transcripts, transcripts originating from enhancers (eRNA) or promotors (PROMPTS), and gives rise to the 36,172 currently annotated long non-coding RNAs [ 83 ] and many more non-annotated ones [ 85 ]. The overly enthusiastic functional interpretations of these transcripts have been rightly criticized [ 86 ] and it is not clear how to interpret annotated and non-annotated lincRNAs in quantitative RNA-seq experiments. Many probably do reflect the transcriptional activity of the region in some way, and interestingly, it has been suggested to exploit these reads in scRNA-seq data to infer chromatin states [ 51 ]. In summary, the 17 % intergenic reads consist of 2.3 %pt. mismapped reads, ∼5 %pt. antisense reads, and 4.3 %pt. putative read-throughs. Most of these reads are probably directly related to the canonical transcription of (protein-coding) genes and hence probably improve their detection and quantification, similar to introns. However, in contrast to introns, there is no consensus when and how often these reads might originate from different processes, like stress-induced read through or genuine antisense transcription that might regulate sense transcription [ 87 ]. It is probably due to this uncertainty that they are not used in current practice. However, given that they do help to detect and quantify genes known to be expressed [ 75 ], further work is warranted to explore whether this practice might be worth changing. Conclusion In this study, we have optimized the number of usable reads by systematically testing different steps in a bulk RNA-seq protocol. This has resulted in prime-seq2, one of the most cost-efficient home-brew bulk RNA-seq protocols available. Our study suggests that monitoring the filtering of usable reads can serve as a valuable quality control for many bulk and single-cell RNA-seq protocols. It also highlights the complexity of the conditions and interactions that shape RNA-seq library composition, with implications for how these data should be interpreted. Data Availability The data discussed in this publication have been deposited in NCBI’s Gene Expression Omnibus [ 88 ] and are accessible through GEO Series accession number GSE286365 ( ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE286365 ). The remaining data and code for data analysis and visualization can be accessed at github.com/Hellmann-Lab/prime-seq2 and as a stable version at 10.5281/zenodo.16902725 Supplementary Data Supplementary Data are available at NAR Online. Author Contributions FP, EB, WE and DR conceived the study. FP, EB and DR conducted the experiments. FP and EB analyzed the data. FP, EB, WE and DR wrote the manuscript. Funding This work was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) – Project Number 238187445 – SFB 1123, Subproject Z02 (W.E. and E.B.). Conflict of Interest Disclosure None declared. Acknowledgments We thank Ines Bliesener for lab support and the Enard/Hellmann group for helpful discussions. We thank Dr. Stefan Krebs and the staff of LAFUGA for sequencing services. Some illustrations in the graphical abstract, Fig. 1 - 3 , and Fig. S7 were created with BioRender.com. Funder Information Declared Deutsche Forschungsgemeinschaft, https://ror.org/018mejw64 , 238187445 Footnotes https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE286365 https://github.com/Hellmann-Lab/prime-seq https://doi.org/10.5281/zenodo.16902725 https://dx.doi.org/10.17504/protocols.io.14egn97kpl5d/v1 References [1]. ↵ Stark , R. , Grzelak , M. , and Hadfield , J. (24 November, 2019 ) RNA sequencing: the teenage years . Nat. Rev. Genet ., 20 ( 11 ), 631 – 656 . OpenUrl CrossRef PubMed [2]. ↵ Foord , C. , Hsu , J. , Jarroux , J. , Hu , W. , Belchikov , N. , Pollard , S. , He , Y. , Joglekar , A. , and Tilgner , H. U. (January, 2023 ) The variables on RNA molecules: concert or cacophony? Answers in long-read sequencing . Nat. Methods , 20 ( 1 ), 20 – 24 . OpenUrl CrossRef PubMed [3]. ↵ Rood , J. E. , Wynne , S. , Robson , L. , Hupalowska , A. , Randell , J. , Teichmann , S. A. , and Regev , A . (January, 2025 ) The Human Cell Atlas from a cell census to a unified foundation model . Nature , 637 ( 8048 ), 1065 – 1071 . OpenUrl CrossRef PubMed [4]. ↵ Avila Cobos , F. , Alquicira-Hernandez , J. , Powell , J. E. , Mestdagh , P. , and De Preter , K. (6 November, 2020 ) Benchmarking of cell type deconvolution pipelines for transcriptomics data . Nat. Commun ., 11 ( 1 ), 5650 . OpenUrl CrossRef PubMed [5]. ↵ Nguyen , H. , Nguyen , H. , Tran , D. , Draghici , S. , and Nguyen , T. (22 May, 2024 ) Fourteen years of cellular deconvolution: methodology, applications, technical evaluation and outstanding challenges . Nucleic Acids Res ., 52 ( 9 ), 4761 – 4783 . OpenUrl CrossRef PubMed [6]. ↵ White , B. S. , de Reyniès , A. , Newman , A. M. , Waterfall , J. J. , Lamb , A. , Petitprez , F. , Lin , Y. , Yu , R. , Guerrero-Gimenez , M. E. , Domanskyi , S. , Monaco , G. , Chung , V. , Banerjee , J. , Derrick , D. , Valdeolivas , A. , Li , H. , Xiao , X. , Wang , S. , Zheng , F. , Yang , W. , Catania , C. A. , Lang , B. J. , Bertus , T. J. , Piermarocchi , C. , Caruso , F. P. , Ceccarelli , M. , Yu , T. , Guo , X. , Bletz , J. , Coller , J. , Maecker , H. , Duault , C. , Shokoohi , V. , Patel , S. , Liliental , J. E. , Simon , S. , de Reyniès , A. , Saez-Rodriguez , J. , Heiser , L. M. , Guinney , J. , and Gentles , A. J. (27 August, 2024 ) Community assessment of methods to deconvolve cellular composition from bulk gene expression . Nat Commun , 15 ( 1 ), 1 – 22 . OpenUrl CrossRef PubMed [7]. ↵ Hrdlickova , R. , Toloue , M. , and Tian , B. (January, 2017 ) RNA-Seq methods for transcriptome analysis . Wiley Interdiscip. Rev. RNA , 8 ( 1 ). OpenUrl [8]. ↵ Thind , A. S. , Monga , I. , Thakur , P. K. , Kumari , P. , Dindhoria , K. , Krzak , M. , Ranson , M. , and Ashford , B. (5 November, 2021 ) Demystifying emerging bulk RNA-Seq applications: the application and utility of bioinformatic methodology . Brief. Bioinform ., 22 ( 6 ). [9]. ↵ mRNA Sequencing . https://emea.illumina.com/techniques/sequencing/rna-sequencing/mrna-seq.html Accessed: 2025-2-6 . [10]. ↵ Ziegenhain , C. , Vieth , B. , Parekh , S. , Hellmann , I. , and Enard , W. (1 July, 2018 ) Quantitative single-cell transcriptomics . Brief. Funct. Genomics , 17 ( 4 ), 220 – 232 . OpenUrl PubMed [11]. ↵ Alpern , D. , Gardeux , V. , Russeil , J. , Mangeat , B. , Meireles-Filho , A. C. A. , Breysse , R. , Hacker , D. , and Deplancke , B. (19 April, 2019 ) BRB-seq: ultra-affordable high-throughput transcriptomics enabled by bulk RNA barcoding and sequencing . Genome Biol ., 20 ( 1 ), 71 . OpenUrl CrossRef PubMed [12]. ↵ Schurch , N. J. , Schofield , P. , Gierlinśki , M. , Cole , C. , Sherstnev , A. , Singh , V. , Wrobel , N. , Gharbi , K. , Simpson , G. G. , Owen-Hughes , T. , Blaxter , M. , and Barton , G. J. (June, 2016 ) How many biological replicates are needed in an RNA-seq experiment and which differential expression tool should you use? . RNA , 22 ( 6 ), 839 – 851 . OpenUrl Abstract / FREE Full Text [13]. ↵ Lazic , S. E. , Clarke-Williams , C. J. , and Munafó , M. R. (4 April, 2018 ) What exactly is ’N’ in cell culture and animal experiments? . PLoS Biol ., 16 ( 4 ), e2005282 . OpenUrl CrossRef PubMed [14]. ↵ Liu , Y. , Zhou , J. , and White , K. P. (1 February, 2014 ) RNA-seq differential expression studies: more sequence or more replication? . Bioinformatics , 30 ( 3 ), 301 – 304 . OpenUrl CrossRef PubMed Web of Science [15]. ↵ Wetterstrand , K. A . DNA Sequencing Costs: Data from the NHGRI Genome Sequencing Program (GSP) . https://www.genome.gov/about-genomics/fact-sheets/DNA-Sequencing-Costs-Data (14 November, 2024 ) Accessed: 2024-11-14 . [16]. ↵ Janjic , A. , Wange , L. E. , Bagnoli , J. W. , Geuder , J. , Nguyen , P. , Richter , D. , Vieth , B. , Vick , B. , Jeremias , I. , Ziegenhain , C. , Hellmann , I. , and Enard , W. (31 March, 2022 ) Prime-seq, efficient and powerful bulk RNA sequencing . Genome Biol ., 23 ( 1 ), 88 . OpenUrl CrossRef PubMed [17]. ↵ Svensson , V. , Vento-Tormo , R. , and Teichmann , S. A. (April, 2018 ) Exponential scaling of single-cell RNA-seq in the past decade . Nat. Protoc ., 13 ( 4 ), 599 – 604 . OpenUrl CrossRef PubMed [18]. ↵ Li , Y. , Yang , H. , Zhang , H. , Liu , Y. , Shang , H. , Zhao , H. , Zhang , T. , and Tu , Q. (23 March, 2020 ) Decode-seq: a practical approach to improve differential gene expression analysis . Genome Biol ., 21 ( 1 ), 66 . OpenUrl CrossRef PubMed [19]. ↵ O’Keeffe , P. , Nouri , Y. , Saw , H. S. , Moore , Z. , Baldwin , T. M. , Olechnowicz , S. W. Z. , Jabbari , J. S. , Squire , D. M. , Leslie , S. , Wang , C. , You , Y. , Ritchie , M. E. , Cross , R. S. , Jenkins , M. R. , Audiger , C. , Naik , S. H. , Whittle , J. R. , Freytag , S. , Best , S. A. , Hickey , P. F. , Amann-Zalcenstein , D. , Bowden , R. , and Brown , D. V. (21 November, 2024 ) TIRE-seq: An integrated sample extraction and transcriptomics workflow for high throughput perturbation studies . bioRxiv ,. [20]. ↵ Ye , C. , Ho , D. J. , Neri , M. , Yang , C. , Kulkarni , T. , Randhawa , R. , Henault , M. , Mostacci , N. , Farmer , P. , Renner , S. , Ihry , R. , Mansur , L. , Keller , C. G. , McAllister , G. , Hild , M. , Jenkins , J. , and Kaykas , A. (17 October, 2018 ) DRUG-seq for miniaturized high-throughput transcriptome profiling in drug discovery . Nat. Commun ., 9 ( 1 ), 4307 . OpenUrl CrossRef PubMed [21]. ↵ Kashima , M. , Kamitani , M. , Nomura , Y. , Mori-Moriyama , N. , Betsuyaku , S. , Hirata , H. , and Nagano , A. J . (6 August, 2022 ) DeLTa-Seq: direct-lysate targeted RNA-Seq from crude tissue lysate . Plant Methods , 18 ( 1 ), 99 . OpenUrl CrossRef PubMed [22]. ↵ Ziegenhain , C. , Vieth , B. , Parekh , S. , Reinius , B. , Guillaumet-Adkins , A. , Smets , M. , Leonhardt , H. , Heyn , H. , Hellmann , I. , and Enard , W. (16 February, 2017 ) Comparative Analysis of Single-Cell RNA Sequencing Methods . Mol. Cell , 65 ( 4 ), 631 – 643 .e4. OpenUrl CrossRef PubMed [23]. ↵ Vieth , B. , Ziegenhain , C. , Parekh , S. , Enard , W. , and Hellmann , I. (1 November, 2017 ) powsimR: power analysis for bulk and single cell RNA-seq experiments . Bioinformatics , 33 ( 21 ), 3486 – 3488 . OpenUrl CrossRef PubMed [24]. ↵ Conesa , A. , Madrigal , P. , Tarazona , S. , Gomez-Cabrero , D. , Cervera , A. , McPherson , A. , Szczésniak , M. W. , Gaffney , D. J. , Elo , L. L. , Zhang , X. , and Mortazavi , A. (26 January, 2016 ) A survey of best practices for RNA-seq data analysis . Genome Biol ., 17 , 13. [25]. ↵ Beale , H. C. , Roger , J. M. , Cattle , M. A. , McKay , L. T. , Thompson , D. K. A. , Learned , K. , Lyle , A. G. , Kephart , E. T. , Currie , R. , Lam , D. L. , Sanders , L. , Pfeil , J. , Vivian , J. , Bjork , I. , Salama , S. R. , Haussler , D. , and Vaske , O. M. (13 March, 2021 ) The case for using mapped exonic non-duplicate reads when reporting RNA-sequencing depth: examples from pediatric cancer datasets . Gigascience , 10 ( 3 ). [26]. ↵ Enard , W. , Gehre , S. , Hammerschmidt , K. , Hölter , S. M. , Blass , T. , Somel , M. , Brückner , M. K. , Schreiweis , C. , Winter , C. , Sohr , R. , Becker , L. , Wiebe , V. , Nickel , B. , Giger , T. , Müller , U. , Groszer , M. , Adler , T. , Aguilar , A. , Bolle , I. , Calzada-Wack , J. , Dalke , C. , Ehrhardt , N. , Favor , J. , Fuchs , H. , Gailus-Durner , V. , Hans , W. , Hölzlwimmer , G. , Javaheri , A. , Kalaydjiev , S. , Kallnik , M. , Kling , E. , Kunder , S. , Mossbrugger , I. , Naton , B. , Racz , I. , Rathkolb , B. , Rozman , J. , Schrewe , A. , Busch , D. H. , Graw , J. , Ivandic , B. , Klingenspor , M. , Klopstock , T. , Ollert , M. , Quintanilla-Martinez , L. , Schulz , H. , Wolf , E. , Wurst , W. , Zimmer , A. , Fisher , S. E. , Morgenstern , R. , Arendt , T. , de Angelis , M. H. , Fischer , J. , Schwarz , J. , and Pääbo , S. (29 May, 2009 ) A humanized version of Foxp2 affects cortico-basal ganglia circuits in mice . Cell , 137 ( 5 ), 961 – 971 . OpenUrl CrossRef PubMed Web of Science [27]. ↵ Parekh , S. , Ziegenhain , C. , Vieth , B. , Enard , W. , and Hellmann , I . ( 2018 ) zUMIs - A fast and flexible pipeline to process RNA sequencing data with UMIs . Gigascience , 7 . [28]. ↵ Renaud , G. , Stenzel , U. , Maricic , T. , Wiebe , V. , and Kelso , J. (1 March, 2015 ) deML: robust demultiplexing of Illumina sequences using a likelihood-based approach . Bioinformatics , 31 ( 5 ), 770 – 772 . OpenUrl CrossRef PubMed [29]. ↵ Martin , M. (2 May, 2011 ) Cutadapt removes adapter sequences from high-throughput sequencing reads . EMBnet.journal , 17 ( 1 ), 10 – 12 . OpenUrl [30]. ↵ Vieth , B. , Ziegenhain , C. , Parekh , S. , Enard , W. , and Hellmann , I. (1 November, 2017 ) powsimR: power analysis for bulk and single cell RNA-seq experiments . Bioinformatics , 33 ( 21 ), 3486 – 3488 . OpenUrl CrossRef PubMed [31]. ↵ R Core Team R : A Language and Environment for Statistical Computing . ( 2023 ). [32]. ↵ Wickham , H. , Averick , M. , Bryan , J. , Chang , W. , McGowan , L. , Francois , R. , Grolemund , G. , Hayes , A. , Henry , L. , Hester , J. , Kuhn , M. , Pedersen , T. , Miller , E. , Bache , S. , Müller , K. , Ooms , J. , Robinson , D. , Seidel , D. , Spinu , V. , Takahashi , K. , Vaughan , D. , Wilke , C. , Woo , K. , and Yutani , H. (21 November, 2019 ) Welcome to the tidyverse . J. Open Source Softw ., 4 ( 43 ), 1686 . OpenUrl CrossRef [33]. ↵ Dowle , M. and Srinivasan , A. data.table: Extension of ‘data.frame‘ . ( 2023 ). [34]. ↵ Clarke , E. , Sherrill-Mix , S. , and Dawson , C. ggbeeswarm: Categorical Scatter (Violin Point) Plots . ( 2023 ). [35]. ↵ Wickham , H. and Henry , L. purrr: Functional Programming Tools . ( 2023 ). [36]. Kassambara , A. ggpubr: ’ggplot2’ Based Publication Ready Plots . ( 2023 ). [37]. ↵ Pedersen , T . L. patchwork: The Composer of Plots . ( 2023 ). [38]. Wilke , C. O. cowplot: Streamlined Plot Theme and Plot Annotations for ’ggplot2’ . ( 2020 ). [39]. ↵ Morgan , M. , Anders , S. , Lawrence , M. , Aboyoun , P. , Pagés , H. , and Gentleman , R. (1 October, 2009 ) ShortRead: a bioconductor package for input, quality assessment and exploration of high-throughput sequence data . Bioinformatics , 25 ( 19 ), 2607 –2608. OpenUrl CrossRef PubMed Web of Science [40]. ↵ Foley , J . bioanalyzeR . https://github.com/jwfoley/bioanalyzeR (21 July, 2023 ) Accessed: 2024-10-15 . [41]. ↵ Lee , S. , Zhang , A. Y. , Su , S. , Ng , A. P. , Holik , A. Z. , Asselin-Labat , M.-L. , Ritchie , M. E. , and Law , C. W. (September, 2020 ) Covering all your bases: incorporating intron signal from RNA-seq data . NAR Genom. Bioinform ., 2 ( 3 ), lqaa073 . OpenUrl CrossRef [42]. ↵ Oscorbin , I. P. and Filipenko , M. L. (22 November, 2021 ) M-MuLV reverse transcriptase: Selected properties and improved mutants . Comput. Struct. Biotechnol. J ., 19 , 6315 – 6327 . OpenUrl CrossRef PubMed [43]. ↵ Vanecko , S. and Laskowski , Sr , M. (December, 1961 ) Studies of the specificity of deoxyribonuclease I. III. Hydrolysis of chains carrying a monoesterified phosphate on carbon 5’ . J. Biol. Chem ., 236 , 3312 – 3316 . OpenUrl FREE Full Text [44]. ↵ Mohr , S. , Ghanem , E. , Smith , W. , Sheeter , D. , Qin , Y. , King , O. , Polioudakis , D. , Iyer , V. R. , Hunicke-Smith , S. , Swamy , S. , Kuersten , S. , and Lambowitz , A. M. (July, 2013 ) Thermostable group II intron reverse transcriptase fusion proteins and their use in cDNA synthesis and next-generation RNA sequencing . RNA , 19 ( 7 ), 958 – 970 . OpenUrl Abstract / FREE Full Text [45]. ↵ Bagnoli , J. W. , Ziegenhain , C. , Janjic , A. , Wange , L. E. , Vieth , B. , Parekh , S. , Geuder , J. , Hellmann , I. , and Enard , W . ( 2018 ) Sensitive and powerful single-cell RNA sequencing using mcSCRB-seq . Nat. Commun ., 9 . [46]. ↵ Zucha , D. , Androvic , P. , Kubista , M. , and Valihrach , L. (1 January, 2020 ) Performance comparison of reverse transcriptases for single-cell studies . Clin. Chem ., 66 ( 1 ), 217 – 228 . OpenUrl CrossRef PubMed [47]. ↵ Hahaut , V. , Pavlinic , D. , Carbone , W. , Schuierer , S. , Balmer , P. , Quinodoz , M. , Renner , M. , Roma , G. , Cowan , C. S. , and Picelli, S. (October, 2022 ) Fast and highly sensitive full-length single-cell RNA sequencing using FLASH-seq . Nat. Biotechnol ., 40 ( 10 ), 1447 – 1451 . OpenUrl CrossRef PubMed [48]. ↵ Hagemann-Jensen , M. , Ziegenhain , C. , Chen , P. , Ramsköld , D. , Hendriks , G.-J. , Larsson , A. J. M. , Faridani , O. R. , and Sandberg , R. (June, 2020 ) Single-cell RNA counting at allele and isoform resolution using Smart-seq3 . Nat. Biotechnol ., 38 ( 6 ), 708 – 714 . OpenUrl CrossRef PubMed [49]. ↵ Kalle , E. , Kubista , M. , and Rensing , C. (December, 2014 ) Multi-template polymerase chain reaction . Biomol. Detect. Quantif ., 2 , 11 – 29 . OpenUrl PubMed 50. ↵ 10x Genomics Interpreting Intronic and Antisense Reads in 10x Genomics Single Cell Gene Expression Data . https://www.10xgenomics.com/support/universal-three-prime-gene-expression/documentation/steps/sequencing/interpreting-intronic-and-antisense-reads-in-10-x-genomics-single-cell-gene-expression-data (9 August, 2021 ) Accessed: 2025-4-30 . [51]. ↵ He , D. , Mount , S. M. , and Patro , R. (31 January, 2024 ) scCensus: Off-target scRNA-seq reads reveal meaningful biology . bioRxiv ,. [52]. ↵ Tan , G. , Opitz , L. , Schlapbach , R. , and Rehrauer , H. (27 February, 2019 ) Long fragments achieve lower base quality in Illumina paired-end sequencing . Sci. Rep ., 9 (1), 2856 . OpenUrl CrossRef PubMed [53]. ↵ Gohl , D. M. , Magli , A. , Garbe , J. , Becker , A. , Johnson , D. M. , Anderson , S. , Auch , B. , Billstein , B. , Froehling , E. , McDevitt , S. L. , and Beckman , K. B. (29 April, 2019 ) Measuring sequencer size bias using REcount: a novel method for highly accurate Illumina sequencing-based quantification . Genome Biol ., 20 (1), 85 . OpenUrl CrossRef PubMed [54]. ↵ Bronner , I. F. and Quail , M. A. (June, 2019 ) Best practices for Illumina library preparation . Curr. Protoc. Hum. Genet ., 102 ( 1 ), e86 . OpenUrl CrossRef PubMed [55]. ↵ Soumillon , M. , Cacchiarelli , D. , Semrau , S. , van Oudenaarden , A. , and Mikkelsen , T. S. (5 March, 2014 ) Characterization of directed differentiation by high-throughput single-cell RNA-Seq . bioRxiv , p. 003236 . [56]. ↵ Rychlik , W. , Spencer , W. J. , and Rhoads , R. E. (11 November, 1990 ) Optimization of the annealing temperature for DNA amplification in vitro . Nucleic Acids Res ., 18 ( 21 ), 6409 – 6412 . OpenUrl CrossRef PubMed Web of Science [57]. ↵ Cline , J. , Braman , J. C. , and Hogrefe , H. H. (15 September, 1996 ) PCR fidelity of pfu DNA polymerase and other thermostable DNA polymerases . Nucleic Acids Res ., 24 ( 18 ), 3546 – 3551 . OpenUrl CrossRef PubMed Web of Science [58]. ↵ Lundberg , K. S. , Shoemaker , D. D. , Adams , M. W. , Short , J. M. , Sorge , J. A. , and Mathur , E. J. (1 December, 1991 ) High-fidelity amplification using a thermostable DNA polymerase isolated from Pyrococcus furiosus . Gene , 108 ( 1 ), 1 – 6 . OpenUrl CrossRef PubMed Web of Science [59]. ↵ Potapov , V. and Ong , J. L. (6 January, 2017 ) Examining sources of error in PCR by single-molecule sequencing . PLoS One , 12 ( 1 ), e0169774 . OpenUrl CrossRef PubMed [60]. ↵ Skerra , A . (25 July, 1992 ) Phosphorothioate primers improve the amplification of DNA sequences by DNA polymerases with proofreading activity . Nucleic Acids Res ., 20 ( 14 ), 3551 – 3554 . OpenUrl CrossRef PubMed Web of Science [61]. ↵ Gohl , D. M. , Vangay , P. , Garbe , J. , MacLean , A. , Hauge , A. , Becker , A. , Gould , T. J. , Clayton , J. B. , Johnson , T. J. , Hunter , R. , Knights , D. , and Beckman , K. B. (September, 2016 ) Systematic improvement of amplicon marker gene methods for increased accuracy in microbiome studies . Nat. Biotechnol ., 34 ( 9 ), 942 – 949 . OpenUrl CrossRef PubMed [62]. ↵ Gohl , D. M. , Auch , B. , Certano , A. , LeFrancois , B. , Bouevitch , A. , Doukhanine , E. , Fragel , C. , Macklaim , J. , Hollister , E. , Garbe , J. , and Beckman , K. B. (7 September, 2021 ) Dissecting and tuning primer editing by proofreading polymerases . Nucleic Acids Res ., 49 ( 15 ), e87. [63]. ↵ Zhao , A. , Jiang , H. , Palomares , A. R. , Larsson , A. , He , W. , Grünler , J. , Zheng , X. , Rodriguez Wall-berg , K. A. , Catrina , S.-B. , and Deng , Q. (15 April, 2024 ) Appropriate glycemic management protects the germline but not the uterine environment in hyperglycemia . EMBO Rep ., 25 ( 4 ), 1752 – 1772 . OpenUrl CrossRef PubMed [64]. ↵ Pereira , A. , Diwakar , J. , Masserdotti , G. , Beskardes , S. , Simon , T. , So , Y. , Maritin-Loarte , L. , Bergemann , F. , Vasan , L. , Schauer , T. , Danese , A. , Bocchi , R. , Colomé-Tatché , M. , Schuurmans , C. , Philpott , A. , Straub , T. , Bonev , B. , and Götz , M. (2 July, 2024 ) Direct neuronal reprogramming of mouse astrocytes is associated with multiscale epigenome remodeling and requires Yy1 . Nat. Neurosci. , 27 ( 7 ), 1260 – 1273 . OpenUrl CrossRef PubMed [65]. ↵ Lu , H. , Jiang , H. , Li , C. , Derisoud , E. , Zhao , A. , Eriksson , G. , Lindgren , E. , Pui , H.-P. , Risal , S. , Pei , Y. , Maxian , T. , Ohlsson , C. , Benrick , A. , Haider , S. , Stener-Victorin , E. , and Deng , Q. (1 September, 2024 ) Dissecting the impact of maternal androgen exposure on developmental programming through targeting the androgen receptor . Adv. Sci. (Weinh.) , 11 ( 36 ), e2309429 . OpenUrl [66]. ↵ Mazzarini , M. , Cherone , J. , Nguyen , T. , Martelli , F. , Varricchio , L. , Funnell , A. P. W. , Pa-payannopoulou , T. , and Migliaccio , A. R. (5 November, 2024 ) The glucocorticoid receptor elicited proliferative response in human erythropoiesis is BCL11A-dependent . Stem Cells , 42 ( 11 ), 1006 – 1022 . OpenUrl PubMed [67]. ↵ Mocciaro , E. , Giambruno , R. , Micheloni , S. , Cernilogar , F. M. , Andolfo , A. , Consonni , C. , Pannese , M. , Ferri , G. , Runfola , V. , Schotta , G. , and Gabellini , D. (9 June, 2023 ) WDR5 is required for DUX4 expression and its pathological effects in FSHD muscular dystrophy . Nucleic Acids Res ., 51 ( 10 ), 5144 – 5161 . [68]. ↵ Cheng , J. , Chen , J. , Liao , J. , Wang , T. , Shao , X. , Long , J. , Yang , P. , Li , A. , Wang , Z. , Lu , X. , and Fan , X . (1 April, 2023 ) High-throughput transcriptional profiling of perturbations by Panax ginseng saponins and Panax notoginseng saponins using TCM-seq . J. Pharm. Anal ., 13 ( 4 ), 376 – 387 . OpenUrl PubMed [69]. ↵ Eser , T. M. , Baranov , O. , Huth , M. , Ahmed , M. I. M. , Déak , F. , Held , K. , Lin , L. , Pekayvaz , K. , Leunig , A. , Nicolai , L. , Pollakis , G. , Buggert , M. , Price , D. A. , Rubio-Acero , R. , Reich , J. , Falk , P. , Markgraf , A. , Puchinger , K. , Castelletti , N. , Olbrich , L. , Vanshylla , K. , Klein , F. , Wieser , A. , Hasenauer , J. , Kroidl , I. , Hoelscher , M. , and Geldmacher , C. (24 May, 2023 ) Nucleocapsid-specific T cell responses associate with control of SARS-CoV-2 in the upper airways before seroconversion . Nat. Commun. , 14 ( 1 ), 2952 . OpenUrl PubMed [70]. ↵ De Simone , M. , Hoover , J. , Lau , J. , Bennett , H. M. , Wu , B. , Chen , C. , Menon , H. , Au-Yeung , A. , Lear , S. , Vaidya , S. , Shi , M. , Lund , J. M. , Xavier-Magalhães , A. , Liang , Y. , Kurdoglu , A. , O’Gorman , W. E. , Modrusan , Z. , Le , D. , and Darmanis , S. (16 December, 2024 ) A comprehensive analysis framework for evaluating commercial single-cell RNA sequencing technologies . Nucleic Acids Res ., p. gkae1186 . [71]. ↵ Svoboda , M. , Frost , H. R. , and Bosco , G . (25 June, 2022 ) Internal oligo(dT) priming introduces systematic bias in bulk and single-cell RNA sequencing count data . NAR Genom. Bioinform ., 4 ( 2 ), lqac035 . OpenUrl [72]. ↵ Bakken , T. E. , Hodge , R. D. , Miller , J. A. , Yao , Z. , Nguyen , T. N. , Aevermann , B. , Barkan , E. , Bertagnolli , D. , Casper , T. , Dee , N. , Garren , E. , Goldy , J. , Graybuck , L. T. , Kroll , M. , Lasken , R. S. , Lathia , K. , Parry , S. , Rimorin , C. , Scheuermann , R. H. , Schork , N. J. , Shehata , S. I. , Tieu , M. , Phillips , J. W. , Bernard , A. , Smith , K. A. , Zeng , H. , Lein , E. S. , and Tasic , B. (26 December, 2018 ) Single-nucleus and single-cell transcriptomes compared in matched cortical cell types . PLoS One , 13 ( 12 ), e0209648. [73]. ↵ Kuo , A. , Hansen , K. D. , and Hicks , S. C. (1 July, 2024 ) Quantification and statistical modeling of droplet-based single-nucleus RNA-sequencing data . Biostatistics , 25 ( 3 ), 801 – 817 . OpenUrl PubMed [74]. ↵ Chamberlin , J. T. , Lee , Y. , Marth , G. T. , and Quinlan , A. R. (20 March, 2024 ) Differences in molecular sampling and data processing explain variation among single-cell and single-nucleus RNA-seq experiments . Genome Res ., 34 ( 2 ), 179 – 188 . OpenUrl Abstract / FREE Full Text [75]. ↵ Pool , A.-H. , Poldsam , H. , Chen , S. , Thomson , M. , and Oka , Y. (11 October, 2023 ) Recovery of missing single-cell RNA-sequencing data with optimized transcriptomic references . Nat. Methods , 20 ( 10 ), 1506 – 1515 . OpenUrl CrossRef PubMed [76]. ↵ 10x Genomics Release Notes for Cell Ranger v9.01 Feburary 6, 2022 . https://www.10xgenomics.com/support/software/cell-ranger/latest/release-notes/cr-release-notes#v7-0-0 (6 February, 2022) Accessed: 2025-7-4 . [77]. ↵ Frankish , A. , Diekhans , M. , Jungreis , I. , Lagarde , J. , Loveland , J. E. , Mudge , J. M. , Sisu , C. , Wright , J. C. , Armstrong , J. , Barnes , I. , Berry , A. , Bignell , A. , Boix , C. , Carbonell Sala , S. , Cunningham , F. , Di Domenico , T. , Donaldson , S. , Fiddes , I. T. , Garcia Girón , C. , Gonzalez , J. M. , Grego , T. , Hardy , M. , Hourlier , T. , Howe , K. L. , Hunt , T. , Izuogu , O. G. , Johnson , R. , Martin , F. J. , Martinez , L., Mohanan , S. , Muir , P. , Navarro , F. C. P. , Parker , A. , Pei , B. , Pozo , F. , Riera , F. C. , Ruffier , M. , Schmitt , B. M. , Stapleton , E. , Suner , M.-M. , Sycheva , I. , Uszczynska-Ratajczak , B. , Wolf , M. Y. , Xu , J. , Yang , Y. T. , Yates , A. , Zerbino , D. , Zhang , Y. , Choudhary , J. S. , Gerstein , M. , Guigó , R. , Hubbard , T. J. P. , Kellis , M. , Paten , B. , Tress , M. L. , and Flicek , P. (8 January, 2021 ) GENCODE 2021 . Nucleic Acids Res ., 49 ( D1 ), D916 – D923 . OpenUrl CrossRef PubMed [78]. ↵ Deschamps-Francoeur , G. , Simoneau , J. , and Scott , M. S. (12 June, 2020 ) Handling multi-mapped reads in RNA-seq . Comput. Struct. Biotechnol. J ., 18 , 1569 – 1576 . OpenUrl CrossRef PubMed [79]. ↵ Vilborg , A. and Steitz , J. A. (4 May, 2017 ) Readthrough transcription: How are DoGs made and what do they do? . RNA Biol ., 14 ( 5 ), 632 – 636 . OpenUrl PubMed [80]. ↵ Caldas , P. , Luz , M. , Baseggio , S. , Andrade , R. , Sobral , D. , and Grosso , A. R. (15 January, 2024 ) Transcription readthrough is prevalent in healthy human tissues and associated with inherent genomic features . Commun. Biol ., 7 ( 1 ), 100 . OpenUrl PubMed [81]. ↵ McShane , A. , Narayanan , I. V. , Paulsen , M. T. , Ashaka , M. , Blinkiewicz , H. , Yang , N. T. , Magnuson , B. , Bedi , K. , Wilson , T. E. , and Ljungman , M. (9 April, 2024 ) Characterizing nascent transcription patterns of PROMPTs, eRNAs, and readthrough transcripts in the ENCODE4 deeply profiled cell lines . bioRxiv , p. 2024.04.09.588612. [82]. ↵ Agostini , F. , Zagalak , J. , Attig , J. , Ule , J. , and Luscombe , N. M. (5 May, 2021 ) Intergenic RNA mainly derives from nascent transcripts of known genes . Genome Biol. , 22 ( 1 ), 136 . OpenUrl CrossRef PubMed [83]. ↵ Mudge , J. M. , Carbonell-Sala , S. , Diekhans , M. , Martinez , J. G. , Hunt , T. , Jungreis , I. , Loveland , J. E. , Arnan , C. , Barnes , I. , Bennett , R. , Berry , A. , Bignell , A. , Cerdán-Vélez , D. , Cochran , K. , Cortés , L. T. , Davidson , C. , Donaldson , S. , Dursun , C. , Fatima , R. , Hardy , M. , Hebbar , P. , Hollis , Z. , James , B. T. , Jiang , Y. , Johnson , R. , Kaur , G. , Kay , M. , Mangan , R. J. , Maquedano , M. , Gómez , L. M. , Mathlouthi , N. , Merritt , R. , Ni , P. , Palumbo , E. , Perteghella , T. , Pozo , F. , Raj , S. , Sisu , C. , Steed , E. , Sumathipala , D. , Suner , M.-M. , Uszczynska-Ratajczak , B. , Wass , E. , Yang , Y. T. , Zhang , D. , Finn , R. D. , Gerstein , M. , Guigó , R. , Hubbard , T. J. P. , Kellis , M. , Kundaje , A. , Paten , B. , Tress , M. L. , Birney , E. , Martin , F. J. , and Frankish , A. (6 January, 2025 ) GENCODE 2025: reference gene annotation for human and mouse . Nucleic Acids Res ., 53 ( D1 ), D966 – D975 . OpenUrl CrossRef PubMed 84. ↵ [84] ENCODE Project Consortium (6 September, 2012 ) An integrated encyclopedia of DNA elements in the human genome . Nature , 489 (7414), 57 – 74 . OpenUrl CrossRef PubMed Web of Science [85]. ↵ Yip , C. W. , Parr , C. , Takahashi , H. , Yasuzawa , K. , Valentine , M. , Nishiyori-Sueki , H. , Ugolini , C. , Ranzani , V. , Murata , M. , Kato , M. , Kang , W. , Yip , W. H. , Shibayama , Y. , Sim , A. D. , Chen , Y. , Shu , X. , Moody , J. , Umarov , R. , Chang , J.-C. , Pandolfini , L. , Kawashima , T. , Tagami , M. , Nobusada , T. , Kouno , T. , Alfonso Gonzale , C. , Albanese , R. , Dossena , F. , Haberman , N. , Ozaki , K. , Kasukawa , T. , Lenhard , B. , Frith , M. , Bodega , B. , Nicassio , F. , Calviello , L. , Bienko , M. , Legnini , I. , Hilgers , V. , Gustincich , S. , Göke , J. , Lecellier , C.-H. , Shin , J. W. , Hon , C.-C. , and Carninci , P. (31 October, 2024 ) CFC-seq: identification of full-length capped RNAs unveil enhancer-derived transcription . bioRxiv , p. 2024.10.31.620483. [86]. ↵ Ponting , C. P. and Haerty , W. (31 August, 2022 ) Genome-wide analysis of human long noncoding RNAs: A provocative review . Annu. Rev. Genomics Hum. Genet ., 23 ( 1 ), 153 – 172 . OpenUrl CrossRef PubMed [87]. ↵ Pelechano , V. and Steinmetz , L. M. (12 December, 2013 ) Gene regulation by antisense transcription . Nat. Rev. Genet ., 14 ( 12 ), 880 – 893 . OpenUrl CrossRef PubMed [88]. ↵ Edgar , R. , Domrachev , M. , and Lash , A. E. (1 January, 2002 ) Gene Expression Omnibus: NCBI gene expression and hybridization array data repository . Nucleic Acids Res ., 30 ( 1 ), 207 – 210 . OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted August 24, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Improving RNA-seq protocols Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Improving RNA-seq protocols Felix Pförtner , Eva Briem , Wolfgang Enard , Daniel Richter bioRxiv 2025.08.20.671269; doi: https://doi.org/10.1101/2025.08.20.671269 Share This Article: Copy Citation Tools Improving RNA-seq protocols Felix Pförtner , Eva Briem , Wolfgang Enard , Daniel Richter bioRxiv 2025.08.20.671269; doi: https://doi.org/10.1101/2025.08.20.671269 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genomics Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17636) Bioengineering (13859) Bioinformatics (41847) Biophysics (21401) Cancer Biology (18535) Cell Biology (25423) Clinical Trials (138) Developmental Biology (13353) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24287) Genetics (15582) Genomics (22463) Immunology (17701) Microbiology (40300) Molecular Biology (17141) Neuroscience (88432) Paleontology (666) Pathology (2825) Pharmacology and Toxicology (4813) Physiology (7633) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4285) Systems Biology (9808) Zoology (2267)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.