Multi-platform evaluation and optimization of single-cell RNA isoform sequencing

doi:10.1101/2025.02.22.639662

Multi-platform evaluation and optimization of single-cell RNA isoform sequencing

2025 · doi:10.1101/2025.02.22.639662

preprint OA: closed CC-BY-NC-4.0

📄 Open PDF Full text JSON View at publisher

Full text 76,915 characters · extracted from preprint-html · click to expand

Multi-platform evaluation and optimization of single-cell RNA isoform sequencing | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Multi-platform evaluation and optimization of single-cell RNA isoform sequencing View ORCID Profile James T. Webber , View ORCID Profile Daniel A. Bartlett , View ORCID Profile Ghamdan Al-Eryani , View ORCID Profile Akanksha Khorgade , View ORCID Profile Asa Shin , View ORCID Profile Christophe Georgescu , View ORCID Profile Houlin Yu , View ORCID Profile Brian J. Haas , View ORCID Profile Victoria Popic , View ORCID Profile Aziz Al’Khafaji doi: https://doi.org/10.1101/2025.02.22.639662 James T. Webber 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for James T. Webber Daniel A. Bartlett 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Daniel A. Bartlett Ghamdan Al-Eryani 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ghamdan Al-Eryani Akanksha Khorgade 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Akanksha Khorgade Asa Shin 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Asa Shin Christophe Georgescu 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Christophe Georgescu Houlin Yu 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Houlin Yu Brian J. Haas 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Brian J. Haas Victoria Popic 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Victoria Popic For correspondence: vpopic{at}broadinstitute.org aalkhafa{at}broadinstitute.org Aziz Al’Khafaji 1 Broad Clinical Labs , Cambridge, MA, 2 Broad Institute of MIT and Harvard , Cambridge, MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Aziz Al’Khafaji For correspondence: vpopic{at}broadinstitute.org aalkhafa{at}broadinstitute.org Abstract Full Text Info/History Metrics Preview PDF Abstract Long-read transcriptomics enables isoform identification and quantification at single-cell resolution, but analysis is complicated by artifacts introduced during library preparation. We characterize the distinct artifact profiles of three popular single-cell platforms and demonstrate their impact on isoform identification. We introduce a new method for the identification of a wide range of artifacts in cDNA libraries and novel biochemical and bioinformatic strategies to significantly reduce their impact on downstream applications. Finally, we provide a comprehensive framework for RNA isoform sequencing analysis and interpretation as the field continues to develop. Main Advances in long-read sequencing platforms, library construction methods, and computational tools have collectively turned single-cell isoform sequencing into a production-level assay for biological research. The identification and quantification of RNA isoforms at the single-cell level allows researchers to resolve alternative splicing across cell types. Alternative splicing is a central regulatory axis which modulates gene function through differential exon splicing during transcript maturation. In contrast to gene expression analyses, full-length transcript expression analyses allow researchers to capture the rich structural information embedded in the length of the transcript, enabling differentiation of active and inactive forms of the same gene, identification of localization signals, and discovery of novel isoforms. While this datatype is feature rich, artifacts that arise during library construction and sequencing can be erroneously assigned to novel transcripts, greatly complicating analysis. In this work, we identify key sources of artifacts in single-cell RNA isoform sequencing data and implement molecular and computational strategies to eliminate or minimize their impact on downstream analysis. Biochemical and workflow differences between single-cell platforms can lead to important differences during the generation of the sequencing libraries, and thus the generated data. To enable accurate isoform-level transcriptomics it is critical to characterize and account for the landscape of artifacts produced from various single-cell approaches. Artifacts arise during the processes of 1) cDNA synthesis and PCR amplification due to off-target priming from reverse transcription (RT), template switch oligo (TSO), and PCR primers; 2) internal priming from oligo(dT) RT primers during reverse transcription, leading to unintended priming at A-rich regions; 3) mRNA degradation during library construction; 4) incomplete reverse transcription; and 5) errors in sequencer primary data processing 1 , 2 . These artifactual reads may be difficult or impossible to confidently assign to the originating isoform, which can lead to inaccuracies in quantification and the erroneous assignment of novel isoforms. We evaluated multiple prominently used single-cell platforms, namely PIPseq V4 (Fluent Biosciences), 10x 3′ V3.1 (10x Genomics), and 10x 5′ V2.1 (10x Genomics), for their ability to generate high-quality full-length cDNA libraries for isoform sequencing 3 . Given the biochemical differences between these three platforms, we hypothesized that they would exhibit notable differences in the transcripts captured and artifacts produced, which can ultimately lead to significant performance variation in downstream isoform identification and quantification. We first prepared single-cell cDNA libraries from peripheral blood mononuclear cells (PBMCs) and sequenced them using short-read sequencing (Illumina Novaseq X). After subsampling to an equal number of reads per cell and clustering each dataset, we found that the three technologies mostly captured the same cell types at similar proportions ( Fig. 1a, b , Supplemental Table 1), with two notable differences. Analysis of the 10x 3′ data resolved a population of dendritic cells that were not discretely identified by the other two platforms, while the 10x 5′ and PIPseq V4 platforms were able to resolve innate lymphoid cells. This variation is likely due to differential capture of transcript biomarkers. In addition, the PIPseq library contained a substantially smaller proportion of CD14+ monocytes (8.4% of cells, compared to ∼22% in 10x 3′ and 5′), and the UMIs/cell for CD14+ monocytes was lower than for other cell types. This is consistent with previous reports of this platform, and reflects increased RNAse activity due to cell stress during sample preparation 4 . Overall, the 10x libraries captured more UMIs/cell (median 10x 3′: 10,091; 10x 5′: 6,804; PIPseq: 3,872) and genes/cell (median 10x 3′: 2,904; 10x 5′: 2,339; PIPseq: 1,398), while PIPseq captured more cells overall (10x 3′: 3,687; 10x 5′: 3,083; PIPseq: 8,079; Supplementary Fig. 1). We performed a barnyard experiment using the PIPseq platform and observed that the ability to capture more cells did not come at the cost of higher doublet rates (0.9% of 17,050 barcodes observed with ≥10,000 UMIs; Supplementary Fig. 2), and the ambient contamination was similar across all three methods (median 1% cross-species UMI contamination; Supplementary Fig. 3). Download figure Open in new tab Figure 1. Cross-platform artifact quantification and evaluation of molecular optimizations to minimize their formation. a) Single-cell embeddings for PIPseq, 10x 3′ and 10x 5′ libraries prepared from a sample of PBMCs. Cells are colored based on cluster assignment. b) A dot plot showing the percentage non-zero (size) and mean expression (color intensity) of a set of markers across clusters. c) Schematic showing several common structural artifact classes that marti will search for when annotating long reads. The specific sequences of the adapters are input in a configuration file. d) Artifact counts and percentages for the most common categories in the PBMC monomer data. Structural annotations are available in the supplementary data. Samples are colored as in 1b. e) rate of TSO-TSO artifacts in a standard 10x 3′ monomer library, compared to the 3dG monomers. f) The distribution of read length and array size for a 10x 3′ Kinnex library when artifact clean-up is not performed. g) Array size distributions for 10x 3′ Kinnex libraries using three protocols: using a 3dG TSO without cleanup, using the standard 10x TSO without cleanup, and using the standard TSO with the cleanup step. h) Comparisons for PIPseq data when using fresh PBMCs vs methanol+DSP fixation of the same sample. Violin plots show the number of UMIs per cell from the short-read and long-read sequencing along with the read length distribution for CD14+ monocytes. We then turned to comparing cross-platform isoform capture. To evaluate the structure of the cDNA library we sequenced the pre-fragmented full-length cDNA libraries as monomers using the Iso-Seq method on a PacBio Sequel IIe sequencer. We also performed MAS-seq using the commercially available Kinnex single-cell kit, and sequenced on the PacBio Revio platform to obtain a deeply sequenced dataset of single-cell isoforms (cDNA reads after deconcatenation: 10x 3′: 91.8M; 10x 5′: 69.7M; PIPseq: 101.3M, Supplementary Fig. 4, 5). We annotated the monomer reads using marti , a novel software framework we developed for the in-depth classification and quantification of artifactual cDNA constructs. marti enables the detection of a wide range of artifactual constructs that arise during cDNA synthesis, PCR, sequencing, and primary sequence analysis stages ( Fig. 1c, d ; Supplementary Table 2; methods). By examining the occurrences of the RT and PCR primers, sequencing adapters, and poly(A) tails within the read sequence, marti builds the structural representation of the read and classifies it based on its deviation from the non-artifactual (proper) read conformation and its matches against a library of predefined artifactual constructs. Notably, by reporting the specific structure of each read (in addition to the matched artifact class(es)), marti facilitates the detection of novel artifacts not yet defined in its artifact library. We used marti to quantify the rate of occurrence of different artifact types in our long-read libraries, and extract the annotated native cDNA sequence of proper reads for alignment and downstream analysis. Generally, artifacts can fall into two non-exclusive classes, those with an improper adapter structure (e.g. TSO-TSO, RT-RT, internal PCR adapter priming) and those with evidence of oligo(dT) internal priming, which are structurally indistinguishable from proper non-artifactual reads (Supplementary Fig. 6). Both classes of artifacts are problematic as they do not faithfully reflect the native mRNA and constrain our ability to accurately identify and quantify transcripts. While easier to classify, artifacts with improper adapter structure are particularly problematic as they can interfere with library construction and require involved purification. As seen in previous studies, we observed that the 10x 3′ library contained a high percentage (14.50%) of TSO-TSO artifacts, while the 10x 5′ and PIPseq libraries had lower rates (1.16% and 0.51% respectively) 5 . These and other artifact types inhibit the formation of full-length MAS-seq arrays by introducing a segment with symmetric ends, reversing the array ordering and creating shorter arrays ( Fig. 1f ). As a strategy to prevent TSO-TSO artifact formation, we compared the use of the standard TSO with one incorporating 3′-Deoxyguanosine (3dG) in the 10x 3′ Gene Expression kit with PBMCs as input. This modified template switching oligo retains its templating ability without having the requisite 3′-hydroxyl group which can erroneously prime RNA and extend during the reverse transcription reactions. After sequencing these libraries on the Sequel IIe and annotating the reads with marti , we found that the 3dG reagent successfully prevents the formation of TSO-TSO artifacts: < 0.1% of reads, compared to 7.45% of reads for the standard TSO in this experiment ( Fig. 1e ). We found the 3dG oligo to be highly effective for standard 10x 3′ library construction and sequencing, and, in fact, to result in a reduced number of intergenic and antisense reads found in both short-and long-read data (Supplementary Table 3). While the standard TSO yielded more UMIs/cell overall, further inspection of both TSO conditions revealed a similar number of forward-strand exonic reads, suggesting the potential for elevated artifact rates from the standard TSO. Such artifactual reads have the potential to negatively impact isoform identification and quantification. In addition to the observed improvements in data quality, the 3dG library could be used to create full-length MAS-seq arrays without the need for involved cleanup steps, (91.36% of the arrays were full-length, compared to 24.51% for the standard TSO without cleanup and 93.78% with cleanup; Fig. 1g , Supplementary Fig. 7). The PIPseq library suffered from a large number of reads that contained only a poly(A) tail and no other sequence, and a lower percentage of full-splice match reads, consistent with the mRNA degradation within the CD14 monocytes found in the short-read data ( Fig. 1d , Supplementary Fig. 8, Supplementary Table 4, 5). While these reads do not interfere with the construction of MAS-seq arrays, they reflect a limitation of the PIPseq workflow when capturing cell types with high RNAse levels. Previous work has shown that cell fixation prior to PIPseq encapsulation can prevent mRNA degradation, which would preserve more full length molecules in these cells 6 , 7 . To explore this, we tested five different fixation methods on a new PBMC sample and used both short and long-read sequencing to evaluate the results. We found a combination of methanol and dithio-bis(succinimidyl propionate) (DSP) was most effective at preserving mRNA from CD14+ monocytes. Kinnex single-cell sequencing of the PBMCs showed that methanol DSP fixation improved CD14+ monocytes as measured by the median number of reads per cell (fixed: 2,253; fresh: 153), UMIs per cell (fixed: 1,933; fresh: 135) and the median read length (fixed: 305bp; fresh: 223bp) (Supplementary Fig. 9, 10). While the fixed condition does produce higher quality PIP-seq data, the modest monocyte read lengths are likely due to cellular stress conditions during the involved cell preparation for the single-cell experiment. While addressing the biochemical root cause of artifacts is the ideal path, such workflow fixes can take extensive development and optimization. Such is the case for internal priming of the oligo(dT) during reverse transcription, wherein a stretch of adenines in a transcript is sufficient for priming and extension. These internal priming events are upstream of the poly(A) tail and hence truncating. During analysis, these reads appear to be novel 3′ sites, either for known genes (leading to mistaken annotation of novel isoforms) or in intergenic regions (leading to mistaken annotation of novel genes). Identifying internally primed reads can be challenging because they appear structurally sound and have all expected adapters. In advance of molecular solutions for internal priming in high-throughput single-cell workflows, we developed a workflow to detect internal priming that classifies reads according to the genomic context of their alignment, similar to previous approaches to this problem 8 , 9 . Aligned reads were classified based on the proximity of the 3′ end to known transcription stop sites and other annotated features, as well as the genomic context. To be classified as an expected priming event, a read must terminate near a known transcription stop site or be adjacent to a poly(A) signal motif, and must not be located within a coding sequence or be adjacent to a particularly adenine-rich genomic sequence (see methods, Supplementary Table 6). While these heuristics are imperfect, this approach allowed us to broadly classify reads into good priming (known sites and unannotated) and likely internal priming candidates (genomic poly(A)). We observed a striking degree of internal priming across the three platforms (PIPseq: 18.27%, 10x 3′: 26.76%, 10x 5′: 18.69%; Fig. 2a , Supplementary Table 7) and explored the downstream analytical impacts in the next section. Download figure Open in new tab Figure 2. Cross-platform quantification of internal reverse transcription priming artifacts and their impact on downstream analysis. a) Rate of different priming events in single-cell MAS-seq data. A complete breakdown of read classifications and rates is available in the Supplementary Table 6. b) Visualization of all transcripts identified by IsoQuant showing the proportion of reads assigned to that transcript that were classified as properly primed (dark color fill) or misprimed (light color fill). Transcripts are broken down by SQANTI3 class, from left to right: known transcripts present in the reference, NIC transcripts from known genes, NNIC transcripts from known genes, and transcripts assigned to novel genes. Transcripts are displayed horizontally with the dark color fill portion reflecting the proportion of reads for the transcript that came from proper priming sites. c) Coverage from full splice match and incomplete splice match reads for known transcripts, binned by transcript length and broken down by priming call. The shaded area above each plot shows the locations for the most-3′ splice junction in each length bin. d) Upset plot showing the overlap of known transcripts identified in the three libraries by IsoQuant , as determined by gffcompare matched as = and c . e) Upset plot showing the overlap of all transcripts. f) Violin plot showing the UMIs per transcript for categories in 2e. On the left is shown the UMI distributions for transcripts unique to each sample. On the right, the distributions for transcripts found in all three samples. After classifying the priming status of each read, we quantified the proportion of reads for each transcript that derived from expected priming sites ( Fig. 2b ). In all three experiments we saw that the identification of many novel transcripts and novel genes had been derived entirely from internally primed reads (NIC: 29.1% – 38.7%; NNIC: 42.0% – 45.4%; novel genes: 57.6% – 69.8%), raising questions about the reliability of these novel annotations. Known transcripts were more likely to be represented by an expected priming site, but in some cases the only evidence for a known transcript came from internal priming (15.4% – 18.0%). These results highlight the difficulty of accurate transcript identification and quantification in the presence of internally primed reads. The impact of internal priming can be seen clearly in the average transcript coverage in our MAS-seq experiments. In each experiment we calculated the transcriptome coverage profile by computing the coverage per transcript, normalizing to the length of the transcript, and then averaging the normalized profiles together ( Fig. 2c , see methods section). This coverage profile was calculated for four combinations of priming status (good or internally primed reads) and transcript match (full splice match or incomplete splice match). A full splice match (FSM) matches all of the annotated splice junctions for a transcript, while an incomplete splice match (ISM) matches a subset–the read is either 5′ or 3′ truncated, or both. In all three experiments we see that coverage for FSM reads is mostly uniform over the transcript body. As transcript length increases, there is a 5′ bias in FSM coverage, reflecting that a read does not need to contain the entire terminal 3′ exon to be classified as a full splice match, irrespective of internal priming. Long transcripts often contain very long 3′ UTR sequences in the final exon ( Fig. 2c , shaded regions), resulting in the observed increased 5′ coverage bias in FSMs compared to shorter transcripts. Internal priming sites in the terminal exon can still produce a full splice match, and we observe this across all three platforms. Finally, reads from a shorter isoform may be erroneously assigned to a longer version. This can be due to unannotated alternative polyadenylation sites in the 3′ UTRs of these genes, or an unexpected combination of transcription start site, exon splicing, and polyadenylation site (Supplementary Fig. 11) 10 – 12 . The coverage for incomplete splice matches looks notably different. In all three experiments we see a strong 3′ bias, reflecting reads that were properly primed but failed to capture the whole mRNA. Roughly 50% of the coverage from these properly primed reads fell in the last 25% of the transcript body (10x 3′: 51.8%; 10x 5′: 49.6%; PIPseq: 51.7%; see Supplementary Fig. 12). This can occur because reverse transcription did not produce a full-length cDNA, or because the mRNA itself was truncated. For the longest transcripts (>4kb) the effect is more pronounced (10x 3′: 56.3%; 10x 5′: 54.1%; PIPseq: 52.4%), suggesting that cDNA synthesis is a limiting factor when sequencing these isoforms. On the other end, the most 5′ quartile of longest transcripts accounts for a large proportion of the internally primed ISM coverage (10x 3′: 45.3%; 10x 5′: 45.1%; PIPseq: 32.2%). This reflects that internal priming sites in the middle of a transcript will produce a 3′-truncated read that is missing later splice junctions. This effect is visible in all three experiments but is more pronounced in the 10x 5′ and 3′ data, reflecting the higher rate of internal priming in those libraries. Our hypothesis was that differences in capture and internal priming behavior between these three platforms would mostly lead to differences in novel isoform calls, while known transcripts would be identified more consistently. To test this we processed each MAS-seq dataset with the isoform identification and quantification tool IsoQuant (v3.4.2), and compared the resulting annotations using gffcompare (v0.12.6) 13 , 14 . When considering only isoforms that are closely compatible with known annotations ( = and c classifications from gffcompare ), there is a strong overlap between all three platforms, suggesting that these isoforms are truly expressed in the sample and that all three platforms capture those transcripts (9,784 transcripts in common; Fig. 2d ). However, when we consider all isoforms reported by IsoQuant , we observe a very large number of transcript structures that are unique to each of the three single-cell approaches (10x 3′: 63,373; 10x 5′: 43,017; PIPseq: 51,790; Fig. 2e ). These transcripts are typically supported by a smaller number of reads (median 10 – 13 UMIs per unique transcript, compared to 67 – 76 UMIs for shared transcripts) and these reads are less likely to be coming from expected priming sites in the genome (27.4% – 37.8% of reads had expected priming, compared to 83.2% – 92.8% for shared transcripts) which supports the hypothesis that misprimed reads can lead to the identification of unreliable novel isoforms ( Fig. 2f ). In this study we quantified the rate of common artifacts in single-cell RNA libraries, demonstrated how they can lead to inaccuracies in isoform identification and quantification, and presented strategies to mitigate these issues. In some cases sequencing artifacts can be identified by inspection of the read structure, while other cases must be inferred by considering the genomic context of the aligned sequence. Structural artifacts can be filtered out bioinformatically, and we developed marti to identify and remove these artifacts. Preferably, improvements in biochemistry would prevent these artifacts from forming in the first place. We demonstrated the impact of using the 3dG template switch oligo, which prevents a common class of sequencing artifacts in 10x 3′ libraries. We also showed that cell fixation is critical for maintaining mRNA integrity on the PIPseq platform. Other artifacts, such as reads with proper structure arising from internal priming, are more difficult to filter due to our incomplete knowledge of the transcriptome and the aim to discover novel genes and isoforms in our experiments. We developed workflows to tag reads with their priming status and annotate isoforms with their support from both properly and internally primed reads. These annotations can be coupled with custom filters to minimize spurious isoform detection. As biochemical and bioinformatic methods continue to advance, the best strategy for these data will continue to evolve. Future advances in biochemistry may be able to prevent other classes of sequencing artifacts from being generated, greatly simplifying identification and leading to more accurate and reliable isoform expression results. In particular, more reliable capture at the poly(A) tail would reduce the sequencing of internally primed reads and more processive reverse transcriptases would provide more complete coverage over long transcripts. While methods exist to sequence specifically from the 3′ end of mRNA molecules, they have yet to be demonstrated at the scale and throughput necessary for single-cell isoform sequencing 15 . Advances in throughput and accuracy of full-length RNA isoform sequencing will further bolster a more comprehensive representation of transcriptomics and bring to light critical roles RNA splicing plays in natural and disease biology. Online Methods Wet-lab PIP-seq vs 10x 3′ vs 10x 5′ Fresh PBMC sample preparation Cryopreserved PBMCs obtained from AllCells (Lot # 3082436 25/Apr 22 with 10 million cells) were thawed for 2min in 37°C water bath and washed twice using PBS containing 0.4 % BSA, paletted using a swing bucket centrifuge (300 x g for 5 minutes, 4°C). Cells were then resuspended in Fluent cell resuspension buffer (FB0002440), filtered through 40um Flowmi cell strainers (Bel-Art, H13680-0040) counted and viability evaluated with an AO/PI stain (90.5% viability) using Nexcelom Cellometer Auto 2000 (Nexcelom Bioscience). The cell suspensions were mixed well and were divided equally into three aliquots, two for the 10x Chromium captures (3′ and 5′ assays) and one for PIP-seq captures. Samples were further diluted using a Fluent cell resuspension buffer to meet manufacturer requirements for cell loading concentration; 800 cells/ul for both 3′ and 5′ 10x Chromium captures and 1650 cells/ul for PIP-seq capture. Barnyard K562:3T3 sample preparation K562 (ATCC) and 3T3 cells (ATCC) were harvested and washed twice in PBS containing 0.4% BSA, palleted at 300 x g for 5 minutes in a swing bucket centrifuge (4°C). Cell pellets were resuspended in 400 µl of room-temperature Fluent Cell Resuspension Buffer (FB0002440) and analyzed for cell count and viability using a Nexcelom Cellometer Auto 2000 (Nexcelom Bioscience) with Acridine Orange (AO) / Propidium Iodide (PI) staining (viability: 99.7% for K562; 98.8% for 3T3). The cells were then combined at a 1:1 ratio to a final concentration of 2,000 cells/μl for PIP-seq capture. The mixed-species cell concentration was reverified using the Cellometer Auto 2000 (Nexcelom Bioscience) prior to single-cell capture. DSP/Methanol and rehydration reagent preparation 1mg of DSP (ThermoFisher Scientific, Cat # A35393) was dissolved in 20ul of 100% Anhydrous DMSO (ThermoFisher Scientific, Cat # D12345). The 20uL of 50mg/ml DSP solution was then added to 800ul of ice cold 100% Methanol (Sigma-Aldrich, Cat # 34860) in a 1.5mL tube and gently mixed. DSP/Methanol Rehydration buffer was made by adding to 893ul of nuclease-free water the following: 180ul of Saline sodium citrate (SSC) 20x (Sigma-Aldrich; CAT# S6639, 1.2ul of DTT 1M (Sigma-Aldrich; Cat # 646563), 6ul of Protector RNase Inhibitor 2,000U (Roche, 3335399001), 120ul of 10% BSA in DPBS (Sigma-Aldrich; Cat# A1595). DSP/Methanol/DEPC rehydration buffer was made similarly; however, instead of 6ul of Protector RNase inhibitor, 12ul of DEPC was used and 887ul of nuclease-free water. Fresh vs DSP/Methanol PBMC sample preparation Cryopreserved PBMC were thawed for 2min in 37°C water bath, washed twice using PBS containing 2% FBS, paletted using a swing bucket centrifuge (300 x g for 5 minutes, 4°C), then resuspended in 200ul Cell staining buffer (Biolegend, # 420201). Viability of samples were evaluated using Tryphan blue stain using Nexcelom Cellometer Auto 2000 (Nexcelom Bioscience) (97.7% viability). Samples were mixed well, split in two, then each incubated with TotalSeq-A hashtag antibodies (Biolegend) for 30minutes on ice. Cells were washed thrice with cell staining buffer then either resuspended in 820ul of freshly prepared chilled DSP/Methanol solution (added gradually with gentle swirling) or in Fluent cell resuspension buffer (FB0002440). DSP/Methanol sample was left on ice for 30minutes with occasional swirling then quenched with 20.4ul 1M Tris-HCL pH 7.5 (Invitrogen, Cat # 15567027). DSP/Methanol sample was then centrifuged (400 x g, 5min, 4°C) using bucket centrifuge, supernatant discarded, and resuspended in rehydration buffer. Both samples were filtered through 40um Flowmi cell strainers (Bel-Art, H13680-0040), counted with AO/PI stain using Nexcelom Cellometer Auto 2000 (Nexcelom Bioscience) and pooled to attain a 1:1 mixed ratio of both samples. Samples were further diluted using Fluent cell resuspension buffer (FB0002440), making up a concentration of 2000 cells/ul. Multi-Fixation PBMC sample preparation Cryopreserved PBMC were thawed for 2min in 37°C water bath, washed twice using PBS containing 2% FBS, paletted using a swing bucket centrifuge (300 x g for 5 minutes, 4°C), then resuspended in 600ul Cell staining buffer (Biolegend, # 420201). Viability of samples were evaluated using Tryphan blue stain using Nexcelom Cellometer Auto 2000 (Nexcelom Bioscience) (98% viability). Samples were mixed well, split into 6, with each incubated with TotalSeq-A hashtag antibodies (Biolegend) for 30minutes on ice. Cells were then washed thrice with cell staining buffer then either resuspended in 300ul of freshly prepared chilled DSP/Methanol solution (added gradually with gentle swirling),150ul Methanol (added gradually with gentle swirling), 150ul of fluent cell resuspension buffer (FB0002440) with methanol free to make 0.1% PFA solution (Life Technologies, Cat #28908),150ul of fluent resuspension buffer (FB0002440) PFA making 1% solution (Life Technologies, Cat #28908) or 150ul of fluent resuspension buffer (FB0002440). Methanol and the DSP/Methanol samples were left on ice for 30minutes with occasional swirling, and the DSP/Methanol sample was quenched with 40.8ul 1M Tris-HCL pH 7.5 (Invitrogen, Cat # 15567027) then split equally into two 150ul in new 1.5ml Eppendorfs. PFA samples were left at room temperature to fix for 5min then quenched with 1.25 M glycine solution (Life Technologies, J64365.30). All samples were centrifuged at 400 x g for 5 minutes at 4°C using bucket centrifuge, supernatant discarded, and resuspended in either fluent resuspension buffer (Fresh and PFA fixed samples) or rehydration buffer, with the DSP/Methanol/DEPC sample using a rehydration buffer that has DEPC (Sigma-Aldrich, Cat # D5758) instead of Protector RNase Inhibitor (Roche, 3335399001). All samples were filtered through 40um Flowmi cell strainers (Bel-Art, H13680-0040), counted with AO/PI stain (94% viability for fresh sample) using Nexcelom Cellometer Auto 2000 (Nexcelom Bioscience) and pooled with an attempt to attain equal ratio at a concentration of 3,000 cells/ul. PIP-seq cell full length cDNA library prep Cells were captured by adding 10 μl of sample (2,000 cells/ul K562:3T3 cells; 1650 cells/ul Fresh PBMCs, 2000 cells/ul DSP/Methanol vs Fresh PBMCs, 3000cells/ul Multi-Fix PBMC) directly into Fluent PIP bead reagent mix (PIPseq T20 Single Cell RNA kit v4.0). For K562:3T3 samples, and fresh only PBMC experiments, 2 μl of Recombinant RNase Inhibitor (Takara; cat no. 2313A) was also added. For fix vs fresh PBMC sample comparison, Protector RNase Inhibitor (Roche, 3335399001) was used. Cell capture, cell lysis, cDNA synthesis and whole transcriptome amplification (WTA) were performed as per the manufacturer’s instructions. Briefly, cells suspended in the emulsion were lysed and mRNA was captured on PIP-beads before breaking the emulsion prior to reverse transcription. cDNA underwent WTA, using 11 cycles for 3T3:K562 cDNA and 14 cycles for PBMC cDNA’s, with 8 min extensions. Amplified cDNA was isolated from the PIPs then SPRI purified and quantified using Qubit. Library integrity assessment was performed on a high-sensitivity D5000 ScreenTape on 4150 Tapestation (Agilent Technologies). PIP-seq Illumina Library prep and sequencing Libraries were prepared for Illumina sequencing as per the manufacturer’s instructions (PIPseq T20 Single Cell RNA kit v4.0). Briefly, 30ng of PBMC cDNA and 100 ng of K562:3T3 cDNA were input into the PIP-seq library prep module to undergo fragmentation, end repair, A-tailing, and adapter ligation and underwent indexing PCR using 11 cycles for PMBC and 9 cycles for K562:3T3. Final libraries were quantified using Qubit and fragment sizes were verified using a high-sensitivity D1000 ScreenTape on 4150 Tapestation (Agilent Technologies) prior to equimolar pooling and sequencing at BCL on an either an Illumina NovaSeq X (28-bp Read 1, 10-bp i7 index, 10-bp i5 index, and 90-bp Read 2), NovaSeq SP (54-bp Read 1, 8-bp i7 index, 8-bp i5 index, and 67-bp Read 2) or NovaSeq 2000 (28-bp Read 1, 10-bp i7 index, 10-bp i5 index, and 90-bp Read 2). PIP-seq cell Hashtag library prep and sequencing Following WTA cDNA of PIP-seq captures, the supernatant of first SPRI cleanup which contained Hashtag cDNA was further processed as per manufacturer’s instructions. Briefly, following another round of SPRI purification, libraries were amplified (8 cycles) using primers that appended illumina adapters. Libraries were SPRI cleaned, evaluated using high-sensitivity D1000 ScreenTape on 4150 Tapestation (Agilent Technologies) and sequenced either on Illumina NovaSeq SP (54-bp Read 1, 8-bp i7 index, 8-bp i5 index, and 67-bp Read 2) or NovaSeq 2000 (28-bp Read 1, 10-bp i7 index, 10-bp i5 index, and 90-bp Read 2). 10x Genomics full length cDNA library prep Using single cell 3′ V3.1 (10x Genomics, 1000269) and single Cell 5′ v2 (10x Genomics, 1000267) kits, captures were carried out according to the manufacturer’s protocol by Broad Clinical Labs on two 10x chromium controllers simultaneously (10x Genomics, 120270). Briefly, cell suspensions loaded onto the Chromium Next GEM Chip G along with gel beads and partitioning oil. Following GEM generation, samples were immediately subjected to reverse transcription using a thermal cycler with the following conditions: 53°C for 45 minutes, 85°C for 5 minutes, and a hold at 4°C. After RT, emulsions were broken, and cellular debris was removed via a Silane bead cleanup (10x Genomics, 2000048). The purified cDNA was amplified under the following PCR conditions: initial denaturation at 98°C for 3 minutes, followed by 12 cycles of 98°C for 15 seconds (denaturation), 63°C for 20 seconds (annealing), and 72°C for 1 minute (extension). The amplified cDNA underwent a 0.6× SPRI bead cleanup (Beckman Coulter) and was quantified using a Qubit fluorometer (Thermo Fisher Scientific). The final full-length cDNA was then divided into aliquots for short-read and long-read library constructions. 10x Genomics Illumina library construction Library construction was performed using an automated workflow. The cDNA was fragmented and repaired with the addition of an A-tail by incubating at 32°C for 5 minutes and 65°C for 30 minutes, using the Chromium Single Cell 3′ Library Construction Kit v3.1 (10x Genomics, PN-2000160). A 0.8× double-sided SPRI bead cleanup (Beckman Coulter) was performed to remove undesired fragments. Adapters were ligated onto the DNA at 20°C for 15 minutes, followed by another 0.8× SPRI bead cleanup. Dual-indexed barcodes (10x Genomics, 1000215) were added, and libraries were amplified using PCR with the following conditions: initial denaturation at 98°C for 45 seconds, followed by 12 cycles of 98°C for 20 seconds (denaturation), 54°C for 30 seconds (annealing), and 72°C for 20 seconds (extension). Libraries underwent a final 0.8× double-sided SPRI bead cleanup for size selection. The quality of the final libraries was assessed using Quant-iT PicoGreen (Thermo Fisher Scientific) and an Agilent TapeStation (Agilent) to confirm the expected library size. Libraries were pooled to achieve the desired number of reads per cell for downstream analysis. Libraries were sequenced using NovaSeqX (28-bp Read 1, 10-bp i7 index, 10-bp i5 index, and 90-bp Read 2). Long-read sample preparation ISO-seq (1-mer) library prep for PacBio sequencing 3.2 ng of cDNA was amplified in a 100 μl MAS- PCR reaction using the following reaction conditions: 50 μl of MAS PCR mix, 10 μl of custom indexed Ai/Qi MAS primers (see Supplementary Table 8 for MAS primers; to enable multiplexing), 15 ng of cDNA library, adding H2O to bring final volume to 100 ul. Reactions were amplified using the following cycling conditions: 98°C for 3 minutes, followed by 8 cycles of 98°C for 20 s, 68°C for 30 s and 72°C for 8 minutes, followed by a final 72°C extension for 10 minutes. The amplified libraries were cleaned using a 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 60 μl elution buffer, and quantified with Qubit (Thermo Fisher Scientific, Q32851). Barbell formation was carried out by mixing 500 ng of MAS PCR product in 55 μl volume with 2 μl MAS enzyme and incubated at 37°C for 30 minutes. Subsequently, 3ul of barcode MAS adapter and 20 μl of MAS ligation additive was mixed into the sample followed by 10 μl of MAS ligase buffer and 10 μl fo MAS ligase. Sample was thoroughly mixed by gentle pipetting using a wide bore pipette tip to avoid bubble formation. MAS array formation reactions were incubated at 42°C for 60 minutes followed by 4°C hold (52°C lid). Array formation reactions were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 43 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). DNA Damage Repair was carried out by mixing 42 μl of sample with 6 μl of Repair buffer and 2 μl of DNA repair mix, and incubated at 37°C for 30 minutes (lid 47°C). DNA damage repair reactions were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 40 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). Nuclease treatment was carried out by mixing 40 μl of DNA damage repaired library with 5 μl Nucelase buffer and 5 μl Nuclease mix and incubated at 37°C for 60 minutes followed by 4°C hold (lid 47°C). The final MAS array libraries were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 20 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). Library size was obtained using a Genomic DNA ScreenTape and 4150 Tapestation (Agilent Technologies). Multiplexed ISO-seq libraries were pooled in equimolar amounts and sequenced on two Revio SMRT cells (PN: 102-202-200). TSO artifact purification TSO artifact removal was carried out as per manufacturer’s instructions. Briefly, 15 ng of cDNA from 3′ 10x PBMC and 5′ 10x PBMC were each used for TSO PCR using the following reaction conditions: 25 μl of MAS PCR mix, 10 μl of MAS 5′ or 3′ capture primer mix, 15 ng of 10x 3′ or 10× 5′ cDNA library, adding H2O to bring final volume to 50 ul. Reactions were amplified using the following cycling conditions: 98°C for 3 minutes, followed by 5 cycles of 98°C for 20 s, 68°C for 30 s and 72°C for 8 minutes, followed by a final 72°C extension for 10 minutes. The amplified libraries were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318) and quantified with Qubit (Thermo Fisher Scientific, Q32851). They were then further purified with 10 μl MAS capture beads. Following the binding and washing steps, the beads were reconstituted in 40 μl of elution buffer. 2 μl of MAS enzyme was added to each sample and incubated at 37°C for 30 minutes to cleave the captured DNA from the MAS capture beads. Eluant was moved to a fresh PCR tube and cleaned using a 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 45 μl of elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). This product was then used as input into MAS PCR. MAS-seq 16-mer indexed library prep for PacBio sequencing The following samples were prepared for indexed Kinnex 16mer workflow: View this table: View inline View popup Download powerpoint MAS-seq 16-mer libraries were prepared according to manufacturer’s instructions (PacBio MAS-seq kit Cat, 102-407-900). Briefly, 55ng cDNA was split between 16 MAS PCR reactions, each with the following final reaction conditions: 50 μl of MAS PCR mix, 15 ng of cDNA library, adding H2O to bring final volume to 90 ul, then adding 10 μl of of MAS primer mix pairs composed of sequential primers to each respective PCR reaction (see primer table). To generate indexed arrays, we substituted indexed primers for the A and Q primers (see primer table). Reactions were amplified using the following cycling conditions: 98°C for 3 minutes, followed by 8 cycles of 98°C for 20 s, 68°C for 30 s and 72°C for 8 minutes, followed by a final 72°C extension for 10 minutes. The amplified libraries were pooled and cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 50 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). MAS array formation was carried out by mixing 4 ug of MAS PCR product in 47 μl volume with 10 μl MAS enzyme and incubated at 37°C for 30 minutes. Subsequently, 3ul of barcode MAS adapter and 20 μl of MAS ligation additive was mixed into the sample followed by 10 μl of MAS ligase buffer and 10 μl of MAS ligase. Sample was thoroughly mixed by gentle pipetting using a wide bore pipette tip to avoid bubble formation. MAS array formation reactions were incubated at 42°C for 60 minutes followed by 4°C hold (52°C lid). Array formation reactions were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 43 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). DNA Damage Repair was carried out by mixing 42 μl of sample with 6 μl of Repair buffer and 2 μl of DNA repair mix, and incubated at 37°C for 30 minutes (lid 47°C). DNA damage repair reactions were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 40 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). Nuclease treatment was carried out by mixing 40 μl of DNA damage repaired library with 5 μl Nucelase buffer and 5 μl Nuclease mix and incubated at 37°C for 60 minutes followed by 4°C hold (lid 47°C). The final MAS array libraries were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 20 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). Library size was obtained using a Genomic DNA ScreenTape and 4150 Tapestation (Agilent Technologies). MAS-seq 16-mer library prep for PacBio sequencing The following samples were prepared for standard Kinnex 16mer workflow: View this table: View inline View popup Download powerpoint MAS-seq 16-mer libraries were prepared according to manufacturer’s instructions (PacBio MAS-seq kit Cat, 102-407-900). Briefly, 55ng cDNA was split between 16 MAS PCR reactions, each with the following final reaction conditions: 50 μl of MAS PCR mix, 15 ng of cDNA library, adding H2O to bring final volume to 90 ul, then adding 10 μl of of MAS primer mix pairs composed of sequential primers to each respective PCR reaction (see primer table). Reactions were amplified using the following cycling conditions: 98°C for 3 minutes, followed by 8 cycles of 98°C for 20 s, 68°C for 30 s and 72°C for 8 minutes, followed by a final 72°C extension for 10 minutes. The amplified libraries were pooled and cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 50 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). MAS array formation was carried out by mixing 4 ug of MAS PCR product in 47 μl volume with 10 μl MAS enzyme and incubated at 37°C for 30 minutes. Subsequently, 3ul of barcode MAS adapter and 20 μl of MAS ligation additive was mixed into the sample followed by 10 μl of MAS ligase buffer and 10 μl of MAS ligase. Sample was thoroughly mixed by gentle pipetting using a wide bore pipette tip to avoid bubble formation. MAS array formation reactions were incubated at 42°C for 60 minutes followed by 4°C hold (52°C lid). Array formation reactions were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 43 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). DNA Damage Repair was carried out by mixing 42 μl of sample with 6 μl of Repair buffer and 2 μl of DNA repair mix, and incubated at 37°C for 30 minutes (lid 47°C). DNA damage repair reactions were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 40 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). Nuclease treatment was carried out by mixing 40 μl of DNA damage repaired library with 5 μl Nucelase buffer and 5 μl Nuclease mix and incubated at 37°C for 60 minutes followed by 4°C hold (lid 47°C). The final MAS array libraries were cleaned using 0.8× SPRIselect beads (Beckman Coulter B23318), eluted into 20 μl elution buffer and quantified with Qubit (Thermo Fisher Scientific, Q32851). Library size was obtained using a Genomic DNA ScreenTape and 4150 Tapestation (Agilent Technologies). 3′-Deoxyguanosine (3dG) Template Switch Oligo Experiment The 3′-Deoxyguanosine (3dG) template switch oligo was obtained from Biosyn Corporation, sequence: /5BiosG/AAGCAGTGGTATCAACGCAGAGTACATrGrG{3dG} with RNAse free HPLC purification. Following the standard 10x protocol, the 3dG TSO was resuspended to 1mM in elution buffer. The TSO comparison study was performed using PBMCs (AllCells) on the 10x 3′ Gene Expression Kit (10x Genomics, Cat #PN-1000269). The PBMC samples were run in parallel comparing following the standard protocol and a 3dG protocol with the only modification being the addition of 2.4ul of 1mM 3dG TSO in place of the standard TSO. Computational Methods Code Availability The code to perform the computational analysis for this paper is available at www.github.com/MethodsDev/mdl-sc-isoform-paper . This repository contains Jupyter notebooks that document all of the analysis performed in this manuscript and produce all of the figures. In addition, the analysis relies on the following custom packages available on Github: View this table: View inline View popup Download powerpoint References The following human references were used: – FASTA: GRCh38_no_alt genome – BED (provided to minimap2): Gencode v39 annotation – GTF: Gencode v39 basic annotation – only transcripts with Gencode’s basic tag – Gencode polyA site annotation – polyA motif file (distributed by SQANTI3, from Beaudoing et al. , 2000) For the barnyard experiment, we used the mouse-and-human genome reference available from 10x and built a new STAR index that was compatible with PIPseeker. Short-read sequencing data processing Data from the Illumina NovaSeq X was demultiplexed with bcl2fastq2 (v2.20.0.422). 10x 3′ and 10x 5′ experiments were then processed using cellranger (v7.1.0) using the reference genome and GTF specified above. The PIPseq samples were processed with PIPseeker (PBMC sample: v2.1.4; Initial fixation experiments: v3.0.5; Second round of fixation experiments: v3.3.0). The PBMC sample was aligned to the human reference and the barnyard experiment was aligned to a combined human and mouse reference, derived from the 2020-A reference made available by 10x Genomics at https://www.10xgenomics.com/support/software/cell-ranger/downloads#reference-downloads . Long-read sequencing data processing Long-read sequencing was performed on two instruments: monomer cDNA sequencing libraries were sequenced on the PacBio Sequel IIe, while MASseq arrays were sequenced on PacBio Revio sequencers. Sequel IIe samples were demultiplexed by first annotating primers with lima (v2.9.0) and then separating individual samples with a custom tool, named callao. callao uses the annotations created by lima and outputs one BAM for each sample. For the analysis of artifacts in these data, we included reads that contained the same index on either end (A i -A i and Q i -Q i reads), which are produced when cDNA has the same adapter on either end. In MASseq arrays these artifacts can produce palindromic arrays of different lengths (e.g. ABCBA). Revio samples were demultiplexed in the same fashion and then deconcatenated using skera (v1.2.0), using custom array lists based on the A i and Q i adapters that correspond to the indexes used. After the demultiplexing and deconcatentation of MASseq arrays, reads were classified using marti, a tool we developed for in-depth artifact analysis. At a high level, marti classifies each read based on the presence, absence, and location of predefined target sequence features (such as, PCR and sequencing adapters) within the read. It takes as input an unaligned BAM file and a YAML configuration file with search parameters (e.g., the expected sequencing error rate and minimum poly(A) tail length) and target sequences of interest. It outputs (1) a BAM file with annotated cDNA reads, where per-read annotations include the artifact category, the coordinates of all the target sequence features and the native cDNA sequence, and the structural representation of the read (which captures the order and orientation of the target features within the read), and (2) a report of counts for each artifact category and read structure. marti classifies the reads into the following classes: (1) Proper : non-artifactual read, (2) TSO-TSO : RT artifact caused by the internal priming of the switch-leader-sequence (SLS, terminal 3′ end of the TSO which is not part of the TSO adapter sequence) of the TSO, characterized by the presence of a TSO at both ends of the reads, (3) RT-RT : RT artifact caused by the internal priming of the oligo dT (from the RT primer), characterized by the presence of the RT PCR adapter and a poly(A) at both ends of the read, (4) internal-priming-RT-adapter : PCR artifact caused by the internal priming of the RT PCR adapter, (5) internal-priming-TSO-adapter : PCR artifact caused by the internal priming of the TSO PCR adapter, (6) only-polyA : artifact class assigned when only As or Ts are found inside the read sequence, likely a result of RNA degradation, (7) missing-adapter : sequencing artifact, assigned when the read is missing at least one PCR adapter at either end, (8) internal-adapter : sequencing artifact, assigned when an extra adapter is found inside the read, (9) retained-SMRTbell : sequencing artifact in PacBio reads, assigned when the PacBio SMRTbell adapter is found inside the read, (10) unknown : artifact of unknown type assigned when the read structure does not correspond to a proper read nor any other known artifact classes, which allows users to identify novel artifact classes (by examining the discovered read structures) and filter out such constructs from downstream analysis. Since some artifact classes may correspond to multiple read structures, marti separately reports and quantifies each observed read structure within each artifact class. For instance, the internal-priming-TSO-adapter artifact can be detected in the following three different read configurations (in each orientation): (TSO PCR adapter, cDNA, poly(A), reverse_complement(RT PCR adapter), (TSO PCR adapter, cDNA, reverse_complement(TSO)), and (TSO PCR adapter, cDNA, reverse_complement(TSO PCR adapter). With the exception of internal polyA priming , Supplementary Figure 6 summarizes the artifact classes and read structures reported by marti. Short-read analysis Barnyard data Publicly available data for human-mouse mixture experiments were obtained from https://www.10xgenomics.com/datasets . We downloaded the unfiltered raw h5 matrix files for two barnyard experiments, which had been aligned to the same human-mouse combined reference used for the PIPseq experiment: – 10k_hgmm_3p_nextgem_Chromium_Controller_raw_feature_bc_matrix.h5 for 10x 3′ v3.1 data – 10k_hgmm_5pv2_nextgem_Chromium_Controller_raw_feature_bc_matrix.h5 for 10x 5′ v2 data We called cells and doublets using a UMI threshold adjusted for the relative depth of the three experiments. To calculate doublet rates relative to throughput, 10x 3′ cells were called at 2,750 UMIs/barcode, 10x 5′ cells were called at 3,000 UMIs/barcode and PIPseq cells were called at 10,000 UMIs/barcode. To measure ambient levels in these experiments we considered cells with ≥10,000 UMIs of one species and <1,000 UMIs of the other as confident single-species barcodes, and considered the percentage of cross-species UMIs as a proxy for ambient contamination. PBMC data As a preliminary analysis, each dataset was analyzed separately using all reads. For the 10x data we filtered to barcodes with >1,000 UMIs. After estimating the number of cells in each sample, we subsampled the full sequencing data so that each dataset contained roughly 100,000 reads per cell and selected barcodes with >1,000 UMIs after subsampling (Supplementary Fig. 13). All downstream analysis was performed on the subsampled data. Gene selection and clustering were performed as described in J. Langlieb & N.S. Sachdev, 2023. Briefly, we used a binomial model to estimate the expected rate of non-zero counts in the case of homogenous gene expression in a population. A gene was selected as variable if the observed percentage of cells with non-zero count deviated from Poisson expectation by more than a specified percentage. A minimum of 5% deviation from expectation was used for gene selection during clustering. Clustering was performed on each dataset separately, with a hierarchical strategy 1 . Each round of clustering consisted of four steps. First, gene selection was performed based on the above binomial model. Next, counts were square-root transformed. Third, k -nearest neighbor (kNN using cosine distance, with k = 80) and shared nearest neighbor (SNN) graphs were constructed. Finally, the SNN graph was clustered using the Leiden algorithm over a range of resolution parameters to find the lowest resolution that yielded multiple clusters 2 . The resulting clusters were each recursively re-clustered with the same strategy, and the process was continued until no resolution parameter yielded multiple clusters, or the resulting clusters were too small to compute a meaningful SNN graph (if n < k = 80 cells, the graph is fully connected and all edge weights are equal). Clusters were labeled by manual inspection, using a set of canonical markers for peripheral blood cells. For visualization purposes we used a force-directed graph embedding based on the same type of shared nearest neighbors graph. In this case a minimum deviation of 10% was used. The SFDP layout algorithm was run on the SNN graph and the three embeddings were rotated and reflected as needed to align similar cell populations to each other. Hashtagged Fixation Comparisons Count matrices of cells and hashtags were generated using PIPseeker’s built-in functionality. We selected the highest-sensitivity filtered matrix (level 5, in PIPseeker’s output) for processing and took the barcodes called for each hashtag for further analysis. Counts were processed using Seurat v4.3 using standard single-cell analysis practices and parameters (dim 1:15, res = 0.8). Hashtag counts were CLR normalised and cells were demultiplexed to their respective fixation or fresh using Seurat inbuilt HTODemux function (0.99 positive quantile). Annotation of Cell types Canonical markers commonly used for differentiating cell types were used for annotation of clusters (PMCID: PMC8238499). First cells were demarcated by broader lineage associated markers (Myeloid: LYZ , B-cell: MS4A1 , T-cell/NK/ILC: CD3 , Red blood cell: PPBP ), then clusters within each lineage was further stratified as per following: Classical Monocyte: CD14+FCGR3A-, Non-classical monocyte: CD14 + FCGR3A+ , DC: CLEC10A+ / CD1C+ / IRF7+ , CD4 T-cells: CD4 + CD8 -, CD8 T-cells: CD8 + CD4 - NCAM1 -, NK: TRBC1-XCL1 + NCAM1 +, CTL: CD8+/− GNLY+, TRBC1 +/− ILC: TRDC+ CD8 +/− NCAM1 +/− TRBC1+/−, Naïve T-cell: CD3 + SELL+ CCR7+ Long-read analysis Barcode extraction To extract barcodes and UMIs from long-read data we extracted valid barcodes and UMIs from the marti-annotated sequences. Only Proper reads were used for alignment and isoform identification. Given the expected single-cell library structure and location of the adapters (provided by marti annotation), we extracted the putative location of the cell barcode and matched the region against a whitelist of known barcodes (Supplementary Fig. 14). For matching we used a maximum edit distance of 1 and only accepted barcodes with a single unambiguous match. After the barcode has been located, the adjacent sequence of the appropriate length is taken as the UMI. Using the structure of the read we also extracted the cDNA and reoriented all reads to the forward orientation for mapping. To perform barcode matching on very long (39bp) PIPseq barcodes we developed a new library, bouncer, which uses a symmetric-deletion index for efficient lookup of match candidates. This is an extension of the SymSpell method first developed for spell-checking by Wolfe Garbe ( https://github.com/wolfgarbe/SymSpell ). The most significant change from the original algorithm is the use of a partitioning method to efficiently store the very large (85M barcode) whitelist of 39bp PIPseq barcodes. The code for bouncer and the SymSpell implementation barcode-symspell are available on GitHub (see above). Read alignment and filtering After extracting cDNA and orienting all reads to the forward strand, we aligned reads to the GRCh38 reference genome using minimap2 (v2.28) 3 . Alignment was performed with the following settings: minimap2 –t 16 –ayx splice:hq –uf –G1250k –Y –-MD –-junc-bed [gencode_v39_bed] [GRCh38_fasta] [input_fastq] > [output] Of note is –G1250k to allow introns up to the maximum size annotated in the human genome, and –uf to only perform alignment on the forward strand of the input, as we know the original orientation of the cDNA from our annotation. While we use the “basic” Gencode annotation for isoform quantification, here we use the full annotation to give minimap2 access to additional putative splice sites. After alignment, we filtered the aligned BAMs for high-scoring primary alignments. High-scoring is defined as: MAPQ ≥ 60 or [alignment score] / [query len] > 0.9. In our testing, including secondary and supplemental alignments did not improve the isoform identification for this dataset, and their presence complicated the internal priming analysis. Isoform identification IsoQuant (v3.4.2) was run on the filtered primary reads with the GRCh38 genome, the Gencode v39 basic reference, and the following options: –-complete_genedb, –-stranded forward to indicate the reads are already correctly oriented, and –-polya_requirement never because we have removed the poly(A) tails from the reads 4 . We found that including poly(A) tails during alignment led to more reads misaligned to pseudo-genes in the genome. To compare the results of IsoQuant across the experiments, we used gffcompare (v0.12.6) to match the novel transcripts in the transcript_models.gtf output by IsoQuant to the input reference, using the –-strict-match option 5 . This comparison was restricted to transcripts with at least five assigned reads. Priming analysis and read annotation We categorized each read according to the genomic context of their alignment, and assigned them to one of 19 categories, of which four categories were considered to be expected priming events. To be classified as an expected priming event, a read must terminate near a known transcription stop site or be adjacent to a poly(A) signal motif. If the read does not terminate near an annotated stop site, it must not contain a “genomic poly(A)”, defined as no more than 12 adenines and no more than 6 in a row starting within the 20 bases downstream of the read. These thresholds were determined empirically from analysis of bulk and single-cell mRNA datasets and are similar to the thresholds used by Pacific Biosciences’ pigeon tool 6 . The output from the priming analysis is a set of annotated BAM files, with metadata for each read indicating its priming analysis as well as additional metadata from IsoQuant, including the transcript assignment and the SQANTI classification (e.g. full or incomplete splice match) for each read. Having produced these annotated BAMs we could efficiently quantify the number of transcripts in different categories (known transcripts, novel-in-catalog transcripts from known genes, and novel not-in-catalog transcripts from known and novel genes) and calculate the proportion of reads for each transcript that came from expected or unexpected priming sites. Coverage analysis After creation of the annotated BAM files, coverage could be efficiently calculated for each transcript by filtering the tagged reads based on the annotated tags. To construct an average profile, the coverage for each transcript was scaled to an equal number of bins (n = 1000) and the coverage was interpolated over that range. This allowed us to create an average profile over transcripts of differing lengths, either within a certain range of lengths or over the entire observed transcriptome. Supplementary Figures Download figure Open in new tab Supplementary Figure 1: Kneeplots and metrics for the short-read libraries, subsampled to ∼100,000 reads per cell. Download figure Open in new tab Supplementary Figure 2: Barnyard data showing the number of cells and doublets called for each experiment, as well as the distribution of cross-species contamination from ambient mRNA. 10x 3′ and 5′ data is from public datasets made available by 10x Genomics. The top left quadrant contains mouse cells, while the bottom right quadrant contains human cells. The top right quadrant contains doublets of mouse and human (10x 3′:; 10x 5′:; PIPseq:). Note that the expected doublet count is twice this number, as we cannot detect doublets of the same species. Download figure Open in new tab Supplementary Figure 3: Distribution of ambient contamination (measured via cross-species alignment) estimated from barnyard experiment data, based on barcodes called confidently as one species (>10,000 UMIs) but not the other (<1,000 UMIs from the other species). On the right the distribution is log-scaled to show outliers. Download figure Open in new tab Supplementary Figure 4: Read length distributions for Kinnex experiments. 0.8x and 0.6x refer to SPRI ratios used for PIPseq. Download figure Open in new tab Download figure Open in new tab Supplementary Figure 5: Length histograms and ligation heatmaps for single-cell PBMC Kinnex experiments. 0.8x and 0.6x refer to SPRI ratios used for PIPseq. Download figure Open in new tab Supplementary Figure 6: Schematics of artifacts. Download figure Open in new tab Supplementary Figure 7: Length histograms and ligation heatmaps for Kinnex data from 10x 3′ (no clean), 10x 3′ (cleaned) and 10x 3dG libraries. Download figure Open in new tab Supplementary Figure 8: SQANTI3-style classifications for Kinnex data, broken down by cell types determined from the short-read clustering data. FSM: Full splice match; ISM: Incomplete splice match; NNIC: novel not in catalog; NIC: novel in catalog. Download figure Open in new tab Supplementary Figure 9: Violin plots showing UMIs/cell for CD14+ monocytes from preliminary comparison of fixation protocols. Download figure Open in new tab Supplementary Figure 10: Violin plots showing read length distributions for CD14+ monocytes from preliminary comparison of fixation protocol. Download figure Open in new tab Supplementary Figure 11: Three examples of alternative polyadenylation sites that lead to transcript misclassification, which contributes to the observed 5′ enrichment in transcript coverage of full splice matches. Download figure Open in new tab Supplementary Figure 12: Binned transcript coverage for Kinnex data, broken down by priming classification and splice match. Transcripts have been grouped into three length bins, and average coverage is shown for each quartile of transcript length. Download figure Open in new tab Supplementary Figure 13: Saturation plot showing the number of UMIs per barcode for barcodes above 1000 UMIs at various subsampling rates. Download figure Open in new tab Supplementary Figure 14: Kneeplots derived from long-read data, similar to Supplementary Fig. 1. View this table: View inline View popup Download powerpoint Supplementary Table 1: Cell type counts and proportion from short-read clustering. View this table: View inline View popup Download powerpoint Supplementary Table 2: marti classifications from monomer cDNA libraries. View this table: View inline View popup Download powerpoint Supplementary Table 3: CellRanger metrics for standard TSO and 3dG TSO 10x 3′ experiment. View this table: View inline View popup Download powerpoint Supplementary Table 4: SQANTI classifications from IsoQuant run on primary reads View this table: View inline View popup Supplementary Table 5: SQANTI classifications from IsoQuant, broken down by cell type View this table: View inline View popup Supplementary Table 6: Rules for classification of priming sites based on genomic context of alignment. A check mark indicates a required feature, an X means that feature cannot be present, and a dash means either case is allowed. The categories in bold are expected priming sites, while the rest are considered likely to be mispriming events. View this table: View inline View popup Download powerpoint Supplementary Table 7: Full priming classification, not including mitochondrial reads. View this table: View inline View popup Supplementary Table 8: MAS Primer Sequences Acknowledgements This work was supported both by Broad Clinical Labs and a collaboration agreement between Pacific Biosciences of California, Inc. and the Broad Institute. Methods Bibliography 1. ↵ Langlieb , J. , et al. The molecular cytoarchitecture of the adult mouse brain . Nature 624 , 333 – 342 ( 2023 ). OpenUrl CrossRef PubMed 2. ↵ Traag , V. , Waltman , L. & van Eck , N. J . From Louvain to Leiden: guaranteeing well-connected communities . Sci. Rep . 9 , 5233 ( 2019 ). OpenUrl CrossRef PubMed 3. ↵ Li , H . Minimap2: pairwise alignment for nucleotide sequences . Bioinformatics 34 , 3094 – 3100 ( 2018 ). OpenUrl CrossRef PubMed 4. ↵ Prjibelski , A. D. , et al. Accurate isoform discovery with IsoQuant using long reads . Nat. Biotechnol . 1 – 4 ( 2023 ) doi: 10.1038/s41587-022-01565-y . OpenUrl CrossRef 5. ↵ Pertea , G. & Pertea , M. GFF Utilities: GffRead and GffCompare . Preprint at doi: 10.12688/f1000research.23297.2 ( 2020 ). OpenUrl CrossRef 6. ↵ SMRT Link Kinnex full-length RNA troubleshooting guide . Bibliography 1. ↵ Verwilt , J. , Mestdagh , P. & Vandesompele , J . Artifacts and biases of the reverse transcription reaction in RNA sequencing . RNA 29 , 889 – 897 ( 2023 ). OpenUrl Abstract / FREE Full Text 2. ↵ Svoboda , M. , Frost , H. R. & Bosco , G. Internal oligo(dT) priming introduces systematic bias in bulk and single-cell RNA sequencing count data . NAR Genom. Bioinform . 4 , lqac035 ( 2022 ). OpenUrl 3. ↵ Clark , I. C. , et al. Microfluidics-free single-cell genomics with templated emulsification . Nat. Biotechnol . 41 , 1557 – 1566 ( 2023 ). OpenUrl CrossRef PubMed 4. ↵ Single Cell Analysis , RNA Sequencing (scRNA-Seq) . Fluent BioSciences https://www.fluentbio.com/ ( 2021 ). 5. Al’Khafaji , A. M. , et al. High-throughput RNA isoform sequencing using programmed cDNA concatenation . Nat. Biotechnol . 42 , 582 – 586 ( 2024 ). OpenUrl CrossRef PubMed 6. ↵ Martin , B. K. , et al. Optimized single-nucleus transcriptional profiling by combinatorial indexing . Nat. Protoc . 18 , 188 – 207 ( 2023 ). OpenUrl CrossRef PubMed 7. ↵ Wang , X. , Yu , L. & Wu , A. R . The effect of methanol fixation on single-cell RNA sequencing data . BMC Genomics 22 , 420 ( 2021 ). OpenUrl CrossRef PubMed 8. ↵ Celik , M. H. & Mortazavi , A . Analysis of alternative polyadenylation from long-read or short-read RNA-seq with LAPA . bioRxiv ( 2022 ) doi: 10.1101/2022.11.08.515683 . OpenUrl Abstract / FREE Full Text 9. ↵ Pardo-Palacios , F. J. , et al. SQANTI3: curation of long-read transcriptomes for accurate identification of known and novel isoforms . Nat. Methods 21 , 793 – 797 ( 2024 ). OpenUrl CrossRef PubMed 10. Gruber , A. J. & Zavolan , M . Alternative cleavage and polyadenylation in health and disease . Nat. Rev. Genet . 20 , 599 – 614 ( 2019 ). OpenUrl CrossRef PubMed 11. Beaudoing , E. , Freier , S. , Wyatt , J. R. , Claverie , J. M. & Gautheret , D . Patterns of variant polyadenylation signal usage in human genes . Genome Res . 10 , 1001 – 1010 ( 2000 ). OpenUrl Abstract / FREE Full Text 12. Tian , B. & Manley , J. L . Alternative polyadenylation of mRNA precursors . Nat. Rev. Mol. Cell Biol . 18 , 18 – 30 ( 2017 ). OpenUrl CrossRef PubMed 13. Prjibelski , A. D. , et al. Accurate isoform discovery with IsoQuant using long reads . Nat. Biotechnol . 41 , 915 – 918 ( 2023 ). OpenUrl CrossRef PubMed 14. Pertea , G. & Pertea , M. GFF utilities: GffRead and GffCompare . F1000Res . 9 , 304 ( 2020 ). OpenUrl 15. Brouze , A. , Krawczyk , P. S. , Dziembowski , A. & Mroczek , S . Measuring the tail: Methods for poly(A) tail profiling . Wiley Interdiscip. Rev. RNA 14 , e1737 ( 2023 ). OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted February 27, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Multi-platform evaluation and optimization of single-cell RNA isoform sequencing Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Multi-platform evaluation and optimization of single-cell RNA isoform sequencing James T. Webber , Daniel A. Bartlett , Ghamdan Al-Eryani , Akanksha Khorgade , Asa Shin , Christophe Georgescu , Houlin Yu , Brian J. Haas , Victoria Popic , Aziz Al’Khafaji bioRxiv 2025.02.22.639662; doi: https://doi.org/10.1101/2025.02.22.639662 Share This Article: Copy Citation Tools Multi-platform evaluation and optimization of single-cell RNA isoform sequencing James T. Webber , Daniel A. Bartlett , Ghamdan Al-Eryani , Akanksha Khorgade , Asa Shin , Christophe Georgescu , Houlin Yu , Brian J. Haas , Victoria Popic , Aziz Al’Khafaji bioRxiv 2025.02.22.639662; doi: https://doi.org/10.1101/2025.02.22.639662 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetics Subject Areas All Articles Animal Behavior and Cognition (7622) Biochemistry (17650) Bioengineering (13871) Bioinformatics (41880) Biophysics (21424) Cancer Biology (18566) Cell Biology (25461) Clinical Trials (138) Developmental Biology (13365) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15590) Genomics (22475) Immunology (17713) Microbiology (40328) Molecular Biology (17148) Neuroscience (88473) Paleontology (666) Pathology (2827) Pharmacology and Toxicology (4816) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-23T02:00:01.238055+00:00

License: CC-BY-NC-4.0