Full text
74,499 characters
· extracted from
preprint-html
· click to expand
Gene library deep sequencing for protein super-family profiling | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Gene library deep sequencing for protein super-family profiling Rahkesh T Sabapathy , Kara Henry-Cocks , Jackson Feng , Shashikanth Marri , Gustavo Bracho Granado , View ORCID Profile Harald Janovjak doi: https://doi.org/10.1101/2025.10.20.682913 Rahkesh T Sabapathy 1 Flinders Health and Medical Research Institute (FHMRI), College of Medicine and Public Health, Flinders University , Bedford Park 5042, Adelaide, South Australia, Australia Find this author on Google Scholar 
Find this author on PubMed Search for this author on this site Kara Henry-Cocks 1 Flinders Health and Medical Research Institute (FHMRI), College of Medicine and Public Health, Flinders University , Bedford Park 5042, Adelaide, South Australia, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jackson Feng 2 Australian Regenerative Medicine Institute (ARMI), Faculty of Medicine, Nursing and Health Sciences, Monash University , Clayton 3800, Melbourne, Victoria, Australia 3 European Molecular Biology Laboratory Australia (EMBL Australia), Monash University , Clayton 3800, Melbourne, Victoria, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Shashikanth Marri 1 Flinders Health and Medical Research Institute (FHMRI), College of Medicine and Public Health, Flinders University , Bedford Park 5042, Adelaide, South Australia, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gustavo Bracho Granado 1 Flinders Health and Medical Research Institute (FHMRI), College of Medicine and Public Health, Flinders University , Bedford Park 5042, Adelaide, South Australia, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Harald Janovjak 1 Flinders Health and Medical Research Institute (FHMRI), College of Medicine and Public Health, Flinders University , Bedford Park 5042, Adelaide, South Australia, Australia 2 Australian Regenerative Medicine Institute (ARMI), Faculty of Medicine, Nursing and Health Sciences, Monash University , Clayton 3800, Melbourne, Victoria, Australia 3 European Molecular Biology Laboratory Australia (EMBL Australia), Monash University , Clayton 3800, Melbourne, Victoria, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Harald Janovjak For correspondence: 
harald.janovjak{at}flinders.edu.au Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Unravelling protein function at scale remains challenging. Gain-of-function (GOF) screens harnessing large open reading frame (ORF) libraries enable systematic functional annotations of cell behavior drivers and drug targets. Currently, few sequencing methods are established to analyze existing ORF libraries and facilitate future library generation. Here, we developed a barcode-free deep sequencing method termed pooled plasmid library sequencing (PPLseq). PPLseq achieved high efficiency and single nucleotide accuracy, including the detection of previously undocumented variants, on a prototypical super-family ORF library (>300 human G-protein-coupled receptors (GPCRs)). We next combined PPLseq with high-throughput gene engineering to generate a new library of 246 GPCR-fluorescent protein (FP) fusions. We quantified expression of all members of this library and identified robustly expressed yet understudied receptors. Collectively, we demonstrate accessible, scalable, and sensitive ORF library sequencing towards a deeper understanding of proteome function. Introduction A major scientific endeavor is to understand the form and function of the ∼25’000 proteins encoded in the human genome. Whereas recent deep learning-based advances, such as AlphaFold [ 1 , 2 ] or RoseTTAFold [ 3 ], provide proteome-wide models of form, defining function most commonly relies on experimental inquiry. Complementary to loss-of-function experiments (e.g., gene knock-out or -down using CRISPR or RNAi), GOF screens allow high-throughput analysis of ORF (over)expression in the context of complex cell states and behaviors ([ 4 – 7 ], and below). 
However, the generation, propagation, and application of pooled or arrayed ORF libraries is challenged by undocumented and/or undesired coding sequence alterations and non-uniform coverage, as well as recombination events and off-target effects during gene delivery [ 8 – 13 ]. These confounders can individually or collectively limit library reliability and screen power and hence call for rigorous sequence characterization. Limited large-scale sequencing was performed on ORF libraries prior to the broad availability of NGS methods [ 14 – 19 ]. More recently, NGS verification has been applied to libraries of short regulatory sequences or isolated protein domains [ 4 , 20 – 23 ] (typically <300 base pairs (bp)) but less so to full-length ORF libraries including those obtained commercially or from repositories [ 10 , 12 , 24 ]. NGS has been demonstrated to be essential to quantify enrichment in elegant pooled screens but in most studies ORF integrity was not assessed [ 5 , 7 , 13 , 25 – 27 ]. Thus, the potential of NGS to systematically characterize ORF libraries is underexplored. One particular research area that relies on ORF libraries and thus may benefit from improved methods are studies of large protein (super-)families, such as membrane proteins [ 30 – 32 ]. For instance, ∼300 diverse receptors (and ∼400 olfactory receptors) form the GPCR super-family [ 33 , 34 ]. GPCRs are expressed in virtually every human cell type, sense diverse ligands, and are the target of ∼30% of all prescription drugs [ 35 , 36 ]. Only very few super-family-wide GPCR libraries are available and large-scale functional studies of GPCRs remain challenging. Inoue et al. and Avet et al. determined downstream coupling profiles for 148 or 100 therapeutically-relevant wild-type GPCRs [ 37 , 38 ]. Kroeze et al. fused 314 DNA sequence-optimized GPCRs to a cleavable transactivator domain for high throughput functional assays, and Lv et al. and Tedman et al. 
generated extensive libraries of 826 or 940 human GPCRs and splice variants and applied these to surface expression analysis [ 39 – 41 ]. Finally, we generated a library of multi-fusion chimeric Class A orphan GPCRs for optogenetics [ 42 ]. Sequencing needs to be on par in scale and throughput with high-throughput gene engineering methods, such as those employed in these studies. This will enable the generation of further libraries of GPCRs or even larger protein super-families (e.g., kinases or transcription factors) and ultimately the dissection of commonalities and differences in super-family-omes. Here, we systematically developed protein super-family library NGS. We first evaluated whether short-read sequencing, being the most economical and large-scale NGS method, is suited to characterize prototypical ORF libraries. We then developed the experimental workflow termed PPLseq and validated it on a previously established GPCR library. We showed that PPLseq can report library coverage and accuracy with single nucleotide resolution, including previously undocumented ORF variants. We expanded on the use of PPLseq to generate a large fluorescently-labelled GPCR library. Analysis of this library using single-cell confocal microscopy quantified expression levels, including of understudied receptors, as targets for future structure-function analysis. Results Barcode-free ORF library sequencing We first asked if short-read NGS is suited for ORF library sequencing without barcoding ( Figure 1a , Supplementary Figure S1 ). ORF-specific barcoding prior to NGS library preparation assists with read mapping but is incompatible with the analysis of existing pooled libraries and adds additional processing steps ( Supplementary Figure S1 ). Barcodes are not required if library genes are sufficiently different for reads to be mapped to the correct ORF ( Figure 1a ).
We tested this computationally for three large human protein super-families: the largest receptor family (314 unique non-odorant GPCRs), the largest enzyme family (495 unique kinases), and a diverse super-family (1993 unique transcription factors (TFs)) (for sequences, source materials, and deduplication methods see Supplementary Table 1 , Supplementary Data S1-3 and Materials and Methods ; the combined lengths of the ORFs in these libraries are 0.42 (GPCRs), 0.88 (kinases) and 3.2 Mb (TFs)). To test for unambiguous read mapping, we searched the libraries for non-unique stretches using four sliding windows (50, 75, 150, or 300 bases (b)) which reflect cycle number and read lengths presets in state-of-the-art short-read sequencers. The emerging data tracks report non-uniqueness akin to ‘uniquome’ analysis in whole genome sequencing [ 43 , 44 ]. We found that sequences in the GPCR library were unique for all window lengths ( Figure 1b,c ). For window lengths of 150 and 300 b, but not shorter lengths, only a small number of non-unique segments were identified both in the kinase and TF libraries ( Figure 1d-g , Supplementary Figure S2 ). The kinase library contained only two non-unique ORF pairs which correspond to gene splice variants of MAP2K2 and MST4 ( Supplementary Table S2 ). These pairs share eight (150 b) or six (300 b) overlapping segments ( Figure 1d,e ; segment lengths are specified in Supplementary Figure S2 ). In the TF library, ten sequence pairs (150 b) or one sequence pair (300 b) were identified containing a total of 20 (150 b) or two (300 b) overlapping segments ( Figure 1f,g , Supplementary Figure S2 ). These pairs correspond to paralogs (e.g., STAT5A and STAT5B [ 45 ]), genes that share exons ( ZIM2 and PEG3 [ 46 ]), or genes with internal tandem repeats (e.g., IFI16 [ 47 ]) ( Supplementary Table S2 ). Thus, with the exception of a small number of closely related ORFs, correct read mapping is likely at read lengths ≥150 b. 
Under the assumption that any technical replicate clones of the same ORF are placed in different sequencing pools (termed sub-libraries, see below), barcode-free pooled library short-read sequencing is suited for protein super-family sequencing. Download figure Open in new tab Figure 1. Multi-mapping reads in three large human gene families. (a) Pipeline encompassing plasmid library pool NGS and read mapping to correct (green box) or incorrect (red box) ORFs. Uniqueness analysis identifies non-unique segments that may result in ambiguous read mapping (reads with red outline). (b,c) Uniqueness tracks and the number of non-unique segments in the human GPCR library. (d,e) Uniqueness tracks and the number of non-unique segments in the human kinase library. (f,g) Uniqueness tracks and the number of non-unique segments in the human TF library. Grey horizontal lines in (b,d, and f) represent ORF library sequences from the first base of the first ORF to the last base of the last ORF. Black squares denote non-unique sequence stretches for the window length specified by the Y-axis and at the base position specified by the X-axis. Bars in (c,e, and g) denote the number of continuous non-unique sequence segments identified in the corresponding libraries. PPLseq workflow We next established the experimental PPLseq workflow ( Figure 2a ). We focused on the GPCR library because we have reengineered these receptors in the course of this work and because high quality reference sequences are available for 314 unique full length receptors in the PRESTO-Tango library [ 39 ]. To generate a library pool, we first replica-plated the cryo-preserved E. coli cultures onto agar trays for overnight growth ( Figure 2a ). Growth was observed for all but four genes ( Supplementary Figure S3 , also see below). The localized inoculation spots were pooled and plasmid DNA was isolated using standard silica columns. 
We performed NGS with a target data yield of 10 to 20 Gb as this would allow for deep sequencing (depth >1000; including plasmid backbones, the library spans ∼2.5 Mb). Sequencing should thus provide complete library deep sequencing with a read excess to compensate for E. coli plasmid copy number variability (see below). We chose a WGS library preparation method as it is designed for double-stranded DNA and offers read lengths supported by above uniqueness analysis in paired-end sequencing (150 b). We processed >43 million filtered NGS reads. The main steps of the pipeline were read mapping to ORFs collected in a reference sequence inventory, read statistics (i.e., overall coverage of the sequence from first to last codon or per base sequencing depth), and variant calling (i.e., deviations from the expected sequences) ( Figure 2a ). Pipeline outputs are discussed in the context of ORF recovery and accuracy. Download figure Open in new tab Figure 2. Gene recovery in PPLseq. (a) PPLseq workflow encompassing replica growth of clonal cultures, DNA pooling, NGS, and sequencing statistics. (b) Total reads mapped to each GPCR ORF. Red spheres indicate genes without mapped reads, except for SSTR1 that was mapped by two reads. (c) Mean depth for each GPCR ORF. Red spheres as in (b). (d) Standard deviation of mean depths of (c). In (b-d), grey lines indicate medians. ORF recovery in PPLseq We assessed recovery in the PPLseq workflow by comparing the identified GPCR ORFs to those expected in the library. We found that reads could be mapped to all but six ORFs indicating an initial gene recovery rate of >98% ( Figure 2b , Supplementary Data S4 ). These 308 mapped ORFs were sequenced with median counts of 38’487 reads per ORF and with a median depth of 4’284 ( Figure 2b,c ). 
We noticed that reads and mean depth were not uniformly distributed across the genes ( Figure 2b,c ) or within each gene ( Figure 2d , Supplementary Figure S4 ), but this did not negatively impact analysis and is discussed below. To assess recovery more completely, we further investigated the six genes with two or fewer mapped reads ( GLP2R , GPR1 , LPAR5 , HTR1D , RXFP3 , and SSTR1 ; red spheres in Figure 2b,c ). The library strains that should encode GLP2R , GPR1 , HTR1D , and RXFP3 did not yield inoculation spots in our library cultures and could also not be isolated from the pooled library using polymerase chain reactions (PCRs) ( Supplementary Figure S5 ). Thus, these genes were likely absent from our preparation of the source library and accordingly not captured by PPLseq. The strains that should encode LPAR5 and SSTR1 did yield inoculation spots and plasmids isolated from this growth were analyzed using Sanger sequencing. In the case of the plasmid expected to encode LPAR5 , Sanger sequencing reported a variant of the human LPAR5 receptor with an alternative DNA sequence ( Supplementary Table S3 ). In the case of the plasmid expected to encode SSTR1 , Sanger sequencing instead reported a sequence that is identical to SSTR2 ( Supplementary Table S4 ). This independent analysis using Sanger sequencing confirms that these two ORFs were not captured in PPLseq because they were apparently not present in the library. Thus we were able to demonstrate that PPLseq accurately describes ORF library content. Accuracy of PPLseq We next analyzed the accuracy of sequences obtained using PPLseq at the single nucleotide level. We found that all but the above six absent ORFs and PTH1R (which is discussed separately below) had complete coverage over their entire ORFs ( Figure 3a,b , Supplementary Figure S4 , Supplementary Data S4 ).
Variant calling demonstrated that of the 307 full length ORFs all but eleven were sequenced without deviation from the expected reference ORF sequence (i.e., with 100% sequence identity; Figure 3c ). For these eleven genes, between one and four apparent variants were reported, and most of these 20 apparent variants were single nucleotide polymorphisms ( Figure 3d,e , Supplementary Data S5 ). We analyzed site-level quality scores and the ratio of alternate and reference bases (expressed as the fraction of alternate base reads (AR fraction)) for the sites with reported variants ( Figure 3f,g , Supplementary Data S5 ). Quality scores were either very small (<1, for all variants with low AR fractions) or very large (>2000, for all variants with AR fractions of 100%). We first investigated the variants with high quality scores using Sanger sequencing ( BB3 (C927T), DRD3 (A527G), and PTH1R (A1714G); Figure 3h ). In the case of BB3 and DRD3 , we confirmed the substitutions observed in PPLseq that were not documented in the reference sequences for this library ( Supplementary Table S5, top ). The substitution in BB3 was synonymous (F309F). The substitution in DRD3 was non-synonymous encoding for an amino acid change in the protein relative to the deposited reference sequence (E176G; however, a glycine at this position corresponds to the canonical coding sequence). In the case of PTH1R , we found that the library strain encoded a canonical gene that was 22 amino acids longer than the reference sequence ( Supplementary Table S6 ), also providing an explanation for the incomplete coverage noted above. We next investigated the variants with low quality scores using Sanger sequencing ( Figure 3h ). We found that none of these variants were present in the library ( Supplementary Table S5, bottom ). These findings substantiate post-calling filtering based on quality scores for reporting of final sequences and filtered variants ( Figure 3i-k ).
As a further test for the variant calling methodology, we introduced ten single nucleotide variants (insertions, deletions, and substitutions) into the reference ORF sequences and asked whether these could be identified in the workflow. This was indeed the case for all introduced variants with high quality scores and AR fractions ( Supplementary Figure S6 , Supplementary Data S6,S7 ). Overall, these results demonstrate single nucleotide accuracy sequencing in the efficient PPLseq method. Download figure Open in new tab Figure 3. Accuracy of PPLseq. (a) Variant calling parameters (quality scores and AR fractions) in an illustrative ORF. (b) Sequence range of each GPCR ORF that is covered by mapped reads. Red spheres indicate genes from Figure 2 with limited reads and zero coverage. Blue spheres indicate receptors with variants discussed in the Main Text and below. (c) The number of ORFs recovered with 100% or less than 100% sequence identity (i.e., recovered with and without variants). (d,e) Distribution of variants per ORF and variant types. (f,g) Distribution of AR fractions and Q factors for all variants. (h) Variants verified using Sanger sequencing. Variants shown in orange were confirmed whereas variants shown in green were not confirmed. a: high quality score (>100); b: low quality score (<1). (i) The number of ORFs recovered with 100% sequence identity after variant filtering. (j,k) Distribution of quality score filtered variants per ORF and their variant types. GPCR library generation supported by PPLseq We next utilized PPLseq in the generation of a new library that contains 246 GPCRs fused to the FP mCherry ( Figure 4a ). This library is complementary to available GPCR collections that aim at understanding pharmacological and signaling properties [ 37 – 39 , 42 ] and to the best of our knowledge the largest assembly of GPCRs with a FP fused to their C-terminus.
For library generation, we applied the Type IIS restriction enzyme-based high throughput ‘Golden Gate’ gene insertion method [ 48 ]. We found that 280 of the 314 human GPCRs were amenable to this technique using two restriction enzymes selected with a hierarchical approach (SapI (229 ORFs) and BsmBI (51 ORFs), see Materials and Methods ). We amplified each receptor in individual PCRs using ORF-specific primers from the pooled GPCR library and inserted the PCR products into a vector containing mCherry. Anticipating limitations of the cloning technique (see below), we generated six sub-libraries: three sub-libraries for each insertion reaction with each sub-library containing the complement of restriction enzyme-specific GPCR ORFs ( Figure 4a ). PPLseq then proved both efficient and powerful in understanding library generation. A single NGS reaction was sufficient to characterize each sub-library consisting of a pool of dozens of ORFs. The composition of each sub-library was resolved in detail. No reads could be mapped in any of the sub-libraries to GLP2R , GPR1 , HTR1D , LPAR5 , and RXFP3 ( Supplementary Data S8-S13 ), which we showed were not present in the starting library as reference sequences. Similar numbers of inserted genes were identified in the individual sub-libraries and insertion rates were generally >80% reflective of the high efficiency of the Golden Gate method ( Figure 4b,c ). As expected, distribution of genes across the groups of sub-libraries were mutually exclusive (i.e., genes inserted with SapI were absent from the BsmBI sub-libraries, and vice versa ) ( Figure 4d ). The sequences of the inserted genes were verified through variant calling ( Figure 4e , Supplementary Data S14-S19 ). Of a total of 679 inserted ORFs, 551 (79.5%) were identical to the library sequence after quality score post-calling variant filtering. 
The remainder exhibited between one and three single nucleotide variants that were predominantly single nucleotide polymorphisms (SNPs) ( Figure 4f,g ). This analysis allowed assembly of a final library from the sub-libraries for functional studies ( Supplementary Data S20 ). The final library encompassed 246 mCherry-fused receptors with 100% sequence coverage and without variants, with the exception of synonymous substitutions in GABBR1 (G930A, GABAB1-A310A), GPR150 (T36G, GPR150-P12P), and RXFP1 (G909A, RXFP1-G303G), as well as one non-synonymous substitution in CHRM5 (T368C, M5R-V123A), which were included to maximize ORF numbers. Thus, PPLseq was valuable for understanding the efficiency of library construction at scale, including the specificity of amplification and insertion reactions, and compilation of a new receptor library. Download figure Open in new tab Figure 4. PPLseq-assisted generation of a new GPCR library. (a) Library generation through PCR reactions, Golden Gate insertion, sub-library fractionation (three sub-libraries per enzyme), and PPLseq. (b,c) Inserted GPCR ORFs recovered in six sub-libraries for insertion reactions utilising SapI (b, sub-library 1-3; out of 229 ORFs) and BsmBI (c, sub-library 4-6; out of 51 ORFs). The recovery criterion was 100% sequence coverage. (d) Mutually exclusive read distribution according to aggregated sub-libraries for the inserted 679 GPCR ORFs. (e) The number of ORFs recovered with 100% or less than 100% sequence identity (i.e., recovered with and without filtered variants). (f,g) Distribution of quality score filtered variants per ORF and their variant types. SL: sub-library. Expression profiles in new GPCR library The 246 GPCR-mCherry-fusion ORFs were expressed in HEK293 cells to determine their levels using confocal microscopy and single cell analysis.
Excluding controls, a total of 7.2 million single cells across 12,300 microscopy fields-of-view were analyzed corresponding to an average of 29,300 single cells per ORF ( Supplementary Data S21 ). We first optimized transfection conditions using two receptors with different expression levels. In microscopy images ( Figure 5a , Supplementary Figure S7a,b ), we quantified single cell fluorescence intensity and transfection efficiencies ( Figure 5b , Supplementary Figure S7c-f ). Recognizing the limitations of transient transfections, we found that transfection with 25 ng of GPCR-mCherry-fusion vectors resulted in cell transfection rates comparable to higher doses but markedly lower aggregation ( Supplementary Figure S7c-f ). Thus, this plasmid amount was chosen for experiments. Expression was observed for most receptors. For instance, expression levels of 234 receptors exceeded a threshold of five standard deviations (SD) above negative control wells ( Figure 5c ). We found that receptor fluorescence intensities varied by nearly ten-fold between the lowest and highest expressing receptor ( Figure 5c ) and were robust between biological replicates ( Figure 5d ). The receptor expression profiles were further analyzed. First, we identified the ten most highly expressed receptors and examined their tendency to aggregate ( Figure 5d , Supplementary Figure S8 ). We observed this for some receptors. For instance, receptors for the pituitary adenylate cyclase-activating polypeptide ( ADCYAP1R1 ) or melanocyte-stimulating hormones ( MC5R ) were largely membrane localized, whereas, for instance, the EP subtype receptor for prostaglandin E₂ ( PTGER2 ) was found to form aggregates ( Supplementary Figure S8 ). Second, we correlated expression level profiles to ORF length and GPCR classification. Class C or B2 GPCRs generally exhibited lower expression levels, in line with their complex architectures that include large extracellular domains ( Figure 5e ). 
Finally, we cross-referenced expression to information about the availability of ligands and atomic resolution structures ( Figure 5f,g ). Among the receptors that express at moderate to high levels (in the 50 th percentile), we identified 80 receptors for which between zero and two ligands have been reported ( Figure 5f ), and 63 receptors for which between zero and two structures have been reported ( Figure 5g ). Collectively, these results demonstrate the applicability of the new GPCR library in expression level profiling towards the identification of, e.g., robustly expressed receptors that are membrane localized or understudied for further biochemical or structural investigation. Download figure Open in new tab Figure 5. Visualisation of the GPCR ORF library. (a) Representative confocal microscopy images of HEK293 cells expressing CXCR4-mCherry and HTR1A-mCherry . Scale bars are 10 µm. (b) Average fluorescence in CXCR4-mCherry and HTR1A-mCherry transfected cells. (c) Fluorescence intensity distribution across GPCR-mCherry library (replicate 1) color-coded by GPCR family. Red line denotes the average intensity of negative controls. (d) Correlation between imaging replicates. In addition, the ten most highly expressed receptors are labelled. Receptor classes are color-coded as in (c). (e) Analysis of ORF length and fluorescence intensity. Receptor classes are color-coded as in (c). (f) Analysis of known physiological ligands and fluorescence intensity. Inset: Intensities of receptors with zero to two known ligands. (g) Analysis of available atomic resolution structures and fluorescence intensity. Inset: Intensities of receptors with zero to two available structures. Discussion GOF screens utilizing ORF libraries are powerful tools for large-scale exploration of protein function. 
Whereas NGS is routinely applied to quantify enrichment in pooled screens, including those studying ORFs [ 5 , 7 , 25 – 27 ], few studies utilized modern sequencing methods to examine library integrity. NGS facilitated whole-genome ORF screens of model organisms [ 10 , 17 ] and pathogens [ 49 – 51 ] as well as determined coverage in libraries of small protein domains [ 4 , 20 ] and full length proteins or plasmids [ 13 , 52 ]. However, in these studies, sequencing focused either on library intermediates or partial libraries and/or if sequence deviations were reported these were not resolved through independent validation. In addition, screens performed before the widespread adoption of NGS necessarily relied on libraries that were less extensively characterized. This underscores the need for a framework towards library sequencing at scale and with single-nucleotide accuracy. We systematically established super-family library NGS by integrating sequence uniqueness mapping, pooled library preparation, and characterization of reference and new libraries. Uniqueness analysis, which has been previously applied to genomes and transcriptomes [ 43 , 44 ], provided a measure to gauge fidelity of read alignments. This analysis showed that even in large super-family libraries only a limited number of genes, in particular gene isoforms, are beyond the reach of short-read NGS with ≥ 150 cycles. In PPLseq, read mapping and variant calling established on a reference library demonstrated complete recovery (i.e., identification of all ORFs in the library preparation) and high accuracy (i.e., identification of sequence variants that were then independently validated). Whereas there was general agreement between the predicted and observed mean depth, we noted variation in depth between ORFs. We attribute this to differences in input amount, e.g. 
due to varying plasmid copy numbers, and we can in turn conclude that the sequencing depth chosen here allowed adequate library representation. Finally, sequencing of sub-libraries using PPLseq facilitated the generation of a new GPCR library consisting of 246 mCherry-fused receptors. We envision several potential use cases of PPLseq. In general, the method directly reports key elements of library integrity, in particular missing sequences or sequence variations. As ORF-specific barcoding is not a part of the workflow, this method is – within the limitations imposed by sequence uniqueness – directly applicable to historic libraries, including pooled libraries. With the exception of ORF libraries of the propagation host ( E. coli ), the method is agnostic to the origin of the sequence elements. PPLseq will support generation of new libraries using high-throughput gene ‘shuffling’ methods, such as the Golden Gate or Gateway techniques. Despite the robustness of these methods, it may be advisable to validate newly generated libraries. Integration of PPLseq in library generation can be somewhat more complex than characterization of existing libraries as in some instances multiple clonal replicates of the same ORF are analyzed. We showed that this can be achieved through scalable sub-libraries, such as three sub-libraries in our application of the Golden Gate method. Finally, since many gene knock-in or knock-out techniques rely on gene fragments or plasmids containing genome homologous sequences, applications of PPLseq are not limited to overexpression screens. Assisted by PPLseq, we have generated, to the best of our knowledge, the largest GPCR-FP fusion library. Experimental characterization of GPCRs, e.g. their expression levels or sub-cellular localization, generally relies on well-by-well library arrays, despite recent successes in the development of pooled assays [ 53 , 54 ]. 
The GPCR fusion proteins were transiently expressed in HEK293 cells and their expression was visualized using confocal microscopy. Whereas our methodology is distinct from work using surface immunostaining and FACS in the same cell type [ 41 ], we consistently observed that many receptors express at low levels and a correlation of expression and receptor classes. GPCR expression profiles may be harnessed in future studies that go beyond the development of PPLseq that was the focus of this work. These studies may focus on identification of receptors with efficient membrane localization for structural studies [ 40 ], the generation of fluorescent sensors [ 55 , 56 ], or for incorporation in chimeric receptors [ 57 – 59 ]. We note that overexpression systems utilizing sequence-optimized genes do not recapitulate certain aspects of endogenous receptor expression, such as their dependence on DNA sequence features [ 41 ]. PPLseq is not without limitations. Knowledge of the expected sequences in the library is required for read mapping; thus, the method is not suited for analysis of libraries with unknown content. As PPLseq relies on short-read sequencing, it is limited when resolving splice variants, but these can be identified a priori through uniqueness analysis. Similarly, domains fused to ORF inserts are verified in vectors prior to ORF insertion. In future work, some of these limitations may be addressed through combinations with long-read sequencing methods [ 52 , 54 ]. Although an ever-growing complement of structural features of proteins and their complexes can be inferred from sequences, it is to date still in situ experiments that allow understanding protein function, molecular interactions, and cellular localization. Elucidating this complement of properties across entire superfamilies remains challenging and requires high-throughput approaches. 
We present a barcode-free sequencing workflow with single nucleotide accuracy that will support plasmid library characterization in a variety of contexts. Just like NGS has revolutionized our understanding of the genome, this accurate and accessible methodology sets the stage for the use of NGS in studies that analyze coding and non-coding sequences at scale using large and flexible libraries. Author contributions Conceptualization, R.T.S., H.J.; Funding Acquisition, H.J.; Methodology, R.T.S., K.H.C., J.F., S.M., G.B.G., H.J.; Project Administration, H.J.; Investigation, R.T.S., K.H.C., J.F., G.B.G., H.J.; Data Curation, R.T.S., K.H.C., G.B.G., H.J.; Supervision, H.J.; Visualization, R.T.S., K.H.C., G.B.G., H.J.; Writing - Original Draft, R.T.S., H.J.; Writing - Review and Editing, R.T.S., H.J. Declaration of interests The authors declare no competing interests. Materials and Methods Library curation ORF sequences were compiled from public repositories in August 2021 ( Supplementary Table S1 ). In the case of the GPCR library, curation was based on sequences deposited in the Addgene Inc. repository (Addgene Kit #1000000068). Of the available sequences, the third ‘depositor partial sequence’ column in the sequence dataset contained intact ORFs for all but 13 GPCRs. Sequences were considered intact if a start codon was present and if their lengths were indicative of intact codons (multiples of three nucleotides). For the remaining 13 sequences, incomplete terminal codons were manually corrected or entire sequences completed using sequencing information provided by Addgene Inc.. For the kinase and TF ORF library, sequence curation was based on sequences provided by the DNASU Plasmid Repository at the Biodesign Institute of the Arizona State University. 
These sequences were manually amended to correct reading frames and/or exclude stop codons (for clones in the ‘closed’ ‘fusion format’) or added terminal codons (for clones in the ‘open’ ‘fusion format’) ( Supplementary Table S1 ). Uniqueness analysis and deduplication of ORF libraries The kinase and TF libraries contained duplicate sequences (most commonly because in some cases the same gene was present in two library clones) that had to be removed prior to conducting a meaningful uniqueness analysis. Deduplication was performed with USEARCH (v11) using a fast variant of the UCLUST algorithm [ 60 ]. In the process of deduplication, input sequences were sorted by decreasing length and an identity threshold of 90% was applied. Cluster centroids were retained for further analysis. In the kinase and TF library, 135 and 44 sequences were removed, respectively. Unique ORF sequences are provided in Supplementary Data S1-S3 . To test for sequence uniqueness, ORF sequences were then concatenated into long strings separated with non-nucleotide characters. Strings were then searched 5’ to 3’ in forward direction and backward direction (in instances where forward searching did not report a match) using sliding windows of pre-defined length (50, 75, 150, or 300 b). Occurrence of the window sequence in another location of the string was registered in binary format. The number and length of non-unique sequence segments along with the corresponding ORF names was determined using a level crossing algorithm. Analysis was performed using macros written in C in Igor Pro (v6.22). For sequence pairs reported to contain non-unique segments, sequence overlap (using pairwise alignments), naming and presence of splice variants (using nucleotide BLAST searches and GeneCards) and internal repeats (using Tandem Repeats Finder v4.09 with default parameters [ 61 ]) were manually verified. Regrowth of GPCR collection The PRESTO-Tango GPCR collection was a kind gift from Bryan Roth and colleagues [ 39 ]. 
E. coli glycerol stocks were obtained from Addgene.org (Addgene Kit #1000000068) and replica-plated using sterile disposable 96-pin replicator plates (Ciro Manufacturing Corp.) onto single-well plates (242811, Thermo Fisher Scientific Inc.) containing LB-ampicillin agar. The plates were incubated overnight at 37°C and replica-plated the following day for imaging on a white light imager (Gel Logic 212 Pro, Carestream Health Inc.). Inoculation spots for each plate were then scraped in 5 ml of LB-ampicillin medium into sterile tubes for downstream plasmid purification using four mini-scale silica columns. Each plate resulted in an elution volume of >150 μl and DNA concentrations between 135 and 360 ng/μl. Validation of genes with incomplete coverage (Sanger sequencing) The library clones that should encode LPAR5 and SSTR1 yielded growing cultures ( Supplementary Figure S3 ) but either zero or only minimal read numbers limited to short sequence stretches could be mapped to these sequences ( Supplementary Figure S4 , Supplementary Data S4 ). To investigate this, clones were restreaked from the original GPCR library and a clonal culture was grown overnight at 37°C in 2 ml LB-ampicillin medium. Purified plasmids were analyzed using Sanger Sequencing (AGRF Ltd.) using forward and reverse primers (primers 1 and 2, Supplementary Table S7 ). Validation of genes with low reads and incomplete coverage (PCR) The library clones that should encode GLP2R , GPR1 , HTR1D , and RXFP3 did not yield growing cultures ( Supplementary Figure S3 ) and no reads could be mapped to these sequences ( Supplementary Figure S4 , Supplementary Data S4 ). We thus utilized PCR with gene-specific forward and reverse primers to test for their presence in the library pool (primers 12 to 19, Supplementary Table S7 ; TSHR served as a control, primers 20 to 21, Supplementary Table S7 ). Reactions contained 2.5 μl 10x Pfu polymerase buffer (B600003, Sangon Biotech (Shanghai) Co. 
Ltd.), template DNA (containing ∼30 pg target gene if present), 1 μl forward primer (10 μM stock concentration), 1 μl reverse primer (10 μM stock concentration), 0.5 μl dNTP mix (10 mM stock concentration; U151B, Promega Corp.), 0.5 μl Pfu polymerase (5 u) (B600003, Sangon Biotech (Shanghai) Co. Ltd.) in a total volume of 25 µl. PCR cycling parameters were 1. 2 minutes at 95°C, 2. 30 seconds at 95°C, 3. 30 seconds at 60°C, 4. 1 minute at 72°C, 5. Repeat steps 2 to 4 a further 23 times. Reactions were performed in 200 μl PCR tubes in a T100 thermal cycler (Bio-Rad Laboratories Inc.). NGS sample preparation NGS samples were prepared in low DNA binding tubes (0030108418, Eppendorf SE). Typically, >2 μg of purified plasmids were collected in a final volume of 30 μl. These pools were created by combining material isolated either from the four replica plates (original ORF collection) or from twelve individual cloning plates (plasmids created with the Golden Gate method). In the case of cloning plates, each sub-library pool contained up to one clone per ORF (see below). The amount of input material from each plate was chosen such that the number of clones on each plate were represented with equal amounts in the pool, except for sequencing of the initial GPCR library, where read numbers for ORFs with >2 reads on a partial plate were normalized in comparison and visualization. Libraries were prepared using TruSeq DNA Nano kit (20015964, Illumina Inc.) with 100 ng starting material and 350 bp fragment size. Libraries were sequenced on the NovaSeq6000 platform (Illumina Inc.) with a paired-end read length of 150 b. Sequencing was provided by JS LINK Inc. NGS analysis Tabulated ORF sequences including short extensions into the plasmid backbone (33 b each at the 5’ and 3’ end) served as reference sequences for NGS analysis. The reference sequence collection also included a plasmid backbone sequence for the mapping of non-ORF reads. 
Between 23 and 44 million filtered reads were processed for each of the NGS samples, and between 87 and 96% of reads were mapped to the ORF libraries (including plasmid backbones). Paired-end sequencing reads were filtered and aligned to reference sequences with Snippy (v4.6.0) [ 62 ] and BWA (v0.7.17-r1188) [ 63 ] to generate an alignment output file. Alignment statistics, including mapped read numbers, coverage, and sequencing depth, were extracted from the alignment output file using SAMtools (v1.18) [ 64 ]. Variant calling was performed with FreeBayes (v1.3.6) [ 65 ] with default parameters (minimum mapping quality ≥60, base quality ≥13, coverage ≥4, ALT-supporting reads ≥2, ALT allele fraction ≥5%) to identify single or multi-nucleotide polymorphisms and indels. Raw variant data from FreeBayes without further Snippy filtering were extracted using BCFtools (v1.19) [ 64 ]. Variant quality scores correspond to Phred-scaled probabilities that a polymorphism exists at this site. Final data were compiled in a spreadsheet where each gene was annotated with read numbers, depth and coverage. Variants were then reviewed to identify true deviations from the reference sequences defined as those with quality scores above the default Snippy threshold of 100 (see Main Text for details). Not specified are variants in the plasmid backbone. Variant validation (Sanger sequencing) Selected sequence variants identified by PPLseq were verified using Sanger sequencing also to support the quality score-based post-calling filtering ( Supplementary Table S5 ). Clones were prepared directly from the original GPCR library and a clonal culture was grown overnight at 37°C in 2 ml LB-ampicillin medium. Purified plasmids were analyzed using Sanger Sequencing (AGRF Ltd.) using primers 3 to 7 ( Supplementary Table S7 ). Vectors for Golden Gate method Golden Gate insertions were performed into a modified pcDNA 3.1(−) vector (V79520, Thermo Fisher Scientific Inc.). 
In this vector, six recognition sites for three Type IIS restriction enzymes (two sites for BsaI, one for BbsI, and three for SapI) were previously removed using site-directed mutagenesis [ 42 ]. In addition, the large section of the vector comprising the F1 origin of replication and aminoglycoside resistance cassette was removed to reduce vector size. Next, two receiver cassettes were designed and synthesized that contain (i) the FP mVenus [ 66 ] optimized for bacterial expression and under the control of a bacterial promoter (J23115, https://parts.igem.org/Part:BBa_J23115 ), (ii) SapI or BsmBI restriction sites flanking mVenus, and (iii) the FP mCherry positioned for C-terminal fusion to the inserted GPCR. mVenus served as a removable marker protein to identify colonies in which the Golden Gate reaction had proceeded. The cassettes were obtained as GBlock gene fragments (Integrated DNA Technologies Inc.) and inserted into the new vector via restriction enzyme cloning. The cassettes were first amplified by PCR using Pfu polymerase (B600003, Sangon Biotech (Shanghai) Co. Ltd.). The vector and cassettes were then digested with XbaI and HindIII restriction enzymes (R0145 and R0104, New England Biolabs Inc.) and ligated with T4 DNA Ligase (M180B, Promega Corp.). Sequences were verified using Sanger sequencing (AGRF Ltd.). Primers and PCRs for Golden Gate insertion A script was employed for automated primer design and screening of internal Type IIS restriction enzyme recognition sites in GPCR ORF sequences. ORF sequences were searched for recognition sites of the restriction enzymes SapI, BsmBI, BsaI, BbsI, BseRI, and PaqCI. The prevalence of recognition sites across the entire library was considered to determine an enzyme hierarchy so that the maximal number of sequences could be cloned with the smallest number of enzymes. This hierarchy was SapI > BsmBI, BsaI > BbsI > BseRI > PaqCI. 
Given the small number of GPCRs incompatible with either SapI or BsmBI (34 GPCRs), sequences within that group were excluded from the workflow. Primers were automatically designed with appropriate annealing sites specific to the start and end of each ORF with a melting temperature of >64°C. The primers contained flanking overhangs with the respective Type IIS sites for Golden Gate insertion and were synthesized in 96-well plates on the scale of 500 pmoles (Integrated DNA Technologies Inc.). The pooled GPCR library was used as the template for gene amplification. Reactions contained 2.5 μl 10x Pfu polymerase buffer (B600003, Sangon Biotech (Shanghai) Co. Ltd.), 1 μl pooled template DNA (10 ng), 1 μl forward primer (10 μM stock concentration), 1 μl reverse primer (10 μM stock concentration), 0.5 μl dNTP mix (10 mM stock concentration, U151B, Promega Corp.) and 0.5 μl Pfu polymerase (5 u) (B600003, Sangon Biotech (Shanghai) Co. Ltd.) in a total volume of 25 μl. PCR cycling parameters were 1. 2 minutes at 95°C, 2. 30 seconds at 95°C, 3. 30 seconds at 55°C, 4. 5 minutes at 72°C, 5. Repeat steps 2 to 4 a further 29 times, and 6. 10 minutes at 72°C. PCRs were performed in 96-well PCR plates (HSS9601, Bio-Rad Laboratories Inc.) in a T100 thermal cycler (Bio-Rad Laboratories Inc.) Golden Gate method Golden Gate reactions utilized SapI or BsmBI restriction enzymes (R0739 and R0569, New England Biolabs Inc.) depending on ORF enzyme compatibility. Reactions contained 2 μl T4 ligase buffer (C126A, Promega Corp.), 1 μl receiver plasmid (100 ng), 2 μl unpurified PCR product, 0.5 μl (5 u) restriction enzyme, and 1 μl (3 u) T4 Ligase (M180B, Promega Corp.) in a total volume of 20 μl. Reactions were performed in 96-well PCR plates (3441-00, Scientific Specialists Inc.). Reactions were incubated at room temperature (RT) for 30 minutes (SapI) or thermocycled (BsmBI) with the following parameters: 1. 2 minutes at 37°C, 2. 3 minutes at 16°C, 3. Repeat steps 1 to 2 a further 11 times, 4. 
5 minutes at 50°C, and 5. 10 minutes at 80°C in the T100 thermal cycler. In the next step, 0.2 μl of each reaction was then transformed into 20 μl NEB Turbo competent cells (C2984H, New England Biolabs Inc.) for 20 minutes on ice and plated on large-scale well plates containing LB-ampicillin to incubate overnight at 37°C. For each transformed Golden Gate reaction, three colonies were manually picked and grown individually in 200 μl LB-ampicillin media overnight at 37°C. In the next step, 50 μl of each overnight culture were obtained from the first colony of each Golden Gate reaction, pooled, and purified using standard silica columns for NGS. This was repeated for second and third colonies. At the same time and for each individual culture, 50 μl of each overnight culture was mixed with 25 μl of sterile glycerol (50%) and stored at −80°C in 96-well plates. Assembly of sequenced GPCR fusion protein library Successful insertion of the intact GPCR ORF into the plasmid was verified by PPLseq and defined as 100% sequence coverage with no variants (see Main Text for exceptions). Sequence-verified clones were grown starting from the glycerol stock. If more than one colony was sequence-verified, colonies with the highest number of reads or colonies from the sub-library with maximum validated ORFs were selected. For each colony, the glycerol stock was tip touched into 200 μl of LB-ampicillin media and incubated overnight at 37°C with shaking at 600 rpm. 50 μl from each well was mixed with 25 μl of sterile glycerol (50%) and stored at −80°C. The final glycerol stock was then transferred into 96-well deep well plates containing 1.5 ml LB-ampicillin media using the sterile disposable 96-pin replicator plates. The cultures were sealed with a breathable foil (BF-400-S, Axygen Inc.) and incubated overnight at 37°C with shaking at 600 rpm. Plasmids were purified using 96-well plasmid purification kits (B519196, Sangon Biotech (Shanghai) Co. Ltd.). 
The method followed the manufacturer’s recommendations with minor modifications: centrifugation steps were performed at 3,350 x g, bacteria were pelleted for 3 minutes, solutions were mixed by pipetting, and lysates were clarified for 40 minutes. Mammalian cell culture and transfection HEK293 cells were cultured in Dulbecco’s Modified Eagle’s Medium (DMEM) (1195073, Thermo Fisher Scientific Inc.) supplemented with 10% fetal bovine serum (FBS) (26140079, Thermo Fisher Scientific Inc.), 100 U ml⁻¹ penicillin and 0.1 mg ml⁻¹ streptomycin (15140122, Thermo Fisher Scientific Inc.) in a humidified incubator in 5% CO₂ atmosphere at 37°C. Reverse transfections utilizing poly-ethyleneimine (PEI) were performed as described previously [ 67 , 68 ] in 96-well PhenoPlates (6055302, Revvity Inc.) coated with poly-L-lysine (P8920, Sigma-Aldrich Corp.). For each receptor, 250 ng of DNA (in the library screen: 90% of plasmid without an inserted ORF and 10% ORF plasmid) was diluted to a total volume of 25 µl with pre-warmed Opti-MEM (OM) (31985070, Thermo Fisher Scientific Inc.). Additionally, for each receptor, 1 µg polyethyleneimine (PEI) was diluted to a total volume of 25 µl with prewarmed OM. The combined DNA-OM and PEI-OM solutions were incubated for 20 minutes at RT. The mixture was then added to 2 × 10⁴ cells in 100 µl antibiotic-free DMEM containing 5% FBS. Six hours post incubation, the original medium was replaced with 150 µl complete DMEM. Staining and imaging Cells were washed 48 hours after transfection with 100 μl prewarmed phosphate buffered saline (PBS) (20012050, Thermo Fisher Scientific Inc.). Cells were then incubated with 50 μl of 1x working concentration of Cell Navigator Cell Plasma Membrane Stain (AAT-22682, AAT BioQuest Inc) and 16 μM Hoechst 33342 (14533, Sigma-Aldrich Corp.) for 15 minutes at 37°C. Cells were then washed a further two times with 100 μl prewarmed PBS and fixed in 50 μl 3% paraformaldehyde (C0042, ProSciTech Pty Ltd.) 
for 20 minutes at RT. After fixation, cells were washed one more time and maintained in PBS at 4°C until imaging. Imaging was performed on an Operetta automated confocal microscope with Harmony (v4.1) image acquisition and image analysis software (Revvity Inc.). Confocal fluorescence images were obtained using excitation/emission wavelengths of 475 ± 15 / 525 ± 25 nm (Cell Navigator), 380 ± 20 / 445 ± 35 (Hoechst), and 570 ± 10 / 615 ± 25 (mCherry). In this analysis, 25 fields of view were recorded per well at a magnification of 20x with the confocal plane located through the cell midline. Cells transfected with plasmids lacking ORFs or left untransfected served as negative controls. Image analysis Raw images were imported into a dedicated local network environment and image analysis was performed in the Columbus (v2.5) image data storage and analysis system on an Omero (v4.0) and Acapella (v3.2) server (all from Revvity Inc.). In total, 16,100 images and 9.8 million individual cells were analyzed from 644 individual wells (25 fields of view per well). Of these, 2,400 images and 2.1 million cells were from positive and negative control wells, 12,300 images and 7.2 million single cells were from the GPCR-ORF library, and the remainder from further calibration and reference wells. The nucleus of individual cells was identified using nuclear counter staining Hoechst. Cells on the edges of the images were discarded. Following nuclei localization, cell bodies were segmented using the plasma membrane stain with the cytoplasm identification method “C” in Columbus. mCherry fluorescence intensity was measured in the whole cell as a normalized value to the cell area and this value was used to classify cells as transfected or not transfected. Cells with intensity values greater than the mean + 5 SD of the negative control wells (on a per plate basis) were considered transfected. 
Only wells with >100 transfected cells were included in the analysis (this applied to 238 ORFs in either of the biological replicates). GPCR ligand and structure analysis Receptor names were standardized by mapping HGNC symbols to corresponding IUPHAR identifiers using the Guide to Pharmacology (IUPHAR) database ( https://www.guidetopharmacology.org/GRAC/GPCRListForward?class=A ). Formatting artifacts, such as HTML tags and symbols were removed prior to downstream analysis. For each receptor, the following was retrieved from GPCRdb (human receptors only): receptor family classification, the number of available structures ( https://gpcrdb.org/structure/ ), and the number of deposited physiological ligands ( https://gpcrdb.org/ligand/physiological_ligands/ ). Duplicate receptor-ligand entries were removed. In cases where automated look-up failed (e.g., due to formatting or nomenclature differences), information was manually retrieved from GPCRdb. Acknowledgements We thank Josh Dubowsky and Michael Roach for helpful comments and Matthew Popp, Josh Dubowsky, and Daniel Carrillo Baez for assistance with experiments. This study was supported by grants of the Australian Research Council (FT200100519 and DP200102093, to H.J.) and the National Health and Medical Research Council (APP1187638, to H.J.). This research was supported by Flinders University through the Deep Thought High-Performance Computing environment. The Australian Regenerative Medicine Institute is supported by grants from the State Government of Victoria and the Australian Government. The EMBL Australia Partnership Laboratory (EMBL Australia) is supported by the National Collaborative Research Infrastructure Strategy (NCRIS) of the Australian Government. Imaging was performed in the CellScreen SA screening center at Flinders University. Funder Information Declared Australian Research Council , FT200100519 , DP200102093 National Health and Medical Research Council , APP1187638 References [1]. ↵ K. 
Tunyasuvunakool , J. Adler , Z. Wu , T. Green , M. Zielinski , A. Zidek , et al. , Highly accurate protein structure prediction for the human proteome , Nature . 596 ( 2021 ) 590 – 596 . OpenUrl CrossRef PubMed [2]. ↵ J. Jumper , R. Evans , A. Pritzel , T. Green , M. Figurnov , O. Ronneberger , et al. , Highly accurate protein structure prediction with AlphaFold , Nature . 596 ( 2021 ) 583 – 589 . OpenUrl CrossRef PubMed [3]. ↵ M. Baek , F. DiMaio , I. Anishchenko , J. Dauparas , S. Ovchinnikov , G.R. Lee , et al. , Accurate prediction of protein structures and interactions using a three-track neural network , Science . 373 ( 2021 ) 871 – 876 . OpenUrl Abstract / FREE Full Text [4]. ↵ N. DelRosso , J. Tycko , P. Suzuki , C. Andrews , Aradhana , A. Mukund , et al. , Large-scale mapping and mutagenesis of human transcriptional effector domains , Nature . 616 ( 2023 ) 365 – 372 . OpenUrl CrossRef PubMed [5]. ↵ M. Legut , Z. Gajic , M. Guarino , Z. Daniloski , J.A. Rahman , X. Xue , et al. , A genome-scale screen for synthetic drivers of T cell proliferation , Nature . 603 ( 2022 ) 728 – 735 . OpenUrl CrossRef PubMed [6]. O. Ursu , J.T. Neal , E. Shea , P.I. Thakore , L. Jerby-Arnon , L. Nguyen , et al. , Massively parallel phenotyping of coding variants in cancer with Perturb-seq , Nat Biotechnol . 40 ( 2022 ) 896 – 905 . OpenUrl CrossRef PubMed [7]. ↵ N. Alerasool , H. Leng , Z.Y. Lin , A.C. Gingras , M. Taipale , Identification and functional characterization of transcriptional activators in human cells , Mol Cell . 82 ( 2022 ) 677 – 695 e677. OpenUrl CrossRef PubMed [8]. ↵ L.M. Sack , T. Davoli , Q. Xu , M.Z. Li , S.J. Elledge , Sources of error in mammalian genetic screens , G3 (Bethesda) . 6 ( 2016 ) 2781 – 2790 . OpenUrl CrossRef [9]. D. Skalamera , M. Dahmer , A.S. Purdon , B.M. Wilson , M.V. Ranall , A. Blumenthal , et al. 
, Generation of a genome scale lentiviral vector library for EF1alpha promoter-driven expression of human ORFs and identification of human genes affecting viral titer , PLoS One . 7 ( 2012 ) e51733 . OpenUrl CrossRef PubMed [10]. ↵ X. Yang , J.S. Boehm , K. Salehi-Ashtiani , T. Hao , Y. Shen , R. Lubonja , et al. , A public genome-scale lentiviral expression library of human ORFs , Nat Methods . 8 ( 2011 ) 659 – 661 . OpenUrl CrossRef PubMed Web of Science [11]. G. Prelich , Gene overexpression: uses, mechanisms, and interpretation , Genetics . 190 ( 2012 ) 841 – 854 . OpenUrl Abstract / FREE Full Text [12]. ↵ J. Joung , S. Ma , T. Tay , K.R. Geiger-Schuller , P.C. Kirchgatterer , V.K. Verdine , et al. , A transcription factor atlas of directed differentiation , Cell . 186 ( 2023 ) 209 – 229 e226. OpenUrl CrossRef PubMed [13]. ↵ A.H.M. Ng , P. Khoshakhlagh , J.E. Rojo Arias , G. Pasquini , K. Wang , A. Swiersy , et al. , A comprehensive library of human transcription factors for cell fate engineering , Nat Biotechnol . 39 ( 2021 ) 510 – 519 . OpenUrl CrossRef PubMed [14]. ↵ P. Lamesch , N. Li , S. Milstein , C. Fan , T. Hao , G. Szabo , et al. , hORFeome v3.1: a resource of human open reading frames representing over 10,000 human genes , Genomics . 89 ( 2007 ) 307 – 315 . OpenUrl CrossRef PubMed Web of Science [15]. J. Liu , A.G. Bang , C. Kintner , A.P. Orth , S.K. Chanda , S. Ding , et al. , Identification of the Wnt signaling activator leucine-rich repeat in Flightless interaction protein 2 by a genome-wide functional analysis , Proc Natl Acad Sci U S A . 102 ( 2005 ) 1927 – 1932 . OpenUrl Abstract / FREE Full Text [16]. V. Iourgenko , W. Zhang , C. Mickanin , I. Daly , C. Jiang , J.M. Hexham , et al. , Identification of a family of cAMP response element-binding protein coactivators by genome-scale functional analysis in mammalian cells , Proc Natl Acad Sci U S A . 100 ( 2003 ) 12147 – 12152 . OpenUrl Abstract / FREE Full Text [17]. ↵ J. Bischof , E.M. 
Sheils , M. Bjorklund , K. Basler , Generation of a transgenic ORFeome library in Drosophila , Nat Protoc . 9 ( 2014 ) 1607 – 1620 . OpenUrl CrossRef PubMed [18]. Y. Liu , J.T. Kern , J.R. Walker , J.A. Johnson , P.G. Schultz , H. Luesch , A genomic screen for activators of the antioxidant response element , Proc Natl Acad Sci U S A . 104 ( 2007 ) 5205 – 5210 . OpenUrl Abstract / FREE Full Text [19]. ↵ S.K. Chanda , S. White , A.P. Orth , R. Reisdorph , L. Miraglia , R.S. Thomas , et al. , Genome-scale functional profiling of the mammalian AP-1 signaling pathway , Proc Natl Acad Sci U S A . 100 ( 2003 ) 12153 – 12158 . OpenUrl Abstract / FREE Full Text [20]. ↵ J. Tycko , N. DelRosso , G.T. Hess , Aradhana , A. Banerjee , A. Mukund , et al. , High-throughput discovery and characterization of human transcriptional effectors , Cell . 183 ( 2020 ) 2020 – 2035 e2016. OpenUrl CrossRef PubMed [21]. A. Hossain , E. Lopez , S.M. Halper , D.P. Cetnar , A.C. Reis , D. Strickland , et al. , Automated design of thousands of nonrepetitive parts for engineering stable genetic systems , Nat Biotechnol . 38 ( 2020 ) 1466 – 1475 . OpenUrl CrossRef PubMed [22]. J. van Arensbergen , V.D. FitzPatrick , M. de Haas , L. Pagie , J. Sluimer , H.J. Bussemaker , et al. , Genome-wide mapping of autonomous promoter activity in human cells , Nat Biotechnol . 35 ( 2017 ) 145 – 153 . OpenUrl CrossRef PubMed [23]. ↵ I. Liachko , R.A. Youngblood , U. Keich , M.J. Dunham , High-resolution mapping, characterization, and optimization of autonomously replicating sequences in yeast , Genome Res . 23 ( 2013 ) 698 – 704 . OpenUrl Abstract / FREE Full Text [24]. ↵ S.N. Chandrasekaran , E. Alix , J. Arevalo , A. Borowa , P.J. Byrne , W.G. Charles , et al. , Morphological map of under- and overexpression of genes in human cells , Nat Methods . 22 ( 2025 ) 1742 – 1752 . OpenUrl PubMed [25]. ↵ J. Poirson , H. Cho , A. Dhillon , S. Haider , A.Z. Imrit , M.H.Y. Lam , et al. 
, Proteome-scale discovery of protein degradation and stabilization effectors , Nature . 628 ( 2024 ) 878 – 886 . OpenUrl CrossRef PubMed [26]. C.D. Arnold , F. Nemcko , A.R. Woodfin , S. Wienerroither , A. Vlasova , A. Schleiffer , et al. , A high-throughput method to identify trans-activation domains within transcription factor sequences , Embo J . 37 ( 2018 ). [27]. ↵ L.M. Sack , T. Davoli , M.Z. Li , Y. Li , Q. Xu , K. Naxerova , et al. , Profound tissue specificity in proliferation control underlies cancer drivers and aneuploidy patterns , Cell . 173 ( 2018 ) 499 – 514 e423. OpenUrl CrossRef PubMed [28]. H. Zhang , X. Cao , M. Tang , G. Zhong , Y. Si , H. Li , et al. , A subcellular map of the human kinome , Elife . 10 ( 2021 ). [29]. G. Manning , D.B. Whyte , R. Martinez , T. Hunter , S. Sudarsanam , The protein kinase complement of the human genome , Science . 298 ( 2002 ) 1912 – 1934 . OpenUrl Abstract / FREE Full Text [30]. ↵ N.A.M. Mohany , A. Totti , K.R. Naylor , H. Janovjak , Microbial methionine transporters and biotechnological applications , Appl Microbiol Biotechnol . 105 ( 2021 ) 3919 – 3929 . OpenUrl [31]. L. Lin , S.W. Yee , R.B. Kim , K.M. Giacomini , SLC transporters as therapeutic targets: emerging opportunities , Nat Rev Drug Discov . 14 ( 2015 ) 543 – 560 . OpenUrl CrossRef PubMed [32]. ↵ D.O. Daley , M. Rapp , E. Granseth , K. Melen , D. Drew , G. von Heijne , Global topology analysis of the Escherichia coli inner membrane proteome , Science . 308 ( 2005 ) 1321 – 1323 . OpenUrl Abstract / FREE Full Text [33]. ↵ P.A. Insel , K. Sriram , M.W. Gorr , S.Z. Wiley , A. Michkov , C. Salmeron , et al. , GPCRomics: An approach to discover GPCR drug targets , Trends Pharmacol Sci . 40 ( 2019 ) 378 – 387 . OpenUrl CrossRef [34]. ↵ A.J. Kooistra , S. Mordalski , G. Pandy-Szekeres , M. Esguerra , A. Mamyrbekov , C. Munk , et al. , GPCRdb in 2021: integrating GPCR sequence, structure and function , Nucleic Acids Res . 49 ( 2021 ) D335 – D343 . 
OpenUrl CrossRef PubMed [35]. ↵ A.S. Hauser , S. Chavali , I. Masuho , L.J. Jahn , K.A. Martemyanov , D.E. Gloriam , et al. , Pharmacogenomics of GPCR drug targets , Cell . 172 ( 2018 ) 41 – 54 e19. OpenUrl CrossRef PubMed [36]. ↵ K. Sriram , P.A. Insel , G protein-coupled receptors as targets for approved drugs: How many targets and how many drugs? , Mol Pharmacol . 93 ( 2018 ) 251 – 258 . OpenUrl Abstract / FREE Full Text [37]. ↵ A. Inoue , F. Raimondi , F.M.N. Kadji , G. Singh , T. Kishi , A. Uwamizu , et al. , Illuminating G-protein-coupling selectivity of GPCRs , Cell . 177 ( 2019 ) 1933 – 1947 e1925. OpenUrl CrossRef PubMed [38]. ↵ C. Avet , A. Mancini , B. Breton , C. Le Gouill , A.S. Hauser , C. Normand , et al. , Effector membrane translocation biosensors reveal G protein and βarrestin coupling profiles of 100 therapeutically relevant GPCRs , Elife . 11 ( 2022 ). [39]. ↵ W.K. Kroeze , M.F. Sassano , X.P. Huang , K. Lansu , J.D. McCorvy , P.M. Giguere , et al. , PRESTO-Tango as an open-source resource for interrogation of the druggable human GPCRome , Nat Struct Mol Biol . 22 ( 2015 ) 362 – 369 . OpenUrl CrossRef PubMed [40]. ↵ X. Lv , J. Liu , Q. Shi , Q. Tan , D. Wu , J.J. Skinner , et al. , In vitro expression and analysis of the 826 human G protein-coupled receptors , Protein Cell . 7 ( 2016 ) 325 – 337 . OpenUrl CrossRef PubMed [41]. ↵ A. Tedman , M. Goel , S. Shah , M.K. Howard , L.M. Chamness , A. Bonifasi , et al. , Deep receptor scanning reveals general sequence constraints on GPCR biosynthesis , bioRxiv . ( 2025 ). [42]. ↵ M. Morri , I. Sanchez-Romero , A.M. Tichy , S. Kainrath , E.J. Gerrard , P.P. Hirschfeld , et al. , Optical functionalization of human Class A orphan G-protein-coupled receptors , Nat Commun . 9 ( 2018 ) 1950 . OpenUrl PubMed [43]. ↵ T. Derrien , J. Estelle , S. Marco Sola , D.G. Knowles , E. Raineri , R. Guigo , et al. , Fast computation and applications of genome mappability , PLoS One . 7 ( 2012 ) e30377 . 
OpenUrl CrossRef PubMed [44]. ↵ R. Koehler , H. Issac , N. Cloonan , S.M. Grimmond , The uniqueome: a mappability resource for short-tag sequencing , Bioinformatics . 27 ( 2011 ) 272 – 274 . OpenUrl CrossRef PubMed Web of Science [45]. ↵ P.M. Grimley , F. Dong , H. Rui , Stat5a and Stat5b: fraternal twins of signal transduction and transcriptional activation , Cytokine Growth Factor Rev . 10 ( 1999 ) 131 – 157 . OpenUrl CrossRef PubMed Web of Science [46]. ↵ J. Kim , A. Bergmann , L. Stubbs , Exon sharing of a novel human zinc-finger gene, ZIM2, and paternally expressed gene 3 (PEG3) , Genomics . 64 ( 2000 ) 114 – 118 . OpenUrl CrossRef PubMed Web of Science [47]. ↵ R.W. Johnstone , J.A. Kerry , J.A. Trapani , The human interferon-inducible protein, IFI 16, is a repressor of transcription , J Biol Chem . 273 ( 1998 ) 17172 – 17177 . OpenUrl Abstract / FREE Full Text [48]. ↵ C. Engler , R. Kandzia , S. Marillonnet , A one pot, one step, precision cloning method with high throughput capability , PLoS One . 3 ( 2008 ) e3647 . OpenUrl CrossRef PubMed [49]. ↵ Y. Chaudhari , T.C. Cairns , Y. Sidhu , V. Attah , G. Thomas , M. Csukai , et al. , The Zymoseptoria tritici ORFeome: A functional genomics community resource , Mol Plant Microbe Interact . 32 ( 2019 ) 1564 – 1570 . OpenUrl PubMed [50]. H. Gingras , K. Patron , A. Bhattacharya , P. Leprohon , M. Ouellette , Gain- and loss-of-function screens coupled to next-generation sequencing for antibiotic mode of action and resistance studies in Streptococcus pneumoniae , Antimicrob Agents Chemother . 63 ( 2019 ). [51]. ↵ E. Gazanion , C. Fernandez-Prada , B. Papadopoulou , P. Leprohon , M. Ouellette , Cos-Seq for high-throughput identification of drug target and resistance mechanisms in the protozoan parasite Leishmania , Proc Natl Acad Sci U S A . 113 ( 2016 ) E3012 – 3021 . OpenUrl Abstract / FREE Full Text [52]. ↵ M. Uematsu , J.M. 
Baskin , Barcode-free multiplex plasmid sequencing using Bayesian analysis and nanopore sequencing , Elife . 12 ( 2025 ). [53]. ↵ E.M. Jones , N.B. Lubock , A.J. Venkatakrishnan , J. Wang , A.M. Tseng , J.M. Paggi , et al. , Structural and functional characterization of G protein-coupled receptors with deep mutational scanning , Elife . 9 ( 2020 ). [54]. ↵ T.L. Mighell , B. Lehner , The genetic architecture of G-protein coupled receptor signaling , bioRxiv . ( 2025 ). [55]. ↵ V.L. Rohner , P.J. Lamothe-Molina , T. Patriarchi , Engineering, applications, and future perspectives of GPCR-based genetically encoded fluorescent indicators for neuromodulators , J Neurochem . 168 ( 2024 ) 163 – 184 . OpenUrl CrossRef PubMed [56]. ↵ Y. Yang , B. Li , Y. Li , Genetically encoded sensors for the in vivo detection of neurochemical dynamics , Annu Rev Anal Chem (Palo Alto Calif) . 17 ( 2024 ) 367 – 392 . OpenUrl PubMed [57]. ↵ F. Zhou , A.M. Tichy , B.N. Imambocus , S. Sakharwade , F.J. Rodriguez Jimenez , M. Gonzalez Martinez , et al. , Optimized design and in vivo application of optogenetically functionalized Drosophila dopamine receptors , Nat Commun . 14 ( 2023 ) 8434 . OpenUrl CrossRef PubMed [58]. A.M. Tichy , W.L. So , E.J. Gerrard , H. Janovjak , Structure-guided optimization of light-activated chimeric G-protein-coupled receptors , Structure . 30 ( 2022 ) 1075 – 1087 e1074. OpenUrl CrossRef [59]. ↵ R.D. Airan , K.R. Thompson , L.E. Fenno , H. Bernstein , K. Deisseroth , Temporally precise in vivo control of intracellular signalling , Nature . 458 ( 2009 ) 1025 – 1029 . OpenUrl CrossRef PubMed Web of Science [60]. ↵ R.C. Edgar , Search and clustering orders of magnitude faster than BLAST , Bioinformatics . 26 ( 2010 ) 2460 – 2461 . OpenUrl CrossRef PubMed Web of Science [61]. ↵ G. Benson , Tandem repeats finder: a program to analyze DNA sequences , Nucleic Acids Res . 27 ( 1999 ) 573 – 580 . OpenUrl CrossRef PubMed Web of Science [62]. ↵ T. 
Seemann , Snippy: rapid haploid variant calling and core SNP phylogeny . ( 2015 ). [63]. ↵ H. Li , Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv:1303.3997v2 ( 2013 ). [64]. ↵ P. Danecek , J.K. Bonfield , J. Liddle , J. Marshall , V. Ohan , M.O. Pollard , et al. , Twelve years of SAMtools and BCFtools , Gigascience . 10 ( 2021 ). [65]. ↵ E. Garrison , G. Marth , Haplotype-based variant detection from short-read sequencing. arXiv:1207.3907 ( 2012 ). [66]. ↵ G.J. Kremers , J. Goedhart , E.B. van Munster , T.W. Gadella , Jr. , Cyan and yellow super fluorescent proteins with improved brightness, protein folding, and FRET Förster radius , Biochemistry . 45 ( 2006 ) 6570 – 6580 . OpenUrl CrossRef PubMed Web of Science [67]. ↵ J. Dietler , R. Schubert , T.G.A. Krafft , S. Meiler , S. Kainrath , F. Richter , et al. , A light-oxygen-voltage receptor integrates light and temperature , J Mol Biol . 433 ( 2021 ) 167107 . OpenUrl PubMed [68]. ↵ A.M. Tichy , E.J. Gerrard , J.M.D. Legrand , R.M. Hobbs , H. Janovjak , Engineering strategy and vector library for the rapid generation of modular light-controlled protein-protein interactions , J Mol Biol . 431 ( 2019 ) 3046 – 3055 . OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted October 21, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Gene library deep sequencing for protein super-family profiling Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Gene library deep sequencing for protein super-family profiling Rahkesh T Sabapathy , Kara Henry-Cocks , Jackson Feng , Shashikanth Marri , Gustavo Bracho Granado , Harald Janovjak bioRxiv 2025.10.20.682913; doi: https://doi.org/10.1101/2025.10.20.682913 Share This Article: Copy Citation Tools Gene library deep sequencing for protein super-family profiling Rahkesh T Sabapathy , Kara Henry-Cocks , Jackson Feng , Shashikanth Marri , Gustavo Bracho Granado , Harald Janovjak bioRxiv 2025.10.20.682913; doi: https://doi.org/10.1101/2025.10.20.682913 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Synthetic Biology Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17633) Bioengineering (13856) Bioinformatics (41841) Biophysics (21399) Cancer Biology (18529) Cell Biology (25422) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24282) Genetics (15582) Genomics (22462) Immunology (17700) Microbiology (40295) Molecular Biology (17140) Neuroscience (88419) Paleontology (666) Pathology (2823) Pharmacology and Toxicology (4813) Physiology (7632) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.