Classification of Human Transcription Factors Based on Their Effector Domains via Unsupervised Learning

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 86,420 characters · extracted from preprint-html · click to expand
Classification of Human Transcription Factors Based on Their Effector Domains via Unsupervised Learning | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Classification of Human Transcription Factors Based on Their Effector Domains via Unsupervised Learning Eduardo Ayala , Ayush Gupta , Nehil Shreyash , Arvind Ramanathan , Gül H. Zerze doi: https://doi.org/10.1101/2025.10.26.684687 Eduardo Ayala 1 William A. 
Brookshire Department of Chemical and Biomolecular Engineering, University of Houston , Houston, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ayush Gupta 1 William A. Brookshire Department of Chemical and Biomolecular Engineering, University of Houston , Houston, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nehil Shreyash 1 William A. Brookshire Department of Chemical and Biomolecular Engineering, University of Houston , Houston, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Arvind Ramanathan 2 Argonne National Laboratory , Lemont, IL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gül H. Zerze 1 William A. Brookshire Department of Chemical and Biomolecular Engineering, University of Houston , Houston, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: gzerze{at}uh.edu Abstract Full Text Info/History Metrics Supplementary material Preview PDF SUMMARY TFs combine DBDs, which anchor them to DNA, with EDs that regulate transcription through activation or repression, yet the sequence logic linking ED composition to function remains unclear. Here, we systematically define proxy regions —disordered segments adjacent to DBDs—to enable quantitative analysis of ED-like sequences across the human TF repertoire. Using a biophysically interpretable 22-feature classifier (FALK22) together with an embedding-based model (ESM), we map ED diversity and identify composition and charge-pattern signatures that correspond to regulatory activity along a disorder continuum, separating activation-from repression-associated regions. FALK22 identified classes align well with those identified from ESM while providing transparent, sequence-level features. 
Proxy regions near C-termini exhibit gradients that track DBD families, suggesting that EDs and DBDs might have co-evolved rather than evolved independently. These results establish proxy regions and FALK22 as a framework to connect sequence features with transcriptional activity and to generate testable hypotheses about effector-domain function and co-evolution with DNA-binding domains. HIGHLIGHTS We define proxy regions as systematically identified disordered segments adjacent to DNA-binding domains (DBD), enabling quantitative analysis of effector domain (ED)-like sequences across the human transcription factor (TF) repertoire. We develop FALK22, a 22-feature classification algorithm that classifies transcription factors based on simple sequence properties of their EDs and shows strong alignment with complex embedding-based representations from the Evolutionary Scale Model (ESM). FALK22 and ESM uncover distinct amino-acid composition and patterning signatures of EDs that correlate with transcriptional function, separating activation- and repression-associated regions along a disorder continuum. Proxy regions located at the C-termini exhibit gradients that correspond to their DBD families, suggesting that EDs did not evolve as independent modular units but rather co-evolved with, or became selectively matched to, their DBD contexts. INTRODUCTION Transcription factors (TF) are key regulatory proteins that control gene expression by binding to specific DNA elements, such as enhancers and promoters, and recruiting the transcriptional machinery necessary to activate a gene 1 - 5 . Their proper function is essential for maintaining cellular homeostasis, and mutations or dysregulation of TFs (e.g., through overexpression or loss-of-function) are associated with a wide range of diseases 6 - 8 . Most TFs contain at least two functionally distinct regions: one or more DNA binding domains (DBD) and effector domains (ED) 9 , 10 . 
EDs mediate interactions between TFs and other proteins involved in transcriptional regulation, such as coactivators or corepressors. Although DBDs may be partially or fully disordered before binding, they typically adopt a defined structure upon engaging their target DNA (and binding partners) 11 - 13 , enabling sequence- and structure-specific interactions. Similarly, EDs—whether functioning as activation or repression domains—often lack stable secondary or tertiary structure in isolation. Some activation domains do acquire defined structures upon binding to their partners 14 - 16 , but their folding behavior tends to be more variable and highly context-dependent. This structural flexibility supports their ability to interact with a broad range of targets and enables diverse regulatory outcomes. Studies have shown that many transcriptional regulators, including transcription factors, coactivators, the mediator complex, and other chromatin remodelers, interact through multivalent, weak interactions (without requiring structure) that drive liquid-liquid phase separation (LLPS), forming dynamic, membrane-less compartments 17 - 21 . While TFs are traditionally classified based on their DBDs, whose conserved sequence and recognizable folds enable reliable annotation across species 11 - 13 , 22 , 23 , ED classification remains underexplored, despite EDs playing critical roles in transcriptional regulation 24 , 25 . Historically, ED classification has been largely confined to a small number of well-characterized repression domains, such as KRAB, SCAN, and POZ domains, found primarily in specific zinc finger transcription factor families 26 - 29 . These domain annotations capture a relatively narrow subset of ED diversity and are often insufficient for functional categorization across the full TF repertoire. 
More recently, tools for predicting EDs or inferring activation potential have emerged 30 - 32 , but many rely on datasets derived from yeast 33 , limiting their applicability to humans due to proteomic divergence across species, tissues, and cell types 34 - 36 . Soto et al. 37 assembled a compendium of EDs from human TFs based on manual curation of experimental studies. While this resource provides broad coverage and functional annotations, a systematic classification of EDs—and of TFs based on their EDs—using sequence-derived features remained lacking. To address the lack of sequence-based classification strategies for EDs, here we introduced a 22-feature classification algorithm, Fractions of Amino acids, sequence Length, and Kappa (a measure of sequence charge patterning) 38 , FALK22. Prior studies have linked activation domains to glutamine-, proline-, and acidic-rich regions 39 , 40 , with recent work demonstrating a correlation between acidic residue content and transcriptional potency 41 . To uncover insights about the role of other amino acids, we incorporated the full amino acid composition into FALK22. Because boundaries of EDs are often ambiguous, we first systematically defined proxy regions (PRs), which are disordered sequence segments adjacent to DBDs, and applied this classifier to PRs, allowing consistent large-scale comparison of ED-like sequences across human TFs. To complement the FALK22 approach, we also performed a separate classification using embeddings from the Evolutionary Scale Model (ESM), a state-of-the-art protein language model trained on massive sequence databases 42 . These embeddings capture deep contextual relationships between residues but are computationally intensive and less interpretable than FALK22. Both methods were applied to PRs and full-length HTF sequences, enabling a direct comparison between the classification schemes. 
While the two approaches revealed partially overlapping classes, FALK22, despite its simplicity, achieved clearer segregation of ED types and outperformed ESM in recovering known motif-level distinctions. RESULTS 0.1 Decomposition of human transcription factors (HTFs) into analyzable sequence subsets Our search with the keyword “transcription factor”, filtered for human proteins using the most recent UniProt release 43 , identified only 1,465 HTFs. Similarly, previous HTF datasets 34 , 44 , 45 report fewer than 1,600 proteins. The data set constructed by Lambert et al . 46 remains the most comprehensive publicly available resource for HTFs, containing 2,765 entries. We started with this data set and excluded entries marked as lacking “transcription factor activity, sequence-specific DNA binding” as well as those not annotated with “evidence at protein level” ( Figure 1f ), leaving 1,632 entries. Using the same UniProt release 43 , we then extracted the corresponding protein sequences and annotations. Of these, 802 proteins have only one reported isoform ( Figure 1g ). For entries with multiple isoforms, we retained only the main isoform, as other isoforms could lack transcriptional activity or DNA-binding capability 47 - 51 . This procedure defined our “full-sequence” data set. Download figure Open in new tab Figure 1. Sequence characterization of HTFs. Examples of the different HTF sequence arrangements: (a) single centrally located DBD with variable N- and C-terminus size, (b) multiple DBDs connected by short sequences identified as linkers. If shorter than 50 amino acids, linkers are not considered as proxy effector domain regions, (c) TFs with a single DBD located at the C-terminus, and (d) N-terminus. (e) A set of 142 TFs contains two DBD regions connected by a long aa segment. (f) Distribution of different existence evidence levels of HTFs. (g) Distribution of the number of isoforms per HTF. 
(h-k) Distribution of sequence-related metrics for the parameters κ (h), net charge (i), normalized hydropathy score (j), and DS (k) for the DBD ( green ), PR1 ( blue ), and PR2 ( red ) regions. We next analyzed the DBD annotations to approximate ED boundaries and build the corresponding ED datasets. UniProt contains “automatic”, “by similarity”, “ProRule”, and publication-based DBD annotations. We excluded entries without DBD annotations, yielding 1,588 HTFs with at least one annotated DBD. We used these annotations to define the boundaries of the EDs analyzed in our classification scheme. Representative DBD boundary schematics are shown in Figures 1a-e . Since the precise locations of EDs are often unknown, we identified proxy regions (PRs) as approximations of the regions where activation (or repression) domains could reside. These PRs correspond to extended disordered segments beyond the DBDs, located primarily at the N- or C-terminus ( Figures 1a-d ) or within long non-DBD linkers ( Figure 1e ). For each HTF, our aim was to identify at most 2 PRs. Approximately 57% (954 HTFs) contain more than one DBD (e.g., Figure 1b ). When two DBDs are separated by linkers shorter than 50 amino acids (aa), linkers were not considered part of any PR. However, 142 HTFs contain longer linkers ( > 50 aa) between two DBDs ( Figure 1e ). Such longer linkers could stabilize chromatin and indirectly regulate transcription by mediating specific protein-protein interactions or by bridging between distant enhancer/promoter regions (if permitted by steric hindrance) 37 , 52 ; therefore, we included them as a proxy region. We found that for all TFs, the disordered C-terminus tail (if present) is consistently longer than any linker segment and hence was designated PR1. Any significantly long linker fragment was designated PR2 if it is at least 50 residues longer than the N-terminal tail; otherwise, the N-terminal tail was assigned as PR2. 
Using these criteria, we compiled 1,361 sequences in the PR1 database (C-termini) and 1,529 sequences in the PR2 database (linkers or N-termini). The compiled full-length, PR1, PR2, and DBD databases are provided as Data S1. 0.2 Hydropathy, charge, and disorder profiles of DBDs and PRs Intrinsically disordered proteins (IDP) and normally foldable (globular) proteins are traditionally separated on a pseudophase diagram proposed by Uversky et al . 53 , 54 , defined by mean net charge (MNC) and mean hydropathy score 55 , commonly known as the Uversky plot (Figures S2a, S2d). However, this classical approach placed 45% and 43% of the PR1 and PR2 sequences, respectively, in the ordered region (Figures S2a and S2c). This can also be seen in Figure 1j , which shows that the hydropathy distributions for the PRs were centered very close to those for DBDs. The MNC-hydropathy features generate an overly dense space that lacks clear segregation, leading to a poor clustering performance. As an alternative to the hydropathy index, we calculated the disorder score (DS) introduced by Emenecker et al. 56 , which is a deep-learning predictor trained on consensus annotations from multiple methods to yield a normalized per-residue probability of disorder. It provides a more robust scale for cross-protein comparisons since it integrates diverse sources of experimental and computational information. DS values for PR1 and PR2 are centered around 0.9 ( Figure 1k ), confirming their predominantly disordered character and placing them and DBDs at the opposite ends of the spectrum. To further contextualize these regions relative to other sequence types, we compared their average sequence properties—including hydropathy, net charge, DS, and individual amino acid fractions—to those of full-length TFs, their DBDs, and the average human proteome (Figure S1). 
This comparison reveals that DBDs are distinctly enriched in positively charged amino acids, cysteine, and tyrosine, whereas PRs are enriched in acidic residues, other polar amino acids, and prolines. Applying HDBSCAN 57 , a density-based clustering technique, to MNC-hydropathy Uversky space resulted in > 50% of proteins classified as noise in every trial (Figures S2b and S2d). A related representation proposed by Das et al. 58 , which plots the fractions of positive ( f + ) and negative ( f − ) residues, produced a similarly dense distribution (Figure S3) and an even higher proportion of noise points. These results confirm that classical two-feature charge–hydropathy spaces cannot effectively separate DBDs and proxy regions. Numerous studies have extended such pseudophase diagrams to include conformational states of IDPs as additional functions of charge patterning 38 . These sequence-based approaches have also been used as guides in both experimental 59 , 60 and computational 61 studies of IDPs. Since neither of these classical approaches provided adequate sequence differentiation, we next explored whether further features could better distinguish DBDs and PRs. We found that DBDs are slightly positively charged on average ( Figure 1i ), consistent with predictions of an overall positive charge on DBDs 62 . Furthermore, we found that the EDs are predominantly negatively charged ( Figure 1i ), which is consistent with the fact that acidic amino acids are common in the activation domains, which are considered to regulate the binding of DBD and DNA (e.g., being electrostatically repulsive) 63 . Beyond the net charge, the spatial arrangement of the charged residues—quantified by the charge patterning parameter κ 38 —also serves as a differentiating factor. κ ranges from 0 to 1, where low values correspond to well-mixed opposite charges, while values near 1 indicate segregated charge blocks. 
For both DBDs and proxy regions, the κ values are centered around 0.16 ( Figure 1h ), where PR1 has a broader distribution of κ compared to PR2. In FALK22, we also used κ as an informative descriptor. Analysis of PR lengths revealed that 278 of the HTFs have PR1 segments shorter than 10 aa, while 66 have PR2 segments below this threshold (Figure S4a). Based on the calculated average DS per residue (Figure S4b), which drops near 10 aa and crosses the disorder-order threshold of 0.5, such short segments are unlikely to form stable motifs 64 and are too short to engage in extended intermolecular contacts. This observation further motivated the inclusion of sequence length as a key descriptor in FALK22. Collectively, these analyses prompted the development of a broader yet still minimal feature set, FALK22, for systematic classification as presented in the following section. 0.3 Feature representation for PRs as surrogates for EDs EDs derive their activity from sequence composition and disorder rather than well-defined structure. Building on evidence that activation domains often rely on aromatic, leucine, and acidic residues 30 , 41 , 65 , 66 , we first tested whether similar compositional and charge-based descriptors could distinguish the PRs identified in HTFs. We began with a seven-feature representation (F7) that captured the minimal physicochemical signatures previously linked to activation and repression domains: mean hydropathy, fractions of isoleucine, proline, and glutamine, fractions of positively ( f + ) and negatively ( f − ) charged residues, and the mean net charge per residue ($\mathrm{MNC} = |f_{+} - f_{-}|$). UMAP projections (UMAP being a dimensionality reduction technique) of this space revealed no consistent gradients with respect to disorder score (DS), mean hydropathy, or MNC for either PR1 or PR2 (Figures S5a–S5l). 
Motif-enrichment analyses using FIMO 67 and STREME 68 identified dispersed motifs (Figures S5c,f,i,l), consistent with the absence of well-defined subgroups, in contrast to structured motif clusters within the KRAB, SCAN, and BTB/POZ families 27 . To incorporate additional sequence-level information neglected in F7, we added the charge-patterning parameter κ 38 and sequence length—both we hypothesized to influence conformational and phase-separation behavior—forming a nine-feature representation (F9). The resulting projections (Figures S5m–x) showed only subtle shifts in cluster density, indicating that κ and length alone do not significantly improve separation. These findings implied that ED-like regions cannot be characterized solely by a few canonical amino-acid enrichments or by overall charge metrics. We therefore constructed a more comprehensive 22-feature space, FALK22 (Fractions of Amino acids, sequence Length, and Kappa), encompassing all amino-acid fractions, κ , and sequence length. FALK22 captured clear gradients of disorder across PR1 and PR2 ( Figures 2a, 2e ), (as well as full-length sequence, Figure 2i ) with PR2 exhibiting broader DS variability, consistent with its broader DS distribution ( Figure 1k ). Importantly, FALK22 also resolved distinct family-level organization when colored by DBD identity ( Figures 2b, 2f ). Among the largest families ( Figure 2q ), the Nuclear Receptor (NR) and Forkhead families were clearly segregated on PR1 projection ( Figure 2b , the coloring of the projection is as bar colors in Figure 2q ), whereas C2H2 Zinc-Finger (ZF) and Homeodomain families were clearly segregated on PR2 projection ( Figure 2f ). Download figure Open in new tab Figure 2. Two-dimensional projections of high dimensional FALK22 and ESM-based features, colored by the two properties that showed strongest gradients. 
Two-dimensional projections for PR1 and PR2 of FALK22 and ESM colored with respect to (a, c, e, g, i, k, m, o) DS and (b, d, f, h, j, l, n, p) DBD family. The DBD family color scheme for the transcription factors is shown in the size distribution plot (q). This family coloring scheme is kept consistent throughout all figures wherever DBD family based coloring is applied. ESM projections were reduced to 2 dimensions for comparison purposes. PRs spanned the entire disorder range, consistent with their functional plasticity. NR family EDs clustered toward the more ordered region (low DS region)—matching their structured activation domains known to stabilize DNA–protein interactions ( Figures 2a and 2b ). Similarly, the C2H2-ZF family populated mostly the low DS region in PR2 projections ( Figures 2e and 2f ). These gradients and family-specific arrangements demonstrate that FALK22 captures biophysical diversity that mirrors both transcriptional function and DBD-ED evolutionary context. Comparable trends were also observed when the same features were applied to the full-length HTF sequences (FS) and their isolated DBD segments, both of which exhibited weaker versions of the same disorder and DBD family gradients ( Figures 2i, 2j, 2m , and 2n ). These consistent relationships across domain-level and full-protein projections indicate that the physicochemical principles encoded in FALK22 generalize beyond the proxy-region datasets, capturing hierarchical organization from individual domains to entire TFs. The same gradients strengthened going from FALK22 to a context-aware language model ( Figures 2k, 2l, 2o , and 2p ), as will be discussed in the next subsection. Gradients on further features will also be discussed in the next subsection ( Figure 3 ). Download figure Open in new tab Figure 3. 
Two-dimensional projections of high dimensional FALK22 and ESM-based features for PR1 and PR2, colored by four other properties. (a-h) Two-dimensional projections for PR1 and PR2 of FALK22 and (i-p) ESM. (a, c, i, k) Colored projections with respect to the hydropathy score, (b, d, j, l) MNC, with the corresponding colorscales below the panels. (e, m, g, o) FIMO and (f, n, h, p) STREME sequence motifs (wherever successfully found) distribution over the sequence space. The color key for the motifs is not shown for simplicity. But the assignment of the same color indicate presence of shared motifs. While FALK22 provides an interpretable, composition-based description of sequence space, it does not explicitly encode positional context (positions of aa with respect to each other) or higher-order correlations between residues—features that may underlie motif co-occurrence, cooperative binding, and post-translational modification patterns. To evaluate the extent of the importance of contextual information, in the next subsection, we compared FALK22 to a transformer-based, context-aware language model trained on millions of natural protein sequences. 0.4 Comparison of FALK22 with a large language model Context-aware large language model (LLM) embeddings have achieved remarkable success in predicting post-translational modifications (PTMs), 69 structural similarities, 70 and more recently, complete 3D structure prediction. 42 We expect such representations to implicitly inform about the protein’s regulatory partners (ligands, coactivators, repressors, etc.) in the case of HTFs. To benchmark FALK22 against such deep-learning LLM representations, we compared it with token embeddings from the latest version of the Evolutionary Scale Model (ESM-2) 42 , a transformer model trained on ∼65 million UniRef sequences 71 . 
For both PR1 and PR2, two-dimensional UMAP projections of ESM embeddings ( Figures 2c, 2d, 2g , and 2h ) produced denser, less separated spaces than FALK22 but retained the same global gradient in disorder score (DS) ( Figures 2a and 2c ; as well as 2e and 2g ). When colored by DBD family, ESM reproduced the major groupings observed in FALK22—especially the compact nuclear-receptor (NR) cluster in PR1 ( Figures 2b and 2d , pale green family) and the dispersed C2H2-ZF family in PR2 ( Figures 2f and 2h , dark red family). Strikingly, these very high-dimensional embeddings generated by an LLM trained on 65 million protein sequences produced the same gradients as the simple 22-descriptor FALK22. This finding suggests that family-specific biophysical trends are compositionally encoded rather than being contextual. We repeated the feature mapping for FS and isolated DBD segments. FS maps showed stronger gradients of both DS and DBD families on reduced dimensionality (UMAP) projections ( Figures 2k and 2l ) with ESM, where the NR family matched to low DS at the top left corner, C2H2 ZF family also matched to low DS at the bottom right corner, and homeodomain family matched to high DS at the top left corner. Importantly, these consistent relationships across domain-level and full-protein projections reinforce the same conclusion that similar physicochemical information is encoded within these regions, suggesting a possible co-evolution of DBDs and EDs. Quantitative analysis of family proximity (Figure S6) revealed that within-family distances are smaller for PR1 than PR2 across both models, indicating that C-terminal EDs (PR1) are more conserved in their physicochemical signatures. PR2 sequences, typically at N-termini (Figure S7), showed greater dispersion, consistent with their diverse roles in protein–protein interaction networks. 
These differences imply that while EDs share no sequence homology, their disorder and charge patterns preserve family-specific physicochemical fingerprints, supporting the notion that EDs co-evolved with DBDs. In addition to the DS and DBD family gradients, we also examined whether these feature projections show gradients on other properties ( Figure 3 ). In FALK22, PR1 and PR2 embeddings show subtle gradients with respect to hydropathy and MNC—two properties known to influence intrinsic disorder 72 —whereas these gradients are absent in ESM ( Figures 3a–d vs 3i–l ). The existence of such gradients in FALK22 implies that it captures additional axes of physicochemical variability, potentially linked to differences in phase-separation or condensate-forming propensities 73 . We also tested whether motif-level organization was preserved by mapping FIMO- and STREME-identified motifs onto each feature space. FALK22 revealed dispersed motif distributions for PR1 and more compact, domain-like enrichment for PR2 ( Figures 3e–h ), whereas ESM embeddings showed weaker motif segregation ( Figures 3m–p ). These results demonstrate that despite its simplicity, FALK22 can recover both compositional and some contextual organization comparable to that of transformer-based embeddings. To determine how similar or different the representations from FALK22 and ESM are, we also performed a feature space alignment in three dimensions after normalization of the ESM embeddings and FALK22. We used Umeyama’s algorithm 74 , which uses singular value decomposition to find the optimal rotation matrix and translation vector that minimizes the sum of squared distances between the two spaces. The aligned spaces in Figure 4 show that for both PRs, the FALK22 space largely differs in shape and distribution when compared to the ESM embeddings. The PR1 and PR2 projection alignments have an average RMSD of 0.6025 and 0.3626 ( Figures 4b and 4c ), respectively. 
This made us question the capability of ESM to represent the PRs, since the ESM is trained on complete sequences (i.e., not fragments) and contains separate tokens that define the start and end of the sequence, potentially limiting its utility to study complete proteins only. This type of influence may degrade the representation for any of the PRs, as none of those sequences is complete by itself. To address this hypothesis, we also derived the reduced features for the FS dataset using FALK22 and ESM and projected them in 3D space following the alignment procedure defined above ( Figure 4a ). Strikingly, the aligned representations have a very similar shape, with a smaller RMSD of 0.3152 for the FS dataset. Download figure Open in new tab Figure 4. Alignment of FALK22 and ESM embeddings in the three-dimensional space. FALK22 ( red ) and ESM embeddings ( blue ) for a) FS (RMSD: 0.3152), b) PR1 (RMSD: 0.6025) and c) PR2 (RMSD: 0.3626). We note that we found this large resemblance and the close alignment only for the ESM model that was trained with 150M parameters. Increasing ESM size to 650M or 3B parameters drastically altered embedding geometry and reduced motif segregation (Figure S8, Table S2), echoing known performance saturation effects in large language models 75 . This analysis shows that going above 150M parameters is not only more computationally expensive but also has poorer performance. By contrast, the 22-feature FALK22 model maintains stable geometry, interpretable axes, and minimal computational cost while producing comparable clustering and family-level organization. Having optimized this interpretable feature space, we next applied clustering analyses to classify human TFs according to their effector-domain properties. 0.5 ED based classification of HTFs We next applied HDBSCAN to classify the HTFs based on their FALK22-derived PR features. 
Parameter optimization (Figure S9) minimized noise and produced seven primary PR1 clusters and twelve primary PR2 clusters initially, which were then further refined via subsequent clustering. After the first round of clustering, the PR1 dataset splits into 7 clusters with moderately low noise of 27% ( Figure 5a , left panel). Mutual-information analysis ( Figure 5a , right panel) identified sequence length, proline, and alanine fractions as the dominant discriminants for the largest clusters of PR1, with subsequent refinement revealing the influence of methionine and cysteine fractions ( Figures 5b to 5e, right panels), as well. Download figure Open in new tab Figure 5. Clustering of the 1,361 elements in PR1 with its corresponding mutual information plot. a) Initial set split into 7 clusters with 36, 40, 48, 114, 152, 295, and 309 sequences. Projections included the subsequent clustering of the 309 (b), 295 (c), 152 (d), and 114 (e). Further clustering steps are excluded for visual clarity. Noise points are not included in the projections. Normalized averages per amino acid fraction (f) and identical amino acid spacing (g) for the obtained clusters ( black ) and global averages ( red ). The PR2 dataset initially splits into 12 clusters with moderate noise of 35% ( Figure 6a , left panel). For PR2, the alanine, proline, tryptophan, and tyrosine fractions, together with sequence length, appear to be key differentiators for the generated clusters ( Figures 6a to 6d , right panels). Download figure Open in new tab Figure 6. Clustering of the 1,529 elements in PR2 with its corresponding mutual information plot. a) Initial set splitting into 12 clusters. b) Subsequent clustering was applied only to the largest cluster (472 sequences). The first three rounds of subsequent clustering are shown in b) to d). Further clustering steps are excluded for visual clarity. Noise points are not included in the projections. 
Normalized averages per amino acid fraction (e) and identical amino acid spacing (f) for the obtained clusters ( black ) and global averages ( red ). Further amino acid composition and spacing analyses ( Figures 5f and 5g ; 6e and 6f ) revealed contrasting enrichment trends between PR1 and PR2: PR1 clusters were enriched in alanine, glycine, proline, leucine, glutamic acid, and serine, and depleted in aromatic residues and other hydrophobic amino acids, suggesting roles in flexible scaffolding and coactivator recruitment. PR2 clusters, by contrast, were distinctly enriched in hydrophobic amino acids, including phenylalanine and tryptophan, residues known to enhance activation 41 , and showed low conservation across most amino acids. Importantly, the spacing distributions of the amino acids that dominate PR2 are very narrow, whereas the spacing distributions of amino acids within PR1 are very wide, i.e., in PR1, amino acids of the same type are not located particularly close to one another or in a repeating pattern. This is important because these closely located amino acids within PR2 can make sticker-like patches within the sequences that can form weak multivalent interactions. This suggests the possibility of such patches within the N-terminal portions of HTFs. Full-sequence (FS) clustering (Figure S10) largely mirrored PR1 behavior, indicating that C-terminal effector segments dominate full-length compositional behavior. 0.6 Applying FALK22 to known activation and repression domains To test whether FALK22 captures functional polarity within EDs, we projected the curated dataset of activation and repression domains 37 into the FALK22-based reduced feature space. A clear gradient emerged from highly disordered (activation-dominated) to more ordered (repression-dominated) regions ( Figures 7a and 7b ). Strikingly, the effector domains from C2H2 ZF and NR family TFs, once again, clustered together near the lower-DS end ( Figure 7c ). 
This pattern reinforces our major finding that EDs and DBDs possibly co-evolved, instead of evolving as modular entities. This pattern also indicates that FALK22 can separate EDs not only by DBD lineage but also by functional tendency toward activation or repression, reinforcing that disorder and composition jointly encode regulatory polarity. Download figure Open in new tab Figure 7. UMAP projections of the FALK22-processed known activation, repression and bi-functional domains (curated by Soto et al. 37 ). The UMAP projections are colored according to a) DS, b) domain type [activation domain ( red ), repression domain ( purple ) and bi-functional domain ( green )], and c) DBD family [follows the same color code as in Figure 2q ]. DISCUSSION The systematic classification of HTFs based on their EDs represents a significant methodological advance that opens new frontiers in understanding transcriptional regulation. The first novelty in this work is to approximate EDs as non-DBDs of HTFs. This is significant because transcriptional activation or repression is a complicated process that does not follow a singular mechanism. Our vision here was based on the fact that any part of the TF chain that does not bind to DNA would be available for further interactions that can facilitate activation or repression. Then the FALK22 framework that we have developed in this work successfully revealed previously unrecognized patterns in amino acid composition and charge distribution that underlie the functional diversity of effector domains, establishing a foundation that bridges computational classification with emerging therapeutic paradigms and mechanistic insights. The identification of a large number of distinct classes for both PR1 and PR2 demonstrates remarkable functional diversity within HTFs. 
The comparable performance of FALK22 relative to ESM embeddings in motif segregation and domain classification is particularly striking, given the computational simplicity of our biophysically-informed feature space relative to transformer-based approaches. This finding challenges the prevailing notion that complex deep learning models invariably outperform simpler, interpretable approaches in biological sequence analysis. The degradation of ESM performance with increasing model size—contrary to expectations based on structure prediction benchmarks—reveals important limitations in applying protein language models to sequence fragments and disordered regions. This observation has significant implications for the field, as it suggests that domain-specific approaches like FALK22 may be more appropriate for analyzing intrinsically disordered proteins than general-purpose language models trained on complete protein sequences. Our analysis reveals fundamental differences between PR1 (C-terminal) and PR2 (mostly N-terminal) effector domains that provide crucial insights into transcription factor evolution and function. The conservation of amino acid composition in PR1 regions across diverse DBD families, coupled with enrichment in proline, leucine, glutamic acid, and serine, suggests these domains have evolved specialized roles in stabilizing transcriptional machinery and mediating promiscuous protein-protein interactions. This compositional signature aligns with the emerging understanding that C-terminal effector domains may serve as universal platforms for recruiting coactivators and affecting transcription. The high compositional diversity observed in PR2 regions, with their enrichment in aromatic residues like phenylalanine and tryptophan, points to evolutionary pressure to develop context-specific activation mechanisms, potentially through π - π stacking interactions and unique binding interfaces that confer target gene specificity. 
The narrow distribution of amino acids enriched in PR2 suggests the presence of sticker-like patches in PR2 that promote, for example, condensate formation. Our findings indeed provide crucial insights into the rapidly expanding field of transcriptional condensate biology. The charge patterning features captured by the κ parameter in FALK22, combined with the amino acid compositional differences we identified, directly relate to the phase separation propensities that drive biomolecular condensate formation. The prevalence of disorder-promoting residues and specific charge distributions in both PR1 and PR2 regions suggests that many human transcription factors possess the biophysical properties necessary for liquid-liquid phase separation. This connection is particularly relevant given recent advances demonstrating how transcriptional condensates contribute to gene regulation through the selective partitioning of transcription factors and coactivators. 76 Future studies integrating our classification system with condensate biology could reveal how different effector domain classes contribute to the formation, composition, and function of transcriptional hubs. Importantly, ED-like domains, approximated as PR1 and PR2, showed DBD family-specific manifold gradients, e.g., C2H2 ZF and NR separated themselves from others consistently. This is striking because the PR1 and PR2 databases completely exclude DBDs. No information regarding DBDs was fed to FALK22. Yet regardless of the feature identification technique (whether it is token embeddings from ESM or simple features from FALK22), the ED-populated gradients segregated based on DBD families, which were also correlated with disorder score. This finding suggests co-evolution of EDs with their DBD counterparts, aligning with their possibly shared regulatory logic. 
The classification framework we present also has significant implications for the development of next-generation therapeutics targeting transcriptional dysregulation. The identification of distinct effector domain classes provides a roadmap for developing selective inhibitors that target specific transcription factor subfamilies. This precision approach could overcome the longstanding challenges associated with targeting the traditionally “undruggable” intrinsically disordered regions of HTFs. Recent breakthroughs in targeting intrinsically disordered proteins, exemplified by clinical compounds like ralaniten for androgen receptor effector domains and emerging condensate-modifying drugs (c-mods), demonstrate the therapeutic potential of this approach. 77 , 78 Our classification system could accelerate similar drug discovery efforts by identifying effector domains with similar biophysical properties and suggesting common druggable features across HTF families. The development of novel approaches for targeting intrinsically disordered regions, including the recently described “logos” system that uses AI to design binders for flexible protein regions, 79 provides a technological foundation for translating our classification insights into therapeutic applications. The compositional and charge distribution patterns revealed by FALK22 could guide the design of such binders, enabling the development of highly selective transcription factor modulators. The integration of our classification system with high-resolution structural studies, such as cryo-electron microscopy and NMR spectroscopy, represents a critical next step for understanding the mechanistic basis of the compositional patterns we identified. Multi-omics integration approaches that combine our sequence-based classifications with transcriptomic, proteomic, and metabolomic data could provide systems-level insights into how effector domain diversity contributes to cell-type-specific gene expression programs. 
Such integrative analyses could reveal the regulatory logic underlying cellular identity and differentiation programs. As the field moves toward more integrative and mechanistic approaches to studying gene regulation, the foundation provided by our effector domain classification system will serve as a valuable resource for the research community. The simple yet powerful framework we have established bridges the gap between sequence-level features and systems-level function, providing a roadmap for future investigations into the complex world of transcriptional regulation. Limitations of the study While our classification framework represents a significant advance in understanding effector domain diversity, several important limitations highlight critical directions for future research and methodological development that will be essential for realizing the full potential of this approach. The reliance on existing database annotations for defining DBD boundaries introduces inherent uncertainties in effector domain delineation. The fact that only 1,588 of 1,632 HTFs have sufficient DBD annotations reflects a fundamental challenge in the field, i.e., the need for comprehensive experimental characterization of HTF domains. This limitation is particularly significant given that 57% of HTFs contain multiple DBDs, creating complex domain arrangements that our proxy region approach may not fully capture. The exclusion of peptide fragments below 10 amino acids, while justified by secondary structure formation thresholds, may overlook functionally important short linear motifs that contribute to transcriptional regulation. Recent advances in understanding the role of short disordered regions in protein-protein interactions and phase separation suggest that future iterations of our approach should incorporate methods for analyzing these shorter sequences, potentially through specialized algorithms designed for motif discovery in disordered regions. 
Additionally, our feature space, while comprehensive in terms of amino acid composition, lacks information about higher-order sequence patterns and long-range correlations that may be important for effector domain function. Furthermore, many effector domains undergo post-translational modifications that can dramatically alter their functional properties, and our static sequence-based approach may not account for these dynamic changes. The role of cofactors, chromatin context, cellular localization, and tissue-specific expression patterns in modulating effector domain function cannot be captured by our current approach. Our methodology offers the opportunity to study the temporal and spatial resolution needed to describe the dynamics of gene expression regulation including the formation of phase-separated condensates, 80 signaling, 81 ligand binding, 82 post-translational modifications (PTMs), 83 , 84 homo-, heterodimerization, 85 and nuclear transport 86 which mainly take place across the ED. Future studies integrating proteomic data on post-translational modifications and cellular environment could significantly enhance the biological relevance of our classification system. By acknowledging these limitations and embracing the opportunities they represent, the scientific community can build upon our foundational work to develop increasingly sophisticated and biologically relevant models of transcriptional regulation. The future of the field lies in integrative approaches that combine computational prediction, experimental validation, and systems-level understanding to unlock the full complexity of gene regulation in health and disease. Our classification framework provides a crucial starting point for this endeavor, offering a bridge between sequence-level features and functional outcomes that can guide future investigations and therapeutic developments. 
Author contributions Conceptualization, G.H.Z.; methodology, E.A., A.G., and A.R.; investigation, E.A., A.G., and N.S.; writing – original draft, E.A., A.G., and G.H.Z.; writing – review & editing, E.A., A.G., and A.R.; funding acquisition, G.H.Z.; resources, G.H.Z. and A.R.; supervision, G.H.Z. STAR METHODS Method details 1.0.1 Datasets The dataset of HTFs was extracted from the work of Lambert et al . 46 . We constructed a comprehensive dataset of 1,632 human transcription factors (TFs) and their corresponding 1,885 isoforms from the UniProtKB 2024 − 2 release 43 along with their annotations. To identify the effector domains (ED), we removed the DBD segments from the main isoform sequence (identified throughout this work as full sequence (FS)). Since many TFs have multiple DBDs and DBDs are often located in non-terminal parts of the sequence, the EDs approximated by simple removal of DBDs would be discontinuous, and therefore, can’t be used for classification. Hence, we identify all the continuous fragments remaining after the removal of known DBDs and consider the two longest of them, which we refer to as Proxy Region 1 (PR1) and Proxy Region 2 (PR2), as representatives of the ED. PR1 consisted only of the C-terminal sequences, while PR2 could be either the N-terminal segment or the longest sequence sandwiched between DBD sequences, only if it is at least 50 residues longer than the N-terminal fragment ( Figure 1e ). This process yielded sets of 1,588 EDs, 1,584 PR1s, and 1,566 PR2s. We classified the PR1 and PR2 sequences separately. 1.0.2 Feature space The sequence properties κ and disorder score (DS) were calculated using CIDER 87 and Metapredict 56 , respectively. The normalized and averaged hydropathy scores of chains were calculated using the per-amino-acid scores assigned by Kyte and Doolittle 55 . All other parameters were calculated at physiological pH (7.4) using Biopython. 
88 The parameters $f_{+}$, $f_{-}$, and $|f_{+} - f_{-}|$ are the fraction of basic amino acids, the fraction of acidic amino acids, and the mean net charge (MNC) (also known as absolute net charge per residue (NCPR)), respectively. After optimization of different feature spaces, we hypothesized that a total of 22 physically-interpretable features, i.e., amino acid fractions, sequence length, and κ, can sufficiently differentiate and classify the different sequences for the classification task. 89 - 91 Because of the high dimensionality of the feature space, we implemented feature scaling and subsequent dimensionality reduction across the feature dataset to enhance computational efficiency during the clustering phase. We employed Uniform Manifold Approximation and Projection (UMAP) 92 for this purpose, which is known for preserving global, local, and hierarchical structures within the data, thereby maintaining the relationships between data points. The optimal number of UMAP components was determined by evaluating the ‘trustworthiness’ score function available in scikit-learn 93 , where we iterated the number of UMAP components from 2 to 10. The selection was based on the first plateau observed in the plot of trustworthiness versus the number of UMAP components. Before applying UMAP, the physical features were normalized using the StandardScaler function from scikit-learn to ensure uniform scaling. As an alternative feature representation, we also used token embeddings from the Evolutionary Scale Model (ESM) 42 , which is a fully contextualized language model pre-trained on the pre-clustered UniRef protein datasets. The ESM embeddings were obtained from the last layer representation using different model sizes varying from 8M to 3B parameters. We applied zero padding to the sequences and mean pooling to obtain the final embeddings. Representation comparisons across the manuscript were done against the 150M model size. 
1.0.3 Clustering Within the clustering algorithms, density-based algorithms offer the advantage of identifying arbitrary-shaped clusters in the data, without requiring the user to define the expected number of clusters beforehand. 94 We applied Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) which can identify non-linear relations between data points and segregate them based on the hyperparameters. 57 Among HDBSCAN’s tunable parameters, the minimum cluster size (MCS) and minimum sample size (MSS) exert the greatest influence on clustering outcomes. We systematically varied both parameters from 2 to 30 (Figure S9) and assessed the resulting clusterings using two complementary criteria: (i) the Density-Based Clustering Validation (DBCV) score, 95 which indicates the separation of the resulting clusters (higher values indicate better-defined clusters), and (ii) the noise fraction, representing the proportion of sequences not assigned to any cluster. While maximizing DBCV score is desirable, the highest scores often coincided with trivial outcomes, such as a single dense cluster or solutions dominated by noise, resulting in poor biological interpretability. Therefore, we selected the parameter combination that achieved a balanced trade-off: a moderate DBCV score, an acceptable noise fraction, and a meaningful number of clusters. To further homogenize the resulting clusters, we applied successive independent HDBSCAN clustering to any cluster containing more than 100 sequences. Reclustering continued until each subcluster contained fewer than 100 sequences or the overall noise fraction surpassed 0.3. At every step, we computed the mutual information between cluster identity and individual features using the mutual_info_classif function of scikit-learn 93 . The resulting mutual-information values quantify how strongly each feature contributes to the classification; higher values indicate greater influence on cluster assignment. 
Additional resources Cell.com homepage: https://www.cell.com Templates for Cell Press authors: https://www.cell.com/templates Acknowledgments GHZ is a Cancer Prevention and Research Institute of Texas (CPRIT) scholar in cancer research and supported by CPRIT-RR220008, the Welch Foundation (Award E-2221 and Catalyst Center for Advanced Bioactive Materials Crystallization Award V-E-0001), and NSF CBET-2442006 (CAREER). The simulations presented in this work were performed using the computational resources provided by the Hewlett-Packard Enterprise Data Science Institute at the University of Houston. The authors thank Melissa Unlu, Preethi Kakarla, and Sandeep Reddy Kukunuru for their contributions in curating the HTFs datasets, and Heng Ma and the Argonne National Laboratory facilities for obtaining the ESM embeddings. Funder Information Declared Cancer Prevention and Research Institute of Texas , RR220008 Welch Foundation, https://ror.org/00np6vq88 , E-2221 , V-E-001 U.S. National Science Foundation, https://ror.org/021nxhr62 , CBET- 2442006 Footnotes ↵ 3 Senior author ↵ 4 Lead contact References 1. ↵ Reményi , A. , Schöler , H. R. , and Wilmanns , M. ( 2004 ). Combinatorial control of gene expression . Nature Structural & Molecular Biology 11 , 812 – 815 . https://www.nature.com/articles/nsmb820 . doi: 10.1038/nsmb820 . OpenUrl CrossRef PubMed Web of Science 2. Spitz , F. , and Furlong , E. E. ( 2012 ). Transcription factors: from enhancer binding to developmental control . Nature reviews genetics 13 , 613 – 626 . OpenUrl CrossRef PubMed 3. Ptashne , M. , and Gann , A. Genes & signals vol. 402 . Cold Spring Harbor Laboratory Press Cold Spring Harbor , NY : ( 2002 ). 4. Schramm , L. , and Hernandez , N. ( 2002 ). Recruitment of rna polymerase iii to its target promoters . Genes & development 16 , 2593 – 2620 . OpenUrl FREE Full Text 5. ↵ Hsu , H.-T. , Chen , H.-M. , Yang , Z. , Wang , J. , Lee , N. , Burger , A. , Zaret , K. , Liu , T. , Levine , E. , and Mango , S. 
( 2015 ). Recruitment of rna polymerase ii by the pioneer transcription factor pha-4 . Science 348 , 1372 – 1376 . OpenUrl Abstract / FREE Full Text 6. ↵ Villard , J. ( 2004 ). Transcription regulation and human diseases . Swiss medical weekly 134 , 571 – 571 . OpenUrl PubMed 7. Jin , W. , Qazi , T. J. , Quan , Z. , Li , N. , and Qing , H. ( 2019 ). Dysregulation of transcription factors: a key culprit behind neurodegenerative disorders . The Neuroscientist 25 , 548 – 565 . OpenUrl PubMed 8. ↵ Lee , T. I. , and Young , R. A. ( 2013 ). Transcriptional regulation and its misregulation in disease . Cell 152 , 1237 – 1251 . OpenUrl CrossRef PubMed Web of Science 9. ↵ Staller , M. V. , Ramirez , E. , Kotha , S. R. , Holehouse , A. S. , Pappu , R. V. , and Cohen , B. A. ( 2022 ). Directed mutational scanning reveals a balance between acidic and hydrophobic residues in strong human activation domains . Cell systems 13 , 334 – 345 . OpenUrl PubMed 10. ↵ Frankel , A. D. , and Kim , P. S. ( 1991 ). Modular structure of transcription factors: implications for gene regulation . Cell 65 , 717 – 719 . OpenUrl CrossRef PubMed Web of Science 11. ↵ Garza , A. S. , Khan , S. H. , Moure , C. M. , Edwards , D. P. , and Kumar , R. ( 2011 ). Bindingfolding induced regulation of af1 transactivation domain of the glucocorticoid receptor by a cofactor that binds to its dna binding domain . PLoS One 6 , e25875 . OpenUrl CrossRef PubMed 12. Dyson , H. J. ( 2012 ). Roles of intrinsic disorder in protein-nucleic acid interactions . Molecular bioSystems 8 , 97 – 104 . OpenUrl PubMed 13. ↵ Kuravsky , M. , Kelly , C. , Redfield , C. , and Shammas , S. L. ( 2024 ). The transition state for coupled folding and binding of a disordered DNA binding domain resembles the unbound state . Nucleic Acids Research ( 2002 – 2024 ). https://academic.oup.com/nar/advance-article/doi/10.1093/nar/gkae794/7771570 . doi: 10.1093/nar/gkae794 . OpenUrl CrossRef PubMed 14. ↵ Wright , P. E. , and Dyson , H. J. ( 2015 ). 
Intrinsically disordered proteins in cellular signalling and regulation . Nature reviews Molecular cell biology 16 , 18 – 29 . OpenUrl CrossRef PubMed 15. Wright , P. E. , and Dyson , H. J. ( 1999 ). Intrinsically unstructured proteins: re-assessing the protein structure-function paradigm . Journal of molecular biology 293 , 321 – 331 . OpenUrl CrossRef PubMed Web of Science 16. ↵ Shammas , S. L. , Travis , A. J. , and Clarke , J. ( 2014 ). Allostery within a transcription coactivator is predominantly mediated through dissociation rate constants . Proceedings of the National Academy of Sciences 111 , 12055 – 12060 . OpenUrl Abstract / FREE Full Text 17. ↵ Sabari , B. R. , Dall’Agnese , A. , Boija , A. , Klein , I. A. , Coffey , E. L. , Shrinivas , K. , Abraham , B. J. , Hannett , N. M. , Zamudio , A. V. , Manteiga , J. C. et al. ( 2018 ). Coactivator condensation at super-enhancers links phase separation and gene control . Science 361 , eaar3958 . OpenUrl Abstract / FREE Full Text 18. Chong , S. , Dugast-Darzacq , C. , Liu , Z. , Dong , P. , Dailey , G. M. , Cattoglio , C. , Heckert , A. , Banala , S. , Lavis , L. , Darzacq , X. et al. ( 2018 ). Imaging dynamic and selective lowcomplexity domain interactions that control gene transcription . Science 361 , eaar2555 . OpenUrl Abstract / FREE Full Text 19. Sharp , P. A. , Chakraborty , A. K. , Henninger , J. E. , and Young , R. A. ( 2022 ). Rna in formation and regulation of transcriptional condensates . RNA 28 , 52 – 57 . http://rnajournal.cshlp.org/content/28/1/52.full http://rnajournal.cshlp.org/content/28/1/52 http://rnajournal.cshlp.org/content/28/1/52.abstract . doi: 10.1261/RNA.078997.121 . OpenUrl Abstract / FREE Full Text 20. Henninger , J. E. , Oksuz , O. , Shrinivas , K. , Cissé , I. I. , Chakraborty , A. K. , and Young , R. A. ( 2021 ). 
Rna-mediated feedback control of transcriptional condensates charge balance of electrostatic interactions can account for rna feedback regulation in brief during the early steps of transcription initiation, nascent rnas stimulate transcriptional condensate formation, whereas the burst of rnas produced during elongation stimulates condensate dissolution . Cell 184 , 207 – 225 .e24. https://doi.org/10.1016/j.cell.2020.11.030. doi: 10.1016/j.cell.2020.11.030 . OpenUrl CrossRef PubMed 21. ↵ Boija , A. , Klein , I. A. , Sabari , B. R. , Dall’Agnese , A. , Coffey , E. L. , Zamudio , A. V. , Li , C. H. , Shrinivas , K. , Manteiga , J. C. , Hannett , N. M. , Abraham , B. J. , Afeyan , L. K. , Guo , Y. E. , Rimel , J. K. , Fant , C. B. , Schuijers , J. , Lee , T. I. , Taatjes , D. J. , and Young , R. A. ( 2018 ). Transcription factors activate genes through the phase-separation capacity of their activation domains . Cell 175 , 1842 – 1855 .e16. http://dx.doi.org/10.1016/j.cell.2018.10.042. doi: 10.1016/j.cell.2018.10.042 . OpenUrl CrossRef PubMed 22. ↵ Yan , J. , and Kurgan , L. ( 2017 ). DRNApred, fast sequence-based method that accurately predicts and discriminates DNA-and RNA-binding residues . Nucleic Acids Research 45 , 1 – 16 . doi: 10.1093/nar/gkx059 . OpenUrl CrossRef PubMed 23. ↵ Kim , G. B. , Gao , Y. , Palsson , B. O. , and Lee , S. Y. ( 2021 ). DeepTFactor: A deep learning-based tool for the prediction of transcription factors . Proceedings of the National Academy of Sciences 118 , 1 – 5 . https://pnas.org/doi/full/10.1073/pnas.2021171118 . doi: 10.1073/pnas.2021171118 . OpenUrl CrossRef PubMed 24. ↵ Qian , Z. , Cai , Y. D. , and Li , Y. ( 2006 ). Automatic transcription factor classifier based on functional domain composition . Biochemical and Biophysical Research Communications 347 , 141 – 144 . doi: 10.1016/j.bbrc.2006.06.060 . OpenUrl CrossRef PubMed Web of Science 25. ↵ Wingender , E. ( 2013 ). 
Criteria for an updated classification of human transcription factor DNA-binding domains . Journal of bioinformatics and computational biology 11 , 1340007 . https://www.worldscientific.com/doi/abs/10.1142/S0219720013400076 http://www.ncbi.nlm.nih.gov/pubmed/23427989 . doi: 10.1142/S0219720013400076 . OpenUrl CrossRef PubMed 26. ↵ Taylor , B. L. , and Zhulin , I. B. ( 1999 ). PAS Domains: Internal Sensors of Oxygen, Redox Potential, and Light . Microbiology and Molecular Biology Reviews 63 , 479 – 506 . https://journals.asm.org/doi/10.1128/MMBR.63.2.479-506.1999 . doi: 10.1128/MMBR.63.2.479-506.1999 . OpenUrl Abstract / FREE Full Text 27. ↵ Collins , T. , Stone , J. R. , and Williams , A. J. ( 2001 ). All in the Family: the BTB/POZ, KRAB, and SCAN Domains . Molecular and Cellular Biology 21 , 3609 – 3615 . https://www.tandfonline.com/doi/full/10.1128/MCB.21.11.3609-3615.2001 . doi: 10.1128/MCB.21.11.3609-3615.2001 . OpenUrl FREE Full Text 28. Schumacher , C. , Wang , H. , Honer , C. , Ding , W. , Koehn , J. , Lawrence , Q. , Coulis , C. M. , Wang , L. L. , Ballinger , D. , Bowen , B. R. , and Wagner , S. ( 2000 ). The SCAN domain mediates selective oligomerization . Journal of Biological Chemistry 275 , 17173 – 17179 . http://dx.doi.org/10.1074/jbc.M000119200. doi: 10.1074/jbc.M000119200 . OpenUrl Abstract / FREE Full Text 29. ↵ Boyer , L. A. , Latek , R. R. , and Peterson , C. L. ( 2004 ). The SANT domain: a unique histonetail-binding module? Nature Reviews Molecular Cell Biology 5 , 158 – 163 . https://www.nature.com/articles/nrm1314 . doi: 10.1038/nrm1314 . OpenUrl CrossRef PubMed Web of Science 30. ↵ Erijman , A. , Kozlowski , L. , Sohrabi-Jahromi , S. , Fishburn , J. , Warfield , L. , Schreiber , J. , Noble , W. S. , Söding , J. , and Hahn , S. ( 2020 ). A High-Throughput Screen for Transcription Activation Domains Reveals Their Sequence Features and Permits Prediction by Deep Learning . Molecular Cell 78 , 890 – 902 . doi: 10.1016/j.molcel.2020.04.020 . 
OpenUrl CrossRef PubMed 31. Klaus , L. , de Almeida , B. P. , Vlasova , A. , Nemčko , F. , Schleiffer , A. , Bergauer , K. , Hofbauer , L. , Rath , M. , and Stark , A. ( 2023 ). Systematic identification and characterization of repressive domains in Drosophila transcription factors . The EMBO Journal 42 , 1 – 22 . https://www.embopress.org/doi/10.15252/embj.2022112100 . doi: 10.15252/embj.2022112100 . OpenUrl CrossRef 32. ↵ Farheen , F. , Broyles , B. K. , Zhang , Y. , Ibtehaz , N. , Erkine , A. M. , and Kihara , D. ( 2024 ). Predicting transcriptional activation domain function using Graph Neural Networks . bioRxiv : the preprint server for biology ( 1 – 30 ). http://www.ncbi.nlm.nih.gov/pubmed/38766093 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC11100744 . doi: 10.1101/2024.05.08.593266 . OpenUrl Abstract / FREE Full Text 33. ↵ Merhej , J. , Frigo , A. , Le Crom , S. , Camadro , J.-M. , Devaux , F. , and Lelandais , G. ( 2014 ). bPeaks: a bioinformatics tool to detect transcription factor binding sites from ChIPseq data in yeasts and other organisms with small genomes . Yeast 31 , 375 – 391 . https://onlinelibrary.wiley.com/doi/10.1002/yea.3031 . doi: 10.1002/yea.3031 . OpenUrl CrossRef PubMed 34. ↵ Göös , H. , Kinnunen , M. , Salokas , K. , Tan , Z. , Liu , X. , Yadav , L. , Zhang , Q. , Wei , G. H. , and Varjosalo , M. ( 2022 ). Human transcription factor protein interaction networks . Nature Communications 13 . doi: 10.1038/s41467-022-28341-5 . OpenUrl CrossRef PubMed 35. Holland , P. , Bergenholm , D. , Börlin , C. S. , Liu , G. , and Nielsen , J. ( 2019 ). Predictive models of eukaryotic transcriptional regulation reveals changes in transcription factor roles and promoter usage between metabolic conditions . Nucleic Acids Research 47 , 4986 – 5000 . https://academic.oup.com/nar/article/47/10/4986/5446249 . doi: 10.1093/nar/gkz253 . OpenUrl CrossRef PubMed 36. ↵ Petrenko , N. , and Struhl , K. ( 2021 ). 
Comparison of transcriptional initiation by RNA polymerase II across eukaryotic species . eLife 10 , 1 – 23 . https://elifesciences.org/articles/67964 . doi: 10.7554/eLife.67964 . OpenUrl CrossRef 37. ↵ Soto , L. F. , Li , Z. , Santoso , C. S. , Berenson , A. , Ho , I. , Shen , V. X. , Yuan , S. , and Fuxman Bass , J. I. ( 2022 ). Compendium of human transcription factor effector domains . Molecular Cell 82 , 514 – 526 . https://doi.org/10.1016/j.molcel.2021.11.007 https://linkinghub.elsevier.com/retrieve/pii/S1097276521009576 . doi: 10.1016/j.molcel.2021.11.007 . OpenUrl CrossRef PubMed 38. ↵ Das , R. K. , and Pappu , R. V. ( 2013 ). Conformations of intrinsically disordered proteins are influenced by linear sequence distributions of oppositely charged residues . Proceedings of the National Academy of Sciences 110 , 13392 – 13397 . https://pnas.org/doi/full/10.1073/pnas.1304749110 . doi: 10.1073/pnas.1304749110 . OpenUrl Abstract / FREE Full Text 39. ↵ Tjian , R. , and Maniatis , T. ( 1994 ). Transcriptional activation: A complex puzzle with few easy pieces . Cell 77 , 5 – 8 . https://linkinghub.elsevier.com/retrieve/pii/0092867494902275 . doi: 10.1016/0092-8674(94)90227-5 . OpenUrl CrossRef PubMed Web of Science 40. ↵ Frietze , S. , and Farnham , P. J. ( 2011 ). Transcription factor effector domains . Sub-Cellular Biochemistry 52 , 261 – 277 . doi: 10.1007/978-90-481-9069-0_12 . OpenUrl CrossRef PubMed 41. ↵ Sanborn , A. L. , Yeh , B. T. , Feigerle , J. T. , Hao , C. V. , Townshend , R. J. , Lieberman Aiden , E. , Dror , R. O. , and Kornberg , R. D. ( 2021 ). Simple biochemical features underlie transcriptional activation domain diversity and dynamic, fuzzy binding to mediator . eLife 10 , e68068 . OpenUrl CrossRef PubMed 42. ↵ Lin , Z. , Akin , H. , Rao , R. , Hie , B. , Zhu , Z. , Lu , W. , Smetanin , N. , Verkuil , R. , Kabeli , O. , Shmueli , Y. , dos Santos Costa , A. , Fazel-Zarandi , M. , Sercu , T. , Candido , S. , and Rives , A. ( 2023 ). 
Evolutionary-scale prediction of atomic-level protein structure with a language model . Science 379 , 1123 – 1130 . https://www.science.org/doi/10.1126/science.ade2574 . doi: 10.1126/science.ade2574 . OpenUrl CrossRef PubMed 43. ↵ UniProtKB - Proteomes. UP000005640, Homo sapiens (Human) Available from: https://www.uniprot.org/proteomes/UP000005640 . Accessed on March 10, 2024 . ( 2024 ). https://www.uniprot.org/proteomes/UP000005640 . 44. ↵ Zhang , Q. , Liu , W. , Zhang , H.-M. , Xie , G.-Y. , Miao , Y.-R. , Xia , M. , and Guo , A.-Y. ( 2020 ). hTFtarget: A Comprehensive Database for Regulations of Human Transcription Factors and Their Targets . Genomics, Proteomics & Bioinformatics 18 , 120 – 128 . https://doi.org/10.1016/j.gpb.2019.09.006 https://academic.oup.com/gpb/article/18/2/120/7229801 . doi: 10.1016/j.gpb.2019.09.006 . OpenUrl CrossRef 45. ↵ Ng , A. H. M. , Khoshakhlagh , P. , Rojo Arias , J. E. , Pasquini , G. , Wang , K. , Swiersy , A. , Shipman , S. L. , Appleton , E. , Kiaee , K. , Kohman , R. E. , Vernet , A. , Dysart , M. , Leeper , K. , Saylor , W. , Huang , J. Y. , Graveline , A. , Taipale , J. , Hill , D. E. , Vidal , M. , Melero-Martin , J. M. , Busskamp , V. , and Church , G. M. ( 2021 ). A comprehensive library of human transcription factors for cell fate engineering . Nature Biotechnology 39 , 510 – 519 . https://www.nature.com/articles/s41587-020-0742-6 . doi: 10.1038/s41587-020-0742-6 . OpenUrl CrossRef PubMed 46. ↵ Lambert , S. A. , Jolma , A. , Campitelli , L. F. , Das , P. K. , Yin , Y. , Albu , M. , Chen , X. , Taipale , J. , Hughes , T. R. , and Weirauch , M. T. ( 2018 ). The human transcription factors . Cell 172 , 650 – 665 . OpenUrl CrossRef PubMed 47. ↵ Galán-Martínez , J. , Stamatakis , K. , Sánchez-Gómez , I. , Vázquez-Cuesta , S. , Gironés , N. , and Fresno , M. ( 2022 ). Isoform-specific effects of transcription factor TCFL5 on the pluripotency-related genes SOX2 and KLF4 in colorectal cancer development . 
Molecular Oncology 16 , 1876 – 1890 . https://febs.onlinelibrary.wiley.com/doi/10.1002/1878-0261.13085 . doi: 10.1002/1878-0261.13085 . OpenUrl CrossRef PubMed 48. Belluti , S. , Rigillo , G. , and Imbriano , C. ( 2020 ). Transcription Factors in Cancer: When Alternative Splicing Determines Opposite Cell Fates . Cells 9 , 760 . https://www.mdpi.com/2073-4409/9/3/760 . doi: 10.3390/cells9030760 . OpenUrl CrossRef 49. Santos , B. F. , Grenho , I. , Martel , P. J. , Ferreira , B. I. , and Link , W. ( 2023 ). FOXO family isoforms . Cell Death & Disease 14 , 702 . https://www.nature.com/articles/s41419-023-06177-1 . doi: 10.1038/s41419-023-06177-1 . OpenUrl CrossRef PubMed 50. Katzenellenbogen , B. S. , Guillen , V. S. , and Katzenellenbogen , J. A. ( 2023 ). Targeting the oncogenic transcription factor FOXM1 to improve outcomes in all subtypes of breast cancer . Breast Cancer Research 25 , 1 – 11 . https://doi.org/10.1186/s13058-023-01675-8. doi: 10.1186/s13058-023-01675-8 . OpenUrl CrossRef PubMed 51. ↵ Krivtsova , O. , Makarova , A. , and Lazarevich , N. ( 2018 ). Aberrant expression of alternative isoforms of transcription factors in hepatocellular carcinoma . World Journal of Hepatology 10 , 645 – 661 . http://www.wjgnet.com/1948-5182/full/v10/i10/645.htm . doi: 10.4254/wjh.v10.i10.645 . OpenUrl CrossRef PubMed 52. ↵ Vuzman , D. , Polonsky , M. , and Levy , Y. ( 2010 ). Facilitated DNA search by multidomain transcription factors: cross talk via a flexible linker . Biophysical journal 99 , 1202 – 11 . http://dx.doi.org/10.1016/j.bpj.2010.06.007 http://www.ncbi.nlm.nih.gov/pubmed/20713004 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC2920665 . doi: 10.1016/j.bpj.2010.06.007 . OpenUrl CrossRef PubMed Web of Science 53. ↵ Uversky , V. N. , Gillespie , J. R. , and Fink , A. L. ( 2000 ). Why are ‘natively unfolded’ proteins unstructured under physiologic conditions? Proteins: Structure, Function and Genetics 41 , 415 – 427 . 
doi: 10.1002/1097-0134(20001115)41:3<415::AID-PROT130>3.0.CO;2-7 . OpenUrl CrossRef 54. ↵ Uversky , V. N. ( 2002 ). Natively unfolded proteins: a point where biology waits for physics . Protein science 11 , 739 – 756 . OpenUrl CrossRef PubMed Web of Science 55. ↵ Kyte , J. , and Doolittle , R. F. ( 1982 ). A simple method for displaying the hydropathic character of a protein . Journal of Molecular Biology 157 , 105 – 132 . doi: 10.1016/0022-2836(82)90515-0 . OpenUrl CrossRef PubMed Web of Science 56. ↵ Emenecker , R. J. , Griffith , D. , and Holehouse , A. S. ( 2021 ). Metapredict: a fast, accurate, and easy-to-use predictor of consensus disorder and structure . Biophysical Journal 120 , 4312 – 4319 . https://doi.org/10.1016/j.bpj.2021.08.039 https://linkinghub.elsevier.com/retrieve/pii/S0006349521007256 . doi: 10.1016/j.bpj.2021.08.039 . OpenUrl CrossRef 57. ↵ McInnes , L. , Healy , J. , and Astels , S. ( 2017 ). hdbscan: Hierarchical density based clustering . The Journal of Open Source Software 2 , 205 . http://joss.theoj.org/papers/10.21105/joss.00205 . doi: 10.21105/joss.00205 . OpenUrl CrossRef 58. ↵ Das , R. K. , Ruff , K. M. , and Pappu , R. V. ( 2015 ). Relating sequence encoded information to form and function of intrinsically disordered proteins . Current Opinion in Structural Biology 32 , 102 – 112 . http://dx.doi.org/10.1016/j.sbi.2015.03.008 https://linkinghub.elsevier.com/retrieve/pii/S0959440X15000354 . doi: 10.1016/j.sbi.2015.03.008 . OpenUrl CrossRef PubMed 59. ↵ Shamilov , R. , Robinson , V. L. , and Aneskievich , B. J. ( 2021 ). Seeing Keratinocyte Proteins through the Looking Glass of Intrinsic Disorder . International Journal of Molecular Sciences 22 , 7912 . https://www.mdpi.com/1422-0067/22/15/7912 . doi: 10.3390/ijms22157912 . OpenUrl CrossRef PubMed 60. ↵ Zarubin , M. , Murugova , T. , Ryzhykau , Y. , Ivankov , O. , Uversky , V. N. , and Kravchenko , E. ( 2024 ). 
Structural study of the intrinsically disordered tardigrade damage suppressor protein (Dsup) and its complex with DNA . Scientific Reports 14 , 22910 . https://www.nature.com/articles/s41598-024-74335-2 . doi: 10.1038/s41598-024-74335-2 . OpenUrl CrossRef PubMed 61. ↵ Zerze , G. H. , Best , R. B. , and Mittal , J. ( 2015 ). Sequence- and Temperature-Dependent Properties of Unfolded and Disordered Proteins from Atomistic Simulations . Journal of Physical Chemistry B 119 , 14622 – 14630 . doi: 10.1021/acs.jpcb.5b08619 . OpenUrl CrossRef PubMed 62. ↵ de Jonge , W. J. , Patel , H. P. , Meeussen , J. V. , and Lenstra , T. L. ( 2022 ). Following the tracks: How transcription factor binding dynamics control transcription . Biophysical Journal 121 , 1583 – 1592 . https://doi.org/10.1016/j.bpj.2022.03.026 https://linkinghub.elsevier.com/retrieve/pii/S0006349522002363 . doi: 10.1016/j.bpj.2022.03.026 . OpenUrl CrossRef PubMed 63. ↵ DelRosso , N. , Tycko , J. , Suzuki , P. , Andrews , C. , Aradhana Mukund , A. , Liongson , I. , Ludwig , C. , Spees , K. , Fordyce , P. , Bassik , M. C. , and Bintu , L. ( 2023 ). Large-scale mapping and mutagenesis of human transcriptional effector domains . Nature 616 , 365 – 372 . doi: 10.1038/s41586-023-05906-y . OpenUrl CrossRef 64. ↵ Pan , X.-M. , Niu , W.-D. , and Wang , Z.-X. ( 1999 ). What Is the Minimum Number of Residues to Determine the Secondary Structural State? Journal of Protein Chemistry 18 , 579 – 584 . https://link.springer.com/10.1023/A:1020655417839 . doi: 10.1023/A:1020655417839 . OpenUrl CrossRef PubMed 65. ↵ Ravarani , C. N. , Erkina , T. Y. , De Baets , G. , Dudman , D. C. , Erkine , A. M. , and Babu , M. M. ( 2018 ). High-throughput discovery of functional disordered regions: investigation of transactivation domains . Molecular Systems Biology 14 , 1 – 14 . https://www.embopress.org/doi/10.15252/msb.20188190 . doi: 10.15252/msb.20188190 . OpenUrl CrossRef 66. ↵ Kotha , S. R. , and Staller , M. V. ( 2023 ). 
Clusters of acidic and hydrophobic residues can predict acidic transcriptional activation domains from protein sequence . Genetics 225 , 1 – 16 . https://doi.org/10.1093/genetics/iyad131. doi: 10.1093/genetics/iyad131 . OpenUrl CrossRef 67. ↵ Grant , C. E. , Bailey , T. L. , and Noble , W. S. ( 2011 ). FIMO: scanning for occurrences of a given motif . Bioinformatics 27 , 1017 – 1018 . https://academic.oup.com/bioinformatics/article/27/7/1017/232614 . doi: 10.1093/bioinformatics/btr064 . OpenUrl CrossRef PubMed Web of Science 68. ↵ Bailey , T. L. ( 2021 ). STREME: accurate and versatile sequence motif discovery . Bioinformatics 37 , 2834 – 2840 . https://academic.oup.com/bioinformatics/article/37/18/2834/6184861 . doi: 10.1093/bioinformatics/btab203 . OpenUrl CrossRef 69. ↵ Pokharel , S. , Pratyush , P. , Heinzinger , M. , Newman , R. H. , and Kc , D. B. ( 2022 ). Improving protein succinylation sites prediction using embeddings from protein language model . Scientific Reports 12 , 16933 . https://doi.org/10.1038/s41598-022-21366-2 https://www.nature.com/articles/s41598-022-21366-2 . doi: 10.1038/s41598-022-21366-2 . OpenUrl CrossRef PubMed 70. ↵ Bepler , T. , and Berger , B. ( 2019 ). Learning protein sequence embeddings using information from structure . 7th International Conference on Learning Representations, ICLR 2019 ( 1 – 17 ). http://arxiv.org/abs/1902.08661 . 71. ↵ Suzek , B. E. , Huang , H. , McGarvey , P. , Mazumder , R. , and Wu , C. H. ( 2007 ). Uniref: comprehensive and non-redundant uniprot reference clusters . Bioinformatics 23 , 1282 – 1288 . OpenUrl CrossRef PubMed Web of Science 72. ↵ Liu , J. , Perumal , N. B. , Oldfield , C. J. , Su , E. W. , Uversky , V. N. , and Dunker , A. K. ( 2006 ). Intrinsic Disorder in Transcription Factors . Biochemistry 45 , 6873 – 6888 . https://pubs.acs.org/doi/10.1021/bi0602718 . doi: 10.1021/bi0602718 . OpenUrl CrossRef PubMed Web of Science 73. ↵ Bianchi , G. , Longhi , S. , Grandori , R. , and Brocca , S. 
( 2020 ). Relevance of electrostatic charges in compactness, aggregation, and phase separation of intrinsically disordered proteins . International Journal of Molecular Sciences 21 , 1 – 30 . doi: 10.3390/ijms21176208 . OpenUrl CrossRef PubMed 74. ↵ Umeyama , S. ( 1991 ). Least-squares estimation of transformation parameters between two point patterns . IEEE Transactions on Pattern Analysis and Machine Intelligence 13 , 376 – 380 . http://ieeexplore.ieee.org/document/88573/ . doi: 10.1109/34.88573 . OpenUrl CrossRef 75. ↵ Wu , C. , and Tang , R. Performance law of large language models ( 2024 ). https://arxiv.org/abs/2408.09895 . arXiv: 2408.09895 . 76. ↵ Wagh , K. , Garcia , D. A. , and Upadhyaya , A. ( 2021 ). Phase separation in transcription factor dynamics and chromatin organization . Current Opinion in Structural Biology 71 , 148 – 155 . OpenUrl CrossRef PubMed 77. ↵ Sadar , M. D. ( 2020 ). Discovery of drugs that directly target the intrinsically disordered region of the androgen receptor . Expert opinion on drug discovery 15 , 551 – 560 . OpenUrl CrossRef PubMed 78. ↵ Qin , C. , Wang , Y.-L. , Zheng , J. , Wan , X.-B. , and Fan , X.-J. ( 2025 ). Current perspectives in drug targeting intrinsically disordered proteins and biomolecular condensates . BMC biology 23 , 118 . OpenUrl PubMed 79. ↵ Wu , K. , Jiang , H. , Hicks , D. R. , Liu , C. , Muratspahić , E. , Ramelot , T. A. , Liu , Y. , McNally , K. , Kenny , S. , Mihut , A. et al. ( 2025 ). Design of intrinsically disordered region binding proteins . Science 389 , eadr8063 . OpenUrl PubMed 80. ↵ Wagh , K. , Stavreva , D. A. , Upadhyaya , A. , and Hager , G. L. ( 2023 ). Transcription Factor Dynamics: One Molecule at a Time . Annual Review of Cell and Developmental Biology 39 , 277 – 305 . https://www.annualreviews.org/doi/10.1146/annurev-cellbio-022823-013847 . doi: 10.1146/annurev-cellbio-022823-013847 . OpenUrl CrossRef PubMed 81. ↵ Jose , E. , March-Steinman , W. , Wilson , B. A. , Shanks , L. 
, Parkinson , C. , Alvarado-Cruz , I. , Sweasy , J. B. , and Paek , A. L. ( 2024 ). Temporal coordination of the transcription factor response to H2O2 stress . Nature Communications 15 , 3440 . https://www.nature.com/articles/s41467-024-47837-w . doi: 10.1038/s41467-024-47837-w . OpenUrl CrossRef PubMed 82. ↵ Mactavish , B. , Zhu , D. , Shang , J. , Shao , Q. , Yang , Z. J. , Kamenecka , T. M. , and Kojetin , D. J. ( 2024 ). Ligand efficacy shifts a nuclear receptor conformational ensemble between transcriptionally active and repressive states . doi: 10.1101/2024.04.23.590805 . OpenUrl Abstract / FREE Full Text 83. ↵ Qian , M. , Yan , F. , Yuan , T. , Yang , B. , He , Q. , and Zhu , H. ( 2020 ). Targeting post-translational modification of transcription factors as cancer therapy . Drug Discovery Today 25 , 1502 – 1512 . https://doi.org/10.1016/j.drudis.2020.06.005 https://linkinghub.elsevier.com/retrieve/pii/S1359644620302300 . doi: 10.1016/j.drudis.2020.06.005 . OpenUrl CrossRef PubMed 84. ↵ Kim , H. K. , Jeong , M. G. , and Hwang , E. S. ( 2021 ). Post-Translational Modifications in Transcription Factors that Determine T Helper Cell Differentiation . Molecules and Cells 44 , 318 – 327 . http://molcells.org/journal/view.html?doi=10.14348/molcells.2021.0057 https://linkinghub.elsevier.com/retrieve/pii/S1016847823001991 . doi: 10.14348/molcells.2021.0057 . OpenUrl CrossRef PubMed 85. ↵ Piepoli , S. , Barakat , S. , Nogay , L. , Şimşek , B. , Akkose , U. , Taskiran , H. , Tolay , N. , Gezen , M. , Yeşilada , C. Y. , Tuncay , M. , Adebali , O. , Atilgan , C. , and Erman , B. ( 2022 ). Sibling rivalry among the ZBTB transcription factor family: homodimers versus heterodimers . Life Science Alliance 5 , e202201474 . https://www.life-science-alliance.org/lookup/doi/10.26508/lsa.202201474 . doi: 10.26508/lsa.202201474 . OpenUrl Abstract / FREE Full Text 86. ↵ Yang , Y. , Guo , L. , Chen , L. , Gong , B. , Jia , D. , and Sun , Q. ( 2023 ). 
Nuclear transport proteins: structure, function, and disease relevance . Signal Transduction and Targeted Therapy 8 , 425 . https://www.nature.com/articles/s41392-023-01649-4 . doi: 10.1038/s41392-023-01649-4 . OpenUrl CrossRef PubMed 87. ↵ Holehouse , A. S. , Das , R. K. , Ahad , J. N. , Richardson , M. O. , and Pappu , R. V. ( 2017 ). CIDER: Resources to Analyze Sequence-Ensemble Relationships of Intrinsically Disordered Proteins . Biophysical Journal 112 , 16 – 21 . http://dx.doi.org/10.1016/j.bpj.2016.11.3200 https://linkinghub.elsevier.com/retrieve/pii/S0006349516342692 . doi: 10.1016/j.bpj.2016.11.3200 . OpenUrl CrossRef PubMed 88. ↵ Cock , P. J. A. , Antao , T. , Chang , J. T. , Chapman , B. A. , Cox , C. J. , Dalke , A. , Friedberg , I. , Hamelryck , T. , Kauff , F. , Wilczynski , B. , and de Hoon , M. J. L. ( 2009 ). Biopython: freely available Python tools for computational molecular biology and bioinformatics . Bioinformatics 25 , 1422 – 1423 . https://academic.oup.com/bioinformatics/article/25/11/1422/330687 . doi: 10.1093/bioinformatics/btp163 . OpenUrl CrossRef PubMed Web of Science 89. ↵ Rafique , O. , and Mir , A. ( 2020 ). Weighted dimensionality reduction and robust Gaussian mixture model based cancer patient subtyping from gene expression data . Journal of Biomedical Informatics 112 , 103620 . https://doi.org/10.1016/j.jbi.2020.103620 https://linkinghub.elsevier.com/retrieve/pii/S1532046420302483 . doi: 10.1016/j.jbi.2020.103620 . OpenUrl CrossRef PubMed 90. Bera , D. , Pratap , R. , and Verma , B. D. ( 2023 ). Dimensionality Reduction for Categorical Data . IEEE Transactions on Knowledge and Data Engineering 35 , 3658 – 3671 . doi: 10.1109/TKDE.2021.3132373 . OpenUrl CrossRef 91. ↵ Wang , J. , Yue , S. , Yu , X. , and Wang , Y. ( 2017 ). An efficient data reduction method and its application to cluster analysis . Neurocomputing 238 , 234 – 244 . 
http://dx.doi.org/10.1016/j.neucom.2017.01.059 https://linkinghub.elsevier.com/retrieve/pii/S0925231217301674 . doi: 10.1016/j.neucom.2017.01.059 . OpenUrl CrossRef 92. ↵ McInnes , L. , Healy , J. , and Melville , J. ( 2018 ). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction . http://arxiv.org/abs/1802.03426 . doi: 10.48550/arXiv.1802.03426 . OpenUrl CrossRef 93. ↵ Pedregosa , F. , Varoquaux , G. , Gramfort , A. , Michel , V. , Thirion , B. , Grisel , O. , Blondel , M. , Prettenhofer , P. , Weiss , R. , Dubourg , V. , Vanderplas , J. , Passos , A. , Cournapeau , D. , Brucher , M. , Perrot , M. , and Duchesnay , E. ( 2011 ). Scikit-learn: Machine Learning in Python . Journal of Machine Learning Research 12 , 2825 – 2830 . https://ehp.niehs.nih.gov/doi/10.1289/EHP4713 . OpenUrl 94. ↵ Pitafi , S. , Anwar , T. , and Sharif , Z. ( 2023 ). A Taxonomy of Machine Learning Clustering Algorithms, Challenges, and Future Realms . Applied Sciences 13 , 3529 . https://www.mdpi.com/2076-3417/13/6/3529 . doi: 10.3390/app13063529 . OpenUrl CrossRef 95. ↵ Moulavi , D. , Jaskowiak , P. A. , Campello , R. J. , Zimek , A. , and Sander , J. Density-based clustering validation . In: Proceedings of the 2014 SIAM international conference on data mining. SIAM ( 2014 ):( 839 – 847 ). View the discussion thread. Back to top Previous Next Posted October 27, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Classification of Human Transcription Factors Based on Their Effector Domains via Unsupervised Learning Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Classification of Human Transcription Factors Based on Their Effector Domains via Unsupervised Learning Eduardo Ayala , Ayush Gupta , Nehil Shreyash , Arvind Ramanathan , Gül H. Zerze bioRxiv 2025.10.26.684687; doi: https://doi.org/10.1101/2025.10.26.684687 Share This Article: Copy Citation Tools Classification of Human Transcription Factors Based on Their Effector Domains via Unsupervised Learning Eduardo Ayala , Ayush Gupta , Nehil Shreyash , Arvind Ramanathan , Gül H. Zerze bioRxiv 2025.10.26.684687; doi: https://doi.org/10.1101/2025.10.26.684687 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41909) Biophysics (21436) Cancer Biology (18576) Cell Biology (25479) Clinical Trials (138) Developmental Biology (13367) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40359) Molecular Biology (17162) Neuroscience (88532) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7636) Plant Biology (15129) Scientific Communication and Education (2044) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00