Proteome-wide computational analyses reveal links between protein condensate formation and RNA biology

doi:10.1101/2025.03.03.640993

Proteome-wide computational analyses reveal links between protein condensate formation and RNA biology

2025 · doi:10.1101/2025.03.03.640993

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 82,963 characters · extracted from preprint-html · click to expand

Proteome-wide computational analyses reveal links between protein condensate formation and RNA biology | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Proteome-wide computational analyses reveal links between protein condensate formation and RNA biology View ORCID Profile Snigdha Maiti , Swarnendu Tripathi , David W Baggett , View ORCID Profile Aaron H. Phillips , Cheon-Gil Park , Jina Wang , Wahiduzzaman , William T Freyaldenhoven , View ORCID Profile Swati Kinger , View ORCID Profile Brittany Pioso , View ORCID Profile John Bollinger , View ORCID Profile Ramiz Somjee , View ORCID Profile Benjamin Lang , M. 
Madan Babu , View ORCID Profile Richard W. Kriwacki doi: https://doi.org/10.1101/2025.03.03.640993 Snigdha Maiti 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Snigdha Maiti Swarnendu Tripathi 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site David W Baggett 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aaron H. Phillips 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Aaron H. Phillips Cheon-Gil Park 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jina Wang 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site Wahiduzzaman 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site William T Freyaldenhoven 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site Swati Kinger 1 Department of Structural Biology, St. 
Jude Children’s Research Hospital , Memphis, Tennessee 2 Rhodes College , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Swati Kinger Brittany Pioso 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Brittany Pioso John Bollinger 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for John Bollinger Ramiz Somjee 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee 2 Rhodes College , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ramiz Somjee Benjamin Lang 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee 3 Center of Excellence for Data-Driven Discovery, Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Benjamin Lang M. Madan Babu 1 Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee 3 Center of Excellence for Data-Driven Discovery, Department of Structural Biology, St. Jude Children’s Research Hospital , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site Richard W. Kriwacki 1 Department of Structural Biology, St. 
Jude Children’s Research Hospital , Memphis, Tennessee 4 Department of Microbiology , Immunology and Biochemistry, University of Tennessee Health Sciences Center , Memphis, Tennessee Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Richard W. Kriwacki For correspondence: richard.kriwacki{at}stjude.org Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Biomolecular condensates mediate dynamic compartmentalization of cellular processes. The multivalent interactions that underlie biomolecular condensation are often promoted by intrinsically disordered regions (IDRs) within proteins. While the role of IDRs in biomolecular condensates is well appreciated, predicting whether an IDR forms condensates in cells remains challenging. Here, we developed a machine learning model to predict condensation behavior of IDRs, analyzing 215 IDRs from fusion oncoproteins in HEK293T cells. Our study identified distinct sequence-derived physicochemical features associated with condensation. Leveraging these data, our model predicts that ∼12% of the ∼13,000 IDRs in the human proteome are likely to form cellular condensates. Proteins with condensate-forming IDRs are enriched in functions involving RNA-related processes and membrane-less organelles (MLOs), highlighting their role in MLO assembly and function. Our model, available via the SAK3.0 web server ( https://sak.stjude.org ), provides a powerful resource for studying IDR-driven phase separation across proteomes, offering insights into biomolecular condensates and their biological roles. Introduction Biomolecular condensates, also termed membrane-less organelles (MLOs), mediate dynamic compartmentalization of biomolecules associated with diverse biological processes 1 . 
Found in both the nucleus (nucleoli, paraspeckles, Cajal bodies, PML bodies) and the cytoplasm (P-bodies, stress granules, germ granules), MLOs are essential for myriad cellular functions, and their dysfunction has been implicated in numerous diseases 2 – 5 . Assembly of biomolecules into condensates is often driven by phase separation (PS), which is facilitated by weak, multivalent interactions between constituent biomolecules 6 , 7 and often promoted by intrinsically disordered regions (IDRs) in proteins 8 . IDRs lack stable secondary and tertiary structures and often retain a high degree of disorder when interacting with other molecules to form condensates 9 – 11 . They show biased amino acid composition, with depletion in structure-promoting hydrophobic amino acids and enrichment of polar and charged amino acids, features that enable the dynamic multivalent interactions that underpin phase separation 12 . Analyses of amino acid patterns in IDRs across diverse biomolecular condensates have revealed the sequence-related physicochemical features that govern condensate formation 13 – 16 . Multiple algorithms have been developed to identify those features in IDRs associated with PS and several predictors are available 17 – 27 . For example, PhaSePred 22 predicts PS in autonomously self-assembling and partner-dependent proteins using multimodal features. DeePhase 23 utilizes a combined engineered features model and a neural network-based language model to predict the propensity of proteins to undergo homotypic PS. FuzDrop 28 , a conformational entropy-based model, predicts droplet promoting regions and proteins driving PS. Tools including PScore 17 , ParSe v2 25 , and catGranule 29 , leverage sequence-specific features, such as sequence composition, prion-like domains, and charged residues, while others such as MolPhase 30 integrate multidimensional data, including structural and physicochemical properties. 
Notably, LLPhyScore 21 predicts protein PS potential using a comprehensive scoring system based on sequence and physicochemical features of IDRs, offering valuable insights into mechanisms of IDR-driven protein phase separation. A recently developed predictor called PICNIC 31 utilizes protein-protein interaction networks to exclude proteins that have a connection with known condensate-forming proteins. However, these predictors rely on literature-mined data, which were often obtained using non-standard experimental conditions, leading to variability and noise. Additionally, these studies often use information from the Protein Data Bank (PDB) 32 or the human proteome as negative datasets, which results in false negatives. Curating a verified set of IDRs, including those that form condensates and others that remain diffuse under controlled experimental conditions, is crucial for training and validating predictive models 33 . In our previous study, using experimentally validated datasets, we developed the FO-Puncta ML model to predict the condensation behavior of fusion oncoproteins (FO) in cells 34 . However, this model does not identify the specific regions responsible for condensate formation, nor does it elucidate the role and influence of IDRs present in these FOs on the condensation process. Recently, a machine learning model was developed that predicts homotypic PS of IDRs 35 , but heterotypic interactions (between IDRs and other proteins or nucleic acids) contribute to the formation of many biological condensates. Thus, it is important to assess IDR condensate formation under conditions that reflect the compositional complexity of cellular environments. We addressed these challenges by examining the cellular condensation behavior of 215 IDRs derived from 149 human fusion oncoproteins (FO), 58% (96) of which were previously shown to form cellular condensates 34 . 
Each IDR was tagged with monomeric, enhanced green fluorescent protein (GFP) and assessed in cells using standardized conditions and microscopy protocols, coupled with rigorous computational image analysis. These efforts yielded a dataset of condensate-forming [termed puncta(+)] and condensate-negative [puncta(-)] IDRs, and the data was used to develop a machine learning (ML) model (termed IDR-Puncta ML model) to predict IDR condensation with high accuracy, based on a large set of sequence-derived physicochemical features. Using the IDR-Puncta ML model, we demonstrate that 12% of 12,899 IDRs identified in the human proteome (termed the human “IDRome”) are predicted to form biomolecular condensates. We show that proteins containing puncta(+) IDRs (1,393 of 8,067 IDR-containing human proteins) are significantly enriched in Gene Ontology (GO) terms involving RNA-related biological processes, including transcription and processing, and others associated with cell division and actin cytoskeleton, and are over-represented as constituents of MLOs. These findings indicate that proteins with puncta(+) IDRs are specialized for functions associated with nuclear processes involving RNA processing, suggesting a role for PS-driven compartmentalization of proteins, RNA, and other biomolecules in these processes. We note that these findings are based on analyses of IDRs derived from cancer-associated FOs and their functional associations may be biased toward biological processes altered in cancer. However, the physicochemical features of FO-derived IDRs broadly sample those of the human IDRome, supporting the generality of our conclusions on the specialized biological functions of puncta(+) IDR-containing human proteins. Finally, the IDR-Puncta ML model, given its high accuracy, will be a powerful tool for uncovering the biological roles of proteins with puncta(+) and puncta(-) IDRs in organisms beyond humans. 
Results Establishing the condensation behavior of diverse IDRs We previously compiled a database of 3,174 FO protein sequences and initially tested 166 of them for condensate formation in HeLa cells, with 96 of these forming condensates [puncta(+)] and 53 remaining diffuse [puncta(-)]. The sequences of the 96 puncta(+) FOs were significantly enriched in amino acids associated with protein disorder and in physicochemical features associated with phase separation, including the likelihood of pi-pi and pi-cation interactions and the presence of prion-like domains 34 , 36 . These observations led us to hypothesize that IDRs contribute to FO condensation behavior and that this set of 96 puncta(+) and 53 puncta(-) FOs (149 total) could serve as a source of both puncta(+) and puncta(-) IDRs. We used the SAK pipeline 34 , which utilizes IUPred2A 37 , and Metapredict 38 , to identify 215 unique IDRs with long disordered regions (≥60 amino acids in length) 39 within 83 puncta(+) and 47 puncta(-) FOs (see Methods). We note that 19 of the 149 puncta(+) and puncta(-) FOs did not display IDRs of sufficient length for inclusion. We expressed GFP-tagged forms of the 215 IDRs in HEK293T cells and scored them for condensate formation using established procedures 34 , including use of the PunctaTools image analysis pipeline 40 (Supplementary Fig. 1a, b). IDRs that formed condensates in ≥24% of the imaged cells were scored as puncta(+); those that formed condensates in <24% of the imaged cells, or were diffuse in all cells, were scored as puncta(-) (Supplementary Fig. 1c, d). Some IDRs localized within nucleoli or other cellular structures lacking hallmark condensate features (e.g., round appearance with varied size) and were scored as nucleolar or other, respectively (Supplementary Fig. 1e, f), and not included in subsequent analyses. Of the 215 IDRs examined, 41 were classified as puncta(+), 137 as puncta(-), 15 as nucleolar, and 22 as other ( Fig. 1a ). 
Among the puncta(+) IDRs, 61% (25) formed condensates within nuclei, 10% (4) within the cytoplasm, and 29% (12) within both cellular compartments (Supplementary Fig. 1c, d) and the number and size of these condensates varied widely (Supplementary Fig. 1c). Puncta(-) IDRs also exhibited varied sub-cellular localization (Supplementary Fig. 1d) with many displaying diffuse fluorescence similar to that of GFP, the negative control (Supplementary Fig. 1g). Download figure Open in new tab Fig 1. Condensate formation results from live cell imaging of mEGFP-tagged IDRs (a) Quantification of the number of IDRs classified as puncta(+), puncta(-), nucleolar, or other (left). See “Methods” for details of these classifications. (b) Alluvial plot illustrating the puncta status of FOs and the IDRs derived from their sequences [puncta(+), green; puncta(-), red; nucleolar, magenta; and other, orange]. Our results showed that 48% of IDR-containing, puncta(+) FOs (40 of 83) display at least one puncta(+) IDR ( Fig. 1b ), leading us to hypothesize that puncta(+) IDRs contribute to condensate formation by these FOs. We tested this idea through IDR deletion analysis of five puncta(+) FOs, each containing a single puncta(+) and multiple puncta(-) IDRs (Supplementary Fig. 2a). We observed loss of condensate formation for four of the five FO IDR deletion mutants (Supplementary Fig. 2b), supporting that IDRs contribute substantially to condensate formation by these puncta(+) FOs. For the remaining IDR-containing, puncta(+) FOs (43 of 83) that lack a condensate-forming IDR, condensate formation is likely influenced by additional factors, such as synergy between IDRs and folded domains, or recruitment as clients into condensates formed by endogenous cellular biomolecules. Of the 47 puncta(-) FOs, one contained a puncta(+) IDR, and another contained an IDR that localized within nucleoli ( Fig. 1b ). 
For these two puncta(-) FOs, the condensate-associated IDRs they contain are insufficient to drive condensate formation by the full proteins, potentially due to solubilizing effects of folded domains and/or other IDRs in their sequences. Overall, our results show that 19% (41 of 215) of the IDRs we tested form cellular condensates, with all but one of these derived from condensate-forming FOs. Most of the tested IDRs (64%, 137 of 215) scored as puncta(-), with many of these derived from condensate-forming FOs. These results indicate that autonomously puncta(+) IDRs contribute to condensate formation by about half of the FOs that exhibit this behavior, and that autonomously puncta(-) IDRs require synergy with other protein regions for the other half of condensate-forming FOs. Our previous studies of FOs indicated that sequence-derived physicochemical features are statistically accurate indicators of their condensation behavior. Based on these findings, we next asked whether these types of physical chemistry-based features could distinguish between puncta(+) and puncta(-) IDRs. Physicochemical features and amino acid enrichments governing IDR condensate formation We analyzed the amino acid sequences of the 41 puncta(+) and 137 puncta(-) IDRs to identify physicochemical features associated with autonomous cellular condensate formation ( Fig. 2a ). For each sequence, we calculated the values of 600 diverse features, including PS-relevant physicochemical features (38 features from the SAK pipeline 34 ), numerical indices related to amino acid physicochemical and biochemical properties (553 features from AAindex v9.2 41 grouped into 12 classes; Supplementary Fig. 3), and molecular interaction-based features (9 features from LLPhyScore 21 ). Feature values are reported as z-scores with respect to average values for the human IDRome (all IDRs ≥60 amino acids in length in the human proteome; see Methods). 
Amongst these 600 features, we identified 38 with average z-score values that were significantly different between puncta(+) and puncta(-) IDRs. To minimize redundancy, we removed 13 features that exhibited high mutual information (MI) scores with others (MI > 0.5) ( Fig. 2a ). The remaining 25 features sample a range of amino acid sequence-derived properties, including prion-like domain content, fraction of aromatic amino acids, potential for interactions, disorder content, charge content and patterning, and hydrophobic characteristics, which are differentially enriched or depleted in puncta(+) versus puncta(-) IDRs ( Fig. 2b and Supplementary Dataset). Interestingly, secondary structure-related terms associated with extended conformations, e.g., sheets, and coils, are enriched in puncta(+) IDRs while those associated with compact conformations, e.g., helices and turns, are depleted. This collection of 25 features defines the physicochemical properties that, on average, underpin condensate formation by IDRs in the crowded, heterogeneous cellular environment. Download figure Open in new tab Fig 2. Physicochemical features differentiate between puncta(+) and puncta(-) IDRs (a) Workflow of selection of the 25 most significant and non-redundant physicochemical features for 41 puncta(+) and 137 puncta(-) IDRs from SAK (cyan), AAindex database (red) and LLPhyScore (blue) (see Methods). To remove redundant features, the mutual information (MI) value of ≤ 0.5 was selected. See Supplementary Dataset for additional information on the physicochemical features used in these analyses. (b) Quantification of the enrichment or depletion of the 25 non-redundant, most significant physicochemical features for 41 puncta(+) and 137 puncta(-) IDRs with respect to the IDR sequences in the human proteome (human IDRome). Feature values for puncta(+) and puncta(-) IDRs are reported as z-scores using box plots in green (left) and red (right), respectively, along the y -axis. 
Each box shows the quartiles of the dataset, where the first black horizontal line of the box is the first quartile (25% of the data), the second black horizontal line is the second quartile or median (50% of the data), and the third black horizontal line is the third quartile (75% of the data). The whiskers extend to points that lie within 1.5 IQRs (interquartile range) of the lower and upper quartile and outliers are displayed as filled circles in black. The mean values of each feature are shown as diamond shapes in grey inside the boxes. Significance of the difference between the puncta(+) and puncta(-) IDRs was assessed using a two-sided Welch’s t-test and no adjustments were made for multiple comparisons (* p < 0.05; ** p < 0.01; *** p < 0.001; **** p < 0.0001). The names of the physicochemical features are given at the bottom. The colored bars above the feature names represent feature types (SAK, light green; AAindex, light red; LLPhyScore, light blue). (c) Results of two-dimensional (2D) hierarchical clustering of the 41 puncta(+) IDRs based on the 25 most discriminatory physicochemical features as z-scores (columns) with respect to the human IDRome (see Methods) into four groups (Groups 1–4). The top row represents feature types (color scheme as in panel b). IDR feature values are color-coded in the rows, with IDR names given on the right. The first column (left) represents the cellular localization of the IDR puncta (nucleus, blue; cytoplasm, green; or both, orange). The names of the physicochemical features are given at the bottom. We next used two-dimensional (2D) hierarchical clustering to determine whether the individual IDRs within the puncta(+) and puncta(-) sets exhibited similar or different features and if they clustered into groups with similar features. The results showed that both sets were divided into four groups of IDRs, with members of the groups exhibiting similar patterns of physicochemical features ( Fig. 2c , Supplementary Figs. 4, 5). 
For the puncta(+) IDRs, Groups 1 and 3 (with 6 and 12 IDRs, respectively) are most highly enriched in features reporting on different types of molecular interactions (6 features from LLPhyScore), and on aromatic residue content and prion-like domain content (3 features from SAK). Puncta(+) IDR Group 2 (with 13 IDRs) is weakly enriched in features related to aromatic residue content and prion-like domain content, and others reporting on charge balance (from SAK) and β-sheet and coil secondary structure content (from AAindex). In contrast, puncta(+) IDR Group 4 (with 10 IDRs) exhibits moderate enrichment of charge related features (4 features from SAK) and otherwise mixed and weak feature enrichments and depletions. Average, group-wise feature enrichments for puncta(+) IDRs were accompanied by distinct patterns of average amino acid enrichments (see Methods), variably including enrichment of phenylalanine (Group 1), tyrosine (Groups 1-3), glycine, asparagine and glutamine (Group 1), and arginine and lysine (Group 4; Supplementary Fig. 5a, b), which are known from prior studies to be enriched in condensate-forming proteins 13 , 15 , 17 , 34 , 42 . The puncta(-) IDR groups exhibited feature and amino acid enrichments that were either weaker than (Groups 1’ and 4’) or different from (Group 3’) those of the puncta(+) IDRs with the exception of puncta(-) Group 2’, whose enrichments resemble those of puncta(+) IDR Group 3. However, the magnitudes of the feature enrichments and depletions for Group 3 puncta(+) IDRs are generally greater than those for Group 2’ puncta(-) IDRs (encoded as the red and blue color intensities in the feature heatmaps; Fig. 2c , Supplementary Fig. 4), suggesting that subtle differences in physicochemical feature profiles govern the condensation behavior of IDRs in these two groups. In summary, our results establish the physicochemical feature and amino acid enrichment landscape of IDRs that do and do not form condensates in HEK293T cells. 
Predicting IDR cellular condensation using physicochemical features and machine learning We next asked whether our IDR dataset could be leveraged to accurately predict the condensation behavior of additional, experimentally untested IDRs. Using 25 physicochemical features ( Fig. 2b ) for 41 puncta(+) and 137 puncta(-) IDRs as training data (termed the Training IDRs), we applied H2O AutoML 43 to evaluate 120 machine learning (ML) models for prediction of cellular IDR condensation behavior ( Fig. 3a ). A stacked ensemble model comprised of three tree-based models was superior among the tested models (termed the IDR-Puncta ML model, see Methods) and displayed the following performance metrics based on 25-fold cross validation: AUC [area under the ROC (receiver operating characteristic) curve], 0.98; AUCPR (area under the precision-recall curve), 0.93; accuracy, 0.95; and balanced accuracy, 0.92 ( Fig. 3b ). Analysis using Shapley Additive exPlanations (SHAP) 44 showed that diverse physicochemical features contribute to predictions of IDR condensate formation (see Methods), including those reporting on charge-, disorder-, and hydrophobicity-related properties (Supplementary Fig. 6a). We independently verified IDR-Puncta ML model performance using 30 human IDRs out of the total 33 IDRs (Verification IDRs) selected to have a low degree of sequence identity based on pairwise alignment (<55% for one sequence and <20% for the remaining 32 sequences) with the Training IDRs ( Fig. 3c ; see Methods), and observed accuracy similar to that obtained during cross validation [AUC, 0.95; AUCPR, 0.88; accuracy, 0.90; and balanced accuracy, 0.92 ( Fig. 3d ; Supplementary Dataset)]. We excluded three verification IDRs from the ML model performance evaluation, which were experimentally categorized as either “nucleolar” or “other”. 
We next performed dimensionality reduction analysis using the 25 physicochemical features of the Training IDRs and the IDRs (≥60 amino acids in length) from human proteome combined. The analysis revealed that the physicochemical features of the FO-derived Training IDRs we tested spanned the feature landscape of the human IDRome (Supplementary Fig. 6b). This result further confirms that the IDR-Puncta ML model can be applied to the human IDRome to understand the prevalence of puncta(+) IDRs in human proteins. The human proteome (20,396 proteins) contains 12,899 IDRs (derived from 8,067 proteins, see Methods) and 1,572 of these (12%) were predicted to be puncta(+) using the IDR-Puncta ML model ( Fig. 3e , Supplementary Fig. 6c), indicating that the propensity for condensate formation is a specialized property of human IDRs. Download figure Open in new tab Fig 3. A Machine Learning model for predicting condensate formation probability of IDRs (a) Supervised Machine Learning was used to develop a Stacked Ensemble model (termed IDR-Puncta ML model; see Methods) trained using the 25 low mutual information physicochemical features from SAK (9 features; cyan), AAindex (9 features; red) and LLPhyScore (7 features; blue) for 41 puncta(+) and 137 puncta(-) IDRs (termed Training IDRs). (b) Performance metrics [area under the curve (AUC, dots), area under the precision-recall curve (AUCPR, stripes), accuracy (grey), and balanced accuracy (black)] for the IDR-Puncta ML model using 25-fold cross validation (CV) with the Training IDRs. (c) Results of prediction of condensate formation behavior [x-axis, puncta(+) or puncta(-)] using the IDR-Puncta ML model for the 30 puncta(+) and puncta(-) Verification IDRs out of the total 33 Verification IDRs (see Methods). The number of predicted true puncta(+) and true puncta(-) IDRs from the IDR-Puncta ML model are shown as green and red bars, respectively. The number of predicted false puncta(+) IDRs is shown as a grey bar. 
No false puncta(-) IDRs were predicted. (d) Performance metrics [area under the curve (AUC, dots), area under the precision-recall curve (AUCPR, stripes), accuracy (grey), and balanced accuracy (black)] for the IDR-Puncta ML model using the 30 Verification IDRs. (e) Predicted condensation behavior of all IDRs in human proteome, termed the human IDRome (12,899 IDRs, in total), using the IDR-Puncta ML model. Human proteins with puncta(+) IDRs are enriched for RNA processing-related functions Sequence mapping of the ML prediction results showed that 1,393 human proteins contain one or more puncta(+) IDRs [some of these proteins also contain one or more puncta(-) IDRs] while 6,674 proteins contain only puncta(-) IDRs ( Fig. 3e ). We next asked whether the human proteins that contain one or more puncta(+) IDR(s) are associated with specific biological functions using Gene Ontology (GO) biological process enrichment analysis in comparison with all human proteins. A similar analysis was performed for human proteins containing only puncta(-) IDRs. Three of the top four most highly and significantly enriched parent biological process terms for proteins with puncta(+) and puncta(-) IDRs were similar (e.g., positive regulation of transcription, cell division, and actin cytoskeleton regulation; Fig. 4a ), indicating that functions related to these processes are common to proteins with IDRs regardless of condensate formation. In contrast, the most highly enriched parent term for puncta(+) IDRs was RNA processing (3.5-fold enrichment, Fig. 4a ), which aggregates numerous enriched child terms related to regulation and processing of RNA, including metabolism and splicing (Supplementary Fig. 7a). We note that the magnitude of functional term enrichments was greater for proteins with puncta(+) than puncta(-) IDRs ( Fig. 4a , Supplementary Fig. 7a). 
These results indicate that proteins involved in RNA-related processes are enriched in IDRs prone to condensate formation and suggest that these processes occur within condensate environments. Analysis of GO cellular component terms, which report on sub-cellular localization, showed that proteins with puncta(+) IDRs were enriched for terms including nucleolus, nuclear body, nuclear speck, nuclear protein-containing complex, and spliceosomal complex ( Fig. 4b , Supplementary Fig. 7b), supporting the association of these proteins with several types of nuclear biomolecular condensates. The condensates associated terms (e.g., nucleolus, nuclear body, and nuclear speck) were not enriched in human proteins with puncta(-) IDRs ( Fig. 4b , Supplementary Fig. 7b). Download figure Open in new tab Fig 4. Biological functions of human proteins with puncta(+) and puncta(-) IDRs Scatter plots showing Gene Ontology (GO) enrichment analysis in two categories, biological processes (a) and cellular component (b). The y-axis shows the combined adjusted p -value (p.adj) of the enriched GO terms, and the x-axis gives the average fold-enrichment after grouping the GO terms using semantic similarity analysis (see Methods). (a) Enriched biological processes for proteins with predicted puncta(+) IDRs (left) and proteins with only predicted puncta(-) IDRs (right). (b) Enriched cellular components for proteins with predicted puncta(+) IDRs (left) and proteins with only predicted puncta(-) IDRs (right). Symbols (a, b) are color coded according to the different grouped GO terms from semantic similarity analysis and the symbol size is proportional to the number of GO terms in each group. For the grouped GO terms combined significance was obtained using Fisher’s method and average fold enrichment was computed using log transformation. The human proteome was used as background for all the GO term analysis (see Methods). 
In summary, our IDR-focused analyses indicate that 40% of human proteins (8,067 of 20,396 proteins) exhibit one or more IDRs ≥60 amino acids in length and that these are enriched for functions associated with transcription, cell division, and actin cytoskeleton. The IDR-containing human proteins with potential for condensate formation [e.g., displaying puncta(+) IDRs; 1,393 of 8,067 proteins] are uniquely enriched in functional terms involving RNA-related processes and in localization terms associated with nuclear membraneless organelles. While others have associated condensate formation by proteins with RNA-related biological processes 45 – 47 , our findings extend these results by showing through large-scale, unbiased analyses that human proteins with condensation-prone IDRs are highly enriched for RNA-related biological functions and localization within nuclear membraneless organelles. Proteins with puncta(+) IDRs are enriched in membrane-less organelles (MLOs) We independently tested our findings on preferential localization of human, puncta(+) IDR-containing proteins within biomolecular condensates by examining their occurrence in the membraneless organelle (MLO) protein database, PhaSepDB 48 , which compiles information on proteins demonstrated to undergo PS and/or reported to be associated with one or more MLOs. We retrieved 499 human MLO-associated proteins from PhaSepDB and identified 345 with one or more IDRs ≥60 amino acids in length. We also compiled a control dataset comprised of membrane associated proteins, which we reasoned do not autonomously form biomolecular condensates. Specifically, we examined proteins found at membrane-bound organelle contact sites (MCSs; compiled in MCSdb 49 ), wherein membranes of two different organelles are in close proximity but do not fuse 50 . We obtained 199 human proteins from MCSdb and identified 90 with one or more IDRs. 
To enhance the robustness of our analyses, we only included proteins annotated with high confidence; our final MLO and MCS datasets included 345 and 90 human proteins, respectively, with only 2 proteins common between the two datasets (Supplementary Fig. 8a, see Methods). We next applied our IDR-Puncta ML model to the two protein sets and determined that 134 of 345 MLO proteins (39%) and 5 of 90 MCS proteins (6%), respectively, contain one or more IDRs predicted to form condensates ( Fig. 5a ). In comparison, we reported above that 1,393 of 8,067 human IDR-containing proteins (17%) contain puncta(+) IDRs ( Fig. 3e ). These results, showing that proteins with puncta(+) IDRs are much more highly enriched in MLO proteins than in MCS proteins or human IDR-containing proteins, support our independent findings based on analysis of GO cellular component localization terms ( Fig. 4b , Supplementary Fig. 7b). While the occurrence of a puncta(+) IDR within the sequence does not necessarily indicate that a protein will autonomously form condensates, it is highly suggestive of localization within an MLO. The 134 MLO proteins with puncta(+) IDRs are predominantly associated with nuclear MLOs although some are associated with cytoplasmic MLOs, including stress granules and P-bodies ( Fig. 5b ). These proteins are most frequently associated with nuclear speckles, nucleoli, and paraspeckles, MLOs involved in different RNA-related processes, consistent with our observation that the functional term, RNA processing, was the most highly enriched amongst human proteins with puncta(+) IDRs ( Fig. 4a , Supplementary Fig. 7a). These findings highlight a strong correlation between proteins containing puncta(+) IDRs and their preferential localization within MLOs, particularly those with roles in RNA-related processes. 
This relationship underscores the importance of condensate-driven compartmentalization in organizing and regulating RNA metabolism, indicating how MLOs support complex biochemical processes. Download figure Open in new tab Fig 5. Prevalence of condensate prone IDRs in MLOs (a) Quantification of the presence of puncta(+) or only puncta(-) IDRs in MLO proteins (left bars), MCS proteins (middle bars) and human proteins (right bars). The puncta(+) or only puncta(-) status of IDRs in the sets of proteins was determined using the IDR-Puncta ML model [green, percentage of proteins with at least one puncta(+) IDR; red, percentage of proteins with only puncta(-) IDRs]. Significance was assessed using Fisher’s exact test for count data and no adjustments were made for multiple comparisons (* p < 0.05; ** p < 0.01; *** p < 0.001; **** p < 0.0001). (b) MLO localization of proteins from the MLO set containing at least one predicted puncta(+) IDR. The colors of the bars represent the two major cellular compartments (nucleus, blue; cytoplasm, green) and the numerical values above the bars indicate the number of proteins along the x-axis. Discussion Protein PS plays a key role in cellular organization and function 6 , 13 . The ability to accurately predict PS propensities of proteins is critical to understand cellular compartmentalization and its roles in biology and disease. Recent advances fostered the development of several computational tools, each leveraging different approaches to analyze protein PS 21 , 23 , 30 , 31 . However, the lack of standardized PS testing methods and robust PS negative datasets introduces implicit and difficult-to-measure biases among these different prediction methods 33 . Furthermore, many of these predictors use literature-mined datasets from different databases, which often exhibit inconsistencies and uncertainty due to non-standardized experimental conditions. 
We previously developed the FO-Puncta ML model to predict the cellular condensation behavior of FOs 34 . However, FOs differ from human proteins due to their unique sequence features, which result from aberrant gene translocation. As a result, the model is not optimized to recognize the broader sequence-based features associated with phase separation-prone human proteins. Additionally, this model is designed to assess the condensation behavior of full-length proteins and does not identify which specific region(s) of the FO drives this behavior. Since many proteins harbor multiple IDRs, only some of which may contribute to PS, identifying the exact IDR(s) responsible for condensate formation is crucial. Such specificity not only illuminates the molecular basis of PS but also enables targeted sequence modifications. An improved understanding of key features of PS prone IDRs such as amino acid composition, charge distribution, and potential for multivalent interactions is essential for rationally controlling the phase behavior of proteins. Building on these advances, we developed the IDR-Puncta ML model, a machine-learning tool that accurately predicts the condensate-forming potential of IDRs, based on sequence-derived physicochemical features. The IDR-Puncta ML model was trained on experimentally tested IDRs using standardized cellular assays and does not report on the specific type (homotypic or heterotypic) of interactions that may drive condensate formation. Our model addresses limitations that may have variably impacted existing tools, such as unvalidated datasets, lack of cellular context, potential overfitting due to feature redundancy, and narrowly focused features. We experimentally validated the IDR-Puncta ML model using randomly selected IDRs from the human proteome, confirming its accuracy and broad applicability. 
Recent work by Bülow, et al., predicts that 5% of IDRs from the human proteome can undergo homotypic PS under physiologically relevant conditions (e.g., sequences with transfer free energy values (ΔG tr ) less than −2k B T 35 ). We applied this model to our set of 41 puncta(+) IDRs and found that Group 1 IDRs are predicted to undergo homotypic phase separation, with a mean ΔG tr value of −4.89 k B T (Supplementary Fig.9a). Group 3 IDRs have a mean ΔG tr value of −1.87 k B T, indicating a moderate tendency toward homotypic phase separation. Conversely, IDRs from Group 2 (mean ΔG tr = −0.82 k B T) and Group 4 (mean ΔG tr = −0.29 k B T) are less likely to undergo homotypic PS (Supplementary Fig.9a). These trends align with the distinct sequence feature enrichment observed in each group (Supplementary Fig. 4a). Groups 1 and 3 show similar feature enrichment patterns, although the enrichment is less pronounced in Group 3. Groups 2 and 4 display unique feature enrichment profiles, distinguishing them from Groups 1 and 3. These results highlight variability in PS propensities across different IDR groups. We hypothesize that puncta(+) IDRs, exhibiting distinct patterns of physicochemical features and amino acid enrichments (e.g., IDRs in Groups 1-4; Fig. 3a ), will display different conformational properties and engage in various intra- and inter-polypeptide chain interactions, both homotypic and heterotypic, that drive condensate formation. However, testing this hypothesis requires further investigation in the future. We explored the biological implications of protein condensate formation by conducting a detailed functional annotation of proteins containing condensate-forming IDRs. Using our predictive model, we identified 1,393 proteins containing at least one predicted puncta(+) IDR (∼17% of all IDR-containing human proteins). 
These findings align with studies suggesting that PS driven by IDRs alone is limited, with additional factors such as folded domains influencing condensate formation 25 , 51 . Our data indicate that proteins with predicted puncta(+) IDRs are highly enriched in RNA-related biological processes, specifically RNA-processing and splicing. It is increasingly evident that RNA-related processes (including transcription, splicing, and translation) are frequently associated with condensate formation 52 – 54 . Furthermore, the proteins with puncta(+) IDRs are enriched in nuclear speckles, nucleoli, and paraspeckles, MLOs known to facilitate various RNA-related processes. These data serve as a valuable resource for hypothesis-driven research into the roles and mechanisms of condensate formation by IDRs in RNA biology, by providing insight into patterns of physicochemical features in IDRs and their cellular condensation behavior. Our study establishes a framework for examining the link between IDR-mediated protein condensation and biological function. Herein, we present a valuable resource for mapping the potential of human IDRs to form condensates in cells, offering insights into their physicochemical features, and associated biological functions, especially in the context of PS. Our prediction model should be useful in guiding cellular experiments to explore the role of novel IDRs in PS. Our model may also be used in providing key insights into condensate pathology. For example, aberrant condensate formation has been associated with several human diseases, especially in neurodegeneration and cancer 4 , 7 . Identifying the specific IDR(s) that drive pathological condensate formation can aid in finding the association between PS and disease progression. As our model was trained on FO-derived IDRs, it is particularly suitable to analyze the cancer proteome. 
Identifying puncta-forming IDRs in cancer-associated proteins will provide critical insights into their possible oncogenic mechanism. The IDR-Puncta ML holds significant potential in interpreting the impact of disease-associated mutations in IDRs that alter cellular PS behavior for understanding complex diseases, including neurodegenerative disease and cancer 5 , 55 . Beyond individual IDRs, our model can also be applied to a proteome-wide assessment of IDR-mediated PS across different organisms to explore the prospect of PS being an evolutionarily conserved mechanism in regulating diverse cellular processes and stress response 56 . The IDR-Puncta ML model can also be applied in synthetic biology and biomaterials engineering. Our work provides refined knowledge of sequence features associated with condensate-prone IDRs and tools to design and modify PS-prone IDRs, which will assist the engineering of synthetic protein condensates and new biomaterials with tunable phase-separating properties 57 , 58 . Limitations of our study We note a few factors to be considered when evaluating our findings and conclusions. First, the training set for our model used IDRs from a subset of FOs, which may limit the generalizability of our findings to other proteins or IDRs with different characteristics. However, our analyses showed that the physicochemical features of the FO-derived IDRs we tested spanned the feature landscape of the human IDRome, mitigating this concern. Second, we focused on IDR-driven condensate formation of proteins and excluded PS promoted by folded domains (oligomerization domains, nucleic acid binding domains) 59 , 60 as well as PS facilitated by multiple IDRs working in concert. These factors limit our ability to identify all condensation-prone proteins in the human proteome. Moreover, we segregated IDRs that flank a folded domain and tested them individually, potentially overlooking the enhanced effects of multiple IDRs acting together. 
Additionally, while we identified physicochemical feature patterns linked to puncta(+) behavior in IDRs, we did not explore how these patterns affect IDR conformational dynamics or multivalent interactions, both crucial for condensate formation. Methods IDR selection for in-cell expression IDRs were identified by analyzing the 149 Fusion oncoprotein sequences in our previous work 34 using sak.stjude.org , which identified IDRs based on continuous stretches of disordered residues ≥ 60 residues according to the IUPred2A algorithm 37 . Sequences were also analyzed with Metapredict (version: V2) 38 , and Metapredict-identified IDRs were added to the IDR database when there was not an analogous IUPred2A-identified IDR. In some cases, IUPred2A and Metapredict identified multiple adjacent IDRs that did not form puncta. In these cases, combined IDRs that contained all the adjacent disordered regions were created and tested. A total of 215 unique IDRs were identified among which 188 were predicted using SAK, 23 with Metapredict, and 4 IDRs were identified by both IUPred2A and Metapredict. Cloning We previously reported Escherichia coli codon optimized full-length plasmids for cellular expression of FOs 34 . IDRs and FO-IDR deletion mutants were generated using PCR with the respective FOs as templates. PCR reactions were performed using Q5 High Fidelity 2x Mastermix and primers were designed with 5’-Not1 and 3’-Xba1 restriction sites. PCR fragments and the destination vector (CL20) were cut with Not1 and Xba1 restriction enzymes and ligated using New England BioLabs’ Quick Ligation Kit per the manufacturer’s instructions. Ligation reactions were transformed into chemically competent bacteria (NEB 5-alpha). All plasmid sequences were confirmed by whole plasmid sequencing. 
Cell culture and transient transfections HEK293T cells (ATCC; RRID: CVCL_0063) were cultured in DMEM with high glucose (Gibco) and supplemented with 1× penicillin/streptomycin (Gibco), 10% FBS (HyClone), and 6 mmol/L l-glutamine (Gibco) and maintained at 37 °C in 5% CO 2 . Cells were tested for Mycoplasma every 2 months using PCR (e-Myco plus, LiLiF). Cells were authenticated by short tandem repeat profiling (PowerPlex Fusion at the St. Jude Hartwell Center). Cells were transfected in a 96-well plate with 100 ng of plasmid DNA in the CL20 vector backbone using FuGENE HD (Promega) per the manufacturer’s instructions. All IDRs were N-terminally tagged with monomeric EGFP (A207K mutation in EGFP), and EGFP was used for the empty vector control plasmids. Cells were used for a maximum of 25 passages after thawing. Confocal microscopy imaging All microscopy images were acquired on a 3i Marianas system (Denver, CO) configured with a Yokogawa CSU-W spinning disk confocal microscope utilizing a 100x Zeiss objective, 405 nm (Hoechst) and 488 nm (mEGFP) laser lines, and Slidebook (RRID: SCR_014300) 6.0 (3i). 3D images of cells were captured as z stacks with 0.3 µm spacing between planes, spanning 12 µm in total. Live HEK293T cells were imaged at 37 °C in phenol red-free DMEM with high glucose (Gibco) supplemented with 1× penicillin/streptomycin, 10% FBS, 6 mmol/L l-glutamine, and 25 mmol/L HEPES. Quantitative image analysis to classify IDRs as puncta (+), puncta (-), Other, and Nucleolar Fluorescent microscopy images were segmented and quantified using the PunctaTools pipeline 40 . In brief, cells were segmented using eGFP as the primary signal, and Hoechst as a secondary signal to establish nuclei. Cells were segmented as sets of 10 layers to minimize cell-segmentation errors, then combined into 3D stacks using the Cellpose algorithm 61 . 
Condensates (termed puncta here) were then segmented by filtering respective channels with a scale adapted Laplacian of Gaussian (LoG) filter, thresholding the result, and applying watershed segmentation using the maxima of the LoG filtered image as seeds. Using this method, cells were recorded as having puncta, or not. A threshold of 24% of expressing cells was set based on agreement with the manual assessment of puncta status. After quantification, IDR puncta status was manually verified by two independent researchers, where segmentation errors and alternate classifications (Other, Nucleolar) were assigned, after which final classifications were assigned to each IDR. Calculation of amino acid sequence-derived physicochemical features of IDRs To understand the potential of condensate formation by IDRs, we deployed physicochemical features that were computed for each IDR amino acid sequence. Specifically, we obtained features from our previous work 34 through the SAK pipeline (sak.stjude.org ), the Amino Acid Index (AAindex) database 41 , and the recently developed LLPhyScore resource 21 . We accessed amino acid properties from the AAindex database, which is a curated set of numerical indices representing various physicochemical and biochemical properties of amino acids. Additional features were obtained from the LLPhyScore resource, which includes a predictor of IDR-driven phase separating proteins based on underlying physicochemical interactions, including solvent contacts, disorder, hydrogen bonds, pi-pi and cation-pi contacts, electrostatic interactions, and secondary structure. The Python package protlearn (version: 0.0.3) was used to extract 553 amino acid properties from the AAindex database with no missing values that belong to 12 property classes 25 (Supplementary Fig. 3). 
The features from LLPhyScore were computed using the standalone package from GitHub (github.com/julie-forman-kay-lab/LLPhyScore) based on the model trained on both folded proteins in the PDB and proteins from the human proteome. Human IDRome sequences To represent the human IDRome, we first obtained all human Swiss-Prot (reviewed) proteins from Uniprot release 2023_04. Sequences were excluded from the collection if their length was less than 10 residues, or if any characters outside of the natural amino acids were present. This resulted in a total of 20,396 human protein sequences (termed the human proteome here). From the human proteome, we identified 13,047 IDR sequences (from 8,067 proteins) through sak.stjude.org with lengths ≥ 60 residues, which were termed the human IDRome. Calculation of amino acid enrichment for IDR sequences Amino acid enrichment for each IDR sequence was calculated using equation 1 , where “Percent composition of sequence” is the percent composition of a particular amino acid in the sequence being evaluated and “Mean percent composition of human IDRome” is the mean percent composition of a particular amino acid in a database of human IDR sequences (human IDRome). Amino acid compositions of the IDR sequences were computed using the protr 62 package (version: 1.7.0) in R. Analysis of sequence-derived physicochemical features First, we found that, among the 38 features obtained from sak.stjude.org for the 41 puncta(+) and 137 puncta(-) Expressed IDRs (collectively termed the Training IDRs; 178 IDRs, in total), values of the features “ABT valence”, “ABT balance”, and “ABT density” were missing for ∼5% of IDRs, whereas “PAPAprop” values were missing for ∼8% of IDRs, due to the IDR length requirement for the calculation of these features. Therefore, we replaced all the missing values using the non-missing median values of these features from the puncta(+) and puncta(-) Expressed IDRs, respectively. 
We next performed the two-sided t-test for the 38 features from sak.stjude.org and identified 21 features that showed significant differences (p-value ≤ 0.01) between the 41 puncta(+) and 137 puncta(-) IDRs using the rstatix package (version: 0.7.2) in R (version: 4.2.1). Similarly, we identified 10 features from the AAindex database (with the most significant feature difference from each of the 12 classes) and 7 features from LLPhyScore (out of the total of 9 LLPhyScore features) that showed significant differences (p-value ≤ 0.01) between the 41 puncta(+) and 137 puncta(-) IDRs. Effect size was calculated using the effsize package (version: 0.8.1) ( https://zenodo.org/record/196082 ) in R. Next, to identify interdependence of the 38 sequence-based physicochemical features for the 41 puncta(+) and 137 puncta(-) IDRs, we computed mutual information (MI) among these features using infotheo (version: 1.2.0.1) package in R. Features that displayed strong mutual dependence with others were removed, which resulted in 25 features with low MI (≤ 0.5). These 25 physicochemical features were converted to z-scores using the scale function in the R package, with respect to the human IDRome sequences. We next performed hierarchical clustering based on Euclidean distance and using the complete linkage method, as implemented in the pheatmap package (version: 1.0.12) in R, using the z-scores for the noted 25 features to identify groups within the 41 puncta(+) IDR set with related physicochemical features. For the 137 puncta(-) IDR set, we performed hierarchical clustering based on Manhattan distance and Ward’s minimum variance method, using the z-scores of the 25 physicochemical features. 
Supervised machine learning for puncta classification We employed the automatic machine learning (AutoML) tool within the h2o.ai 43 (version: 3.44.0.1) package in R to classify the puncta (+) and puncta (-) IDRs and predict the probability of condensate formation using data for the 178 Training IDRs [consisting of 41 puncta(+) and 137 puncta(-) IDRs from the Expressed IDRs]. Using the 25 sequence-based physicochemical features with low MI for the 178 Training IDRs, we set nfolds =25 for 25-fold cross-validation (CV) and generated 120 models from AutoML. Additionally, we set include_algos to the H2O tree-based models [Gradient Boosting Machine (GBM), Distributed Random Forest (DRF) including Extremely Randomized Trees (XRT), and Extreme Gradient Boosting (XGBoost)] and Stacked Ensembles, using otherwise default parameters in H2O AutoML. A “Best of Family” Stacked Ensemble Model consisting of three base models GBM, XRT, and DRF performed the best amongst the 120 models tested (termed the IDR-Puncta ML model). The Stacked Ensemble metalearner using an Elastic net regularized (with CV) Generalized Linear Model (GLM) resulted in the ML model with GLM coefficients, 0.62, 0.46, and 0.18 for the three base models GBM, XRT, and DRF, respectively. Performance of the model was based on the metrics logistic loss or cross-entropy loss (the difference between predicted probabilities and actual values) value of 0.17 for the 25-fold CV set with 178 Training IDRs. We determined a threshold value for the condensate formation probability of 0.40 based on a maximum accuracy value of 0.95, maximum F1 score (harmonic mean of the precision and recall) value of 0.89, and maximum absolute Matthews correlation coefficient value of 0.86 for classifying the puncta(+) and puncta(-) IDRs for the 25-fold CV set with 178 Training IDRs. 
After its establishment, we applied the IDR-Puncta ML model to 30 Verification IDRs [with experimentally determined puncta(+) and puncta(-) status] out of the total 33 Verification IDRs from human IDRome. We excluded three Verification IDRs which were experimentally determined as either “nucleolar” or “other” from the ML model performance evaluation. To measure the degree of similarity between the IDR sequences in the Training and Verification sets, we performed pairwise sequence alignment using protein BLAST (version: BLAST+ 2.14.1). We defined pairwise normalized alignment score, fraction of identical matches ( fident ) as, where qlen is the length of the query sequence, slen is the length of the subject sequence and nident is the number of identical matches between the query and subject sequences. fident varies between 0 and 1, with 0 indicating no identical residues between the query and subject sequence, and 1 indicating an exact match between the query and subject sequences. We used the packages PRROC (version: 1.3.1) 63 in R to compute the performance metrics AUC and AUCPR for the Verification IDRs. We next applied Shapley Additive exPlanations (SHAP) analysis 44 to determine the importance of the 25 physicochemical features to IDR-Puncta ML model predictions. SHAP is a game-theoretic approach to explain the output of any machine learning model. First, we computed Shapley values ( SHAP value ) of the features for each of the 178 Training IDRs using the h2o.predict_contributions function in the h2o.ai package in R. Next, we computed SHAP importance for each feature from the base GBM model with highest contribution to the IDR-Puncta ML model as $\text{SHAP importance} = \frac{1}{n}\sum_{i=1}^{n} \left|\text{SHAP value}_{i}\right|$, where n is the number of IDRs in the Training set and | SHAP value | measures the importance of each feature for the model decisions. 
Dimensionality reduction analysis of the Training IDRs and human IDRome To validate whether the IDR-Puncta ML model can be applied to the human IDRome for predicting probability of condensate formation and puncta classification, we performed dimensionality reduction analysis by applying Uniform Manifold Approximation and Projection (UMAP) 64 using the 25 physicochemical features of the Training IDRs and human IDRome combined. The UMAP algorithm aims to preserve both the local and the global data structure. The IDRs from human IDRome with missing feature(s) were excluded from the UMAP analysis. The feature values of the Training IDRs and human IDRome were converted to z-scores before applying UMAP. UMAP analysis was performed using the uwot package (version: 0.1.12) in R with the parameters, n_neighbors = 100, metric =“euclidean”, min_dist = 1, and spread = 5. GO enrichment analysis of human IDRome GO enrichment analysis of the human IDRome was performed using the Database for Annotation, Visualization and Integrated Discovery (DAVID) 65 Knowledgebase (version: v2024q1) in the web server ( david.ncifcrf.gov ). For the predicted puncta(+) IDRome, proteins with at least one puncta(+) IDR were used, whereas for the predicted puncta(-) IDRome, proteins with only puncta(-) IDRs were used, and proteins from the full human proteome were used as the background set. The lowest level of GO categories for biological process (GO_BP_ALL), cellular component (GO_CC_ALL) and molecular function (GO_MF_ALL) were used in DAVID. The adjusted p-value (false discovery rate) cutoff of ≤ 0.05 based on Benjamini correction (default in DAVID) and fold enrichment of ≥ 1.5 was used to filter the GO terms. Additionally, GO terms containing ≥ 2,000 proteins from the background set (human proteome) and GO terms containing < 10 proteins from the target sets [predicted puncta(+) and puncta(-) IDRomes] were removed. 
The rrvgo 66 package (version: 1.10.0) was used to reduce the redundancy of GO terms by grouping similar terms based on their semantic similarity using the “Wang” method with similarity threshold values of 0.88 and 0.94 for the predicted puncta(+) and puncta(-) IDRomes, respectively. “Wang” is a graph-based method implemented in the rrvgo R-package, which uses the topology of GO graph structure to compute semantic similarity 67 . This method determines the semantic similarity of two GO terms based on both locations of these terms in the GO graph and their relations with their ancestor terms. Higher thresholds lead to fewer groups of GO terms. For visualization purposes, the combined adjusted p-value of a grouped GO term from semantic similarity was obtained by applying the sum of logs (Fisher’s) method using the metap (version: 1.11) R-package. A log 2 transformation was used to calculate the average fold enrichment of grouped GO terms. Puncta prediction of the IDRs from proteins in membrane-less organelles (MLOs) and membrane contact sites (MCSs) Protein constituents of MLOs were obtained from the manually curated database of phase-separation related proteins (PhaSepDB; version: 2.1 48 ). This database contains 499 human proteins identified by low throughput methods, termed “MLO-lt”, that matched UniProt IDs found in our human proteome database. Of these 499 proteins, 345 were mapped onto our human IDRome database. We identified 199 human proteins from a manually curated database of experimentally supported MCS proteins and complexes (MCSdb 49 ) labeled with “low throughput experimental methods”. Two out of the 199 proteins were not found in our human proteome database, and for the remaining 197 proteins, 90 proteins were mapped onto our human IDRome database. 
Significance of the number of proteins containing puncta(+) or only puncta(-) IDRs in the MLO, MCS and human IDRome data sets was computed from Fisher’s exact test for count data using fisher.test from the stats (version: 4.2.1) package in R. Author Contributions Conceptualization, SM, ST and RWK; Software, ST, DWB, JB, RS and BL; Investigation, SM, ST, DWB, AHP, CP, JW, W, WTF, SK, BP, RS and BL; Writing - Original Draft, SM, ST and RWK; Writing – Review & Editing, all authors; Supervision, RWK and MMB; Funding acquisition, RWK and MMB Corresponding author Correspondence to Richard W. Kriwacki. Acknowledgments We thank Dr. Ines Chen for the critical review of the manuscript and Dr. Steven W. Whitten for providing the AAindex classification. This work is supported by NCI R01 CA246125 (to R.W.K. and M.M.B.), NCI U54 CA243124 (to R.W.K.), Developmental Funds under NCI P30 CA021765 (to R.W.K.) and ALSAC. We are grateful for the support of Core Facilities used in this study by NCI P30 CA021765, including the Cell and Tissue Imaging Center, with technical support from George Campbell, Aaron Taylor, and Aaron Pitre, and the Hartwell Center. This research content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health. References 1. ↵ Boeynaems , S. et al. Protein Phase Separation: A New Phase in Cell Biology . Trends Cell Biol 28 , 420 – 435 ( 2018 ). doi: 10.1016/j.tcb.2018.02.004 OpenUrl CrossRef PubMed 2. ↵ Jacobs , M. I. , Jira , E. R. & Schroeder , C. M . Understanding How Coacervates Drive Reversible Small Molecule Reactions to Promote Molecular Complexity . Langmuir 37 , 14323 – 14335 ( 2021 ). doi: 10.1021/acs.langmuir.1c02231 OpenUrl CrossRef PubMed 3. Zhang , Y. , Narlikar , G. J. & Kutateladze , T. G . Enzymatic Reactions inside Biological Condensates . J Mol Biol 433 , 166624 ( 2021 ). doi: 10.1016/j.jmb.2020.08.009 OpenUrl CrossRef 4. ↵ Alberti , S. & Dormann , D . 
Liquid-Liquid Phase Separation in Disease . Annu Rev Genet 53 , 171 – 194 ( 2019 ). doi: 10.1146/annurev-genet-112618-043527 OpenUrl CrossRef PubMed 5. ↵ Tsang , B. , Pritisanac , I. , Scherer , S. W. , Moses , A. M. & Forman-Kay , J. D . Phase Separation as a Missing Mechanism for Interpretation of Disease Mutations . Cell 183 , 1742 – 1756 ( 2020 ). doi: 10.1016/j.cell.2020.11.050 OpenUrl CrossRef PubMed 6. ↵ Banani , S. F. , Lee , H. O. , Hyman , A. A. & Rosen , M. K . Biomolecular condensates: organizers of cellular biochemistry . Nat Rev Mol Cell Biol 18 , 285 – 298 ( 2017 ). doi: 10.1038/nrm.2017.7 OpenUrl CrossRef PubMed 7. ↵ Shin , Y. & Brangwynne , C. P . Liquid phase condensation in cell physiology and disease . Science 357 ( 2017 ). doi: 10.1126/science.aaf4382 OpenUrl Abstract / FREE Full Text 8. ↵ Mitrea , D. M. & Kriwacki , R. W . Phase separation in biology; functional organization of a higher order . Cell Commun Signal 14 , 1 ( 2016 ). doi: 10.1186/s12964-015-0125-7 OpenUrl CrossRef PubMed 9. ↵ Kato , M. et al. Cell-free formation of RNA granules: low complexity sequence domains form dynamic fibers within hydrogels . Cell 149 , 753 – 767 ( 2012 ). doi: 10.1016/j.cell.2012.04.017 OpenUrl CrossRef PubMed Web of Science 10. Lin , Y. , Protter , D. S. , Rosen , M. K. & Parker , R . Formation and Maturation of Phase-Separated Liquid Droplets by RNA-Binding Proteins . Mol Cell 60 , 208 – 219 ( 2015 ). doi: 10.1016/j.molcel.2015.08.018 OpenUrl CrossRef PubMed 11. ↵ Uversky , V. N . Intrinsically disordered proteins in overcrowded milieu: Membrane-less organelles, phase separation, and intrinsic disorder . Curr Opin Struct Biol 44 , 18 – 30 ( 2017 ). doi: 10.1016/j.sbi.2016.10.015 OpenUrl CrossRef PubMed 12. ↵ Borcherds , W. , Bremer , A. , Borgia , M. B. & Mittag , T . How do intrinsically disordered protein regions encode a driving force for liquid-liquid phase separation? Curr Opin Struct Biol 67 , 41 – 50 ( 2021 ). 
doi: 10.1016/j.sbi.2020.09.004 OpenUrl CrossRef PubMed 13. ↵ Mitrea , D. M. et al. Nucleophosmin integrates within the nucleolus via multi-modal interactions with proteins displaying R-rich linear motifs and rRNA . Elife 5 ( 2016 ). doi: 10.7554/eLife.13571 OpenUrl CrossRef PubMed 14. Mintz , P. J. , Patterson , S. D. , Neuwald , A. F. , Spahr , C. S. & Spector , D. L . Purification and biochemical characterization of interchromatin granule clusters . EMBO J 18 , 4308 – 4320 ( 1999 ). doi: 10.1093/emboj/18.15.4308 OpenUrl Abstract / FREE Full Text 15. ↵ Mittag , T. & Parker , R . Multiple Modes of Protein-Protein Interactions Promote RNP Granule Assembly . J Mol Biol 430 , 4636 – 4649 ( 2018 ). doi: 10.1016/j.jmb.2018.08.005 OpenUrl CrossRef PubMed 16. ↵ Kim , H. J. et al. Mutations in prion-like domains in hnRNPA2B1 and hnRNPA1 cause multisystem proteinopathy and ALS . Nature 495 , 467 – 473 ( 2013 ). doi: 10.1038/nature11922 OpenUrl CrossRef PubMed Web of Science 17. ↵ Vernon , R. M. et al. Pi-Pi contacts are an overlooked protein feature relevant to phase separation . Elife 7 ( 2018 ). doi: 10.7554/eLife.31486 OpenUrl CrossRef PubMed 18. Lancaster , A. K. , Nutter-Upham , A. , Lindquist , S. & King , O. D . PLAAC: a web and command-line application to identify proteins with prion-like amino acid composition . Bioinformatics 30 , 2501 – 2502 ( 2014 ). doi: 10.1093/bioinformatics/btu310 OpenUrl CrossRef PubMed 19. Hughes , M. P. et al. Atomic structures of low-complexity protein segments reveal kinked beta sheets that assemble networks . Science 359 , 698 – 701 ( 2018 ). doi: 10.1126/science.aan6398 OpenUrl Abstract / FREE Full Text 20. Meszaros , B. et al. PhaSePro: the database of proteins driving liquid-liquid phase separation . Nucleic Acids Res 48 , D360 – D367 ( 2020 ). doi: 10.1093/nar/gkz848 OpenUrl CrossRef PubMed 21. ↵ Cai , H. , Vernon , R. M. & Forman-Kay , J. D . 
An Interpretable Machine-Learning Algorithm to Predict Disordered Protein Phase Separation Based on Biophysical Interactions . Biomolecules 12 ( 2022 ). doi: 10.3390/biom12081131 OpenUrl CrossRef PubMed 22. ↵ Chen , Z. et al. Screening membraneless organelle participants with machine-learning models that integrate multimodal features . Proc Natl Acad Sci U S A 119 , e2115369119 ( 2022 ). doi: 10.1073/pnas.2115369119 OpenUrl CrossRef PubMed 23. ↵ Saar , K. L. et al. Learning the molecular grammar of protein condensates from sequence determinants and embeddings . Proc Natl Acad Sci U S A 118 ( 2021 ). doi: 10.1073/pnas.2019053118 OpenUrl Abstract / FREE Full Text 24. Orlando , G. et al. Computational identification of prion-like RNA-binding proteins that form liquid phase-separated condensates . Bioinformatics 35 , 4617 – 4623 ( 2019 ). doi: 10.1093/bioinformatics/btz274 OpenUrl CrossRef PubMed 25. ↵ Ibrahim , A. Y. et al. Intrinsically disordered regions that drive phase separation form a robustly distinct protein class . J Biol Chem 299 , 102801 ( 2023 ). doi: 10.1016/j.jbc.2022.102801 OpenUrl CrossRef PubMed 26. Murthy , A. C. et al. Molecular interactions underlying liquid−liquid phase separation of the FUS low-complexity domain . Nature Structural & Molecular Biology 26 , 637 – 648 ( 2019 ). doi: 10.1038/s41594-019-0250-x OpenUrl CrossRef PubMed 27. ↵ Das , S. , Lin , Y. H. , Vernon , R. M. , Forman-Kay , J. D. & Chan , H. S . Comparative roles of charge, pi, and hydrophobic interactions in sequence-dependent phase separation of intrinsically disordered proteins . Proc Natl Acad Sci U S A 117 , 28795 – 28805 ( 2020 ). doi: 10.1073/pnas.2008122117 OpenUrl Abstract / FREE Full Text 28. ↵ Hardenberg , M. , Horvath , A. , Ambrus , V. , Fuxreiter , M. & Vendruscolo , M . Widespread occurrence of the droplet state of proteins in the human proteome . Proc Natl Acad Sci U S A 117 , 33254 – 33262 ( 2020 ). 
doi: 10.1073/pnas.2007670117 OpenUrl Abstract / FREE Full Text 29. ↵ Bolognesi , B. et al. A Concentration-Dependent Liquid Phase Separation Can Cause Toxicity upon Increased Protein Expression . Cell Rep 16 , 222 – 231 ( 2016 ). doi: 10.1016/j.celrep.2016.05.076 OpenUrl CrossRef PubMed 30. ↵ Liang , Q. et al. MolPhase, an advanced prediction algorithm for protein phase separation . EMBO J 43 , 1898 – 1918 ( 2024 ). doi: 10.1038/s44318-024-00090-9 OpenUrl CrossRef PubMed 31. ↵ Hadarovich , A. et al. PICNIC accurately predicts condensate-forming proteins regardless of their structural disorder across organisms . Nat Commun 15 , 10668 ( 2024 ). doi: 10.1038/s41467-024-55089-x OpenUrl CrossRef PubMed 32. ↵ Berman , H. M. et al. The Protein Data Bank . Nucleic Acids Res 28 , 235 – 242 ( 2000 ). doi: 10.1093/nar/28.1.235 OpenUrl CrossRef PubMed Web of Science 33. ↵ Pancsa , R. , Vranken , W. & Meszaros , B . Computational resources for identifying and describing proteins driving liquid-liquid phase separation . Brief Bioinform 22 ( 2021 ). doi: 10.1093/bib/bbaa408 OpenUrl CrossRef 34. ↵ Tripathi , S. et al. Defining the condensate landscape of fusion oncoproteins . Nat. Commun . 14 ( 2023 ). doi: 10.1038/s41467-023-41655-2 OpenUrl CrossRef PubMed 35. ↵ von Bülow , S. , Tesei , G. & Lindorff-Larsen , K . Prediction of phase separation propensities of disordered proteins from sequence . bioRxiv , 2024.2006.2003.597109 ( 2024 ). doi: 10.1101/2024.06.03.597109 OpenUrl Abstract / FREE Full Text 36. ↵ Shirnekhi , H. K. , Chandra , B. & Kriwacki , R. W . The Role of Phase-Separated Condensates in Fusion Oncoprotein–Driven Cancers . Annual Review of Cancer Biology 7 , 73 – 91 ( 2023 ). doi: 10.1146/annurev-cancerbio-061421-122050 OpenUrl CrossRef 37. ↵ Meszaros , B. , Erdos , G. & Dosztanyi , Z . IUPred2A: context-dependent prediction of protein disorder as a function of redox state and protein binding . Nucleic Acids Res 46 , W329 – W337 ( 2018 ). 
doi: 10.1093/nar/gky384 OpenUrl CrossRef PubMed 38. ↵ Emenecker , R. J. , Griffith , D. & Holehouse , A. S . Metapredict: a fast, accurate, and easy-to-use predictor of consensus disorder and structure . Biophys J 120 , 4312 – 4319 ( 2021 ). doi: 10.1016/j.bpj.2021.08.039 OpenUrl CrossRef 39. ↵ Zhou , J. , Oldfield , C. J. , Yan , W. , Shen , B. & Dunker , A. K . Intrinsically disordered domains: Sequence → disorder → function relationships . Protein Sci 28 , 1652 – 1663 ( 2019 ). doi: 10.1002/pro.3680 OpenUrl CrossRef PubMed 40. ↵ Baggett , D. W. et al. An Image Analysis Pipeline for Quantifying the Features of Fluorescently-Labeled Biomolecular Condensates in Cells . Front Bioinform 2 , 897238 ( 2022 ). doi: 10.3389/fbinf.2022.897238 OpenUrl CrossRef PubMed 41. ↵ Kawashima , S. et al. AAindex: amino acid index database, progress report 2008 . Nucleic Acids Research 36 , D202 – D205 ( 2008 ). doi: 10.1093/nar/gkm998 OpenUrl CrossRef PubMed Web of Science 42. ↵ Wang , J. et al. A Molecular Grammar Governing the Driving Forces for Phase Separation of Prion-like RNA Binding Proteins . Cell 174 , 688 – 699 e616 ( 2018 ). doi: 10.1016/j.cell.2018.06.006 OpenUrl CrossRef PubMed 43. ↵ LeDell , E. & Poirier , S . H2o automl: Scalable automatic machine learning . Proceedings of the AutoML Workshop at ICML 2020 ( 2020 ). 44. ↵ Lundberg , S. M. & Lee , S. I. A Unified Approach to Interpreting Model Predictions . Adv Neur In 30 ( 2017 ). 45. ↵ Wiedner , H. J. & Giudice , J . It’s not just a phase: function and characteristics of RNA-binding proteins in phase separation . Nat Struct Mol Biol 28 , 465 – 473 ( 2021 ). doi: 10.1038/s41594-021-00601-w OpenUrl CrossRef PubMed 46. Forman-Kay , J. D. , Ditlev , J. A. , Nosella , M. L. & Lee , H. O . What are the distinguishing features and size requirements of biomolecular condensates and their implications for RNA-containing condensates? Rna 28 , 36 – 47 ( 2022 ). doi: 10.1261/rna.079026.121 OpenUrl Abstract / FREE Full Text 47. 
↵ Giudice , J. & Jiang , H . Splicing regulation through biomolecular condensates and membraneless organelles . Nat Rev Mol Cell Biol 25 , 683 – 700 ( 2024 ). doi: 10.1038/s41580-024-00739-7 OpenUrl CrossRef PubMed 48. ↵ Hou , C. et al. PhaSepDB in 2022: annotating phase separation-related proteins with droplet states, co-phase separation partners and other experimental information . Nucleic Acids Res 51 , D460 – D465 ( 2023 ). doi: 10.1093/nar/gkac783 OpenUrl CrossRef PubMed 49. ↵ Pan , X. et al. MCSdb, a database of proteins residing in membrane contact sites . Sci Data 11 , 281 ( 2024 ). doi: 10.1038/s41597-024-03104-7 OpenUrl CrossRef 50. ↵ Scorrano , L. et al. Coming together to define membrane contact sites . Nat Commun 10 , 1287 ( 2019 ). doi: 10.1038/s41467-019-09253-3 OpenUrl CrossRef PubMed 51. ↵ Paiz , E. A. et al. Beta turn propensity and a model polymer scaling exponent identify intrinsically disordered phase-separating proteins . J Biol Chem 297 , 101343 ( 2021 ). doi: 10.1016/j.jbc.2021.101343 OpenUrl CrossRef PubMed 52. ↵ Garcia-Jove Navarro , M. , et al. RNA is a critical element for the sizing and the composition of phase-separated RNA-protein condensates . Nat Commun 10 , 3230 ( 2019 ). doi: 10.1038/s41467-019-11241-6 OpenUrl CrossRef PubMed 53. Lin , Y. & Fang , X . Phase separation in RNA biology . J Genet Genomics 48 , 872 – 880 ( 2021 ). doi: 10.1016/j.jgg.2021.07.012 OpenUrl CrossRef PubMed 54. ↵ Rhine , K. , Vidaurre , V. & Myong , S. RNA Droplets . Annu Rev Biophys 49 , 247 – 265 ( 2020 ). doi: 10.1146/annurev-biophys-052118-115508 OpenUrl CrossRef PubMed 55. ↵ Feng , M. et al. Decoding Missense Variants by Incorporating Phase Separation via Machine Learning . Nat Commun 15 , 8279 ( 2024 ). doi: 10.1038/s41467-024-52580-3 OpenUrl CrossRef PubMed 56. ↵ Feric , M. & Misteli , T . Phase separation in genome organization across evolution . Trends Cell Biol 31 , 671 – 685 ( 2021 ). doi: 10.1016/j.tcb.2021.03.001 OpenUrl CrossRef PubMed 57. 
↵ Qian , Z. G. , Huang , S. C. & Xia , X. X . Synthetic protein condensates for cellular and metabolic engineering . Nat Chem Biol 18 , 1330 – 1340 ( 2022 ). doi: 10.1038/s41589-022-01203-3 OpenUrl CrossRef PubMed 58. ↵ Song , S. et al. Synthetic Biomolecular Condensates: Phase-Separation Control, Cytomimetic Modelling and Emerging Biomedical Potential . Angew Chem Int Ed Engl , e202418431 ( 2024 ). doi: 10.1002/anie.202418431 OpenUrl CrossRef 59. ↵ Mitrea , D. M. et al. Self-interaction of NPM1 modulates multiple mechanisms of liquid-liquid phase separation . Nat Commun 9 , 842 ( 2018 ). doi: 10.1038/s41467-018-03255-3 OpenUrl CrossRef PubMed 60. ↵ Wang , A. et al. A single N-terminal phosphomimic disrupts TDP-43 polymerization, phase separation, and RNA splicing . EMBO J 37 ( 2018 ). doi: 10.15252/embj.201797452 OpenUrl CrossRef PubMed 61. ↵ Stringer , C. , Wang , T. , Michaelos , M. & Pachitariu , M . Cellpose: a generalist algorithm for cellular segmentation . Nat Methods 18 , 100 – 106 ( 2021 ). doi: 10.1038/s41592-020-01018-x OpenUrl CrossRef PubMed 62. ↵ Xiao , N. , Cao , D. S. , Zhu , M. F. & Xu , Q . S. protr/ProtrWeb: R package and web server for generating various numerical representation schemes of protein sequences . Bioinformatics 31 , 1857 – 1859 ( 2015 ). doi: 10.1093/bioinformatics/btv042 OpenUrl CrossRef PubMed 63. ↵ Grau , J. , Grosse , I. & Keilwagen , J . PRROC: computing and visualizing precision-recall and receiver operating characteristic curves in R . Bioinformatics 31 , 2595 – 2597 ( 2015 ). OpenUrl CrossRef PubMed 64. ↵ McInnes , L. , Healy , J. & Melville , J. Umap: Uniform manifold approximation and projection for dimension reduction . arXiv preprint arXiv:1802.03426 ( 2018 ). 65. ↵ Sherman , B. T . et al. DAVID: a web server for functional enrichment analysis and functional annotation of gene lists (2021 update) . Nucleic Acids Research 50 , W216 – W221 ( 2022 ). doi: 10.1093/nar/gkac194 OpenUrl CrossRef PubMed 66. ↵ Sayols , S. 
rrvgo: a Bioconductor package for interpreting lists of Gene Ontology terms . MicroPubl Biol 2023 ( 2023 ). doi: 10.17912/micropub.biology.000811 OpenUrl CrossRef 67. ↵ Wang , J. Z. , Du , Z. D. , Payattakool , R. , Yu , P. S. & Chen , C. F . A new method to measure the semantic similarity of GO terms . Bioinformatics 23 , 1274 – 1281 ( 2007 ). doi: 10.1093/bioinformatics/btm087 OpenUrl CrossRef PubMed Web of Science View the discussion thread. Back to top Previous Next Posted March 11, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Proteome-wide computational analyses reveal links between protein condensate formation and RNA biology Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Proteome-wide computational analyses reveal links between protein condensate formation and RNA biology Snigdha Maiti , Swarnendu Tripathi , David W Baggett , Aaron H. Phillips , Cheon-Gil Park , Jina Wang , Wahiduzzaman , William T Freyaldenhoven , Swati Kinger , Brittany Pioso , John Bollinger , Ramiz Somjee , Benjamin Lang , M. Madan Babu , Richard W. Kriwacki bioRxiv 2025.03.03.640993; doi: https://doi.org/10.1101/2025.03.03.640993 Share This Article: Copy Citation Tools Proteome-wide computational analyses reveal links between protein condensate formation and RNA biology Snigdha Maiti , Swarnendu Tripathi , David W Baggett , Aaron H. 
Phillips , Cheon-Gil Park , Jina Wang , Wahiduzzaman , William T Freyaldenhoven , Swati Kinger , Brittany Pioso , John Bollinger , Ramiz Somjee , Benjamin Lang , M. Madan Babu , Richard W. Kriwacki bioRxiv 2025.03.03.640993; doi: https://doi.org/10.1101/2025.03.03.640993 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7633) Biochemistry (17681) Bioengineering (13890) Bioinformatics (41929) Biophysics (21446) Cancer Biology (18586) Cell Biology (25492) Clinical Trials (138) Developmental Biology (13374) Ecology (19897) Epidemiology (2067) Evolutionary Biology (24308) Genetics (15606) Genomics (22497) Immunology (17736) Microbiology (40385) Molecular Biology (17175) Neuroscience (88584) Paleontology (666) Pathology (2831) Pharmacology and Toxicology (4822) Physiology (7641) Plant Biology (15149) Scientific Communication and Education (2045) Synthetic Biology (4293) Systems Biology (9822) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00