Machine Learning-Based Identification of Sickle Cell Disease Subphenotypes in Clinical Trial Data

preprint OA: closed CC-BY-4.0
📄 Open PDF Full text JSON View at publisher
Full text 87,349 characters · extracted from preprint-html · click to expand
Machine Learning-Based Identification of Sickle Cell Disease Subphenotypes in Clinical Trial Data | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Machine Learning-Based Identification of Sickle Cell Disease Subphenotypes in Clinical Trial Data View ORCID Profile Wei Xiao , Patricia Oneal , View ORCID Profile Menglun Wang , Nihar J. 
Mehta , Qi Liu , Rongmei Zhang , Susan Perrine , Qin Ryan doi: https://doi.org/10.1101/2025.06.01.25328537 Wei Xiao 1 Office of New Drugs, Center for Drug Evaluation (CDER) , Food and Drug Administration (FDA) Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Wei Xiao Patricia Oneal 1 Office of New Drugs, Center for Drug Evaluation (CDER) , Food and Drug Administration (FDA) Find this author on Google Scholar Find this author on PubMed Search for this author on this site Menglun Wang 2 Office of Clinical Pharmacology , CDER, FDA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Menglun Wang Nihar J. Mehta 1 Office of New Drugs, Center for Drug Evaluation (CDER) , Food and Drug Administration (FDA) Find this author on Google Scholar Find this author on PubMed Search for this author on this site Qi Liu 2 Office of Clinical Pharmacology , CDER, FDA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rongmei Zhang 3 Office of Biostatistics , CDER Find this author on Google Scholar Find this author on PubMed Search for this author on this site Susan Perrine 1 Office of New Drugs, Center for Drug Evaluation (CDER) , Food and Drug Administration (FDA) Find this author on Google Scholar Find this author on PubMed Search for this author on this site Qin Ryan 1 Office of New Drugs, Center for Drug Evaluation (CDER) , Food and Drug Administration (FDA) Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: qin.ryan{at}fda.hhs.gov Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Sickle Cell Disease (SCD) is a rare autosomal recessive disorder caused by a point mutation producing abnormal hemoglobin S, leading to deformed red blood cells and a wide range of clinical manifestations, including pain crises, organ damage, and 
an increased risk of infection. These devastating complications often result in significant morbidity and early mortality, presenting significant therapeutic challenges. Currently, there is a lack of clinically validated predictive tools to assess individual SCD patients’ prognoses and therapeutic responses. This is largely due to the complexity and variability of the clinical manifestations, which vary widely among patients. As a result, there remains an unmet need for a systematic approach to SCD subphenotype classification that can guide and tailor therapeutic strategies, predict outcomes, and improve patients’ lives. Over a decade ago, two clinical subphenotypes in SCD were proposed based on literature and clinical observations. However, this concept has not been applied or explored in the design of clinical trials (CT). Recent advances in machine learning (ML) applications in medicine, and the growing availability of SCD clinical trial data evaluating therapeutics which target different pathophysiologic aspects of the disease, provide an opportunity to enhance understanding of therapeutic responses within SCD populations. Applying ML techniques to a large CT database could support development of robust disease models capable of identifying and validating disease subphenotypes, with potential to predict outcomes to specific therapies based on mechanism of action and to optimize care in SCD. In this study, we constructed a comprehensive database comprising 3,551 patients with SCD from 16 clinical trials that supported therapeutic approvals for SCD. Using this database, we applied a machine learning pipeline to develop a rule-based classification method, which identified two distinct clinical subphenotypes of SCD: the Vaso-occlusive Primary (VP) subphenotype, primarily characterized by a higher frequency of vaso-occlusive pain crises, and the Hemolytic Dominant (HD) subphenotype, characterized by chronic hemolysis and its associated complications. 
Biomarker comparisons demonstrated that the VP subphenotype was associated with a significantly higher annual rate of vasoocclusive crisis events, significantly higher levels of total and fetal hemoglobin, and leukocytosis, while the HD subphenotype exhibited significantly higher levels of hemolysis-related biomarkers of indirect bilirubin. The biomarker profiles were validated using an independent clinical trial dataset, which confirmed these two subphenotypes in SCD. Our study demonstrated that the integration of ML with disease pathophysiology enables robust identification of clinically meaningful subphenotypes of SCD from an international clinical trial database. This approach provides a basis for developing predictive disease models, which may optimize treatment strategies and improve patients’ outcomes. Further, our methodological framework offers a scalable model for application to identify subsets in other rare genetic diseases. Introduction Sickle Cell Disease (SCD), a group of autosomal recessive disorders caused by mutations in the beta globin gene of adult hemoglobin (HbA, α 2 β 2 ), includes sickle cell anemia (HbSS), HbSC, HbS-β-thalassemia, and other HbS-related hemoglobinopathies ( 1 – 5 ). Due to the genetic and pathophysiological variability across these subtypes, clinical manifestations and associated laboratory biomarkers vary significantly from patient to patient ( 6 – 15 ). This heterogeneity creates an unmet need for predictive phenotypic disease models to optimize potential treatments, estimate short- and long-term disease outcomes, and guide therapeutic development for subjects with SCD to those more likely to benefit from therapies which address their phenotype. Despite decades of research, individuals with SCD continue to experience poor health outcomes and lower life expectancy than the unaffected population ( 26 , 27 ). 
Supportive treatments have been developed for identified risks such as prophylactic antibiotics, targeted vaccines, red blood cell transfusions, pain management, hydroxyurea therapy, and, for those with donors, stem cell transplantation ( 16 – 23 ). Although a few therapeutics were recently approved, clinical trials of well-designed therapies have faced significant challenges related to the variability of disease complications ( 24 , 25 ). SCD is characterized by chronic hemolytic anemia and recurrent vaso-occlusive crises, leading to a wide range of complications. However, prognostication remains difficult due to gaps in our current understanding of disease progression and the lack of robust, individualized predictive models ( 26 – 28 ). Although a number of laboratory and genetic modifiers have been identified ( 67 – 81 ), biological and clinical prognostic factors that reliably predict outcomes or survival in SCD have not been studied in large clinical trial datasets. Previous reports have hypothesized two major SCD subphenotypes based on the predominance of two pathophysiologic mechanisms—cell adhesion to the vasculature causing vaso-occlusive crises, or hemolysis and endothelial dysfunction, respectively ( 29 – 31 ). However, distinguishing between these subphenotypes in practice is complicated by overlapping clinical presentations, and this hypothesis has not been evaluated in clinical trials. In this study, we applied machine learning (ML) modeling and computational analyses to a large, curated clinical trial database of patients with SCD, to identify distinct disease subphenotypes. This approach aims to enhance understanding of disease heterogeneity, evaluate how specific subphenotypes respond to therapies directed to a predominant pathophysiology, and provide a foundation for predictive algorithms that can inform future clinical trial design. Moreover, our ML pipeline offers a scalable framework that may also benefit research in other rare genetic diseases. 
Methods Database The clinical trial baseline database utilized in this study was constructed from 16 clinical trials in SCD submitted to the U.S. Food and Drug Administration (FDA) between 2017 and 2021. Data were obtained from 3,551 patients with baseline clinical data encompassing demographics, genotypes, disease characteristics, medical history of SCD-related clinical complications, and laboratory test results collected prior to any trial interventions ( Figure 1 ). Most baseline data were obtained within 12 months before enrollment. Notably, enrolled patients were permitted to be on a stable dose of hydroxyurea before enrollment in these trials. Additionally, data from an independent study, the Multicenter Study of Hydroxyurea in Sickle Cell Anemia (MSH) ( 32 ), provided by the National Institutes of Health, National Heart, Lung, and Blood Institute, which randomized participants to hydroxyurea or placebo, were used for model evaluation. Download figure Open in new tab Figure 1. Data processing workflow. 1. Data Curation: SCD patients’ demographics, clinical characteristics, and biomarkers were extracted from multiple clinical trials submitted to the FDA. Patients with insufficient data were excluded from the analysis. 2. Subtype Classification Pipeline: Multiple Correspondence Analysis (MCA) was applied to the clinical features, obtaining coordinates for each category of clinical features across the principal components. K-means clustering was then performed on these coordinates, resulting in the classification of clinical features into two distinct clusters. 3. Downstream Analysis: Based on the clustering results, two subphenotypes of SCD were defined, and the differences in biomarkers between the two subphenotypes were subsequently compared. 
Variable Selection For disease subphenotype classification, we selected seven clinical features commonly recognized as SCD complications: vaso-occlusive pain crisis (VOC), acute chest syndrome (ACS), avascular necrosis (AVN), pulmonary hypertension (PHT), stroke, leg ulcers (ulcers), and priapism. Each feature was recorded as a binary indicator at baseline for each patient: “Yes” if the feature had occurred at least once and “No” if it had never occurred during the entire baseline data collection period. Patients with data missing on any of these variables were excluded from the analysis, resulting in a final dataset of 2,887 patients, representing 81.3% of the total database. Clustering Analysis To identify potential disease subphenotypes, we conducted Multiple Correspondence Analysis (MCA) on a dataset comprising 2,887 patients with complete data for seven binary clinical features: VOC, ACS, AVN, PHT, stroke, ulcers, and priapism. MCA ( 33 ) is a dimensionality reduction technique suitable for categorical data, transforming the original variables into a set of principal components that capture the underlying structure of the data. In our analysis, each distinct category (“Yes” or “No”) of the seven clinical features, 14 categories in total, was represented in the transformed space defined by seven principal components, resulting in seven coordinates for each of the 14 categories. Subsequently, we applied K-means clustering ( 34 ) to the coordinates derived from the first five principal components, specifying two clusters as the target ( Figure 1 ). Downstream Primary and Secondary Analysis We defined two SCD subphenotypes based on clustering results and assigned patients accordingly. In the primary analysis, 2,887 patients from our clinical trial database were categorized into these subphenotypes. 
To assess baseline differences, we selected several disease-relevant laboratory biomarkers, including hemoglobin (Hb), fetal hemoglobin (HbF), lactate dehydrogenase (LDH), reticulocytes, indirect bilirubin, neutrophils, and leukocytes ( Figure 1 ). Additionally, the annual rate of VOC was included as a numerical variable. The Mann-Whitney U test ( 35 ) was employed for continuous variables, while the chi-square test ( 36 ) was used for the discrete variable of annual VOC rate. For secondary analysis, we applied our rule-based subphenotype classification to individual patient data (n=299) from an independent SCD clinical trial, the Multicenter Study of Hydroxyurea in Sickle Cell Anemia ( 32 ). Similar statistical methods were utilized to evaluate biomarkers and the annual VOC rate in this cohort to validate the primary analysis. Results Clinical Feature Clustering Previous research has hypothesized that sickle cell disease comprises two primary subphenotypes driven by distinct underlying mechanisms ( 28 – 29 ). One subphenotype, associated with hemolysis and low steady-state hemoglobin levels, is more likely to experience complications related to nitric oxide consumption by free hemoglobin, such as pulmonary hypertension (PHT), stroke, leg ulcers (ulcers), and priapism. The other subphenotype is characterized by vaso-occlusive events, including pain crisis (VOC), acute chest syndrome (ACS), and avascular necrosis (AVN), and is associated with higher steady-state leukocyte counts and relatively higher hemoglobin levels, and associated with increased red cell adhesion to endothelium ( 30 ). To investigate this hypothesis in clinical trial data, we curated a database comprising 3,551 patients from 16 clinical trials that supported drug approvals for treatment of SCD. After excluding 664 patients with incomplete clinical feature data, 2,887 patients (81.3% of the database) remained for analysis. 
Demographic characteristics of these patients are summarized in Supplementary Table 1 . The clinical features were selected to assess potential stratification within this trial dataset that align with the hypothesized subphenotypes ( 30 ), potentially elucidating variations in disease severity and complications. The data processing workflow is illustrated in Figure 1 . View this table: View inline View popup Download powerpoint Supplementary Table 1. Summary of SCD population from clinical trial database Disease variable assessment was conducted based on clinical relevance and data availability, resulting in the selection of seven primary clinical features as subjects for our analytical pipeline: VOC, ACS, AVN, PHT, stroke, ulcers, and priapism. Although jaundice is a common clinical feature of SCD, it was excluded due to significant missing data. After several methodology explorations, to analyze binary clinical features (“Yes” or “No”), we applied Multiple Correspondence Analysis (MCA) ( 33 ), a dimensional reduction technique for categorical data. The MCA provided percentage contributions of each clinical feature category to the principal components ( Supplementary Figure 1 ). We combined the “Yes” and “No” categories for each clinical feature by percentage contributions to clarify their significance within each principal component ( Figure 2A ). The relatively even weight of each feature in explaining the data variance ( Figure 2B ) led to a rearrangement of principal components, revealing two distinct modules ( Figure 2A ). Notably, PHT, stroke, ulcers, and priapism clustered together, suggesting a common underlying mechanism. Interestingly, these clinical features were related to the hemolytic subphenotype in previous reports by Kato et al ( 28 – 29 , 37 ). Download figure Open in new tab Supplementary Figure 1. The percentage contribution of each category (“Yes” or “No”) within each clinical feature to each principal component. 
Download figure Open in new tab Figure 2. Identification of the two subphenotypes of SCD based on clinical features. (A) The percentage contributions of each clinical feature to each principal component (the two categories, “Yes” and “No”, of each clinical feature are combined for clearer pattern visualization). (B) The proportion of data variance explained by each principal component. (C) The projection of both categories of all clinical features along the first two principal components, with circles highlighting the two subphenotypes identified by K-means clustering. The green circle represents the Vaso-Occlusive Primary (VP) subphenotype, while the pink circle represents the Hemolysis Dominant (HD) subphenotype. (D) Age distributions across combined subsets of gender and subphenotype. (E) A heatmap visualizing clinical features and laboratory biomarkers for patients with SCD in the baseline dataset. The first two rows represent the SCD subphenotype and sex of each patient, with patient ordering prioritized by SCD subphenotype, followed by sex, and then by age. Clinical and laboratory features, shown in subsequent rows, are normalized to a 0-100 scale to standardize across diverse measurement ranges (reference to laboratory upper normal limit of healthy people, Supplementary Table 5 ). The yellow color in the heatmap represents patients with missing laboratory biomarkers. Subsequently, we utilized the coordinates of the first five principal components, which accounted for 76.25% of the variance, and performed K-means clustering specifying two clusters. The clustering results ( Figure 2C ) indicated that the “Yes” categories of PHT, stroke, ulcers, and priapism grouped into one cluster, while the “No” categories of these features, along with VOC, ACS and AVN, formed a second cluster. These findings support the existence of two SCD subphenotypes: one associated with hemolysis-related clinical features, and the other without. 
Subphenotype Definition and Comparison Based on the K-means clustering results of seven binary clinical features, we identified two SCD subphenotypes. The Vasoocclusive Primary (VP) subphenotype was characterized by the absence of PHT, stroke, ulcers, or priapism. In contrast, the Hemolysis-Dominant (HD) subphenotype was defined by the presence of at least one of these features. Among the 2,887 patients analyzed in our rule-based subphenotype classification, 2,334 were classified as VP and 553 as HD ( Figure 2E ). The distribution of clinical features and laboratory biomarkers analyzed for ML subtyping at the subject level is depicted in Figure 2E . In this figure, subjects were organized by HD and VP subphenotypes, followed by sex (female and male), and then by ascending age. The age ranges were as follows: HD females and HD males, 3-67 years; VP females, 1-70 years; and VP males, 0-65 years. Differences in sex and age distributions between the two subphenotypes were found. Females were significantly older than males in both subphenotypes and overall, and patients in the HD subphenotype were generally older than those in the VP subphenotype ( Figure 2D and Supplementary Figure 2 ). Download figure Open in new tab Supplementary Figure 2. Age distributions across gender and subphenotype subsets. (A) Percentage histograms of age distributions for the two subphenotypes, with density curves overlaid. (B) The summary statistics table for each subset shown in Panel A and Figure 2D . VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. To elucidate the definitions of the two subphenotypes, seven clinical features used for classification were visualized in Figure 2E . While PHT, stroke, ulcers and priapism were present exclusively in the HD subphenotype, VOC, AVN, and ACS occurred in both subphenotypes with varying prevalence ( Supplementary Table 2 ). Additionally, laboratory biomarkers were displayed in Figure 2E , with all values normalized to a 0-100 scale. 
Given the strong association between age and biomarker values, age-dependent scaling for each biomarker was applied during the normalization ( Supplementary Table 3 ). We compared numerical differences in laboratory biomarkers and a numerical clinical feature, the annual rate of VOC, between the two subphenotypes. As depicted in Figure 2E and Supplementary Table 4 , a notable proportion of subjects had missing data for certain laboratory biomarkers. To address this, we conducted separate analyses for each biomarker, excluding only those patients who lacked data for the specific biomarker under consideration. The number of patients involved in each analysis is listed in Figure 3 . Download figure Open in new tab Figure 3. Violin plots showing differences in laboratory biomarkers between the VP and HD subphenotypes at baseline of the Clinical Trial population. In each plot, the median value is represented by a dashed line, and the 75th and 25th percentiles are indicated by dotted lines. (A) Hemoglobin levels (g/dL, N total =2,583, N VP =2,084, N HD =502). (B) Fetal hemoglobin levels (%, N total =1,671, N VP =1,330, N HD =341). (C) Indirect bilirubin (mg/dL, value has been zoomed in for better visualization. N total =1,941, N VP =1,541, N HD =400). (D) Lactate dehydrogenase (LDH) levels (U/L, N total =1,623, N VP =1,295, N HD =328). (E) Reticulocyte counts (10 9 /L, N total =1,914, N VP =1,528, N HD =386). (F) Neutrophils (10 9 /L, N total =2,365, N VP =1,901, N HD =464). (G) Leukocytes (10 9 /L, N total =2,545, N VP =2,050, N HD =495). (H) A table showing mean biomarker levels in VP and HD subphenotypes, with p-values from the Mann-Whitney U test indicating statistical differences between the two groups. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. First, we compared baseline hemoglobin (Hb) levels between the VP and HD subphenotypes. 
VP patients exhibited a mean total Hb of 8.9 g/dL compared to 8.6 g/dL in HD patients (Mann-Whitney U test, p < 0.001, Figure 3A, 3H ). Similarly, fetal hemoglobin (HbF) values were slightly, but significantly, higher in the VP group, with a mean of 10.8%, compared to 9.98% in the HD group (Mann-Whitney U test, p = 0.012, Figure 3B, 3H ). Second, slightly, but significantly, lower baseline levels of indirect bilirubin were found in VP patients compared to HD patients, with mean values of 2.96 mg/dL and 3.44 mg/dL, respectively (Mann-Whitney U test, p = 0.002, Figure 3C, 3H ). As elevated indirect bilirubin is a known marker of hemolysis, its higher levels in the HD subphenotype support the rule-based classification. The ranges of each laboratory value are shown in Figure 3 . Despite overall higher levels of LDH exhibited in the database, mean differences between the two subphenotypes were not statistically significant (p = 0.089), nor were mean differences in absolute reticulocytes (p = 0.76) or absolute neutrophils (p = 0.21) (Mann-Whitney U test, Figure 3D-F ; means are tabulated in Figure 3H ). However, LDH levels in the HD subphenotype exhibited a bimodal distribution, with a subset of HD patients demonstrating substantially higher LDH levels ( Figure 3D , detail in Supplementary Figure 3 ). This distribution indicates heterogeneity within the HD subphenotype. Leukocyte counts were significantly higher in the VP subphenotype (mean: 9.93 x 10 9 /L) than in the HD patients (mean: 9.52 x 10 9 /L, p = 0.037, Figure 3G, 3H ). The higher leukocyte count in VP patients may be related to inflammatory responses associated with vaso-occlusive crises following vascular obstruction and tissue ischemia ( 37 , 38 ). Download figure Open in new tab Supplementary Figure 3. Density distribution of LDH by subphenotype (VP and HD) (U/L, N total =1,623, N VP =1,295, N HD =328). A bimodal distribution in HD is clearly visualized. Mann-Whitney U test p value is displayed. 
VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. Our baseline CT data demonstrate a distinct difference in the annual rate of VOC events between the two subphenotypes. Patients with the VP subphenotype experienced a significantly higher rate of painful crises compared to those with the HD subphenotype (chi-square test, p = 0.012). As shown in Figure 4 , a markedly higher number of subjects classified in the VP subphenotype experienced annual numbers of vaso-occlusive crises from 1-5/year, despite sizeable proportions of patients in both groups (VP: 39.5%, HD: 50.0%) who did not report any painful crises during the baseline data collection periods of the trials. Additionally, 1,072 patients (37.1% of the cohort) were excluded from this analysis due to missing data on baseline annual rate of VOC events. Consequently, the proportion of patients with at least one VOC event, as determined by this variable, differs from the binary clinical feature “VOC” used to define the VP subphenotype ( Supplementary Table 2 ). In summary, the VP subphenotype is characterized by a higher annual rate of VOC events, increased total hemoglobin levels, and higher steady-state leukocyte counts, whereas the HD subphenotype is marked by elevated hemolysis-related biomarkers, such as indirect bilirubin and LDH ( Table 1 ). View this table: View inline View popup Download powerpoint Supplementary Table 2. The Proportion of Subjects with Vaso-occlusive Features. This table presents the proportion of patients who have the listed clinical features (VOC, ACS, AVN) at baseline in the VP and HD subphenotypes. P-values from Chi-square tests are shown to assess the statistical significance of differences between subphenotypes. N VP = 2,334, N HD = 553. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. Download figure Open in new tab Figure 4. 
The figure shows the percentage of patients with various annual rates of VOC events at baseline, stratified by subphenotype. The patient counts in each group are labeled at the top of the bar. N total =1,815, N VP =1,479, N HD =336. The red dotted line divides patients without VOC events from patients with VOC events. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. View this table: View inline View popup Download powerpoint Table 1. Summary of subphenotyping analyses. Comparison of laboratory biomarkers and the VOC annual rates between VP and HD subphenotypes in the SCD clinical trial database and the data of MSH study. The table shows the trend of differences for each feature, with statistical significance indicated by asterisks (*). VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. Evaluation of Subphenotype Definition on the Multicenter Study of Hydroxyurea in Sickle Cell Anemia To verify our subphenotype definition, we conducted a comparable analysis using data from an independent trial, Multicenter Study of Hydroxyurea (MSH). The MSH study was a 1:1 randomized, double-blind, placebo-controlled clinical trial conducted from 1992-1995, enrolling 299 subjects aged 18 years and older with sickle cell anemia. The study evaluated the efficacy and safety of hydroxyurea, leading to its approval for treating adults with SCD in March 1998. At baseline, all MSH participants were treatment-naïve, having received only supportive care such as RBC transfusion and pain management. Therefore, the recorded baseline clinical features and laboratory biomarkers should reflect natural history of SCD in the absence of disease modifying therapy. The MSH trial collected data on five clinical features: ACS, AVN, stroke, ulcers, and priapism. PHT data were not recorded, possibly due to the limited access to definitive diagnostic tools like right heart catheterization at the time. 
Additionally, all (100%) participants had baseline VOC events recorded, as eligible patients were required to have had at least three VOCs during the year preceding enrollment. Demographic characteristics of the MSH cohort are summarized in Supplementary Table 5 . Initially, we attempted to input the five available clinical features from 299 patients in the MSH study using MCA and applied K-means clustering on coordinates of the first few principal components. However, clustering based on the first three to five principal components only grouped one or two of the three clinical features—stroke, ulcers, and priapism—together. This limited clustering is likely attributable to the small sample size and two unimputable clinical features, PHT (100% absence), and VOC (100% presence) in the MSH dataset. Consequently, we methodically determined that subphenotype definitions derived from our previous ML-modeling, which utilized seven clinical features, could be suitable for assessing MSH study data. Subsequently, we applied the previously established subphenotype definition to the MSH dataset using the five available clinical features. Patients were classified into two groups: the VP subphenotype, characterized by the absence of stroke, ulcers, or priapism at baseline, and the HD subphenotype, defined by the presence of at least one of these features. With this rule-based classification method, the 299 MSH patients were categorized into two groups: 166 VP patients and 133 HD patients. We then compared the same biomarkers and annual rate of VOC between these two subphenotypes. Our analysis revealed that VP patients exhibited significantly higher Hb levels (mean: 8.69 g/dL) compared to HD patients (mean: 8.23 g/dL, Mann-Whitney U test, p = 0.0023, Figure 5A, 5G ). Similarly, HbF showed the same trend as Hb, elevated in VP with mean HbF 5.75% compared to HD with mean HbF 4.25% (Mann-Whitney U test, p < 0.001, Figure 5B, 5G ). Download figure Open in new tab Figure 5. 
Violin plots showing differences in laboratory biomarkers between the VP and HD subphenotypes at baseline of MSH Study (N total =299, N VP =166, N HD =133). In each plot, the median value is represented by a dashed line, and the 75th and 25th percentiles are indicated by dotted lines. (A) Hemoglobin levels (g/dL). (B) Fetal hemoglobin levels (%). (C) Indirect bilirubin (mg/dL). (D) Reticulocyte counts (10⁹/L). (E) Neutrophils (10⁹/L). (F) Leukocytes (10⁹/L). (G) A table showing mean biomarker levels in VP and HD subphenotypes, with p-values from the Mann-Whitney U test indicating statistical differences between the two groups. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. In contrast, no statistically significant differences were found between the two subphenotypes in either indirect bilirubin (Mann-Whitney U test, p = 0.36, Figure 5C, 5G ) or reticulocyte counts (Mann-Whitney U test, p = 0.62, Figure 5D, 5G ). However, a trend toward higher indirect bilirubin levels in the HD subphenotype was noted. This lack of statistical significance may be attributed to the limited sample size of the MSH study and potential misclassification due to incomplete clinical feature data. Notably, LDH was not available in the MSH dataset. Further comparison showed no significant difference in neutrophil counts (Mann-Whitney U test, p = 0.13, Figure 5E, 5G ) and leukocyte counts (Mann-Whitney U test, p = 0.32, Figure 5F, 5G ) between the two subphenotypes. Nonetheless, leukocyte counts tended to be higher in the VP subphenotype compared to the HD subphenotype ( Figure 5F ). Additionally, we assessed the annual rate of VOC at baseline between the subphenotypes. No significant difference was found (chi-square test, p = 0.92, Supplementary Figure 4 ). Download figure Open in new tab Supplementary Figure 4. The figure shows the percentage of patients at each annual rate of VOC events at baseline in the MSH study, stratified by subphenotype. 
The patient counts in each group are labelled at the top of the bar. N Total = 299, N VP =166, N HD =133. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. Summary of Biomarker and the Annual Rate of VOC in VP and HD Subphenotypes Based on our downstream analyses in both the clinical trial database and the MSH study data, comparative patterns between the VP and HD subphenotypes for biomarkers and the annual rate of VOC, are summarized as presented in Table 1 . In the primary analyses with the clinical trial database, the VP subphenotype exhibited significantly higher Hb, HbF, leukocytes and annual VOC rates. Conversely, the HD subphenotype showed significantly elevated indirect bilirubin and a trend toward higher LDH. These findings strongly supported our rule-based subphenotype classification derived from clinical feature clustering analysis. In our secondary analyses using MSH data, the results from application of our rule-based subphenotype classification were generally consistent with our primary findings from the SCD clinical trial database ( Table 1 ). Even in instances where differences did not reach statistical significance, the observed trends aligned with those identified in the primary analyses. It is important to note that the baseline data from the MSH study represented the natural history of SCD, as participants were naïve to hydroxyurea and other disease modifying drugs at the time of enrollment. This provided a unique opportunity to validate our primary findings, which were derived from a mixed SCD patient population, some of whom had been exposed to hydroxyurea. However, the MSH dataset has limitations, including a smaller sample size and certain unfeasible clinical feature data, which may affect the precision of subphenotype classification. Despite these constraints, it remains the most suitable available dataset for evaluating our subphenotype classification. We will discuss these limitations further in the discussion. 
In conclusion, the clustering of clinical features, along with distinct patterns in biomarker levels and the annual rate of VOCs, supports the differentiation between vasoocclusive primary (VP) and hemolysis-dominant (HD) subphenotypes in sickle cell disease. Discussion Our study achieved its primary goal of developing a methodical and feasible ML pipeline to analyze a sizable international SCD clinical trial dataset. By integrating disease pathophysiology with computational methods, we identified clinical subphenotypes and laboratory biomarkers, laying the groundwork for future predictive disease modeling ( Figure 1 ). Using a combined MCA and K-means clustering approach, we uncovered hidden baseline patterns in clinical features of SCD and identified two distinct subphenotypes. The first, the hemolysis-dominant (HD) subphenotype, is defined by the presence of hemolytic clinical features, including PHT, stroke, priapism, and ulcers—all of which clustered together in our model, suggesting a shared underlying pathophysiological process ( Figure 2C ). This subphenotype exhibited significantly lower Hb levels and elevated markers of hemolysis, such as LDH and indirect bilirubin ( Figure 3 , Table 1 ). In contrast, the vasoocclusive primary (VP) subphenotype was characterized by a higher annual rate of VOC ( Figure 4 ) and absence of any hemolytic clinical features ( Figure 2C ). The VP subphenotype exhibited significantly higher Hb, HbF and leukocyte counts ( Figure 3 , Table 1 ). These patterns were further validated using the independent clinical study, MSH. Our findings underscore the clinical and pathophysiological heterogeneity of SCD. Hemolysis plays a direct role in complications such as PHT, stroke, priapism, and ulcers through mechanisms involving vascular dysfunction, increased blood viscosity, and inflammation. 
In contrast, complications like pain crises, ACS, and AVN are more directly linked to vascular occlusion and ischemic injury due to hemoglobin S polymerization and red cell sickling. Understanding these distinct pathophysiological pathways can inform clinical trial design and support the development of personalized treatment plans tailored to each subphenotype. Kato et al. previously hypothesized two SCD subphenotypes based on pathophysiology and literature review: ( 1 ) the viscosity vasoocclusive subphenotype characterized by VOC, ACS, and AVN; ( 2 ) the hemolytic endothelial dysfunction subphenotype marked by ulcers, PHT, priapism and stroke ( 15 , 23 , 29 , 31 ). Our rule-based ML classification derived from clinical trial data supports the general structure of Kato’s hypothesis. However, while Kato proposed bidirectional overlap between the two subphenotypes ( 23 ), our findings reveal a more unidirectional pattern. In our model, hemolytic features (PHT, priapism, stroke, and ulcers) are exclusively clustered within the HD subphenotype ( Figure 2C and 2E , Supplementary Figure 5 ). In contrast, vaso-occlusive features of VOC, ACS and AVN appeared in both subphenotypes, but with small differences in distribution: 65.9%, 23.5%, 21.0% in the VP subphenotype versus 60.4%, 24.6%, 27.3% in the HD subphenotype, respectively ( Supplementary Table 2 , Supplementary Figure 5 ). Although the differences in VOC and AVN distribution were statistically significant, these differences were relatively small. These findings suggest that hemolytic features play a more decisive role than vaso-occlusive features in driving our ML-based subphenotype classification. Unlike Kato’s bidirectional model, our results suggest a unidirectional overlap—vaso-occlusive features appear in both subphenotypes, but hemolytic features are largely confined to HD. 
These discrepancies may stem from differences in data source, granularity, sample size, and methodology between the two studies, warranting further investigation. Download figure Open in new tab Supplementary Figure 5. Clinical feature distribution across patients within subphenotypes. (A) A 3D bar plot illustrating the proportion of patients in each clinical feature combination group within the subphenotype. (B) A table detailing the group name, an explanation of the group’s clinical feature combination characteristics, and the patient count and percentage of groups within each subphenotype. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. Laboratory biomarker profiling in our rule-based subphenotypes model is aligned with the findings of Kato et al. ( 23 ). In our primary analysis using the clinical trial database, the mean Hb was significantly higher in the VP subphenotype compared to the HD subphenotype (although only by 0.3 g/dL, p < 0.001). This trend was also observed in the secondary analysis using the MSH study data, with a difference of 0.45 g/dL (p = 0.002, Figure 3H , 5G ). These findings align with the clinical intuition that higher Hb and HbF in the VP subphenotype could be the result of the body’s compensatory mechanisms to address ischemia and pain. Similarly, Kato et al. reported that higher Hb levels were associated with increased risk of vaso-occlusive complications (VOC, AVN, and ACS), whereas lower Hb levels were associated with hemolytic complications (ulcers, PHT, priapism and stroke) ( 23 ). Further validation of these predictive marker patterns in a larger SCD database is warranted. Higher levels of HbF can interfere with HbS polymerization, serving as a protective factor against vaso-occlusion. In our study, the VP subphenotype demonstrated significantly higher mean HbF values in both primary (clinical trial database) and secondary (MSH data) biomarker analyses ( Figure 3H , 5G ). 
However, the magnitudes of the difference in mean HbF between HD and VP subphenotypes varied: 0.8% in the clinical trial database vs. 1.5% in the MSH data. This discrepancy may be due to a substantial proportion of missing baseline HbF data in our clinical trial database – 43% in the VP subphenotype and 38.3% in the HD subphenotype ( Supplementary Table 4 ). In contrast, the MSH data had no missing baseline laboratory values. View this table: View inline View popup Download powerpoint Supplementary Table 3. Age-specific value ranges used for the normalization of laboratory biomarkers. These ranges were based on the literature reported for both normal individuals and patients with SCD. View this table: View inline View popup Download powerpoint Supplementary Table 4. Summary of missing data. Patient counts and percentages of missing data counts for biomarkers in the VP and HD subphenotypes. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. Another possible reason for the relatively higher mean HbF observed in the pooled clinical trial data compared to the MSH study is the difference in age distribution between the two datasets. The clinical trial database included 17.5% (n=506) patients younger than 10 years, and 37% (n=1,074) younger than 18 years ( Supplementary Table 1 ), whereas MSH participants were adults aged 18-54 years ( Supplementary Table 5 ). This age disparity could explain the HbF differences, as HbF concentrations in patients with SCD are influenced by age, sex and the geographical distribution of genetic modifiers ( 2 , 39 ). In individuals with SCD, HbF switching is delayed to 5-10 years of age ( 40 ), whereas in the normal population HbF declines approximately from 85% at birth to <1% at one year of age ( 14 ). View this table: View inline View popup Download powerpoint Supplementary Table 5. 
Summary of SCD population from MSH study In addition, all patients enrolled in the MSH study were hydroxyurea naïve; whereas, our clinical trial database included patients with varying baseline hydroxyurea exposure. Specifically, 1,270 patients (44.0%) were hydroxyurea naïve, 1,479 (51.2%) were receiving stable hydroxyurea treatment, and 138 (4.8%) had missing hydroxyurea exposure data ( Supplementary Table 1 ). Therefore, the smaller HbF difference observed between the HD and VP subphenotypes in the clinical trial database compared to the MSH study may be partly explained by the fact that over half of the patients (51.2%) in our clinical trial database were receiving hydroxyurea treatment at baseline. Another notable finding observed in both subphenotypes is that the mean HbF value in patients with the HbSS genotype is significantly lower than in those with an Sβ-thalassemia genotype ( Supplementary Table 7 ). Interestingly, among patients with an Sβ-thalassemia genotype, the mean HbF trended higher in the HD subphenotype compared to the VP subphenotype. In contrast, for patients with the SS genotype, the mean HbF trended lower in the HD subphenotype than in the VP subphenotype ( Supplemental Table 7 ). Although these findings align with clinical observations in SCD and support our rule-based classification, we were unable to definitively characterize whether patients with a primarily vaso-occlusive presentation tend to have higher HbF than patients with the hemolytic presentation using our current subphenotype models. This limitation is partly due to the parametric comparison method employed, where large variability in HbF values affected our ability to detect a consistent difference between subphenotypes. Moreover, the clinical trial data used in this study lacked comprehensive molecular genetic information for all subjects. 
The foundational observation of the protective role of higher HbF in young children with SCD, first described by Janet Watson ( 41 , 42 ), has driven the field toward increasingly refined genetic characterization of HbF gene expression in SCD. Over several decades, this has evolved into extensive subject-level genetic data collection-encompassing haplotypes, single-nucleotide polymorphism (SNP) or genome-wide association studies (GWAS), including regulatory elements such as X-or other chromosomal transcription factors that are associated with SCD ( 14 , 43 – 75 ). For instance, HbF values <10% driven by certain SNPs or GWAS loci have been associated with increased risk of cerebral vasculopathy in children with SCD ( 76 ). Without incorporating patient level molecular genetic data into our database, we were unable to adequately examine the relationship between HbF values and observed genotypic and phenotypic subgroups. Further, the ML subphenotypes could potentially be refined with additional hemolytic markers and subject level genetic data on deletional or nondeletional α-thalassemia ( 77 – 82 ). Further clinical trials incorporating this level of genetic detail could significantly enhance the precision of subphenotype classification in SCD. Aligning with Kato’s hypothesis, findings from our study revealed that the VP subphenotype had a significantly higher mean HbF value and a significantly higher mean leukocyte count compared to the HD subphenotype ( Figure 3 and Table 1 ). Further analyses revealed that HbF values were inversely correlated with absolute neutrophil and leukocyte counts in both VP and HD subphenotypes ( Supplementary Figure 6 ). This finding is consistent with a report on increases in cumulative level of HbF silencing factors secreted by leukocytes that suppress gamma-globin gene expression ( 83 ). In addition, our analysis did not identify correlations between HbF value and the annual rate of VOC, indirect bilirubin or LDH in either subphenotype. 
An exception was noted in the VP subphenotype, where HbF was inversely correlated with reticulocyte count ( Supplementary Figure 6 ). These results align with previous literature reports that increased HbF values are associated with a reduced frequency of VOC events, and fewer hemolysis related complications, such as an absence of leg ulcers in the Arabian Indian haplotype ( 53 ), but do not significantly influence hemolytic markers in other patients such as those with the severe HbS D-Punjab genotype ( 84 ). Download figure Open in new tab Supplementary Figure 6. Correlation matrices of the clinical feature and laboratory biomarkers for HD and VP Subphenotypes. (A) and (B) display the Pearson correlation coefficients between the clinical feature, annual rate of VOC, and all available laboratory biomarkers for the HD and VP subphenotypes, respectively. The lower triangular heatmaps show the correlation matrix, with correlation coefficients colored and labeled in the plot, and significant p-values indicated by asterisks (p < 0.001: ***, p < 0.01: **, p < 0.05: *). VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. The finding of older age in our HD subphenotype compared to the VP subphenotype is expected ( Figure 2D and Supplementary Figure 2 ), given that the HD subphenotype was defined by clinical conditions such as PHT, stroke, ulcers, and priapism, which are clinical conditions positively correlated with age. However, this age discrepancy introduces a potential confounding factor in our biomarker analysis, as many biomarkers are age dependent. To account for this, we conducted GLM analysis with the formula: biomarker ∼ subphenotype + age. This analysis confirmed that age is significantly associated with most biomarkers in our database. 
Nevertheless, subphenotype remained significantly associated with Hb and LDH, and showed a non-significant but consistent trend with indirect bilirubin ( Supplementary Table 6 ), suggesting that the contribution of subphenotype to these biomarker differences outweighs the impact of age. View this table: View inline View popup Download powerpoint Supplementary Table 6. Coefficients and p-values from Generalized Linear Models (GLM) assessing the association between biomarkers and subphenotype and age. The table presents the estimated coefficients for the effect of subphenotype and age on each biomarker, along with the corresponding p-values indicating statistical significance. A positive coefficient for subphenotype indicates higher biomarker levels in VP patients compared to HD patients, adjusted for age. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. View this table: View inline View popup Download powerpoint Supplementary Table 7. Multiple Comparison of Means using Tukey HSD for the comparison of HbF between combinations of genotype (HbSBeta and HbSS) and subphenotype (HD and VP). The p-values represent the adjusted significance levels (FWER = 0.05) for pairwise comparisons, with meandiff indicating the difference in means between the groups (Higher mean group is marked in bold, meandiff = mean(group2) - mean(group1)). A p-value less than 0.05 suggests a statistically significant difference. N HD HbSBeta = 33, N HD HbSS = 281, N VP HbSBeta = 230, N VP HbSS = 934. VP: Vasoocclusive Primary subphenotype; HD: Hemolysis-Dominant subphenotype. Although SCD is a rare disease, our study is distinguished by the large clinical trial database that we curated and assembled from 16 selected clinical trials. This resource enabled development of our ML pipeline and facilitated a more comprehensive exploration of clinical patterns in SCD, thereby enhancing the robustness and generalizability of our findings. 
However, when compared to common diseases, even a cohort of several thousand patients does not constitute true ‘big data’. Further, missing baseline laboratory values ( Figure 2E , Supplementary Table 4 ) presented challenges for applying more complex data mining techniques to our data. Despite these limitations, the ML model developed in our analysis effectively tackled several common challenges encountered in rare disease datasets. Principal Component Analysis (PCA) ( 85 ) combined with K-means is among the most commonly used approaches for clustering. However, both PCA and K-means are sensitive to data scaling and missing values. To address these limitations in cluster modeling in our disease specific data, we made several modifications. First, we selected clinical features with the least missing data, allowing us to retain the largest possible patient cohort for analysis. Second, we replaced PCA with Multiple Correspondence Analysis (MCA), which is more suitable for categorical data. Beyond dimensionality reduction, MCA generated continuous numeric coordinates that captured the relationships among binary clinical features associated with SCD. This was particularly advantageous because K-means clustering requires numeric input, and MCA provided a robust and algorithmic means to transform categorical features accordingly. Using the clinical feature clusters generated by our modified ML pipeline, we assigned patients into their respective subphenotypes. We then explored each biomarker independently, excluding patients with missing data for that specific variable. Rather than imputing missing values – particularly when insufficient information is available – we chose to preserve data integrity and minimize potential bias. Overall, our integration of ML techniques and statistical methods offers a novel pipeline for exploring clinical datasets with mixed data structure and limited sample sizes. 
This framework enhances the characterization of SCD subphenotypes and provides a valuable methodological path for research in rare diseases. In addition to our subphenotypes described, we explored supervised learning approaches to integrate demographics, clinical features, biomarkers, and genetic data to predict SCD subphenotypes. However, due to the database size limitation and the high proportion of missing data, we were unable to impute missing values reliably, which precluded the effective use of Random Forest ( 86 ). As an alternative, we applied XGBoost ( 87 ), which achieved a high F1 score for the VP subphenotype. In contrast, performance for the HD subphenotype –representing less than 20% of the study population - was suboptimal, with an unsatisfactory maximal F1 score of approximately 0.3. We also attempted unsupervised visualization using UMAP ( 88 ). However, the binary nature of most variables, combined with the substantial missingness in biomarker data, limited our ability to identify meaningful or distinct patterns. As discussed, our study has limitations. While the clinical trial database we assembled is relatively large for an orphan disease, the sample size remains modest for ML applications, which restricted the use of certain methods. Additionally, the substantial amount of missing data led to exclusion of 18.7% patients from the analysis due to incomplete clinical features ( Figure 1 ). Among the 2,887 patients included in the analysis, missing laboratory data ranged from 9.2% to 44.5% across biomarkers ( Supplementary Table 4 ). The precision of ML-based subphenotype classification was constrained by both limited sample size and the extent of missing data. Furthermore, the range of usable variables available for subphenotype classification in the clinical trial database was restricted to just 14 variables – comprising 7 clinical features and 7 laboratory biomarkers. 
We applied our rule-based subphenotype classification to an independent clinical study, the MSH study, to evaluate biomarker differences between subphenotypes. The patterns of Hb, HbF, and indirect bilirubin levels observed in the MSH study were consistent with those observed in the clinical trial database. However, several limitations associated with the MSH data analysis should be noted. First, our subphenotype classification pipeline did not perform satisfactorily on MSH data, primarily due to an incomplete clinical feature set -only 5 out of 7 clinical features were available. In addition, LDH levels were not available, and the granularity of baseline VOC data was insufficient. Second, the small sample size (N = 299) further limited the robustness of our analysis. Third, after assigning subphenotypes to patients with incomplete clinical feature data, the VP to HD ratio in the MSH dataset was 1.25:1, compared to 4.22:1 in our clinical trial database. This shift suggests a relative overrepresentation of HD-classified subphenotypes in the MSH data. One possible explanation is that all participants in the MSH study were over 18 years old ( Supplementary Table 5 ), making them more likely to develop age-associated complications such as stroke, leg ulcers, and priapism – features linked to the HD subphenotype. We did not observe significant differences in indirect bilirubin, leukocytes, and the annual rate of VOC between subphenotypes from MSH data, likely due to the limited sample size. Furthermore, the MSH study inclusion criteria required an annual rate of VOC ≥ 3, effectively excluding patients with milder disease. As a result, comparison between VP and HD subphenotypes in MSH data may be confounded by the inclusion of mixed or misclassified phenotypes. This sub-optimal classification likely contributed to the lack of statistically significant differences in certain biomarkers and annual rate of VOC between subphenotypes ( Figure 4 , Supplementary Figure 4 ). 
Another potential limitation lies in our choice of classification pipeline. While MCA is a well-established and widely used method for analyzing binary variables, it is not the only option. More advanced ML algorithms, including deep learning approaches, may offer improved classification performance. However, as previously discussed, the limited size of our clinical trial database and the high proportion of missing data make it challenging to implement and validate such complex models. The findings of our study should be further validated using a larger database with more complete data. Further study should also focus on linking subphenotypes to clinical outcomes. This will be a crucial step toward developing predictive ML models that can enhance disease understanding, improve prognosis, and ultimately enable personalized treatment strategies for patients with SCD. By integrating clinical characteristics, genetic information, and laboratory results, such a model could guide and support more targeted and effective interventions. Data Availability Data are not available to the public. Institutional review board statement Author contributions Conceptualization, Study design and clinical implication: Q.R. and P.O. Data curation: N.M., Q.R., R.M.Z., and W.X. Methodology, Analytical design: M.W., Q.L., W.X., and Q.R. Initial Analysis: M.W. Formal analysis and Writing—original draft: W.X. and Q.R. Contributing, writing—review & editing: All authors had full access to all the data related to this study and had final responsibility for the decision to submit for publication. Supervision, Final editing, Resource, and Funding acquisition: Q.R., and P.O. Funding This study was funded by FDA CDER RSR Grants. This project was supported in part by an appointment to the Research Fellowship Program at the Office of New Drugs, Center for Drug Evaluation and Research, U.S. 
Food and Drug Administration, administered by the Oak Ridge Institute for Science and Education through an interagency agreement between the U.S. Department of Energy and FDA. Declarations No competing or financial interest. Acknowledgement This article reflects the views of the authors and should not be construed to represent views or policies of the FDA. We wish to express our thanks to the following individuals and institutes: Dr. Ann Farrell for her leadership, insightful guidance, and support for the project during her tenure. Dr. Tanya Wroblewski for her leadership, insightful guidance, and support for the project and manuscript preparation. Dr. Sergey Rakhilin, for providing specific materials to this project. NHLBI for generously sharing their MSH database. Reference 1. ↵ Huisman TH . Sickle cell anemia as a syndrome: a review of diagnostic features . Am J Hematol . 1979 ; 6 ( 2 ): 173 – 84 . doi: 10.1002/ajh.2830060210 . PubMed PMID: 382840 . OpenUrl CrossRef PubMed Web of Science 2. ↵ Serjeant GR . The Natural History of Sickle Cell Disease . Cold Spring Harbor Perspectives in Medicine . 2013 ; 3 ( 10 ): a011783 -a. doi: 10.1101/cshperspect.a011783 . OpenUrl Abstract / FREE Full Text 3. Rees DC , Williams TN , Gladwin MT . Sickle-cell disease . Lancet . 2010 ; 376 ( 9757 ): 2018 - 31 . Epub 20101203. doi: 10.1016/s0140-6736(10)61029-x . PubMed PMID: 21131035 . OpenUrl CrossRef PubMed Web of Science 4. Platt OS , Brambilla DJ , Rosse WF , Milner PF , Castro O , Steinberg MH , et al. Mortality in sickle cell disease. Life expectancy and risk factors for early death . N Engl J Med . 1994 ; 330 ( 23 ): 1639 – 44 . doi: 10.1056/nejm199406093302303 . PubMed PMID: 7993409 . OpenUrl CrossRef PubMed Web of Science 5. ↵ Steiberg MH FB , Higgs DR Weathrall DJ . Disorders of hemoglobin: genetics, pathophysiology, and clinical management : Cambridge University Press ; 2009 . 6. ↵ Platt OS , Thorington BD , Brambilla DJ , Milner PF , Rosse WF , Vichinsky E , et al. 
Pain in Sickle Cell Disease . New England Journal of Medicine . 1991 ; 325 ( 1 ): 11 – 6 . doi: 10.1056/nejm199107043250103 . OpenUrl CrossRef PubMed Web of Science 7. Castro O , Brambilla DJ , Thorington B , Reindorf CA , Scott RB , Gillette P , et al. The acute chest syndrome in sickle cell disease: incidence and risk factors. The Cooperative Study of Sickle Cell Disease . Blood . 1994 ; 84 ( 2 ): 643 – 9 . PubMed PMID: 7517723 . OpenUrl Abstract / FREE Full Text 8. Milner PF , Kraus AP , Sebes JI , Sleeper LA , Dukes KA , Embury SH , et al. Sickle Cell Disease as a Cause of Osteonecrosis of the Femoral Head . New England Journal of Medicine . 1991 ; 325 ( 21 ): 1476 – 81 . doi: 10.1056/nejm199111213252104 . OpenUrl CrossRef PubMed Web of Science 9. Nolan VG , Wyszynski DF , Farrer LA , Steinberg MH . Hemolysis-associated priapism in sickle cell disease . Blood . 2005 ; 106 ( 9 ): 3264 – 7 . doi: 10.1182/blood-2005-04-1594 . OpenUrl Abstract / FREE Full Text 10. Ohene-Frempong K , Weiner SJ , Sleeper LA , Miller ST , Embury S , Moohr JW , et al. Cerebrovascular accidents in sickle cell disease: rates and risk factors . Blood . 1998 ; 91 ( 1 ): 288 – 94 . PubMed PMID: 9414296 . OpenUrl Abstract / FREE Full Text 11. Steinberg MH . Sickle cell anemia, the first molecular disease: overview of molecular etiology, pathophysiology, and therapeutic approaches . ScientificWorldJournal . 2008 ; 8 : 1295 – 324 . Epub 20081225. doi: 10.1100/tsw.2008.157 . PubMed PMID: 19112541 ; PubMed Central PMCID: PMC5848659 . OpenUrl CrossRef PubMed 12. Minniti CP , Delaney KMH , Gorbach AM , Xu D , Lee CCR , Malik N , et al. Vasculopathy, inflammation, and blood flow in leg ulcers of patients with sickle cell anemia . American Journal of Hematology . 2014 ; 89 ( 1 ): 1 – 6 . doi: 10.1002/ajh.23571 . OpenUrl CrossRef PubMed 13. Steinberg MH , Chui DHK , Dover GJ , Sebastiani P , Alsultan A . Fetal hemoglobin in sickle cell anemia: a glass half full? Blood . 2014 ; 123 ( 4 ): 481 – 5 . 
doi: 10.1182/blood-2013-09-528067 . OpenUrl Abstract / FREE Full Text 14. ↵ Akinsheye I , Alsultan A , Solovieff N , Ngo D , Baldwin CT , Sebastiani P , et al. Fetal hemoglobin in sickle cell anemia . Blood . 2011 ; 118 ( 1 ): 19 – 27 . doi: 10.1182/blood-2011-03-325258 . OpenUrl Abstract / FREE Full Text 15. ↵ Gladwin MT , Vichinsky E . Pulmonary Complications of Sickle Cell Disease . New England Journal of Medicine . 2008 ; 359 ( 21 ): 2254 – 65 . doi: 10.1056/nejmra0804411 . OpenUrl CrossRef PubMed Web of Science 16. ↵ Telen MJ . Beyond hydroxyurea: new and old drugs in the pipeline for sickle cell disease . Blood . 2016 ; 127 ( 7 ): 810 – 9 . Epub 20160112. doi: 10.1182/blood-2015-09-618553 . PubMed PMID: 26758919 ; PubMed Central PMCID: PMC4760087 . OpenUrl Abstract / FREE Full Text 17. Yawn BP , Buchanan GR , Afenyi-Annan AN , Ballas SK , Hassell KL , James AH , et al. Management of Sickle Cell Disease . JAMA . 2014 ; 312 ( 10 ): 1033 . doi: 10.1001/jama.2014.10517 . OpenUrl CrossRef PubMed 18. Niihara Y , Miller ST , Kanter J , Lanzkron S , Smith WR , Hsu LL , et al. A Phase 3 Trial of l-Glutamine in Sickle Cell Disease . N Engl J Med . 2018 ; 379 ( 3 ): 226 – 35 . doi: 10.1056/NEJMoa1715971 . PubMed PMID: 30021096 . OpenUrl CrossRef PubMed 19. Ataga KI , Kutlar A , Kanter J , Liles D , Cancado R , Friedrisch J , et al. Crizanlizumab for the Prevention of Pain Crises in Sickle Cell Disease . New England Journal of Medicine . 2017 ; 376 ( 5 ): 429 – 39 . doi: 10.1056/nejmoa1611770 . OpenUrl CrossRef PubMed 20. Vichinsky E , Hoppe CC , Ataga KI , Ware RE , Nduba V , El-Beshlawy A , et al. A Phase 3 Randomized Trial of Voxelotor in Sickle Cell Disease . New England Journal of Medicine . 2019 ; 381 ( 6 ): 509 – 19 . doi: 10.1056/nejmoa1903212 . OpenUrl CrossRef PubMed 21. Kavanagh PL , Fasipe TA , Wun T. Sickle Cell Disease . JAMA . 2022 ; 328 ( 1 ): 57 . doi: 10.1001/jama.2022.10233 . OpenUrl CrossRef PubMed 22. 
Casgevy and Lyfgenia: Two gene therapies for sickle cell disease . Med Lett Drugs Ther . 2024 ; 66 ( 1694 ): 9 - 10 . doi: 10.58347/tml.2024.1694a . PubMed PMID: 38212256 . OpenUrl CrossRef PubMed 23. ↵ Kato GJ , Gladwin MT , Steinberg MH . Deconstructing sickle cell disease: reappraisal of the role of hemolysis in the development of clinical subphenotypes . Blood Rev . 2007 ; 21 ( 1 ): 37 – 47 . Epub 2006/11/07. doi: 10.1016/j.blre.2006.07.001 . PubMed PMID: 17084951 ; PubMed Central PMCID: PMC2048670 . OpenUrl CrossRef PubMed Web of Science 24. ↵ Heeney MM , Abboud MR , Githanga J , Inusa BPD , Kanter J , Michelson AD , et al. Ticagrelor vs placebo for the reduction of vaso-occlusive crises in pediatric sickle cell disease: the HESTIA3 study . Blood . 2022 ; 140 ( 13 ): 1470 – 81 . doi: 10.1182/blood.2021014095 . OpenUrl CrossRef PubMed 25. ↵ Kanter J , Heath LE , Knorr J , Agbenyega ET , Colombatti R , Dampier C , et al. Novel findings from the multinational study on geographic and age-related differences in pain perception and analgesic usage in children with sickle cell anaemia . British Journal of Haematology . 2019 ; 184 ( 6 ): 1058 – 61 . doi: 10.1111/bjh.15250 . OpenUrl CrossRef PubMed 26. ↵ Farrell AT , Panepinto J , Desai AA , Kassim AA , Lebensburger J , Walters MC , et al. End points for sickle cell disease clinical trials: renal and cardiopulmonary, cure, and low-resource settings . Blood Advances . 2019 ; 3 ( 23 ): 4002 – 20 . doi: 10.1182/bloodadvances.2019000883 . OpenUrl CrossRef PubMed 27. ↵ Farrell AT , Panepinto J , Carroll CP , Darbari DS , Desai AA , King AA , et al. End points for sickle cell disease clinical trials: patient-reported outcomes, pain, and the brain . Blood Advances . 2019 ; 3 ( 23 ): 3982 – 4001 . doi: 10.1182/bloodadvances.2019000882 . OpenUrl CrossRef PubMed 28. ↵ El Hoss S , El Nemer W , Rees DC . Precision Medicine and Sickle Cell Disease . Hemasphere . 2022 ; 6 ( 9 ): e762 . PubMed PMID: 35999951 . OpenUrl PubMed 29. 
↵ Kato GJ , Piel FB , Reid CD , Gaston MH , Ohene-Frempong K , Krishnamurti L , et al. Sickle cell disease . Nat Rev Dis Primers . 2018 ; 4 : 18010 . Epub 2018/03/16. doi: 10.1038/nrdp.2018.10 . PubMed PMID: 29542687 . OpenUrl CrossRef PubMed 30. ↵ Kato GJ , Gladwin MT , Steinberg MH . Deconstructing sickle cell disease: Reappraisal of the role of hemolysis in the development of clinical subphenotypes . Blood Reviews . 2007 ; 21 ( 1 ): 37 – 47 . doi: 10.1016/j.blre.2006.07.001 . OpenUrl CrossRef PubMed Web of Science 31. ↵ Kato GJ , Steinberg MH , Gladwin MT . Intravascular hemolysis and the pathophysiology of sickle cell disease . J Clin Invest . 2017 ; 127 ( 3 ): 750 – 60 . Epub 2017/03/02. doi: 10.1172/JCI89741 . PubMed PMID: 28248201 ; PubMed Central PMCID: PMC5330745 . OpenUrl CrossRef PubMed 32. ↵ Charache S , Terrin ML , Moore RD , Dover GJ , Barton FB , Eckert SV , et al. Effect of Hydroxyurea on the Frequency of Painful Crises in Sickle Cell Anemia . New England Journal of Medicine . 1995 ; 332 ( 20 ): 1317 – 22 . doi: 10.1056/nejm199505183322001 . OpenUrl CrossRef PubMed Web of Science 33. ↵ Greenacre M , Blasius J . Multiple correspondence analysis and related methods . 2006 . 34. ↵ Lloyd S . Least squares quantization in PCM . IEEE Transactions on Information Theory . 1982 ; 28 ( 2 ): 129 – 37 . OpenUrl CrossRef 35. ↵ Mann HB , Whitney DR . On a test of whether one of two random variables is stochastically larger than the other . The Annals of Mathematical Statistics . 1947 ; 18 ( 1 ): 50 – 60 . OpenUrl 36. ↵ Pearson K . On the criterion that a given system of deviations from the probable in the case of a correlated system of variables is such that it can be reasonably supposed to have arisen from random sampling . The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science 1900 ; 50 ( 302 ): 157 – 75 . OpenUrl CrossRef 37. ↵ Okpala I . Leukocyte adhesion and the pathophysiology of sickle cell disease . Curr Opin Hematol . 
2006 ; 13 ( 1 ): 40 – 4 . doi: 10.1097/01.moh.0000190108.62414.06 . PubMed PMID: 16319686 . OpenUrl CrossRef PubMed Web of Science 38. ↵ Wun T . The Role of Inflammation and Leukocytes in the Pathogenesis of Sickle Cell Disease . Hematology . 2000 ; 5 ( 5 ): 403 – 12 . doi: 10.1080/10245332.2000.11746536 . OpenUrl CrossRef 39. ↵ Steinberg MH . Fetal hemoglobin in sickle cell anemia . Blood . 2020 ; 136 ( 21 ): 2392 – 400 . doi: 10.1182/blood.2020007645 . OpenUrl CrossRef PubMed 40. ↵ Brown AK . Reference Values and Hematologic Changes From Birth to 5 Years in Patients With Sickle Cell Disease . Archives of Pediatrics & Adolescent Medicine . 1994 ; 148 ( 8 ): 796 . doi: 10.1001/archpedi.1994.02170080026005 . OpenUrl CrossRef PubMed Web of Science 41. ↵ Watson J . A study of sickling of young erythrocytes in sickle cell anemia . Blood . 1948 ; 3 ( 4 ): 465 – 9 . PubMed PMID: 18908032 . OpenUrl Abstract / FREE Full Text 42. ↵ Watson J . The significance of the paucity of sickle cells in newborn Negro infants . Am J Med Sci . 1948 ; 215 ( 4 ): 419 – 23 . doi: 10.1097/00000441-194804000-00008 . PubMed PMID: 18107723 . OpenUrl CrossRef PubMed Web of Science 43. ↵ Milton JN , Gordeuk VR , Taylor JG , Gladwin MT , Steinberg MH , Sebastiani P . Prediction of Fetal Hemoglobin in Sickle Cell Anemia Using an Ensemble of Genetic Risk Prediction Models . Circulation: Cardiovascular Genetics . 2014 ; 7 ( 2 ): 110 – 5 . doi: 10.1161/circgenetics.113.000387 . OpenUrl Abstract / FREE Full Text 44. Steinberg MH . Genetic Etiologies for Phenotypic Diversity in Sickle Cell Anemia . The Scientific World JOURNAL . 2009 ; 9 : 46 – 67 . doi: 10.1100/tsw.2009.10 . OpenUrl CrossRef PubMed 45. Rees DC , Brousse VAM , Brewin JN . Determinants of severity in sickle cell disease . Blood Rev . 2022 ; 56 : 100983 . Epub 20220609. doi: 10.1016/j.blre.2022.100983 . PubMed PMID: 35750558 . OpenUrl CrossRef PubMed 46. 
Pagnier J , Mears JG , Dunda-Belkhodja O , Schaefer-Rego KE , Beldjord C , Nagel RL , et al. Evidence for the multicentric origin of the sickle cell hemoglobin gene in Africa . Proceedings of the National Academy of Sciences . 1984 ; 81 ( 6 ): 1771 – 3 . doi: 10.1073/pnas.81.6.1771 . OpenUrl Abstract / FREE Full Text 47. Kulozik AE , Kar BC , Satapathy RK , Serjeant BE , Serjeant GR , Weatherall DJ . Fetal hemoglobin levels and beta (s) globin haplotypes in an Indian population with sickle cell disease . Blood . 1987 ; 69 ( 6 ): 1742 – 6 . PubMed PMID: 2437982 . OpenUrl Abstract / FREE Full Text 48. Antonarakis SE , Boehm CD , Serjeant GR , Theisen CE , Dover GJ , Kazazian HH . Origin of the beta S-globin gene in blacks: the contribution of recurrent mutation or gene conversion or both . Proceedings of the National Academy of Sciences . 1984 ; 81 ( 3 ): 853 – 6 . doi: 10.1073/pnas.81.3.853 . OpenUrl Abstract / FREE Full Text 49. Chebloune Y , Pagnier J , Trabuchet G , Faure C , Verdier G , Labie D , et al. Structural analysis of the 5’ flanking region of the beta-globin gene in African sickle cell anemia patients: further evidence for three origins of the sickle cell mutation in Africa . Proceedings of the National Academy of Sciences . 1988 ; 85 ( 12 ): 4431 – 5 . doi: 10.1073/pnas.85.12.4431 . OpenUrl Abstract / FREE Full Text 50. Nagel RL . The origin of the hemoglobin S gene: Clinical, genetic, and anthropological consequences . Einstein Q J Biol Med . 1984 ; 2 : 53 – 62 . OpenUrl 51. Shriner D , Rotimi CN . Whole-Genome-Sequence-Based Haplotypes Reveal Single Origin of the Sickle Allele during the Holocene Wet Phase . The American Journal of Human Genetics . 2018 ; 102 ( 4 ): 547 – 56 . doi: 10.1016/j.ajhg.2018.02.003 . OpenUrl CrossRef PubMed 52. Green NS , Fabry ME , Kaptue-Noche L , Nagel RL . Senegal haplotype is associated with higher HbF than Benin and Cameroon haplotypes in African children with sickle cell anemia . Am J Hematol . 
1993 ; 44 ( 2 ): 145 – 6 . doi: 10.1002/ajh.2830440214 . PubMed PMID: 7505527 . OpenUrl CrossRef PubMed 53. ↵ Powars D . Sickle Cell Anemia . American Journal of Diseases of Children . 1993 ; 147 ( 11 ): 1197 . doi: 10.1001/archpedi.1993.02160350071011 . OpenUrl CrossRef PubMed 54. Perrine RP , Pembrey ME , John P , Perrine S , Shoup F . Natural history of sickle cell anemia in Saudi Arabs . A study of 270 subjects. Ann Intern Med . 1978 ; 88 ( 1 ): 1 - 6 . doi: 10.7326/0003-4819-88-1-1 . PubMed PMID: 619731 . OpenUrl CrossRef PubMed Web of Science 55. Labie D , Pagnier J , Lapoumeroulie C , Rouabhi F , Dunda-Belkhodja O , Chardin P , et al. Common haplotype dependency of high G gamma-globin gene expression and high Hb F levels in beta-thalassemia and sickle cell anemia patients . Proc Natl Acad Sci U S A . 1985 ; 82 ( 7 ): 2111 – 4 . doi: 10.1073/pnas.82.7.2111 . PubMed PMID: 2580306 ; PubMed Central PMCID: PMC397502 . OpenUrl Abstract / FREE Full Text 56. Nagel RL , Fabry ME , Pagnier J , Zohoun I , Wajcman H , Baudin V , et al. Hematologically and Genetically Distinct Forms of Sickle Cell Anemia in Africa . New England Journal of Medicine . 1985 ; 312 ( 14 ): 880 – 4 . doi: 10.1056/nejm198504043121403 . OpenUrl CrossRef PubMed Web of Science 57. Pagnier J , Baudin V , Labie D , Wajcman H , Jaeger G , Girot R . Sickle Cell Anemia in Bantu Speaking Africa . Hemoglobin . 1986 ; 10 ( 1 ): 73 – 6 . doi: 10.3109/03630268609072472 . OpenUrl CrossRef PubMed 58. Serjeant GR , Vichinsky E . Variability of homozygous sickle cell disease: The role of alpha and beta globin chain variation and other factors . Blood Cells Mol Dis . 2018 ; 70 : 66 – 77 . Epub 20170621. doi: 10.1016/j.bcmd.2017.06.004 . PubMed PMID: 28689691 . OpenUrl CrossRef PubMed 59. Gilman JG , Huisman TH . DNA sequence variation associated with elevated fetal G gamma globin production . Blood . 1985 ; 66 ( 4 ): 783 – 7 . PubMed PMID: 2412616 . OpenUrl Abstract / FREE Full Text 60. 
Labie D , Dunda-Belkhodja O , Rouabhi F , Pagnier J , Ragusa A , Nagel RL . The −158 site 5’ to the G gamma gene and G gamma expression . Blood . 1985 ; 66 ( 6 ): 1463 – 5 . PubMed PMID: 4063531 . OpenUrl Abstract / FREE Full Text 61. Thein SL , Menzel S . Discovering the genetics underlying foetal haemoglobin production in adults . British Journal of Haematology . 2009 ; 145 ( 4 ): 455 – 67 . doi: 10.1111/j.1365-2141.2009.07650.x . OpenUrl CrossRef PubMed Web of Science 62. Gardner K , Fulford T , Silver N , Rooks H , Angelis N , Allman M , et al. g(HbF): a genetic model of fetal hemoglobin in sickle cell disease . Blood Adv . 2018 ; 2 ( 3 ): 235 – 9 . doi: 10.1182/bloodadvances.2017009811 . PubMed PMID: 29437638 ; PubMed Central PMCID: PMC5812320 . OpenUrl Abstract / FREE Full Text 63. Thein SL , Menzel S , Lathrop M , Garner C . Control of fetal hemoglobin: new insights emerging from genomics and clinical implications . Hum Mol Genet . 2009 ; 18 ( R2 ): R216 – 23 . doi: 10.1093/hmg/ddp401 . PubMed PMID: 19808799 ; PubMed Central PMCID: PMC2758709 . OpenUrl CrossRef PubMed Web of Science 64. Lettre G , Sankaran VG , Bezerra MAC , Araújo AS , Uda M , Sanna S , et al. DNA polymorphisms at the BCL11A, HBS1L-MYB, and β-globin loci associate with fetal hemoglobin levels and pain crises in sickle cell disease . Proceedings of the National Academy of Sciences . 2008 ; 105 ( 33 ): 11869 – 74 . doi: 10.1073/pnas.0804799105 . OpenUrl Abstract / FREE Full Text 65. Garner C , Tatu T , Reittie JE , Littlewood T , Darley J , Cervino S , et al. Genetic influences on F cells and other hematologic variables: a twin heritability study . Blood . 2000 ; 95 ( 1 ): 342 – 6 . PubMed PMID: 10607722 . OpenUrl Abstract / FREE Full Text 66. Menzel S , Garner C , Gut I , Matsuda F , Yamaguchi M , Heath S , et al. A QTL influencing F cell production maps to a gene encoding a zinc-finger protein on chromosome 2p15 . Nat Genet . 2007 ; 39 ( 10 ): 1197 – 9 . Epub 20070902. doi: 10.1038/ng2108 . 
PubMed PMID: 17767159 . OpenUrl CrossRef PubMed Web of Science 67. ↵ Danjou F , Zoledziewska M , Sidore C , Steri M , Busonero F , Maschio A , et al. Genome-wide association analyses based on whole-genome sequencing in Sardinia provide insights into regulation of hemoglobin levels . Nature Genetics . 2015 ; 47 ( 11 ): 1264 – 71 . doi: 10.1038/ng.3307 . OpenUrl CrossRef PubMed 68. Bauer DE , Kamran SC , Lessard S , Xu J , Fujiwara Y , Lin C , et al. An Erythroid Enhancer of BCL11A Subject to Genetic Variation Determines Fetal Hemoglobin Level . Science . 2013 ; 342 ( 6155 ): 253 - 7 . doi: 10.1126/science.1242088 . OpenUrl Abstract / FREE Full Text 69. Farrell JJ , Sherva RM , Chen Z-Y , Luo H-Y , Chu BF , Ha SY , et al. A 3-bp deletion in the HBS1L-MYB intergenic region on chromosome 6q23 is associated with HbF expression . Blood . 2011 ; 117 ( 18 ): 4935 – 45 . doi: 10.1182/blood-2010-11-317081 . OpenUrl Abstract / FREE Full Text 70. Gardner K , Fulford T , Silver N , Rooks H , Angelis N , Allman M , et al. g(HbF): a genetic model of fetal hemoglobin in sickle cell disease . Blood Advances . 2018 ; 2 ( 3 ): 235 – 9 . doi: 10.1182/bloodadvances.2017009811 . OpenUrl Abstract / FREE Full Text 71. Lettre G , Bauer DE . Fetal haemoglobin in sickle-cell disease: from genetic epidemiology to new therapeutic strategies . Lancet . 2016 ; 387 ( 10037 ): 2554 – 64 . doi: 10.1016/s0140-6736(15)01341-0 . PubMed PMID: 27353686 . OpenUrl CrossRef PubMed 72. Dover GJ , Smith KD , Chang YC , Purvis S , Mays A , Meyers DA , et al. Fetal hemoglobin levels in sickle cell disease and normal individuals are partially controlled by an X-linked gene located at Xp22.2 . Blood . 1992 ; 80 ( 3 ): 816 – 24 . PubMed PMID: 1379090 . OpenUrl Abstract / FREE Full Text 73. Thein SL , Menzel S , Peng X , Best S , Jiang J , Close J , et al. Intergenic variants of HBS1L-MYB are responsible for a major quantitative trait locus on chromosome 6q23 influencing fetal hemoglobin levels in adults . 
Proceedings of the National Academy of Sciences . 2007 ; 104 ( 27 ): 11346 – 51 . doi: 10.1073/pnas.0611393104 . OpenUrl Abstract / FREE Full Text 74. Close J , Game L , Clark B , Bergounioux J , Gerovassili A , Thein SL . Genome annotation of a 1.5 Mb region of human chromosome 6q23 encompassing a quantitative trait locus for fetal hemoglobin expression in adults . BMC Genomics . 2004 ; 5 ( 1 ). doi: 10.1186/1471-2164-5-33 . OpenUrl CrossRef PubMed 75. ↵ Bhatnagar P , Purvis S , Barron-Casella E , Debaun MR , Casella JF , Arking DE , et al. Genome-wide association study identifies genetic variants influencing F-cell levels in sickle-cell patients . Journal of Human Genetics . 2011 ; 56 ( 4 ): 316 – 23 . doi: 10.1038/jhg.2011.12 . OpenUrl CrossRef PubMed 76. ↵ Chamouine A , Saandi T , Muszlak M , Larmaraud J , Lambrecht L , Poisson J , et al. High fetal hemoglobin level is associated with increased risk of cerebral vasculopathy in children with sickle cell disease in Mayotte . BMC Pediatrics . 2020 ; 20 ( 1 ). doi: 10.1186/s12887-020-02187-6 . OpenUrl CrossRef 77. ↵ Steinberg MH , Embury SH . Alpha-thalassemia in blacks: genetic and clinical aspects and interactions with the sickle hemoglobin gene . Blood . 1986 ; 68 ( 5 ): 985 – 90 . PubMed PMID: 3533181 . OpenUrl Abstract / FREE Full Text 78. Bernaudin F , Arnaud C , Kamdem A , Hau I , Lelong F , Epaud R , et al. Biological impact of α genes, β haplotypes, and G6PD activity in sickle cell anemia at baseline and with hydroxyurea . Blood Advances . 2018 ; 2 ( 6 ): 626 – 37 . doi: 10.1182/bloodadvances.2017014555 . OpenUrl Abstract / FREE Full Text 79. Higgs DR , Aldridge BE , Lamb J , Clegg JB , Weatherall DJ , Hayes RJ , et al. The Interaction of Alpha-Thalassemia and Homozygous Sickle-Cell Disease . New England Journal of Medicine . 1982 ; 306 ( 24 ): 1441 – 6 . doi: 10.1056/nejm198206173062402 . OpenUrl CrossRef PubMed Web of Science 80. 
Serjeant BE , Mason KP , Kenny MW , Stuart J , Higgs DR , Weatherall DJ , et al. Effect of alpha thalassaemia on the rheology of homozygous sickle cell disease . British Journal of Haematology . 1983 ; 55 ( 3 ): 479 – 86 . doi: 10.1111/j.1365-2141.1983.tb02163.x . OpenUrl CrossRef PubMed Web of Science 81. ↵ Embury SH , Clark MR , Monroy G , Mohandas N . Concurrent sickle cell anemia and alpha-thalassemia. Effect on pathological properties of sickle erythrocytes . Journal of Clinical Investigation . 1984 ; 73 ( 1 ): 116 – 23 . doi: 10.1172/jci111181 . OpenUrl CrossRef PubMed Web of Science 82. ↵ de Ceulaer K , Higgs DR , Weatherall DJ , et al. Alpha-Thalassemia Reduces the Hemolytic Rate in Homozygous Sickle-Cell Disease . New England Journal of Medicine . 1983 ; 309 ( 3 ): 189 – 90 . doi: 10.1056/nejm198307213090320 . OpenUrl CrossRef PubMed Web of Science 83. ↵ Ikuta T , Sellak H , Liu S-Y , Odo N . Serum of sickle cell disease patients contains fetal hemoglobin silencing factors secreted from leukocytes . Journal of Blood Medicine . 2018 ; 9 : 95 – 104 . doi: 10.2147/jbm.s156999 . OpenUrl CrossRef PubMed 84. ↵ Patel DK , Purohit P , Dehury S , Das P , Dutta A , Meher S , et al. Fetal hemoglobin and alpha thalassemia modulate the phenotypic expression of HbSD-Punjab . Int J Lab Hematol . 2014 ; 36 ( 4 ): 444 – 50 . Epub 20131119. doi: 10.1111/ijlh.12165 . PubMed PMID: 24245819 . OpenUrl CrossRef PubMed 85. ↵ Gewers FL , Ferreira GR , Arruda HF , et al. Principal Component Analysis: A Natural Approach to Data Exploration . ACM Computing Surveys (CSUR) . 2021 ; 54 ( 4 ): 1 – 34 . 86. ↵ Breiman L . Random Forests . Machine Learning . 2001 ; 45 ( 1 ): 5 - 32 . doi: 10.1023/a:1010933404324 . OpenUrl CrossRef PubMed 87. ↵ Chen T , Guestrin C. XGBoost: A Scalable Tree Boosting System . Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining ; San Francisco, California, USA : Association for Computing Machinery ; 2016 . p. 785 – 94 . 88. 
↵ McInnes L HJ , and Melville J. UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction . arXiv . 2020 ; 1802.03426 . doi: 10.48550/arxiv.1802.03426 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted June 02, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Machine Learning-Based Identification of Sickle Cell Disease Subphenotypes in Clinical Trial Data Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Machine Learning-Based Identification of Sickle Cell Disease Subphenotypes in Clinical Trial Data Wei Xiao , Patricia Oneal , Menglun Wang , Nihar J. Mehta , Qi Liu , Rongmei Zhang , Susan Perrine , Qin Ryan medRxiv 2025.06.01.25328537; doi: https://doi.org/10.1101/2025.06.01.25328537 Share This Article: Copy Citation Tools Machine Learning-Based Identification of Sickle Cell Disease Subphenotypes in Clinical Trial Data Wei Xiao , Patricia Oneal , Menglun Wang , Nihar J. 
Mehta , Qi Liu , Rongmei Zhang , Susan Perrine , Qin Ryan medRxiv 2025.06.01.25328537; doi: https://doi.org/10.1101/2025.06.01.25328537 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Hematology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15222) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6589) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (971) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9221) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffe37a00cf141e2',t:'MTc3OTQ3ODAzNg=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-23T02:00:01.238055+00:00
License: CC-BY-4.0