Full text
43,917 characters
· extracted from
preprint-html
· click to expand
Application of Machine Learning Approaches to Develop Predictive Models for Diabetes and Hypertension among Bangladesh Adults | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Application of Machine Learning Approaches to Develop Predictive Models for Diabetes and Hypertension among Bangladesh Adults View ORCID Profile Gulam Muhammed Al Kibria , James O’Hagan , View ORCID Profile Golam Shariar , View ORCID Profile Tarina Khan , Mohammed Elfaramawi doi: https://doi.org/10.1101/2025.05.30.25328660 Gulam Muhammed Al Kibria 1 Johns Hopkins Bloomberg School of Public Health , Baltimore, MD, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Gulam Muhammed Al Kibria For correspondence: gkibria1{at}outlook.com James O’Hagan 2 US Census Bureau Find this author on Google Scholar Find this author on PubMed Search for this author on this site Golam Shariar 3 New York Institute of Technology College of Osteopathic Medicine Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Golam Shariar Tarina Khan 3 New York Institute of Technology College of Osteopathic Medicine Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tarina Khan Mohammed Elfaramawi 4 University of Arkansas for Medical Sciences , Little Rock, AR, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Introduction With rapid urbanization, lifestyle changes, and an aging population, non-communicable diseases (NCDs), including hypertension and diabetes, pose significant public health challenges in Bangladesh and many other low- and middle-income countries. This study used machine learning (ML) approaches to develop predictive models for hypertension and diabetes among adults in this country. Methods We analyzed Bangladesh Demographic and Health Survey 2022 data. This is a nationally representative cross-sectional survey. Participants were classified as hypertensive when their systolic blood pressure was ≥140 mmHg, diastolic blood pressure was ≥90 mmHg, or if they used antihypertensive medication. They were classified as diabetic if their fasting plasma glucose was ≥7.0 mmol/L or they used glucose-lowering drugs. Potential predictors included age, gender, education, wealth quintile, overweight/obesity, rural-urban residence, and division of residence. Descriptive analysis was conducted, and six ML models were applied: artificial neural network (ANN), random forest, adaptive boosting (AdaBoost), gradient boosting, XGBoost, and support vector machine (SVM). Models’ performance was evaluated via accuracy, area under the curve (AUC), sensitivity, specificity, and F1-score. Feature importance was assessed to rank risk factors. Results The study included 13,847 adults, 55% of whom were females. Diabetes and hypertension had prevalence rates of 16.3% and 20.5%, respectively, with both conditions increasing with age, and the highest prevalence was 24.4% for diabetes and 43.3% for hypertension in individuals aged 65 and older. Wealthier and urban residents experienced higher rates (diabetes: 24.9% among the richest compared to 9.9% among the poorest; hypertension: 23.3% in urban versus 19.2% in rural areas). Additionally, overweight/obesity was a strong predictor for both conditions. For diabetes, AdaBoost had the highest AUC (0.699) and SVM had the highest accuracy (0.836); for hypertension, AdaBoost had the greatest AUC (0.775) and accuracy (0.799). Hypertension topped diabetes predictors, while overweight/obesity led for hypertension, followed by age and diabetes. Wealth and gender were moderately influential, with education and geographic factors less so. Low specificity across models indicated challenges in identifying non-cases. Conclusion In this ML-driven analysis, we identified the bidirectional relationship of hypertension and diabetes along with several other predictors, including overweight/obesity, older age, and richer household wealth quintiles. Our findings underscore the need for integrated screening and lifestyle interventions targeting high-risk groups to mitigate future NCD burden. Introduction Non-communicable diseases (NCDs) such as hypertension and diabetes have emerged as critical public health challenges globally, with a particularly alarming rise in low- and middle-income countries (LMICs), including Bangladesh [ 1 – 3 ]. Recent estimates suggest that NCDs are responsible for over two-thirds of global deaths each year, with cardiovascular diseases and diabetes ranking among the top contributors [ 4 , 5 ]. Rapid urbanization, sedentary lifestyles, poor dietary habits, and an aging population have caused the increasing prevalence of these conditions in LMICs, including Bangladesh [ 1 , 6 , 7 ]. According to the International Diabetes Federation (IDF), approximately 13.1 million adults in Bangladesh were living with diabetes in 2021, a number projected to grow significantly by 2045 if current trends persist [ 8 ]. Similarly, hypertension affects nearly one in four adults in the country, often undiagnosed until complications arise [ 9 , 10 ]. These results underscore the urgent need for proactive strategies to identify at-risk populations and implement effective prevention measures. Artificial intelligence (AI) and machine learning (ML) have revolutionized the analysis of complex health data, and offer a data-driven approach to identify patterns and predictors of diseases or outcomes which are not identifiable by conventional statistical methods [ 11 , 12 ]. For instance, unlike traditional regression models, ML techniques such as decision trees, random forests, and neural networks can handle high-dimensional data and detect non-linear relationships [ 11 ]. ML has been successfully applied in various settings to predict burden for conditions like diabetes and hypertension based on factors like body mass index (BMI), family history, and lifestyle habits [ 13 , 14 ]. The lack of data on conditions like hypertension and diabetes was a problem in Bangladesh and other similar LMICs. However, multiple recent surveys and studies have investigated the burden and factors affecting these two conditions [ 3 , 6 , 15 , 16 ]. For instance, studies reported that people with older age and higher socioeconomic status (i.e., higher education and wealth) have a higher burden of hypertension/diabetes than those with younger age and lower socioeconomic status [ 6 , 16 , 17 ]. Despite the growing body of research on NCDs in Bangladesh, several gaps in the literature persist, including the identification of actionable risk factors through advanced analytics like ML [ 18 , 19 ]. This narrow scope overlooks the diversity of risk factors across rural and urban populations, as well as socioeconomic gradients captured in national surveys like the BDHS. Although global studies have highlighted predictors such as obesity and wealth status using ML techniques, there is a lack of evidence on how these factors differ in a Bangladeshi setting. These gaps highlight the need for a comprehensive, ML-driven analysis that uses nationally representative data to provide a holistic understanding of hypertension and diabetes risk. To overcome these limitations, this study employs ML techniques on the BDHS 2022 dataset. Our objectives are twofold: (1) to identify predictors of diabetes and hypertension among Bangladeshi adults, and (2) to assess the performance of different ML models in this area. We aim to reveal new insights into the factors driving these conditions and inform focused public health strategies. The results have the potential to help policymakers develop interventions that tackle the underlying causes of NCDs, thereby easing their impact on Bangladesh’s healthcare system. Methods Study design We analyzed data from BDHS 2022, a nationwide cross-sectional survey in Bangladesh. This survey covered rural and urban regions of all administrative divisions of Bangladesh. It aimed to obtain estimates of major demographic and health indicators, including hypertension and diabetes. Mitra and Associates, a private research firm in Bangladesh, conducted it. Approximately two hundred data collectors underwent recruitment and training to conduct interviews. Data collection took place from June to December of 2022 [ 9 ]. We analyzed the data for the present study in April 2025. Sample design and coverage BDHS 2022 used a multistage cluster sampling design. It created a sampling frame based on Bangladesh’s 2011 housing and population census. This sample frame contained a list of enumeration areas (EA). Households were chosen from the EA in two stages, and data were collected during household visits [ 9 ]. For BDHS 2022, 675 EAs were chosen, comprising 425 from rural and 250 from urban areas [ 10 , 20 ]. A higher number of EAs were selected from rural regions to align with the country’s population distribution. Approximately 30 households were randomly selected from each EA in the second stage. One-third of the households were eligible for blood pressure and blood sugar measurements. These ensured the representativeness of the samples to ensure reliable estimates of demographic and health indicators. The age eligibility for BDHS 2022 hypertension and diabetes modules was 18. The response rate for the survey was 98%. Detailed information on both BDHS, including survey designs, methodologies, sample size estimations, questionnaires, and results, can be found online [ 9 ]. Outcome variables The outcome variables of the present study were hypertension and diabetes. Hypertension We considered a person hypertensive if s/he met at least one of these three characteristics: systolic blood pressure (SBP) of 140 mmHg or above, diastolic blood pressure (DBP) of 90 mmHg or above, and the participant was getting any hypertensive medication. LIFE SOURCE® UA-767 Plus monitors recorded the blood pressure (BP). Three measurements were done with ten minutes of interval between each measurement. Other data were collected during the interval. The mean of the last two BP measurements was used to mark the final BP [ 9 ]. Diabetes We considered a person diabetic if s/he had a fasting plasma glucose of 7.0 mmol/l (or above) or was taking blood sugar-lowering medications. The HemoCue 201+ blood analyzer was used to record the glucose [ 9 ]. Potential Factors Based on scientific plausibility, literature search, and data structure, we investigated the following variables as potential features associated with the outcomes: women’s age, education level, household wealth quintile, overweight/obesity, and rural-urban place and division of residence. Since diabetes and hypertension have a bidirectional association (i.e., they can increase the risk of one another) [ 21 – 23 ], we investigated hypertension as a risk factor for diabetes and diabetes as a risk factor for hypertension. Age was grouped as 18-34, 35–44, 45–54, 55-64, and 65 or more years. The education level was classified as no formal education, primary (i.e., 1 to 5 school years), secondary (i.e., 6 to 10 school years), and college/above education (i.e., 11 or more school years). Respondents reported their current work status. The household wealth index score was obtained by principal component analyses of households’ basic building materials (e.g., floors, roofs, and walls), drinking water sources, sanitation facilities, and availability of electricity and other items. The score was then stratified into quintiles: poorest, poorer, middle, richer, and richest. Dhaka, Chittagong, Rajshahi, Khulna, Barisal, Rangpur, Sylhet and Mymensingh were the eight divisions during BDHS 2022 [ 9 ]. Statistical Analysis All analyses were performed using Python. First, we conducted descriptive analyses and calculated the distribution (%) of diabetes and hypertension across the features listed above. We used chi-square tests to examine the association of these factors with hypertension and diabetes. We divided the dataset into training (80%) and test sets (20%). As the dataset was imbalanced (i.e., low prevalence of diabetes/hypertension), we applied synthetic minority over-sampling technique (SMOTE) and adaptive synthetic sampling (ADASYN) to balance the training dataset. SMOTE generates synthetic samples for the minority class by interpolating between existing minority samples [ 24 ], and ADASYN does the same but focuses more on harder-to-learn minority samples near decision boundaries [ 25 ]. We implemented six ML algorithms for classification: (1) Artificial Neural Network (ANN); (2) Random forest; (3) AdaBoost; (4) Stochastic Gradient Boosting (GBM); (5) Extreme Gradient Boosting (XGBoost); and (6) Support Vector Machine (SVM). We used 10-fold cross-validation during training and validation to check model reliability. We used preprocessing to transform categorical variables via one-hot encoding. Model hyperparameters were fine-tuned through a combination of grid search and cross-validation We calculated accuracy, sensitivity, specificity, area under the curve (AUC), and F1-score to assess and compare the performance of the six ML models. We also used receiver operating characteristic (ROC) curves to evaluate the discriminative abilities of the models. For algorithms capable of estimating feature importance (e.g., random forest, XGBoost, GBM), we identified the most significant predictors using feature importance scores. For linear models like SVM (with a linear kernel), we extracted model coefficients to determine each variable’s impact on hypertension and diabetes prediction. Bar plots were used to visually represent the coefficients and feature importance rankings, improving interpretability. Data manipulation tasks (e.g., recoding or grouping) were handled using the pandas and numpy libraries. Visualizations were created with Matplotlib and Seaborn, while the ML algorithms were developed using scikit-learn, XGBoost, and TensorFlow/Keras libraries. Results A total of 13,835 adults were included in the analysis; about 16.3% and 20.6% had diabetes and hypertension, respectively ( Table 1 ). Diabetes affected 15.6% of males and 16.9% of females (P=0.052). The burden of hypertension was significantly higher in females (23.5%) compared to their male counterparts (17.0%; P<0.001). Prevalence of both conditions increased with age (P<0.001), peaking at 24.4% for diabetes (65+ years) and 43.3% for hypertension (65+ years). Education showed a modest association with diabetes (P=0.046), while hypertension was highest among those with no education (28.6%; P<0.001). Wealth quintiles revealed a gradient, with diabetes rising from 9.9% among the poorest to 24.9% among the richest and hypertension from 15.4% (poorest) to 27.6% (richest) (P<0.001). Urban residents had higher rates (diabetes: 21.7%; hypertension: 23.3%) than rural residents (13.5%; 19.2%; P<0.001). Divisionally, Dhaka had the highest diabetes prevalence (21.1%), while Rajshahi had the highest prevalence for hypertension (23.6%; P<0.001). Overweight/obesity significantly increased both conditions’ prevalence (P<0.001). View this table: View inline View popup Table 1: Distribution of diabetes and hypertension according to features Table 2 shows the performance metrics of six ML models. For diabetes, XGBoost achieved the highest accuracy (0.821) and F1 score (0.899), with a sensitivity of 0.958, indicating strong positive class detection, though specificity was low (0.127). Gradient boosting followed closely (accuracy: 0.803, F1: 0.885). SVM and random forest also performed well, with accuracies of 0.805 and 0.790, respectively. AdaBoost had the lowest accuracy (0.682) but a higher AUC (0.694). For hypertension, XGBoost again outperformed others (accuracy: 0.779, F1: 0.865), with a sensitivity of 0.899. Gradient boosting and SVM showed balanced performance (accuracies: 0.751, 0.755). AdaBoost had the highest AUC (0.770) but lower accuracy (0.700). Overall, XGBoost consistently demonstrated superior predictive performance across both conditions, particularly in sensitivity and F1 score. View this table: View inline View popup Download powerpoint Table 2: Performance metrices of the evaluation models obtained by ADASYN In Figure 1 , we presented ROC curves to evaluate the discriminative ability of six classification models. For diabetes, gradient boosting achieved the highest AUC (0.695), followed closely by AdaBoost (0.694), indicating strong overall performance in distinguishing classes. XGBoost (AUC: 0.652) and SVM (0.632) showed moderate performance, while ANN had the lowest AUC (0.585). Random forest (AUC: 0.636) performed slightly better than ANN but lagged behind the top models. For hypertension, Gradient Boosting again led with the highest AUC (0.773), closely followed by AdaBoost (0.770) and XGBoost (0.745), reflecting robust classification capabilities. SVM (AUC: 0.737) and random forest (0.704) showed moderate performance, while ANN had the lowest AUC (0.680). Across both conditions, Gradient boosting consistently demonstrated superior discriminative power, with AdaBoost also performing notably well, particularly in AUC metrics. Download figure Open in new tab Figure 1. Receiver operating characteristics curve for the outcomes We reported feature importance for predicting diabetes an Table 3 . We reported the features that were among the top 5 for 6 models. Notably, the top 5 features for both outcome variables were similar. For diabetes, hypertension ranked first across all models (frequency=6), followed by overweight/obesity (second, frequency=6) and age (third, frequency=6). Wealth quintile and gender ranked fourth–fifth, while education, division, and place of residence ranked sixth–eighth. For hypertension, overweight/obesity was the top predictor (first, frequency=6), followed by age (second, frequency=6) and diabetes (third, frequency=6). Wealth quintile and gender ranked fourth–fifth, with education, division, and place of residence at sixth–eighth. Health-related features (hypertension, overweight/obesity, and diabetes), and age consistently dominated. Education and geographic factors showed lower importance, with SVM slightly varying in ranking wealth quintile and gender. These findings highlight key risk factors for diabetes and hypertension, informing targeted interventions. View this table: View inline View popup Table 3: The top 5 identified features in all studied models Discussion We used nationally representative data from BDHS 2022 and applied modern ML approaches to identify and rank predictors of diabetes and hypertension among Bangladeshi adults. In addition to observing the bidirectional relationship of diabetes and hypertension, we found that age, wealth quintile, and overweight/obesity are potential associated features for the studied conditions in Bangladesh. Our findings not only confirmed known risk factors for diabetes or hypertension but also provided insights into the demographic, socioeconomic, and health-related variables that impact these two outcomes in this country. These insights have important implications for targeted prevention, policy development, and health system planning. From a methodological perspective, the use of ML algorithms allowed for robust predictive modeling. The identification of complex relationships among variables might have been overlooked using traditional statistical techniques. AdaBoost, XGBoost, and gradient boosting consistently outperformed other models in terms of sensitivity and F1 scores for both conditions. Their superior performance underscores the value of ensemble learning methods in health prediction tasks, specifically with multidimensional data [ 11 , 12 ]. SVM and ANN also performed well, though SVM exhibited the lowest AUC for diabetes, indicating weaker discriminative ability for that outcome. Despite the application of SMOTE and ADASYN, the low specificity observed across models, particularly for diabetes, highlights a limitation in distinguishing non-cases, could result from low prevalce of both outcomes (i.e., 16% for diabetes and 21% hypertension). This low specificity levels suggest that further model refinement and perhaps inclusion of additional behavioral or clinical variables may enhance performance [ 11 , 12 ]. The feature importance rankings revealed a consistent pattern across all ML models. Health-related variables (i.e., hypertension, diabetes, and overweight/obesity) and age were top predictors. Wealth quintile and gender also had significant association, while education, place of residence, and division had relatively lesser influence [ 6 , 16 , 26 ]. This hierarchy of predictors reinforces the notion that while demographic and geographic contexts matter, physiological and lifestyle-related indicators are the most immediate determinants of NCD risk [ 16 , 27 ]. The prevalence estimates by features observed in this study underscore the significance of known predictors that could be used to develop and implement prevention and control programs in the context of this country. For instance, consistent with prior research, both conditions showed a strong association with older age, and individuals aged 65 years and above have the highest burden [ 6 , 26 , 28 ]. This age-associated rise is consistent with biological aging processes and cumulative exposure to risk factors such as obesity, poor diet, and sedentary behavior [ 18 ]. Although women had a higher burden of hypertension, the diabetes burden was not significantly higher. These gendered differences may reflect differences in healthcare access, behavioral risk profiles, and sociocultural dynamics [ 29 , 30 ]. Among the most important predictors across all six ML models were hypertension for diabetes (and vice versa). This finding reaffirmed the bidirectional and interdependent relationship between diabetes and hypertension [ 21 , 22 ]. Previous studies have documented that hypertension increases insulin resistance, and diabetes can impair vascular function, which ultimately increases blood pressure levels [ 21 , 22 , 30 ]. Our findings support integrating screening and management protocols for both conditions simultaneously, specifically in primary and outpatient care settings in upazila (i.e., sub-district) health complexes, district hospitals, or medical college hospitals. Overweight/obesity (i.e., BMI 25kg/m 2 or above) appeared as a predictor for both diabetes and hypertension across all models. This echoed global evidence on the central role of excess adiposity in metabolic dysfunction [ 31 , 32 ]. Cardiometabolic syndromes can also occur due to adipocyte hypertrophy, visceral adiposity, and ectopic fat deposition. Additionally, compensatory insulin secretion is closely associated with obesity [ 33 ]. The transition from pre-diabetes to type 2 diabetes is caused by this hypersecretion, which also causes insulin resistance and ultimately β-cell failure [ 32 , 33 ]. The overall prevalence of overweight/obesity has increased in Bangladesh. For instance, the prevalence of overweight/obesity was 25% among 35+-year-old people in 2011, this increased to 32% in 2022 [ 9 , 20 ]. The rising prevalence has been driven by urbanization, dietary transitions, and physical inactivity and poses a significant challenge to the ongoing burden of NCDs like diabetes and hypertension [ 1 , 17 ]. Public health campaigns promoting healthy lifestyles, including physical activity and nutritional education, are urgently needed to curb this trend. Wealth quintile, a proxy for socioeconomic status, demonstrated a “dose-response relationship” with both conditions; disease prevalence increased from the poorest to the richest households. Another predictor variable, education level, often considered as a socioeconomic variable in addition to wealth quintile, did not have a significant association with either of the conditions. This is contrary to previous research and suggests that the significance of education level on the burden of hypertension/diabetes is declining [ 16 , 26 ]. The wealth pattern likely reflects a shift in the burden of diabetes/hypertension toward wealthier segments of the population due to their greater access to energy-dense diets, sedentary occupations, and urban environments. However, this does not diminish the vulnerability of populations with lower household wealth, who may face more barriers to receiving early diagnosis and effective treatment [ 34 ]. Socioeconomic gradients in NCD prevalence call for a dual approach to prevent or control these conditions. It signifies targeting high-risk behaviors in affluent groups while ensuring equitable access to care for the poor. Our study fills critical gaps in the Bangladeshi literature on NCDs. Unlike earlier studies that primarily focused on prevalence estimates, we employed predictive analytics to identify actionable risk factors. Additionally, by incorporating ML techniques into nationally representative data analysis, this study moves beyond traditional epidemiological approaches, offering a more nuanced and dynamic understanding of disease risk. This methodology also allows for potential future integration into health decision-support systems, enabling personalized risk assessments and real-time public health surveillance. Limitations of this study also merit discussion. First, the cross-sectional design of BDHS restricts causal inference. Second, some important behavioral and clinical predictors (e.g., dietary intake, physical activity levels, and family history) were not available in the dataset, potentially limiting model accuracy. Third, the models were trained on structured categorical variables; incorporating continuous or unstructured data in future analyses may yield improved performance. Lastly, although ML offers powerful tools for prediction, the black-box nature of some algorithms (e.g., ANN) may hinder interpretability for clinical application. Conclusion This study demonstrates that survey data with advanced ML models can effectively identify key predictors of hypertension and diabetes. Our results highlight the dominant role of modifiable risk factors such as overweight/obesity and wealth, in addition to the reinforcing relationship between the two conditions. These findings should guide the design of targeted, data-driven public health interventions that prioritize high-risk groups, promote early screening, and support healthy lifestyle changes to reduce the burden of NCDs across Bangladish. Data Availability Data is available from the DHS website: https://dhsprogram.com/data/available-datasets.cfm https://dhsprogram.com/data/available-datasets.cfm Declarations Abbreviations ADASYN nAdaptive synthetic sampling BMI Body mass index BP Blood pressure CKD Chronic kidney disease GFR Glomerular filtration rate HDL High density lipoprotein LDA Linear discriminant analysis ML Machine learning NHANES National Health and Nutrition Examination Survey SMOTE Synthetic Minority Over-sampling Technique SVM Support vector machine. Consent for publication Not applicable Availability of data and material Data is available from https://dhsprogram.com/data/available-datasets.cfm Funding No funding was received for this study. Competing Interests: None declared. Ethics, Consent to Participate, and Consent to Publish declarations Not applicable. Authors contributions Conceptualization: Gulam Muhammed Al Kibria Data Cleaning and Analysis: Gulam Muhammed Al Kibria Investigation: Gulam Muhammed Al Kibria, Golam Shariar, Tarina Khan, James O’Hagan, Mohammed Elfarawami Methodology: Gulam Muhammed Al Kibria, James O’Hagan Writing-Original Draft: Gulam Muhammed Al Kibria Writing – Review & Editing: Golam Shariar, Tarina Khan, James O’Hagan, Mohammed Elfarawami, Mohammed Elfarawami Acknowledgment The authors thank the survey participants for their time. References 1. ↵ Forouzanfar MH , Liu P , Roth GA , Ng M , Biryukov S , Marczak L , et al. Global Burden of Hypertension and Systolic Blood Pressure of at Least 110 to 115 mm Hg, 1990-2015 . JAMA . 2017 ; 317 : 165 – 182 . doi: 10.1001/jama.2016.19043 OpenUrl CrossRef PubMed 2. Ataklte F , Erqou S , Kaptoge S , Taye B , Echouffo-Tcheugui JB , Kengne AP . Burden of Undiagnosed Hypertension in Sub-Saharan Africa: A Systematic Review and Meta–Analysis . Hypertension . 2015 ; 65 : 291 – 298 . doi: 10.1161/HYPERTENSIONAHA.114.04394 OpenUrl CrossRef 3. ↵ Kibria GMA , Swasey K , Choudhury A , Burrowes V , Stafford KA , Uddin SMI , et al. The new 2017 ACC/AHA guideline for classification of hypertension: changes in prevalence of hypertension among adults in Bangladesh . J Hum Hypertens . 2018 . doi: 10.1038/s41371-018-0080-z OpenUrl CrossRef 4. ↵ Heron M , Anderson RN . Changes in the Leading Cause of Death: Recent Patterns in Heart Disease and Cancer Mortality . NCHS Data Brief . 2016 ; 1 – 8 . 5. ↵ Roth GA , Abate D , Abate KH , Abay SM , Abbafati C , Abbasi N , et al. Global, regional, and national age-sex-specific mortality for 282 causes of death in 195 countries and territories, 1980– 2017: a systematic analysis for the Global Burden of Disease Study 2017 . The Lancet . 2018 ; 392 : 1736 – 1788 . doi: 10.1016/S0140-6736(18)32203-7 OpenUrl CrossRef PubMed 6. ↵ Chowdhury MAB , Uddin MJ , Khan HMR , Haque MR . Type 2 diabetes and its correlates among adults in Bangladesh: a population based study . BMC Public Health . 2015 ; 15 : 1070 . doi: 10.1186/s12889-015-2413-y OpenUrl CrossRef PubMed 7. ↵ Nguyen TN , Chow CK . Global and national high blood pressure burden and control . The Lancet . 2021 ; 398 : 932 – 933 . doi: 10.1016/S0140-6736(21)01688-3 OpenUrl CrossRef 8. ↵ International Diabetes Federation . IDF Diabetes Atlas, 7th edition . 2019 . Available: https://www.diabetesatlas.org 9. ↵ National Institute of Population Research and Training (NIPORT), Mitra and Associates, ICF International. Bangladesh Demographic and Health Survey 2022. Dhaka, Bangladesh ; 2024 . 10. ↵ National Institute of Population Research and Training (NIPORT), Mitra and Associates, ICF International. Bangladesh Demographic and Health Survey 2017-18. Dhaka, Bangladesh ; 2020 . 11. ↵ Giskeødegård GF , Lydersen S. Machine learning in medical research . Tidsskr Den Nor Legeforening . 2023 [cited 24 Mar 2025 ]. doi: 10.4045/tidsskr.23.0196 OpenUrl CrossRef 12. ↵ Iniesta R , Stahl D , McGuffin P. Machine learning, statistical learning and the future of biological research in psychiatry . Psychol Med . 2016 ; 46 : 2455 – 2465 . doi: 10.1017/S0033291716001367 OpenUrl CrossRef PubMed 13. ↵ Oliullah K , Rasel MH , Islam MdM , Islam MdR , Wadud MdAH , Whaiduzzaman Md. A stacked ensemble machine learning approach for the prediction of diabetes . J Diabetes Metab Disord . 2023 ; 23 : 603 – 617 . doi: 10.1007/s40200-023-01321-2 OpenUrl CrossRef PubMed 14. ↵ Das S , Rahman R , Talukder A. Determinants of developing cardiovascular disease risk with emphasis on type-2 diabetes and predictive modeling utilizing machine learning algorithms . Medicine (Baltimore) . 2024 ; 103 : e40813 . doi: 10.1097/MD.0000000000040813 OpenUrl CrossRef PubMed 15. ↵ Ahmed SM , Hadi A , Razzaque A , Ashraf A , Juvekar S , Ng N , et al. Clustering of chronic non-communicable disease risk factors among selected Asian populations: levels and determinants . Glob Health Action . 2009 ; 2 . doi: 10.3402/gha.v2i0.1986 OpenUrl CrossRef 16. ↵ Kibria GMA , Hashan MR , Hossain MM , Zaman SB , Stennett CA . Clustering of hypertension, diabetes and overweight/obesity according to socioeconomic status among Bangladeshi adults . J Biosoc Sci . 2021 ; 53 : 157 – 166 . doi: 10.1017/S0021932020000085 OpenUrl CrossRef PubMed 17. ↵ Islam AKMM , Majumder AAS. Hypertension in Bangladesh: a review . Indian Heart J . 2012 ; 64 : 319 – 323 . doi: 10.1016/S0019-4832(12)60096-0 OpenUrl CrossRef PubMed 18. ↵ Akhtar S , Nasir JA , Sarwar A , Nasr N , Javed A , Majeed R , et al. Prevalence of diabetes and prediabetes in Bangladesh: a systematic review and meta-analysis . BMJ Open . 2020 ; 10 : e036086 . doi: 10.1136/bmjopen-2019-036086 OpenUrl Abstract / FREE Full Text 19. ↵ Islam JY , Zaman MM , Bhuiyan MR , Haq SA , Ahmed S , Al-Qadir AZ . Prevalence and determinants of hyperglycaemia among adults in Bangladesh: results from a population-based national survey . BMJ Open . 2019 ; 9 : e029674 . doi: 10.1136/bmjopen-2019-029674 OpenUrl Abstract / FREE Full Text 20. ↵ National Institute of Population Research and Training (NIPORT), Mitra and Associates, ICF International . Bangladesh Demographic and Health Survey 2011. Dhaka, Bangladesh ; 2013 . Available: https://dhsprogram.com/pubs/pdf/fr265/fr265.pdf 21. ↵ Colussi G , Da Porto A , Cavarape A. Hypertension and type 2 diabetes: lights and shadows about causality . J Hum Hypertens . 2020 ; 34 : 91 – 93 . doi: 10.1038/s41371-019-0268-x OpenUrl CrossRef PubMed 22. ↵ Sun D , Zhou T , Heianza Y , Li X , Fan M , Fonseca VA , et al. Type 2 Diabetes and Hypertension: A Study on Bidirectional Causality . Circ Res . 2019 ; 124 : 930 – 937 . doi: 10.1161/CIRCRESAHA.118.314487 OpenUrl CrossRef PubMed 23. ↵ Nakano H , Shiina K , Takahashi T , Fujii M , Iwasaki Y , Matsumoto C , et al. Bidirectional Longitudinal Relationships Between Arterial Stiffness and Hypertension Are Independent of Those Between Arterial Stiffness and Diabetes: A Large-Scale Prospective Observational Study in Employees of a Japanese Company . J Am Heart Assoc . 2022 ; 11 : e025924 . doi: 10.1161/JAHA.121.025924 OpenUrl CrossRef PubMed 24. ↵ Sowjanya AM , Mrudula O. Effective treatment of imbalanced datasets in health care using modified SMOTE coupled with stacked deep learning algorithms . Appl Nanosci . 2023 ; 13 : 1829 – 1840 . doi: 10.1007/s13204-021-02063-4 OpenUrl CrossRef PubMed 25. ↵ Maietta S Khan TM , Xu S , Khan ZG , Uzair Chishti M. Implementing Multilabeling, ADASYN, and ReliefF Techniques for Classification of Breast Cancer Diagnostic through Machine Learning: Efficient Computer-Aided Diagnostic System . Maietta S , editor. J Healthc Eng . 2021 ; 2021 : 1 – 15 . doi: 10.1155/2021/5577636 OpenUrl CrossRef 26. ↵ Chowdhury MAB , Uddin MdJ , Haque MdR , Ibrahimou B. Hypertension among adults in Bangladesh: evidence from a national cross-sectional survey . BMC Cardiovasc Disord . 2016 ; 16 . doi: 10.1186/s12872-016-0197-3 OpenUrl CrossRef 27. ↵ Biswas T , Islam MdS , Linton N , Rawal LB . Socio-Economic Inequality of Chronic Non-Communicable Diseases in Bangladesh . Rahman M, editor. PLOS ONE . 2016 ; 11 : e0167140 . doi: 10.1371/journal.pone.0167140 OpenUrl CrossRef 28. ↵ Kibria GMA , Swasey K , Hasan MdZ , Choudhury A , Gupta RD , Abariga SA , et al. Determinants of hypertension among adults in Bangladesh as per the JNC7 and 2017 ACC/AHA hypertension guidelines . J Am Soc Hypertens . 2018 [cited 24 Oct 2018 ]. doi: 10.1016/j.jash.2018.10.004 OpenUrl CrossRef 29. ↵ Uthman O Agho KE , Osuagwu UL , Ezeh OK , Ghimire PR , Chitekwe S , Ogbo FA . Gender differences in factors associated with prehypertension and hypertension in Nepal: A nationwide survey . Uthman O , editor. PLOS ONE . 2018 ; 13 : e0203278 . doi: 10.1371/journal.pone.0203278 OpenUrl CrossRef PubMed 30. ↵ Doumas M , Papademetriou V , Faselis C , Kokkinos P. Gender differences in hypertension: myths and reality . Curr Hypertens Rep . 2013 ; 15 : 321 – 330 . doi: 10.1007/s11906-013-0359-y OpenUrl CrossRef PubMed 31. ↵ Shariq OA , McKenzie TJ . Obesity-related hypertension: a review of pathophysiology, management, and the role of metabolic surgery . Gland Surg . 2020 ; 9 : 80 – 93 . doi: 10.21037/gs.2019.12.03 OpenUrl CrossRef PubMed 32. ↵ Artasensi A , Mazzolari A , Pedretti A , Vistoli G , Fumagalli L. Obesity and Type 2 Diabetes: Adiposopathy as a Triggering Factor and Therapeutic Options . Mol Basel Switz . 2023 ; 28 : 3094 . doi: 10.3390/molecules28073094 OpenUrl CrossRef 33. ↵ Iacobini C , Pugliese G , Blasetti Fantauzzi C , Federici M , Menini S. Metabolically healthy versus metabolically unhealthy obesity . Metabolism . 2019 ; 92 : 51 – 60 . doi: 10.1016/j.metabol.2018.11.009 OpenUrl CrossRef PubMed 34. ↵ Kibria GMA , Rahman Shawon MS , Hashan MR , Khan MH , Gibson DG . Disparities and factors affecting hypertension diagnosis from qualified doctors in Bangladesh and its impact on receiving hypertension control advice: Analysis of demographic & health survey 2017-18 . PLOS Glob Public Health . 2024 ; 4 : e0003496 . doi: 10.1371/journal.pgph.0003496 OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted June 04, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Application of Machine Learning Approaches to Develop Predictive Models for Diabetes and Hypertension among Bangladesh Adults Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Application of Machine Learning Approaches to Develop Predictive Models for Diabetes and Hypertension among Bangladesh Adults Gulam Muhammed Al Kibria , James O’Hagan , Golam Shariar , Tarina Khan , Mohammed Elfaramawi medRxiv 2025.05.30.25328660; doi: https://doi.org/10.1101/2025.05.30.25328660 Share This Article: Copy Citation Tools Application of Machine Learning Approaches to Develop Predictive Models for Diabetes and Hypertension among Bangladesh Adults Gulam Muhammed Al Kibria , James O’Hagan , Golam Shariar , Tarina Khan , Mohammed Elfaramawi medRxiv 2025.05.30.25328660; doi: https://doi.org/10.1101/2025.05.30.25328660 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Cardiovascular Medicine Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15228) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6599) Geriatric Medicine (668) Health Economics (997) Health Informatics (4536) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3332) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9231) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a0067924caa6ad07',t:'MTc3OTU2NDYwNg=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.