Full text
40,122 characters
· extracted from
preprint-html
· click to expand
Beyond Accuracy: A Cost-Aware Approach to Skin Lesion Detection Across Skin Tone Imbalances | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Beyond Accuracy: A Cost-Aware Approach to Skin Lesion Detection Across Skin Tone Imbalances Md Mohit Hasan , Mahbuba Tasnime Suchi , Md Hasibul Habib , Sumya Akter , Zarin Tasnim Rothy , A.M.Tayeful Islam , Tanmoy Sarkar Pias , David Eisenberg , Simon Bin Akter doi: https://doi.org/10.1101/2024.12.11.24318858 Md Mohit Hasan 1 Dept. 
of Computer Science and Engineering, Northern University Bangladesh , Dhaka, Bangladesh Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mahbuba Tasnime Suchi 1 Dept. of Computer Science and Engineering, Northern University Bangladesh , Dhaka, Bangladesh Find this author on Google Scholar Find this author on PubMed Search for this author on this site Md Hasibul Habib 1 Dept. of Computer Science and Engineering, Northern University Bangladesh , Dhaka, Bangladesh Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sumya Akter 1 Dept. of Computer Science and Engineering, Northern University Bangladesh , Dhaka, Bangladesh Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zarin Tasnim Rothy 1 Dept. of Computer Science and Engineering, Northern University Bangladesh , Dhaka, Bangladesh Find this author on Google Scholar Find this author on PubMed Search for this author on this site A.M.Tayeful Islam 1 Dept. of Computer Science and Engineering, Northern University Bangladesh , Dhaka, Bangladesh Find this author on Google Scholar Find this author on PubMed Search for this author on this site Tanmoy Sarkar Pias 2 Dept. of Computer Science, Virginia Tech , Blacksburg, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site David Eisenberg 3 Dept. of Information Management and Business Analytics, Montclair State University, Feliciano School of Business , New Jersey, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Simon Bin Akter 1 Dept. 
of Computer Science and Engineering, Northern University Bangladesh , Dhaka, Bangladesh Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: simon.akter{at}nub.ac.bd Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Skin lesion prediction using artificial intelligence (AI) models is highly dependent on skin tone, yet current approaches largely overlook this critical factor. The Fitzpatrick 17k dataset, which contains six skin tone categories: lighter to darker, is severely imbalanced, with most models biased toward lighter skin tones. Previous efforts to improve overall accuracy fall short: overall accuracy fails to reflect true performance across imbalances. This creates a significant gap, as effective skin lesion detection must work across all skin tones, not just a few. To address this, we introduce the Cost-Aware EfficientNet (CAEN) model, combining cost-sensitive learning (CSL) and attention mechanisms to tackle imbalanced data and ensure the model generalizes well across all skin tones with detailed interpretability. Rather than simply improving accuracy, our model enhances class-specific performance, achieving 79% recall for non-neoplastic, 88% for benign, and 80% for malignant lesions. This indicates an overall improvement in darker tones of approximately 44.86% compared to state-of-the-art results from prior studies. Furthermore, it remains robust across augmented test conditions, such as changes in brightness, contrast, blur, and zoom, providing balanced outcomes for diverse skin tones. This novel approach offers a significant leap toward fair and reliable skin lesion prediction for all skin tones with interpretability. I. INTRODUCTION Skin lesions affect individuals across all skin tones, but diagnosing them can be difficult due to variations in pigmentation [ 1 ]. 
The Fitzpatrick 17k [ 2 ] dataset provides a valuable resource for classifying skin diseases using artificial intelligence (AI) [ 3 ]–[ 5 ] approaches, as it covers a diverse range of skin tones [ 6 ]–[ 9 ]. However, a major challenge in applying AI to this dataset is the imbalance among different skin tone classes [ 10 ]. Ensuring accurate diagnosis for all skin tones is critical, as biased AI models can lead to incorrect diagnoses, especially for underrepresented skin tones [ 1 ], [ 9 ], [ 10 ]. Previous studies using the Fitzpatrick 17k dataset have applied AI models for skin lesion classification, often using accuracy as the main performance measure [ 1 ], [ 6 ]– [ 9 ]. However, this metric can hide how much better these models perform on majority classes (more common skin tones) compared to minority classes (less represented skin tones), leading to biased results [ 11 ]–[ 13 ]. While augmentation techniques were applied, they focused on improving overall accuracy and did not clearly show whether the models were effective at identifying minority skin disease classes across different skin tones [ 1 ], [ 6 ]–[ 9 ]. Prior studies [ 7 ]–[ 10 ] did not explore how models trained on one skin tone performed against others, missing a clearer analysis of bias. Additionally, these studies [ 7 ]–[ 10 ] showed significantly lower performance on darker tones compared to lighter ones. It raises questions about the generalizability of these models across different skin tones, highlighting a potential gap in understanding their applicability and effectiveness across diverse populations. Furthermore, these studies [ 1 ], [ 6 ]–[ 10 ] did not address the impact of image quality during testing, such as variations in brightness, contrast, blur, and zoom, which could affect model performance. The need for accurate diagnosis across all skin tones makes it essential to address the limitations of prior approaches [ 1 ], [ 6 ]–[ 10 ]. 
The reliance on overall performance metrics does not offer a reliable measure of fairness in class-wise predictions, especially when dealing with imbalanced datasets like Fitzpatrick 17k [ 12 ], [ 13 ]. These studies also fail to demonstrate the model’s generalizability and robustness across different skin tones, as indicated by their poor performance on darker skin tones [ 7 ]–[ 10 ]. Therefore, it is essential to explore how imbalances can be more carefully addressed, improve fairness, and ensure more equitable predictions across all skin tones. This research addresses the key questions outlined below. (1) Are AI algorithms biased when detecting skin lesions across different skin tones? (2) Can we rely on accuracy metrics when the dataset is imbalanced, like in the Fitzpatrick 17k skin tone dataset, despite efforts in prior research [ 1 ], [ 6 ]–[ 9 ] to improve accuracy for each skin tone? (3) Can a model trained on lighter skin tones accurately predict skin lesions on darker skin tones and vice versa? This research addresses the class imbalance in the Fitzpatrick 17k [ 2 ] dataset by exploring various modeling approaches combined with augmentation techniques. The primary contribution is the evaluation of class-wise performance and its improvement instead of just overall metrics, alongside the introduction of a novel Cost-Aware EfficientNet (CAEN) model to effectively handle class imbalance. CAEN, based on the EfficientNet architecture, incorporates dynamic cost-sensitive learning (CSL) [ 14 ] and an attention mechanism [ 15 ], fine-tuned to predict skin lesions, including (i) non-neoplastic, (ii) benign, and (iii) malignant, across six skin tones from light to dark. The study discusses the issue of generalizability and robustness by training the proposed model on lighter skin tones and testing it on darker tones, and vice versa. It also ensures generalizability and robust, reliable performance across all skin tones through the proposed model. 
Additionally, we have tested our proposed model across different augmented samples from the test set, varying brightness, contrast, blur, and zoom, highlighting the importance of fairness in AI models to ensure accurate predictions for all skin tones. Our study also includes detailed interpretability [ 3 ], [ 11 ], [ 16 ] of its predictions across different skin tones, ensuring transparency in how the model makes decisions for diverse populations. The findings of this research have significant implications for dermatology by enhancing class-wise accuracy in skin lesion classification across diverse skin tones. This work promotes more equitable diagnostic methods and serves as a valuable reference for future researchers aiming to select strategies that mitigate bias in predictions [ 1 ] across different skin tones. A. Related Works and Their Limitations Previous studies [ 17 ], [ 18 ] on skin disease prediction have often overlooked how well models perform across different skin tones. Although datasets like Fitzpatrick 17k aim to address skin tone representation, prior studies [ 1 ], [ 6 ]–[ 10 ] have not effectively demonstrated how model improvements vary by skin tone. Many studies [ 1 ], [ 6 ]–[ 9 ] rely on overall accuracy metrics, which can be misleading, reflecting high performance on majority classes while neglecting minority groups [ 12 ], [ 13 ]. In contrast, our study introduces a new CAEN modeling technique that shows clear class-wise recall improvements in skin disease prediction for all skin tones. For instance, state-of-the-art recalls from prior research [ 10 ] for benign and malignant classes across light skin tones improved from 0.52 and 0.73 to 0.86 and 0.75 . For moderate skin tones, these classes improved from 0.60 and 0.65 to 0.88 and 0.84 , and for dark skin tones, from 0.55 and 0.45 to 0.88 and 0.86 . These studies [ 7 ]–[ 10 ] consistently showed lower performance for minority classes, such as benign and malignant, on darker tones. 
However, we have significantly improved performance on darker tones for these minority classes. Further, we validate the generalizability and robustness of our model by testing it on a diverse set of augmented samples with varying image qualities, ensuring balanced performance, which has been lacking in previous research. II. MATERIALS AND METHODS We utilized the Fitzpatrick 17k [ 2 ] dataset, which includes six different skin tones, ranging from lighter to darker, to predict skin lesions using AI approaches as detailed in Fig 1 . To address biases caused by class imbalances and limited samples from specific skin tones, we fine-tuned a novel CAEN modeling technique. We compared our proposed model with several existing techniques, analyzing class-wise recall performance to identify improvements. We also examined how performance varies by training the proposed model on lighter skin tones and testing it on darker skin tones, as well as the reverse scenario. Additionally, we tested the model’s performance under varying conditions, such as changes in brightness, contrast, blur, and zoom in the samples from the test set. Lastly, this study also comprehensively interprets its predictions across various skin tones. This comprehensive evaluation allowed us to assess the robustness and reliability of our model across different skin tones and image quality variations. Download figure Open in new tab Fig. 1. Complete workflow of improved skin lesion prediction across heavy skin tone imbalances. This includes optimized data augmentation, a cost-aware model for imbalance correction, and detailed explainability. A. Data Description The Fitzpatrick 17k [ 2 ] dataset comprises a diverse collection of skin disease images categorized by six distinct skin tones: Type I (very fair), Type II (fair), Type III (medium), Type IV (olive), Type V (brown) , and Type VI (dark) . 
In our study, we utilized a total of 16,012 images from this dataset to predict skin lesions, specifically focusing on three categories: (i) non-neoplastic, (ii) benign, and (iii) malignant lesions. However, it is important to note that the images for different skin lesions are heavily imbalanced, with very limited samples available for certain skin tones, as shown in Table I, posing challenges for accurate model training and evaluation. View this table: View inline View popup Download powerpoint TABLE I. Distribution of skin lesions across different Fitzpatrick scales (skin tones). The data distribution of benign and malignant cases appears to be highly imbalanced compared to non-neoplastic cases, and there are also significantly fewer images representing dark shades. B. Train/Test Formulation The dataset is divided into a training set and a testing set in an 80:20 split ratio. To ensure balanced training and improve model performance, we implemented data augmentation strategies tailored to each skin tone. Specifically, we conducted augmentation separately for each skin tone to balance the benign and malignant lesion classes against the non-neoplastic class. This process involved applying various transformations, including rotation, brightness adjustment, contrast enhancement, zooming, and blurring, to create a more diverse set of training images. To effectively address the class imbalances within our dataset, we significantly increased the number of benign samples by a factor of 5.55, aligning it more closely with the non-neoplastic class. Similarly, we augmented the malignant samples by a factor of 4.86 to achieve a comparable balance, as shown in Table II. These augmentation ratios were determined through iterative testing of various sampling strategies within our predictive model, allowing us to optimize the balance for each class and improve the overall robustness of our model. 
View this table: View inline View popup Download powerpoint TABLE II. Comparison of Non-neoplastic, Benign, and Malignant Counts Pre- and Post-Augmentation in Train Set. The augmentation process involved applying several transformations to the images, such as varying rotation, adjusting brightness, enhancing contrast, zooming, and adding blur. C. Proposed Cost-Aware EfficientNet The proposed CAEN model, represented in Equation 1, builds upon the EfficientNet architecture, integrating CSL [ 14 ] and an attention mechanism [ 15 ] to tackle class imbalance and improve model performance. CSL [ 14 ] assigns higher weights to underrepresented classes, such as benign and malignant cases, ensuring the model focuses more on accurately detecting these categories that are often overlooked in imbalanced datasets. The attention mechanism [ 15 ] allows the model to focus dynamically on the most important areas within an image, improving its ability to capture subtle differences between classes. These improvements make CAEN better at mitigating bias, as it directly addresses the imbalance issue that standard EfficientNet [ 19 ] struggles with. Furthermore, CAEN is more effective at developing a model that generalizes well across different skin tones, as the attention mechanism enables the model to focus on key visual patterns rather than being biased toward skin tone variations, which can be limited in certain datasets. This ability to extract relevant features across diverse cases allows CAEN to perform robustly in a wider range of real-world scenarios compared to the standard EfficientNet [ 19 ], which tends to struggle with such diversity due to its reliance on the limited samples available for particular scenarios. The terms of Equation 1 are described as follows: x represents the input image, while f(x) denotes feature extraction from the EfficientNet base model. The variable w signifies the weights of the dense layer, and b indicates the bias term. 
The function σ represents the softmax activation function, which is applied to the output of the attention mechanism, Att(f(x)), that highlights relevant features in the extracted data. Additionally, y represents the true labels, and CSL(y) denotes the cost-sensitive function that adjusts the loss based on class weights, enhancing the model’s ability to address class imbalances effectively. III. RESULTS AND ANALYSES First, the proposed CAEN model was trained on images of various skin tones together and compared with existing models. Next, it was trained and tested separately for each skin type, and the model with the best accuracy was compared to the CAEN model. Finally, the proposed model was trained on light skin tones and tested on dark tones, and vice versa, to examine how model performance varies. Results are computed ten times each by changing the model hyper-parameters and conventional probability threshold [ 20 ], and the standard deviation (SD) [ 21 ] for each is calculated to demonstrate variance in results. In addition, our study offers an in-depth explanation of its predictions for different skin tones. A. Experimental Results Table III compares different models for skin lesion classification across three classes: non-neoplastic, benign, and malignant, with accuracy shown for each. Some models, like VGG 19, EfficientNet B0, and ResNet 50, achieved higher overall accuracy than the proposed CAEN model, but their class-wise performance, especially for the minority classes (benign and malignant), was significantly lower. The CAEN model, however, showed balanced and improved performance across all classes. It performed particularly well on dark skin tones, which had fewer samples, while also maintaining good accuracy on light skin tones. Even when tested on augmented data with varying image quality, CAEN continued to perform well, especially on darker tones, while balancing performance for lighter tones. 
View this table: View inline View popup Download powerpoint TABLE III. Class-wise recall and accuracy comparison of the proposed model with existing approaches. The model is trained using data from all skin tones together. The test sets are divided into light (I-II), moderate (III-IV), and dark (V-VI) skin tones, with class-wise recall and accuracy presented for each and overall in the last section. "Augmented" indicates that the model was tested on an augmented version of the test set, which varied brightness, contrast, blur, and zoom. Previous studies [ 7 ]–[ 10 ] have applied methods to improve accuracy for each skin tone. Hence, in the comparison in Table IV, models were trained separately for each skin type, and the model with the highest overall accuracy is compared with CAEN. While ResNet 50 achieved higher overall accuracy, its performance for minority classes like benign and malignant was significantly lower. In contrast, CAEN maintained a balanced performance across all classes, indicating that focusing solely on accuracy in imbalanced data can be misleading, and class-wise performance provides a more reliable evaluation. View this table: View inline View popup Download powerpoint TABLE IV. Class-wise recall performance comparison of models trained separately by skin tone. Models were trained on datasets divided by skin tones. The proposed model is compared with the one achieving the highest accuracy to highlight the unreliability of accuracy metrics across imbalanced data. Previous studies [ 7 ]–[ 10 ] failed to investigate how models trained on one skin tone performed when applied to others, overlooking a more thorough analysis of potential skin tone bias. In Table V, we evaluated the CAEN model by training it on one skin tone and testing it on others to see how generalization varies across different skin tones. 
From our results, when the model was trained on lighter tones, it performed better on darker tones for non-neoplastic cases, but its performance for benign and malignant cases was lower compared to when it was trained on moderate tones. For models trained on moderate tones, the results varied: they performed better on non-neoplastic cases for darker tones than for lighter tones, but both darker and lighter tones showed lower performance for benign cases. When the model was trained on darker tones, its performance on lighter and moderate tones dropped, especially for non-neoplastic cases on lighter tones and benign cases on moderate tones, which were significantly low. View this table: View inline View popup Download powerpoint TABLE V. Comparison of recall generalizability when trained on lighter tones and tested on darker tones, and vice versa. The skin tone on which the model is trained is marked in bold, while performance is also tested on the other two skin tones. Lighter tones include Fitzpatrick types I-II, moderate tones include types III-IV, and darker tones include types V-VI. B. Explainability Gradient-Weighted Class Activation Mapping (Grad-CAM) [ 22 ] in Fig. 1 highlights affected areas in skin lesions, distinguishing non-neoplastic, benign, and malignant types. While green is prominent, indicating significant regions, orange and yellow mark the most critical areas. Blue and pink/magenta shades primarily indicate less significant regions but still contribute to the lesion assessment. The color intensity reflects how the model identifies skin tone across the Fitzpatrick scale, as seen from the consistent focus on lesion areas across diverse skin tones (Type I to Type VI), capturing lesions accurately despite varying pigmentation. C. Discussion In this research, we aimed to address key questions regarding the performance of AI algorithms in detecting skin lesions across various skin tones. 
The following summarizes our findings based on the results presented in Table III , Table IV , and Table V , offering insights into the addressed questions. Question 1 In our research, we first explored whether AI models are biased when detecting skin lesions on different skin tones. The results in Table IV highlight that models trained separately for each skin tone, like ResNet 50, achieved higher overall accuracy but performed poorly on minority classes, particularly for darker skin tones. This indicates a significant bias, as the model tended to favor lighter tones, confirming that AI models can indeed be biased based on skin tone. Additionally, Table V illustrates that the performance of the CAEN model varied significantly depending on the skin tone it was trained on. This variation suggests that models can be biased and can not be generalized well across different skin tones, as their effectiveness fluctuates depending on the training set. Question 2 We questioned the reliability of accuracy metrics in imbalanced datasets, like the Fitzpatrick 17k skin tone dataset. From our comparisons in Table IV , it became evident that focusing solely on overall accuracy can be misleading. Although ResNet 50 in Table IV had a higher accuracy, its performance on benign and malignant cases was significantly lower, highlighting the limitations of using accuracy metrics alone. This aligns with findings in prior studies [ 7 ]–[ 10 ] that often fail to adequately address class imbalances, leading to potentially biased conclusions about model effectiveness. Question 3 Finally, we examined whether a model trained on one skin tone can accurately predict lesions on others. Previous studies [ 7 ]–[ 10 ] failed to assess model performance when trained on one skin tone and tested on another, missing a critical opportunity to comprehensively evaluate skin tone bias. Additionally, their improvements mainly benefited light skin tones, failing to generalize across other tones [ 7 ]–[ 10 ]. 
This emphasizes the need for a model that delivers consistent performance across all skin tones, as confirmed by the results in Table V, which show the presence of skin tone bias. Our study advocates for the CAEN model, as outlined in Table III, which trains on all skin tones simultaneously. By utilizing CSL and an attention mechanism, the CAEN model effectively generalizes its performance across diverse skin tones, ensuring better reliability and reducing bias. IV. LIMITATIONS The Fitzpatrick 17k [ 2 ] dataset has very few images of malignant lesions, especially for dark skin tones, making it hard to train a model to accurately classify these lesions on darker skin. Besides, the area of skin disease is often under-represented, with many images showing only small portions of the affected skin, making it difficult to distinguish between categories. This could be improved by reshaping the images under the guidance of a skin disease professional for better visibility and prediction. Additionally, besides the images, important clinical information, such as patient history and other findings, is essential for accurate diagnosis but is not provided in the dataset. V. CONCLUSION In this research, we addressed the bias of AI algorithms in detecting skin lesions across diverse skin tones. Previous research [ 1 ], [ 6 ]–[ 10 ] has mainly focused on increasing the overall accuracy of skin lesion detection across various skin tones. However, our experimentation, as shown in Table IV, indicates that AI models often perform better on lighter skin tones, leading to lower accuracy for darker skin tones, even when the overall accuracy appears high. Therefore, improving overall accuracy cannot be a reliable strategy for imbalanced datasets like Fitzpatrick 17k. Previous studies [ 7 ]–[ 10 ] did not examine how models trained on one skin tone performed on others, missing a comprehensive analysis of potential skin tone bias. 
Furthermore, the enhancements they achieved primarily favored light skin tones and did not generalize well to other tones [ 7 ]–[ 10 ]. This underscores the necessity for a model that provides consistent performance across all skin tones, as evidenced by the results in Table V , which indicate the existence of skin tone bias. Our findings highlight the unreliability of overall accuracy metrics in imbalanced datasets represented in Table IV and advocate for the CAEN model, which effectively generalizes class-wise recall performance across all skin tones by employing dynamic CSL [ 14 ] and attention mechanism [ 15 ]. Class-wise performance metrics were used to effectively indicate improvements, rather than focusing solely on overall accuracy. Even when tested on augmented data with varying image quality—such as differences in brightness, contrast, blur, and zoom—CAEN continued to perform well, especially on darker tones, while balancing performance for lighter tones, demonstrating its reliability across diverse conditions. Our study further presents an extensive analysis of its predictions across various skin tones, promoting transparency in the model’s decision-making for diverse populations. These improvements not only enhance class-wise accuracy in skin lesion classification but also promote equitable diagnostic methods in dermatology, ensuring that all patients receive accurate and fair assessments, regardless of their skin tone. Data Availability Studies of publicly available human data Link: https://skincon-dataset.github.io/ Footnotes hasan_41220200217{at}nub.ac.bd suchi_41210301603{at}nub.ac.bd habib_41210301601{at}nub.ac.bd sumya.akter{at}nub.ac.bd Rothi_NUBCSE{at}nub.ac.bd Tayef_NUBCSE{at}nub.ac.bd tanmoysarkar{at}vt.edu eisenbergd{at}montclair.edu I have Corrected only a few formatting issues, with no major changes. REFERENCES [1]. ↵ E. Akuffo-Addo , L. Samman , L. Munawar , M. Akbik , N. Kokikian , R. Wescott , J. J. 
Wu , Assessing gpt-4’s diagnostic accuracy with darker skin tones: underperformance and implications , Clinical and Experimental Dermatology 49 ( 10 ) ( 2024 ) 1244 – 1245 . OpenUrl PubMed [2]. ↵ Kaggle , Fitzpatrick 17k dataset , accessed: 2024-02-04 ( 2023 ). [3]. ↵ S. Bin Akter , T. Sarkar Pias , S. Rahman Deeba , J. Hossain , H. Abdur Rahman , Ensemble learning based transmission line fault classification using phasor measurement unit (pmu) data with explainable ai (xai) , Plos one 19 ( 2 ) ( 2024 ) e0295144 . OpenUrl PubMed [4]. M. A. I. Siddique , A. Z. B. Aziz , A. Matin , An improved deep learning based classification of human white blood cell images , in: 2020 11th International Conference on Electrical and Computer Engineering (ICECE) , IEEE , 2020 , pp. 149 – 152 . [5]. ↵ R. Rahman , A. F. Rakib , M. Rahman , T. Helaly , T. S. Pias , A real-time end-to-end bangladeshi license plate detection and recognition system for all situations including challenging environmental scenarios , in: 2021 5th International Conference on Electrical Engineering and Information Communication Technology (ICEEICT) , IEEE , 2021 , pp. 1 – 6 . [6]. ↵ J. Schneider , I. Tejani , T. Jarmain , R. Moy , et al. , Diagnosis of skin disease in moderately to highly pigmented skin by artificial intelligence , Authorea Preprints ( 2023 ). [7]. ↵ S. Du , B. Hers , N. Bayasi , G. Hamarneh , R. Garbi , Fairdisco: Fairer ai in dermatology via disentanglement contrastive learning , in: European Conference on Computer Vision , Springer , 2022 , pp. 185 – 202 . [8]. M. Dominguez , J. T. Finnell , Unsupervised softotsunet augmentation for clinical dermatology image classifiers , in: AMIA Annual Symposium Proceedings , Vol. 2023 , American Medical Informatics Association , 2023 , p. 329 . OpenUrl PubMed [9]. ↵ M. Groh , C. Harris , L. Soenksen , F. Lau , R. Han , A. Kim , A. Koochek , O. 
Badri , Evaluating deep neural networks trained on clinical images in dermatology with the fitzpatrick 17k dataset , in: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , 2021 , pp. 1820 – 1828 . [10]. ↵ A. Pundhir , S. Verma , B. Raman , Towards ethical dermatology: Mitigating bias in skin condition classification , in: 2024 International Joint Conference on Neural Networks (IJCNN) , IEEE , 2024 , pp. 1 – 8 . [11]. ↵ S. B. Akter , S. Akter , T. S. Pias , Stroke probability prediction from medical survey data: Ai-driven analysis with insightful feature importance using explainable ai (xai) , in: 2023 26th International Conference on Computer and Information Technology (ICCIT) , IEEE , 2023 , pp. 1 – 6 . [12]. ↵ K. Ghosh , C. Bellinger , R. Corizzo , P. Branco , B. Krawczyk , N. Japkowicz , The class imbalance problem in deep learning , Machine Learning 113 ( 7 ) ( 2024 ) 4845 – 4901 . OpenUrl CrossRef [13]. ↵ B. Cao , Y. Liu , C. Hou , J. Fan , B. Zheng , J. Yin , Expediting the accuracy-improving process of svms for class imbalance learning , IEEE Transactions on Knowledge and Data Engineering 33 ( 11 ) ( 2020 ) 3550 – 3567 . OpenUrl [14]. ↵ I. Araf , A. Idri , I. Chairi , Cost-sensitive learning for imbalanced medical data: a review , Artificial Intelligence Review 57 ( 4 ) ( 2024 ) 80 . OpenUrl [15]. ↵ S. V. Moravvej , S. J. Mousavirad , M. H. Moghadam , M. Saadatmand , An lstm-based plagiarism detection via attention mechanism and a population-based approach for pre-training parameters with imbalanced classes , in: Neural Information Processing: 28th International Conference, ICONIP 2021, Sanur, Bali, Indonesia, December 8–12, 2021, Proceedings, Part III 28 , Springer , 2021 , pp. 690 – 701 . [16]. ↵ S. B. Akter , S. Akter , M. D. Tuli , D. Eisenberg , A. Lotvola , H. Islam , J. F. Fernandez , M. Hüttemann , T. S.
Pias , Fair and explainable myocardial infarction (mi) prediction: Novel strategies for feature selection and class imbalance correction , Computers in Biology and Medicine 184 ( 2025 ) 109413 . OpenUrl [17]. ↵ A. K. Verma , S. Pal , S. Kumar , Comparison of skin disease prediction by feature selection using ensemble data mining techniques , Informatics in Medicine Unlocked 16 ( 2019 ) 100202 . OpenUrl [18]. ↵ A. A. Elngar , R. Kumar , A. Hayat , P. Churi , Intelligent system for skin disease prediction using machine learning , in: Journal of Physics: Conference Series , Vol. 1998 , IOP Publishing , 2021 , p. 012037 . OpenUrl [19]. ↵ V. Goutham , A. Sameerunnisa , S. Babu , T. B. Prakash , Brain tumor classification using efficientnet-b0 model , in: 2022 2nd International Conference on Advance Computing and Innovative Technologies in Engineering (ICACITE) , IEEE , 2022 , pp. 2503 – 2509 . [20]. ↵ J. S. Aguilar-Ruiz , M. Michalak , Classification performance assessment for imbalanced multiclass data , Scientific Reports 14 ( 1 ) ( 2024 ) 10759 . OpenUrl PubMed [21]. ↵ S. Prateek , R. Garg , K. Kumar Saxena , V. Srivastav , H. Vasudev , N. Kumar , Data-driven materials science: application of ml for predicting band gap , Advances in Materials and Processing Technologies 10 ( 2 ) ( 2024 ) 708 – 717 . OpenUrl [22]. ↵ C. Van Zyl , X. Ye , R. Naidoo , Harnessing explainable artificial intelligence for feature selection in time series energy forecasting: A comparative analysis of grad-cam and shap , Applied Energy 353 ( 2024 ) 122079 . OpenUrl View the discussion thread. Back to top Previous Next Posted December 13, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following Beyond Accuracy: A Cost-Aware Approach to Skin Lesion Detection Across Skin Tone Imbalances Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Beyond Accuracy: A Cost-Aware Approach to Skin Lesion Detection Across Skin Tone Imbalances Md Mohit Hasan , Mahbuba Tasnime Suchi , Md Hasibul Habib , Sumya Akter , Zarin Tasnim Rothy , A.M.Tayeful Islam , Tanmoy Sarkar Pias , David Eisenberg , Simon Bin Akter medRxiv 2024.12.11.24318858; doi: https://doi.org/10.1101/2024.12.11.24318858 Share This Article: Copy Citation Tools Beyond Accuracy: A Cost-Aware Approach to Skin Lesion Detection Across Skin Tone Imbalances Md Mohit Hasan , Mahbuba Tasnime Suchi , Md Hasibul Habib , Sumya Akter , Zarin Tasnim Rothy , A.M.Tayeful Islam , Tanmoy Sarkar Pias , David Eisenberg , Simon Bin Akter medRxiv 2024.12.11.24318858; doi: https://doi.org/10.1101/2024.12.11.24318858 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Systems and Quality Improvement Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (302) Cardiovascular Medicine (4453) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15242) Forensic Medicine (30) Gastroenterology (1131) Genetic and Genomic Medicine (6615) Geriatric Medicine (669) Health Economics (1001) Health Informatics (4552) Health Policy (1372) Health Systems and Quality Improvement (1614) Hematology (543) HIV/AIDS (1270) Infectious Diseases (except HIV/AIDS) (15929) 
Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6625) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3344) Ophthalmology (979) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5461) Public and Global Health (9252) Radiology and Imaging (2207) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1197) Rheumatology (597) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a02f04a81803c13d',t:'MTc3OTk4OTc1MQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.