Full text
50,558 characters
· extracted from
preprint-html
· click to expand
BUCAN: Bayesian Uncertainty-aware Classification with Attention Networks for Medical Images | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search BUCAN: Bayesian Uncertainty-aware Classification with Attention Networks for Medical Images Abhinav Sagar doi: https://doi.org/10.1101/2025.11.05.25339638 Abhinav Sagar 1 Vrije Universiteit Brussel (VUB) Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: abhinav.sagar{at}vub.be Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Accurate and reliable medical image classification is critical for 
clinical decision-making across diverse imaging modalities, including X-ray, CT, and MRI. Traditional convolutional neural networks often produce overconfident predictions, limiting their clinical trustworthiness. In this work, we propose an uncertainty-aware, attention-augmented neural network that integrates multi-scale SwirlAttention and FeedBackAttention modules with a Bayesian probabilistic classifier. This framework enables robust feature extraction, interpretable attention maps, and principled estimation of epistemic uncertainty. We evaluate our approach on four diverse datasets, including Diabetic Retinopathy, Kvasir, Skin Cancer, and fused multi-focal Oocyte images, covering a wide range of pathological and morphological variations. Extensive experiments demonstrate that our method outperforms state-of-the-art CNN and transformer-based baselines in terms of accuracy, calibration, and interpretability. Grad-CAM visualizations highlight clinically relevant regions, while uncertainty estimates provide actionable insights for ambiguous cases, making the framework suitable for reliable deployment in real-world clinical settings. Introduction Medical image classification plays a critical role in modern clinical workflows by enabling automated diagnosis, early disease detection, and treatment planning across a wide range of imaging modalities. Despite significant advances with deep learning, conventional convolutional neural networks (CNNs) often operate under deterministic assumptions, producing overconfident predictions when confronted with noisy, ambiguous, or out-of-distribution medical images. This limitation is particularly concerning in safety-critical applications, where diagnostic errors can have severe clinical consequences. Recent research has shown that incorporating attention mechanisms can improve both predictive accuracy and interpretability. 
Channel and spatial attention modules, such as SE (Squeeze-and-Excitation) ( Hu, Shen, and Sun 2018 ) and CBAM (Convolutional Block Attention Module) ( Woo et al. 2018 ), enable networks to adaptively emphasize informative feature maps and focus on the most relevant regions within an image. By jointly modeling inter-channel relationships and spatial dependencies, these mechanisms enhance the network’s ability to discriminate between visually similar classes and provide more interpretable feature representations for clinical decision-making. However, existing attention-based methods still face several challenges. Many architectures tend to overfit on small medical datasets due to their high model complexity, while others struggle to generalize across imaging modalities with varying contrast, resolution, and noise characteristics. Furthermore, most attention mechanisms provide deterministic outputs without quantifying uncertainty, limiting their reliability in critical diagnostic scenarios where confidence estimation is essential. In this study, we propose a novel framework that integrates multi-scale attention with a Bayesian probabilistic classifier for robust medical image classification. Our approach is evaluated on four diverse datasets spanning multiple organs and imaging modalities: Diabetic Retinopathy fundus images, Kvasir endoscopic images, Skin Cancer dermoscopic images, and a fused multi-focal Oocyte dataset. Extensive experiments demonstrate that our method not only achieves state-of-the-art classification performance but also provides interpretable attention maps and meaningful uncertainty estimates, making it particularly suitable for clinical deployment. To further explore the interpretability of our framework, we visualize the Grad-CAM-based attention maps alongside uncertainty quantification for oocyte images, as illustrated in Fig. 1 . 
The highlighted regions in the attention maps represent discriminative features that significantly contribute to the model’s decision-making process. For oocyte classification, the model primarily focuses on morphological structures and texture patterns that indicate variations in oocyte quality. Download figure Open in new tab Figure 1: Illustration of GradCAM-based attention maps along with predicted labels and uncertainty quantification using the Oocytes dataset. The main contributions of this work are summarized as follows: We introduce a novel attention-augmented residual network that integrates SwirlAttention and FeedBackAttention modules to capture multi-scale spatial and channel-wise dependencies in medical images. We incorporate a Bayesian probabilistic classifier using Bayes-by-Backprop, enabling reliable epistemic uncertainty estimation alongside discriminative predictions. We conduct comprehensive experiments on four diverse medical imaging datasets, demonstrating improved accuracy, calibration, and interpretability compared to state-of-the-art CNN and transformer-based baselines. We provide qualitative insights using Grad-CAM visualizations, showing that the network consistently attends to clinically relevant regions and highlighting cases where predictions may be less reliable. Overall, this work presents a unified framework for robust, interpretable, and uncertainty-aware medical image classification, addressing key limitations of conventional deterministic approaches while providing practical tools for real-world clinical applications. Related Work Medical image classification has witnessed rapid advancements with the adoption of deep learning, ranging from convolutional neural networks (CNNs) to vision transformers (ViTs) and hybrid architectures. 
Early works focused on CNNs for various modalities, demonstrating strong performance in tasks such as retinal fundus analysis, dermoscopic lesion classification, and brain tumor detection ( Tajbakhsh et al. 2016 ; Swati et al. 2019 ). Residual and densely connected architectures further improved feature representation and gradient flow, with models such as ResGANet ( Cheng et al. 2022 ) and DenseNet ( Huang et al. 2017 ) achieving state-of-the-art results on multiple datasets. Recent studies have emphasized attention mechanisms to enhance feature discriminability and interpretability. Lesion-aware networks ( Fang et al. 2019 ), multi-scale pyramid fusion models ( Wen et al. 2025 ), and hierarchical attentive fusion approaches ( Abdar et al. 2022 ) have shown that guiding the network to focus on clinically relevant regions improves both accuracy and reliability. Concurrently, uncertainty-aware methods, including BARF ( Abdar et al. 2021 ) and BayTTA ( Sherkatghanad et al. 2025 ), have highlighted the importance of estimating epistemic and aleatoric uncertainty for robust clinical deployment. Transformers and hybrid CNN-Transformer architectures have also gained traction in medical imaging due to their capacity for capturing long-range dependencies. Models such as MedViT ( Manzari et al. 2023 ), MedMamba ( Yue and Li 2024 ), TransMed ( Dai, Gao, and Liu 2021 ), and MedTransNet ( Shaik et al. 2024 ) leverage self-attention for global context modeling, while hybrid designs like EFFResNet-ViT ( Hussain et al. 2025 ) and DBCvT ( Li, Feng, and Xia 2024 ) combine convolutional feature extraction with transformer-based token modeling for improved performance on multi-modal datasets. Several works have focused on model calibration and reliable uncertainty estimation to mitigate overconfident predictions, particularly in class-imbalanced scenarios ( Rajaraman, Ganesan, and Antani 2022 ; Liang et al. 2020 ; Ju et al. 2022 ). Dynamic fusion networks such as DAFNet ( Cai et al. 
2025 ) and multi-task Mamba variants ( Wu and Gou 2025 ) further enhance robustness by adaptively integrating multi-scale and multi-modal information. Despite these advances, challenges remain in simultaneously achieving high classification accuracy, interpretable attention, and reliable uncertainty estimation across diverse medical imaging modalities. The proposed framework addresses these gaps by integrating multi-scale attention modules with a Bayesian probabilistic classifier, offering both strong discriminative capability and trustworthy uncertainty quantification. Our method is evaluated across four distinct datasets, including fundus, dermatology, endoscopy, and fused multi-focal microscopy images, demonstrating broad applicability and improved reliability compared to existing approaches. Methodology Problem Definition Medical image classification aims to automatically categorize medical scans or images into clinically meaningful classes, such as disease presence, tissue type, or anatomical region, based on their visual and structural characteristics. Formally, let the dataset be defined as $\mathcal{D} = \{(I_i, y_i)\}_{i=1}^{N}$, where $I_i \in \mathbb{R}^{3 \times H \times W}$ or $I_i \in \mathbb{R}^{1 \times H \times W}$ represents a medical image (e.g., MRI, CT, or histopathology image) and $y_i \in \mathcal{Y} = \{1, 2, \dots, C\}$ denotes its ground-truth diagnostic label among $C$ possible disease categories. The goal of the model is to learn a mapping function $f_\theta : I \mapsto \hat{y}$, parameterized by $\theta$, that accurately predicts the correct class label $\hat{y}$ for each input image. Traditional CNNs achieve high discriminative power but operate under deterministic weight assumptions, which often yield overconfident predictions in the presence of noisy or out-of-distribution medical data. This limitation is particularly critical in clinical settings where predictive confidence directly affects diagnostic reliability and downstream decision-making. 
To mitigate this issue, we introduce a Bayesian uncertainty-aware framework that models the weight parameters $w$ of the classification layer as probability distributions rather than fixed values. The predictive distribution for a new unseen image $I^*$ is expressed as: $p(y^* \mid I^*, \mathcal{D}) = \int p(y^* \mid I^*, w)\, p(w \mid \mathcal{D})\, dw$, where $p(w \mid \mathcal{D})$ represents the posterior over the weights given the training data. Since this integral is intractable, it is approximated through a variational posterior $q(w \mid \theta)$ using the Bayes-by-Backprop principle. Thus, the learning objective is to infer both discriminative parameters and uncertainty estimates by minimizing the negative evidence lower bound (ELBO): $\mathcal{L}(\theta) = \mathbb{E}_{q(w \mid \theta)}\left[-\log p(y \mid I, w)\right] + D_{\mathrm{KL}}\left(q(w \mid \theta) \,\|\, p(w)\right)$, where the first term corresponds to the expected classification loss and the second term regularizes the approximate posterior toward the prior distribution. In summary, the problem of medical image classification is reformulated as learning a robust, uncertainty-aware mapping $f_\theta : I \mapsto \left(p(y \mid I), u(I)\right)$ that outputs both diagnostic probabilities $p(y \mid I)$ and corresponding uncertainty estimates $u(I)$. This design ensures reliable and interpretable predictions, a crucial property for safety-critical medical decision-making. Network Architecture The proposed architecture integrates deterministic and probabilistic reasoning through a hierarchical attention-based residual backbone coupled with a Bayesian classification head. The overall design aims to achieve robust feature representation under uncertainty while maintaining efficient spatial–channel interactions. 1) Swirl Attention To enhance spatial contextual reasoning, we introduce the Swirl Attention module, which performs multi-scale depthwise convolutions with kernel sizes {3, 5, 7, 9}. These convolutions emulate a “swirling” expansion of the receptive field, enabling the model to capture both fine and coarse spatial dependencies. 
The outputs from each scale are concatenated and fused through a 1 × 1 convolution followed by batch normalization: $F_{\mathrm{out}} = F + \mathrm{BN}\left(\mathrm{Conv}_{1 \times 1}\left(\mathrm{Concat}\left[\mathrm{DWConv}_{k}(F)\right]_{k \in \{3, 5, 7, 9\}}\right)\right)$. This residual formulation ensures stable feature propagation while enriching contextual awareness. 2) FeedBack Attention We propose a novel FeedBack Attention mechanism to jointly refine spatial and channel-wise activations. It consists of (i) a channel attention path based on global average pooling and two-layer bottleneck transformation, and (ii) a spatial attention path computed from the concatenation of average- and max-pooled channel responses. Unlike conventional dual-branch attention (e.g., CBAM ( Woo et al. 2018 )), the FeedBack Attention reuses the spatially modulated features to re-estimate the channel weights, thus forming a feedback loop that progressively refines salient feature activations. The joint attention is applied as: $F' = F \odot \sigma_c \odot \sigma_s$, where $\sigma_c$ and $\sigma_s$ denote channel and spatial attention maps, respectively. 3) Residual Attention Blocks Each residual unit integrates both the Swirl and FeedBack attention modules, providing adaptive feature recalibration and multi-scale context aggregation. When downsampling is required, a 1 × 1 convolutional skip connection is employed to match dimensions. The block formulation is given by: $Y = \mathcal{A}_{\mathrm{FB}}\left(\mathcal{A}_{\mathrm{Sw}}\left(\mathcal{F}(X)\right)\right) + \mathcal{S}(X)$, where $\mathcal{F}$ is the convolutional branch, $\mathcal{S}$ is the skip connection, and the intermediate activations are modulated by the two attention branches $\mathcal{A}_{\mathrm{Sw}}$ (Swirl) and $\mathcal{A}_{\mathrm{FB}}$ (FeedBack). 4) Bayesian Classification Head To quantify epistemic uncertainty, the deterministic backbone is followed by a Bayesian inference layer using the Bayes-by-Backprop framework. The final fully connected layer is replaced by a BayesLinear module, where each weight and bias is modeled as a Gaussian distribution parameterized by $(\mu, \rho)$, with $\sigma = \log(1 + \exp(\rho))$ ensuring positivity. During training, weights are sampled as $w = \mu + \sigma \odot \epsilon$ with $\epsilon \sim \mathcal{N}(0, I)$, and optimized via the evidence lower bound (ELBO), incorporating a Kullback–Leibler divergence term against a standard normal prior. This probabilistic head enables calibrated uncertainty estimation at inference. 
5) Overall Architecture The complete model begins with a convolutional stem followed by four hierarchical Residual Attention Blocks with progressive channel expansion (32 → 64 → 128 → 256). A global average pooling layer and dropout regularization precede the Bayesian classifier. The integrated attention hierarchy and probabilistic inference enable robust and interpretable decision-making under data noise and domain variability. The complete network architecture diagram is shown in Figure 2 . Download figure Open in new tab Figure 2: Illustration of the proposed network architecture and its components - A: Swirl Attention. B: Feedback Attention. C: Gaussian Variational Module. D: Bayes Linear Layer. E: Residual Block. F: Complete Network Architecture. Here, D/W, Conv, CA, SA, KL, Linear, and GAP denote Depthwise Convolution, Regular Convolution, Channel Attention, Spatial Attention, KL Divergence, Fully-Connected Layer, and Global-Average-Pooling Layer, respectively. Loss Function The proposed framework employs a Bayesian variational learning strategy to jointly optimize deterministic feature representations and probabilistic model parameters. Given the Bayes-by-Backprop formulation of the final classification layer, each weight and bias is represented by a variational distribution parameterized by its mean $\mu$ and log-scale parameter $\rho$. This allows the model to capture epistemic uncertainty through stochastic sampling during training. Formally, let $q(w \mid \theta)$ denote the approximate posterior distribution over the network parameters $w$, where $\theta = \{\mu, \rho\}$, and let $p(w)$ represent a standard normal prior. The learning objective is derived from the maximization of the evidence lower bound (ELBO), which can be equivalently expressed as a minimization problem: $\min_{\theta} \; \mathbb{E}_{q(w \mid \theta)}\left[-\log p(y \mid I, w)\right] + \beta\, D_{\mathrm{KL}}\left(q(w \mid \theta) \,\|\, p(w)\right)$, where: - The first term, $\mathbb{E}_{q(w \mid \theta)}\left[-\log p(y \mid I, w)\right]$, corresponds to the expected negative log-likelihood of the correct class and serves as the **classification loss**. 
- The second term, $D_{\mathrm{KL}}\left(q(w \mid \theta) \,\|\, p(w)\right)$, represents the **Kullback–Leibler (KL) divergence** between the approximate posterior and the prior, acting as a regularizer that penalizes overconfident posterior distributions. - The scalar $\beta$ balances predictive accuracy and uncertainty regularization; empirically, $\beta \in [10^{-4}, 10^{-2}]$ yields stable convergence. For medical image classification, the likelihood term is modeled using the categorical cross-entropy loss: $\mathcal{L}_{\mathrm{CE}} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{c=1}^{C} y_{i,c} \log \hat{y}_{i,c}$, where $y_{i,c}$ denotes the one-hot encoded ground-truth label and $\hat{y}_{i,c}$ is the predicted class probability. The final loss function can thus be written as: $\mathcal{L}_{\mathrm{total}} = \mathcal{L}_{\mathrm{CE}} + \beta\, D_{\mathrm{KL}}\left(q(w \mid \theta) \,\|\, p(w)\right)$, with $q(w \mid \theta) = \mathcal{N}(\mu, \sigma^{2})$ and $\sigma = \log(1 + \exp(\rho))$. During training, the reparameterization trick $w = \mu + \sigma \odot \epsilon$, $\epsilon \sim \mathcal{N}(0, I)$, is used to enable gradient-based optimization through stochastic sampling. The model parameters $\theta$ are updated via backpropagation to minimize $\mathcal{L}_{\mathrm{total}}$. This combined objective ensures that the model simultaneously learns discriminative features and captures epistemic uncertainty. As a result, the proposed loss formulation encourages reliable predictions with calibrated confidence—an essential property for robust decision-making in medical image analysis. Experiments Dataset To comprehensively evaluate the effectiveness and generalizability of the proposed medical image classification framework, four publicly available and custom-prepared datasets were utilized. These datasets span multiple imaging modalities, organs, and diagnostic tasks, ensuring robust validation across diverse clinical applications. Diabetic Retinopathy (DR): The Diabetic Retinopathy dataset contains high-resolution retinal fundus images annotated with five severity levels: No DR, Mild, Moderate, Severe , and Proliferative DR . Specifically, the dataset includes 1,805 images without DR, 370 mild cases, 999 moderate cases, 193 severe cases, and 295 proliferative DR cases (Asia Pacific Tele-Ophthalmology Society (APTOS) 2019). 
Kvasir: The Kvasir dataset consists of endoscopic gastrointestinal (GI) tract images annotated for both anatomical landmarks and pathological findings. It contains 1,000 images per category across eight classes: dyed-lifted polyps, dyed-resection margins, esophagitis, normal cecum, normal pylorus, normal z-line, polyps, and ulcerative colitis ( Pogorelov et al. 2017 ). Skin Cancer (HAM10000): The HAM10000 dataset comprises dermoscopic images of skin lesions spanning nine diagnostic categories, covering both benign and malignant conditions. Specifically, it includes 130 images of actinic keratosis, 392 basal cell carcinoma, 111 dermatofibroma, 454 melanoma, 373 nevus, 478 pigmented benign keratosis, 80 seborrheic keratosis, 197 squamous cell carcinoma, and 142 vascular lesions ( Codella et al. 2019 ). Oocyte Multi-Focal Fusion Dataset: A specialized private oocyte imaging dataset was constructed by fusing 11 focal plane images into a single high-information composite using an adaptive fusion strategy. The dataset is categorized into four biologically meaningful classes representing distinct oocyte maturation stages. Implementation Details Dataset and Preprocessing All images were resized to 224 × 224 pixels and normalized using standard ImageNet statistics. For cross-validation experiments, a 5-fold stratified splitting strategy was employed to preserve class distributions across folds. Training Models were optimized using the Adam optimizer with an initial learning rate of $1 \times 10^{-4}$. Training was conducted for 100 epochs per fold in the cross-validation setting using a batch size of 4. During inference, Monte Carlo Dropout with 30 stochastic forward passes was utilized to estimate epistemic uncertainty. Software and Hardware The framework was implemented in Python 3.10 using PyTorch 2.1, torchvision, and scikit-learn. All training and evaluation were performed on a workstation equipped with an NVIDIA A100 GPU (CUDA 11.8), 32 GB RAM, and an Intel Xeon CPU. 
Evaluation Metrics Model performance was assessed using accuracy, precision, recall, F1-score, Matthews correlation coefficient (MCC), and ROC-AUC. Calibration quality was evaluated using the Brier score, Expected Calibration Error (ECE), and reliability diagrams. Confusion matrices, ROC curves, and precision-recall curves were generated for qualitative assessment. Additionally, Grad-CAM was applied to visualize attention maps and interpret model decision-making. Comparison Approaches To evaluate the effectiveness of the proposed uncertainty-aware, attention-augmented framework, we benchmark it against several state-of-the-art models for medical image classification. The selected baselines encompass a broad range of architectures, including conventional convolutional neural networks (CNNs), lightweight mobile networks, and modern transformer-based approaches, providing a comprehensive performance comparison. Quantitative Performance Quantitative evaluation was conducted on four diverse medical imaging datasets: Diabetic Retinopathy (Asia Pacific Tele-Ophthalmology Society (APTOS) 2019), Kvasir ( Pogorelov et al. 2017 ), Skin Cancer (HAM10000) ( Codella et al. 2019 ), and the private Oocyte Multi-Focal Fusion dataset. The comparison approaches include the following representative models: CNN-based models: DenseNet121 ( Huang et al. 2017 ), ResNet50 ( He et al. 2016 ), ResNeXt101 ( Xie et al. 2017 ), RepVGG ( Ding et al. 2021 ), ShuffleNetV2 ( Ma et al. 2018 ), MobileNetV3 ( Howard et al. 2019 ), EfficientNetB0 ( Tan and Le 2019 ), and InceptionNextTiny ( Yu et al. 2024 ). Transformer-based and hybrid models: Simple-ViT ( Beyer, Zhai, and Kolesnikov 2022 ), Region-ViT ( Chen, Panda, and Fan 2021 ), VAN ( Guo et al. 2023 ), EfficientFormerL1 ( Li et al. 2022 ), Mobile-ViTXXS ( Mehta and Rastegari 2021 ), MLPMixer ( Tolstikhin et al. 2021 ), and ConvMixer ( Trockman and Kolter 2022 ). 
All models were trained and evaluated under the same preprocessing, data augmentation, and cross-validation protocols, ensuring a fair comparison. Qualitative Performance Figure 3 offers qualitative insights into the model’s performance. Figure 3 A depicts the confusion matrix using the Diabetic-Retinopathy dataset, showing that the classifier achieves high true positive rates across most classes with only a few misclassifications. Figure 3 B illustrates the ROC–AUC curves using the skin cancer dataset, indicating strong discriminative power between classes, as most curves approach the ideal top-left region. The right diagram in Figure 3 C presents the precision–recall curves using the Kvasir dataset, demonstrating that the model sustains high precision even at elevated recall levels. Download figure Open in new tab Figure 3: From left to right - A: Confusion matrix using the diabetic retinopathy dataset, B: ROC-AUC curve using the skin cancer dataset, and C: Precision-Recall curve using the Kvasir dataset Attention Visualization To provide insights into the decision-making process of the proposed model, Grad-CAM ( Selvaraju et al. 2017 ) was employed to generate visual explanations of class-specific activations. For each input image, heatmaps were overlaid on the original images to highlight regions that contributed most strongly to the predicted class. The Grad-CAM visualizations reveal that the model consistently attends to clinically relevant regions. For example, in retinal images, the model highlights microaneurysms, hemorrhages, and exudates corresponding to diabetic retinopathy severity as in Figure 4 . Similarly, in dermoscopic images, attention is concentrated on lesion boundaries and abnormal tissue structures as in Figure 5 . Download figure Open in new tab Figure 4: Grad-CAM visual comparisons between our proposed model and other state-of-the-art image classification approaches on the Diabetic Retinopathy dataset. 
Download figure Open in new tab Figure 5: Grad-CAM visual comparisons between our proposed model and other state-of-the-art image classification approaches on the Skin Cancer dataset. These qualitative results demonstrate that the attention mechanisms integrated into the network, coupled with Grad-CAM analysis, provide interpretable and clinically meaningful explanations for predictions. They also help identify cases where the model may rely on spurious features, guiding future refinement of both architecture and training strategies for enhanced reliability. Reliability Diagram Figure 6 illustrates the reliability diagrams for multiple datasets, including Diabetic-Retinopathy, Kvasir, Oocytes, and Skin-Cancer. Each plot compares the model’s predicted confidence (X-axis) with the corresponding empirical accuracy (Y-axis), providing insights into calibration quality. Ideally, a well-calibrated model’s reliability curve aligns closely with the diagonal reference line, indicating that predicted probabilities accurately reflect true likelihoods. The diagrams show that, across most datasets, the model’s predictions exhibit reasonable calibration, though slight deviations from the diagonal suggest instances of overconfidence or underconfidence in certain confidence intervals. Download figure Open in new tab Figure 6: Reliability Diagram using datasets - A: Diabetic-Retinopathy, B: Kvasir, C: Oocytes, and D: Skin-Cancer. The Y-axis represents accuracy, and the X-axis represents confidence. Top Uncertain and Certain Samples To gain deeper insights into the reliability of the proposed model, we further examine its predictive uncertainty on the Diabetic-Retinopathy dataset. Figure 7 presents the five most uncertain and the five most certain cases as estimated by our framework. The most certain cases exhibit high-confidence predictions consistent with the ground-truth labels, underscoring the model’s robustness in recognizing clear diagnostic patterns. 
In contrast, the most uncertain cases involve challenging samples—such as those with ambiguous lesion boundaries or visually similar inter-class features—where the model demonstrates lower confidence. This analysis highlights not only the value of incorporating uncertainty estimation in medical image classification but also its potential as a diagnostic tool for flagging cases that may warrant additional expert assessment. Download figure Open in new tab Figure 7: Top row shows the 5 most uncertain samples, while the bottom row shows the 5 most certain samples using the diabetic-retinopathy dataset. Ablation Study Table 2 presents an extensive ablation study conducted on two medical datasets—Diabetic-Retinopathy (A) and Skin-Cancer (B)—to evaluate the impact of different architectural components on model performance. Across both datasets, the proposed model (“Ours”) consistently achieves superior or highly competitive results in nearly all key metrics, including AUC, Accuracy, F1 Score, and MCC, while also demonstrating the lowest computational overhead in terms of inference time and the highest FPS. These results highlight the synergistic benefit of integrating Swirl Attention, Feedback Attention, and Bayesian fully connected (BayesFC) mechanisms, yielding improved predictive reliability, computational efficiency, and uncertainty calibration across diverse medical imaging tasks. View this table: View inline View popup Download powerpoint Table 1: A: Quantitative comparison on the Diabetic Retinopathy dataset. B: Quantitative comparison on the Kvasir dataset. C: Quantitative comparison on the Oocytes dataset. D: Quantitative comparison on the Skin Cancer dataset. The best values are highlighted in bold, while in cases where our model performs the second best, we highlight with the color red. View this table: View inline View popup Download powerpoint Table 2: Ablation study using different components in the network architecture. A: Diabetic-Retinopathy dataset. 
B: Skin-Cancer dataset. Here, Conv, SA, CA, and FC denote Convolutional layer, Spatial Attention, Channel Attention, and Fully-Connected layer, respectively. The best values are highlighted in bold. Table 3 presents the ablation analysis comparing the proposed loss function against the conventional Cross-Entropy (CE) loss on the Diabetic-Retinopathy and Skin-Cancer datasets. The results clearly demonstrate the effectiveness of the proposed loss formulation in improving both predictive accuracy and calibration quality. These consistent improvements across datasets highlight the robustness and calibration benefits of the proposed loss, effectively balancing classification accuracy and uncertainty awareness in medical diagnosis tasks. View this table: View inline View popup Download powerpoint Table 3: Ablation study using loss function. A: Diabetic-Retinopathy dataset. B: Skin-Cancer dataset. Here CE denotes the Cross-Entropy loss. The best values are highlighted in bold. Failure Cases Despite the overall strong performance of the proposed uncertainty-aware, attention-augmented framework, certain failure cases were observed across the evaluated datasets. In the Diabetic Retinopathy and Skin-Cancer datasets, intermediate stages or borderline morphological variations were occasionally misclassified due to overlapping feature representations between adjacent classes (e.g., Mild vs. Moderate DR). In dermoscopic images from the Skin Cancer dataset, lesions with atypical pigmentation or small size were occasionally misclassified, highlighting the challenge of rare or underrepresented patterns. We observe instances of both False Positives and False Negatives in Figure 8 . False Positives occur when the model’s predictions are correct, but Grad-CAM reveals that the model is focusing on irrelevant or inaccurate regions of the image. 
False Negatives, on the other hand, occur when the model’s predictions are incorrect, even though Grad-CAM highlights the correct or semantically relevant regions of the image. Download figure Open in new tab Figure 8: Illustration of some failure cases using GradCAM-based attention maps along with true and predicted labels using the Skin-Cancer dataset. These failure cases underscore the importance of high-quality imaging, sufficient class representation, and careful preprocessing. They also indicate potential avenues for future work, including multi-modal data integration, improved attention-guided feature extraction, and uncertainty-driven active learning to better handle ambiguous or visually challenging samples. Conclusions In this work, we propose an uncertainty-aware, attention-augmented neural network for medical image classification that combines multi-scale SwirlAttention and FeedBack-Attention modules with a Bayes-by-Backprop probabilistic classifier. The framework enables both discriminative feature learning and reliable uncertainty estimation, addressing the overconfidence of conventional deterministic models. Extensive experiments on four diverse datasets—including Diabetic Retinopathy, Kvasir, Skin Cancer, and fused multifocal Oocyte images—demonstrate improved predictive accuracy, calibration, and interpretability compared to state-of-the-art CNN and transformer-based models. The results highlight the potential of integrating attention mechanisms with Bayesian inference for robust and clinically reliable medical image analysis. Data Availability All data produced are available online at https://datasets.simula.no/kvasir/ https://www.kaggle.com/datasets/sovitrath/diabetic-retinopathy-224x224-2019-data https://www.kaggle.com/datasets/nodoubttome/skin-cancer9-classesisic Footnotes The citation was added: Paper - "BASIC: Bayesian Spiral Attention Classifier for Interpretable Medical Image Classification" References Aalishah , R. ; Navardi , M. 
; and Mohsenin , T. 2025 . Med-MambaLite: Hardware-Aware Mamba for Medical Image Classification . arXiv preprint arxiv: 2508.05049 . ↵ Abdar , M. ; Fahami , M. A. ; Chakrabarti , S. ; Khosravi , A. ; Pławiak , P. ; Acharya , U. R. ; Tadeusiewicz , R. ; and Nahavandi , S. 2021 . BARF: A new direct and cross-based binary residual feature fusion with uncertainty-aware module for medical image classification . Information Sciences , 577 : 353 – 378 . OpenUrl ↵ Abdar , M. ; Fahami , M. A. ; Rundo , L. ; Radeva , P. ; Frangi , A. F. ; Acharya , U. R. ; Khosravi , A. ; Lam , H.-K. ; Jung , A. ; and Nahavandi , S. 2022 . Hercules: Deep hierarchical attentive multilevel fusion model with uncertainty quantification for medical image classification . IEEE Transactions on Industrial Informatics , 19 ( 1 ): 274 – 285 . OpenUrl Ansari , S. A. ; Agrawal , A. P. ; Wajid , M. A. ; Wajid , M. S. ; and Zafar , A. 2024 . MetaV: A pioneer in feature augmented meta-learning based vision transformer for medical image classification . Interdisciplinary Sciences: Computational Life Sciences , 16 ( 2 ): 469 – 488 . OpenUrl PubMed Asia Pacific Tele-Ophthalmology Society (APTOS ). 2019 . APTOS 2019 Blindness Detection . https://www.kaggle.com/competitions/aptos2019-blindness-detection . Kaggle dataset . ↵ Beyer , L. ; Zhai , X. ; and Kolesnikov , A. 2022 . Better plain vit baselines for imagenet-1k . arXiv preprint arxiv: 2205.01580 . ↵ Cai , Z. ; Chen , Y. ; Wang , J. ; He , X. ; Pei , Z. ; Lei , X. ; and Lu , C. 2025 . DAFNet: A novel Dynamic Adaptive Fusion Network for medical image classification . Information Fusion, 103507 . ↵ Chen , C.-F. ; Panda , R. ; and Fan , Q. 2021 . Regionvit: Regional-to-local attention for vision transformers . arXiv preprint arxiv: 2106.02689 . ↵ Cheng , J. ; Tian , S. ; Yu , L. ; Gao , C. ; Kang , X. ; Ma , X. ; Wu , W. ; Liu , S. ; and Lu , H. 2022 . ResGANet: Residual group attention network for medical image classification and segmentation . 
Medical Image Analysis , 76 : 102313 . OpenUrl PubMed Chowdary , G. J. ; and Yin , Z. 2024 . Med-former: A transformer based architecture for medical image classification . In International conference on medical image computing and computer-assisted intervention , 448 – 457 . Springer . ↵ Codella , N. ; Rotemberg , V. ; Tschandl , P. ; Celebi , M. E. ; Dusza , S. ; Gutman , D. ; Helba , B. ; Kalloo , A. ; Liopyris , K. ; Marchetti , M. ; et al. 2019 . Skin lesion analysis toward melanoma detection 2018: A challenge hosted by the international skin imaging collaboration (isic) . arXiv preprint arxiv: 1902.03368 . ↵ Dai , Y. ; Gao , Y. ; and Liu , F. 2021 . Transmed: Transformers advance multi-modal medical image classification . Diagnostics , 11 ( 8 ): 1384 . OpenUrl PubMed ↵ Ding , X. ; Zhang , X. ; Ma , N. ; Han , J. ; Ding , G. ; and Sun , J. 2021 . Repvgg: Making vgg-style convnets great again . In Proceedings of the IEEE/CVF conference on computer vision and pattern recognition , 13733 – 13742 . Djoumessi , K. ; Mensah , S. O. ; and Berens , P. 2025 . A Hybrid Fully Convolutional CNN-Transformer Model for Inherently Interpretable Medical Image Classification . arXiv preprint arxiv: 2504.08481 . Dosovitskiy , A. ; Beyer , L. ; Kolesnikov , A. ; Weissenborn , D. ; Zhai , X. ; Unterthiner , T. ; Dehghani , M. ; Minderer , M. ; Heigold , G. ; Gelly , S. ; et al. 2020 . An image is worth 16×16 words: Transformers for image recognition at scale . arXiv preprint arxiv: 2010.11929 . ↵ Fang , L. ; Wang , C. ; Li , S. ; Rabbani , H. ; Chen , X. ; and Liu , Z. 2019 . Attention to lesion: Lesion-aware convolutional neural network for retinal optical coherence tomography image classification . IEEE transactions on medical imaging , 38 ( 8 ): 1959 – 1970 . OpenUrl PubMed Gal , Y. ; and Ghahramani , Z. 2016 . Dropout as a bayesian approximation: Representing model uncertainty in deep learning . In international conference on machine learning , 1050 – 1059 . PMLR . 
↵ Guo , M.-H. ; Lu , C.-Z. ; Liu , Z.-N. ; Cheng , M.-M. ; and Hu , S.-M. 2023 . Visual attention network . Computational visual media , 9 ( 4 ): 733 – 752 . OpenUrl ↵ He , K. ; Zhang , X. ; Ren , S. ; and Sun , J. 2016 . Deep residual learning for image recognition . In Proceedings of the IEEE conference on computer vision and pattern recognition , 770 – 778 . He , X. ; Deng , Y. ; Fang , L. ; and Peng , Q. 2021 . Multi-modal retinal image classification with modality-specific attention network . IEEE transactions on medical imaging , 40 ( 6 ): 1591 – 1602 . OpenUrl PubMed ↵ Howard , A. ; Sandler , M. ; Chu , G. ; Chen , L.-C. ; Chen , B. ; Tan , M. ; Wang , W. ; Zhu , Y. ; Pang , R. ; Vasudevan , V. ; et al. 2019 . Searching for mobilenetv3 . In Proceedings of the IEEE/CVF international conference on computer vision , 1314 – 1324 . ↵ Hu , J. ; Shen , L. ; and Sun , G. 2018 . Squeeze-and-excitation networks . In Proceedings of the IEEE conference on computer vision and pattern recognition , 7132 – 7141 . ↵ Huang , G. ; Liu , Z. ; Van Der Maaten , L. ; and Weinberger , K. Q. 2017 . Densely connected convolutional networks . In Proceedings of the IEEE conference on computer vision and pattern recognition , 4700 – 4708 . ↵ Hussain , T. ; Shouno , H. ; Hussain , A. ; Hussain , D. ; Ismail , M. ; Mir , T. H. ; Hsu , F. R. ; Alam , T. ; and Akhy , S. A. 2025 . EFFResNet-ViT: A fusion-based convolutional and vision transformer model for explainable medical image classification . IEEE Access . ↵ Ju , L. ; Wang , X. ; Wang , L. ; Mahapatra , D. ; Zhao , X. ; Zhou , Q. ; Liu , T. ; and Ge , Z. 2022 . Improving medical images classification with label noise using dual-uncertainty estimation . IEEE transactions on medical imaging , 41 ( 6 ): 1533 – 1546 . OpenUrl PubMed ↵ Li , J. ; Feng , M. ; and Xia , C. 2024 . DBCvT: Double Branch Convolutional Transformer for Medical Image Classification . Pattern Recognition Letters , 186 : 250 – 257 . OpenUrl ↵ Li , Y. ; Yuan , G. 
; Wen , Y. ; Hu , J. ; Evangelidis , G. ; Tulyakov , S. ; Wang , Y. ; and Ren , J. 2022 . Efficientformer: Vision transformers at mobilenet speed . Advances in Neural Information Processing Systems , 35 : 12934 – 12949 . OpenUrl ↵ Liang , G. ; Zhang , Y. ; Wang , X. ; and Jacobs , N. 2020 . Improved trainable calibration method for neural networks on medical imaging classification . arXiv preprint arxiv: 2009.04057 . Liu , S. ; Wang , L. ; and Yue , W. 2024 . An efficient medical image classification network based on multi-branch CNN, token grouping Transformer and mixer MLP . Applied Soft Computing , 153 : 111323 . OpenUrl ↵ Ma , N. ; Zhang , X. ; Zheng , H.-T. ; and Sun , J. 2018 . Shufflenet v2: Practical guidelines for efficient cnn architecture design . In Proceedings of the European conference on computer vision (ECCV) , 116 – 131 . ↵ Manzari , O. N. ; Ahmadabadi , H. ; Kashiani , H. ; Shokouhi , S. B. ; and Ayatollahi , A. 2023 . MedViT: a robust vision transformer for generalized medical image classification . Computers in biology and medicine , 157 : 106791 . OpenUrl PubMed Manzari , O. N. ; Asgariandehkordi , H. ; Koleilat , T. ; Xiao , Y. ; and Rivaz , H. 2025 . Medical image classification with KAN-integrated transformers and dilated neighborhood attention . arXiv preprint arxiv: 2502.13693 . ↵ Mehta , S. ; and Rastegari , M. 2021 . Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer . arXiv preprint arxiv: 2110.02178 . Nickparvar , M. 2021 . Brain Tumor MRI Dataset . Okolo , G. I. ; Katsigiannis , S. ; and Ramzan , N. 2022 . IEViT: An enhanced vision transformer architecture for chest X-ray image classification . Computer Methods and Programs in Biomedicine , 226 : 107141 . OpenUrl PubMed ↵ Pogorelov , K. ; Randel , K. R. ; Griwodz , C. ; Eskeland , S. L. ; de Lange , T. ; Johansen , D. ; Spampinato , C. ; Dang-Nguyen , D.-T. ; Lux , M. ; Schmidt , P. T. ; et al. 2017 . 
Kvasir: A multiclass image dataset for computer aided gastrointestinal disease detection . In Proceedings of the 8th ACM on Multimedia Systems Conference, 164–169 . Raghu , M. ; Zhang , C. ; Kleinberg , J. ; and Bengio , S. 2019 . Transfusion: Understanding transfer learning for medical imaging . Advances in neural information processing systems , 32 . ↵ Rajaraman , S. ; Ganesan , P. ; and Antani , S. 2022 . Deep learning model calibration for improving performance in class-imbalanced medical image classification tasks . PloS one , 17 ( 1 ): e0262838 . OpenUrl CrossRef PubMed Sagar , A. 2025 . BASIC: Bayesian Spiral Attention Classifier for Interpretable Medical Image Classification . medRxiv , 2025–10. ↵ Selvaraju , R. R. ; Cogswell , M. ; Das , A. ; Vedantam , R. ; Parikh , D. ; and Batra , D. 2017 . Grad-cam: Visual explanations from deep networks via gradient-based localization . In Proceedings of the IEEE international conference on computer vision , 618 – 626 . ↵ Shaik , N. S. ; Cherukuri , T. K. ; Veeranjaneulu , N. ; and Bodapati , J. D. 2024 . Medtransnet: advanced gating transformer network for medical image classification . Machine Vision and Applications , 35 ( 4 ): 73 . OpenUrl Shastri , S. ; Kansal , I. ; Kumar , S. ; Singh , K. ; Popli , R. ; and Mansotra , V. 2022 . CheXImageNet: a novel architecture for accurate classification of Covid-19 with chest x-ray digital images using deep convolutional neural networks . Health and technology , 12 ( 1 ): 193 – 204 . OpenUrl ↵ Sherkatghanad , Z. ; Abdar , M. ; Bakhtyari , M. ; Pławiak , P. ; and Makarenkov , V. 2025 . BayTTA: Uncertainty-aware medical image classification with optimized test-time augmentation using Bayesian model averaging . Knowledge-Based Systems , 114123 . ↵ Swati , Z. N. K. ; Zhao , Q. ; Kabir , M. ; Ali , F. ; Ali , Z. ; Ahmed , S. ; and Lu , J. 2019 . Brain tumor classification for MR images using transfer learning and fine-tuning . 
Computerized Medical Imaging and Graphics , 75 : 34 – 46 . OpenUrl PubMed ↵ Tajbakhsh , N. ; Shin , J. Y. ; Gurudu , S. R. ; Hurst , R. T. ; Kendall , C. B. ; Gotway , M. B. ; and Liang , J. 2016 . Convolutional neural networks for medical image analysis: Full training or fine tuning? IEEE transactions on medical imaging , 35 ( 5 ): 1299 – 1312 . OpenUrl CrossRef PubMed ↵ Tan , M. ; and Le , Q. 2019 . Efficientnet: Rethinking model scaling for convolutional neural networks . In International conference on machine learning, 6105–6114 . PMLR . ↵ Tolstikhin , I. O. ; Houlsby , N. ; Kolesnikov , A. ; Beyer , L. ; Zhai , X. ; Unterthiner , T. ; Yung , J. ; Steiner , A. ; Keysers , D. ; Uszkoreit , J. ; et al. 2021 . Mlp-mixer: An all-mlp architecture for vision . Advances in neural information processing systems , 34 : 24261 – 24272 . OpenUrl ↵ Trockman , A. ; and Kolter , J. Z. 2022 . Patches are all you need? arXiv preprint arxiv: 2201.09792 . ↵ Wen , Y. ; Chen , B. ; Shi , W. ; Feng , D. ; Cao , W. ; and Wu , S. 2025 . MSPFM: Multi-Scale Pyramid Fusion Mamba for Medical Image Classification . The Visual Computer , 1 – 16 . ↵ Woo , S. ; Park , J. ; Lee , J.-Y. ; and Kweon , I. S. 2018 . Cbam: Convolutional block attention module . In Proceedings of the European conference on computer vision (ECCV) , 3 – 19 . ↵ Wu , X. ; and Gou , G. 2025 . Uncertainty bidirectional guidance of multi-task mamba network for medical image classification and segmentation . Signal, Image and Video Processing , 19 ( 1 ): 29 . OpenUrl ↵ Xie , S. ; Girshick , R. ; Dollár , P. ; Tu , Z. ; and He , K. 2017 . Aggregated residual transformations for deep neural networks . In Proceedings of the IEEE conference on computer vision and pattern recognition , 1492 – 1500 . Xue , C. ; Dou , Q. ; Shi , X. ; Chen , H. ; and Heng , P.-A. 2019 . Robust learning at noisy labeled medical images: Applied to skin lesion classification . 
In 2019 IEEE 16th International symposium on biomedical imaging (ISBI 2019) , 1280 – 1283 . IEEE . ↵ Yu , W. ; Zhou , P. ; Yan , S. ; and Wang , X. 2024 . Inception-next: When inception meets convnext . In Proceedings of the IEEE/cvf conference on computer vision and pattern recognition , 5672 – 5683 . ↵ Yue , Y. ; and Li , Z. 2024 . Medmamba: Vision mamba for medical image classification . arXiv preprint arxiv: 2403.03849 . View the discussion thread. Back to top Previous Next Posted January 05, 2026. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following BUCAN: Bayesian Uncertainty-aware Classification with Attention Networks for Medical Images Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share BUCAN: Bayesian Uncertainty-aware Classification with Attention Networks for Medical Images Abhinav Sagar medRxiv 2025.11.05.25339638; doi: https://doi.org/10.1101/2025.11.05.25339638 Share This Article: Copy Citation Tools BUCAN: Bayesian Uncertainty-aware Classification with Attention Networks for Medical Images Abhinav Sagar medRxiv 2025.11.05.25339638; doi: https://doi.org/10.1101/2025.11.05.25339638 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15228) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6598) Geriatric Medicine (668) Health Economics (997) Health Informatics (4536) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3332) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9231) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation 
(289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a005568c68df0708',t:'MTc3OTU1MjcwMw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.