Full text
53,228 characters
· extracted from
preprint-html
· click to expand
Improving automated prostate pathological grading via confidence filtering | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Improving automated prostate pathological grading via confidence filtering View ORCID Profile Ryan B. Fogarty , Dmitry B. Goldgof , View ORCID Profile Lawrence O. Hall , Jasreman Dhillon , View ORCID Profile Vaibhav Chumbalkar , View ORCID Profile Yoganand Balagurunathan doi: https://doi.org/10.1101/2025.11.25.25340482 Ryan B. 
Fogarty 1 Department of Machine Learning, Moffitt Cancer Center , Tampa, FL, USA 2 Bellini College of Artificial Intelligence, Cybersecurity and Computing, University of South Florida , Tampa, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ryan B. Fogarty For correspondence: rfogarty{at}usf.edu Dmitry B. Goldgof 2 Bellini College of Artificial Intelligence, Cybersecurity and Computing, University of South Florida , Tampa, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lawrence O. Hall 2 Bellini College of Artificial Intelligence, Cybersecurity and Computing, University of South Florida , Tampa, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lawrence O. Hall Jasreman Dhillon 3 Department of Pathology, Moffitt Cancer Center , Tampa, FL, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Vaibhav Chumbalkar 3 Department of Pathology, Moffitt Cancer Center , Tampa, FL, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Vaibhav Chumbalkar Yoganand Balagurunathan 1 Department of Machine Learning, Moffitt Cancer Center , Tampa, FL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Yoganand Balagurunathan Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract There have been many promising developments in deep learning to identify degrees of malignancies in prostate cancer pathologies. Deep network models have been shown to be useful in identifying patterns in histology images assessed at different scales. 
Prostate pathological grade identification has been a challenge among clinical experts due to complex patterns on the whole slide level, for hematoxylin and eosin (H&E) stained samples. In this study, we identify primary patterns (Gleason) in small sections of the whole slide composed of uniform glandular patterns. We then follow sample selection methods that eliminate ambiguous regions or tiled-samples by confidence filtering. A pseudo-confidence is derived from the predicted output of the network, which is used as a quality indicator to consider the sample for discriminatory analysis. We provide further evidence that, using highly calibrated confidence sample selection, these gland-level features on the prostate biopsy sections can discriminate degrees of malignancy following primary Gleason patterns. An optimized deep network (convolutional neural network, CNN) discriminating glandular regions with aggressive grades (Gleason 3 from 4) showed an accuracy of 0.68 (0.04), F 1 of 0.66 (0.06) and AUC of 0.74 (0.04). We further improve this result using confidence filtering, with a sample fraction of 0.35 (with a calibrated confidence of greater than 0.85), achieving an accuracy of 0.74 (0.08), F 1 of 0.72 (0.12), and AUC of 0.79 (0.08) averaged from holdout sets over multiple reshuffled experiments. Introduction Pathological assessment of prostate malignancy follows International Society for Urological Pathology (ISUP) grade patterns, which describe the extent of neoplasm on hematoxylin and eosin (H&E) slides [ 1 ]. The interpretation difficulty of complex histopathological specimens is evident, with a high level of variability among pathologists. Several studies have documented low concordance between pathologists’ opinions (both intra- and inter-institutional) [ 2 ]. There is a need for standardized clinical evaluation to improve the grading and clinical care.
Differences in interpretation of clinical grades in histopathology slides are due to both clinical opinion differences and data sampling, leading to aleatoric noise in data labels, which is a consequence of pattern complexity; methods that do not consider these label errors will affect the machine learning model performance. In earlier work, deep learning methods have been proposed to assist in Gleason pattern grading of digital whole slide images (WSI) without addressing clinical label level noise; these methods continue to be well-investigated [ 3 – 7 ]. Most research has concentrated on evaluating entire specimens or identifying Gleason grades or scores over large areas of the digitized biopsy. Due to the high resolution of digitized WSIs (typically ranging from 20x-40x or 50-100 mega-pixels), vision algorithms require the image to be tiled into many smaller patches that are evaluated independently and then encoded into a much smaller latent space. Behzadi et al. used a transformer and multi-instance learning (MIL) to classify pathology features. The derived patches were shown to be good predictors of pathological grade. State-of-the-art performance was achieved in classifying entire biopsies by selecting the most relevant patches of the WSI with the use of an autoencoder and trained attention network [ 4 ]. Generalization performance has improved by utilizing the diversity and amount of data available with TCGA [ 8 ], PANDA [ 9 ] and Gleason 2019 [ 10 , 11 ] datasets. Xu et al have created an extremely large pathology dataset with over 1.3 billion 256x256 patches derived from 117 thousand whole slide images collected from more than 30,000 patients at the Providence group in Portland OR, and Seattle WA [ 5 ]. A study by Butt et al. demonstrated that applying multiple Gleason scores to each patch or multi-labeling improves overall pathological grading by estimating pixel labeling error of up to 30% on the SICAPv2 pathology dataset [ 6 ]. 
The Deep Gleason study has demonstrated state-of-the-art performance with CNN architectures for Gleason scoring on whole slide images utilizing the ConvNeXt network, which has been shown to be even more effective than modern transformer architectures such as the Swin (sliding windows) transformer [ 7 , 12 ]. Our prior study [ 13 ] demonstrated that the random sampling and consensus (RANSAC) technique [ 14 ] provides a framework to handle class mislabeling. We presented a deep model to discriminate primary Gleason patterns using well-curated patterns in representative regions (with patch areas of ∼ 0.14 mm 2 ), which reported an accuracy of 0.97 and F 1 -score of 0.97 (using Monte-Carlo Cross Validation). The RANSAC approach is comparable to “confident learning” [ 15 ], recently developed techniques that are available in the Cleanlab software library [ 16 ]. A generalized approach has recently been summarized in a label-correction framework [ 17 ]. Several approaches to confidence calibration have been developed to provide meaningful confidence intervals to inform the clinician and patient. Jiang et al. compare several conventional techniques such as histogram binning, Platt scaling, and isotonic regression to their own adaptive calibration of predictions (ACP) method [ 18 ]. Gawlikowski et al. provide a survey of techniques and discussion of the various forms of aleatoric and epistemic error and some approaches to minimize these errors [ 19 ]. Our study demonstrates the Parzen-Rosenblatt method works well to calibrate a mapping function, which we tuned on validation data, to improve expected calibration error (ECE) of a confidence score [ 20 ]. Others are wary of relying on confidence predictions, especially if it is shown that expected calibration error is difficult to improve, or if future datasets may have a distribution shift [ 21 ]. 
Another study showed very modest gains after calibrating for confidence with little significance on classification performance [ 22 ]. In our study, ECE was improved reliably for each model, and classification performance differs very significantly when comparing the most confident samples versus the least confident samples. In this study, we evaluate two significant procedures to improve overall performance of clinical evaluation of prostate pathology. In the first procedure, we improve labels on training data for improved classification of prostate primary Gleason grading. We applied the RANSAC technique to either prune or relabel a proportion of the training samples, demonstrating the effectiveness and importance of training data refinement. As a second significant procedure, we calibrated our output classification probabilities (or confidence) to determine thresholds to filter ambiguous samples with weak predictions. Together these two procedures improve performance by over 10% across all metrics. Our results are consistent with other recent reports and ablation studies that have boosted performance through confidence filtering either on the training data [ 23 – 25 ] or after prediction and before final classification [ 26 , 27 ]. Our experiments included several well-studied optimization techniques to improve performance as a series of ablation tests; the results are critically analyzed by repeating the tests multiple times and computing statistical significance [ 28 ]. The most significant improvements were gleaned from data sanitizing on both the training side and inference (or prediction) side; the former by label correction and sample rejection and the latter by using calibrated confidence on inferred label samples to ignore ambiguous examples. Fig 1 shows a rough model of the processes involved in both training and inference and will be discussed in detail in the next section. Download figure Open in new tab Fig 1.
Training and Inference performance improves with data sanitization. Materials and methods Data Cohort We retrospectively collected patient data (P=58) who underwent targeted biopsies (N=110) to evaluate prostate disease. Pathology slides were obtained for a cohort that had diagnosis and treatment at Moffitt in the years 2018-2019. The retrospective study was approved by the institutional review board (IRB) at Moffitt/University of South Florida. Informed patient consent was waived for retrospective research efforts for samples collected under the institutional protocol (Total Cancer Care ® ). The clinical records of the patients were obtained and pathological whole slides were digitized at high resolution (20x, 0.504 µm /pixel) using an Aperio ® slide scanner. The research pathologist assessed the WSI and marked contiguous regions to provide a score following Gleason criteria, limited to the prominent pattern (primary Gleason score) at the region level. The digitized slides and derivative tiles were used in this study from 01/03/2025 to 31/08/2025 (March-August) to compile these research results. We systematically divided the marked regions into smaller sections or tiles that are representative of the whole slide images and assigned the labels provided at the regions to the smaller tiles. The primary scores for these tiles/regions follow the ranges of patterns (benign, Gleason score 3, 4, or 5). Gleason 4 labels were not discriminated between cribriform and non-cribriform variants of malignancy, despite its role in aggressive disease prognosis [ 3 , 29 ]. Despite collecting benign and Gleason 5 samples, their under representation excluded them from the tests in our study. The converged data cohort consisted of 3311 Gleason-pattern 3 (GP3) and 2909 Gleason-pattern 4 (GP4) labeled glandular patches. Fig 2 shows a random selection of primary GP3 (top panel) and primary GP4 at the bottom. Download figure Open in new tab Fig 2. 
Random selection of Moffitt GP3 and GP4 indicated glands Data split for DL modeling We organized the data into seven randomized experiments (80/20 training/test ratio), with a total of 110 patient biopsies; each of the shuffled experiments had 88 training and 22 test WSIs. The seven training sets were further split into a five-fold cross-validation with each fold comprising 80/20% training/validation and evenly stratified. The CV folds were used to compute validation metrics for tuning ML models and later for creating ensembles of models. DL training data was shuffled between iterative runs and truncated (as necessary) to have a balanced representation of the labeled classes. The final metrics were computed by bootstrapping the data with both groups (GP3/GP4) in perfect proportion, and reported results are averaged over bootstrapped samples (100 times). Depending on imbalance, bootstrapped metrics were sampled from up to 75% of the holdout data. Cross-validated data splits are discussed in more detail in Supplemental ?? . Deep Models We used tile-level data with glandular structures derived from whole slide images, and these patch images were preprocessed to standardize the data before sending them to a deep learner (DL). Our DL was a modified version of a VGG-16 architecture, limited to 3 groups of CNN filtering (or 7 total CNN stages) and two fully-connected layers replacing the 3-layer top of the standard VGG-16 [ 30 , 31 ]; early experiments showed that a full VGG-16 overtrained easily and that its generalization suffered. Thus, our optimized architecture was effectively a VGG-9 CNN (or a VGG-15 if non-optimized, using all CNN stages but a simplified top layer). Stochastic gradient descent was used as the optimizer with learning rate scheduling following an exponentially decreasing cyclic cosine schedule.
Training data patches whose predicted label after training disagreed with the ground truth (GT) more than 80% of the time were relabeled, and patches in the next band, which disagreed more than 60% (but no more than 80%) of the time, were removed from the training set. Results were improved by combining classifiers across 5 CV folds. Ablation Studies DL ablation studies were nominally performed with the TensorFlow/Keras framework on a VGG-9 or VGG-15 CNN. The network’s initial weights (CNN layers) were transfer-learned from a model trained on ImageNet data. All studies were performed on an NVIDIA ® DGX™ platform with up to eight A100 GPUs. The image patches were preprocessed to standardize the glandular data, which involved: resizing (to 300x300 pixels), converting to gray level (0-255), and masking of epithelial glands. The preprocessed data was also augmented via flips and rotations while training the deep learning network to improve generalization. Other hyperparameters (learning rate, momentum, batch size, dropout, etc.) were tuned on validation data performance, following modern convention. Relevant source code and scripts may be referenced for additional detail at https://github.com/rfogarty/ConfidenceFiltering . Classifier Optimizations Two DL model optimizations are important enough to warrant discussion: data sanitization and an ensemble of models. Both techniques provided significant gains in performance and will be briefly discussed in our results. We used the random sample and consensus (RANSAC) approach, previously studied and reported in [ 13 ], to prune difficult samples from the training set [ 14 ]. We also applied label-flipping when samples rarely compared correctly to the ground-truth label. Further details of the algorithm and thresholds used to label-flip or prune samples are discussed in Supplemental ?? . Ensembles were used by voting on inference decisions across 5-fold cross-validated models.
Similar gains were made using ensembles from the best 5 models using a snapshot ensemble approach. We performed many other ablation studies, which include preprocessing of image patches (graying, cropping, masking) and deep network tuning (model pruning, exponentially-decreasing learning rate scheduling); each provided only small improvements and is not discussed in detail in this paper. Several of these hyperparameter, neural network architecture, and data preprocessing steps were ablated to study their effect. Because their effect was small, the results of those tests are not included here for brevity and are reserved for publication at a later time. Confidence Filtering In a sample discrimination context with binary classes, we can order samples based on confidence and ignore predictions if alternate observations of a sample provide stronger evidence of a state or classification. We propose to remove those predictions with low confidence to improve decisions. The pseudo-confidence produced by a discriminant function (such as the output of the sigmoid of a binary DL classifier) can be used to derive a confidence on each sample’s decision. The least confident samples are then discarded when considering the larger picture or an overall classification decision. To measure confidence, the output of a DL classifier can be mapped to a pseudo-confidence ( p̂ i ) and then calibrated towards a true confidence ( q̂ i ). In an ideally calibrated system, a group of samples (say n samples) between two thresholds should each produce a pseudo-confidence that closely ties to the system’s measured accuracy. E.g., if we were to take n samples produced with a pseudo-confidence of between 0.9 and 1.0, then on average we expect > 90% of these n inferences to classify correctly; we presume a frequentist interpretation of model behavior; we can both measure calibration error and correct for it by remapping to measured performance [ 18 , 32 , 33 ].
A mapping function for calibrated confidence is estimated using cross-validation data from five-fold results (see below). Prior works have produced various methods for estimating this calibration function, such as histogram binning, isotonic regression, and Platt scaling [ 33 ]. Each method estimates a mapping function between the true confidence and the pseudo-confidence derived from a model’s probabilistic output. We first define a pseudo-confidence, P̂ , relative to the DL model’s (sigmoidal) binary output Ŷ . The i th pseudo-confidence is: $\hat{p}_i = \max(\hat{y}_i,\ 1 - \hat{y}_i)$. The goal is to map P̂ → Q̂ = p , or a calibrated confidence, so that $\mathbb{P}(\text{prediction is correct} \mid \hat{Q} = p) = p$ for all $p \in [0, 1]$, or near perfect calibration. A generalization of histogram binning is kernel density estimation (KDE), also termed the Parzen-Rosenblatt window method [ 20 ]. A closer approximation to the true underlying confidence distribution is achieved by convolving a correctness array C (boolean) with a kernel function. Before convolving the array C , it is first sorted in order of pseudo-confidence P̂ . The result of the kernel convolution estimates the cumulative density function. The kernel chosen was a one-sided falling exponential function of the form: $k[n] = e^{-\alpha n}$ for $0 \le n < N$ (and zero elsewhere), with decay rate $\alpha > 0$. The kernel window was empirically adjusted to perform well with a length of N = 40. The KDE algorithm computed for each model independently is as follows: Step 1. Compute inference results on validation samples, Step 2. Create boolean array C (correct classification for each inference), Step 3. Convert each sample’s DL test statistic output (sigmoid in binary case) to p̂ i , Step 4. Sort the boolean array C by lowest to highest p̂ i , Step 5. Convolve sorted C with kernel function to produce Q̂ , Step 6. Sort array Q̂ so that it is monotonically increasing, producing Q̂ corrected , Step 7. Ensure the remapped array Q̂ corrected lies in the range [0.0-1.0]. Note that Step 6 is a small accommodation to enforce monotonicity and allow for an unambiguous mapping function.
The algorithm trained on validation data and tested on holdout data reduced expected calibration error (ECE) from 22.6% down to 10.2%, or from 20.6% down to 8.6% when ensemble scoring. ECE was computed using the following standard algorithm: $\mathrm{ECE} = \sum_{m=1}^{M} \frac{|B_m|}{n} \left| \hat{Q}_{B_m} - \hat{P}_{B_m} \right|$, where $\hat{P}_{B_m}$ is the average pseudo-confidence in bin $B_m$, $\hat{Q}_{B_m}$ is the measured accuracy within that same bin, $|B_m|$ is the number of samples falling in the bin, $n$ is the total number of samples, and $M$ is a reasonable number of bins to test, which was set to 20 in our case. Alternatives such as the histogram binning technique exhibited slightly lower calibration error than kernel density estimation (KDE), but bins were not as numerically stable and the resultant mapping function was not as parsimonious as the KDE approach. Results for histogram binning method are shown in more detail in the Supplemental ?? . Fig 3 shows the scatter plot for 35 models, each calibrated independently with our KDE algorithm, with uncalibrated or pseudo-confidence P̂ along the x-axis, and calibrated confidence Q̂ along the y-axis. Each point in the plot is a single validation sample (from a training-validation set) plotted with coordinates p̂ i and q̂ i , where q̂ i ≃ p is an estimate of a sample’s true confidence. Because we do not have infinite samples to use for calibration, the remapping function is a piecewise monotonically increasing mapping function, whose smoothness depends upon the number of validation samples. Download figure Open in new tab Fig 3. Calibrating confidence for 35 (7 experiments * 5 CV folds) VGG9 trained models. After calibrating confidence, performance is improved by throwing away samples with low-confidence. Fig 4 shows an algorithm for estimating a good confidence threshold or a sampling fraction that produces improved results. Fig 5 shows the related algorithm for measuring performance on holdout data and resultant summary statistics, which ultimately were determined by evaluating the 35% highest confidence samples, discussed in further detail below.
Download figure Open in new tab Fig 4. Confidence Threshold Search Algorithm Download figure Open in new tab Fig 5. Testing Performance with Confidence Threshold Results We used deep learning architectures (VGG) initialized with ImageNet-trained weights and fine-tuned on our prostate pathology cohort. We report results averaged over multiple (reshuffled) experiments. Final metrics were derived from holdout sets unseen during training. The holdout sets were grouped such that patches from any one whole slide image were exclusive to the training/validation set or to the holdout set. The study curated over 6,000 patches, with 3311 primary GP3 and 2909 primary GP4 glandular patches labeled by the research pathologist. In total, 35 DL models (7 reshuffled experiments with 5-CV folds each) were trained for each ablation test or study. The accuracy, F 1 and AUC metrics reported are average and standard deviation over all experiments. Ensembles’ Significance Ensemble results were computed on a VGG-9 architecture and over 16 separate experiments studying various micro-optimizations (grayscale versus RGB, 4 patch resizing strategies, and masking on/off), with each of the 16 experiments showing ensemble improvement (though not shown here). We found creating ensembles over the five-fold CV models shows a very strong effect in increasing generalization performance (see Fig 6 ). The boxplots qualitatively show a significant gain in accuracy, F 1 and AUC scores, and tighter variances. The Cohen’s d effect size along with the Wilcoxon ranksum p-values shown in the x-axis labels provide a quantitatively strong significance across all metrics when comparing non-ensemble results on the left and ensemble models on the right. 
In total, metrics were computed for 35 single models and 7 ensembled models and repeated for 16 ablation experiments, which led to 560 simple models and 112 ensemble models; from these results we drew 50 samples in each category (without replacement) and computed significance over 1000 times. Due to the strong positive effect of ensembles and a 3-4% increase in all performance metrics, all additional experiments assume the ensemble optimization unless otherwise stated. Download figure Open in new tab Fig 6. Ensembling across CV folds improved performance significantly. Training Data Refinement We iteratively improved our training data by discriminating the samples into Gleason pattern groups and verifying improvement in classification performance. Data sanitization included pruning ambiguous samples or flipping labels when patches very rarely matched ground truth. Applying data sanitization and ensemble optimizations together achieved a 0.68 accuracy, 0.66 F 1 and 0.74 AUC for a DL when discriminating glandular labels between GP3 and GP4 samples. The results very closely matched our previously published performance of similarly-sized patches derived from a University of Miami data cohort as shown in Table 1 [ 13 ]. View this table: View inline View popup Download powerpoint Table 1. Deep learning (CNN) performance on Moffitt and Miami In this ablation test, a random sampling and consensus (RANSAC) approach was used to identify the lowest performing training/validation samples. Incorrectly inferenced training/validation samples with more than 80% error were flipped in label (between GP3 and GP4) and corrected in the training set for a subsequent fine-tuning period. The next least performing samples (incorrectly predicted more than 60% but less than 80%) were pruned from the training and validation sets. The results of label refinement were computed on the complete and unmodified holdout sets and based strictly on GT labels. 
The metrics in the boxplots of Fig 7 include 2 columns Original and Updated . The metrics in Original columns are derived from 35 models trained on the complete training set with GT labels, while the metrics in Updated columns were results from 35 models trained on a pruned set and refined labels. Significance tests are computed over the N=70 total models (35 from each group). As will be shown, these results are improved further through confidence filtering. Download figure Open in new tab Fig 7. Performance improves significantly with training data sanitizing. Confidence Filtering The results of confidence filtering on our holdout sets (filtering sample inferences with lowest confidence) improved results by more than 5%, demonstrating an accuracy of 0.74, F 1 of 0.72 and AUC of 0.79. To determine whether a filter threshold or sample fraction can be expected to improve performance, we first ranked all validation inferences according to their pseudo-confidence p̂ i , and measured performance at filtering thresholds incremented by 5% starting at 0% and ending at 85%. Fig 8 shows that validation performance slowly improved in all metrics of accuracy, F 1 score and AUC, with an eventual peak 5-10% improvement over the baseline, which is nominally the result at 100% sample-fraction or no filtering. We will show that peak-performance on the validation set was at a filter threshold of ∼ 65% of the samples (or maintaining 35% sample-fraction ). Beyond 75% filtering, performance began to taper off rapidly. Download figure Open in new tab Fig 8. Accuracy, F 1 , and AUC versus sample-fraction observed on validation data. We observe the validation data in detail to show the effect of filtering on confidence for our discriminant classifier (the VGG-9 CNN) to forecast utility in the general case. Table 2 shows confidence-filtered accuracy, F 1 and AUC metrics improving at each 5% threshold tested until about the 0.70 threshold.
The improvement in metrics demonstrates a clear opportunity to discard or ignore low-confidence samples to improve our overall performance. View this table: View inline View popup Download powerpoint Table 2. VGG-9 CNN validation performance versus Sample-fraction Improved performance is achieved when sample-fraction is in the range of 30-40%, which corresponds to a minimum calibrated confidence of ∼ 0.85 at around the 0.35 sample-fraction threshold. So, one approach is to keep only samples with calibrated confidence q̂ i ≥ 0.85 and discard the remaining low-confidence samples on future unseen data. To use percentile sample-fraction , we only need to rank and sort samples of an unseen subject by confidence and prune the 65% lowest-confidence samples as a reasonable threshold as shown in the VGG-9 table line highlighted in gray; this latter approach was decided to be the best method before testing on holdout data to guarantee a reasonable sample size for any subject. Either approach can be used as a litmus test for the quality of inferenced samples or as a secondary approach to filter low-confidence samples. After making the determination to keep the top 35% of inferences (ignoring lowest ∼ 2 / 3 of inferences ranked by confidence) as a reasonable threshold on unseen data, we measured actual performance on the holdout test sets. Results for all sample-fraction thresholds are shown in detail in the Supplemental ?? . A summary of the results of holdout testing with our chosen 65% threshold (i.e. keeping highest 35%) is shown in Table 3 . Performance is improved by over 5% for the DL, demonstrating an accuracy of 0.74, F 1 of 0.72 and AUC of 0.79, and improving on prior work classifying prostate pathology glandular patches in the 40-100 micron range. The tradeoff for ignoring low-confidence samples is higher inference variance; the standard deviation is doubled across all performance metrics. View this table: View inline View popup Download powerpoint Table 3.
Refined VGG-9 performance ignoring least (65%) confident samples Total Optimization Effect Finally, in this study, we compared our optimized network with confidence filtering to a network with the full VGG-16 CNN layers (5 stage) to serve as a naïve baseline. The baseline network utilized the AdamW optimizer (default settings), performed basic image preprocessing (simple rescale, full-color patches, no gland masking), and applied no data sanitization (label flipping or pruning of training data). In both this baseline case and our optimized architecture, the models were trained for over 500 epochs or generally until we observed validation loss convergence. Here we present the optimal case with ensembles versus the baseline unoptimized case without ensembles. As shown in Fig 9 , boxplots are drawn for accuracy, F 1 and AUC for 7 ensembled models in the optimized case and 35 un-ensembled models in the baseline unoptimized case (5-folds x 7 repeated tests), for a total sample size of N = 42. The Supplemental ?? presents the significance comparisons when the optimized models and the naïve baseline are both without ensembles and both are with ensembles; regardless of the approach, the optimized models show much higher performance. Taken in concert, the optimizations used in this study account for a much improved result over the naïve baseline, and the p-values computed with the Wilcoxon rank-sum test shown below each metric pair report high significance. Download figure Open in new tab Fig 9. Total performance improvement versus naïve baseline Discussion Histopathological grading of prostate samples is challenging, and expert opinions still differ with a reported low concordance [ 2 ]. In this study, we focus on assembling a cohort of exemplars of whole slide images with smaller patches of uniform glandular regions with labels derived at the tile level from a clinical expert. 
We built a statistical procedure, called ‘confidence filtering’, adapted for deep models, to grade the samples, which allows us to focus on high-confidence samples to improve deep models’ discrimination ability. The hypothesis assumes inference error is a function of pseudo-confidence; therefore, if we eliminate lower-confidence samples, measured classification performance will improve. Training data is improved by removing samples that classify incorrectly across a diverse set of models. We find further improvements in trained models by flipping labels on samples that appear to be egregiously incorrect (or incorrect with high-significance). Gleason patterns on small glandular features are a challenging problem for expert histopathologists and AI algorithms alike. At such a fine level, we require careful tuning and exploration of model selection and parameters to achieve practical performance. Proper tile preparation (grayscale, resize by cropping, and masking), model simplification, and robust learning rate scheduling provide measurable but minor improvement and so were largely left out of the discussion. More significant improvements are found by ensembling, label refinement, and ignoring low-confidence inferences. Without these techniques, it is difficult to achieve discrimination of Gleason pattern 3 and 4 on 40-100 micron patches much above chance, as was shown on a naïve baseline, which had mean accuracy and F 1 around 57%. The major optimizations improved this performance by almost 20%. As others have reported, sample labeling biases influence model performance [ 17 ]. There are several prior works that proposed selective classification in the context of better uncertainty quantification [ 19 , 26 , 27 , 32 , 34 , 35 ] and data sanitizing in the context of classifier training [ 15 , 23 – 25 , 36 ]. 
DL and ML have been shown to be effective at distinguishing prostate cancer (PCa) from non-cancer with very high accuracy, but determining the degree of malignancy between low-grade and high-grade prostate cancer or neighboring Gleason patterns is more challenging. Butt et al reported high performance on the SICAPv2 dataset with 512x512 images at 10x resolution by applying multi-labels to individual patches; though results cannot be compared directly since individual patch-level performance is not reported [ 6 ]. Duenweg et al classified patches with a bagged ensemble ML and ResNet-101 DL. Similar to our study, the authors classified individually-labeled patches of 1024x1024 (∼ 1020 µm²) in size. Their result comparing high-grade (HG, Gleason 4 or 5) and low-grade (LG, Gleason 3) cancer is comparable to our GP3 versus GP4 study. In their best case with a ResNet-101, they achieved an overall accuracy of 0.72 [ 3 ], while our result achieves 0.74 accuracy after confidence filtering. Our result does not include Gleason pattern 5 samples, which in theory would improve our result further since GP3 and GP5 Gleason patterns have more differentiation than GP3 to GP4 samples. Our results improved on a previous study on multi-institutional data [ 13 ] but the results in this study are computed using the stronger holdout test methodology (versus Monte-Carlo CV used in prior studies). In Fig 7 , we showed boxplots to demonstrate qualitatively that refining the training/validation sets through pruning and relabeling significantly improves holdout performance; holdout data is strictly based on original GT. The strong performance improvement in label flipping supports the need for clean data prior to clinical application and has practical applications across areas that depend on expert opinions. Aleatoric noise is evident in most clinical studies that are often dependent on expert opinion and often influenced by their training [ 17 ]. 
It is evident that the Gleason pattern in prostate pathology is known to have high variability due to the complexity in the interpretation of glandular structures, coupled with limitations of specimen sampling [ 6 ]. If DL classification is performed over an entire WSI, the output decision is often too abstract to generate an evidence-based result and is vulnerable to confounding factors. Explainable AI (XAI) methods may someday shed light and provide further evidence on DL behavior, but a feasible approach is to train DLs to look and classify smaller regions. The deep-classifiers presented in this study perform at a much finer-level than those published in competing works, which makes classification more challenging. These classifiers can be used in a suite of decision support system tools to augment drill down interrogation of WSI classification, or to build support for an overall Gleason grade decision. Conclusion The study results demonstrate the efficacy of using deep models to classify glandular patches at a fine scale. Confidence filtering on properly calibrated DL models demonstrated a performance increase and is appropriate in this modality with noisy labels, which is a common problem in most human expert-driven studies. Our study finds that glandular patches of prostate histopathology improve the discrimination of pathological grades using confidence filtering. We also find that ensembles of models and data sanitation (label flipping and pruning) along with confidence filtering have the largest impacts in designing a system to classify primary Gleason pattern at the gland level. Evidence at this low level can operate in a larger suite of decision support system tools to augment and reify expert pathology decisions and improve concordance among clinicians. We envision a decision support system for pathologists that improves performance by confidence filtering either by rank sorting, calibrated confidence thresholding, or both. 
When using an ensemble of models, calibrated confidence from each prediction can be used to compute a discriminate average versus simple voting schemes. Additionally, clinicians can observe a subject’s array of calibrated confidences Q̂ to make insights on the quality of the overall classification decision. Limitations Current work is at the glandular level; we expect to leverage techniques developed in this research to improve classifiers that observe larger patches of the prostate specimen. We did not attempt to optimize a clinical score over entire WSIs or patients as the data cohort was not evenly labeled across subjects - many patients had very few epithelial glands or regions labeled, with limited expert-level review. Data Availability All data produced in the present study are available upon reasonable request to the authors. Acknowledgments We would like to acknowledge our patients, care providers, and the voluntary contribution of biological samples to our institutional repository (Total Cancer Care), which has enabled us to conduct this study. We are grateful to Dr. Alexis Lopez and Joseph Johnson, along with their core team members, for providing research annotations and data preparation. References 1. ↵ Offermann A , Hupe M , Sailer V , Merseburger A , Perner S. The new ISUP 2014/WHO 2016 prostate cancer grade group system: first résumé 5 years after introduction and systemic review of the literature . World Journal of Urology . 2020 ; 38 : 657 – 662 . OpenUrl PubMed 2. ↵ Goodman M , Ward KC , Osunkoya AO , Datta MW , Luthringer D , Young AN , et al. Frequency and determinants of disagreement and error in gleason scores: A population-based study of prostate cancer . The Prostate . 2012 ; 72 ( 13 ): 1389 – 1398 . doi: 10.1002/pros.22484 . OpenUrl CrossRef PubMed 3. ↵ Duenweg SR , Brehler M , Bobholz SA , Lowman AK , Winiarz A , Kyereme F , et al. 
Comparison of a machine and deep learning model for automated tumor annotation on digitized whole slide prostate cancer histology . Plos one . 2023 ; 18 ( 3 ): e0278084 . OpenUrl CrossRef PubMed 4. ↵ Behzadi MM , Madani M , Wang H , Bai J , Bhardwaj A , Tarakanova A , et al. Weakly-supervised deep learning model for prostate cancer diagnosis and gleason grading of histopathology images . Biomedical Signal Processing and Control . 2024 ; 95 : 106351 . OpenUrl 5. ↵ Xu H , Usuyama N , Bagga J , Zhang S , Rao R , Naumann T , et al. A whole-slide foundation model for digital pathology from real-world data . Nature . 2024 ; doi: 10.1038/s41586-024-07441-w . OpenUrl CrossRef PubMed 6. ↵ Butt MA , Kaleem MF , Bilal M , Hanif MS . Using multi-label ensemble CNN classifiers to mitigate labelling inconsistencies in patch-level Gleason grading . Plos one . 2024 ; 19 ( 7 ): e0304847 . OpenUrl PubMed 7. ↵ Müller D , Meyer P , Rentschler L , Manz R , Bäcker J , Cramer S , et al. DeepGleason: a System for Automated Gleason Grading of Prostate Cancer using Deep Neural Networks . arXiv preprint arXiv : 240316678 . 2024 ;. 8. ↵ Institute NC . The Cancer Genome Atlas ; 2025 . https://www.cancer.gov/tcga . 9. ↵ Geert Litjens , Pinckaers H , Kartasalo K , Maggie , Eklund M , Pekka Ruusuvuori , et al. Prostate cANcer graDe Assessment (PANDA) Challenge ; 2020 . https://kaggle.com/competitions/prostate-cancer-grade-assessment . 10. ↵ Nir G , Hor S , Karimi D , Fazli L , Skinnider B , Tavassoli P , et al. Automatic grading of prostate cancer in digitized histopathology images: Learning from multiple experts . Medical image analysis . 2018 ; 50 : 167 – 80 . OpenUrl CrossRef PubMed 11. ↵ Karimi D , Nir G , Fazli L , Black P , Goldenberg L , Salcudean S. Deep Learning-Based Gleason Grading of Prostate Cancer From Histopathology Images—Role of Multiscale Decision Aggregation and Data Augmentation . IEEE journal of biomedical and health informatics . 2019 ; 24 ( 5 ): 1413 – 1426 . OpenUrl 12. 
↵ Liu Z , Lin Y , Cao Y , Hu H , Wei Y , Zhang Z , et al. Swin transformer: Hierarchical vision transformer using shifted windows . In : Proceedings of the IEEE/CVF international conference on computer vision ; 2021 . p. 10012 – 10022 . 13. ↵ Fogarty R , Goldgof D , Hall L , Lopez A , Johnson J , Gadara M , et al. Classifying Malignancy in Prostate Glandular Structures from Biopsy Scans with Deep Learning . Cancers . 2023 ; 15 ( 8 ): 2335 . OpenUrl PubMed 14. ↵ Fischler MA , Bolles R . Random sample consensus: a paradigm for model fitting with applications to image analysis and automated cartography . Communications of the ACM . 1981 ; 24 ( 6 ): 381 – 395 . OpenUrl CrossRef Web of Science 15. ↵ Northcutt C , Jiang L , Chuang I . Confident learning: Estimating uncertainty in dataset labels . Journal of Artificial Intelligence Research . 2021 ; 70 : 1373 – 1411 . OpenUrl 16. ↵ Cleanlab . Cleanlab library ; 2025 . https://cleanlab.ai/ . 17. ↵ George T , Nodet P , Bondu A , Lemaire V . Mislabeled examples detection viewed as probing machine learning models: concepts, survey and extensive benchmark . arXiv preprint arXiv : 241015772 . 2024 ;. 18. ↵ Jiang X , Osl M , Kim J , Ohno-Machado L . Calibrating predictive model estimates to support personalized medicine . Journal of the American Medical Informatics Association . 2012 ; 19 ( 2 ): 263 – 274 . OpenUrl CrossRef PubMed 19. ↵ Gawlikowski J , Tassi CRN , Ali M , Lee J , Humt M , Feng J , et al. A survey of uncertainty in deep neural networks . Artificial Intelligence Review . 2023 ; 56 ( Suppl 1 ): 1513 – 1589 . OpenUrl 20. ↵ Parzen E . On Estimation of a Probability Density Function and Mode . The Annals of Mathematical Statistics . 1962 ; 33 ( 3 ): 1065 – 1076 . OpenUrl CrossRef 21. ↵ Pocevičiūtė M , Eilertsen G , Jarkman S , Lundström C. Generalisation effects of predictive uncertainty estimation in deep learning for digital pathology . Scientific Reports . 2022 ; 12 ( 1 ): 8329 . OpenUrl PubMed 22. 
↵ Rajaraman S , Ganesan P , Antani S . Deep learning model calibration for improving performance in class-imbalanced medical image classification tasks . PloS one . 2022 ; 17 ( 1 ): e0262838 . OpenUrl CrossRef PubMed 23. ↵ Zhan K , Wang Y , Zhuo Y , Zhan Y , Yan Q , Shan F , et al. An uncertainty-aware self-training framework with consistency regularization for the multilabel classification of common computed tomography signs in lung nodules . Quantitative Imaging in Medicine and Surgery . 2023 ; 13 ( 9 ): 5536 . OpenUrl 24. Acharja S , Hasan MZ , Chamok FH , Fahim KU , Shuva TF , Bulbul AAM , et al. OBoctNet: Enhancing Ophthalmic Biomarker Detection Through Active Learning and Explainable AI in Radiological Analysis . Cognitive Computation . 2025 ; 17 ( 3 ): 1 – 18 . OpenUrl 25. ↵ Ouyang M , Fu Y , Yan R , Shi S , Ling X , Zhu L , et al. MergeUp-augmented Semi-Weakly Supervised Learning for WSI Classification . arXiv preprint arXiv:240812825. 2024 ;. 26. ↵ Yang J , Chen L , Liu E , Wang B , Driman DK , Zhang Q , et al. Deep learning system for true-and pseudo-invasion in colorectal polyps . Scientific Reports . 2024 ; 14 ( 1 ): 426 . OpenUrl PubMed 27. ↵ Del Amor R , Silva-Rodríguez J , Naranjo V. Labeling confidence for uncertainty-aware histology image classification . Computerized Medical Imaging and Graphics . 2023 ; 107 : 102231 . OpenUrl PubMed 28. ↵ Wilcoxon F . Individual comparisons by ranking methods . Biometrics Bulletin . 1945 ; 1 ( 6 ): 80 – 83 . doi: 10.2307/3001968 . OpenUrl CrossRef 29. ↵ Iczkowski KA , Paner GP , Van der Kwast T . The new realization about cribriform prostate cancer . Advances in anatomic pathology . 2018 ; 25 ( 1 ): 31 – 37 . OpenUrl CrossRef PubMed 30. ↵ Abadi M , Agarwal A , Barham P , Brevdo E , Chen Z , Citro C , et al. TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems ; 2015 . Available from: https://www.tensorflow.org/ . 31. ↵ Simonyan K , Zisserman A. 
Very Deep Convolutional Networks for Large-Scale Image Recognition . arXiv 2015, arXiv:14091556. 2015 ;. 32. ↵ Mehrtash A , Wells WM , Tempany CM , Abolmaesumi P , Kapur T . Confidence calibration and predictive uncertainty estimation for deep medical image segmentation . IEEE transactions on medical imaging . 2020 ; 39 ( 12 ): 3868 – 3878 . OpenUrl CrossRef PubMed 33. ↵ Guo C , Pleiss G , Sun Y , Weinberger KQ. On Calibration of Modern Neural Networks . In: Precup D , Teh YW , editors. Proceedings of the 34th International Conference on Machine Learning . vol. 70 of Proceedings of Machine Learning Research. PMLR ; 2017 . p. 1321 – 1330 . Available from: https://proceedings.mlr.press/v70/guo17a.html . OpenUrl 34. ↵ Sensoy M , Kaplan L , Kandemir M . Evidential deep learning to quantify classification uncertainty . Advances in neural information processing systems . 2018 ; 31 . 35. ↵ Hoppe F , Verdun CM , Laus H , Endt S , Menzel MI , Krahmer F , et al. Imaging with Confidence: Uncertainty Quantification for High-Dimensional Undersampled MR Images . In: European Conference on Computer Vision . Springer ; 2024 . p. 432 – 450 . 36. ↵ Ekambaram R , Goldgof DB , Hall LO. Finding label noise examples in large scale datasets . In: 2017 IEEE International Conference on Systems, Man, and Cybernetics (SMC). IEEE; 2017 . p. 2420 – 2424 . View the discussion thread. Back to top Previous Next Posted November 27, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following Improving automated prostate pathological grading via confidence filtering Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Improving automated prostate pathological grading via confidence filtering Ryan B. Fogarty , Dmitry B. Goldgof , Lawrence O. Hall , Jasreman Dhillon , Vaibhav Chumbalkar , Yoganand Balagurunathan medRxiv 2025.11.25.25340482; doi: https://doi.org/10.1101/2025.11.25.25340482 Share This Article: Copy Citation Tools Improving automated prostate pathological grading via confidence filtering Ryan B. Fogarty , Dmitry B. Goldgof , Lawrence O. Hall , Jasreman Dhillon , Vaibhav Chumbalkar , Yoganand Balagurunathan medRxiv 2025.11.25.25340482; doi: https://doi.org/10.1101/2025.11.25.25340482 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Pathology Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4414) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15208) Forensic Medicine (30) Gastroenterology (1120) Genetic and Genomic Medicine (6575) Geriatric Medicine (666) Health Economics (994) Health Informatics (4512) Health Policy (1365) Health Systems and Quality Improvement (1609) Hematology (537) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15905) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6576) Nursing (345) Nutrition (998) Obstetrics and 
Gynecology (1139) Occupational and Environmental Health (955) Oncology (3319) Ophthalmology (969) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5423) Public and Global Health (9208) Radiology and Imaging (2192) Rehabilitation Medicine and Physical Therapy (1367) Respiratory Medicine (1193) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9fed992ffd1c06db',t:'MTc3OTMwMzc3NA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.