Diagnostic Performance of Self-Supervised Foundation Models for Intraoperative Quantification of Hepatic Macrovesicular Steatosis

doi:10.1101/2025.09.16.25335833

Diagnostic Performance of Self-Supervised Foundation Models for Intraoperative Quantification of Hepatic Macrovesicular Steatosis

2025 · doi:10.1101/2025.09.16.25335833

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 45,038 characters · extracted from preprint-html · click to expand

Diagnostic Performance of Self-Supervised Foundation Models for Intraoperative Quantification of Hepatic Macrovesicular Steatosis | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Diagnostic Performance of Self-Supervised Foundation Models for Intraoperative Quantification of Hepatic Macrovesicular Steatosis View ORCID Profile Shunsuke Koga , Anjani Guda , Yujie Wang , Aarush Sahni , Jiahui Wu , Alyssa Rosen , Jaxson Nield , Nilan Nandish , Krunal Patel , Haviva Goldman , Chamith S. 
Rajapakse , Selemon Walle , Kristen Stashek , Rashmi Tondon , Zahra Alipour doi: https://doi.org/10.1101/2025.09.16.25335833 Shunsuke Koga 1 Department of Pathology and Laboratory Medicine, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA MD, Ph.D. Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Shunsuke Koga Anjani Guda 2 Department of Neurobiology and Anatomy, Drexel University , Philadelphia, Pennsylvania, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yujie Wang 3 Department of Radiology, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aarush Sahni 3 Department of Radiology, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jiahui Wu 3 Department of Radiology, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alyssa Rosen 3 Department of Radiology, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jaxson Nield 3 Department of Radiology, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nilan Nandish 3 Department of Radiology, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Krunal Patel 3 Department of Radiology, Hospital of the University of Pennsylvania , Philadelphia, 
Pennsylvania, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Haviva Goldman 2 Department of Neurobiology and Anatomy, Drexel University , Philadelphia, Pennsylvania, USA Ph.D. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Chamith S. Rajapakse 3 Department of Radiology, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA Ph.D. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Selemon Walle 1 Department of Pathology and Laboratory Medicine, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kristen Stashek 1 Department of Pathology and Laboratory Medicine, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rashmi Tondon 1 Department of Pathology and Laboratory Medicine, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zahra Alipour 1 Department of Pathology and Laboratory Medicine, Hospital of the University of Pennsylvania , Philadelphia, Pennsylvania, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: zahra.alipour{at}pennmedicine.upenn.edu Abstract Full Text Info/History Metrics Preview PDF Abstract Introduction Accurate intraoperative assessment of macrovesicular steatosis in donor liver biopsies is critical for transplantation decisions but is often limited by inter-observer variability and freezing artifacts that can obscure histological details. 
Artificial intelligence (AI) offers a potential solution for standardized and reproducible evaluation. To evaluate the diagnostic performance of two self-supervised learning (SSL)-based foundation models, Prov-GigaPath and UNI, for classifying macrovesicular steatosis in frozen liver biopsy sections, compared with assessments by surgical pathologists. Methods We retrospectively analyzed 131 frozen liver biopsy specimens from 68 donors collected between November 2022 and September 2024. Slides were digitized into whole-slide images, tiled into patches, and used to extract embeddings with Prov-GigaPath and UNI; slide-level classifiers were then trained and tested. Intraoperative diagnoses by on-call surgical pathologists were compared with ground truth determined from independent reviews of permanent sections by two liver pathologists. Accuracy was evaluated for both five-category classification and a clinically significant binary threshold (<30% vs. ≥30%). Results For binary classification, Prov-GigaPath achieved 96.4% accuracy, UNI 85.7%, and surgical pathologists 84.0% ( P = .22). In five-category classification, accuracies were lower: Prov-GigaPath 57.1%, UNI 50.0%, and pathologists 58.7% ( P = .70). Misclassification primarily occurred in intermediate categories (5%–<30% steatosis). Conclusions SSL-based foundation models performed comparably to surgical pathologists in classifying macrovesicular steatosis, at the clinically relevant <30% vs. ≥30% threshold. These findings support the potential role of AI in standardizing intraoperative evaluation of donor liver biopsies; however, the small sample size limits generalizability and requires validation in larger, balanced cohorts. Introduction Orthotopic liver transplantation is the standard treatment for end-stage liver disease, but outcomes depend significantly on donor organ quality. 
Among various histopathologic factors, macrovesicular steatosis strongly influences graft viability, with increasing levels of steatosis correlated with increased risks of primary non-function and early allograft dysfunction. 1 – 3 While donor livers with mild macrovesicular steatosis (<30%) generally show comparable outcomes to non-steatotic grafts, 4 moderate-to-severe steatosis (≥30%) markedly compromises graft survival and overall transplant success. 5 Recent large-scale analyses revealed that livers with macrovesicular steatosis ≥31% have significantly reduced transplant utilization rates, with approximately 55% of these organs discarded due to elevated concerns regarding graft failure. 3 Despite these risks, steatotic donor livers remain a valuable resource, particularly as their prevalence rises alongside increasing obesity and metabolic syndrome rates. Therefore, precise and standardized assessment of macrovesicular steatosis is critical to optimizing organ utilization while mitigating transplant risks. In current practice, intraoperative frozen section evaluation of donor liver biopsies is widely used to assess steatosis and guide real-time transplant decisions. 6 – 8 However, this approach has inherent limitations, including freezing artifacts that obscure tissue architecture and significant inter-observer variability in steatosis estimation. Additionally, there is considerable variability among pathologists, including fundamental disagreements regarding concepts such as microvesicular steatosis. 9 This variability highlights the limitations of relying solely on pathologist-based assessments and underscores the need for more objective and standardized methods. Furthermore, intraoperative frozen sections are often evaluated by on-call pathologists who infrequently assess liver biopsies and thus face challenges interpreting freezing artifacts. 
These factors contribute to discrepancies between frozen and permanent section assessments, leading to potential misclassification of graft suitability. 10 Overestimation of steatosis may result in unnecessary organ discard, while underestimation increases the risk of transplanting marginal grafts with compromised function. Objective and standardized assessment methods, such as artificial intelligence-based analysis trained to recognize steatosis and freezing artifacts, may decrease subjective variability, reducing unnecessary organ discard and the risk associated with marginal graft transplantation. AI-based digital pathology offers promising solutions to these challenges. 11 – 17 Supervised approaches such as convolutional neural networks (CNNs) require large annotated datasets, while self-supervised learning (SSL) methods can learn effectively from unlabeled images. 13 , 18 Both approaches can rapidly and consistently analyze digitized histology images, reduce subjective variability, and provide quantitative measures. Previous CNN-based studies for steatosis quantification in donor liver biopsies have shown strong correlations (r = 0.85, ICC = 0.85) with expert annotations, highlighting their potential to outperform intraoperative pathologists' assessments. 19 Another CNN-based study reported independent associations between AI-based measurements and early allograft dysfunction, with improved predictive calibration compared to manual evaluation. 20 Integrating AI models into intraoperative frozen-section workflows could therefore enhance the accuracy and reliability of donor liver assessments, facilitating more informed decisions regarding organ acceptance. 21 In this study, we evaluated two recently developed SSL-based foundation models, Prov-GigaPath 22 and UNI, 23 for assessing macrovesicular steatosis in 131 frozen liver biopsy specimens. 
We compared their performance with intraoperative frozen-section diagnoses from pathologists, using consensus ground truth determined from permanent sections evaluated by two experienced liver pathologists. Materials and Methods Cohort This retrospective study included 131 donor liver biopsy specimens from 68 donors, obtained during organ procurement procedures performed at the Hospital of the University of Pennsylvania (Philadelphia, Pennsylvania) between November 2022 and September 2024. All biopsy specimens were collected in collaboration with the Gift of Life Donor Program. 24 The hospital follows a subspecialty sign-out model, and donor liver biopsies are interpreted on frozen section by the on-call service. During this period, a total of nineteen board-certified anatomic pathology attendings served on the on-call frozen-section service and evaluated donor liver biopsies contemporaneously during organ procurement. The overall workflow of the study is summarized in Figure 1 . The study protocol was approved by the Institutional Review Board at the Hospital of the University of Pennsylvania. Download figure Open in new tab Figure 1. Workflow of AI-based evaluation and pathologist assessment of donor liver biopsies. Liver biopsies are obtained from 68 donors, yielding 131 specimens. Slides are hematoxylin & eosin (H&E)-stained and scanned to generate 131 whole-slide images (WSIs), which are tiled into image patches used for training and testing artificial intelligence (AI) models. Intraoperative frozen section diagnoses (H&E-stained) are made by on-service or on-call surgical pathologists as part of routine clinical workflow. Permanent H&E-stained sections are independently reviewed by two liver pathologists specifically for this study to establish the consensus ground truth. AI-based diagnoses are generated using two foundation models, Prov-GigaPath and UNI. 
Pathological Assessment and Ground Truth Determination Macrovesicular steatosis was defined as the presence of large lipid droplets within hepatocytes that displace the nucleus, consistent with definitions widely adopted by liver transplant pathologists. 25 Microvesicular steatosis was not assessed in this study due to its limited established clinical relevance in transplantation decisions, inconsistent usage of the term among pathologists, and lack of standardized definitions and assessment criteria across institutions. 9 Steatosis categories were determined according to the Gift of Life Donor Program’s standard pathology evaluation form, as described previously, and classified into five categories: <5%, 5% to <10%, 10% to <20%, 20% to <30%, and ≥30%. These categories were applied for both intraoperative frozen-section assessments and permanent-section reviews. In December 2024, two liver pathologists (RT and ZA), both with fellowship training in medical liver, independently established the ground truth by reviewing permanent hematoxylin & eosin (H&E) sections, without access to the frozen-section diagnoses at the time of review. When discrepancies occurred between the two pathologists, the slides were jointly reviewed with a third GI pathologist (KS) to reach consensus. Digital Pathology and Feature Extraction Whole-slide images (WSIs) of frozen liver biopsies were digitized using a high-throughput scanner at an original resolution of 0.104 µm/pixel and subsequently downsampled to 0.5 µm/pixel for AI processing. Two SSL-based foundation models, Prov-GigaPath and UNI, were used to classify macrovesicular steatosis in frozen-section WSIs. Prov-GigaPath is a Vision Transformer-based model pretrained in a self-supervised manner on more than 170,000 WSIs from multiple tissue types. 22 It employs dilated self-attention mechanisms to capture spatial relationships across entire slides, enabling robust recognition of histopathological features. 
UNI is a self-supervised pathology encoder trained on 100 million histopathology image patches and 100,000 WSIs, designed for multi-scale representation learning with strong transfer performance. 23 From each WSI, model-derived embeddings were computed and used as inputs to downstream slide-level classifiers. A linear classifier was trained on these embeddings with class weights to address category imbalance, using permanent-section steatosis categories as labels. Model Training and Evaluation Donors were randomly assigned 80:20 to a training cohort and a held-out test cohort. The held-out test cohort comprised 28 slides. Because the split was performed at the donor level, the number of slides per cohort does not necessarily follow an exact 80:20 ratio. All slides from the same donor were confined to a single cohort to prevent data leakage. Within the training cohort, we performed four-fold cross-validation for model selection and hyperparameter tuning. Folds were created at the donor level with approximate balance of steatosis categories. After cross-validation, the linear classifier was refit on the full training cohort with the selected settings and then evaluated once on the held-out test cohort. Performance was assessed for both the original five-category task and a binary task using the clinically relevant threshold of 30% (<30% vs ≥30%). Statistical Analysis Interobserver agreement between two liver pathologists in quantifying macrovesicular steatosis was assessed using weighted Cohen’s kappa (κ) with quadratic weighting, accounting for the ordinal steatosis severity categories <5%, 5% to <10%, 10% to <20%, 20% to <30%, and ≥30%. Agreement between intraoperative frozen section diagnoses by on-call pathologists and final permanent section diagnoses was similarly evaluated using weighted Cohen’s kappa. Weighted kappa was calculated using the cohen_kappa_score function from the scikit-learn package (version 1.2) in Python 3.8. 
Prior to calculating weighted Cohen’s kappa, categorical labels were encoded numerically as 0 (<5%), 1 (5%–<10%), 2 (10%–<20%), 3 (20%–<30%), and 4 (≥30%). No missing values were present in the dataset; thus, no additional imputation was performed. For comparative analysis, diagnostic accuracy of intraoperative frozen section diagnoses by on-call pathologists and AI models (Prov-GigaPath and UNI) was evaluated. Accuracy was assessed for both the five-category classification (steatosis severity categories) and a binary classification using a clinically relevant threshold of 30% (<30% vs. ≥30%). Performance was not stratified by individual pathologist training or years of experience due to the limited number of cases per reviewer; our aim was to assess overall diagnostic accuracy rather than subgroup differences. Statistical significance of accuracy differences between groups was determined using the chi-square test, performed in Python 3.8 (scipy.stats.chi2_contingency, SciPy version 1.10). Results Cohort Summary The study cohort included 131 liver biopsy specimens obtained from 68 donors (31 men and 37 women). Macrovesicular steatosis distribution was as follows: <5% (n = 65; 49.6%), 5% to <10% (n = 12; 9.2%), 10% to <20% (n = 21; 16.0%), 20% to <30% (n = 8; 6.1%), and ≥30% (n = 25; 19.1%) ( Figure 2 ). Interobserver agreement between the two liver pathologists was high (weighted Cohen’s kappa = 0.98), indicating strong consistency in steatosis assessment. Download figure Open in new tab Figure 2. Distribution of macrovesicular steatosis in the study cohort The histogram illustrates the distribution of 131 donor liver biopsies across five macrovesicular steatosis categories: <5%, 5% to <10%, 10% to <20%, 20% to <30%, and ≥30%. 
Intraoperative frozen and permanent sections Among 131 donor liver biopsy specimens, intraoperative frozen section diagnoses showed an overall accuracy of 84.0% (110/131) when evaluated as a binary classification using a 30% steatosis threshold (<30% vs. ≥30%). Frozen section assessments correctly identified 95.2% (100/105) of specimens with <30% steatosis, but only 38.4% (10/26) of those with ≥30% steatosis. When evaluating all five steatosis categories (<5%, 5% to <10%, 10% to <20%, 20% to <30%, ≥30%), frozen section diagnoses were concordant with permanent section diagnoses in 58.7% (77/131) of specimens, with a weighted Cohen’s kappa of 0.40, indicating fair agreement. As shown in the confusion matrix ( Figure 3 ), concordance was highest in the <5% steatosis category (87%, 57/65), whereas discrepancies occurred more frequently in categories ≥5%. Download figure Open in new tab Figure 3. Confusion matrix of diagnostic discrepancies between frozen and permanent sections The confusion matrix compares intraoperative frozen section diagnoses (x-axis) with ground truth permanent section assessments (y-axis). Color intensity indicates the number of cases in each category. Notably, significant discrepancies included one specimen categorized as >30% steatosis by frozen section but subsequently found to have <5% steatosis on the permanent section ( Figure 4A , B ). Another specimen initially assessed as 5% to <10% steatosis on frozen section was found to have >30% steatosis upon permanent section review ( Figure 4C , D ). These discrepancies likely arose from freezing artifacts, sampling variability, or inherent limitations of frozen section interpretation. Download figure Open in new tab Figure 4. Representative cases of diagnostic discrepancies between frozen and permanent sections (A, B) A donor liver biopsy is diagnosed as 50% steatosis on frozen section (A) but is reassessed as <5% on permanent section (B). (C, D) Another case is initially diagnosed as 5% to <10% steatosis on frozen section (C) but is reassessed as >30% steatosis on permanent section (D). 
All images are stained with hematoxylin-eosin, original magnification ×10. Prov-GigaPath model In the training dataset, the Prov-GigaPath achieved an accuracy of 98.4% (129/131) in binary classification (<30% vs. ≥30%), correctly classifying 98.2% (109/111) with <30% steatosis and 100% (20/20) with ≥30% steatosis. For the five-category classification, overall accuracy was 88.4%. The model correctly identified all 20 cases (100%) in the ≥30% category and 15 of 16 cases (93.8%) in the 10%–<20% category. Misclassifications were primarily observed in the <5% category, where 8 cases were incorrectly assigned to adjacent higher categories, indicating difficulty distinguishing minimal from mild steatosis ( Figure 5A ). Download figure Open in new tab Figure 5. Confusion matrices for AI-based steatosis classification in frozen liver sections. The x-axis represents predicted categories, and the y-axis represents ground truth based on pathologist consensus. Color intensity indicates case count. (A) Prov-GigaPath model – training dataset. (B) Prov-GigaPath model – test dataset. (C) The UNI model – training dataset. (D) The UNI model – test dataset. In the test dataset, the Prov-GigaPath achieved an accuracy of 96.4% (27/28 cases) in binary classification, correctly classifying all 23 cases with <30% steatosis and 4 of 5 cases (80.0%) with ≥30% steatosis. For the five-category classification, overall accuracy was lower at 57.1% (16/28). The model correctly classified 69.2% (9/13) of cases in the <5% category and 80.0% (4/5) in the ≥30% category. However, accuracy markedly decreased in the other three categories. Notably, the 10%–<20% category had no correctly classified cases, highlighting challenges in differentiating subtle differences in steatosis percentages under test conditions ( Figure 5B ). UNI Model In the training dataset, the UNI achieved an accuracy of 96.9% (127/131) in binary classification (<30% vs. 
≥30%), correctly classifying 96.4% (107/111) with <30% steatosis and 95.0% (19/20) with ≥30% steatosis. For the five-category classification, overall accuracy was 67.0%. The model performed well in classifying cases at the extremes of macrovesicular steatosis, correctly identifying 49 out of 52 cases (94.2%) in the lowest (<5%) category and performing similarly well in the highest (≥30%) category. However, accuracy decreased in intermediate categories. The 10% to <20% steatosis group had 13 of 16 cases correctly classified, whereas the 5% to <10% and 20% to <30% categories exhibited more frequent misclassifications ( Figure 5C ). In the test dataset, the UNI achieved an accuracy of 85.7% (24/28 cases) in binary classification, correctly classifying 95.7% (22/23) with <30% steatosis and 40.0% (2/5) with ≥30% steatosis. For the original five-category classification, overall accuracy was 50.0% (14/28). Accuracy was highest in the ≥30% category (80.0%, 4/5) and the <5% category (76.9%, 10/13), but substantially lower in the 5%–<10% and 10%–<20% categories, where no cases were correctly classified ( Figure 5D ). Diagnostic accuracy of intraoperative frozen section assessment by on-call pathologists and AI models (Prov-GigaPath and UNI) is summarized in Table 1 . In both binary and five-category classification tasks, accuracy differences among on-call pathologists and AI models were not statistically significant (binary classification: P = .22; five-category classification: P = .70). View this table: View inline View popup Download powerpoint Table 1. Comparison of diagnostic accuracy between on-call pathologists and AI models for steatosis classification Discussion Our study evaluated two self-supervised foundation models, Prov-GigaPath and UNI, for quantifying macrovesicular steatosis in donor liver biopsies. In the clinically critical binary classification (<30% vs. ≥30%), both models achieved high accuracy on the test dataset (Prov-GigaPath: 96.4%; UNI: 85.7%). 
Although their accuracy was numerically higher than that of intraoperative frozen-section assessments by on-call pathologists (84.0%), the differences were not statistically significant. Nevertheless, the high performance of these AI models highlights their potential for reliably identifying clinically relevant steatosis thresholds. For the five-category steatosis classification, accuracy decreased for both AI models (Prov-GigaPath: 57.1%; UNI: 50.0%) and was numerically lower than that of intraoperative frozen section evaluations by on-call pathologists (58.7%), although this difference was not statistically significant. This lower accuracy was mainly due to the difficulty of differentiating subtle histological differences in underrepresented categories, such as 5% to <10%, 10% to <20%, and 20% to <30%. The dataset was imbalanced, with nearly half of samples (49.6%) in the <5% steatosis category and very few cases in other categories (e.g., only 6.1% in 20%–<30%). Although SSL methods generally require fewer labeled data for fine-tuning than conventional supervised models, the extremely small sample sizes in these specific categories likely limited effective model training. Additionally, freezing artifacts common in intraoperative frozen sections may further complicate precise categorization at these intermediate levels. Given these findings, future implementation should prioritize optimized binary classification algorithms focusing on clinically significant thresholds (e.g., ≥30%) rather than multi-class categorization, which would enhance integration of AI into routine clinical decision-making. Histopathological image analysis traditionally relied on CNN models, which have demonstrated robust performance in various pathology tasks, including steatosis quantification in liver biopsies. 
26 , 27 However, CNN-based models typically require substantial amounts of labeled data for effective training and fine-tuning, a significant limitation in pathology given the cost and complexity of obtaining expert-labeled histopathological datasets. 13 , 15 In contrast, SSL-based foundation models, such as Prov-GigaPath and UNI, address this limitation by leveraging large-scale pretraining on extensive, unlabeled histopathological image repositories, enabling rapid adaptation to specific tasks with substantially fewer labeled samples. 22 , 23 Recent comparative studies, such as the evaluation of the UNI foundation model, reported that UNI outperformed ResNet-50 and other pretrained encoders on average, with the greatest gains in rare cancer classification and complex diagnostic tasks. 28 Our results support these findings, highlighting SSL-based foundation models’ capacity for effective feature extraction and generalization, which is particularly valuable in settings where labeled data are limited. These models inherently capture broad morphological features and context across WSIs, an advantage over traditional CNN methods that typically analyze small local patches and may overlook spatially relevant histological features. The transformer architecture, employed by Prov-GigaPath, further enhances this global feature integration by utilizing self-attention mechanisms to better preserve spatial context across gigapixel images. 29 , 30 Collectively, these strengths position SSL-based foundation models as a highly attractive approach for clinical deployment, enabling precise histopathological assessments from relatively limited annotated data. The integration of WSI into intraoperative frozen section evaluation presents several practical challenges. A key limitation is the additional scanning time required, as even a few extra minutes per slide could disrupt workflow efficiency in the time-sensitive setting of liver transplantation. 
While scanning technology continues to improve, current turnaround times remain suboptimal for real-time decision-making. In our study, all 131 frozen-section slides were digitized in a single batch, and because we did not record individual scan or inference times, we cannot report exact single-slide turnaround, which limits the quantitative assessment. Additionally, pathology residents or assistants responsible for frozen section preparation would need training to operate scanners, introducing workflow adjustments and potential variability. Further studies should assess the feasibility, time efficiency, and cost-effectiveness of incorporating WSI and AI-based analysis into routine intraoperative workflows. Another limitation is the modest size of our independent test set (n = 28), which limits statistical power to detect differences between AI models and pathologists. In addition, the uneven distribution of steatosis categories, predominantly <5%, limited AI model performance in intermediate categories. Our analysis was also focused solely on classical macrovesicular steatosis, and we did not evaluate small-droplet macrovesicular steatosis or microvesicular steatosis due to the lack of standardized definitions. 3 Consequently, the AI models may misclassify these patterns, representing a limitation of the current study. Future research should include balanced datasets that represent these lipid patterns, employ advanced data augmentation techniques, and integrate other histologic parameters such as inflammation and fibrosis. Addressing these elements would enhance the generalizability of AI models and provide a more comprehensive and clinically relevant assessment for transplant. Finally, this study did not include transplant recipient outcome data, as intraoperative frozen section evaluations were performed in collaboration with the Gift of Life donor program, which does not provide post-transplant follow-up. 
Our study compared AI models and pathologists in assessing macrovesicular steatosis but adding transplant outcomes would increase clinical relevance. Ideally, AI models should be trained to predict graft survival and patient prognosis, which would help guide transplant decisions. In conclusion, our study supports the clinical utility of SSL-based foundation models for reliable quantification of macrovesicular steatosis in donor liver biopsies. While limitations remain, particularly regarding intermediate steatosis categories and clinical integration challenges, the demonstrated advantages in accuracy, label efficiency, and generalization highlight their promising potential. Further refinement of training methods and inclusion of clinical outcome data could advance AI-based steatosis evaluation toward standardized, objective intraoperative assessments that improve transplant decision-making and patient outcomes. Ethical Approval This study was approved by the Institutional Review Board at the Hospital of the University of Pennsylvania, meeting exemption criteria (category 4) with a HIPAA waiver. This study was conducted in accordance with the Declaration of Helsinki 1975. Informed Consent In accordance with institutional policy, the hospital obtains general consent at the time of care that permits secondary research use of de-identified clinical data and archived pathology material; therefore, the Institutional Review Board waived the requirement for study-specific informed consent. Declaration of Conflicting Interests None declared. Funding This study did not receive any funding. Data Availability Statement All data produced in the present study are available upon reasonable request to the authors. References ↵ Yoong , K. F. et al. Impact of donor liver microvesicular steatosis on the outcome of liver retransplantation . Transplant Proc 31 , 550 – 551 ( 1999 ). doi: 10.1016/s0041-1345(98)01550-4 OpenUrl CrossRef PubMed Brunt , E. M . 
Nonalcoholic Fatty Liver Disease: Pros and Cons of Histologic Systems of Evaluation . Int J Mol Sci 17 ( 2016 ). doi: 10.3390/ijms17010097 OpenUrl CrossRef ↵ Kwong , A. J. et al. Impact of Donor Liver Macrovesicular Steatosis on Deceased Donor Yield and Posttransplant Outcome . Transplantation 107 , 405 – 409 ( 2023 ). doi: 10.1097/TP.0000000000004291 OpenUrl CrossRef ↵ Selzner , M. & Clavien , P. A . Fatty liver in liver transplantation and surgery . Semin Liver Dis 21 , 105 – 113 ( 2001 ). doi: 10.1055/s-2001-12933 OpenUrl CrossRef PubMed Web of Science ↵ Chu , M. J. , Dare , A. J. , Phillips , A. R. & Bartlett , A. S . Donor Hepatic Steatosis and Outcome After Liver Transplantation: a Systematic Review . J Gastrointest Surg 19 , 1713 – 1724 ( 2015 ). doi: 10.1007/s11605-015-2832-1 OpenUrl CrossRef PubMed ↵ D’Alessandro , A. M. et al. The predictive value of donor liver biopsies for the development of primary nonfunction after orthotopic liver transplantation . Transplantation 51 , 157 – 163 ( 1991 ). doi: 10.1097/00007890-199101000-00024 OpenUrl CrossRef PubMed Web of Science Markin , R. S. et al. Frozen section evaluation of donor livers before transplantation . Transplantation 56 , 1403 – 1409 ( 1993 ). doi: 10.1097/00007890-199312000-00025 OpenUrl CrossRef PubMed Web of Science ↵ Cesaretti , M. , Addeo , P. , Schiavo , L. , Anty , R. & Iannelli , A . Assessment of Liver Graft Steatosis: Where Do We Stand? Liver Transpl 25 , 500 – 509 ( 2019 ). doi: 10.1002/lt.25379 OpenUrl CrossRef PubMed ↵ Neil , D. A. H. et al. Banff consensus recommendations for steatosis assessment in donor livers . Hepatology 75 , 1014 – 1025 ( 2022 ). doi: 10.1002/hep.32208 OpenUrl CrossRef PubMed ↵ Intraobserver and interobserver variations in liver biopsy interpretation in patients with chronic hepatitis C. The French METAVIR Cooperative Study Group . Hepatology 20 , 15 – 20 ( 1994 ). OpenUrl CrossRef PubMed Web of Science ↵ Bera , K. , Schalper , K. A. , Rimm , D. L. , Velcheti , V. 
& Madabhushi , A . Artificial intelligence in digital pathology - new tools for diagnosis and precision oncology . Nat Rev Clin Oncol 16 , 703 – 715 ( 2019 ). doi: 10.1038/s41571-019-0252-y OpenUrl CrossRef PubMed Niazi , M. K. K. , Parwani , A. V. & Gurcan , M. N. Digital pathology and artificial intelligence . Lancet Oncol 20 , e253 – e261 ( 2019 ). doi: 10.1016/S1470-2045(19)30154-8 OpenUrl CrossRef PubMed ↵ Komura , D. , Ochi , M. & Ishikawa , S . Machine learning methods for histopathological image analysis: Updates in 2024 . Comput Struct Biotechnol J 27 , 383 – 400 ( 2025 ). doi: 10.1016/j.csbj.2024.12.033 OpenUrl CrossRef PubMed Koga , S. , Ikeda , A. & Dickson , D. W . Deep learning-based model for diagnosing Alzheimer’s disease and tauopathies . Neuropathol Appl Neurobiol 48 , e12759 ( 2022 ). doi: 10.1111/nan.12759 OpenUrl CrossRef ↵ Kim , M. et al. Diagnosis of Alzheimer Disease and Tauopathies on Whole-Slide Histopathology Images Using a Weakly Supervised Deep Learning Algorithm . Lab Invest 103 , 100127 ( 2023 ). doi: 10.1016/j.labinv.2023.100127 OpenUrl CrossRef PubMed Koga , S. & Du , W . Integrating AI in medicine: Lessons from Chat-GPT’s limitations in medical imaging . Dig Liver Dis ( 2024 ). doi: 10.1016/j.dld.2024.02.014 OpenUrl CrossRef ↵ Du , W. et al. Large language models in pathology: A comparative study of ChatGPT and Bard with pathology trainees on multiple-choice questions . Ann Diagn Pathol 73 , 152392 ( 2024 ). doi: 10.1016/j.anndiagpath.2024.152392 OpenUrl CrossRef ↵ Krishnan , R. , Rajpurkar , P. & Topol , E. J . Self-supervised learning in medicine and healthcare . Nat Biomed Eng 6 , 1346 – 1352 ( 2022 ). doi: 10.1038/s41551-022-00914-1 OpenUrl CrossRef ↵ Sun , L. et al. Deep learning quantification of percent steatosis in donor liver biopsy frozen sections . EBioMedicine 60 , 103029 ( 2020 ). doi: 10.1016/j.ebiom.2020.103029 OpenUrl CrossRef ↵ Narayan , R. R. et al. 
Artificial intelligence for prediction of donor liver allograft steatosis and early post-transplantation graft failure . HPB (Oxford ) 24 , 764 – 771 ( 2022 ). doi: 10.1016/j.hpb.2021.10.004 OpenUrl CrossRef PubMed ↵ Jiao , J. , Tang , H. , Sun , N. & Zhang , X . Artificial intelligence-aided steatosis assessment in donor livers according to the Banff consensus recommendations . Am J Clin Pathol 162 , 401 – 407 ( 2024 ). doi: 10.1093/ajcp/aqae053 OpenUrl CrossRef PubMed ↵ Xu , H. et al. A whole-slide foundation model for digital pathology from real-world data . Nature 630 , 181 – 188 ( 2024 ). doi: 10.1038/s41586-024-07441-w OpenUrl CrossRef PubMed ↵ Lu , M. Y. et al. A visual-language foundation model for computational pathology . Nat Med 30 , 863 – 874 ( 2024 ). doi: 10.1038/s41591-024-02856-4 OpenUrl CrossRef PubMed ↵ Timar , J. et al. Successful strategies to increase organ donation: the Gift of Life Donor Program Philadelphia model . Indian J Thorac Cardiovasc Surg 37 , 380 – 394 ( 2021 ). doi: 10.1007/s12055-021-01219-9 OpenUrl CrossRef PubMed ↵ Gluchowski , N. L. , Becuwe , M. , Walther , T. C. & Farese , R. V. , Jr . . Lipid droplets and liver disease: from basic biology to clinical implications . Nat Rev Gastroenterol Hepatol 14 , 343 – 355 ( 2017 ). doi: 10.1038/nrgastro.2017.32 OpenUrl CrossRef PubMed ↵ Roy , M. et al. Deep-learning-based accurate hepatic steatosis quantification for histological assessment of liver biopsies . Lab Invest 100 , 1367 – 1383 ( 2020 ). doi: 10.1038/s41374-020-0463-y OpenUrl CrossRef PubMed ↵ Taylor-Weiner , A. et al. A Machine Learning Approach Enables Quantitative Measurement of Liver Histology and Disease Monitoring in NASH . Hepatology 74 , 133 – 147 ( 2021 ). doi: 10.1002/hep.31750 OpenUrl CrossRef PubMed ↵ Chen , R. J. et al. Towards a general-purpose foundation model for computational pathology . Nat Med 30 , 850 – 862 ( 2024 ). doi: 10.1038/s41591-024-02857-3 OpenUrl CrossRef ↵ Vaswani , A. et al. 
Attention Is All You Need . arXiv:1706.03762 ( 2017 ). . ↵ Ciga , O. , Xu , T. & Martel , A. L . Self supervised contrastive learning for digital histopathology . arXiv:2011.13971 ( 2020 ). . View the discussion thread. Back to top Previous Next Posted September 17, 2025. Download PDF Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Diagnostic Performance of Self-Supervised Foundation Models for Intraoperative Quantification of Hepatic Macrovesicular Steatosis Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Diagnostic Performance of Self-Supervised Foundation Models for Intraoperative Quantification of Hepatic Macrovesicular Steatosis Shunsuke Koga , Anjani Guda , Yujie Wang , Aarush Sahni , Jiahui Wu , Alyssa Rosen , Jaxson Nield , Nilan Nandish , Krunal Patel , Haviva Goldman , Chamith S. Rajapakse , Selemon Walle , Kristen Stashek , Rashmi Tondon , Zahra Alipour medRxiv 2025.09.16.25335833; doi: https://doi.org/10.1101/2025.09.16.25335833 Share This Article: Copy Citation Tools Diagnostic Performance of Self-Supervised Foundation Models for Intraoperative Quantification of Hepatic Macrovesicular Steatosis Shunsuke Koga , Anjani Guda , Yujie Wang , Aarush Sahni , Jiahui Wu , Alyssa Rosen , Jaxson Nield , Nilan Nandish , Krunal Patel , Haviva Goldman , Chamith S. 
Rajapakse , Selemon Walle , Kristen Stashek , Rashmi Tondon , Zahra Alipour medRxiv 2025.09.16.25335833; doi: https://doi.org/10.1101/2025.09.16.25335833 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Pathology Subject Areas All Articles Addiction Medicine (569) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1510) Epidemiology (15230) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6609) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15923) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6607) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3337) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (664) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary Care Research (712) Psychiatry and Clinical Psychology (5448) Public and Global Health (9237) Radiology and Imaging (2202) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a019bbd43fcd3fe2',t:'MTc3OTc2NjU2Nw=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00