Triage with AI: A Rule-out Framework Quantifying the Risks and Benefits of Screening Mammogram Automation

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 37,446 characters · extracted from preprint-html · click to expand
Triage with AI: A Rule-out Framework Quantifying the Risks and Benefits of Screening Mammogram Automation | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Triage with AI: A Rule-out Framework Quantifying the Risks and Benefits of Screening Mammogram Automation View ORCID Profile Micheal H. Bernstein , View ORCID Profile Maggie Chung , View ORCID Profile Adam Yala , Grayson L. Baird doi: https://doi.org/10.1101/2025.04.25.25326396 Micheal H. 
Bernstein 1 Brown Radiology Human Factors Lab, Department of Diagnostic Imaging, The Warren Alpert Medical School, Brown University, and Brown University Health , Providence, RI PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Micheal H. Bernstein Maggie Chung 2 Department of Radiology and Biomedical Imaging, University of California , San Francisco, CA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Maggie Chung Adam Yala 3 Computational Precision Health, University of California, Berkeley and University of California , San Francisco, CA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Adam Yala Grayson L. Baird 1 Brown Radiology Human Factors Lab, Department of Diagnostic Imaging, The Warren Alpert Medical School, Brown University, and Brown University Health , Providence, RI PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: grayson_baird{at}brown.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background AI has been proposed as a triage or “rule out” device to reduce radiologist workload, but it is presently unclear how an AI triage threshold should be determined. We present a framework for determining an optimal threshold. Materials and Methods 114,229 bilateral 2D digital screening mammograms were retrospectively analyzed from 2006-2023. All mammograms were given an AI score using Mirai, an open-source deep-learning model. 
Several metrics were examined using two thresholds for determining ruled out versus retained cases: 1) Caseload Reduction Rate (CRR; percent of caseload reduced due to rule-out), 2) Gross AI False Omission Rate (G-FOR; probability of a patient having breast cancer if ruled out), 3) AI Net False Omission Rate (N-FOR; probability of a patient having breast cancer if ruled out and the radiologist would have caught it in standard care [i.e., no triage]), 4) AI Adjusted Net False Omission Rate (30%) (AN-FOR[30%]; N-FOR adjusted for the hypothetical scenario where radiologists detect an extra 30% of breast cancers among AI retained cases). The two thresholds were severity scores of 0.2 (Youden’s J) and 0.05 (AN-FOR[30%]=0). The former is mathematically optimal; the latter reflects a threshold where AI triage does not introduce any total increase in False Negatives. Results At the 0.20 threshold, G-FOR, N-FOR, and AN-FOR(30%) were 0.26%, 0.17%, and 0.14%, respectively (223, 141, and 121, respectively, missed cancer cases) and CRR=75%. At the 0.05 threshold, the G-FOR, N-FOR, and AN-FOR (30%) are 0.12%, 0.07%, and 0.00% (49, 30, and 0, respectively, missed cancer cases) and CRR=36%. Conclusion We demonstrate how radiology practices can consider the trade-offs of using different AI score triage thresholds. At the AN-FOR rate of 30%, the Youden’s J threshold results in 121 additional missed cancers for a 75% caseload reduction. We estimate no additional missed cancers at a 36% caseload reduction. Introduction The increasing use of artificial intelligence (AI) in radiology has prompted considerations about its potential in addressing the field’s mounting challenges. In recent years, the workload of radiologists has grown significantly. For instance, one study found that the workload for on-call radiologists in the Emergency Department quadrupled between 2006 and 2020. 
1 Another study 2 that examined billed work relative value units (RVUs) among more than 35,000 academic radiologists found a 60% increase in workload from 2008 to 2020. This growing burden contributes to rising rates of burnout. 3 , 4 Furthermore, the number of individuals entering radiology residency has not kept pace with the rise of imaging volume, 5 creating a growing workforce imbalance that is unlikely to be resolved in the near future. AI has the potential to alleviate some of this burden. Studies have found that AI can reduce medical imaging interpretation times. 6 – 8 One promising application to improve efficiency is using AI as a triage or “rule out” device. By identifying mammograms that are extremely low-risk, AI can reduce the number of cases that require interpretation by a radiologist. Low-risk, non-triaged cases can be safely ruled out and recorded as negative (i.e., no evidence of abnormality) without radiologist review. This "rule-out" approach is well-suited for pathologies with a low prevalence rate, where there are many true negative cases that can be ruled out at the cost of very few false negatives. Thus, AI triage may be suited for screening mammograms where fewer than 1% of cases are positives. 9 , 10 The goal is to safely exclude the majority of normal cases and allow radiologists to concentrate on more suspicious exams. Using AI triage for “rule out” in radiology has been proposed by several groups. 11 – 18 Recently, empirical studies have suggested that AI triage can perform comparably to, and in some cases better than, standard of care where radiologists interpret all mammograms. 12 , 19 – 21 One critical component of AI triage is determining the appropriate Triage Threshold. Most AI algorithms generate a continuous risk score for each image, with higher scores indicating a greater likelihood of pathology. 
However, where the precise cut-off should be placed for distinguishing which cases are ruled out (i.e., non-triage) versus reviewed by a radiologist (i.e., triage) remains an open question. 22 – 25 Setting the threshold depends on balancing a variety of benefits and risks, which we discuss below. For clarity, these are divided into “ruled out” and “retained” cases. In this article, we present a framework for determining the optimal AI triage threshold for screening mammogram automation. We outline key metrics to evaluate the trade-offs between benefits and risks at different thresholds. Specifically, using AI risk scores from the Mirai model applied to 114,229 screening mammograms, we simulate triage thresholds to quantify their effects on caseload reduction and cancer detection. We propose approaches for identifying the optimal threshold based on these metrics. Methods University of California, San Francisco (UCSF) Institutional Review Board gave ethical approval for this Health Insurance Portability and Accountability Act–compliant study and waived the requirement for written informed consent. Operational definitions Benefits and Risks in Ruled-out Cases. The primary benefit of ruling out cases is caseload reduction, which can be quantified as the Caseload Reduction Rate (CRR) ( Table 1 ) . The higher the threshold, the higher the CRR; that is, the caseload reduction for radiologists will be higher when a more stringent (i.e. higher) threshold is set, ruling out a larger pool of cases. However, the benefit of a higher CRR must be weighed against a variety of other considerations. View this table: View inline View popup Table 1. Key Metrics and Definitions First, one must consider how accurate an AI is at correctly ruling out cancer, given the cancer prevalence in the population; this is reflected by the AI Negative Predictive Value (AI-NPV) . AI-NPV is the probability that a patient ruled out by AI truly does not have breast cancer. 
Some negative cases ruled out by AI might have otherwise been recalled by the radiologist in standard care (interpreting mammograms without AI triaging), potentially leading to unnecessary, costly, and stress-inducing diagnostic imaging and biopsies that turn out to be benign. The aforementioned benefit must be carefully weighed against the AI Gross False Omission Rate (G-FOR, or 1-AI-NPV), which is the probability that a patient ruled out by AI actually has breast cancer. As the threshold is raised to exclude more cases, the G-FOR increases. That is, the CRR and the G-FOR come at a clear tradeoff; the more cases AI rules out, the higher the G-FOR will be. Nonetheless, it is important to note that not all cancers missed by AI in the ruled-out cases would have been detected by radiologists under standard practice (SP) (i.e., radiologist workflow absent an AI triaging model). That is, some cancer cases would likely have been missed regardless of whether AI triaging was used. To account for this, we define the AI Net False Omission Rate (N-FOR) as the G-FOR minus cancer cases that would have been missed by AI and radiologists (i.e. “deduct” cancer cases mutually missed by both radiologists and AI triage from the numerator). Benefits and Risks in Retained Cases The AI Positive Predictive Value (AI-PPV) reflects the probability that a patient has breast cancer given that the case was retained for radiologist review. The AI False Discovery Rate (AI-FDR, or 1-AI-PPV) refers to the probability that a patient retained by AI for radiologist review does not actually have breast cancer. The higher the triage threshold, the more cases AI will rule out (i.e. the larger the CRR). This means that remaining (i.e. retained) cases are more likely to be true positives, which increases the AI-PPV and reduces the AI-FDR. 
However, decreasing the number of retained cases (and by definition also increasing the number of ruled-out cases) can have important implications for how they are interpreted. Radiologist performance may improve as the retained reading pool size decreases due to reading fewer cases, 26 reading an enriched batch with higher prevalence, 27 – 29 and by consciously or unconsciously knowing that the cases were triaged 25 (i.e., anchoring or automation bias). These additional cancer detections could potentially offset a portion of the cancers missed among ruled-out cases due to the use of AI triage (N-FOR). Taking this into account, the Adjusted Net False Omission Rate (AN-FOR) refers to the probability of a patient having breast cancer that would have been detected by a radiologist in standard practice if ruled out, adjusted for the additional cancer detections due to AI triage that would have been missed in standard practice (i.e., “credit” cancer cases that radiologists would have otherwise missed without AI triaging them). Another risk worth considering is that although radiologists are more likely to catch cancer cases they would have otherwise missed had AI not retained them, it is also likely that for the same reason, radiologists may also increase unnecessary recalls (i.e., radiologists recall non-cancer cases they would not have otherwise recalled had they not been retained by triage). 25 Simulation Methods To illustrate the trade-offs associated with different AI triage thresholds, we conducted a “simulation” using risk scores from a deep learning model with screening mammography. Study Sample We conducted a single institution retrospective review of 114,229 bilateral 2D digital screening mammograms acquired between January 2006 and January 2023. Exams with histopathologically confirmed breast cancer within 12 months of the screening mammogram were considered positive. 
Exams with at least 12 months of follow-up without a breast cancer diagnosis were considered negative. Based on these criteria, 864 cases (0.76%) were identified as positive. AI Model Mammograms were assessed using Mirai, an open-source deep-learning model trained to predict breast cancer risk from mammograms. 30 , 31 One-year risk scores (henceforth “scores” or “Mirai scores”) were used to simulate triage thresholds. Threshold Simulation Framework. We simulated various triage thresholds based on Mirai scores. For each threshold, we calculated the following metrics: Caseload reduction rate ( CRR ): the percentage of screening mammograms that would have been read by a radiologist under standard care but were excluded from review due to AI-based triage. CRR=Total cases ruled out /Total cases. AI Negative predictive value ( AI-NPV ): probability of a patient not having breast cancer if ruled out. AI-NPV = TN / (FN + TN). AI Positive predictive value ( AI-PPV ): probability of a patient having breast cancer if retained. PPV = TP / (TP + FP). Gross AI False omission rate ( G-FOR ): probability of a patient having breast cancer if ruled out. G-FOR = (1-NPV) = FN / (FN + TN). AI Net False omission rate ( N-FOR ): probability of a patient having breast cancer if ruled out and the radiologist would have caught it. AI Adjusted Net False omission rate ( AN-FOR ): probability of a patient having breast cancer if ruled out and the radiologist would have caught it (i.e. N-FOR), adjusted for breast cancer cases that AI retained but radiologists would have missed. AI False discovery rate ( AI-FDR ): probability of a patient not having breast cancer if retained. AI-FDR = (1-PPV) = FP/ (TP + FP). Note, TP (true positive), TN (true negative), FP (false positive), FN (false negative). Modeling Assumptions To model AN-FOR, we simulated four hypothetical scenarios in which 10%, 30%, 50%, or 70% of missed cancers in standard practice were detected by using AI triage. 
Statistics All modeling was conducted using SAS 9.4 (SAS Cary, NC), where sensitivities and specificities were estimated using the %ROCPLOT macro, and PPV, NPV, FDR, and FOR were calculated using Bayes’ Theorem. The base rate of cancer was 0.76%. Results Approaches to Identifying Triage Threshold Data were simulated using two triage thresholds that can be generalized across practices. The first uses diagnostic performance—Youden’s J—to define a threshold by optimizing the balance of sensitivity and specificity. The second defines a threshold using an outcome, in this case, avoiding any overall increase in missed breast cancers compared to standard practice without triage. That is, this threshold is set so that an AN-FOR of 0 is achieved, meaning all cancers missed by using AI triage (rule-out cases) are then offset by an identical number of cancer cases that a radiologist would catch because they were retained. Identifying Threshold using Diagnostic Performance (Youden’s J) For these data, we observed that the Youden’s J value is a Mirai score of 0.20, achieving a sensitivity of 74% and a specificity of 75% (see Tables 2 and 4). Given a local prevalence of 0.76%, this translated into ruling out 85,220 cases and retaining 29,009 cases, resulting in a CRR of 75% (85,220/114,229). Of these ruled-out cases, 223 had breast cancer and 84,997 did not, thus achieving a G-FOR of 223/85,220 (0.26%). Of the retained cases, 641 had breast cancer and 28,368 did not, thus achieving an AI-FDR of 97.8%. View this table: View inline View popup Table 2. Error Rates Using Youden’s J Threshold Of the 223 breast cancer cases that were ruled out, 82 were not recalled. That is, 82 were also missed by radiologists in standard of care while they recalled the remaining 141, thus achieving an N-FOR of 141/85,220 or 0.17%. Regarding the retained cases, AI retained 66 cases that radiologists missed. 
Assuming radiologists detect 10%, 30%, 50% or 70% of these cases in AI triage, the adjusted net number of missed cancers in AI triage would be reduced by 7, 20, 33, or 46 to 134, 121, 108, or 95, respectively. This would correspond to Adjusted Net FOR values of 0.16%, 0.14%, 0.13%, and 0.11%, respectively. These values are visualized in Table 4 and Figure 1 . Download figure Open in new tab Figure 1. False Omission Rate by Caseload Reduction Rate. X-axis is caseload reduction rate (10% to 100%) and Y-axis is False Omission Rate (0.0% to 0.70%). Youden refers to Youden’s J (thin black line). G-FOR is Gross False Omission Rate (solid red). N-FOR is Net False Omission Rate (longest dash, bright blue). AN-FOR 10% (long dash, light blue), AN-FOR 30% (short dash, medium blue), AN-FOR 50% (short dash, dark blue), AN-FOR 70% (shortest dash, grey blue) refer to the Adjusted Net False Omission Rate at various percentages of additional breast cancers that radiologists would detect (10%, 30%, 50%, and 70% respectively) in AI-retained cases using an AI triage model relative to standard of care. Identifying Threshold using Outcomes Another approach to identifying the threshold is by considering the type of error and number of errors that would result from AI triage based on historical data. As shown in Figure 1 and Tables 3 and 4 , depending on the percentage of additional breast cancer cases (i.e., 10%, 30%, 50%, and 70%) that radiologists would have detected among those retained by AI triage (compared to standard of care), the rule-out threshold can be set by determining the caseload reduction rate where AN-FOR intersects a certain value (here 0). As discussed above, this threshold corresponds to no additional missed cancers overall (among both retained and ruled out cases) relative to standard practice. 
As illustrated in Figure 1 and Table 4 ( bold ), assuming radiologists detect an additional 30% of missed cancers in cases retained by AI, a threshold of Mirai = 0.05 would achieve an AN-FOR of 0, which would translate into a CRR of about 36%. If radiologists detect an additional 70% of missed cancers, a threshold of Mirai=0.09 would achieve an AN-FOR of 0, which would translate into a CRR of about 53%. View this table: View inline View popup Table 3. Key for Table 4 View this table: View inline View popup Table 4. Table of metrics and outcomes These CRR values can then be used to examine the corresponding number of false positives. As seen in Figure 2 , the AI-FDR was between about 98% and 99% for all thresholds considered, indicating that FDR was largely stable. Given that false positives are unlikely to vary significantly, mainly because of low cancer prevalence, 24 false negatives will be the primary focus here. Download figure Open in new tab Figure 2. False Discovery Rate by Caseload Reduction Rate. X-axis is caseload reduction rate (10% to 100%) and Y-axis is False Discovery Rate (60% to 100%). Thick black line is the relationship between Caseload Reduction Rate and False Discovery Rate Youden refers to Youden’s J (thin black line). AN-FOR 10% (long dash, light blue), AN-FOR 30% (short dash, medium blue), AN-FOR 50% (short dash, dark blue), AN-FOR 70% (shortest dash, grey blue) refer to the Adjusted Net False Omission Rate at various percentages of additional breast cancers that radiologists would detect (10%, 30%, 50%, and 70% respectively) in AI-retained cases using an AI triage model relative to standard of care. 
Comparing Thresholds To assess the trade-off between errors and benefits, we compare two thresholds: Mirai score of 0.20 corresponding to a 75% CRR (Youden’s J) and Mirai score of 0.05 corresponding to a 36% caseload reduction assuming AN-FOR of 0 where 30% of additional breast cancers would have been detected among retained cases by radiologists using an AI triage model compared to standard of care. At the 0.20 threshold, the G-FOR, N-FOR, and AN-FOR (30%) are 0.26%, 0.17%, and 0.14%, respectively. This corresponds to 223, 141, and 121 missed cancer cases for the benefit of reading 85,220 fewer cases with an FDR of 97.8%. In contrast, at the 0.05 threshold, the G-FOR, N-FOR, and AN-FOR (30%) are 0.12%, 0.07%, and 0.00% (rounded), corresponding to 49, 30, and 0 missed cancer cases for the benefit of reading 41,127 fewer cases with an FDR of 98.9%. Tables 3 and 4 provide all combinations for comparison. Discussion We demonstrate how radiology practices can consider the trade-offs of using different AI scores to determine the triage threshold. Using the Mirai AI algorithm and historical data, our simulation demonstrates how a risk-benefit analysis could be quantified. The purpose of this framework is not to advocate for a specific threshold or risk-benefit ratio but rather to demonstrate how a risk-benefit ratio could be quantified to inform policy and clinical implementation of AI triage. All numerical values provided are illustrative and are not intended as recommendations for clinical use. The optimal threshold will vary depending on the AI model, the pathology (and the corresponding trade-offs of false positives and false negatives), the AI model’s sensitivity and specificity for a local population, the prevalence of the local population, the local caseload volume and radiologist staffing ability, and institutional risk tolerances. Our simulation highlights how error rates (risk) and caseload reduction rate (benefit) can be estimated using historical data. 
This estimation not only accounts for the type of errors (i.e., false positive and false negative) but also the number of errors (i.e., false discovery and omission rates instead of false positive and negative rates) . While our simulation focused on the number of any missed cancers, the type (e.g. in situ versus invasive) and stages of cancers missed by AI could be incorporated to further assess the clinical significance of triage-related errors. What is more, we only evaluated cancers diagnosed within a year of the screening mammogram; other time frames (e.g., 1 and 2-year cancer outcomes) could be incorporated as well. Finally, for simplicity, we calculated the G-FOR, N-FOR, AN-FOR, and FDR using the direct rates, although confidence, prediction, or credible interval estimates could be used instead. Again, the point of the current study is to demonstrate the general framework of how triage could be used relative to standard of care. Fan et al. also propose evaluating AI triage using PPV and NPV. Our approach builds upon Fan and colleagues in two key ways. Namely, Fan et al. do not account for key counterfactuals such as cancers that would have been missed by radiologists without triage and cancer only detected with triage because of changes in radiologist performance. 16 In addition, Fan et al. propose using expected utility (EU) to assess AI triage. However, this relies on baseline relative utility values, which are difficult to define and when defined, may be difficult to justify, economically, ethically, and otherwise. Along with a framework for determining a threshold for AI triage of screening mammograms, there are several important considerations that must be addressed before AI triage can be implemented in clinical practice. First, prospective validation of AI rule-out strategies is needed. 
This validation will be important for understanding how AI triage impacts radiologist performance in the retained cases, such as increased recalls and increased cancer detection to empirically determine AN-FOR (whereas we simulated potential values). The validation will also be critical for ensuring that AI triage performs equitably across patient groups. Second, standards need to be developed for the safe deployment of AI triage tools in clinical settings and address approaches for ongoing monitoring of AI performance and safety over time. Third, there are psychological, ethical, legal, economic, and insurance considerations that must be weighed if implementing triage. Finally, there will need to be significant changes to the policy and regulatory landscape to allow AI triage in clinical practice. Addressing these considerations is necessary for the implementation of AI triage. Conclusion We present a framework for quantifying AI triage thresholds based on errors and benefits. Such a framework can help translate the potential of AI into strategies that help alleviate the growing workload pressures and resource limitations in radiology. Data Availability All data produced in the present study are available upon reasonable request to the authors Footnotes ↵ * shared first authorship Figure 3 was removed. Definitions were revised for clarity. Typological errors corrected. Reference 28 corrected. References 1. ↵ Bruls , R.J.M. & Kwee , R.M . Workload for radiologists during on-call hours: dramatic increase in the past 15 years . Insights Imaging 11 , 121 ( 2020 ). 2. ↵ Burns , J. , Chung , Y. , Rula , E.Y. , Duszak , R. , Jr. & Rosenkrantz , A.B . Evolving Trainee Participation in Radiologists’ Workload Using A National Medicare-Focused Analysis From 2008 to 2020 . J Am Coll Radiol 22 , 98 – 107 ( 2025 ). OpenUrl PubMed 3. ↵ Harry , E. , et al. Physician Task Load and the Risk of Burnout Among US Physicians in a National Survey . 
Jt Comm J Qual Patient Saf 47 , 76 – 85 ( 2021 ). OpenUrl CrossRef PubMed 4. ↵ Chetlen , A.L. , et al. Addressing Burnout in Radiologists . Acad Radiol 26 , 526 – 533 ( 2019 ). OpenUrl CrossRef PubMed 5. ↵ Smith-Bindman , R. , et al. Trends in Use of Medical Imaging in US Health Care Systems and in Ontario, Canada, 2000-2016 . Jama 322 , 843 – 856 ( 2019 ). OpenUrl CrossRef PubMed 6. ↵ Shin , H.J. , Han , K. , Ryu , L. & Kim , E.-K . The impact of artificial intelligence on the reading times of radiologists for chest radiographs . npj Digital Medicine 6, 82 ( 2023 ). 7. van Winkel , S.L. , et al. Impact of artificial intelligence support on accuracy and reading time in breast tomosynthesis image interpretation: a multi-reader multi-case study . Eur Radiol 31 , 8682 – 8691 ( 2021 ). OpenUrl PubMed 8. ↵ Conant , E.F. , et al. Improving Accuracy and Efficiency with Concurrent Use of Artificial Intelligence for Digital Breast Tomosynthesis . Radiol Artif Intell 1, e180096 ( 2019 ). 9. ↵ Ellington , T.D. , et al. Trends in Breast Cancer Incidence, by Race, Ethnicity, and Age Among Women Aged ≥20 Years - United States, 1999-2018 . MMWR Morb Mortal Wkly Rep 71 , 43-47 ( 2022 ). 10. ↵ Grabler , P. , Sighoko , D. , Wang , L. , Allgood , K. & Ansell , D . Recall and Cancer Detection Rates for Screening Mammography: Finding the Sweet Spot . AJR Am J Roentgenol 208 , 208 – 213 ( 2017 ). OpenUrl PubMed 11. ↵ Larsen , M. , Aglen , C.F. , Hoff , S.R. , Lund-Hanssen , H. & Hofvind , S . Possible strategies for use of artificial intelligence in screen-reading of mammograms, based on retrospective data from 122,969 screening examinations . Eur Radiol 32 , 8238 – 8246 ( 2022 ). OpenUrl PubMed 12. ↵ Rodriguez-Ruiz , A. , et al. Can we reduce the workload of mammographic screening by automatic identification of normal exams with artificial intelligence? A feasibility study . Eur Radiol 29 , 4825 – 4832 ( 2019 ). OpenUrl CrossRef PubMed 13. Plesner , L.L. , et al. 
Using AI to Identify Unremarkable Chest Radiographs for Automatic Reporting . Radiology 312 , e240272 ( 2024 ). OpenUrl PubMed 14. Pedemonte , S. , et al. A Semiautonomous Deep Learning System to Reduce False Positives in Screening Mammography . Radiol Artif Intell 6, e230033 ( 2024 ). 15. Tommi Keski-Filppula , M.N. , Marianne Haapea , Naglis Ramanauskas , Osmo Tervonen . Using artificial intelligence to detect chest X-rays with no significant findings in a primary health care setting in Oulu, Finland . arXiv ( 2022 ). 16. ↵ Kwok Lung Fan , Y.L.E.T. , Weijie Chen , Craig K. Abbey , Frank W Samuelson . Use of Expected Utility (EU) to Evaluate Artificial Intelligence-Enabled Rule-Out Devices for Mammography Screening . arXiv ( 2024 ). 17. Obuchowski , N.A. & Bullen , J.A . Statistical considerations for testing an AI algorithm used for prescreening lung CT images . Contemp Clin Trials Commun 16 , 100434 ( 2019 ). 18. ↵ Krupinski , E.A . Artificial Intelligence: Lessons Learned from Radiology . Healthcare Transformation , 5 – 10 ( 2019 ). 19. ↵ Yoon , S.H. , et al. Use of artificial intelligence in triaging of chest radiographs to reduce radiologists’ workload . European Radiology 34 , 1094 – 1103 ( 2024 ). OpenUrl CrossRef PubMed 20. Lång , K. , Hofvind , S. , Rodríguez-Ruiz , A. & Andersson , I . Can artificial intelligence reduce the interval cancer rate in mammography screening? Eur Radiol 31 , 5940 – 5947 ( 2021 ). OpenUrl PubMed 21. ↵ Yala , A. , Schuster , T. , Miles , R. , Barzilay , R. & Lehman , C . A Deep Learning Model to Triage Screening Mammograms: A Simulation Study . Radiology 293 , 38 – 46 ( 2019 ). OpenUrl CrossRef PubMed 22. ↵ Dembrower , K. , et al. Effect of artificial intelligence-based triaging of breast cancer screening mammograms on cancer detection and radiologist workload: a retrospective simulation study . Lancet Digit Health 2 , e468 – e474 ( 2020 ). OpenUrl 23. Xavier , D. , et al. 
Artificial intelligence for triaging of breast cancer screening mammograms and workload reduction: A meta-analysis of a deep learning software . J Med Screen 31 , 157 – 165 ( 2024 ). OpenUrl PubMed 24. ↵ Scaringi , J.A. , et al. Implementing an AI algorithm in the clinical setting: a case study for the accuracy paradox . Eur Radiol , 1 – 7 ( 2024 ). 25. ↵ Bernstein , M.H. , et al. Can incorrect artificial intelligence (AI) results impact radiologists, and if so, what can we do about it? A multi-reader pilot study of lung cancer detection with chest radiography . Eur Radiol 33 , 8263 – 8269 ( 2023 ). OpenUrl CrossRef PubMed 26. ↵ Krupinski , E.A. , Berbaum , K.S. , Caldwell , R.T. , Schartz , K.M. & Kim , J . Long radiology workdays reduce detection and accommodation accuracy . J Am Coll Radiol 7 , 698 – 704 ( 2010 ). OpenUrl CrossRef PubMed 27. ↵ Wolfe , J.M. , et al. Low target prevalence is a stubborn source of errors in visual search tasks . J Exp Psychol Gen 136 , 623 – 638 ( 2007 ). OpenUrl CrossRef 28. Al-Bazzaz , H. , Janicijevic , M. & Strand , F . Reader bias in breast cancer screening related to cancer prevalence and artificial intelligence decision support—a reader study . Eur Radiol 34 , 5415 – 5424 ( 2024 ). OpenUrl PubMed 29. ↵ Egglin , T.K. & Feinstein , A.R . Context bias: a problem in diagnostic radiology . JAMA 276 , 1752 – 1755 ( 1996 ). OpenUrl CrossRef PubMed Web of Science 30. ↵ Yala , A. , et al. Multi-Institutional Validation of a Mammography-Based Breast Cancer Risk Model . J Clin Oncol 40 , 1732 – 1740 ( 2022 ). OpenUrl CrossRef PubMed 31. ↵ Yala , A. , et al. Toward robust mammography-based models for breast cancer risk . Sci Transl Med 13 ( 2021 ). View the discussion thread. Back to top Previous Next Posted June 10, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. 
Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Triage with AI: A Rule-out Framework Quantifying the Risks and Benefits of Screening Mammogram Automation Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Triage with AI: A Rule-out Framework Quantifying the Risks and Benefits of Screening Mammogram Automation Micheal H. Bernstein , Maggie Chung , Adam Yala , Grayson L. Baird medRxiv 2025.04.25.25326396; doi: https://doi.org/10.1101/2025.04.25.25326396 Share This Article: Copy Citation Tools Triage with AI: A Rule-out Framework Quantifying the Risks and Benefits of Screening Mammogram Automation Micheal H. Bernstein , Maggie Chung , Adam Yala , Grayson L. 
Baird medRxiv 2025.04.25.25326396; doi: https://doi.org/10.1101/2025.04.25.25326396 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Radiology and Imaging Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9219) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffb1675bc9bdfa9',t:'MTc3OTQ0NTIyMA=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citing papers typically take a year or two to appear, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00