Zero-Shot Prompting is the Most Accurate and Scalable Strategy for Abstracting the Mayo Endoscopic Subscore from Colonoscopy Reports Using GPT-4

doi:10.1101/2024.03.22.24304745

Zero-Shot Prompting is the Most Accurate and Scalable Strategy for Abstracting the Mayo Endoscopic Subscore from Colonoscopy Reports Using GPT-4

2024 · doi:10.1101/2024.03.22.24304745

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 19,525 characters · extracted from preprint-html · click to expand

Zero-Shot Prompting is the Most Accurate and Scalable Strategy for Abstracting the Mayo Endoscopic Subscore from Colonoscopy Reports Using GPT-4 | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Zero-Shot Prompting is the Most Accurate and Scalable Strategy for Abstracting the Mayo Endoscopic Subscore from Colonoscopy Reports Using GPT-4 View ORCID Profile Richard P. Yim , View ORCID Profile Vivek A. Rudrapatna doi: https://doi.org/10.1101/2024.03.22.24304745 Richard P. 
Yim 1 Bakar Computational Health Sciences Institute, University of California – San Francisco , San Francisco, CA 94143 MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Richard P. Yim Vivek A. Rudrapatna 1 Bakar Computational Health Sciences Institute, University of California – San Francisco , San Francisco, CA 94143 2 Division of Gastroenterology and Hepatology, Department of Medicine, University of California – San Francisco , San Francisco, CA 94143 MD, PHD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Vivek A. Rudrapatna For correspondence: vivek.rudrapatna{at}ucsf.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Structured Abstract Introduction Large-language models can help extract information from clinical notes, making them potentially useful for research in ulcerative colitis. However, it remains unclear if these models will scale well in practice. Methods We analyzed the performance and cost of programmatically using GPT-4 to abstract Mayo endoscopic subscores (MES) from 499 colonoscopy reports using different prompting strategies. Results Zero-shot prompting, where GPT-4 is instructed without examples, was most accurate (83.55%) and cost-effective ($0.097/note). Discussion Using GPT-4 to automatically curate the MES and other variables is a practical strategy for quantifying UC activity and measuring improvements to clinical care. Introduction The Mayo endoscopic subscore (MES) is a core measure of ulcerative colitis (UC) activity ( 1 ), but is not always explicitly documented in colonoscopy reports. Thus, clinical studies that use the MES frequently require manual review of these reports to abstract these scores from free-text descriptions. 
Large language models like GPT-4 have shown promise in their ability to extract information from clinical notes. Prior studies have used the more user-friendly chatbot interface to interact with these models. However, these models can also be used in a programmatic fashion, raising the possibility of being natively deployed within electronic health record (EHR) systems to dynamically maintain disease registries, optimize study recruitment, and support quality improvement. As a next step, we studied the scalability and cost-effectiveness of using GPT-4 to automate the extraction of the modified MES. We hypothesized that more sophisticated, n-shot and iterative-style prompts would yield more accurate results despite higher costs associated with this strategy. Methods We utilized an existing set of 499 annotated colonoscopy reports sourced from two hospitals in California: 217 were from San Francisco General Hospital (SFGH), a safety net hospital, and 282 from the University of California, San Francisco Health, a tertiary care hospital. These reports were annotated based on 1) their suitability for MES scoring (e.g. clear diagnosis of UC, surgically unaltered anatomy), and 2) the modified MES ( 4 ) if appropriate. We developed two generic conversation templates for zero-shot and n -shot prompting to programmatically interact with GPT-4-turbo via LangChain ( 5 ), a framework that enables context-rich prompts, and UCSF Versa, a PHI-compliant programmatic interface with GPT-4. (See Table 1 , Supplemental Digital Content 1 , for precise prompt templates and protocol texts.) We refer to n -shot prompting as providing n colonoscopy reports per Mayo score plus a non-Mayo scorable report in addition to the scoring protocol; zero-shot refers to providing only the scoring protocol. For these templates we also studied the performance of GPT-4 with prompts that asked it to not only include the MES, but an explanation as well. 
We also provided GPT-4 with a parsed variation of the colonoscopy report for UCSF and SFGH centers where only the main relevant text of the colonoscopy procedure report was provided as opposed to the colonoscopy report text in its entirety, which includes extraneous text strings. See Method Details, Supplemental Digital Content 1 , for additional explanation. View this table: View inline View popup Download powerpoint Table 1. Performance results, classification error, and cost are shown across each prompt template variation and data variation. Bolded text means higher is better; italicized text means lower is better. Best numeric values along each measurement have been underlined. Table has been partitioned according to template variation. Results Zero-shot prompts on trimmed notes produced the best performing results consistently on both UCSF (81.45-83.55% weighted average accuracy) and SFGH (73.15-78.11% weighted average accuracy) reports ( Table 1 ). We found that n -shot prompting actually decreased classification performance across multiple metrics, rejecting our hypothesis that providing examples would improve GPT-4’s performance. Further, the cost of n -shot prompting is prohibitively more expensive on average (e.g., more than 8 times the cost for UCSF reports and 11 times for SFGH reports between n -shot and zero-shot prompting per note ). For Mayo scorable accuracy, whether an MES can be assigned to the colonoscopy report, we find that GPT-4 performs very well for zero-shot prompt templates (accuracy 90.28-93.66%), where n -shot prompting reduced its accuracy (best score at 84.28%). We also studied results for splitting Mayo scorable reports and MES separately (“zero-shot, two-task prompting”) but performance gains were negligible ( Table 1 ). 
With respect to prompt variations such as parsing colonoscopy reports and soliciting explanations for MES values, across all strata we find that the greatest difference in performance is 4.02% for trimming and -3.31% for requiring an explanation ( Table 2 ). In particular, parsing the text generally increases performance across all measures and decreases under classification as well. Interestingly, we find that across standard statistical learning metrics, the UCSF data shows an increase in performance when requiring explanation of the MES and a decrease in performance for SFGH data although the magnitude of these differences is minimal (worst difference in magnitude amongst accuracy, precision, recall and F1-score is 2.76%). View this table: View inline View popup Download powerpoint Table 2. Differences in performance stratified on parsing colonoscopy report text ( Parsed Report - Original Report ) and requiring explanation (Prompt with Explanation - Prompt without explanation ). Discussion This is among the first few studies to use an LLM in a programmatic fashion to extract study variables from clinical notes. We found that the most accurate and scalable prompting strategy is conveniently the simplest when it comes to producing MES scores from colonoscopy reports. Zero-shot prompts are not only easy to implement, but cost effective. GPT-4 proves itself to be reasonably effective at being able to simultaneously determine whether a colonoscopy procedure report is Mayo scorable, and providing an MES when it is. Further, we find that n-shot prompting is unreliable both in performance and cost. Beyond template variations of zero-shot, n -shot, and zero-shot, two-task prompting, our study explores prompt parameter interactions in GPT-4 performance that are currently absent in the literature (e.g., explanation requirement and text parsing). Further, our study explores the possibility of the generalizability of LLM information extraction across different centers. 
The primary limitation of our study then is a sophisticated endpoint. For instance, although the colonoscopy report distribution for IBD patients is representative across UCSF and SFGH centers, we have limited class representation for more severe MES graded UC (Mayo scores 2 and 3 in particular). Other studies in the literature explore continuously valued endpoints as well as multidimensional endpoints extracted from clinical text ( 6 , 7 ). However, these studies focus primarily on the performance of GPT-4 and LLMs on clinical notes. There has been little consideration and commentary on prompt engineering and consequently the costs of deployment—in other words, the practicality of deploying GPT-4 for other studies. Deploying four-shot prompts on 282 parsed UCSF colonoscopy reports, requiring GPT-4 to produce an explanation, comes out to an average total cost of $418.77. There are thousands of colonoscopy procedure reports for IBD patients at our medical centers, but billions of notes across all diseases and patients in the US ( 8 ). Generative AI is and will be very expensive to deploy across all clinical areas and target variables. While LLMs like GPT-4 will enable retrospective information extraction, and consequently new observational studies using EHR data, we strongly advise clinical researchers to be mindful of various prompting strategies and their costs. Data Availability All data is PHI-compliant and is not to be publicly available. Data Acknowledgement The authors thank UCSF Academic Research Services for technical support related to enabling software in a secure, PHI compliant environment; UCSF AI Tiger Team for facilitating and managing access to Versa API (UCSF secure access to Microsoft Azure, OpenAI Large Language Models); and the Chancellor’s Task Force for Generative AI. 
Footnotes Financial/Grant Support Research reported in this publication was supported by the National Library of Medicine of the National Institutes of Health under Award Number K99LM014099, the National Center for Advancing Translational Sciences, National Institutes of Health, through UCSF-CTSI Grant Number UL1 TR001872, as well as the UCLA Clinical and Translational Science Institute through grant number UL1TR001881. Its contents are solely the responsibility of the authors and do not necessarily represent the official views of the NIH. This research project has benefitted from the Microsoft Accelerate Foundation Models Research (AFMR) grant program through which leading foundation models hosted by Microsoft Azure along with access to Azure credits were provided to conduct the research. Conflicts of Interest: VAR receives research support from Alnylam, Takeda, Merck, Genentech, Blueprint Medicines, Stryker, Mitsubishi Tanabe, and Janssen. He also is a shareholder of ZebraMD. RPY has nothing to disclose. Writing Assistance: None. References 1. ↵ Lewis JD , et al. Use of the noninvasive components of the mayo score to assess clinical response in Ulcerative Colitis : Inflammatory Bowel Diseases . 2008 ; 14 ( 12 ): 1660 – 1666 . OpenUrl CrossRef PubMed Web of Science 2. Fink MA , Bischoff A , Fink CA , et al. Potential of ChatGPT and GPT-4 for Data Mining of Free-Text CT Reports on Lung Cancer . Radiology . 2023 ; 308 ( 3 ): e231362 . OpenUrl CrossRef PubMed 3. Silverman AL , Bhasuran B , Mosenia A , et al. Accurate, Robust, and Scalable Abstraction of Mayo Endoscopic Subscores from Colonoscopy Reports . Gastroenterology . 2022 ; 162 ( 7 ): S - 617 -S-618. OpenUrl 4. ↵ Richards K , et al. Ulcerative Colitis: Developing Drugs for Treatment Guidance for Industry . USHHS, CEDR, CEBR. Federal Registrar ; April 29, 2022 . 5. ↵ Harrison C. LangChain AI . LangChain (Version 1.2.0) [Computer software] . https://github.com/langchain-ai/langchain . Released October 17, 2022 . 
6. ↵ Ge J , et al. A Comparison of Large Language Model versus Manual Chart Review for Extraction of Data Elements from the Electronic Health Record . Gastroenterology ; 2023 . 7. ↵ Williams CYK , et al. Assessing Clinical Acuity in the Emergency Department Using the GPT-3.5 Artificial Intelligence Model . Health Informatics ; 2023 . 8. ↵ Dinov ID . Volume and value of big healthcare data . J Med Stat Inform . 2016 ; 4 ( 1 ): 3 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted March 24, 2024. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Zero-Shot Prompting is the Most Accurate and Scalable Strategy for Abstracting the Mayo Endoscopic Subscore from Colonoscopy Reports Using GPT-4 Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Zero-Shot Prompting is the Most Accurate and Scalable Strategy for Abstracting the Mayo Endoscopic Subscore from Colonoscopy Reports Using GPT-4 Richard P. Yim , Vivek A. Rudrapatna medRxiv 2024.03.22.24304745; doi: https://doi.org/10.1101/2024.03.22.24304745 Share This Article: Copy Citation Tools Zero-Shot Prompting is the Most Accurate and Scalable Strategy for Abstracting the Mayo Endoscopic Subscore from Colonoscopy Reports Using GPT-4 Richard P. Yim , Vivek A. 
Rudrapatna medRxiv 2024.03.22.24304745; doi: https://doi.org/10.1101/2024.03.22.24304745 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (302) Cardiovascular Medicine (4453) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15242) Forensic Medicine (30) Gastroenterology (1131) Genetic and Genomic Medicine (6615) Geriatric Medicine (669) Health Economics (1001) Health Informatics (4552) Health Policy (1372) Health Systems and Quality Improvement (1614) Hematology (543) HIV/AIDS (1270) Infectious Diseases (except HIV/AIDS) (15929) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6625) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3344) Ophthalmology (979) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5461) Public and Global Health (9252) Radiology and Imaging (2207) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1197) Rheumatology (597) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a02bf0e11ce1de16',t:'MTc3OTk1NzQ4Mw=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00