EndoGPT: A Proof-of-concept Large Language Model Based Assistant for the Management of Thyroid Nodules

doi:10.1101/2024.05.29.24308002

EndoGPT: A Proof-of-concept Large Language Model Based Assistant for the Management of Thyroid Nodules

2024 · doi:10.1101/2024.05.29.24308002

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 17,497 characters · extracted from preprint-html · click to expand

EndoGPT: A Proof-of-concept Large Language Model Based Assistant for the Management of Thyroid Nodules | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search EndoGPT: A Proof-of-concept Large Language Model Based Assistant for the Management of Thyroid Nodules View ORCID Profile Meghal Shah , View ORCID Profile Eric J. Kuo , View ORCID Profile Jennifer H. Kuo , View ORCID Profile Shawn Hsu , View ORCID Profile Catherine McManus , View ORCID Profile Rachel Liou , View ORCID Profile James A. Lee , View ORCID Profile Tejas S. 
Sathe doi: https://doi.org/10.1101/2024.05.29.24308002 Meghal Shah 1 Columbia University Irving Medical Center MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Meghal Shah For correspondence: ms5835{at}cumc.columbia.edu Eric J. Kuo 1 Columbia University Irving Medical Center MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Eric J. Kuo Jennifer H. Kuo 1 Columbia University Irving Medical Center MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jennifer H. Kuo Shawn Hsu 1 Columbia University Irving Medical Center MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Shawn Hsu Catherine McManus 1 Columbia University Irving Medical Center MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Catherine McManus Rachel Liou 1 Columbia University Irving Medical Center MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Rachel Liou James A. Lee 1 Columbia University Irving Medical Center MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for James A. Lee Tejas S. Sathe 1 Columbia University Irving Medical Center MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tejas S. Sathe Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Large language models (LLMs) are increasingly being explored for their potential to simulate clinical reasoning. 
Here, we demonstrate our initial experience using the GPT-4o LLM along with prompt engineering and knowledge retrieval to develop EndoGPT, a clinical decision support tool for the management of thyroid nodules. In a pilot study of 50 cases, EndoGPT demonstrated an 83% concordance rate with expert surgeons’ assessments and plans. The highest concordance was in diagnosis (93%), followed by the need for an operation (82%) and type of operation (69%). This work suggests that LLM-based assistants may play a useful role in assisting clinicians in the future. Introduction Though large language models (LLMs) demonstrate the ability to answer medical questions, their ability to simulate clinical reasoning is a topic of current exploration. Recent technical advances allow LLMs to be optimized using prompt engineering and knowledge retrieval from data sources, even without specific fine-tuning [1, 2]. Here, we describe our implementation of these techniques to prototype an LLM-based clinical decision support tool for the management of thyroid nodules. Methods We abstracted deidentified data from clinic notes of patients referred for evaluation of thyroid nodules or thyroid cancer. We built an assistant (EndoGPT) based on the GPT-4o LLM that could ingest this data and output a predicted assessment and plan (A&P). To provide EndoGPT with additional context, we uploaded the 2015 American Thyroid Association Management Guidelines for Thyroid Nodules and Differentiated Thyroid Cancer as a reference [3]. EndoGPT could then utilize relevant components of the guidelines using vector embeddings and similarity search techniques [4]. For each patient scenario, we generated five predicted A&Ps and ensembled them into a compound A&P using a second assistant. After pre-testing EndoGPT on 25 patient scenarios, we analyzed errors, wrote instructions to avoid them, and added this data to EndoGPT’s prompt for additional context before testing it on new scenarios (Figure 1). 
Download figure Open in new tab Figure 1: We built an LLM-based assistant called EndoGPT. The input to EndoGPT is a deidentified clinic note excluding the expert surgeon’s assessment and plan. EndoGPT was built using the GPT-4o LLM. We generated vector embeddings from the 2015 American Thyroid Association Management Guidelines for Thyroid Nodules and Differentiated Thyroid Cancer and used vector similarity to determine which components of the guidelines would generate the most useful context for the introductory prompt based on the patient scenario. We also provided feedback generated from a pretest of 25 cases. After running the first assistant five times, we provided all five responses to a compounding assistant which took the most commonly appearing components of each and composited them together. We then evaluated the similarity between the expert A&P and the predicted A&P across the domains of (1) diagnosis, (2) the need for an operation, and (3) type of operation. To evaluate EndoGPT, we measured concordance between the expert-generated and the predicted A&Ps across three domains: (1) diagnosis, (2) need for an operation, and (3) type of operation ( Figure 1 ). This study was deemed exempt by the Columbia University Institutional Review Board ( Protocol AAAV1151 ). Our code is available on GitHub. Results We tested EndoGPT on 50 patient scenarios and achieved an overall concordance of 83%. EndoGPT agreed with the expert’s diagnosis completely in 44/50 cases and partially in 5/50 cases (93% concordant). Moreover, the assistant agreed with the expert’s need for an operation in 41/50 cases (82% concordant). When the expert recommended surgery (n=36 cases), the assistant agreed with the expert’s choice of operation completely in 24 cases and partially in two cases (69% concordant) ( Figure 2 ). Details on the differences in A&Ps are described in Table S1 . 
Download figure Open in new tab Figure 2: EndoGPT concordance scores in the domains of diagnosis, need for an operation, type of operation, and overall. When assessing concordance in diagnosis and operation type, we allowed partial credit for partially concordant responses. Discussion Our early experience with EndoGPT suggests that surgeons who may not have the technical resources to build their own LLMs can still use general-purpose models like GPT-4o to develop clinical decision support tools. We achieved an 83% concordance with expert A&Ps using knowledge-retrieval and prompt engineering. Our model was most concordant when predicting a diagnosis and least concordant when suggesting a specific operation. Specific areas of recurring discordance were in the type of lymph node dissection (LND) recommended (e.g., EndoGPT did not assign a laterality to central LND) or the recommendation of surgery for benign nodules causing compressive symptoms (rather than performing fine needle aspiration). The latter may have occurred because we gave EndoGPT specific feedback during pretesting to consider surgery for benign, compressive nodules, highlighting the risk of over-prompting the model. In some cases, because we tested concordance with a singular A&P, it is possible that EndoGPT suggested a safe alternative approach. Thus, we may be underestimating EndoGPT’s overall accuracy. In future experiments, a panel of experts can assess EndoGPT’s responses for accuracy. Though not intended to replace physician evaluation, tools like EndoGPT may help train surgical residents, assist non-specialist providers with initial workup and management, or make technical documents such as guidelines more accessible to patients. Utility will likely be greatest in areas of medicine where clear guidelines already exist. Further studies will be needed to fully optimize this system for patient care. Data Availability Our data and code are available on GitHub. 
https://github.com/tsathe/endogpt Supplementary Tables View this table: View inline View popup Table S1: EndoGPT concordance scores in the domains of diagnosis (Dx), need for an operation (Op?), and type of operation (Type). When EndoGPT achieved a less than perfect score, we explain the areas of discordance. FNA = fine needle aspiration; PTC = papillary thyroid carcinoma; LND = lymph node dissection. References [1]. ↵ Harsha Nori , Yin Tat Lee , Sheng Zhang , Dean Carignan , Richard Edgar , Nicolo Fusi , Nicholas King , Jonathan Larson , Yuanzhi Li , Weishung Liu , Renqian Luo , Scott Mayer McKinney , Robert Osazuwa Ness , Hoifung Poon , Tao Qin , Naoto Usuyama , Chris White , and Eric Horvitz . Can generalist foundation models outcompete Special-Purpose tuning? case study in medicine . November 2023 . URL http://arxiv.org/abs/2311.16452 . [2]. ↵ Tejas S Sathe , Joshua Roshal , Ariana Naaseh , Joseph C L’Huillier , Sergio M Navarro , and Caitlin Silvestri . How I GPT it: Development of custom artificial intelligence (AI) chatbots for surgical education . J. Surg. Educ ., 81 ( 6 ): 772 – 775 , June 2024 . ISSN 1931-7204 , 1878-7452 . doi: 10.1016/j.jsurg.2024.03.004 . URL http://dx.doi.org/10.1016/j.jsurg.2024.03.004 . OpenUrl CrossRef [3]. ↵ Bryan R Haugen , Erik K Alexander , Keith C Bible , Gerard M Doherty , Susan J Mandel , Yuri E Nikiforov , Furio Pacini , Gregory W Randolph , Anna M Sawka , Martin Schlumberger , Kathryn G Schuff , Steven I Sherman , Julie Ann Sosa , David L Steward , R Michael Tuttle , and Leonard Wartofsky . 2015 american thyroid association management guidelines for adult patients with thyroid nodules and differentiated thyroid cancer: The american thyroid association guidelines task force on thyroid nodules and differentiated thyroid cancer . Thyroid , 26 ( 1 ): 1 – 133 , January 2016 . ISSN 1050-7256 , 1557-9077 . doi: 10.1089/thy.2015.0020 . URL http://dx.doi.org/10.1089/thy.2015.0020 . OpenUrl CrossRef PubMed [4]. ↵ Underfitted . 
Building a RAG application from scratch using python, LangChain, and the OpenAI API , March 2024 . URL https://www.youtube.com/watch?v=BrsocJb-fAo . View the discussion thread. Back to top Previous Next Posted May 31, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following EndoGPT: A Proof-of-concept Large Language Model Based Assistant for the Management of Thyroid Nodules Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share EndoGPT: A Proof-of-concept Large Language Model Based Assistant for the Management of Thyroid Nodules Meghal Shah , Eric J. Kuo , Jennifer H. Kuo , Shawn Hsu , Catherine McManus , Rachel Liou , James A. Lee , Tejas S. Sathe medRxiv 2024.05.29.24308002; doi: https://doi.org/10.1101/2024.05.29.24308002 Share This Article: Copy Citation Tools EndoGPT: A Proof-of-concept Large Language Model Based Assistant for the Management of Thyroid Nodules Meghal Shah , Eric J. Kuo , Jennifer H. Kuo , Shawn Hsu , Catherine McManus , Rachel Liou , James A. Lee , Tejas S. 
Sathe medRxiv 2024.05.29.24308002; doi: https://doi.org/10.1101/2024.05.29.24308002 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Surgery Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (302) Cardiovascular Medicine (4453) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15242) Forensic Medicine (30) Gastroenterology (1131) Genetic and Genomic Medicine (6615) Geriatric Medicine (669) Health Economics (1001) Health Informatics (4552) Health Policy (1372) Health Systems and Quality Improvement (1614) Hematology (543) HIV/AIDS (1270) Infectious Diseases (except HIV/AIDS) (15929) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6625) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3344) Ophthalmology (979) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5461) Public and Global Health (9252) Radiology and Imaging (2207) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1197) Rheumatology (597) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a02c4be5bf3f1b23',t:'MTc3OTk2MTIxMQ=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00