Customizing GPT-4 for clinical information retrieval from standard operating procedures

preprint OA: gold CC-BY-NC-4.0
📄 Open PDF Full text JSON View at publisher
Full text 48,544 characters · extracted from preprint-html · click to expand
Customizing GPT-4 for clinical information retrieval from standard operating procedures | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Customizing GPT-4 for clinical information retrieval from standard operating procedures View ORCID Profile Hannah Sophie Muti , Chiara Maria Lavinia Löffler , Marie-Elisabeth Leßmann , Esther Helene Stüker , Johanna Kirchberg , Malte von Bonin , Martin Kolditz , Dyke Ferber , Katharina Egger-Heidrich , Felix Merboth , Daniel E. 
Stange , Marius Distler , Jakob Nikolas Kather doi: https://doi.org/10.1101/2024.06.24.24309221 Hannah Sophie Muti 1 Department for Visceral, Thoracic and Vascular Surgery, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany 2 Else Kröner Fresenius Center for Digital Health, Dresden University of Technology , Dresden, Germany 3 National Center for Tumor Diseases Dresden (NCT/UCC), a partnership between DKFZ, Faculty of Medicine and University Hospital Carl Gustav Carus, TUD Dresden University of Technology, and Helmholtz-Zentrum Dresden - Rossendorf (HZDR) , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hannah Sophie Muti For correspondence: hannah_sophie.muti{at}tu-dresden.de Chiara Maria Lavinia Löffler 2 Else Kröner Fresenius Center for Digital Health, Dresden University of Technology , Dresden, Germany 4 Medical Department 1, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Marie-Elisabeth Leßmann 2 Else Kröner Fresenius Center for Digital Health, Dresden University of Technology , Dresden, Germany 4 Medical Department 1, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Esther Helene Stüker 2 Else Kröner Fresenius Center for Digital Health, Dresden University of Technology , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Johanna Kirchberg 1 Department for Visceral, Thoracic and Vascular Surgery, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany 3 National Center for 
Tumor Diseases Dresden (NCT/UCC), a partnership between DKFZ, Faculty of Medicine and University Hospital Carl Gustav Carus, TUD Dresden University of Technology, and Helmholtz-Zentrum Dresden - Rossendorf (HZDR) , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Malte von Bonin 4 Medical Department 1, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Martin Kolditz 4 Medical Department 1, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany 5 East German Lung Center Dresden-Coswig , Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dyke Ferber 2 Else Kröner Fresenius Center for Digital Health, Dresden University of Technology , Dresden, Germany 6 Department of Medical Oncology, National Center for Tumor Diseases (NCT), Heidelberg University Hospital , Heidelberg, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Katharina Egger-Heidrich 4 Medical Department 1, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Felix Merboth 1 Department for Visceral, Thoracic and Vascular Surgery, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany 3 National Center for Tumor Diseases Dresden (NCT/UCC), a partnership between DKFZ, Faculty of Medicine and University Hospital Carl Gustav Carus, TUD Dresden University of Technology, and Helmholtz-Zentrum Dresden - Rossendorf (HZDR) , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for 
this author on this site Daniel E. Stange 1 Department for Visceral, Thoracic and Vascular Surgery, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany 3 National Center for Tumor Diseases Dresden (NCT/UCC), a partnership between DKFZ, Faculty of Medicine and University Hospital Carl Gustav Carus, TUD Dresden University of Technology, and Helmholtz-Zentrum Dresden - Rossendorf (HZDR) , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Marius Distler 1 Department for Visceral, Thoracic and Vascular Surgery, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany 3 National Center for Tumor Diseases Dresden (NCT/UCC), a partnership between DKFZ, Faculty of Medicine and University Hospital Carl Gustav Carus, TUD Dresden University of Technology, and Helmholtz-Zentrum Dresden - Rossendorf (HZDR) , Dresden, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jakob Nikolas Kather 2 Else Kröner Fresenius Center for Digital Health, Dresden University of Technology , Dresden, Germany 4 Medical Department 1, University Hospital and Faculty of Medicine Carl Gustav Carus, Technische Universität Dresden , Dresden, Germany 6 Department of Medical Oncology, National Center for Tumor Diseases (NCT), Heidelberg University Hospital , Heidelberg, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Background The increasing complexity of medical knowledge necessitates efficient and reliable information access systems in clinical settings. For quality purposes, most hospitals use standard operating procedures (SOPs) for information management and implementation of local treatment standards. 
However, in clinical routine, this information is not always easily accessible. Customized Large Language Models (LLMs) may offer a tailored solution, but need thorough evaluation prior to clinical implementation. Objective To customize an LLM to retrieve information from hospital-specific SOPs, to evaluate its accuracy for clinical use and to compare different prompting strategies and large language models. Methods We customized GPT-4 with a predefined system prompt and 10 SOPs from four departments at the University Hospital Dresden. The model’s performance was evaluated through 30 predefined clinical questions of varying degree of detail, which were assessed by five observers with different levels of medical expertise through simple and interactive question-and-answering (Q&A). We assessed answer completeness, correctness and sufficiency for clinical use and the impact of prompt design on model performance. Finally, we compared the performance of GPT-4 with Claude-3-opus. Results Interactive Q&A yielded the highest rate of completeness (80%), correctness (83%) and sufficiency (60%). Acceptance of the LLM’s answer was higher among early-career medical staff. Degree of detail of the question prompt influenced answer accuracy, with intermediate-detail prompts achieving the highest sufficiency rates. Comparing LLMs, Claude-3-opus outperformed GPT-4 in providing sufficient answers (70.0% vs. 36.7%) and required fewer iterations for satisfactory responses. Both models adhered to the system prompt more effectively in the self-coded pipeline than in the browser application. All observers showed discrepancies between correctness and sufficiency of the answers, which were rooted in the representation of information in the SOPs. Conclusion Interactively querying customized LLMs can enhance clinical information retrieval, though expert oversight remains essential to ensure a safe application of this technology. 
After broader evaluation and with basic knowledge in prompt engineering, customized LLMs can be an efficient, clinically applicable tool. Introduction The field of medicine has experienced a substantial knowledge gain over the past century.[ 1 ] In healthcare systems across the globe, shortage of staff and aging populations make time a rare and valuable resource.[ 2 ] Digital transformation has been shown to increase workflow efficiency, patient satisfaction and physician well being.[ 3 ] Large Language Models (LLMs), a Natural Language Processing (NLP) based technology, have substantially evolved and been applied to clinical research.[ 4 , 5 ] Public interest was drawn to LLMs through Chat-GPT, an openly available LLM with a chat-based interface developed by OpenAI.[ 6 ] This made the technology accessible to the general public and raised a debate about its implications in healthcare. [ 7 ] LLMs have been shown to pass medical licensing exams[ 8 , 9 ], develop models for medical data analysis[ 10 ] or generate summaries of patient interactions and medical histories.[ 11 ] However, LLMs have also been criticized for delivering invalid information referred to as hallucinations, which could be harmful in a clinical setting.[ 10 , 12 ] With GPT-4, OpenAI introduced the option to customize LLMs through system instructions (system prompt) and submission of individual documents to the model.[ 13 ] This can be achieved through Retrieval Augmented Generation (RAG), where the documents and prompts are transformed into vector embeddings, over which a similarity search identifies relevant information within the embeddings.[ 14 ] This has been shown to achieve superior results compared to the use of a general LLM in medical use cases.[ 15 ] In clinical routine, hospitals introduce best practice guidelines in the form of standard operating procedures (SOPs), which improves quality of care adapted to local treatment standards, resources or national guidelines.[ 16 , 17 ] 
Knowledge of a general LLM might not always concur with local standards, while a customized LLM can reduce the rate of hallucinations and the need for human intervention[ 18 ] when instructed to adhere to information provided in predefined documents.[ 19 ] In clinical practice, fast access to reliable medical information is paramount to provide optimal patient care. Therefore, we created an interactive tool to access information from hospital-specific SOPs: We customized GPT-4 with 10 SOPs from four different departments of the University Hospital in Dresden, Germany and examined its ability to accurately answer 30 medical questions of various degrees of detail as they would occur in clinical routine. We evaluated accuracy within medical staff of various degrees of expertise and across different querying strategies. We assessed the impact of prompt design and non-text items in the SOPs on answer accuracy. Finally, we compared the browser-generated GPT with a self-coded RAG approach and benchmarked GPT-4 by OpenAI against Claude-3-opus by Anthropic. We show that a customized LLM can be used as an interactive information retrieval tool from an individual set of documents and we make our browser-generated GPTs and code publicly available. To our knowledge, this is the first study using an SOP-augmented LLM for medical information retrieval. Methods Ethics statement The present study was conducted in accordance with the Declaration of Helsinki. The study was approved by the ethics committee of the Dresden University of Technology under the reference number BO-EK-400092023. The study does not process patient-related data. Data collection We collected 10 SOPs from four different departments from the SOP database of the University Hospital Dresden, which were written in German. 
Four SOPs originated from the Department of Internal Medicine 1 (IM1), four SOPs originated from the Department of General Surgery (GS), one SOP originated from the Department of Clinical Infectiology (CIF) and one SOP originated from the Department of Pharmacology and Toxicology (PT). The 10 SOPs covered antibiotic treatment standards, COVID-19, opioids, neutropenia, CAR-T-Cell therapy, colorectal cancer, pancreatic cancer, oesophageal cancer, intestinal cleansing and intraperitoneal chemotherapy and were selected randomly to ensure diversity in length, content and complexity. LLM customization With a pre-defined system prompt we used GPT-4 (accessed 16.03.2024) to customize a GPT based on our 10 SOPs, which we refer to as SOPHIA (SOP-based Hospital-specific Information Access). The system prompt can be found in Suppl. table 1. We redacted all SOPs to ensure author anonymity prior to uploading them to GPT-4. Web search, DALL-E use and the use of our uploaded data for internal purposes by OpenAI were denied. With the same protocol, we created a second GPT with a slightly modified system prompt (Suppl. table 1), which we named CARL (Clinical Assistant for Retrieval of Local information, accessed 21.03.2024). SOPHIA was prompted to be an assistant, CARL was prompted to be a doctor. Both system prompts contained instructions to answer questions asked by doctors querying the provided documents, to answer in a precise and professional way and to strictly adhere to the given information. Experimental design Two physicians with clinical expertise from the departments of GS and IM1, respectively, created three questions of varying grades of detail and patient case vignettes per SOP. The questions can be found in Suppl. table 2 and were used for a question-and-answer analysis (Q&A) with the GPTs. 
The analysis was performed by 6 individuals of various degrees of professional experience: one medical student (observer 1), three residents, out of which two in IM1 and one in GS (observers 2-4) and two senior, board-certified physicians in IM1 and GS, respectively, who performed the Q&A according to their fields of expertise and who will be referred to as combined observer 5. ( Fig. 1A ) Observer 1, 3, 4 and 5 performed the analysis with SOPHIA, out of which observers 1, 3 and 5 conducted simple Q&A and observer 4 applied an interactive Q&A approach. Observer 2 performed the analysis with CARL. Each question was submitted once per observer, resulting in 30 questions per observer and 150 questions in total. Each observer submitted three “safety” questions querying for author names and information that was not contained in the SOPs, to verify adherence to the system prompt. These questions can be found in Suppl. Table 2. Download figure Open in new tab Figure 1. Experimental design and overall results A: Experimental design: a publicly accessible GPT is customized with hospital-specific Standard Operating Procedures from different departments. 5 observers of various professional expertise query the GPT for hospital-specific information retrieval. The analysis is repeated within a RAG pipeline using the most successful experimental approach under comparison of two different Large Language models, GPT-4 and Claude-3-opus. B: Overall results covering completeness, correctness and sufficiency of the GPT’s answers for clinical use. C: Results by observer: completeness, correctness and sufficiency of the GPT’s answers for clinical use stratified by medical staff of various levels of professional expertise. All icons were obtained from https://www.flaticon.com/ . Evaluation Every observer individually interacted with the GPT and rated the given answers for completeness, correctness and sufficiency. Answers were defined as complete if every required detail was mentioned. 
Answers were defined as correct if every detail was reported according to the SOP; if the prompt requested a multitude of information of which only the majority was stated correctly (e.g. a list of drugs out of which one dosage interval was reported incorrectly), the answer was defined as partially correct. Answers were defined as sufficient if the observer found them suitable for use in a clinical routine setting. We furthermore designed the questions with different degrees of detail: Low detail questions targeted a broad section of an SOP and did not include patient- or case-specific information. Intermediate detail questions targeted a subset of a section of an SOP and contained one additional case-specific information detail. High detail questions targeted a subset of a section of an SOP and contained multiple case-specific details. We furthermore assessed the influence of patient gender in the case vignettes and of SOP-specific characteristics like tables or flowcharts on the quality of the results with chi-squared tests. RAG analysis In addition to OpenAI’s browser application, we constructed a retrieval augmented generation (RAG) pipeline using the LlamaIndex framework ( https://github.com/run-llama , first accessed 18.03.2024, last accessed 16.05.2024).[ 20 ] First, we generated vector embeddings of our PDF documents with Chroma ( https://www.trychroma.com/ , first accessed 18.03.2024, last accessed 16.05.2024) and stored them in a local vector storage.[ 21 ] Then, we created a chat engine to interact with the information in our vector storage under specification with our system prompt. Observer 3 repeated the analysis once with GPT-4 and once with Claude-3-opus via the OpenAI API and the Anthropic API, respectively. All code was written in Python. Hyperparameters are listed in Suppl. Table 3. 
Data and code availability Our customized GPTs can be interacted with under https://chatgpt.com/g/g-MLkk5w66d-sophia (SOPHIA) and https://chat.openai.com/g/g-bFsNYtnu1-carl (CARL), respectively, with an OpenAI account. Due to local policy, we cannot make the SOP documents publicly available, however, the GPTs are finetuned with all SOPs used for this manuscript. All code is openly available under https://github.com/MutiHannah/SOPhia . Results Clinical-grade assessment of a customized LLM for information retrieval from standard operating procedures using GPT-4 We customized a GPT using OpenAI’s GPT-4 by supplying it with 10 SOPs in the form of PDF documents and assessed its ability to answer 30 predefined clinical-grade questions of various degrees of detail through observers of various degrees of experience. Six out of 10 SOPs contained plain text, six contained tables, two contained flowcharts, three contained pictures. The number of pages ranged from 1 to 11.( Table 1 ) Overall, our observers defined the GPT’s answers as complete in 60.67% of cases (vs. incomplete in 39.33% of cases), as correct in 62.67% of cases (vs. partially correct in 14.67% and incorrect in 22.67% of cases) and as sufficient in 58% of cases (vs. insufficient in 42% of cases). ( Fig. 1B ) We conclude that customized GPTs can retrieve information from a diverse set of SOPs, but that the overall approach can be optimized. View this table: View inline View popup Download powerpoint Table 1. SOP characteristics. Overview of all SOPs regarding the length in pages and the presence or absence of plain text, tables or flowcharts. Interactive querying outperforms simple Q&A for clinical information retrieval Furthermore, we compared the results of our Q&A with medical staff of various professional levels, hypothesizing that especially early-career medical staff would rate the GPT’s answers as helpful. 
The satisfaction with the given answers was indeed higher in early-career observers compared to the advanced professionals. ( Fig. 1C ) The student observer, despite reporting a lower proportion of complete and correct answers (50.0% and 56.7%), classified 66.7% of the answers as sufficient. ( Fig. 1C , observer 1) Out of the early-career medical professionals, the best performance was achieved through the interactive approach ( Fig. 1C , observer 4), which yielded the highest proportion of complete (80.0%) and correct (83.7%) answers. In 20 cases, one request was sufficient to obtain a suitable answer, one additional request was made in eight cases and 4 additional requests were made in 2 cases until a satisfactory answer was obtained. ( Fig 2A ) Senior physicians were most critical and classified 56% of the answers as correct and 50% as sufficient. ( Fig. 1C , observer 5). The mean answering time with the GPT across all observers was 45 seconds with a standard deviation of 24 seconds compared to 144 seconds with a standard deviation of 133 seconds to find the respective information in our hospital’s knowledge database. ( Fig. 2B and C ) Finally, we assessed the GPT’s adherence to the system prompt through safety questions. As instructed, the GPT did not give away information beyond its set of SOPs. ( Fig 2D ) We conclude that with interactive Q&A, a customized GPT could be a clinically applicable information retrieval tool, but that professional expertise is irreplaceable to critically assess and contextualize given information. In addition, every observer reported discrepancies between the correctness and sufficiency of the given answers, so we administered further investigations. Download figure Open in new tab Figure 2. Results in respect to prompt design and SOP design A: Pie chart showing the number of interactions needed to obtain a final answer through interactive querying for medical information through observer 4. 
B: Boxplot showing the time to retrieve information from an LLM. C: Boxplot showing the time to retrieve information from a classical knowledge database in a German university hospital. D: Assessment of the GPT’s answers to safety questions. E: Assessment of different prompting strategies to obtain answers: questions with low degree of detail contain no patient case-specific details and query for broad and unspecific information from a section of the SOPs. Questions with an intermediate degree of detail contain a patient-case specific aspect and query for a specific section of the SOPs. Questions with a high degree of detail contain more than one case-specific detail and query for precise information in a sub-section of the SOPs. F: Comparison of answer completeness, correctness and sufficiency by gender of the patient in the constructed case-vignettes. G: Assessment of the completeness and correctness of the given answers by the presence or absence of tables or flowcharts in the given SOP through chi-squared tests. Prompt engineering optimizes answer quality We hypothesized that the accuracy of GPT-generated answers might depend on prompt engineering or SOP-specific characteristics. To test this, we examined how question complexity affected answer quality. Per SOP, we asked one highly, one intermediately and one non-detailed question. We found that the rate of complete and correct answers was highest with the intermediate and high detail prompts, while there was no notable difference between the sufficiencies of the answers according to question detail. ( Fig. 2E ) In German, unlike in English, all nouns, including the noun “patient”, have a gender. Hence, we analyzed whether model performance depended on the gender of the patient in the given case-vignette, which was not the case. ( Fig. 
2F ) Overall, answer sufficiency did not seem to depend on question design or patient gender, hence, we analyzed the impact of non plain text items on the completeness and correctness of the given answers. The presence of tables did not impact the GPT’s performance, whereas the presence of a flowchart seemed to reduce completeness (p = 0.01) without having any impact on the correctness of the answers (p = 0.27). ( Fig. 2G ) We conclude that prompt engineering is important for the quality of the answers and that the most suitable answers can be achieved through precise but not extensive prompts. Moreover, prompt engineering, patient gender or the presence of non-text items in the SOP documents did not explain discrepancies between correct and sufficient answers. Claude-3-opus outperforms GPT-4 On the basis of our previous results, we chose the most successful prompting strategy, interactive Q&A, to compare our results which were generated through a browser-based application with a RAG system while comparing GPT-4 by OpenAI with Claude-3-opus by Anthropic. Using the same question prompts, the rate of correct answers was higher with GPT-4 than with Claude-3-opus (66.7% compared to 53.3%, Fig. 3A and C ), whereas the rate of complete and sufficient answers was higher with Claude-3-opus than GPT-4 (66.7% vs. 43.3% and 70.0% vs. 36.7%, Fig. 3 C and A ). In addition, the number of requests was higher with GPT-4 ( Fig. 3B ) than with Claude-3-opus ( Fig. 3D ). In seven cases, we were unable to obtain an answer through GPT-4 because the model claimed it was not supplied with the respective information (which was not the case) compared to Claude-3-opus, where this happened once ( Fig. 3D ). In the browser-based approach, this never occurred. ( Fig. 
2A ) Given that the system prompt instructed the models to deny answers in case of an information deficit, self-coded pipelines seem to be less prone to giving faulty information than the browser application, while Claude-3-opus was more flexible in this setting. Download figure Open in new tab Figure 3. Comparison of GPT-4 and Claude-3-opus and distribution of answers perceived as correct vs. sufficient stratified by professional expertise of the observers. A Q&A results achieved with GPT-4. B Number of iterations needed with GPT-4. C Q&A results achieved with Claude-3-opus. D Number of iterations needed with Claude-3-opus. E Bubble chart showing the distribution of correct vs. sufficient answers stratified by professional expertise with each bubble representing one Q&A pair. Discussion In the present study, we customized an LLM with SOPs from a German university hospital. We assessed the GPT’s performance in answering clinical-grade questions across observers from various professional levels and through different prompting strategies. We show that through interactive prompting, GPTs are suitable for streamlined information retrieval from preselected sources. According to our results, self-constructed GPTs are less prone to misinformation compared to pre-designed browser applications. Truhn et al. argued that instead of single-shot Q&A, the true potential of LLMs lies in interactive reasoning[ 22 ], which our findings clearly corroborate.( Fig. 1C ) However, discrepancies between correctness and sufficiency rates in our results indicate that an LLM’s performance is tied to the underlying sources of information.[ 23 , 24 ] According to our early-career observers, non-text information was better understandable in the original SOP than in the GPT-produced answer through human interpretation. However, information on further steps or when to consult senior professionals was not included in most SOPs but in the GPT’s answer, which was perceived as helpful. 
Hence, early-career staff can substantially profit from this technology, but is also substantially prone to misinformation through it, as previously discussed.[ 7 ] Correspondingly, we observed that senior physicians were more distinct in rating answers for correctness and sufficiency compared to junior observers. ( Fig. 3C ) Especially when senior expertise is unavailable, quick and easy access to professional information is invaluable for patient (and physician) wellbeing - standards save lives.[ 25 ] Notably, GPT-generated answers included statements on patient wellbeing and social situations, which only one SOP took into account. In line with this, LLMs have been described to outperform physicians in the expression of empathy before.[ 26 , 27 ] In line with ethical and practical considerations for the purpose of generative AI in medicine, with a tool like ours, clinical-grade information can be made quickly and easily accessible to hospital staff.[ 28 ] The idea that precisely asked questions improve answer quality applies to human beings and LLMs equally. However, humans have a superior ability to derive information from context compared to LLMs.[ 29 ] For safe clinical use, basic prompt engineering skills will be paramount for optimal utilization of this technology and LLM-related education is expected to be integrated into medical workflows and education.[ 30 ] Remarkably, all SOPs, prompts and answers were in German. In line with previous findings, our results gave no evidence that language had any impact on our output quality.[ 31 ] As healthcare systems become more diverse and globalized[ 32 ], it would be highly relevant to leverage multilingual LLMs and assess prompting through non-native speakers. Limitations Our study has several limitations. First and foremost, 10 SOPs are not representative of a hospital’s entire SOP database. 
After this initial proof-of-concept, our approach should be refined with a larger and more diverse set of documents, as described before.[ 19 ] Moreover, for economic purposes, the use of an open-source LLM like Llama should be evaluated in this use case.[ 33 , 34 ] In addition, incorrect answers can be harmful when searching for an adequate therapy for patients in potentially life-threatening conditions. However, in prior research LLMs outperformed human beings in medical information reproduction; the effects of this should be further analyzed.[ 35 ] In addition, there were discrepancies between correct and sufficient answers which, based on our results, cannot be explained through technical details. However, LLMs can uncover inconsistencies within a dataset and an LLM’s answer can only be as good as its source of information.[ 19 ] Lastly, we did not include nurses or physiotherapists in our experimental setup. In follow-up studies, every group of healthcare professionals should be integrated into method development in order to tailor applications to the users’ needs and to increase acceptance of this technology.[ 36 , 37 ] Outlook LLM applications in healthcare are a dynamic area of research. In recent months alone, new models with significant performance increases and the capacity for multimodality have been released. With the advent of customization, LLMs can be individualized to serve the exact needs of healthcare staff. Our work represents an application of customized LLMs to one of the most practical, real-world use cases for this technology in medicine. In the future, this can be refined towards multimodal output, in-line calculation of drug dosages or diagnostic scores, or even documentation aid. Before clinical implementation, approval of this technology by medical staff and regulatory institutions is paramount. 
Data Availability Our customized GPTs can be interacted with under https://chatgpt.com/g/g-MLkk5w66d-sophia (SOPHIA) and https://chat.openai.com/g/g-bFsNYtnu1-carl (CARL), respectively, with an OpenAI account. Due to local policy, we cannot make the SOP documents publicly available; however, the GPTs are finetuned with all SOPs used for this manuscript. All code is openly available under https://github.com/MutiHannah/SOPhia . Author contributions HSM conceptualized the study. JK, MvB, MK, KEH, FM, DS and MD wrote the SOPs. HSM and CMLL collected the data. HSM, CMLL, MEL, ES, JK and MvB performed the analysis. HSM wrote the code. HSM and ES evaluated the results. JNK and DF contributed expertise and resources. HSM wrote the first draft of the manuscript. All other authors critically revised the manuscript. Disclosures JNK declares consulting services for Owkin, France; DoMore Diagnostics, Norway; Panakeia, UK, and Scailyte, Basel, Switzerland; furthermore JNK holds shares in Kather Consulting, Dresden, Germany; and StratifAI GmbH, Dresden, Germany, and has received honoraria for lectures and advisory board participation from AstraZeneca, Bayer, Eisai, MSD, BMS, Roche, Pfizer and Fresenius. The other authors have no other financial or non-financial conflicts of interest to disclose. Funding JNK is supported by the German Cancer Aid (DECADE, 70115166), the German Federal Ministry of Education and Research (PEARL, 01KD2104C; CAMINO, 01EO2101; SWAG, 01KD2215A; TRANSFORM LIVER, 031L0312A; TANGERINE, 01KT2302 through ERA-NET Transcan), the German Academic Exchange Service (SECAI, 57616814), the German Federal Joint Committee (TransplantKI, 01VSF21048), the European Union’s Horizon Europe and innovation programme (ODELIA, 101057091; GENIAL, 101096312), the European Research Council (ERC; NADIR, 101114631) and the National Institute for Health and Care Research (NIHR, NIHR203331) Leeds Biomedical Research Centre. 
The views expressed are those of the author(s) and not necessarily those of the NHS, the NIHR or the Department of Health and Social Care. This work was funded by the European Union. Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union. Neither the European Union nor the granting authority can be held responsible for them. References 1. ↵ Densen P. Challenges and opportunities facing medical education . Trans Am Clin Climatol Assoc . 2011 ; 122 : 48 – 58 . OpenUrl PubMed 2. ↵ Lorkowski J , Jugowicz A. Shortage of Physicians: A Critical Review . Adv Exp Med Biol . 2021 ; 1324 : 57 – 62 . OpenUrl 3. ↵ Agarwal AK , Southwick L , Gonzales RE , Bellini LM , Asch DA , Shea JA , et al. Digital Engagement Strategy and Health Care Worker Mental Health: A Randomized Clinical Trial . JAMA Netw Open . 2024 ; 7 : e2410994 . OpenUrl 4. ↵ Thirunavukarasu AJ , Ting DSJ , Elangovan K , Gutierrez L , Tan TF , Ting DSW . Large language models in medicine . Nat Med . 2023 ; 29 : 1930 – 1940 . OpenUrl 5. ↵ Piat C , Blampey Q , Joutard A , Qabel MA , Di Piazza T , Benassayag U , et al. A validated and explainable deep learning model instantly predicts survival from consultation reports . 2023 . doi: 10.2139/ssrn.4410792 OpenUrl CrossRef 6. ↵ Ouyang L , Wu J , Jiang X , Almeida D , Wainwright CL , Mishkin P , et al. Training language models to follow instructions with human feedback . arXiv [cs.CL] . 2022 . Available: http://arxiv.org/abs/2203.02155 7. ↵ Clusmann J , Kolbinger FR , Muti HS , Carrero ZI , Eckardt J-N , Laleh NG , et al. The future landscape of large language models in medicine . Communications Medicine . 2023 ; 3 : 1 – 8 . OpenUrl 8. ↵ Brin D , Sorin V , Konen E , Nadkarni G , Glicksberg BS , Klang E. How Large Language Models Perform on the United States Medical Licensing Examination: A Systematic Review . medRxiv . 2023 . p. 2023.09.03.23294842. 
doi: 10.1101/2023.09.03.23294842 OpenUrl Abstract / FREE Full Text 9. ↵ Liévin V , Hother CE , Motzfeldt AG , Winther O. Can large language models reason about medical questions? Patterns (N Y) . 2024 ; 5 : 100943 . OpenUrl 10. ↵ Tayebi Arasteh S , Han T , Lotfinia M , Kuhl C , Kather JN , Truhn D , et al. Large language models streamline automated machine learning for clinical studies . Nat Commun . 2024 ; 15 : 1 – 12 . OpenUrl CrossRef 11. ↵ Dave T , Athaluri SA , Singh S. ChatGPT in medicine: an overview of its applications, advantages, limitations, future prospects, and ethical considerations . Front Artif Intell . 2023 ; 6 : 1169595 . OpenUrl 12. ↵ Sandmann S , Riepenhausen S , Plagwitz L , Varghese J. Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks . Nat Commun . 2024 ; 15 : 2050 . OpenUrl 13. ↵ OpenAI , Achiam J , Adler S , Agarwal S , Ahmad L , Akkaya I , et al. GPT-4 Technical Report . arXiv [cs.CL] . 2023 . Available: http://arxiv.org/abs/2303.08774 14. ↵ Lewis P , Perez E , Piktus A , Petroni F , Karpukhin V , Goyal N , et al. Retrieval-augmented generation for knowledge-intensive NLP tasks . Adv Neural Inf Process Syst . 2020 ;abs/2005.11401. Available: https://proceedings.neurips.cc/paper/2020/hash/6b493230205f780e1bc26945df7481e5-Abstract.html 15. ↵ Zakka C , Shad R , Chaurasia A , Dalal AR , Kim JL , Moor M , et al. Almanac - Retrieval-Augmented Language Models for Clinical Medicine . NEJM AI . 2024 ; 1 . doi: 10.1056/aioa2300068 OpenUrl CrossRef 16. ↵ Cuschieri J , Johnson JL , Sperry J , West MA , Moore EE , Minei JP , et al. Benchmarking outcomes in the critically injured trauma patient and the effect of implementing standard operating procedures . Ann Surg . 2012 ; 255 : 993 – 999 . OpenUrl CrossRef PubMed 17. ↵ Olson CJ , Arthur M , Mullins RJ , Rowland D , Hedges JR , Mann NC . 
Influence of trauma system implementation on process of care delivered to seriously injured patients in rural trauma centers . Surgery . 2001 ; 130 : 273 – 279 . OpenUrl CrossRef PubMed Web of Science 18. ↵ Shah NH , Entwistle D , Pfeffer MA . Creation and Adoption of Large Language Models in Medicine . JAMA . 2023 ; 330 : 866 – 869 . OpenUrl 19. ↵ Ferber D , Wiest IC , Wölflein G , Ebert MP , Beutel G , Eckardt J-N , et al. GPT-4 for Information Retrieval and Comparison of Medical Oncology Guidelines . NEJM AI . 2024 [cited 24 May 2024 ]. doi: 10.1056/AIcs2300235 OpenUrl CrossRef 20. ↵ Gheorghiu A. Building Data-Driven Applications with LlamaIndex : A practical guide to retrieval-augmented generation (RAG) to enhance LLM applications . Packt Publishing Ltd ; 2024 . 21. ↵ Singh PN , Talasila S , Banakar SV . Analyzing Embedding Models for Embedding Vectors in Vector Databases . 2023 IEEE International Conference on ICT in Business Industry & Government (ICTBIG) . IEEE ; 2023 . pp. 1 – 7 . 22. ↵ Truhn D , Reis-Filho JS , Kather JN . Large language models should be used as scientific reasoning engines, not knowledge databases . Nat Med . 2023 ; 29 : 2983 – 2984 . OpenUrl 23. ↵ Albalak A , Elazar Y , Xie SM , Longpre S , Lambert N , Wang X , et al. A Survey on Data Selection for Language Models . arXiv [cs.CL] . 2024 . Available: http://arxiv.org/abs/2402.16827 24. ↵ Mitchell M , Luccioni AS , Lambert N , Gerchick M , McMillan-Major A , Ozoani E , et al. Measuring Data . arXiv [cs.AI] . 2022 . Available: http://arxiv.org/abs/2212.05129 25. ↵ Umemura Y , Abe T , Ogura H , Fujishima S , Kushimoto S , Shiraishi A , et al. Hour-1 bundle adherence was associated with reduction of in-hospital mortality among patients with sepsis in Japan . PLoS One . 2022 ; 17 : e0263936 . OpenUrl 26. ↵ Topol EJ . Machines and empathy in medicine . Lancet . 2023 ; 402 : 1411 . OpenUrl 27. ↵ Sorin V , Brin D , Barash Y , Konen E , Charney A , Nadkarni G , et al. 
Large Language Models (LLMs) and empathy - A systematic review . bioRxiv . 2023 . doi: 10.1101/2023.08.07.23293769 OpenUrl CrossRef 28. ↵ Harrer S. Attention is not all you need: the complicated case of ethically using large language models in healthcare and medicine . EBioMedicine . 2023 ; 90 : 104512 . OpenUrl 29. ↵ Gilbert S , Kather JN , Hogan A. Augmented non-hallucinating large language models as medical information curators . npj Digital Medicine . 2024 ; 7 : 1 – 5 . OpenUrl 30. ↵ Stretton B , Kovoor J , Arnold M , Bacchi S. ChatGPT-Based Learning: Generative Artificial Intelligence in Medical Education . Med Sci Educ . 2024 ; 34 : 215 – 217 . OpenUrl 31. ↵ Agrawal S , Zhou C , Lewis M , Zettlemoyer L , Ghazvininejad M. In-context Examples Selection for Machine Translation . In: Rogers A , Boyd-Graber J , Okazaki N , editors. Findings of the Association for Computational Linguistics: ACL 2023 . Toronto, Canada : Association for Computational Linguistics ; 2023 . pp. 8857 – 8873 . 32. ↵ Medical language proficiency: A discussion of interprofessional language competencies and potential for patient risk . Int J Nurs Stud . 2016 ; 54 : 158 – 172 . OpenUrl 33. ↵ Nievas M , Basu A , Wang Y , Singh H. Distilling large language models for matching patients to clinical trials . J Am Med Inform Assoc . 2024 . doi: 10.1093/jamia/ocae073 OpenUrl CrossRef 34. ↵ Wiest IC , Verhees FG , Ferber D , Zhu J , Bauer M , Lewitzka U , et al. Detection of suicidality through privacy-preserving Large Language Models . bioRxiv . 2024 . doi: 10.1101/2024.03.06.24303763 OpenUrl CrossRef 35. ↵ Van Veen D , Van Uden C , Blankemeier L , Delbrouck J-B , Aali A , Bluethgen C , et al. Adapted large language models can outperform medical experts in clinical text summarization . Nat Med . 2024 ; 30 : 1134 – 1142 . OpenUrl CrossRef PubMed 36. ↵ Lambert SI , Madi M , Sopka S , Lenes A , Stange H , Buszello C-P , et al. 
An integrative review on the acceptance of artificial intelligence among healthcare professionals in hospitals . NPJ Digit Med . 2023 ; 6 : 111 . OpenUrl 37. ↵ Shepherd M , Endacott R , Quinn H. Bridging the gap between research and clinical care: strategies to increase staff awareness and engagement in clinical research . J Res Nurs . 2022 ; 27 : 168 – 181 . OpenUrl View the discussion thread. Back to top Previous Next Posted June 24, 2024. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Customizing GPT-4 for clinical information retrieval from standard operating procedures Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Customizing GPT-4 for clinical information retrieval from standard operating procedures Hannah Sophie Muti , Chiara Maria Lavinia Löffler , Marie-Elisabeth Leßmann , Esther Helene Stüker , Johanna Kirchberg , Malte von Bonin , Martin Kolditz , Dyke Ferber , Katharina Egger-Heidrich , Felix Merboth , Daniel E. Stange , Marius Distler , Jakob Nikolas Kather medRxiv 2024.06.24.24309221; doi: https://doi.org/10.1101/2024.06.24.24309221 Share This Article: Copy Citation Tools Customizing GPT-4 for clinical information retrieval from standard operating procedures Hannah Sophie Muti , Chiara Maria Lavinia Löffler , Marie-Elisabeth Leßmann , Esther Helene Stüker , Johanna Kirchberg , Malte von Bonin , Martin Kolditz , Dyke Ferber , Katharina Egger-Heidrich , Felix Merboth , Daniel E. 
Stange , Marius Distler , Jakob Nikolas Kather medRxiv 2024.06.24.24309221; doi: https://doi.org/10.1101/2024.06.24.24309221 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (574) Allergy and Immunology (865) Anesthesia (304) Cardiovascular Medicine (4460) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (611) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15251) Forensic Medicine (31) Gastroenterology (1132) Genetic and Genomic Medicine (6621) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4564) Health Policy (1372) Health Systems and Quality Improvement (1617) Hematology (544) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15938) Intensive Care and Critical Care Medicine (1107) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6642) Nursing (346) Nutrition (1001) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3350) Ophthalmology (981) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1698) Pharmacology and Therapeutics (694) Primary Care Research (714) Psychiatry and Clinical Psychology (5464) Public and Global Health (9259) Radiology and Imaging (2212) Rehabilitation Medicine and Physical Therapy (1372) Respiratory Medicine (1198) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (533) Surgery (715) Toxicology (100) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a039b61a6f743fe2',t:'MTc4MDEwMTg3Ng=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-21T05:10:58.409756+00:00
License: CC-BY-NC-4.0