Comparison of large language models for citation screening: A protocol for a prospective study

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 21,010 characters · extracted from preprint-html · click to expand
Comparison of large language models for citation screening: A protocol for a prospective study | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Comparison of large language models for citation screening: A protocol for a prospective study Takehiko Oami , Yohei Okada , Taka-aki Nakada doi: https://doi.org/10.1101/2024.06.26.24309513 Takehiko Oami 1 Department of Emergency and Critical Care Medicine, Chiba University Graduate School of Medicine , Chiba, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: seveneleven711thanks39{at}msn.com Yohei Okada 2 
Department of Preventive Services, Kyoto University Graduate School of Medicine , Kyoto, Japan 3 Health Services and Systems Research, Duke-NUS Medical School, National University of Singapore , Singapore Find this author on Google Scholar Find this author on PubMed Search for this author on this site Taka-aki Nakada 1 Department of Emergency and Critical Care Medicine, Chiba University Graduate School of Medicine , Chiba, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background Systematic reviews require labor-intensive and time-consuming processes. Large language models (LLMs) have been recognized as promising tools for citation screening; however, the performance of LLMs in screening citations remains to be determined. This study aims to evaluate the potential of three leading LLMs — GPT-4o, Gemini 1.5 Pro, and Claude 3.5 Sonnet — for literature screening. Methods We will conduct a prospective study comparing the accuracy, efficiency, and cost of literature citation screening using the three LLMs. Each model will perform literature searches for predetermined clinical questions from the Japanese Clinical Practice Guidelines for Management of Sepsis and Septic Shock (J-SSCG). We will measure and compare the time required for citation screening using each method. The sensitivity and specificity of the results from the conventional approach and each LLM-assisted process will be calculated and compared. Additionally, we will assess the total time spent and associated costs for each method to evaluate workload reduction and economic efficiency. Trial registration This research is registered with the University Hospital Medical Information Network Clinical Trials Registry (UMIN-CTR) [UMIN000054783]. 
Background A systematic review comprises several steps, including the formulation of a query, citation screening, qualitative assessment, and meta-analysis. Among these processes, citation screening is known to be time-consuming and resource-intensive [ 1 - 3 ]. Although recent studies have explored machine learning applications for citation screening [ 4 - 9 ], achieving both time efficiency and high accuracy continues to be challenging [ 9 - 11 ]. The advent of large language models (LLMs) has illuminated new possibilities in natural language processing and the completion of complex tasks [ 12 , 13 ]. These tools have demonstrated potential in revolutionizing citation screening through their sophisticated comprehension and human-like response generation capabilities [ 14 , 15 ]. Prior research has suggested the potential of LLMs in citation screening tasks [ 16 ]. However, comprehensive studies comparing the performance of LLMs are lacking. Therefore, we will seek to investigate the performance of different LLMs in screening citations. This study aims to evaluate and compare three recent LLMs— GPT-4o, Gemini 1.5 Pro, and Claude 3.5 Sonnet—in their ability to conduct citation screening. Methods Study design and settings We will conduct a prospective study to evaluate the performance of LLMs in citation screening. To enhance the transparency and accessibility of our methodology, we have submitted our comprehensive review protocol to the medRxiv pre-print platform. Additionally, we have registered our study with the University Hospital Medical Information Network (UMIN) clinical trials registry (UMIN000054783). Clinical questions in the J-SSCG Our study will evaluate the accuracy of LLMs using clinical questions (CQs) from the upcoming J-SSCG 2024, an updated version of the 2020 guidelines. 
Developed by the Japanese Society of Intensive Care Medicine (JSICM) and the Japanese Association for Acute Medicine (JAAM), these guidelines specifically address sepsis and septic shock management in Japanese clinical settings [ 17 ]. We will employ the same five clinical questions (CQs) as in our previous research ( Table 1 ) [ 11 ]. These CQs underwent comprehensive literature reviews across multiple databases, including CENTRAL, PubMed, and Ichushi-Web. The working group meticulously developed search strategies to guarantee the inclusion of all relevant studies. Our search was confined to literature in Japanese and English. For J-SSCG 2024, we utilized EndNote as our citation management tool. This software facilitated the downloading, compiling, and removal of duplicates from all titles and abstracts gathered during our literature search. View this table: View inline View popup Download powerpoint Table 1. The list of the patient/population/problem, intervention, and comparison of the selected clinical questions Conventional citation screening Members of J-SSCG 2024 transferred files processed in EndNote to Rayyan, a software specifically designed to facilitate systematic reviews. The screening protocol involved two independent reviewers each assessing the title and abstract of each study. Disagreements were resolved through collaborative discussions or, when necessary, by consulting a third reviewer for an impartial evaluation. As a standard reference for assessing accuracy, we will utilize the screening results from conventional citation screening methods. Large language model Our prospective study will critically assess the accuracy, time efficiency, and cost of three LLMs, including GPT-4o (OpenAI, San Francisco, CA), Gemini 1.5 Pro (Google, Mountain View, CA), and Claude 3.5 Sonnet (Anthropic, San Francisco, CA), released on May 13, 2024, May 23, 2024, and June 21, 2024, respectively. 
After importing the dataset from citation managers using the same procedure as the conventional tool for citation screening, we interfaced the dataset with the Application Programming Interface (API) using pandas (version 1.0.5) in Python (version 3.9.0). We will utilize the publicly available API for each LLM. To conduct LLM-assisted citation screening, we developed a command prompt that enables the LLMs to automatically execute the citation screening process. For each query, we will adhere strictly to the same phrases outlined in the framework of CQs that the J-SSCG2024 members formulated for conventional citation screening. Prompt You are conducting a systematic review and meta-analysis, focusing on a specific area of medical research. Your task is to evaluate research studies and determine whether they should be included in your review. To do this, each study must meet the following criteria: Target Patients: ------------ Intervention: ---------- Comparison: --------- Study Design: The study must be a randomized controlled trial. Additionally, any study protocol that meets these criteria should also be included. However, you should exclude studies in the following cases: The study does not meet all of the above eligibility criteria. The study’s design is not a randomized controlled trial. Examples of unacceptable designs include case reports, observational studies, systematic reviews, review articles, animal experiments, letters to editors, and textbooks. After reading the title and abstract of a study, you will decide whether to include or exclude it based on these criteria. Let’s think step by step. Please answer with include or exclude only. Title: --------- Abstract --------------------------------------------- Through the process of the automated citation screening using LLMs, inclusion or exclusion decisions were provided without prior context. Upon completion of this phase, we will review the judgement documented in the output file. 
The source code for this procedure will be made available in a public GitHub repository ( https://github.com/seveneleven711thanks39/gpt-assisted_citation_screening.git ). Data collection This study will collect and evaluate the following variables: Accuracy After compiling the number of references included by each LLM, we will compare the sensitivity and specificity of these results to those obtained through manual screening. Time Efficiency The time required for citation screening with each LLM will be measured and compared to that of manual methods. Cost The study will assess the overall costs associated with API usage, based on a usage-based billing system. Statistical analysis To assess and compare the accuracy of LLMs, we will calculate the sensitivity and specificity of citations accurately identified as “relevant” by the LLMs. Our primary analysis will utilize the results from the qualitative assessment of conventional screening as the standard reference. The secondary analysis will employ the results from the title and abstract review of conventional screening as the standard reference. To assess time efficiency, we will aggregate the durations of systematic review sessions across all clinical questions. To calculate the cost of LLM-assisted citation screening using APIs, we will document the total charges incurred under the pay-as-you-go system. Additionally, we will perform a sensitivity analysis to investigate how variations in the LLM’s prompts influence screening accuracy, focusing on the effects of prompt engineering on the model’s performance in citation assessment tasks. In our analysis, we will present continuous data as means and standard deviations or medians and interquartile ranges, depending on the distribution of the data. For the statistical analysis, we will use GraphPad Prism 10 (GraphPad Software, San Diego, CA). Data Availability All data produced in the present study are available upon reasonable request to the authors. 
Conflicts of interest All authors declare that they have no conflicts of interest. Funding None References 1. ↵ Borah R , Brown AW , Capers PL , Kaiser KA : Analysis of the time and workers needed to conduct systematic reviews of medical interventions using data from the PROSPERO registry . BMJ Open 2017 , 7 ( 2 ): e012545 . OpenUrl Abstract / FREE Full Text 2. Sampson M , Tetzlaff J , Urquhart C : Precision of healthcare systematic review searches in a cross-sectional sample . Res Synth Methods 2011 , 2 ( 2 ): 119 – 125 . OpenUrl 3. ↵ Wang Z , Nayfeh T , Tetzlaff J , O’Blenis P , Murad MH : Error rates of human reviewers during abstract screening in systematic reviews . PLoS One 2020 , 15 ( 1 ): e0227742 . OpenUrl CrossRef PubMed 4. ↵ van de Schoot R , de Bruin J , Schram R , Zahedi P , de Boer J , Weijdema F , Kramer B , Huijts M , Hoogerwerf M , Ferdinands G et al : An open source machine learning framework for efficient and transparent systematic reviews . Nat Mach Intell 2021 , 3 : 125 – 133 . OpenUrl 5. Marshall IJ , Wallace BC : Toward systematic review automation: a practical guide to using machine learning tools in research synthesis . Syst Rev 2019 , 8 ( 1 ): 163 . OpenUrl CrossRef PubMed 6. Harrison H , Griffin SJ , Kuhn I , Usher-Smith JA : Software tools to support title and abstract screening for systematic reviews in healthcare: an evaluation . BMC Med Res Methodol 2020 , 20 ( 1 ): 7 . OpenUrl CrossRef PubMed 7. O’Mara-Eves A , Thomas J , McNaught J , Miwa M , Ananiadou S : Using text mining for study identification in systematic reviews: a systematic review of current approaches . Syst Rev 2015 , 4 : 5 . OpenUrl CrossRef PubMed 8. Wallace BC , Trikalinos TA , Lau J , Brodley C , Schmid CH : Semi-automated screening of biomedical citations for systematic reviews . BMC Bioinformatics 2010 , 11 : 55 . OpenUrl CrossRef PubMed 9. 
↵ Gates A , Guitard S , Pillay J , Elliott SA , Dyson MP , Newton AS , Hartling L : Performance and usability of machine learning for screening in systematic reviews: a comparative evaluation of three tools . Syst Rev 2019 , 8 ( 1 ): 278 . OpenUrl CrossRef PubMed 10. O’Connor AM , Tsafnat G , Thomas J , Glasziou P , Gilbert SB , Hutton B : A question of trust: can we build an evidence base to gain trust in systematic review automation technologies? Syst Rev 2019 , 8 ( 1 ): 143 . OpenUrl PubMed 11. ↵ Oami T , Okada Y , Sakuraya M , Fukuda T , Shime N , Nakada TA : Efficiency and workload reduction of semi-automated citation screening software for creating clinical practice guidelines: a prospective observational study . J Epidemiol 2023 . 12. ↵ Haug CJ , Drazen JM : Artificial Intelligence and Machine Learning in Clinical Medicine, 2023 . N Engl J Med 2023 , 388 ( 13 ): 1201 – 1208 . OpenUrl CrossRef PubMed 13. ↵ Lee P , Bubeck S , Petro J : Benefits, Limits, and Risks of GPT-4 as an AI Chatbot for Medicine . N Engl J Med 2023 , 388 ( 13 ): 1233 – 1239 . OpenUrl CrossRef PubMed 14. ↵ Singhal K , Azizi S , Tu T , Mahdavi SS , Wei J , Chung HW , Scales N , Tanwani A , Cole-Lewis H , Pfohl S et al : Large language models encode clinical knowledge . Nature 2023 , 620 ( 7972 ): 172 – 180 . OpenUrl 15. ↵ Shah NH , Entwistle D , Pfeffer MA : Creation and Adoption of Large Language Models in Medicine . JAMA 2023 , 330 ( 9 ): 866 – 869 . OpenUrl 16. ↵ Kohandel Gargari O , Mahmoudi MH , Hajisafarali M , Samiee R : Enhancing title and abstract screening for systematic reviews with GPT-3.5 turbo . BMJ Evid Based Med 2023 . 17. ↵ Egi M , Ogura H , Yatabe T , Atagi K , Inoue S , Iba T , Kakihana Y , Kawasaki T , Kushimoto S , Kuroda Y et al : The Japanese Clinical Practice Guidelines for Management of Sepsis and Septic Shock 2020 (J-SSCG 2020) . J Intensive Care 2021 , 9 ( 1 ): 53 . OpenUrl View the discussion thread. Back to top Previous Next Posted June 26, 2024. 
Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Comparison of large language models for citation screening: A protocol for a prospective study Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Comparison of large language models for citation screening: A protocol for a prospective study Takehiko Oami , Yohei Okada , Taka-aki Nakada medRxiv 2024.06.26.24309513; doi: https://doi.org/10.1101/2024.06.26.24309513 Share This Article: Copy Citation Tools Comparison of large language models for citation screening: A protocol for a prospective study Takehiko Oami , Yohei Okada , Taka-aki Nakada medRxiv 2024.06.26.24309513; doi: https://doi.org/10.1101/2024.06.26.24309513 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Epidemiology Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (302) Cardiovascular Medicine (4453) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15242) Forensic Medicine (30) Gastroenterology (1131) Genetic and Genomic Medicine (6615) Geriatric Medicine (669) Health Economics (1001) Health Informatics (4552) Health Policy (1372) Health Systems and Quality Improvement (1614) Hematology (543) HIV/AIDS (1270) Infectious 
Diseases (except HIV/AIDS) (15929) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6625) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3344) Ophthalmology (979) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5461) Public and Global Health (9252) Radiology and Imaging (2207) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1197) Rheumatology (597) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a02d0cdbcc05f047',t:'MTc3OTk2OTExNQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00