Full text
19,986 characters
· extracted from
preprint-html
· click to expand
Comparison of Large Language Models’ Performance on Neurosurgical Board Examination Questions | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Comparison of Large Language Models’ Performance on Neurosurgical Board Examination Questions View ORCID Profile Nicholas S. Andrade , Surya Donty doi: https://doi.org/10.1101/2025.02.20.25322623 Nicholas S. Andrade 1 Department of Neurosurgery, Christus Trinity Mother Frances Hospital , Tyler, TX, USA M.D. Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nicholas S. 
Andrade For correspondence: nsa122{at}gmail.com Surya Donty 2 University of Texas at Tyler School of Medicine , Tyler, TX, USA M.S. Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background Multiple-choice board examinations are a primary objective measure of competency in medicine. Large language models (LLMs) have demonstrated rapid improvements in performance on medical board examinations in the past two years. We evaluated five leading LLMs on neurosurgical board exam questions. Methods We evaluated five LLMs (OpenAI o1, OpenEvidence, Claude 3.5 Sonnet, Gemini 2.0, and xAI Grok2) on 500 multiple-choice questions from the Self-Assessment in Neurological Surgery (SANS) American Board of Neurological Surgery (ABNS) Primary Board Examination Review. Performance was analyzed across 12 subspecialty categories and compared to established passing thresholds. Results All models exceeded the threshold for passing, with OpenAI o1 achieving the highest accuracy (87.6%), followed by OpenEvidence (84.2%), Claude 3.5 Sonnet (83.2%), Gemini 2.0 (81.0%) and xAI Grok2 (79.0%). Performance was strongest in Other General (97.4%) and Peripheral Nerve (97.1%) categories, while Neuroradiology showed the lowest accuracy (57.4%) across all models. Conclusions State of the art LLMs continue to improve, and all models demonstrated strong performance on neurosurgical board examination questions. Medical image analysis continues to be a limitation of current LLMs. The current level of LLM performance challenges the relevance of written board examinations in trainee evaluation and suggests that LLMs are ready for implementation in clinical medicine and medical education. Introduction Medical board certification relies on multiple-choice questions to provide an objective, efficient way to assess knowledge at scale [ 3 ], [ 7 ]. 
The American Board of Neurological Surgery adopted this approach in 1962 [ 17 ] and continues to use it today, requiring candidates to pass a 100-question Neuroanatomy exam and a 375-question primary examination covering basic science and clinical knowledge. The 2017 development of the transformer architecture fundamentally changed how machines process language [ 16 ]. This innovation allowed models to analyze relationships between words across long text sequences, enabling more sophisticated understanding and reasoning. The impact has been dramatic. Early large language models in 2020-21 produced confident but often incorrect answers [ 11 ]. Today’s models consistently demonstrate human-level performance across professional examinations in medicine, law, and PhD-level mathematics and science [ 4 ], [ 8 ], [ 2 ], [ 19 ]. Initial studies of LLM performance on neurosurgery board examinations showed promising results. In early 2023, ChatGPT achieved a 73.4% score on mock board examinations, while its successor GPT-4 scored 83.4%, significantly outperforming both ChatGPT and human test-takers [ 1 ]. A study of the European Board Examination in Neurological Surgery found that commercial LLMs could pass the written portion, though they struggled with image-based questions [ 15 ]. Most recently, a systematic review found that GPT-4 achieved passing scores on 26 of 29 medical board examinations across specialties [ 12 ]. We evaluated five leading LLMs on 500 neurosurgical board examination questions to understand their current capabilities and implications for medical education. Our results suggest both opportunities and challenges in how we train and assess medical professionals in an AI-augmented future. 
Methods The Congress of Neurological Surgeons Self-Assessment in Neurological Surgery (SANS) Primary Board Examination Review is an educational resource designed to prepare neurosurgical residents for board certification and practitioners for continuing medical education [ 5 ]. These questions are behind a paywall, reducing the chances that they formed part of the training data for any of the LLMs studied. We used Exam 1, which comprises 500 questions in the following categories: Vascular, Pain, Spine, Peripheral Nerve, Fundamentals, Trauma, Other General, Neuroradiology, Neuropathology, Functional, Tumor, and Pediatrics. We evaluated five leading LLMs: OpenAI o1 [ 8 ], Anthropic’s Claude 3.5 Sonnet [ 2 ], Google’s Gemini 2.0 [ 6 ], xAI’s Grok2 [ 18 ] and OpenEvidence [ 13 ], a specialized medical answer engine. Each question’s text and answer choices were entered into the models’ standard web interfaces. For questions containing images, we uploaded these when supported by the model’s interface. The model responses were then entered into the SANS website to validate the accuracy and record the results of the sections. Statistical analysis compared each model’s performance to the standard passing threshold using the one-sample binomial test. Comparisons between models used the Z test for two proportions (significance: p < 0.05). Results All models achieved greater accuracy than the 70% passing threshold (p < 0.05). OpenAI o1 significantly outperformed xAI Grok2 (p = 0.0003), Gemini 2.0 (p = 0.0041) and Claude 3.5 (p = 0.0488), while OpenEvidence also significantly outperformed xAI Grok2 (p = 0.0338); all other model comparisons did not show statistically significant differences in performance. View this table: View inline View popup Download powerpoint Table 1. 
Performance of Different Models on Neurosurgical Board Examination Questions The models performed significantly above the mean in five sections (Other General, Peripheral Nerve, Functional, Fundamentals, and Spine) and significantly below the mean in two sections (Neuroradiology and Vascular). View this table: View inline View popup Download powerpoint Table 2. Average Accuracy of Models Across Sections Discussion All tested LLMs exceeded the neurosurgical board examination passing threshold of 70%. Lower performance in Neuroradiology reflects current limitations in LLM image processing capabilities [ 1 ], though specialized computer vision models already match or exceed human performance in tasks like mammogram interpretation [ 10 ] and tumor segmentation [ 9 ]. Some advanced medical AI systems like Google’s Med-PaLM [ 14 ] remain restricted from public testing, suggesting our results may underestimate AI’s current potential in medicine. The strong performance of LLMs challenges the relevance of multiple-choice examinations in medical education. When machines can consistently outperform humans on tests meant to assess medical knowledge, evaluation methods should be reconsidered. The traditional acceptance of 60-70% passing scores implies that physicians graduate with significant knowledge gaps. If the tested knowledge is truly essential, LLMs should be used to ensure comprehensive understanding rather than accepting these gaps. LLMs arrive at a time of controversy for MCQ assessments like the SAT, MCAT and Step 1: are they meant to confirm basic competency or identify talent for desirable careers? LLMs could transform medical education through personalized learning paths, on-demand explanations, and simulation of rare cases. Rather than spending years memorizing information that LLMs can instantly recall, medical training should focus on skills that AI cannot yet replicate. 
Developing objective measures of technical expertise is an obvious goal for procedural specialties. As clinical tools, LLMs could provide always-on analysis of medical documentation, flag potential diagnoses, suggest relevant literature, and identify gaps in clinical workup. EHR integration and clear frameworks for liability remain unresolved problems. Most importantly, these systems must enhance rather than disrupt physician cognitive processes, providing insights at appropriate moments without causing alert fatigue or cognitive overload. Data Availability All data produced are available online at GitHub.com/nsa122/LLM-NS-boards https://github.com/nsa122/LLM-NS-boards References 1. ↵ R. Ali , O. Tang , I. Connolly , P. Zadnik Sullivan , J. Shin , J. Fridley , W. Asaad , D. Cielo , A. Oyelese , C. Doberstein , Z. Gokaslan , and A. Telfeian . Performance of chatgpt and gpt-4 on neurosurgery written board examinations . Neurosurgery , 93 ( 6 ): 1353 – 1365 , Dec 2023 . OpenUrl CrossRef PubMed 2. ↵ Anthropic . Claude 3.5 sonnet model card addendum , 2024 . Accessed: 2025-01-13 . 3. ↵ J. B. Carmody and S. K. Rajasekaran . On Step 1 Mania, USMLE score reporting, and financial conflict of interest at the National Board of Medical Examiners . Academic Medicine , 95 ( 9 ): 1332 – 1337 , 2020 . OpenUrl PubMed 4. ↵ L. Chen et al. Artificial intelligence performance on bar examination questions. Working paper, Stanford Law School , 2023 . Preliminary analysis of AI performance on legal reasoning tasks . 5. ↵ Congress of Neurological Surgeons . Sans lifelong learning , 2025 . Accessed: 2025-01-12 . 6. ↵ Google . Gemini 2.0 flash model card , 2024 . Accessed: 2025-01-13 . 7. ↵ S. A. Haist , A. P. Butler , and M. A. Paniagua . Testing and evaluation: the present and future of the assessment of medical professionals . Advances in Physiology Education , 41 ( 1 ): 149 – 153 , 2017 . OpenUrl CrossRef PubMed 8. ↵ O. A. Jaech et al. Openai o1 system card . 
arXiv preprint arxiv: 2412.16720 , 2024 . 9. ↵ K. Kikuchi , O. Togao , K. Yamashita , et al. Comparison of diagnostic performance of radiologist- and ai-based assessments of t2-flair mismatch sign and quantitative assessment using synthetic mri in the differential diagnosis between astrocytoma, idh-mutant and oligodendroglioma, idh-mutant and 1p/19q-codeleted . Neuroradiology , 66 ( 3 ): 333 – 341 , 2024 . OpenUrl PubMed 10. ↵ K. Lång , V. Josefsson , A. Larsson , S. Larsson , C. Högberg , H. Sartor , S. Hofvind , I. Andersson , and A. Rosso . Artificial intelligence-supported screen reading versus standard double reading in the mammography screening with artificial intelligence trial (masai): a clinical safety analysis of a randomised, controlled, non-inferiority, single-blinded, screening accuracy study . Lancet Oncology , 24 ( 8 ): 936 – 944 , Aug 2023 . OpenUrl CrossRef PubMed 11. ↵ S. Lin , J. Hilton , and O. Evans . Truthfulqa: Measuring how models mimic human falsehoods . arXiv preprint arxiv: 2109.07958 , 2021 . 12. ↵ M. Liu , T. Okuhara , X. Chang , R. Shirabe , Y. Nishiie , H. Okada , and T. Kiuchi . Performance of chatgpt across different versions in medical licensing examinations worldwide: Systematic review and meta-analysis . Journal of Medical Internet Research , 26 : e60807 , Jul 2024 . OpenUrl PubMed 13. ↵ OpenEvidence . Openevidence: Ai-powered medical search platform , 2023 . Accessed: 2025-01-13 . 14. ↵ K. Singhal , T. Tu , J. Gottweis , R. Sayres , E. Wulczyn , L. Hou , K. Clark , S. Pfohl , H. Cole-Lewis , D. Neal , et al. Towards expert-level medical question answering with large language models . arXiv preprint arxiv: 2305.09617 , 2023 . 15. ↵ F. C. Stengel , M. N. Stienen , M. Ivanov , M.L. Gandía-González , G. Raffa , M. Ganau , P. Whitfield , and S. Motov . Can ai pass the written european board examination in neurological surgery? - ethical and practical issues . Brain and Spine , 4 : 102765 , Feb 2024 . OpenUrl 16. ↵ A. Vaswani , N. 
Shazeer , N. Parmar , J. Uszkoreit , L. Jones , A. N. Gomez , L. Kaiser, and I. Polosukhin . Attention is all you need . In Advances in Neural Information Processing Systems , pages 5998 – 6008 , 2017 . 17. ↵ M. C. Wang , F. A. Boop , D. Kondziolka , D. K. Resnick , S. N. Kalkanis , E. Koehnen , N. R. Selden , C. B. Heilman , A. B. Valadka , K. M. Cockroft , J. A. Wilson , R. G. Ellenbogen , A. L. Asher , R. W. Byrne , P. J. Camarata , J. Huang , J. J. Knightly , E. I. Levy , R. R. Lonser , E. S. Connolly , F. B. Meyer , and L. M. Liau . Continuous improvement in patient safety and quality in neurological surgery: the American Board of Neurological Surgery in the past, present, and future . Journal of Neurosurgery , 135 ( 2 ): 637 – 643 , 2020 . OpenUrl PubMed 18. ↵ xAI . Grok-2 model card , 2024 . Accessed: 2025-01-13 . 19. ↵ T. Zhong , Z. Liu , Y. Pan , Y. Zhang , Y. Zhou , S. Liang , Z. Wu , Y. Lyu , P. Shu , X. Yu , et al. Evaluation of openai o1: Opportunities and challenges of agi . arXiv preprint arxiv: 2409.18486 , 2024 . View the discussion thread. Back to top Previous Next Posted February 24, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Comparison of Large Language Models’ Performance on Neurosurgical Board Examination Questions Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Comparison of Large Language Models’ Performance on Neurosurgical Board Examination Questions Nicholas S. 
Andrade , Surya Donty medRxiv 2025.02.20.25322623; doi: https://doi.org/10.1101/2025.02.20.25322623 Share This Article: Copy Citation Tools Comparison of Large Language Models’ Performance on Neurosurgical Board Examination Questions Nicholas S. Andrade , Surya Donty medRxiv 2025.02.20.25322623; doi: https://doi.org/10.1101/2025.02.20.25322623 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Medical Education Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4422) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15217) Forensic Medicine (30) Gastroenterology (1122) Genetic and Genomic Medicine (6583) Geriatric Medicine (667) Health Economics (996) Health Informatics (4524) Health Policy (1367) Health Systems and Quality Improvement (1611) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15908) Intensive Care and Critical Care Medicine (1103) Medical Education (622) Medical Ethics (145) Nephrology (667) Neurology (6581) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3330) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5436) Public and Global Health (9218) Radiology and Imaging (2194) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1195) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var 
b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ff7149c7928df88',t:'MTc3OTQwMzIwMQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.