Closing the Pediatric Divide: A Performance Analysis of the GPT-5 Family in Medical Diagnostics

doi:10.1101/2025.08.28.25334657

Closing the Pediatric Divide: A Performance Analysis of the GPT-5 Family in Medical Diagnostics

2025 · doi:10.1101/2025.08.28.25334657

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

⚙ AI-generated deep summary by claude@2026-06, 2026-06-24 · read from full text ⓘ

This study evaluated GPT-5 (full), GPT-5 Mini, and GPT-5 Nano on 2,000 single-answer multiple-choice questions from the MedQA dataset, equally split between adult medicine and pediatrics and stratified across multiple specialties, with models queried via API using standardized low-variability settings (temperature=0, minimal reasoning effort, low verbosity, max tokens=170). The authors found a dose-response pattern where increasing model size improved accuracy and reduced an adult–pediatric performance gap, with GPT-5 Nano showing 71.0% adult vs 55.4% pediatric accuracy (15.6-point gap, p<0.001), and GPT-5 Mini narrowing the gap to 5.7 points (p=0.001). The full GPT-5 model eliminated the disparity, reaching 86.3% in adults and 88.5% in pediatrics (p=0.138). A key caveat is that results are based on standardized exam-style questions rather than real-world clinical workflows. The paper does not explicitly discuss endometriosis or adenomyosis; it was included in the corpus via a keyword match in the upstream search index.

Read from the paper's body, not the abstract. Not a substitute for reading the paper. No clinical advice. How this works

Full text 31,209 characters · extracted from preprint-html · click to expand

Closing the Pediatric Divide: A Performance Analysis of the GPT-5 Family in Medical Diagnostics | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Closing the Pediatric Divide: A Performance Analysis of the GPT-5 Family in Medical Diagnostics View ORCID Profile Gianluca Mondillo , Fabio Giovanni Abbate , Mariapia Masino , Simone Colosimo , Alessandra Perrotta , Vittoria Frattolillo doi: https://doi.org/10.1101/2025.08.28.25334657 Gianluca Mondillo 1 Department of Woman, Child and of General and Specialized Surgery, Università degli Studi della Campania “Luigi Vanvitelli” , Via Luigi De Crecchio 2, Naples, Italy Find this author on 
Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Gianluca Mondillo For correspondence: gianluca.mondillo{at}gmail.com Fabio Giovanni Abbate 1 Department of Woman, Child and of General and Specialized Surgery, Università degli Studi della Campania “Luigi Vanvitelli” , Via Luigi De Crecchio 2, Naples, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mariapia Masino 1 Department of Woman, Child and of General and Specialized Surgery, Università degli Studi della Campania “Luigi Vanvitelli” , Via Luigi De Crecchio 2, Naples, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Simone Colosimo 1 Department of Woman, Child and of General and Specialized Surgery, Università degli Studi della Campania “Luigi Vanvitelli” , Via Luigi De Crecchio 2, Naples, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alessandra Perrotta 1 Department of Woman, Child and of General and Specialized Surgery, Università degli Studi della Campania “Luigi Vanvitelli” , Via Luigi De Crecchio 2, Naples, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Vittoria Frattolillo 1 Department of Woman, Child and of General and Specialized Surgery, Università degli Studi della Campania “Luigi Vanvitelli” , Via Luigi De Crecchio 2, Naples, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background Large Language Models (LLMs) have demonstrated significant potential in clinical medicine, but a persistent performance gap exists in the pediatric domain due to its unique complexities. 
This study provides the first comparative evaluation of the new GPT-5 family (Nano, Mini, and full) to assess the impact of model scale on diagnostic accuracy and this specific adult-pediatric disparity. Methods A benchmarking study was conducted using 2,000 multiple-choice questions from the MedQA dataset, equally divided between adult (n=1,000) and pediatric (n=1,000) domains. GPT-5, GPT-5 Mini, and GPT-5 Nano were tested via API with standardized parameters (temperature=0, reasoning effort=minimal, verbosity=low, max tokens=170). Accuracy was calculated and statistically compared across domains for each model. Results A clear dose-response relationship was observed between model size and accuracy. GPT-5 Nano exhibited a significant performance gap, with an accuracy of 71.0% in adult medicine versus 55.4% in pediatrics (a 15.6 percentage point difference, p<0.001). GPT-5 Mini substantially narrowed this gap to 5.7 points (81.5% vs. 75.8%, p=0.001). Critically, the full GPT-5 model eliminated the disparity, achieving comparable accuracy in adult medicine (86.3%) and slightly higher accuracy in pediatrics (88.5%) (p=0.138). Performance gains from scaling up were disproportionately larger for the pediatric domain. Conclusion The GPT-5 family marks a substantial advancement in medical AI. The full-size model not only achieves high diagnostic accuracy but, crucially, overcomes the previously documented performance limitations in pediatrics. This demonstrates that sufficient model scale is vital for mastering the nuances of specialized clinical domains. These findings support a tiered implementation strategy based on task criticality and underscore the need for continued validation in real-world clinical settings. 
1 Introduction Large Language Models (LLMs) represent one of the most significant innovations in contemporary Artificial Intelligence (AI), based on transformer architectures and trained on vast text corpora that include medical literature, clinical guidelines, and case studies [ 1 ]. These models have demonstrated emergent capabilities in clinical reasoning, interpretation of complex symptoms, and diagnostic decision support, opening new perspectives for the application of AI in the healthcare setting [ 2 , 3 ]. The performance of LLMs in medicine has shown promising results in recent years. Recent studies have documented that advanced models like GPT-4 can achieve diagnostic accuracies comparable to or higher than those of specialist physicians in training on standardized exams and structured clinical questions [ 4 , 5 ]. However, most of these evaluations have focused on general or adult medicine, with relatively few studies specifically dedicated to the pediatric domain. Pediatrics presents unique peculiarities that could significantly influence the diagnostic performance of LLMs. The pathophysiological specificities of the pediatric patient, weight-based pharmacological and laboratory dosages, clinical presentations that are often atypical compared to adults, and the need to consider neurocognitive developmental stages represent elements of additional complexity [ 6 ]. A recent study by Mondillo et al. highlighted how several contemporary LLMs, including advanced models like ChatGPT-3.5, Gemini 2.0, and Claude 3.5 Sonnet, show significantly reduced performance on pediatric questions compared to those in adult medicine, with accuracy differences of up to 10 percentage points for some models [ 7 ]. This systematic gap between adult and pediatric medicine suggests intrinsic limitations in the understanding of pediatric specificities by current LLMs. 
The GPT-5 family from OpenAI [ 8 ] represents the most recent evolution of this technology, introducing substantial improvements in reasoning capabilities and architectures optimized for specialized tasks. The availability of three size variants (GPT-5, Mini, Nano) offers a unique opportunity to investigate the relationship between computational complexity and diagnostic accuracy, particularly in the comparison between adult and pediatric medicine. The objective of this study is to comparatively evaluate the diagnostic accuracy of the three GPT-5 variants on a large dataset of clinical questions, with a particular focus on identifying any performance differences between adult and pediatric medicine and the patterns of improvement correlated with model size. 2 Materials and Methods A benchmarking study was conducted to evaluate the diagnostic accuracy of GPT-5, GPT-5 Mini, and GPT-5 Nano on structured clinical questions from the MedQA dataset [ 9 , 10 ]. From the full dataset, a total of 2000 multiple-choice questions with a single correct answer were randomly extracted, equally distributed between adult medicine (n=1000) and pediatrics (n=1000), and stratified by medical specialties: Allergology, Cardiology, Dermatology, Emergency Medicine, Endocrinology, Gastroenterology, Genetics, Gynecology, Hematology, Immunology, Infectious Diseases, Nephrourology, Neurology, Oncology, Ophthalmology, Orthopedics, Otolaryngology, Pneumology, Psychiatry (Child Psychiatry for pediatrics), Rheumatology, and Surgery. All models were tested via API, sending one question at a time, using standardized parameters: temperature=0 to ensure deterministic outputs, reasoning effort=minimal to reduce response latency by testing the models’ minimum capabilities, verbosity=low to obtain concise and focused answers and max tokens=170 to constrain output length. Accuracy was calculated as the percentage of correct answers out of the total for each model and domain. 
The 95% confidence intervals for total accuracy were calculated using the Clopper–Pearson method, while the Wilson method was applied for the per-specialty analyses. Differences between adult and pediatric medicine were assessed using the chi-square test ( α < 0.05). Effect size was quantified with Cramér’s V, and odds of providing a correct response in pediatrics relative to adults were estimated with 95% confidence intervals. Accuracy deltas between the two domains were calculated with their respective confidence intervals using the Newcombe–Wilson method. Confusion matrices, alluvial plots, and radar charts were generated for result visualization. 3 Results Table 1 summarizes the overall accuracy and statistical parameters for all three GPT-5 models. GPT-5 Nano showed an accuracy of 71.0% (95%CI: 68.1%-73.8%) in adult medicine and 55.4% (95%CI: 52.3%-58.5%) in pediatrics, with a difference of 15.6 percentage points (95%CI: -19.8%; -11.4%, p<0.001, Cramér’s V=0.16, OR=0.50, 95%CI: 0.42-0.61). GPT-5 Mini demonstrated an accuracy of 81.5% (95%CI: 79.0%-83.9%) in adult medicine and 75.8% (95%CI: 73.0%-78.4%) in pediatrics, with a difference of 5.7 percentage points (95%CI: -9.3%; -2.1%, p=0.001, Cramér’s V=0.07, OR=0.71, 95%CI: 0.57-0.88). GPT-5 achieved an accuracy of 86.3% (95%CI: 84.0%-88.4%) in adult medicine and 88.5% (95%CI: 86.4%-90.4%) in pediatrics, with a difference of +2.2 percentage points in favor of pediatrics (95%CI: -0.7%; +5.1%, p=0.138, Cramér’s V=0.03, OR=1.22, 95% CI: 0.937-1.59). View this table: View inline View popup Download powerpoint Table 1: Comparison of GPT-5 models’ accuracy across adult and pediatric domains, with odds ratios (OR) and 95% confidence intervals. In adult medicine, GPT-5 showed accuracy above 80% in 18 of the 21 specialties. The highest performances were observed in Allergology (100%), Emergency Medicine (100%), and Otolaryngology (100%). The most challenging specialties were Ophthalmology (58.3%) and Orthopedics (71.4%). 
In pediatrics, GPT-5 maintained accuracies above 90% in 6 specialties: Allergology (100%), Child Psychiatry (95.6%), Surgery (100%), Pneumology (95.1%), Orthopedics (94.4%), and Otolaryngology (90.5%). In pediatrics, GPT-5 Nano showed deficient performance in Immunology (41.5%), Nephrourology (45.9%), and Hematology (46.1%). See Supplementary Appendix for detailed tables on metrics per specialty. The alluvial plots ( Figures 1 and 2 ) showed that in the transition from GPT-5 Nano to GPT-5 Mini, an improvement of 10.5% was observed for adult medicine and 20.4% for pediatrics. In the transition from GPT-5 Mini to GPT-5, the improvement was 4.8% for adult and 12.7% for pediatric. The radar charts ( Figures 3 and 4 ) show that GPT-5 maintains consistent performance across all specialties, while GPT-5 Nano exhibits marked variability, which is particularly pronounced in pediatrics. Figures 5 , 6 , and 7 show the confusion matrices for each model. Download figure Open in new tab Figure 1. Alluvial Plot for Pediatric questions, displayed on a landscape page for enhanced readability. Download figure Open in new tab Figure 2. Alluvial Plot for Adult medicine questions, displayed on a landscape page for enhanced readability. Download figure Open in new tab Figure 3. Radar chart for pediatric questions comparing GPT-5 Nano, GPT-5 Mini, and GPT-5. Download figure Open in new tab Figure 4. Radar chart for adult questions comparing GPT-5 Nano, GPT-5 Mini, and GPT-5. Download figure Open in new tab Figure 5. Confusion matrix pediatric vs adult on GPT-5 Nano responses. Download figure Open in new tab Figure 6. Confusion matrix pediatric vs adult on GPT-5 Mini responses. Download figure Open in new tab Figure 7. Confusion matrix pediatric vs adult on GPT-5 responses. 
4 Discussion The results of this study reveal substantial progress in the performance of GPT-5 models in the medical field, with significant implications for the clinical implementation of diagnostic artificial intelligence systems. The observed dose-response relationship between model size and diagnostic accuracy confirms the importance of computational complexity in clinical reasoning, showing a progressive improvement from GPT-5 Nano (55.4-71.0%) to GPT-5 Mini (75.8-81.5%) to GPT-5 (86.3-88.5%), a pattern that is particularly significant when compared with previous studies on other LLMs. Nori et al. documented that GPT-3.5 achieves an average accuracy of 53.61%, while GPT-4 reaches an average accuracy of 86.7% on the United States Medical Licensing Examination (USMLE), surpassing the exam’s passing threshold by over 20 points [ 11 ], a result comparable to the performance of GPT-5 in our study. A more recent study by Bicknell et al. on GPT-4o showed an accuracy of 90.4% vs 81.1% for GPT-4 vs 60.0% for GPT-3.5 on 750 USMLE questions, with medical students achieving an average accuracy of 59.3% [ 12 ]. In the study by Kung et al. [ 13 ], the version of ChatGPT based on GPT-3.5 (January 2023) achieved an accuracy between 36.1% and 61.3% on USMLE Steps, close to the passing threshold (60%). A second study by Garabet et al. [ 14 ] evaluated GPT-4 on USMLE Step 1 style questions and found an accuracy of 86%. In specific skills, GPT-4 had shown 90% accuracy on medical soft skills vs 62.5% for ChatGPT [ 15 ]. The performance of GPT-5 (86.3-88.5%) is positioned between that of GPT-4 (86.7%) and GPT-4o (90.4%), suggesting that the incremental improvement in the GPT family continues, but with smaller magnitudes than previous generational leaps. 
Particularly significant is the comparison with specialized diagnostic capabilities, where GPT-4o showed a diagnostic accuracy of 92.7% and management accuracy of 88.8% [ 12 ], values higher than the average performance of GPT-5 but comparable to its performance in the best specialties. The second critical aspect concerns the progressive reduction of the gap between adult and pediatric medicine correlated with model size. GPT-5 Nano shows a marked disparity with 71.0% accuracy for adults versus 55.4% for pediatrics (a difference of 15.6 points, p<0.001, Cramér’s V=0.16). GPT-5 Mini significantly reduces this gap to 5.7 points (81.5% vs 75.8%, p=0.001, Cramér’s V=0.07), while GPT-5 completely eliminates the disparity (+2.2 points in favor of pediatrics, p=0.138). This pattern represents a substantial advancement compared to previous evaluations, in which several less recent generation LLMs showed significantly lower performance on pediatric questions compared to adult ones. In the same study, the most recent models instead showed a substantial reduction of this gap: ChatGPT-4o achieved almost perfect parity between the two domains (83.57% for adult and 83.18% for pediatric, p=0.80) [ 7 ]. This result is now surpassed by GPT-5, which shows a slight, though not statistically significant, superiority in pediatrics. This finding is consistent with independent pediatric-focused evaluations, such as an analysis on 500 pediatric MedQA questions that reported an accuracy of 92.8% for ChatGPT O1 and 87.0% for DeepSeek-R1 [ 16 ]. The alluvial plots reveal interesting patterns in the transitions between models. The transition from GPT-5 Nano to GPT-5 Mini shows a greater improvement in pediatrics (20.4%) compared to adult medicine (10.5%), as does the subsequent transition from GPT-5 Mini to GPT-5 (12.7% pediatric vs 4.8% adult). This suggests that the increase in model size yields disproportionately greater benefits for understanding pediatric specificities. 
The radar charts highlight distinctive patterns by specialty. In pediatrics, GPT-5 maintains excellent and consistent performance across all specialties, whereas GPT-5 Nano shows marked irregularities, particularly in Immunology (41.5%), Nephrourology (45.9%), and Hematology (46.1%). Some specialties represent consistent challenges for all models. Ophthalmology shows an accuracy of 58.3% even for GPT-5 in adults, likely due to its highly visual and specialized nature. The choice of API parameters merits discussion. The parameter temperature=0 ensures deterministic outputs, appropriate for reproducibility, though higher temperatures can improve creativity [ 17 ]. The parameter reasoning effort=minimal reduces latency by testing minimum capabilities but may have limited the maximum potential, particularly for GPT-5 [ 18 , 19 , 20 ]. The verbosity=low parameter produces concise outputs but may not fully reflect clinical utility where detailed explanations are crucial. For clinical implementation, the results suggest differentiated strategies. GPT-5 represents the optimal choice for critical applications, particularly in pediatrics. GPT-5 Mini offers an acceptable compromise for preliminary screening, and GPT-5 Nano could find application in high-speed scenarios with intensive human supervision. The study’s limitations include the use of multiple-choice questions, which may not reflect real clinical complexity and could be part of the training data. Further limitations are the absence of validation on direct clinical cases and the use of speed-optimized API parameters. Future studies should evaluate performance with higher reasoning effort and on prospective clinical datasets. 5 Conclusions This study provides the first systematic evaluation of the comparative performance of GPT-5 models in adult and pediatric medicine, revealing significant progress in medical artificial intelligence. 
Performance improves substantially with increasing model size, with GPT-5 achieving accuracy above 85% in both clinical domains. Particularly significant is the elimination of the gap between adult and pediatric medicine with the full model, overcoming limitations documented in previous studies on other LLMs. The analysis of transitions between models demonstrates that the benefits of increased size are greater in pediatrics. Variations across specialties highlight the importance of domain-specific validations before clinical implementation. Practical implementation should consider the trade-off between accuracy and computational resources, with GPT-5 recommended for critical applications and GPT-5 Mini for preliminary screening. Future research should focus on validation in real clinical contexts and on optimization for specific specialties that show sub-optimal performance. Data Availability All data produced in the present study are available upon reasonable request to the authors. Supplementary Appendix 1. Accuracy for GPT-5 View this table: View inline View popup Download powerpoint Table 2: Accuracy by Specialty (Adult) for GPT-5. Note: the 95% confidence intervals in the table are calculated using the Wilson method. View this table: View inline View popup Download powerpoint Table 3: Accuracy by Specialty (Pediatric) for GPT-5. Note: the 95% confidence intervals in the table are calculated using the Wilson method. 2. Accuracy for GPT-5 Mini View this table: View inline View popup Download powerpoint Table 4: Accuracy by Specialty (Adult) for GPT-5 Mini. Note: the 95% confidence intervals in the table are calculated using the Wilson method. View this table: View inline View popup Download powerpoint Table 5: Accuracy by Specialty (Pediatric) for GPT-5 Mini. Note: the 95% confidence intervals in the table are calculated using the Wilson method. 3. 
Accuracy for GPT-5 Nano View this table: View inline View popup Download powerpoint Table 6: Accuracy by Specialty (Adult) for GPT-5 Nano. Note: the 95% confidence intervals in the table are calculated using the Wilson method. View this table: View inline View popup Download powerpoint Table 7: Accuracy by Specialty (Pediatric) for GPT-5 Nano. Note: the 95% confidence intervals in the table are calculated using the Wilson method. References [1]. ↵ Brown , T.B. , Mann , B. , Ryder , N. et al. Language Models are Few-Shot Learners . ArXiv, abs/2005.14165 , 2020 . doi: 10.48550/arXiv.2005.14165 . OpenUrl CrossRef [2]. ↵ Singhal , K. et al. Large language models encode clinical knowledge . Nature , 620 ( 7972 ): 172 – 180 , 2023 . doi: 10.1038/s41586-023-06291-2 . OpenUrl CrossRef PubMed [3]. ↵ Lee , P. , Bubeck , S. , & Petro , J. Benefits, Limits, and Risks of GPT-4 as an AI Chatbot for Medicine . N Engl J Med , 388 ( 13 ): 1233 – 1239 , 2023 . doi: 10.1056/NEJMsr2214184 . OpenUrl CrossRef PubMed [4]. ↵ Rutledge , G.W. Diagnostic accuracy of GPT-4 on common clinical scenarios and challenging cases . Learn Health Syst , 8 ( 3 ): e10438 , 2024 . doi: 10.1002/lrh2.10438 . OpenUrl CrossRef [5]. ↵ Thirunavukarasu , A.J. , Ting , D.S.J. , Elangovan , K. , Gutierrez , L. , Tan , T.F. , & Ting , D.S.W. Large language models in medicine . Nat Med , 29 ( 8 ): 1930 – 1940 , 2023 . doi: 10.1038/s41591-023-02448-8 . OpenUrl CrossRef PubMed [6]. ↵ Muralidharan , V. , Schamroth , J. , Youssef , A. , Celi , L.A. , & Daneshjou , R. Applied artificial intelligence for global child health: Addressing biases and barriers . PLOS Digit Health , 3 ( 8 ): e0000583 , 2024 . doi: 10.1371/journal.pdig.0000583 . OpenUrl CrossRef [7]. ↵ Mondillo , G. , Colosimo , S. , Perrotta , A. , Frattolillo , V. , & Masino , M. Are LLMs ready for pediatrics? A comparative evaluation of model accuracy across clinical domains . medRxiv , 2025 . doi: 10.1101/2025.04.25.25326437 . 
OpenUrl Abstract / FREE Full Text [8]. ↵ OpenAI . Introducing GPT-5 . 2025 . https://openai.com/index/introducing-gpt-5/ . Accessed 11 August 2025 . [9]. ↵ Jin , D. , Pan , E. , Oufattole , N. , Weng , W.-H. , Fang , H. , & Szolovits , P. What Disease Does This Patient Have? A Large-Scale Open Domain Question Answering Dataset from Medical Exams . Applied Sciences , 11 ( 14 ): 6421 , 2021 . doi: 10.3390/app11146421 . OpenUrl CrossRef [10]. ↵ Kaggle . MedQA-USMLE: A Large-scale Open Domain Question Answering Dataset from Medical Exams . 2025 . https://www.kaggle.com/datasets/moaaztameer/medqa-usmle . Accessed 11 August 2025 . [11]. ↵ Nori , H. , King , N. , McKinney , S.M. , Carignan , D. , & Horvitz , E. Capabilities of GPT-4 on Medical Challenge Problems . ArXiv, abs/2303.13375 , 2023 . doi: 10.48550/arXiv.2303.13375 . OpenUrl CrossRef [12]. ↵ Bicknell , B.T. et al. ChatGPT-4 Omni Performance in USMLE Disciplines and Clinical Skills: Comparative Analysis . JMIR Med Educ , 10 : e63430 , 2024 . doi: 10.2196/63430 . OpenUrl CrossRef [13]. ↵ Kung , T.H. et al. Performance of ChatGPT on USMLE: Potential for AI-assisted medical education using large language models . PLOS Digit Health , 2 ( 2 ): e0000198 , 2023 . doi: 10.1371/journal.pdig.0000198 . OpenUrl CrossRef [14]. ↵ Garabet , R. , Mackey , B.P. , Cross , J. , & Weingarten , M. ChatGPT-4 Performance on USMLE Step 1 Style Questions and Its Implications for Medical Education . Med Sci Educ , 34 ( 1 ): 145 – 152 , 2023 . doi: 10.1007/s40670-023-01956-z . OpenUrl CrossRef PubMed [15]. ↵ Brin , D. et al. Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments . Sci Rep , 13 ( 1 ): 16492 , 2023 . doi: 10.1038/s41598-023-43436-9 . OpenUrl CrossRef PubMed [16]. ↵ Mondillo , G. , Colosimo , S. , Perrotta , A. , Frattolillo , V. , & Masino , M. Comparative evaluation of advanced AI reasoning models in pediatric clinical decision support: ChatGPT O1 vs. DeepSeek-R1 . medRxiv , 2025 . 
doi: 10.1101/2025.01.27.25321169 . OpenUrl Abstract / FREE Full Text [17]. ↵ Boonstra , L. Prompt engineering . 2024 . https://www.kaggle.com/whitepaper-prompt-engineering . Accessed 11 August 2025 . [18]. ↵ OpenAI . GPT-5 API Documentation: Reasoning Effort Parameter . 2025 . https://platform.openai.com/docs/guides/reasoning . Accessed 11 August 2025 . [19]. ↵ DataCamp . GPT-5: New Features, Tests, Benchmarks, and More . 2025 . https://www.datacamp.com/blog/gpt-5 . Accessed 11 August 2025 . [20]. ↵ OpenAI . GPT-5 Developer Guide: Verbosity Control . 2025 . https://cookbook.openai.com/examples/gpt-5/gpt-5_new_params_and_tools . Accessed 11 August 2025 . View the discussion thread. Back to top Previous Next Posted August 29, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Closing the Pediatric Divide: A Performance Analysis of the GPT-5 Family in Medical Diagnostics Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Closing the Pediatric Divide: A Performance Analysis of the GPT-5 Family in Medical Diagnostics Gianluca Mondillo , Fabio Giovanni Abbate , Mariapia Masino , Simone Colosimo , Alessandra Perrotta , Vittoria Frattolillo medRxiv 2025.08.28.25334657; doi: https://doi.org/10.1101/2025.08.28.25334657 Share This Article: Copy Citation Tools Closing the Pediatric Divide: A Performance Analysis of the GPT-5 Family in Medical Diagnostics Gianluca Mondillo , Fabio Giovanni Abbate , Mariapia Masino , Simone Colosimo , Alessandra Perrotta , Vittoria Frattolillo medRxiv 2025.08.28.25334657; doi: https://doi.org/10.1101/2025.08.28.25334657 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Pediatrics Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4411) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15205) Forensic Medicine (30) Gastroenterology (1119) Genetic and Genomic Medicine (6574) Geriatric Medicine (666) Health Economics (994) Health Informatics (4511) Health Policy (1365) Health Systems and Quality Improvement (1608) Hematology (537) HIV/AIDS (1263) Infectious Diseases (except HIV/AIDS) (15903) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6573) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1139) Occupational and Environmental Health (954) Oncology (3319) Ophthalmology (968) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (662) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5422) Public and Global Health (9205) Radiology and 
Imaging (2191) Rehabilitation Medicine and Physical Therapy (1367) Respiratory Medicine (1191) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9feac84d1b15ad07',t:'MTc3OTI3NDI0Nw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00