Dual-Model LLM Ensemble via Web Chat Interfaces Reaches Near-Perfect Sensitivity for Systematic-Review Screening: A Multi-Domain Validation with Equivalence to API Access

doi:10.1101/2025.11.03.25339455

Dual-Model LLM Ensemble via Web Chat Interfaces Reaches Near-Perfect Sensitivity for Systematic-Review Screening: A Multi-Domain Validation with Equivalence to API Access

2025 · doi:10.1101/2025.11.03.25339455

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 53,065 characters · extracted from preprint-html · click to expand

Dual-Model LLM Ensemble via Web Chat Interfaces Reaches Near-Perfect Sensitivity for Systematic-Review Screening: A Multi-Domain Validation with Equivalence to API Access | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Dual-Model LLM Ensemble via Web Chat Interfaces Reaches Near-Perfect Sensitivity for Systematic-Review Screening: A Multi-Domain Validation with Equivalence to API Access View ORCID Profile Petter Fagerberg , Oscar Sallander , View ORCID Profile Kim Vikhe Patil , Anders Berg , View ORCID Profile Anastasia Nyman , Natalia Borg , View ORCID Profile Thomas Lindén doi: https://doi.org/10.1101/2025.11.03.25339455 Petter Fagerberg 1 The National Board of Health and Welfare PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Petter Fagerberg For correspondence: Petter.fagerberg{at}socialstyrelsen.se Oscar Sallander 1 The National Board of Health and Welfare MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kim Vikhe Patil 1 The National Board of Health and Welfare PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Kim Vikhe Patil Anders Berg 1 The National Board of Health and Welfare BSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Anastasia Nyman 1 The National Board of Health and Welfare PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Anastasia Nyman Natalia Borg 1 The National Board of Health and Welfare PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Thomas Lindén 1 The National Board of Health and Welfare 2 Institute of Neuroscience and Physiology, Sahlgrenska Academy, Gothenburg University , Sweden MD, PhD Roles: Associate Professor Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Thomas Lindén Abstract Full Text Info/History Metrics Data/Code Preview PDF ABSTRACT Background Prior work showed that state-of-the-art (mid-2025) large language models (LLMs) prompted with varying batch sizes can perform well on systematic review (SR) abstract screening via public APIs within a single medical domain. Whether comparable performance holds when using no-code web interfaces (GUIs) and whether results generalize across medical domains remain unclear. Objective To evaluate the screening performance of a zero-shot, large-batch, two-model LLM ensemble (OpenAI GPT-5 Thinking; Google Gemini 2.5 Pro) operated via public chat GUIs across a diverse range of medical topics, and to compare its performance with an equivalent API-based workflow. Methods We conducted a retrospective evaluation using 736 titles and abstracts from 16 Cochrane reviews (330 included, 406 excluded), all published in May-June 2025. The primary outcome was the sensitivity of a pre-specified “OR” ensemble rule designed to maximize sensitivity, benchmarked against final full-text inclusion decisions (reference standard). Secondary outcomes were specificity, single-model performance, and duplicate-run reliability (Cohen’s κ). Because models saw only titles/abstracts while the reference standard reflected full-text decisions, specificity estimates are conservative for abstract-level screening. Results The GUI-based ensemble achieved 99.7% sensitivity (95% CI, 98.3%-100.0%) and 49.3% specificity (95% CI, 44.3%-54.2%). The API-based workflow yielded comparable performance, with 99.1% sensitivity (95% CI, 97.4%-99.8%) and 49.3% specificity (95% CI, 44.3%-54.2%). The difference in sensitivity was not statistically significant (McNemar p=0.625) and met equivalence within a ±2-percentage-point margin (TOST<0.05). Duplicate-run reliability was substantial to almost perfect (Cohen’s κ: 0.78-0.93). The two models showed complementary strengths: Gemini 2.5 Pro consistently achieved higher sensitivity (94.5%-98.2% across single runs), whereas GPT-5 Thinking yielded higher specificity (62.3%-67.0%). Conclusions A zero-code, browser-based workflow using a dual-LLM ensemble achieves near-perfect sensitivity for abstract screening across multiple medical domains, with performance equivalent to API-based methods. Ensemble approaches spanning two model families may mitigate model-specific blind spots. Prospective studies should quantify workload, cost, and operational feasibility in end-to-end systematic review pipelines. INTRODUCTION Systematic reviews are the cornerstone of evidence-based medicine, forming the empirical basis for clinical guidelines, health policy, and critical treatment decisions [ 1 ]. However, their production is notoriously resource-intensive; a single review can require over a year of work [ 2 ] and cost more than €100,000 to complete [ 3 ]. A primary bottleneck in this process is title and abstract screening, which involves manually sifting through thousands of reports. The current gold standard, utilizing independent screening by two human reviewers [ 4 ], to maximize sensitivity and minimize bias [ 5 ] (especially important with less experienced reviewers [ 6 , 7 ]), is so demanding that it is often infeasible to implement fully. Consequently, many teams resort to single-reviewer screening, which increases the risk of erroneously excluding relevant studies and introducing reviewer-specific bias. Furthermore, to manage this workload, research teams sometimes apply search filters that risk excluding relevant but poorly indexed studies. In this context, sensitivity is the dominant screening metric because missed eligible studies can bias effect estimates and guideline recommendations. Large language models (LLMs) offer a promising solution to this screening bottleneck. While early investigations with older-generation LLMs demonstrated high sensitivity [ 8 , 9 ], these approaches often required complex prompt engineering, API access, and custom code, creating a significant technical barrier for many review teams. More recent state-of-the-art models can achieve high performance with simpler, zero-shot prompting strategies, even when processing titles and abstracts in large batches [ 10 ]. Despite these advances, critical knowledge gaps persist. It remains unclear whether the high performance of these models, while prompted with large batches of titles and abstracts, is generalizable across diverse clinical domains. Furthermore, the run-to-run reliability of these models when accessed via public graphical user interfaces (GUIs), the modality most accessible to non-technical reviewers, has not been systematically evaluated. To address these gaps, we evaluated a pragmatic, zero-code workflow for abstract screening. We deployed a two-model ensemble (OpenAI GPT-5 Thinking; Google Gemini 2.5 Pro) using simple, identical zero-shot prompts to screen the titles and abstracts from 16 diverse Cochrane reviews. The performance of this ensemble was validated against the final full-text inclusion/exclusion decisions made by the original human review teams. Our primary objective was to evaluate the sensitivity of the LLM ensemble operated via public web chat GUIs and to compare its performance with an equivalent API-based workflow. Secondary outcomes included specificity, the performance of each constituent model, and the inter-run reliability of both the GUI-based approach and the API approach. We hypothesized that this accessible, browser-based workflow would achieve screening sensitivity comparable to that of more technically demanding API-based methods, thereby validating a potentially more practical tool for support during evidence synthesis. METHODS Study Design and Objectives We conducted a retrospective, multi-domain validation study to evaluate the performance of large language models (LLMs) for systematic review abstract screening. The study compared two distinct operational modalities: 1) a no-code approach using web-based graphical user interfaces (GUIs), and 2) a programmatic approach using application programming interfaces (APIs). Our primary objective was to determine the screening sensitivity of a pre-specified two-model, two-run LLM ensemble. Secondary objectives were to evaluate the sensitivity and specificity of each individual model and to quantify the inter-run reliability (agreement between duplicate runs) for each model and modality. Analyses labeled pre-specified were defined before any model outputs were inspected; sensitivity analyses were exploratory. Data Source and Reference Standard The validation dataset was derived from 16 diverse systematic reviews [ 11 – 26 ] published in the Cochrane Library (Issue 6, May-June 2025). The Cochrane reference standard was chosen due to its rigorous methodology, which includes independent screening by two human reviewers (the current gold standard), with a third human reviewer for final adjudication in cases of disagreements. For each Cochrane review, we extracted all titles and abstracts that the original authors had classified as either “included” or “excluded” following full-text assessment, through their provided EndNote libraries. Studies with indeterminate status (“ongoing” or “awaiting classification”) were omitted. We then programmatically retrieved the corresponding titles and abstracts in EndNote. Any citation for which an abstract could not be automatically retrieved was excluded from the final analysis dataset, as an abstract is required for the LLM screening task. The final dataset comprised 736 unique studies that had undergone full-text review, of which 348 were included and 388 were excluded by the Cochrane authors. The number of studies included per review ranged from 0 to 101, and the number of excluded studies ranged from 0 to 39. LLM Models and Access Modalities We evaluated two state-of-the-art LLMs: OpenAI’s GPT-5 Thinking [ 27 ] and Google’s Gemini 2.5 Pro [ 28 ] due to their high performance in our previous study as well as other public benchmarks. For each model, we tested its performance via both its public web chat GUI and its public API. Google’s Gemini 2.5 Pro (API identifier gemini-2.5-pro) features a knowledge cutoff of January 2025 and an API input token limit of 1,048,576. In comparison, OpenAI’s GPT-5 Thinking (gpt-5-2025-08-07) has an October 2024 knowledge cutoff and a 400,000-input token limit. It is important to specify that these token limits are for the API versions; the corresponding limits for the GUI versions are unknown. We configured API settings to align with the default settings (Gemini: temperature = 1.0; OpenAI: reasoning effort = medium). For each run, we initiated a fresh chat/session with no prior context. Prompting and Screening Workflow We employed a zero-shot system prompt that instructed the LLM to function as an expert systematic reviewer screening titles and abstracts against eligibility criteria (see our previous publication for more details [ 10 ]). We extracted PICOS (Population, Intervention, Comparator, Outcomes, Study design) eligibility criteria from each Cochrane review’s methods section. The prompt directed the model to classify each citation as include, exclude, or full-text review (in cases of uncertainty), provide a single-sentence justification, and format the output as a machine-readable markdown table. Screening was performed by providing large batches of titles and abstracts to the models ( Table 1 ). For reviews with ≤100 records, we screened in one batch; for those with >100, we split them into two batches to remain under model token limits applicable in the GUI versions of each model. To handle occasional technical failures (i.e., non-completed output), the batch was run in a fresh session to avoid context contamination. View this table: View inline View popup Download powerpoint Table 1. The prompt level batch size (number of titles and abstracts per prompt) that was used for each systematic review [ 11 – 26 ]. Experimental Protocol and Ensemble Definition The entire dataset of 736 titles and abstracts was screened by each model four times: two independent, duplicate runs via the GUI and two duplicate runs via the API. This 2 models × 2 runs × 2 modalities design yielded paired duplicate runs for each model within each access condition. For the primary analysis, we defined a highly inclusive ensemble decision rule. Within each modality (GUI and API), the ensemble consisted of all four corresponding runs (two from GPT-5, two from Gemini 2.5 Pro). A citation was classified as positive if any of the four runs labeled it “Include” or “Full-text”. This “OR” logic was chosen to maximize overall sensitivity. Outcomes and Statistical Analysis The primary outcome was the sensitivity of the ensemble’s binary classification (include vs. exclude) relative to the reference standard. We mapped “include” and “full-text” to positive and “exclude” to negative for all primary and reliability analyses. Secondary outcomes included sensitivity and specificity for each individual model run. We computed exact (Clopper-Pearson) 95% CIs for proportions, used McNemar tests [ 29 ] for paired comparisons, and conducted a TOST equivalence test on sensitivity with a ±2-point margin [ 30 ]. Inter-run reliability for duplicate runs was quantified using Cohen’s κ coefficient [ 31 , 32 ], with 95% CIs generated via bootstrapping (2000 replicates). All analyses were conducted in R (version 4.5.1 [ 33 ]). Targeted Adjudication of Discrepancies To explore the nature of critical disagreements between the LLMs and the reference standard, we implemented a pre-specified targeted adjudication protocol. This analysis was designed to understand potential LLM failure modes as well as correcting potential reference standard decisions that were discordant with the eligibility criteria. A citation was flagged for adjudication if: 1) both duplicate runs of the two LLMs disagreed with the reference label, or 2) both duplicate runs of both models classified a record as “exclude”, while Cochrane also had labeled it as “exclude”. The second condition was used because every citation in our dataset was deemed relevant enough for full-text review by the original authors, making any consistent LLM-based exclusion a high-priority event to investigate. Two adjudicators, trained in the systematic review process, were provided with the flagged titles/abstracts and the review’s eligibility criteria. Disagreements were resolved by consensus. This adjudication was performed to characterize errors made by the LLMs as well as to better align the reference standard with the actual criteria of each original Cochrane review. The primary outcome results are presented with this improved reference standard and results vs. the original Cochrane EndNote libraries are provided with supplementary material. 18 records (2.3%) met adjudication criterion 1, and 23 (3.1%) met criterion 2. All 18 criterion 1 records were judged to be probable misclassifications in the original EndNote libraries when evaluated against the prespecified review criteria. Of the 23 criterion 2 records, which were initially sent for full-text assessment by Cochrane authors but later excluded, only one had an abstract sufficiently ambiguous that, based on title and abstract information alone, it should have been advanced to full-text review. On full-text assessment, this study was correctly excluded. Nonetheless, we cannot rule out that AI models may have been trained on the full-text content of this paper, because the article was openly accessible rather than behind a paywall. Overall, the discrepancies appear to stem primarily from library misclassification rather than screening errors. The final gold standard dataset after the adjudication process contained 330 records labeled as “include” and 406 “exclude”. The final gold standard was used for all comparisons. RESULTS Primary Outcome: Ensemble Performance The pre-specified two-model, two-run ensemble demonstrated high sensitivity in both operational modalities. In the code-free GUI condition, the ensemble identified 329 of 330 included studies, achieving 99.7% sensitivity (95% CI, 98.3%-100.0%) and 49.3% specificity (95% CI, 44.3%-54.2%) ( Figure 2 ). Performance in the programmatic API condition was nearly identical, yielding 99.1% sensitivity (95% CI, 97.4%-99.8%) by identifying 327 of 330 included studies, with the same specificity of 49.3% (95% CI, 44.3%-54.2%). McNemar testing indicated no significant difference (p = 0.625). The TOST procedure supported equivalence within ±2 percentage points (paired; p lower = 1.1×10⁻⁵, p upper = 0.011). As expected from the inclusive “OR” ensemble rule, false negatives were rare: the GUI ensemble missed 1/330 (0.3%) and the API ensemble 3/330 (0.9%). Download figure Open in new tab Figure 1. Study flow diagram showing the derivation of reference standard. Records progressed from initial Cochrane EndNote library classifications through targeted adjudication to create the final gold standard dataset. Numbers indicate studies at each stage with reason for classification changes. Download figure Open in new tab Figure 2. Performance of the four-run ensemble. Sensitivity and specificity of the pre-specified two-model, two-run ensemble using an “OR” decision rule across the full dataset (N=736 titles and abstracts). Error bars represent 95% Clopper-Pearson confidence intervals. Single-Run Ensemble Performance A pre-specified sensitivity analysis of a simpler ensemble (one run per model) showed that specificity increases with only a small decrease in sensitivity. Using run 1 for each model, the GUI single-run ensemble achieved 99.1% sensitivity (327/330) with 56.2% specificity (95% CI, 51.2%-61.0%), while the API single-run ensemble achieved 98.5% sensitivity (325/330) with 53.9% specificity (95% CI, 49.0%-58.9%). Single-Model Performance Analysis of individual runs revealed complementary profiles underlying the ensemble’s performance ( Figure 3 and Figure 4 ). Gemini 2.5 Pro consistently achieved higher sensitivity across both GUI and API modalities (94.5%-98.2% across runs), whereas GPT-5 Thinking consistently yielded higher specificity (65.0%-67.0%). Detailed per-run metrics are provided in the supplementary material. Download figure Open in new tab Figure 3. Individual model run sensitivity. Screening sensitivity for each of the eight individual runs across the full dataset (N=736). Error bars represent 95% Clopper-Pearson confidence intervals. Download figure Open in new tab Figure 4. Individual model run specificity. Screening specificity for each of the eight individual runs across the full dataset (N=736). Error bars represent 95% Clopper-Pearson confidence intervals. Duplicate-Run Reliability Both models demonstrated a high degree of stochastic consistency between duplicate runs. Inter-run reliability was substantial to almost perfect across all model-modality pairings, with Cohen’s kappa (κ) values ranging from 0.78 to 0.93 ( Figure 5 ). These findings indicate that for a given prompt, the models produce highly reliable classifications from one run to the next. Download figure Open in new tab Figure 5. Duplicate-Run Reliability. Inter-run reliability (Cohen’s κ) for duplicate screening runs (N=736). The dashed line at κ = 0.81 indicates the threshold for “almost perfect agreement”, at κ = 0.61 “substantial agreement”, and at κ = 0.41 ”moderate agreement” [ 31 ]. Error bars represent 95% bootstrap confidence intervals. Performance Across Clinical Domains While overall performance was robust for the ensemble, we observed some variability across the 16 different Cochrane reviews within individual runs of each model, suggesting certain clinical topics were more challenging for the models. For instance, GPT-5 Thinking single-run sensitivity was lowest (43%, 3/7 true positives) in systematic review 7 (Vaccines for preventing infections in adults with haematological malignancies [ 17 ]), suggesting model specific blind spots and the value of ensembling. Reviews with very few positives displayed wide uncertainty (i.e., some SRs had ≤3 included studies), as reflected in broad confidence intervals. Per-review performance data are available in supplementary material. DISCUSSION Our study makes two key contributions. First, we demonstrate the cross-domain generalizability of a zero-shot large batch title and abstract screening approach by testing it on a diverse corpus of 16 Cochrane reviews. The two-model LLM ensemble achieved near-perfect sensitivity (99.1%-99.7%) on this task. Second, we provide the first direct comparison of public web GUIs (99.7%) versus programmatic APIs (99.1%), confirming that comparable, high-sensitivity results can be achieved without programming expertise. As designed, our inclusive ensemble rule prioritized sensitivity at the expense of specificity, a trade-off confirmed by analysis of the constituent models; Gemini 2.5 Pro consistently provided higher sensitivity, while GPT-5 Thinking yielded higher specificity. These complementary profiles justify a cross-family ensemble. Importantly, we observed that individual models exhibited unique failure modes on certain reviews, underscoring the importance of the ensemble approach to mitigate model-specific biases and reduce the risk of idiosyncratic sensitivity failures. For example, a post-hoc examination of the most challenging review for the LLMs (vaccines in haematological populations [ 17 ]) provided a clear example of a model-specific failure mode. GPT-5 Thinking systematically excluded studies involving the general population, incorrectly assuming the target subgroup (patients with haematological malignancies) was absent. This represents an over-confident reasoning error. In contrast, Gemini 2.5 Pro demonstrated a more cautious approach, correctly identifying the ambiguity and flagging these same records for full-text assessment to confirm the presence of the subgroup. Prompt engineering techniques, such as using few-shot examples of erroneous exclusions, could be a viable method to steer GPT-5 towards a more cautious screening approach. While we cannot fully generalize this specific behavior from our limited samples, a cross-family ensemble provides an important safeguard. In this case, one model’s caution compensates for another’s confident error, directly mirroring the rationale for using dual human reviewers to mitigate individual human bias. Because performance was benchmarked against final full-text inclusion/exclusion decisions, this dataset represents a particularly challenging benchmark for abstract screening. Thus, the obtained specificities represent conservative lower bounds for real-world abstract-level screening. Furthermore, both models demonstrated substantial to almost perfect inter-run reliability (Cohen’s κ: 0.78-0.93), indicating a high degree of stochastic stability. The efficiency of LLM-based screening has important implications for the initial search strategy. To manage workload, review teams may apply methodological search filters that, while greatly reducing the volume of scientific papers, carry the risk of systematically excluding relevant studies that are poorly indexed. Because the LLM workflow is not constrained by human time in the same way, it can reduce the need for more aggressive filtering. Researchers can instead use broader, more sensitive search strategies and allow the LLM ensemble to perform the initial comprehensive triage in the broader list of studies. This capability can serve as an important protection layer, enhancing the completeness of a review by mitigating the risk of inadvertently omitting studies at the search stage. Additionally, exclusions made by the ensemble at the abstract level, save the time and cost associated with retrieving and assessing full-text articles that would have been excluded anyway. For systematic reviews, failing to identify an eligible study (a false negative) is typically the most critical error [ 4 ]. Previous studies utilizing a single human screener estimated overall human performance at 87-92% sensitivity (reported range 42-100% for individual reviewers) as a comparison [ 6 , 7 ], highlighting the risk of missing relevant studies with such approach. Our findings therefore validate a practical, zero-code workflow that rigorously minimizes this risk. This work opens the door for several models of human-AI collaboration. First, in the gold-standard dual-reviewer workflow, the LLM ensemble could act as a decision-support tool, providing a third opinion to resolve initial human disagreements and potentially reducing the burden on a senior adjudicator. Second, for resource-constrained teams unable to perform dual review, the LLM ensemble could serve as a robust second screener alongside a single human reviewer. This approach could significantly reduce the risk of error associated with single-reviewer screening while requiring only half the human effort of the traditional gold standard. Lastly, a semi-automated workflow could involve the LLM ensemble performing the initial screen of all studies. A human reviewer would then assess all records flagged for inclusion or full-text review, plus a small, random sample of the excluded studies to safeguard against systematic model bias (e.g., 5-10% or balanced with the number of inclusions). This approach enables monitoring of emerging failure modes and concept drift. Our finding that individual models had unique blind spots underscores the continued importance of a human-in-the-loop to ensure accountability. While LLMs can potentially assist at multiple stages of a systematic review [ 34 ], i.e., from search strategy formulation to data extraction and risk of bias assessment, title and abstract screening represents a more accessible entry point. This is because titles and abstracts are publicly available bibliographic data. In contrast, subsequent steps like data extraction or quality appraisal require processing of full-text content. Therefore, abstract screening might be a more pragmatic initial proof-of-concept for LLM adoption within the systematic review process. A central implication of our work is the validation of public web GUIs as a technically viable access point for high-sensitivity abstract screening. This finding greatly reduces the barrier to adoption for research teams that lack the specialized programming expertise or infrastructure required for API-based methods, thereby helping to democratize the use of AI in evidence synthesis. However, this accessibility is coupled with significant governance challenges. Public-facing models are subject to unannounced updates (“model drift”), which may compromise scientific reproducibility over time. The ensemble approach itself offers a degree of resilience against such challenges; by diversifying across two model families from different providers, the risk of a single silent update compromising an entire workflow is mitigated. To address these challenges further, a framework for responsible implementation might include: i) using fixed model versions or snapshots with documented run dates; ii) ensuring session isolation and disabling external tool access (e.g., browsing); iii) securing institutional data processing agreements; iv) maintaining comprehensive audit logs of all prompts and outputs; and v) conducting periodic performance checks against a frozen benchmark dataset to detect model drift. Furthermore, the human-computer interaction aspect is also crucial when evaluating the use of a GUI for this process. The accuracy of prompt engineering and data handling directly governs the validity of the LLM’s output. For example, simple user errors during manual data transfer, such as faulty copy-pasting, can lead to highly problematic or entirely incorrect outcomes. Therefore, implementing rigorous training protocols and safeguards is essential to support novices screening systematic reviews with this new methodology. To minimize such user-induced errors, the optimal long-term approach may be a fully automated, API-based pipeline featuring a purpose-built, simplified interface, which offers greater reliability than open-ended public LLM GUIs. A real-world implementation might therefore still need to trend towards enterprise-grade, privacy-preserving platforms (e.g., Google Vertex AI [ 35 ], OpenAI API platform [ 36 ]) that offer better versioning, security, and regulatory compliance. Another alternative would be to extend this validation to high-performing open-source LLMs, which offer the potential for local deployment. For government agencies, healthcare systems, and other organizations, a locally hosted ensemble would mitigate many of the governance challenges inherent to commercial cloud services. It would ensure stable, consistent performance by allowing teams to lock in a specific model version, thus avoiding performance drift. Furthermore, it provides a more robust approach for navigating the complex and evolving legal landscape surrounding data privacy and sovereignty. Beyond governance, local deployment could unlock the capability to fine-tune models using expert human adjudication decisions as training data, creating a feedback loop to correct systematic model errors and progressively enhance screening accuracy over time. Furthermore, local deployment would enable the use of LLMs for downstream systematic review tasks, such as data extraction and quality appraisal, which are currently unfeasible with third-party APIs due to further restrictions on processing full-text articles. Our findings should be interpreted in the context of several limitations. First, our reference standard was the final full-text inclusion decision, whereas the LLMs screened only titles and abstracts. This design choice robustly tests sensitivity but yields a conservatively low estimate of specificity. Many abstracts correctly excluded by the LLMs would also have been excluded at the abstract screening stage by human reviewers, but our methodology counts these as false positives against the full-text standard. Relatedly, our dataset comprised only studies that had already survived an initial human screening to be considered for full-text review, representing a set of uniquely challenging records. Real-world specificity on a completely unscreened search result set would therefore likely be higher, as demonstrated in our prior study [ 10 ]. Second, methodological constraints prevented perfect parity between the GUI and API environments. Stochastic settings like temperature could not be fully harmonized, and we could not rule out minor, unannounced vendor-side updates during the study period, although the high inter-run reliability suggests this had minimal impact. Third, our analysis did not measure operational factors such as cost, processing time (throughput), or user acceptability, which are critical for real-world implementation and should be assessed in prospective studies. Fourth, a potential for data contamination exists if the Cochrane reviews used for this validation were part of the models’ pre-training corpora, although Cochrane reviews published in May 2025 after AI model training mitigated this risk. Finally, while robust across 16 diverse reviews, our analysis may not capture all rare edge cases. Beyond augmenting the screening process itself, our findings suggest a novel application for LLMs in the foundational stages of systematic review protocol development. Traditionally, formulating PICOS criteria is an arduous process, and flaws may only become apparent after substantial human screening effort has been invested. LLMs could facilitate a paradigm of rapid, iterative protocol testing; a review team could draft PICOS criteria, conduct a preliminary search, and immediately deploy an LLM ensemble to see which studies are included. This provides immediate, concrete feedback on the criteria’s real-world performance, allowing for rapid refinement before committing to full-scale human review. Also, author teams could test database searches with and without search filters to see if there is a high risk of losing important studies when applying the filters. By lowering the barrier to iterative testing and sensitive search approaches, the quality of systematic reviews could be improved. This work also establishes the foundation for prospective, end-to-end integration studies. Future research should quantify the real-world impact of these workflows on reviewer workload, project timelines, and cost-effectiveness. Key technical questions remain, including the optimization of ensemble strategies and the development of risk-calibrated workflows that route only the most uncertain cases to human experts. Critically, the research community should try to agree on robust governance frameworks and reporting standards to guide the ethical and reproducible use of LLMs in evidence synthesis. CONCLUSION A zero-code, browser-based workflow using a dual-LLM ensemble can achieve near-perfect sensitivity for systematic review abstract screening across diverse medical topics, with performance statistically non-inferior to an API-based workflow. These findings validate an accessible approach that can augment review teams, support novel applications like iterative protocol refinement, and ultimately accelerate the pace of evidence synthesis. The use of a cross-family ensemble was shown to mitigate the risks of model-specific biases and maximize sensitivity. The successful integration of these tools will now depend on quantifying their operational benefits and establishing robust governance models, including the exploration of locally deployed open-source alternatives, that ensure security, privacy, and scientific reproducibility. Ethics approval and consent to participate Not applicable. Author contributions PF and OS had full access to all the data in the study and took responsibility for the integrity of the data and the data analysis. Concept and design: PF, OS and KVP. Acquisition, analysis, or interpretation of data: PF and OS. Drafting of the manuscript: PF, KVP and OS. Critical review of the manuscript for important intellectual content: All authors. Statistical analysis: PF and OS. Administrative, technical, or material support: AB, AN, TL, NB. Supervision: AB, AN, TL, NB. All authors read and approved the definitive version of the manuscript. Competing interests None. Data availability statement The datasets used and analysed in this study are available upon reasonable request. Funding statement The authors received no specific funding for this work. SUPPLEMENTARY MATERIAL Complete List of Cochrane Reviews Included as Reference Standard Material [ 11 – 26 ] SR1. Atypical antipsychotics for autism spectrum disorder: a network meta-analysis SR2. Sustained-release naltrexone for opioid dependence SR3. Yoga for fatigue in people with cancer SR4. Dietary Approaches to Stop Hypertension (DASH) for the primary and secondary prevention of cardiovascular diseases SR5. Stem cell treatment for acute myocardial infarction SR6. Post-incident debriefing for people with schizophrenia after coercive measures SR7. Vaccines for preventing infections in adults with haematological malignancies SR8. Exercise for patellar tendinopathy SR9. Fenestrated endovascular repair for abdominal aortic aneurysms SR10. Adjuvant epidermal growth factor receptor (EGFR) tyrosine kinase inhibitors (TKIs) for the treatment of people with resected stage I to III non-small-cell lung cancer and EGFR mutation SR11. Multifaceted behavioral interventions to improve topical glaucoma therapy adherence in adults SR12. Antiplatelet versus anticoagulation treatment for people with heart failure in sinus rhythm SR13. Aural toilet (ear cleaning) for chronic suppurative otitis media SR14. Community care navigation intervention for people who are at risk of unplanned hospital presentations SR15. Corticosteroids for treating sepsis in children and adults SR16. Antibiotics versus topical antiseptics for chronic suppurative otitis media LLM Performance vs. Corrected Cochrane EndNote library View this table: View inline View popup Download powerpoint Table S1. Overall Performance of All Model Variants. Summary of performance metrics, including true positives (TP), false negatives (FN), true negatives (TN), false positives (FP), sensitivity, and specificity. Confidence intervals (CI) are 95% Clopper-Pearson intervals. Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Supplementary Figure S1-S12: Per-Review Sensitivity for Each Variant. Bar charts showing the sensitivity for each of the 16 systematic reviews (SRs) for a specific model variant. Error bars represent 95% Clopper-Pearson confidence intervals. The number of positive records (n) for each SR is shown on the x-axis. LLM Performance vs. Original Cochrane EndNote library View this table: View inline View popup Download powerpoint Table S2. Overall Performance of All Model Variants. Summary of performance metrics, including true positives (TP), false negatives (FN), true negatives (TN), false positives (FP), sensitivity, and specificity. Confidence intervals (CI) are 95% Clopper-Pearson intervals. Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Supplementary Figure S13-S24: Per-Review Sensitivity for Each Variant. Bar charts showing the sensitivity for each of the 16 systematic reviews (SRs) for a specific model variant. Error bars represent 95% Clopper-Pearson confidence intervals. The number of positive records (n) for each SR is shown on the x-axis. Acknowledgements We thank the authors of the original Cochrane Reviews for their rigorous work and for making their EndNote libraries available. REFERENCES 1. ↵ Higgins JPT , Thomas J , Chandler J , Cumpston M , Li T , Page MJ , et al. Cochrane Handbook for Systematic Reviews of Interventions. Version 6.5 (updated August 2024) uppl: Cochrane; 2024 . 2. ↵ Borah R , Brown AW , Capers PL , Kaiser KA . Analysis of the time and workers needed to conduct systematic reviews of medical interventions using data from the PROSPERO registry . BMJ Open . 2017 ; 7 ( 2 ): e012545 . OpenUrl Abstract / FREE Full Text 3. ↵ Michelson M , Reuter K . The significant cost of systematic reviews and meta-analyses: A call for greater involvement of machine learning to assess the promise of clinical trials . Contemp Clin Trials Commun . 2019 ; 16 : 100443 . OpenUrl PubMed 4. ↵ Lefebvre C , Glanville J , Briscoe S , Featherstone R , Littlewood A , Metzendorf MI , et al. Chapter 4: Searching for and selecting studies . In: Higgins JPT , Thomas J , Chandler J , Cumpston M , Li T , Page MJ , Welch VA (editors) Cochrane Handbook for Systematic Reviews of Interventions version 651. 2025 . 5. ↵ Edwards P , Clarke M , DiGuiseppi C , Pratap S , Roberts I , Wentz R . Identification of randomized controlled trials in systematic reviews: accuracy and reliability of screening records . Stat Med . 2002 ; 21 ( 11 ): 1635 – 40 . OpenUrl CrossRef PubMed Web of Science 6. ↵ Waffenschmidt S , Knelangen M , Sieben W , Bühn S , Pieper D . Single screening versus conventional double screening for study selection in systematic reviews: a methodological systematic review . BMC Medical Research Methodology . 2019 ; 19 ( 1 ): 132 . OpenUrl PubMed 7. ↵ Gartlehner G , Affengruber L , Titscher V , Noel-Storr A , Dooley G , Ballarini N , et al. Single-reviewer abstract screening missed 13 percent of relevant studies: a crowd-based, randomized controlled trial . J Clin Epidemiol . 2020 ; 121 : 20 – 8 . OpenUrl CrossRef PubMed 8. ↵ Cao C , Sang J , Arora R , Chen D , Kloosterman R , Cecere M , et al. Development of Prompt Templates for Large Language Model-Driven Screening in Systematic Reviews . Ann Intern Med . 2025 ; 178 ( 3 ): 389 – 401 . OpenUrl PubMed 9. ↵ Sanghera R , Thirunavukarasu AJ , El Khoury M , O’Logbon J , Chen Y , Watt A , et al. High-performance automated abstract screening with large language model ensembles . Journal of the American Medical Informatics Association . 2025 ; 32 ( 5 ): 893 – 904 . OpenUrl PubMed 10. ↵ Fagerberg P , Sallander O , Vikhe Patil K , Berg A , Nyman A , Borg N , et al. Scaling the Prompt: How Batch Size Shapes Performance of Mid-2025 State-of-the-Art LLMs in Automated Title-and-Abstract Screening . medRxiv . 2025 :2025.09.02.25334159. 11. ↵ Meza N , Franco JVA , Sguassero Y , Núñez V , Escobar Liquitay CM , Rees R , et al. Atypical antipsychotics for autism spectrum disorder: a network meta-analysis . Cochrane Database of Systematic Reviews . 2025 ; ( 5 ). 12. Kornør H , Lobmaier PPK , Kunøe N . Sustained-release naltrexone for opioid dependence . Cochrane Database of Systematic Reviews . 2025 ; ( 5 ). 13. Messer S , Oeser A , Wagner C , Wender A , Cryns N , Scherer RW , et al. Yoga for fatigue in people with cancer . Cochrane Database of Systematic Reviews . 2025 ; ( 5 ). 14. Bensaaud A , Seery S , Gibson I , Jones J , Flaherty G , McEvoy JW , et al. Dietary Approaches to Stop Hypertension (DASH) for the primary and secondary prevention of cardiovascular diseases . Cochrane Database of Systematic Reviews . 2025 ; ( 5 ). 15. Zwetsloot PP , van der Naald M , Jones D , Reid A , Chamuleau S , Mathur A . Stem cell treatment for acute myocardial infarction . Cochrane Database of Systematic Reviews . 2025 ; ( 6 ). 16. Välimäki M , Varpula J , Lantta T . Post-incident debriefing for people with schizophrenia after coercive measures . Cochrane Database of Systematic Reviews . 2025 ; ( 5 ). 17. ↵ Zorger AM , Hirsch C , Baumann M , Feldmann M , Bröckelmann PJ , Mellinghoff S , et al. Vaccines for preventing infections in adults with haematological malignancies . Cochrane Database Syst Rev . 2025 ; 5 ( 5 ): Cd015530 . OpenUrl PubMed 18. Lopes AD , Rizzo RRN , Hespanhol L , Costa LOP , Kamper SJ . Exercise for patellar tendinopathy . Cochrane Database of Systematic Reviews . 2025 ; ( 5 ). 19. Daudu D , Cai PL , Srinivas A , Best LMJ , Cross J , Hammond CJ , et al. Fenestrated endovascular repair for abdominal aortic aneurysms . Cochrane Database of Systematic Reviews . 2025 ; ( 5 ). 20. Occhipinti M , Imbimbo M , Ferrara R , Simeon V , Fiscon G , Marchal C , et al. Adjuvant epidermal growth factor receptor (EGFR) tyrosine kinase inhibitors (TKIs) for the treatment of people with resected stage I to III non-small-cell lung cancer and EGFR mutation . Cochrane Database of Systematic Reviews . 2025 ; ( 5 ). 21. Kruoch Z , Amin P , Shelton E , Zimmerman AB , Stephey E , Hunter M , et al. Multifaceted behavioral interventions to improve topical glaucoma therapy adherence in adults . Cochrane Database of Systematic Reviews . 2025 ; ( 6 ). 22. Kozieł-Siołkowska M , Shantsila E , Shantsila A , Lip GYH . Antiplatelet versus anticoagulation treatment for people with heart failure in sinus rhythm . Cochrane Database of Systematic Reviews . 2025 ; ( 6 ). 23. Bhutta MF , Head K , Chong LY , Daw J , Schilder AGM , Brennan-Jones CG . Aural toilet (ear cleaning) for chronic suppurative otitis media . Cochrane Database of Systematic Reviews . 2025 ; ( 6 ). 24. Pang RK , Shannon B , Collyer T , Srikanth V , Andrew NE . Community care navigation intervention for people who are at risk of unplanned hospital presentations . Cochrane Database of Systematic Reviews . 2025 ; ( 6 ). 25. Annane D , Briegel J , Granton D , Bellissant E , Bollaert PE , Keh D , et al. Corticosteroids for treating sepsis in children and adults . Cochrane Database of Systematic Reviews . 2025 ; ( 6 ). 26. ↵ Head K , Chong LY , Bhutta MF , Daw J , Veselinović T , Morris PS , et al. Antibiotics versus topical antiseptics for chronic suppurative otitis media . Cochrane Database of Systematic Reviews . 2025 ; ( 6 ). 27. ↵ OpenAi. GPT-5 System Card. [Internet] 2025 2025/08/07 . Accessed from: https://openai.com/index/gpt-5-system-card/ 28. ↵ Google D. Gemini 2.5 Pro: Model Card. [Internet] 2025 . Accessed from: https://storage.googleapis.com/model-cards/documents/gemini-2.5-pro.pdf 29. ↵ Leon AC. 3.12 - Descriptive and Inferential Statistics . I: Bellack AS , Hersen M , red. Comprehensive Clinical Psychology . Oxford : Pergamon ; 1998 . s. 243 – 85 . 30. ↵ Lakens D . Equivalence Tests: A Practical Primer for t Tests, Correlations, and Meta-Analyses . Soc Psychol Personal Sci . 2017 ; 8 ( 4 ): 355 – 62 . OpenUrl CrossRef PubMed 31. ↵ McHugh ML . Interrater reliability: the kappa statistic . Biochem Med (Zagreb ). 2012 ; 22 ( 3 ): 276 – 82 . OpenUrl PubMed 32. ↵ Cohen J . A Coefficient of Agreement for Nominal Scales . Educational and Psychological Measurement . 1960 ; 20 ( 1 ): 37 – 46 . OpenUrl CrossRef Web of Science 33. ↵ Team RC . R: A Language and Environment for Statistical Computing . Vienna, Austria : R Foundation for Statistical Computing ; 2017 . 34. ↵ Lieberum JL , Toews M , Metzendorf MI , Heilmeyer F , Siemens W , Haverkamp C , et al. Large language models for conducting systematic reviews: on the rise, but not yet ready for use-a scoping review . J Clin Epidemiol . 2025 ; 181 : 111746 . OpenUrl CrossRef PubMed 35. ↵ Google. Vertex AI. [Internet] 2025 . Accessed from: https://cloud.google.com/vertex-ai 36. ↵ OpenAi. OpenAI API. [Internet] 2025 . Accessed from: https://openai.com/api/ View the discussion thread. Back to top Previous Next Posted November 06, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Dual-Model LLM Ensemble via Web Chat Interfaces Reaches Near-Perfect Sensitivity for Systematic-Review Screening: A Multi-Domain Validation with Equivalence to API Access Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Dual-Model LLM Ensemble via Web Chat Interfaces Reaches Near-Perfect Sensitivity for Systematic-Review Screening: A Multi-Domain Validation with Equivalence to API Access Petter Fagerberg , Oscar Sallander , Kim Vikhe Patil , Anders Berg , Anastasia Nyman , Natalia Borg , Thomas Lindén medRxiv 2025.11.03.25339455; doi: https://doi.org/10.1101/2025.11.03.25339455 Share This Article: Copy Citation Tools Dual-Model LLM Ensemble via Web Chat Interfaces Reaches Near-Perfect Sensitivity for Systematic-Review Screening: A Multi-Domain Validation with Equivalence to API Access Petter Fagerberg , Oscar Sallander , Kim Vikhe Patil , Anders Berg , Anastasia Nyman , Natalia Borg , Thomas Lindén medRxiv 2025.11.03.25339455; doi: https://doi.org/10.1101/2025.11.03.25339455 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4421) Dentistry and Oral Medicine (443) Dermatology (381) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15212) Forensic Medicine (30) Gastroenterology (1121) Genetic and Genomic Medicine (6581) Geriatric Medicine (667) Health Economics (996) Health Informatics (4520) Health Policy (1366) Health Systems and Quality Improvement (1611) Hematology (539) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15906) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (667) Neurology (6580) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1141) Occupational and Environmental Health (956) Oncology (3324) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5431) Public and Global Health (9212) Radiology and Imaging (2193) Rehabilitation Medicine and Physical Therapy (1368) Respiratory Medicine (1194) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ff168101b9658f4',t:'MTc3OTM0MzcwNQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00