Full text
48,070 characters
· extracted from
preprint-html
· click to expand
Assessing the feasibility and acceptability of a bespoke large language model pipeline to extract data from different study designs for public health evidence reviews | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Assessing the feasibility and acceptability of a bespoke large language model pipeline to extract data from different study designs for public health evidence reviews View ORCID Profile Zalaya Simmons , View ORCID Profile Beti Evans , View ORCID Profile Tamsyn Harris , Harry Woolnough , Lauren Dunn , Jonathon Fuller , Kerry Cella , View ORCID Profile Daphne Duval doi: https://doi.org/10.1101/2025.07.21.25331917 Zalaya Simmons a Research, Evidence and Knowledge Division , UK Health 
Security Agency, London, E14 5EA, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zalaya Simmons Beti Evans a Research, Evidence and Knowledge Division , UK Health Security Agency, London, E14 5EA, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Beti Evans For correspondence: science.evidence{at}ukhsa.gov.uk Tamsyn Harris b All Hazards Public Health Response , UKHSA, London E14 5EA, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tamsyn Harris Harry Woolnough c Chief Data Officer group , UK Health Security Agency, London E14 5EA, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lauren Dunn c Chief Data Officer group , UK Health Security Agency, London E14 5EA, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jonathon Fuller c Chief Data Officer group , UK Health Security Agency, London E14 5EA, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kerry Cella c Chief Data Officer group , UK Health Security Agency, London E14 5EA, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Daphne Duval a Research, Evidence and Knowledge Division , UK Health Security Agency, London, E14 5EA, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Daphne Duval Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Introduction Data extraction is a critical but resource-intensive step of the evidence review process. 
Whilst there is evidence that artificial intelligence (AI) and large language models (LLMs) can improve the efficiency of data extraction from randomised controlled trials, their potential for other study designs is unclear. In this context, this study aimed to evaluate the performance of a bespoke LLM model pipeline (Retrieval-Augmented Generation pipeline utilising LLaMa 3-70B) to automate data extraction from a range of study designs by assessing the accuracy, reliability and acceptability of the extractions. Methods Accuracy was assessed by comparing the LLM outputs for 173 data fields with data extracted from a sample of 24 articles (including experimental, observational, qualitative, and modelling studies) from a previously conducted review, of which 3 were used for prompt engineering. Reliability (consistency) was assessed by calculating the mean maximum agreement rate (the highest proportion of identical returns from 10 consecutive extractions) for 116 data fields from 16 of the 24 studies. Acceptability of the accuracy and reliability outputs for each data field was assessed on whether it would be usable in real-world settings if the model acted as one reviewer and a human as a second reviewer. Results Of the 173 data fields evaluated for accuracy, 68% were rated by human reviewers as acceptable (consistent with what is deemed to be acceptable data extraction from a human reviewer). However, acceptability ratings varied depending on the data field extracted (33% to 100%), with at least 90% acceptability for ‘objective’, ‘setting’, and ‘study design’, but 54% or less for data fields such as ‘outcome’ and ‘time period’. For reliability, the mean maximum agreement rate was 0.71 (SD: 0.28), with variation across different data fields. 
Conclusion This evaluation demonstrates the potential for LLMs, when paired with human quality assurance, to support data extraction in evidence reviews that include a range of study designs, however further improvements in performance and validation are required before the model can be introduced into review workflows. 1. Introduction The advancement of artificial intelligence (AI) offers new opportunities to improve the efficiency and scalability of evidence reviews by accelerating and automating steps in the review process ( 1 ). Traditional machine learning (ML) approaches, which involve training algorithms on structured and task-specific datasets, may already be integrated into the evidence review process through several methods, such as the use of machine learning algorithms to prioritise relevant articles in the title and abstract screening process ( 2 , 3 ). The emergence of generative large language models (LLMs), such as GPT, Claude and LLaMa, has introduced further opportunities for automation of steps in the evidence review process. LLMs are pre-trained on large quantities of unstructured text which enables them to follow instructions in natural language and be applied more flexibly across a wider range of tasks ( 4 ). This can include extracting textual and numerical data from full text articles ( 5 ) or drafting text for reports ( 6 ). Generative LLMs have shown promising performance for some steps of the evidence review process, such as screening ( 7 ) and data extraction ( 8 – 10 ). However, results have been less promising for other steps such as search strategy generation ( 11 ), or risk of bias assessments ( 12 ). Additional limitations such as hallucinating content, misinterpreting or oversimplifying nuanced data, and generating outputs which can be difficult to validate, raise concerns around transparency, reproducibility and the integrity of outputs produced by generative LLMs ( 1 ). 
Of the steps in the evidence review process that have potential for AI integration, data extraction can be particularly resource and time-intensive, and requires a high degree of accuracy to uphold validity ( 13 ). There is the potential to use LLMs alongside human reviewers to semi-automate this task to improve efficiency. However, while LLMs have shown promising results when extracting data from randomised controlled trials (RCTs) ( 8 – 10 , 14 ), their ability to extract data from a wider range of study designs, including observational research, is unclear. This is particularly important in public health, where evidence is often synthesised from diverse study designs to inform advice, guidance, policy and practice. To address this gap, data scientists and evidence reviewers at UKHSA co-developed a bespoke LLM pipeline for semi-automated data extraction for evidence reviews (evidence synthesis outputs following systematic methodologies), integrated within UKHSA infrastructure and designed to work with human oversight. The aim of this feasibility study was to evaluate the performance of a bespoke LLM pipeline designed to automate data extraction in evidence reviews and assess the accuracy, reliability and acceptability of outputs to human reviewers. 2. Methods A Retrieval-Augmented Generation (RAG) pipeline utilising a LLM was developed to automate data extraction (see Figure 1 ). The LLM used was LLaMa 3-70B. Download figure Open in new tab Figure 1. RAG Pipeline 2.1 Reference standard LLM data extraction outputs were compared against data that had been extracted and quality assured by 2 reviewers as part of a previously published rapid mapping review ( 15 , 16 ). The rapid mapping review aimed to identify and categorise the evidence generated during the COVID-19 pandemic on the effectiveness of non-pharmaceutical interventions (NPIs) implemented in community settings in the UK. 
A convenience sample of 24 open access studies was selected from the 151 studies included in the review, ensuring the sample contained examples of the different study designs included (2 RCTs, 5 prospective longitudinal, 3 cross-sectional, 4 ecological, 4 mixed-methods, 3 qualitative, and 3 modelling studies) ( 17 – 40 ). This was to ensure that the model’s performance in extracting data from different study designs that are regularly encountered when reviewing evidence relevant to public health could be evaluated. PDF copies of the articles were accessed within the organisation’s secure infrastructure, with no data used to train the LLM, no third party access permitted, and no data leaving the internal environment. 2.2 Data fields extracted The LLM extracted a total of 173 data fields from 24 full text articles, which corresponded to those that had been extracted by human reviewers for the published rapid mapping review ( 15 , 16 ). The data fields extracted varied by study design. Seven data fields were extracted for each study design except for RCTs and modelling studies for which 8 data fields were extracted (see Table 1 for details of data fields extracted). View this table: View inline View popup Table 1 Data fields extracted and prompts used 2.3 Pipeline development and prompt engineering A RAG pipeline utilising an LLM (LLaMa 3-70B) was developed to automate data extraction ( Figure 1 ). PDF copies of the articles for data extraction were inputted into the pipeline. Text from each PDF was tokenized (broken down into smaller units [tokens]), split into chunks and stored in a vector database (a type of database that stores text as numbers based on meaning, so similar pieces of text can be found quickly) using Python and the Langchain framework. The chunking of data is necessary to enable processing within the context window, which equates to the maximum amount of data that can be passed to the LLM at one time. 
These chunks of text are then turned into numbers (called vectors) that represent their meaning so that the model can understand and compare them. This permits the database to be queried to select the chunks most relevant to the user’s query. The chunks were then selected for each data field, from which the LLM extracted and summarised the information using prompting. The summarised information was outputted into an Excel file (see example in supplementary material Table S1). Prompts for each data field extracted were developed in an iterative and collaborative process between evidence reviewers and data scientists using 4 studies ( 17 , 36 , 39 , 41 ), 3 of which were included in the evaluation ( 17 , 36 , 39 ). For the purpose of generating consistent responses with a rich vocabulary, the temperature setting of the model (which controls randomness) was set very low (0.01) to reduce randomness and ensure consistency, and the top_p setting (which controls how much of the vocabulary was considered) was set high (0.99). Table 1 shows the prompts used in the LLM and examples of expected extracted content for each data field. See supplementary material Table S2 for system message and prompt for the overall task. 2.4 Evaluation An evaluation framework was developed to assess the accuracy, reliability, and acceptability of the LLM’s outputs for each data field based on error classification used in a published evaluation ( 8 ). Using these frameworks, 2 reviewers (TH, BE) independently assessed the outputs for each data field according to the pre-defined error and acceptability criteria ( Tables 2 and 3 ). Assessments were then agreed by consensus, with a third reviewer (ZS) present to resolve disagreements and record decisions. View this table: View inline View popup Download powerpoint Table 2. Accuracy evaluation framework View this table: View inline View popup Download powerpoint Table 3. 
Reliability evaluation framework Accuracy Accuracy of the LLM’s output for each data extraction field for the 24 studies was assessed according to the evaluation framework ( Table 2 ). Each data field was assigned one error type (no error, major, minor, missing data, not stated in article, or hallucination). The LLM outputs were compared to the data extracted by human reviewers for the published mapping review, defining accuracy as the proportion of extractions with no errors. Reliability Reliability (consistency) of outputs produced by the LLM was assessed by running 10 extractions consecutively using identical prompts. The reference standard for reliability was that the LLM was expected to return identical values in the same format. The group of 10 extractions for each data field was evaluated against pre-defined error categories (no error, value error, voice error, or value not returned) ( Table 3 ). Each data field could be designated as having both a value and voice error, or one error category only. Reliability was assessed for each data field for 16 studies (2 RCTs, 3 prospective longitudinal, 1 cross-sectional, 4 ecological, 2 mixed-methods, 2 qualitative, 2 modelling) ( 17 – 19 , 21 – 28 , 31 , 33 , 34 , 36 , 39 ). Acceptability Acceptability of both accuracy and reliability outputs produced by the LLM was assessed against whether the output was useable, and whether the level of variation would be accepted in a human reviewer’s work ( Tables 2 and 3 ). Errors could be acceptable if they were consistent with the types of errors a human reviewer might reasonably make (such as minor differences in wording), and did not affect interpretation of the study findings. Errors were considered unacceptable if they risked misinterpretation of the study characteristics or findings. 
Analysis Results were analysed in Python 3.11 using the Pandas library for data wrangling (also called data cleaning or preprocessing) and the Matplotlib and Plotly libraries for visualisation. Distribution of error types and the acceptability of identified errors were analysed for extractions both overall and disaggregated by data field and study design. Maximum agreement rate (the highest proportion of the 10 extractions that returned identical extractions) was used as a measure of reliability. 3. Results 3.1 Accuracy The accuracy of 173 data fields extracted from 24 full text articles was assessed. Findings are presented in Table 4 and Figure 2 . Of the 173 data fields extracted, 42% of outputs were accurate (no error), 28% had minor errors, 21% had major errors, 4.6% were categorised as missing data, 3.5% as data not stated in article, and 1.2% of errors were categorised as hallucinations. Overall, 68% of outputs returned were designated acceptable (consistent with what is deemed to be acceptable data extraction from a human reviewer). Download figure Open in new tab Figure 2. Distribution of accuracy error types by data field View this table: View inline View popup Download powerpoint Table 4. Accuracy errors and acceptability by data field (total number of extractions = 173) However, findings varied across data fields, with acceptability ranging from 33% to 100% ( Table 4 ). For data fields where at least 21 studies were evaluated, the proportion of data fields with no error ranged from 4.2% for ‘outcome’ to 79% for ‘objective’, whilst the proportion with a major error ranged from 0 for ‘study design’ to 46% for ‘NPI’. At least 90% of the outputs for data fields ‘objective’, ‘setting’, and ‘study design’ were designated acceptable by human reviewers (with a high no error rate of more than 70%). The data fields ‘NPI’ and ‘outcome’ were designated acceptable for 50% or less of extractions. 
The data field ‘model’ (for modelling studies only) had a correct extraction (no error) rate and acceptability of 100%; however, this was only assessed in 3 studies. Accuracy acceptability findings also varied by study design, from 56% overall for RCTs (2 studies) to 86% for cross-sectional studies (3 studies) (supplementary material Table S3). Interestingly, the acceptability of the outputs for the data fields ‘objective’ and ‘setting’ was 100% for all study designs, except for mixed methods studies (4 studies, 75% for both data fields) and for qualitative studies (3 studies, 67% for both data fields). However, the number of studies in each study design category was too low to allow conclusions about differential model performance by study design to be drawn. 3.2 Reliability The reliability of 116 data fields extracted from 16 full text articles was assessed. The model’s reliability (consistency) varied across different data fields. The mean maximum agreement rate across all data fields was 0.71 (SD: 0.28), showing that on average 7.1 of the 10 consecutive extractions returned the same value for a given data field ( Table 5 ). The data fields ‘study design’, ‘setting’, and ‘objective’ showed higher consistency, with mean maximum agreement rates above 0.85. Lower agreement rates were observed for fields such as ‘outcome’ (mean: 0.55), and ‘participants/population’ (mean: 0.61). View this table: View inline View popup Download powerpoint Table 5. Reliability evaluation maximum agreement rate For 47 of the 116 data fields extracted (41%), the 10 extractions were consistent (‘no error’). In the remaining 59% of data fields, inconsistency was either due to value errors (differences in the information returned) or voice errors (variation in how the same information was phrased or formatted), or both (see Figure 3 and supplementary Table S4 for details of error types and their acceptability). Download figure Open in new tab Figure 3. 
Distribution of reliability error types by data field 3.3 Reviewer observations of the LLM-generated extractions Reviewers shared a number of qualitative observations related to the model-generated extractions and model behaviour. A common theme was that the model tended to be verbose, with broad and unfocused responses which gave the impression of returning all potentially relevant content from the source article, rather than summarising contextually relevant information. One reviewer described this as “responding with everything it could find, rather than answering the question”. Recurring issues were noted across specific data fields, such as ‘time period’, ‘outcomes’ and ‘NPIs’. For example, for ‘time period’, within the 46% of unacceptable extractions, the model often returned metadata such as the manuscript submission or acceptance dates, rather than the actual study period. For ‘outcomes’ and ‘NPIs’ (that were rated as not acceptable for at least 50% of extractions), the model sometimes extracted content from the introduction, discussion, or title, which often referenced broader or future directions, rather than the specific interventions or outcomes assessed in the study, suggesting a potential lack of contextual prioritisation. Reviewers also noted that prompt design appeared to influence the model’s behaviour and outputs in unintended ways. For example, the prompt for the ‘participant/population’ data field included an instruction to describe any relevant subgroups. In response, the model sometimes inferred subgroups even when none were described in the study, suggesting that the wording of prompts may have encouraged overinterpretation rather than strict extraction. Despite these observations, reviewers noted the potential benefits of using the model as one reviewer whilst keeping a human in the loop as second reviewer. 
For instance, in a rapid review context, the model could act as a first reviewer and the human reviewer as a second reviewer, checking the extraction of the model. In a systematic review context, extraction by both the model and the human would be done independently and then compared to reach consensus. However, the length and complexity of outputs may limit the value. As one reviewer remarked, “it would be just as easy to read the original article”. 4. Discussion This study explored the feasibility of using a bespoke RAG-LLM technical pipeline using LLaMA-3 to complete data extraction for evidence reviews in the public health context, with the intention that outputs would be quality assured by a second human reviewer ( Figure 4 ). When extracting data from full-text articles for a range of data fields, the model returned accurate outputs (no error) for only 42% of data fields, whilst a further 28% had only minor errors. In terms of acceptability, 68% of the outputs assessed for accuracy were deemed acceptable, meaning that it would be deemed to be acceptable data extraction from a human reviewer and therefore usable in a real-world evidence review. Accuracy and reliability varied by data field, similar to previous observations by others ( 42 ). The model demonstrated strong performance for certain data fields (‘study objective’, ‘study design’ and ‘setting’), where acceptability of accuracy extractions was at least 90%, and reliability (consistency) was highest (mean maximum agreement rate of at least 0.86). However, other data fields such as ‘outcomes’, ‘participants/population’ and ‘time period’, proved more difficult for the model to extract accurately (accuracy 4.2% to 29%, accuracy acceptability 50% to 64%) and reliably (mean maximum agreement rates 0.52 to 0.69), possibly due to varied expression of data across studies and the reporting format, such as tables, or its location ( 43 ). Download figure Open in new tab Figure 4. 
Workflow for prompt development, model extraction, and human review Direct comparison between the findings of this evaluation and other published evaluations of LLMs’ data extraction, which report accuracy of between 68% and 96% ( 8 – 10 , 14 , 42 ), is not possible due to differences in the study designs and data fields extracted, and in the LLMs and evaluation frameworks used. In particular, the published evaluations tend to focus on extraction of data from RCTs of clinical evidence, therefore mainly numerical data, and following standardised guidance for reporting ( 8 – 10 , 14 ). In contrast, this study evaluated the model’s performance across a range of study designs assessing public health evidence which tends to be more wordy and less structured than RCTs of clinical evidence. In this context, it is worth noting that the data extraction for the 2 RCTs included in this evaluation had the lowest acceptability of all the study designs included, which may reflect the complexity of the interventions and comparisons (both evaluated test and release strategies which involve multi-step conditional interventions) ( 36 , 39 ). The range of study designs in this evaluation, along with variation in how data is reported across full texts, contributed to an increased complexity of the task but is reflective of the real-world challenges in evidence reviews in public health. The results may reflect not only the complexity of the data fields themselves, but also the performance of the underlying model and the effectiveness of the prompt engineering approach. Performance of different LLMs varies ( 43 , 44 ). Llama-3 has previously been identified as higher-performing for classification or data extraction-related tasks in the public health context ( 43 ), which informed the choice of Llama-3 for this bespoke pipeline. 
The effectiveness of prompt engineering also likely influenced performance ( 45 , 46 ), particularly the ability of the model to extract nuanced information across data fields and from different study designs. The reliability results raise an important question around the validity of deriving accuracy from an evaluation of a single run, which may be inadequate to evaluate model performance. This evaluation, like others ( 42 ), showed that generative AI models can produce different outputs from the same input, which complicates the interpretation of accuracy metrics. Setting the temperature close to but not at zero may have resulted in some of the variation seen, although variation has been observed with the temperature set at zero ( 42 ). Criteria used to evaluate accuracy and reliability of LLM extractions differ in published evaluations ( 8 – 10 , 14 , 42 ). The evaluation framework used in this study was composed of a structured accuracy and reliability error classification system, and reviewer-informed acceptability judgement. Considering acceptability facilitated a more nuanced and practical assessment of model performance than binary accuracy scoring alone, by evaluating the output in the context of what a human reviewer might reasonably extract. This layered evaluation approach reflects how these tools might be used in real-world scenarios where human oversight is retained, and was aligned with the potential integration of this pipeline into rapid systematic methodologies, where a human reviewer would quality assure all model-generated extractions. However, a clearer understanding of what constitutes acceptable accuracy and reliability of LLM extractions for integration into evidence review processes is needed. Establishing what minimum threshold of acceptability is needed for the adoption of LLMs remains an open question for future research. 
The development of standards to assess consistency, potentially through averaged outputs, confidence thresholds, or stability metrics, will be important for the responsible adoption of AI-assisted tools in evidence review methods. Limitations of this work Conclusions about differential model performance by study design cannot be drawn due to the small number of studies representing each study design within the sample. Three of the 24 studies were used during prompt development, which may have introduced an unknown degree of bias in model performance during evaluation. These studies were retained as part of the evaluation, which prioritised learning about feasibility and model behaviour over a strict separation of the data used to develop the prompts and pipeline from the data used to evaluate the model. All data fields were weighted equally and did not account for relative differences in complexity or the potential influence of certain data fields on the interpretation of the review findings. A single set of prompts (tailored to studies included in this evaluation) and a single dataset were used, so further testing is needed to understand how performance might vary across review types, prompt phrasing, or model configurations. As this evaluation compared LLM data extraction with extraction previously completed by humans, data was not available to compare the time impact of LLM and human data extraction. Next steps With continued close collaboration between data scientists and evidence reviewers, the technical pipeline will be refined and reassessed to improve performance by exploring adjustments to the current RAG-based approach, and by testing alternative prompting methods that do not rely on retrieval components. Consideration will be given to which type of reviews, study designs, and data fields the pipeline is most appropriate and valuable for. Time savings and resource impact will also need to be captured in future testing. 
It will be important to consider the principles and practicalities of responsible AI integration and develop appropriate guidelines for its use. This includes ensuring transparency in how technical pipelines and the models embedded within them are developed, evaluated, and deployed. Users will need to maintain transparent documentation of prompt engineering, model parameters, and versioning, and establish clear decision thresholds and audit trails to support reproducibility and accountability. There also needs to be a consensus-based decision about acceptable performance thresholds and risk. These considerations are aligned with guidance on responsible AI use in evidence synthesis, currently in development ( 47 ), which highlights the importance of fairness, robustness, and transparency in the application of AI in evidence reviews. 5. Conclusion This evaluation demonstrates the potential for LLMs, when paired with human quality assurance, to support data extraction in evidence reviews that include a range of study designs; however, further improvements in performance are required before the model can be introduced into review workflows. While overall accuracy was modest, acceptability of outputs, defined as their practical usability by reviewers, was higher, showing potential for real-world application. Performance varied across data fields, and reliability issues highlighted the limitations of single run evaluations. However, despite overall variability, the model performed well for the data fields ‘objective’, ‘setting’, and ‘study design’. The layered evaluation framework, which considered acceptability as well as accuracy and reliability, offered a more nuanced measure of how these tools might function in evidence review contexts, and the co-development process between data scientists and evidence reviewers was important in aligning technical design with practical needs. 
However, further testing across review types, refined prompting strategies, and clearer thresholds for acceptable model performance are needed. Future work must also address the principles of responsible AI integration, including transparency, reproducibility, and appropriate risk-benefit trade-offs. With these considerations in place, semi-automated data extraction using LLMs could play a valuable role in improving the efficiency of public health evidence reviews. Data Availability All data produced are available in supplementary files 2 and 3. Disclaimer The views expressed in this article are those of the authors and are not necessarily those of UK Health Security Agency or the Department of Health and Social Care. Acknowledgements We acknowledge the support of colleagues in UKHSA Chief Data Officer Group (Chris Moyle, Josh Harris, Luke Hounsome, Timothy Laurence, Leo Loman), in UKHSA Knowledge and Library Services, and in UKHSA Evidence Network. Footnotes ↵ * joint first authors References 1. ↵ Siemens W , von Elm E , Binder H , et al. Opportunities, challenges and risks of using artificial intelligence for evidence synthesis . BMJ Evidence-Based Medicine . 2025 :bmjebm-2024-113320. 2. ↵ Waffenschmidt S , Sieben W , Jakubeit T , et al. Increasing the efficiency of study selection for systematic reviews using prioritization tools and a single-screening approach . Systematic Reviews . 2023 ; 12 ( 1 ): 161 . OpenUrl PubMed 3. ↵ Burgard T , Bittermann A . Reducing literature screening workload with machine learning: A systematic review of tools and their performance . Zeitschrift für Psychologie . 2023 ; 231 ( 1 ): 3 – 15 . OpenUrl 4. ↵ Castro A , Pinto J , Reino L , et al. Large language models overcome the challenges of unstructured text data in ecology . Ecological Informatics . 2024 ; 82 : 102742 . 5. ↵ Khan N , Zahid K , Anis K , et al. Global insights and the impact of generative AI-ChatGPT on multidisciplinary: a systematic review and bibliometric analysis . 
Connection Science . 2024 ; 36 ( 1 ): 2353630 . OpenUrl 6. ↵ Luo J . A critical review of GenAI policies in higher education assessment: a call to reconsider the “originality” of students’ work . Assessment & Evaluation in Higher Education . 2024 ; 49 ( 5 ): 651 – 64 . OpenUrl 7. ↵ Clark J , Barton B , Albarqouni L , et al. Generative artificial intelligence use in evidence synthesis: A systematic review . Research Synthesis Methods . 2025 : 1 – 19 . 8. ↵ Gartlehner G , Kahwati L , Hilscher R , et al. Data extraction for evidence synthesis using a large language model: A proof-of-concept study . Research Synthesis Methods . 2024 ; 15 ( 4 ): 576 – 89 . OpenUrl PubMed 9. Konet A , Thomas I , Gartlehner G , et al. Performance of two large language models for data extraction in evidence synthesis . Research Synthesis Methods . 2024 ; 15 ( 5 ): 818 – 24 . OpenUrl PubMed 10. ↵ Motzfeldt Jensen M , Brix Danielsen M , Riis J , et al. ChatGPT-4o can serve as the second rater for data extraction in systematic reviews . PLoS ONE . 2025 ; 20 ( 1 ): e0313401 . OpenUrl PubMed 11. ↵ Adam GP , DeYoung J , Paul A , et al. Literature search sandbox: a large language model that generates search queries for systematic reviews . JAMIA Open . 2024 ; 7 ( 3 ): ooae098 . OpenUrl 12. ↵ Barsby J , Hume S , Lemmey HAL , et al. Pilot study on large language models for risk-of-bias assessments in systematic reviews: A(I) new type of bias? BMJ Evidence-Based Medicine . 2025 ; 30 ( 1 ): 71 . OpenUrl 13. ↵ Horton J , Vandermeer B , Hartling L , et al. Systematic review data extraction: cross-sectional study showed that experience did not increase accuracy . Journal of Clinical Epidemiology . 2010 ; 63 ( 3 ): 289 – 98 . OpenUrl CrossRef PubMed 14. ↵ Sun Z , Zhang R , Doi SA , et al. How good are large language models for automated data extraction from randomized trials? medRxiv . 2024 :2024.02.20.24303083. 15. ↵ UK Health Security Agency . 
Effectiveness of non-pharmaceutical interventions to reduce transmission of COVID-19 in the UK: a rapid mapping review . 2023 . 16. ↵ Duval D , Evans B , Sanders A , et al. Non-pharmaceutical interventions to reduce COVID-19 transmission in the UK: a rapid mapping review and interactive evidence gap map . Journal of Public Health . 2024 ; 46 ( 2 ): e279 – e93 . OpenUrl PubMed 17. ↵ Abernethy GM , Glass DH . Optimal COVID-19 lockdown strategies in an age-structured SEIR model of Northern Ireland . Journal of the Royal Society Interface . 2022 ; 19 ( 188 ): 20210896 . OpenUrl PubMed 18. Aggarwal D , Page Andrew J , Schaefer U , et al. An integrated analysis of contact tracing and genomics to assess the efficacy of travel restrictions on SARS-CoV-2 introduction and transmission in England from June to September , 2020 . MedRxiv. 2021:2021.03.15.21253590 19. ↵ Almagor J , Picascia S . Exploring the effectiveness of a COVID-19 contact tracing app using an agent-based model . Scientific Reports . 2020 ; 10 ( 1 ): 22235 . OpenUrl PubMed 20. Arnold KF , Gilthorpe MS , Alwan NA , et al. Estimating the effects of lockdown timing on COVID-19 cases and deaths in England: A counterfactual modelling study . PLoS ONE . 2022 ; 17 ( 4 ): e0263432 . OpenUrl CrossRef PubMed 21. ↵ Bernal JL , Sinnathamby MA , Elgohari S , et al. The impact of social and physical distancing measures on COVID-19 activity in England: findings from a multi-tiered surveillance system . Euro surveillance : bulletin Europeen sur les maladies transmissibles = European communicable disease bulletin . 2021 ; 26 ( 11 ): 2001062 . OpenUrl PubMed 22. Blackmore C , Czachorowski M , Farrington E , et al. Testing for COVID-19 during an outbreak within a large UK prison: an evaluation of mass testing to inform outbreak control . International Journal of Infectious Diseases . 2022 ; 125 : 138 – 44 . OpenUrl PubMed 23. Blake H , Corner J , Cirelli C , et al. 
Perceptions and experiences of the university of nottingham pilot sars-cov-2 asymptomatic testing service: a mixed-methods study . International Journal of Environmental Research and Public Health . 2021 ; 18 : 1 – 26 . OpenUrl 24. Blake H , Knight H , Jia R , et al. Students’ views towards sars-cov-2 mass asymptomatic testing, social distancing and self-isolation in a university setting during the covid-19 pandemic: A qualitative study . International Journal of Environmental Research and Public Health . 2021 ; 18 ( 8 ): 4182 . OpenUrl 25. Cai S , Zhang T , Robin C , et al. Learning about COVID-19 across borders: public health information and adherence among international travellers to the UK . Public Health . 2022 ; 203 : 9 – 14 . OpenUrl PubMed 26. Coleman PC , Pailing A , Roy A , et al. Implementation of novel and conventional outbreak control measures in managing COVID-19 outbreaks in a large UK prison . BMC public health . 2022 ; 22 ( 1 ): 677 . OpenUrl PubMed 27. Dallera G , Alaa A , El-Osta A , et al. Evaluating the feasibility and acceptability of a safety protocol to mitigate SARS-CoV-2 transmission risks when participating in full-capacity live mass events: a cross-sectional survey and interview-based study . BMJ Open . 2022 ; 12 ( 12 ): e063838 . OpenUrl Abstract / FREE Full Text 28. ↵ Davies M , Hill J , Goggins L , Peirce N . Risk assessed daily contact testing enabling elite sporting events during the COVID-19 pandemic: a prospective cohort study . SSRN . 2022 . 29. Francis NA , Becque T , Willcox M , et al. Non-pharmaceutical interventions and risk of COVID-19 infection: survey of U.K. public from November 2020 - May 2021 . BMC Public Health . 2023 ;23(1):389. 30. French CE , Denford S , Brooks-Pollock E , et al. Low uptake of COVID-19 lateral flow testing among university students: a mixed methods evaluation . Public Health . 2022 ;204:54-62. 31. ↵ Gianino MM , Nurchis MC , Politano G , et al. 
Evaluation of the strategies to control COVID-19 pandemic in four European countries . Frontiers in Public Health . 2021 ; 9 : 700811 . 32. Gillam TB , Cole J , Gharbi K , et al. Norwich COVID-19 testing initiative pilot: evaluating the feasibility of asymptomatic testing on a university campus . Journal of Public Health . 2021 ; 43 ( 1 ): 82 – 8 . OpenUrl PubMed 33. ↵ Hounsome L , Herr D , Bryant R , et al. Epidemiological impact of a large number of incorrect negative SARS-CoV-2 test results in South West England during September and October 2021 . MedRxiv . 2022 :2022.11.30.22282922. 34. ↵ Hunter PR , Brainard J , Grant A . The Impact of the November 2020 English National Lockdown on COVID-19 case counts . MedRxiv . 2021 :2021.01.03.21249169. 35. Kumari M , Chandola T , Booker Cara L , Benzeval Michaela J . Targeted Shielding and Coronavirus Symptoms Among Adults in the UK . Research Square . 2021 . 36. ↵ Love NK , Ready DR , Turner C , et al. Daily use of lateral flow devices by contacts of confirmed COVID-19 cases to enable exemption from isolation compared with standard self-isolation to reduce onward transmission of SARS-CoV-2 in England: a randomised, controlled, non-inferiority trial . The Lancet Respiratory Medicine . 2022 ; 10 ( 11 ): 1074 – 85 . OpenUrl PubMed 37. Mensah AA , Sinnathamby M , Zaidi A , et al. SARS-CoV-2 infections in children following the full re-opening of schools and the impact of national lockdown: Prospective, national observational cohort surveillance, July-December 2020, England . Journal of Infection . 2021 ; 82 ( 4 ): 67 – 74 . OpenUrl 38. Watson D , Baralle NL , Alagil J , et al. How do we engage people in testing for COVID-19? A rapid qualitative evaluation of a testing programme in schools, GP surgeries and a university . BMC public health . 2022 ; 22 ( 1 ): 305 . OpenUrl PubMed 39. ↵ Young BC , Eyre DW , Kendrick S , et al. 
Daily testing for contacts of individuals with SARS-CoV-2 infection and attendance and SARS-CoV-2 transmission in English secondary schools and colleges: an open-label, cluster-randomised trial . The Lancet . 2021 ; 398 ( 10307 ): 1217 – 29 . OpenUrl 40. ↵ Zhang T , Robin C , Cai S , et al. Public health information on COVID-19 for international travellers: lessons learned from a mixed-method evaluation . Public Health . 2021 ; 193 : 116 – 23 . OpenUrl PubMed 41. ↵ Marshall G , Skeva R , Jay C , et al. Public perceptions and interactions with UK COVID-19 Test, Trace and Isolate policies, and implications for pandemic infectious disease modelling [version 1; peer review: awaiting peer review] . F1000Research . 2022 ;11(1005). 42. ↵ Schmidt L , Hair K , Graziosi S , et al. Exploring the use of a Large Language Model for data extraction in systematic reviews: a feasibility study . arXiv. 2025 :arXiv:2405.14445v2. 43. ↵ Harris J , Laurence T , Loman L , et al. Evaluating large language models for public health classification and extraction tasks . arXiv. 2025 (v2): arXiv:2405.14766. 44. ↵ Peters U , Chin-Yee B . Generalization bias in large language model summarization of scientific research . Royal Society Open Science . 2025 ; 12 ( 4 ): 241776 . OpenUrl PubMed 45. ↵ Chang K , Xu S , Wang C , et al. Efficient Prompting Methods for Large Language Models: A Survey . arXiv. 2024 :arXiv:2404.01077. 46. ↵ Xu D , Chen W , Peng W , et al. Large language models for generative information extraction: a survey . Frontiers of Computer Science . 2024 ; 18 ( 6 ): 186357 . OpenUrl 47. ↵ Responsible AI in evidence synthesis (RAISE): guidance and recommendations . Draft for consultation and revision 2024 [Available from: https://osf.io/cn7x4 ]. View the discussion thread. Back to top Previous Next Posted July 21, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. 
NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Assessing the feasibility and acceptability of a bespoke large language model pipeline to extract data from different study designs for public health evidence reviews Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Assessing the feasibility and acceptability of a bespoke large language model pipeline to extract data from different study designs for public health evidence reviews Zalaya Simmons , Beti Evans , Tamsyn Harris , Harry Woolnough , Lauren Dunn , Jonathon Fuller , Kerry Cella , Daphne Duval medRxiv 2025.07.21.25331917; doi: https://doi.org/10.1101/2025.07.21.25331917 Share This Article: Copy Citation Tools Assessing the feasibility and acceptability of a bespoke large language model pipeline to extract data from different study designs for public health evidence reviews Zalaya Simmons , Beti Evans , Tamsyn Harris , Harry Woolnough , Lauren Dunn , Jonathon Fuller , Kerry Cella , Daphne Duval medRxiv 2025.07.21.25331917; doi: https://doi.org/10.1101/2025.07.21.25331917 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Public and Global Health Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) 
Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9220) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffdfb303e2a58d3',t:'MTc3OTQ3NTU2MA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var 
e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.