Full text
77,512 characters
· extracted from
preprint-html
· click to expand
EAGLE-AI: A large language model workflow for automated extraction and scoring of literature evidence linking genes to autism spectrum disorder | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search EAGLE-AI: A large language model workflow for automated extraction and scoring of literature evidence linking genes to autism spectrum disorder Vinicius Furlan , View ORCID Profile Julian Moran , Nelson B. Salazar , Olivia Rennie , Ny Hoang , Andrew Wan , Marla Mendes de Aquino , View ORCID Profile Worrawat Engchuan , Jacob A.S. Vorstman , View ORCID Profile Stephen W. 
Scherer doi: https://doi.org/10.1101/2025.09.10.25334730 Vinicius Furlan 1 The Centre for Applied Genomics, The Hospital for Sick Children , Toronto, ON, Canada 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Julian Moran 1 The Centre for Applied Genomics, The Hospital for Sick Children , Toronto, ON, Canada 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Julian Moran Nelson B. Salazar 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada 3 Department of Molecular Genetics, University of Toronto , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Olivia Rennie 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada 4 Temerty Faculty of Medicine, University of Toronto , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ny Hoang 1 The Centre for Applied Genomics, The Hospital for Sick Children , Toronto, ON, Canada 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada 3 Department of Molecular Genetics, University of Toronto , Toronto, ON, Canada 5 Department of Genetic Counselling, The Hospital for Sick Children , Toronto, Ontario, Canada 6 Autism Research Unit, The Hospital for Sick Children , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Andrew Wan 1 The Centre for Applied Genomics, The Hospital for Sick Children , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Marla Mendes de Aquino 1 
The Centre for Applied Genomics, The Hospital for Sick Children , Toronto, ON, Canada 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Worrawat Engchuan 1 The Centre for Applied Genomics, The Hospital for Sick Children , Toronto, ON, Canada 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Worrawat Engchuan Jacob A.S. Vorstman 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada 7 Department of Psychiatry, The Hospital for Sick Children , Toronto, ON, Canada 8 Department of Psychiatry, Temerty Faculty of Medicine, University of Toronto , Toronto, ON, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Stephen W. Scherer 1 The Centre for Applied Genomics, The Hospital for Sick Children , Toronto, ON, Canada 2 Genetics and Genome Biology Program, The Hospital for Sick Children , Toronto, ON, Canada 3 Department of Molecular Genetics, University of Toronto , Toronto, ON, Canada 9 McLaughlin Centre , Toronto, ON M5G 0A4, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Stephen W. Scherer For correspondence: stephen.scherer{at}sickkids.ca Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract We previously developed the Evaluation of Autism Gene Link Evidence (EAGLE) manual curation framework and used it to characterise 219 autism-associated genes. However, this effort took years of human work. We present EAGLE-AI, an automated evidence collection, screening, extraction, and scoring system incorporating agentic large language model (LLM) workforces. 
On a test set of 116 manuscripts screened for ease of machine-readability, EAGLE-AI achieves F1 score of 91% in reproducing human curators’ data extractions. Its evidence scores differ from those of human scorers by 14.3%. Our findings indicate that EAGLE-AI can successfully automate most of a clinical genomics evidence curation process. Background Next-generation sequencing has greatly improved our ability to detect genetic variants across the human genome 1 . In leveraging next-generation methods, genetic testing achieves a greater diagnostic yield for many disorders, including autism spectrum disorder (ASD, also called autism) 2 . However, this relies on enough scientific understanding to determine the significance of the genetic variants identified. Here, knowing whether a gene is related to a condition is a crucial first step 3 . In the case of ASD, research has demonstrated a significant genetic component driving the condition 4 , 5 , 6 , 7 , 8 . Previous studies have shown considerable inconsistency in the genes included in autism gene panels 9 , 10 . This highlights the need to distinguish between bona-fide confirmed genes and candidate genes, referred to as establishing gene-phenotype clinical validity 11 . To promote consistency among laboratories, researchers, and clinicians, the Clinical Genome Resource (ClinGen) has developed a framework for systematically evaluating the evidence supporting a gene-phenotype association 12 . This framework involves curating publicly available evidence, including genetic and experimental data, and applying a rules-based scoring system. The evidence level is categorized as definitive, strong, moderate, or limited based on the strength of evidence. 
With this framework, teams of hundreds of ClinGen volunteers from the United States and around the world have evaluated more than 2,420 gene-phenotype relationships and established several gene curation expert panels, including one for Intellectual Disability and Autism 13 , 14 , 15 . Intellectual disability and ASD are often grouped under the umbrella term of neurodevelopmental disorders (NDDs). As an indication for genetic testing and clinical consultation, NDD as a category is commonly accepted by clinical laboratories and genetic clinics, which is reasonable given the high comorbidity among the conditions included in this general term 11 , 16 . However, for gene curation purposes, it is important to assess the clinical validity for ASD independently from NDDs as a whole, as the implications in clinical management may differ depending on the gene annotation. For instance, ASD often manifests independently of intellectual disability, epilepsy, or other NDDs 17 , 18 . Moreover, the support needs differ between individuals with ASD and those with both ASD and other comorbid NDDs 19 , 20 . As genome sequencing is incorporated into clinical care more widely and at earlier stages of development, understanding a gene’s relevance to ASD specifically will better empower precision care 21 . Efforts to parse the heterogeneity of autism are ongoing: for example, recent work has identified clinically distinct autism subtypes linked to specific genetic programs and developmental trajectories 22 , 23 , 24 , 25 . To this end, we previously established the Evaluation of Autism Gene Link Evidence (EAGLE) framework for curating ASD clinical genomics evidence 10 . This approach builds on the ClinGen gene-phenotype clinical validity framework by adding a rules-based method for assessing level of confidence in the reported ASD phenotype. EAGLE also adjusts the genetic evidence scoring approach to account for the variability and complexity of ASD genetics. 
EAGLE reserves high scores for variants linked to a confirmed ASD diagnosis rather than a case with autism-like features. This helps in distinguishing ASD-associated variants from variants linked to NDDs more broadly. Previously, ASD gene lists have been defined from large-scale sequencing studies of individuals with ASD. They include lists from the Simons Foundation Powering Autism Research for Knowledge (SPARK) 26 , Autism Speaks – MSSNG 27 , and the Autism Sequencing Consortium (ASC) 28 . Together, these three projects identify a total of 236 unique genes as being associated with ASD with high confidence. In contrast, EAGLE is used to define an autism gene list based on evidence from the full diversity of the scientific literature 10 . EAGLE scores are currently included in the Simons Foundation Autism Research Initiative (SFARI) database 29 and characterize level of evidence for many prototypical ASD genes: for example, NRXN1 , SCN2A , MECP2 , CHD8, DDX3X, SHANK3 , and PTEN . However, the EAGLE framework’s manual curation workflow is both time-consuming and resource-intensive, as it requires in-depth assessment of ASD clinical case reports, functional studies, and database information. Since at least 2003, research groups have been using natural language processing models to automate or semi-automate literature curation steps scalably 30 . Transformer architectures ushered in a new wave of interest: for instance, BERT-based models have been used to automate literature data extraction and paper classification tasks 31 , 32 . During the COVID-19 pandemic, a variety of automation resources emerged for dealing with the onslaught of actionable scientific literature 32 , 33 , 34 . With the more recent advent of large language models (LLMs) such as GPTs, other research groups have leveraged these models in automating literature review, such as for paper screening or data extraction 24 , 25 , 26 . 
However, most peer-reviewed work in this area has focused on automating only a single workflow phase 35 . In a similar vein, and to address the costs of manual literature curation in the context of ASD clinical genomics, we developed EAGLE-AI, an evidence collection, screening, extraction, and scoring workflow that incorporates LLMs. This workflow aims to accelerate the accurate identification of ASD-associated genes by operating, with minimal manual intervention, on a large and evolving corpus of literature ( Fig. 1 ). Download figure Open in new tab Fig. 1 EAGLE-AI system architecture. Schematic of EAGLE-AI’s whole-system architecture, with initial input consisting of natural-language search terms and final output consisting of the populated relational database. Search terms consist of a gene name (e.g. “ DMD ”) and the fixed search terms “ASD” and “autism”. Database entries are dedicated to single ASD cases and include a case patient description, phenotype description, gene name, genomic variant ID, and evidence score. The data extraction and scoring modules of EAGLE-AI both deploy multi-agent workforces using o3-mini 36 and GPT-4o 37 , 38 ( Fig. 2 ). We developed two additional versions of EAGLE-AI: one incorporating a BiLSTM-CRF model, trained in-house for named entity recognition (NER), and another deploying a Llama 3.1-8B model finetuned for EAGLE-compliant data extraction. Download figure Open in new tab Fig. 2 Agentic EAGLE-AI core data extraction workflow. Schematic of EAGLE-AI core’s agentic data extraction workforce. The workforce includes an additional task planner and coordinator agent for incorporating recommendations from the judge in the event of a “NEEDS_REVISION” (NR) verdict. Depending on the recommendations, this can yield revision tasks to any of the four workforce agents. The final output report complies with the format presented in Table S1. 
EAGLE-AI showcases not only the power of third-party LLMs in extracting data from literature, but also their lesser suitability for evidence scoring when a deterministic rules-based algorithm can be applied instead. We expect that EAGLE-AI’s system architecture can generalize, in modular fashion, to automate important aspects of evidence curation for other clinical genomics research questions. Results System architecture In evaluating the association between a genomic variant and ASD, the manual EAGLE curation standard assesses evidence from three categories: genetic evidence, quality of ASD diagnostic evaluation, and experimental evidence (e.g. animal models) ( Table 1 ). Because the EAGLE curation standard is rigorously rules-based, EAGLE-AI implements the same rules in automated fashion for the tasks of data extraction and scoring. As such, the system is comprised of two interconnected modules: an evidence collection service and EAGLE-AI core, which performs the data extraction and scoring steps ( Fig. 1 ). View this table: View inline View popup Download powerpoint Table 1A: EAGLE scoring guidelines, genetic evidence View this table: View inline View popup Download powerpoint Table 1B: EAGLE scoring guidelines, ASD phenotype evaluation View this table: View inline View popup Download powerpoint Table 1C: EAGLE scoring guidelines, experimental evidence The evidence collection service maintains a list of genes currently under investigation and submits requests to Papers-Nexus, a purpose-built web API for automatically retrieving and downloading open-access full-text PDFs from PubMed, Elsevier, and Google Scholar using search terms. To keep the EAGLE-AI databases up to date, the evidence collection service requests papers pertaining to a given gene on a scheduled periodic basis. 
The evidence collection service subsequently processes the collected papers by converting the PDFs to plain text, extracting table contents, and composing image-to-text descriptions of the paper figures. All text extracted from the papers is converted to embeddings using OpenAI’s text-embedding-3-large model and stored in a vector database. The EAGLE-AI workflow handles internal data storage using two distinct databases ( Fig. 1 ). It stores tabular data containing string entries, i.e. paper metadata, extracted cases, and variant information, in a relational database. Embeddings representing each paper’s text are indexed using a vector database. Using an API endpoint for o3-mini 36 , EAGLE-AI assigns semantic flags to each text chunk, where the flags pertain to genes, phenotype terms, and variants. EAGLE-AI core deploys eight distinct LLM agents, four of which are dedicated to data extraction and four of which are dedicated to evidence-scoring each gene and candidate ASD case ( Fig. 1 ). The data extraction agents write to the relational database, using a tabular format populated with detailed information on the ASD case subject, gene, etiological genomic variant, and phenotype (Table S1). EAGLE-AI core’s scoring agents add a final score, indicating the strength of ASD etiological evidence, to the relational database as well. The relational database is therefore the final repository of EAGLE-AI’s data extractions and evidence scores. Evidence collection service architecture EAGLE-AI’s evidence collection service submits requests to Papers-Nexus, an API developed in-house for performing automated literature searches and bulk full-text retrieval from natural-language search queries. To find relevant paper metadata, EAGLE-AI submits automated requests to the search endpoint of Papers-Nexus, where the request includes a general search term field, a gene search term field, and an exclusion term field. Search criteria consist of a gene name (e.g. 
“ DMD ”) and the fixed search terms “ASD” and “autism”. Using the Bio.Entrez 39 and scholarly 40 Python packages, Papers-Nexus searches for relevant papers in the PubMed, Google Scholar, and Elsevier databases. Its response to the client provides the paper title, author name(s), abstract, PMID, DOI, and citation information for each paper that satisfies the search criteria. In light of our design goal to have EAGLE-AI extract data from a large corpus of papers with minimal manual input, we developed a PDF-miner endpoint for Papers-Nexus. This endpoint is dedicated to bulk downloading open-access full-text PDFs of papers, where the client’s request contains a list of PMIDs or DOIs. Papers-Nexus retrieves full-text PDFs using the PubMed-OA FTP server 41 and the Unpaywall API 42 . Extractify, a module within EAGLE-AI’s evidence collection service, uses Adobe’s pdf.services 43 and openpyxl 44 to convert retrieved papers and supplementary tables to plain text. For each paper, a gene flagger module deploys an o3-mini model to extract HGNC symbols for all ASD-relevant genes mentioned by the paper, as well as paper metadata. The gene symbols are assigned as flags to the paper’s full text. Here, gene symbols are flagged only if the paper presents clinical or experimental evidence linking the gene to ASD. Evidence collection service retrieval rate Using a test set of metadata for 172 papers from the PubMed database, we found that Papers-Nexus retrieved full-text PDFs for 114 (66.3%) of the papers. For all remaining papers, Papers-Nexus retrieved the title, author names, and abstract. Papers-Nexus failed to retrieve a paper’s full-text if the paper is not open-access or if the paper’s host denies transparent requests to automatically download the PDF. Agentic data extraction architecture EAGLE-AI’s data extraction workforce is responsible for extracting three categories of relevant information required for scoring: genomic variant, case ID, and phenotype. 
It deploys four distinct LLM agents, which perform automated tasks based on their own decision-making from inputs. The agents are assigned to the roles of data extractor, data verifier, data reporter, and data extraction judge. The data extractor, verifier, and judge use OpenAI’s o3-mini model 36 , whereas the reporter uses GPT-4o 37 . The agentic workforce is managed using the CAMEL-AI multi-agent framework 45 . Input to the data extraction workforce consists of a single paper’s plain full-text and all gene symbols flagged by the gene flagger ( Fig. 2 ). For each gene in the paper, the data extractor agent receives a prompt that includes the given gene symbol, instructing the extractor to identify and extract all ASD cases in the paper linked to the given gene. For each case, the case subject must have an ASD diagnosis under specified criteria; the gene must be described with identifying detail; and the paper must describe a specific ASD-driving genomic variant. For all eligible cases, the extractor gives as output the case subject’s identifier, the location of the text where the case is described, a brief description of the variant, and an account of pertinent data missing from the paper (Table S2). A prompt to the verifier agent instructs it to review all sections of the paper full-text and identify any cases missed by the extractor. Using CAMEL’s SearchToolkit, the verifier is able to seek further information through web search 46 . For each case, its output format resembles that of the extractor (Table S3). The reporter agent’s task is to synthesize all cases from the extractor and verifier. The reporter is instructed to remove any duplicate cases and to reformat the data to match the example presented in Table S1, which in turn matches the schema of EAGLE-AI’s relational database. The judge agent assesses the reporter’s output for quality and consistency. 
In evaluating quality, the judge checks for whether the data types of populated fields comply with the database schema and whether gene and variant details are logically consistent. The judge flags cases with insufficient details about the gene or candidate ASD-driving variant. It also flags any potentially duplicate cases. Its output includes an overall quality score of high, medium, or low (Table S4). Additionally, it issues a verdict of the gene data from the paper: namely, as “PASS” or “NEEDS_REVISION”. As per the CAMEL-AI multi-agent framework 45 , an additional task planner agent and coordinator agent handle task assignment to the workforce beyond the initial prompts. In cases where the judge issues a verdict of “NEEDS_REVISION”, these agents assign new tasks to the extractor, verifier, reporter, and/or judge based on the judge’s account of the issues. The workforce’s passed output writes to EAGLE-AI core’s relational database. EAGLE-AI data extraction precision-recall We evaluated EAGLE-AI core’s data extraction precision-recall for four different configurations. Here, we measured how well the agentic workforce reproduced the extractions of the EAGLE manual curators. In the first configuration, the agentic workforce was provided no input-output examples in the extractor prompt (“zero-shot”); in the second, the prompt included three input-output examples (“three-shot”). In the third configuration, we incorporated the BiLSTM-CRF NER model into EAGLE-AI core. Here, we coupled the NER model with regular expression pattern-matching fallbacks and a single GPT-4o endpoint 37 for data extraction. We also finetuned a single Llama 3.1-8B model 47 for the task of data extraction. In the fourth configuration, this model was coupled with the BiLSTM-CRF NER model: we used the NER model to relevance-filter and assemble paper text chunks, which we then incorporated into the prompt to the finetuned Llama 3.1-8B model for data extraction. 
We evaluated all four configurations of EAGLE-AI core on the same set of 116 publications and 96 cases screened for ease of machine-readability. The initial test set was constrained to cases with a final EAGLE score and a reported variant in DMD, SHANK1 , CACNA1D , DDX3X , DDX53 , or MECP2 . Average F1 score was calculated as the mean of the case, variant, and phenotype F1 scores. Each configuration’s extracted cases were matched to the manual curations using a confidence-based case-matching algorithm (see Methods). The zero-shot agentic workforce achieved an average F1 score of 86% ( Fig. 3c ), whereas the three-shot agentic workforce achieved average F1 of 91% ( Fig. 3d ). The NER model achieved an average F1 of 69% ( Fig. 3a ). Finetuned Llama 3.1-8B achieved average F1 of 86% ( Fig. 3b ). Raw performance metrics are included in Table S5, Table S6, and Table S7. Download figure Open in new tab Fig. 3 EAGLE-AI data extraction performance. EAGLE-AI’s annotation extraction precision-recall metrics out of 100 using four different approaches, evaluated across manually scored cases from 116 papers; here, the evaluation set was restricted to cases with a final EAGLE score and with a reported variant in DMD, SHANK1, CACNA1D, DDX3X, DDX53 , or MECP2 . a Extraction performance using the NER model for relevance filtering and text chunk assembly, coupled with a GPT-4o endpoint for data extraction. b Extraction performance of the NER model coupled with a finetuned Llama 3.1-8B model for data extraction. c Extraction performance of agentic EAGLE-AI core using a zero-shot task prompt to the data extraction workforce. d Extraction performance of agentic EAGLE-AI core using a three-shot task prompt to the data extraction workforce. Table S8 presents an example whereby, after finetuning, the model was better able to identify the pathogenicity impact of a DDX3X variant NM_001356.5:c.1171-2A>G as reported in a paper abstract and case patient description. 
For a case with a MECP2 indel variant, Llama 3.1-8B assigned the accurate phenotype confidence level (low) only after finetuning, resulting in an EAGLE score closer to the manual ground truth (Table S9). Agentic evidence scoring architecture Fig. 4 presents a schematic of EAGLE-AI core’s multi-agent workforce for scoring possible ASD cases and their reported variants. Much like the data extraction workforce, the scoring workforce deploys four distinct LLM agents, with similar task breakdowns: a planner, researcher, reporter, and judge. As of the current version of EAGLE-AI core, the scoring planner and judge use o3-mini 36 , while the scoring researcher and reporter use GPT-4o 37 . Download figure Open in new tab Fig. 4 Agentic EAGLE-AI core scoring workflow. Schematic of the scoring workforce within agentic EAGLE-AI core. Initial inputs consist of case-relevant paper text chunks from the vector database; extracted gene, variant, ASD case, and phenotype data from the relational database; variant metrics from external API calls; and a prompt containing the EAGLE scoring guidelines. The workforce includes an additional task planner and coordinator agent for incorporating recommendations from the judge in the event of a “NEEDS_REVISION” (NR) verdict. Since the EAGLE scoring guidelines ( Table 1 ) require variant information such as null or missense status, functional data, de-novo status, population frequency, and splice-site-modifying status, an upstream variant annotator agent searches for these metrics using third-party APIs ( Fig. 1 ). The annotator sends requests to the Ensembl Variant Effect Predictor’s (VEP) 48 API for transcript-level, protein-level, and clinical impact; VariCarta’s 49 for known ASD association and pathogenicity scores (CADD 1.3 50 , SIFT 51 , and PolyPhen 52 ); gnomAD’s 53 for population frequency; and LitVar2’s 54 for HGVS protein variant notation and pathogenicity status. 
The planner agent receives the query case ID, the case’s candidate ASD-driving variant annotated with the third-party metrics, all paper text chunks relevant to the case as stored in the vector database, and all associated data from the data extraction module as stored in the relational database ( Fig. 1 ). It also receives the EAGLE scoring guidelines ( Table 1 ) and a checklist of data required for applying the guidelines (Table S10). Any missing data are tagged by the planner agent as requiring additional search. For all data labelled as missing, the researcher agent uses the suggested search terms and the CAMEL-AI web search toolkit to attempt to retrieve the data using a suite of search engines 46 . A downstream reporter agent leverages all input data from the relational database, vector database, annotator, and scoring researcher in applying the EAGLE scoring guidelines as per Table 1 . It returns a report for the case, summarizing the data and assigning an EAGLE score to the case’s genetic evidence, ASD phenotype evaluation, and experimental evidence (Table S11). It also summarises any evidence against the case’s ASD diagnosis or against the variant’s driving of ASD likelihood. It yields a final EAGLE score for the case, with a maximum of 2 points for genetic evidence and a maximum of 6 points for experimental evidence. The judge agent appraises the quality of the report and issues a final verdict of “PASS” or “NEEDS_REVISION” (Table S12). In the event of a “PASS” verdict, the workforce writes the case’s EAGLE score to the relational database. In the event of a “NEEDS_REVISION” verdict, the scoring workforce incorporates the judge’s feedback and assigns follow-up tasks to the agents as appropriate ( Fig. 4 ). EAGLE-AI scoring error We tested performance of the agentic scoring workforce when provided with three correct scoring examples in the initial task prompt (“three-shot”). 
Additionally, we measured performance of a deterministic if-else scoring algorithm written in Python. We coupled the algorithm with the BiLSTM-CRF NER model, which was used for relevance-screening and assembling text chunks. To generate the data extractions, we used post-NER regular expression matching to augment entity labelling for the purposes of relevance filtering. We then submitted a single dynamic prompt to a GPT-4o-mini endpoint 38 , which instructed the model to extract the data (Table S13). We used the resulting extractions as input to the algorithm. The algorithm applies the rules-based EAGLE scoring guidelines for genetic, phenotypic, and experimental evidence as described in Table 1 . However, because if-else logic cannot program subjective decision-making, the algorithm applies only the suggested scores instead of the discretionary score ranges specified in brackets. Across 165 cases, the scoring workforce achieved a case-wise symmetric mean absolute percentage error (SMAPE) of 14.3% as per equation [1] (see Methods). As this is an error term, this indicates 85.7% correspondence with manual scores. Across 128 cases, the deterministic scoring algorithm achieved a case-wise SMAPE of 10.1%, thereby outperforming the agentic scoring workforce. Distributions of scores for both the agentic workforce and the scoring algorithm closely match that of the manually curated scores ( Fig. 5a, c ). For both the workforce and the algorithm, Bland-Altman plots do not indicate error bias with respect to score magnitude ( Fig. 5b, d ). For the workforce, 85.45% of its case scores have zero difference from their corresponding manual case scores; for the algorithm, 87.5% of its case scores have zero difference. Download figure Open in new tab Fig. 5 EAGLE-AI scoring performance. EAGLE-AI’s scoring performance under different conditions, where greater score means greater strength of evidence linking a case’s variant to ASD. 
a, b Agentic scoring workforce performance across 78 papers and 165 ASD cases. a Violin plots of manually curated case scores (orange) v. agentic (purple) EAGLE-AI case scores. b Bland-Altman plot for agentic EAGLE-AI’s scores benchmarked against manual scores, where darker points indicate overlap; here, each point represents an ASD case. c, d Deterministic scoring performance across 90 papers and 128 ASD cases. c Violin plots of manually curated case scores (orange) v. deterministic algorithm’s (purple) scores. d Bland-Altman plot for deterministic algorithm’s scores benchmarked against manual scores. Whole-system performance on eight genes without prior screening We computed performance metrics for the whole EAGLE-AI system ( Fig. 1 ) using papers pertaining to eight genes that were excluded from all prior training, test, and validation sets. The eight genes evaluated were CACNA1A , CASK , CDKL5 , CHAMP1 , CHD3 , FMR1 , GIGYF2 , and GRIN1 . We evaluated all 36 papers related to these genes retrieved by Papers-Nexus; the papers contained a total of 85 candidate ASD cases. Here, we evaluated performance of EAGLE-AI while using the agentic workforces for the data extraction and scoring steps, where the workforces received three-shot task prompts. The 36 papers were not screened for ease of machine-readability or access. Table S14 includes the raw performance metrics. Table S15 includes the manual data extractions, EAGLE-AI’s data extractions, and the raw performance data. EAGLE-AI achieved moderate performance on data extraction, with average accuracy of 72% ( Table 2 ). EAGLE-AI’s scoring accuracy was low, with gene-wise SMAPE of 45.0% ( Table 2 ). View this table: View inline View popup Download powerpoint Table 2: EAGLE-AI performance on 8 genes excluded from prior training, test, and validation sets Discussion On a set of 116 papers pertaining to six ASD-associated genes, EAGLE-AI achieved near-human performance on data extraction ( Fig. 
3 ); it also achieved near-human performance on scoring across 165 cases ( Fig. 5 ). The multi-agent extraction workforce, incorporating GPT-4o and o3-mini, outperformed alternative versions using BiLSTM-CRF or finetuned Llama-3.1-8B. Our findings serve as proof-of-concept for applying third-party LLMs, organized in multi-agent workforces, to clinical genomics evidence curation. We expect this approach will save significant amounts of time for humans. We interpret these results with an important caveat: namely, the test papers were screened for favourable text extraction conditions. Converting the entirety of a paper to plain text is nontrivial, so we first screened these papers for ease of machine-readability (see Methods). We conducted this assessment in order to isolate performance of the data extraction and scoring modules. However, this means the input dataset may not fully represent the diversity of relevant scientific literature available for testing. During this initial round of performance assessment, we treated human curation as the gold standard. For this reason, instances of super-human performance, whereby EAGLE-AI avoids errors made by human curators, were not possible to detect. The reported performance metrics of F1 93% and scoring error 14.3% may therefore understate EAGLE-AI’s true performance on these tests. On follow-up, whereby we evaluated the whole-system performance on eight genes without prior screening of the papers, the assessment was manual (see Methods). Here, we noticed instances whereby, on review, EAGLE-AI included more correct details about ASD case phenotypes than did the manual curators (Table S15). In all such instances, the details pertained to highly specific co-occurring conditions and test results (e.g. anatomical abnormalities, brain imaging findings). Under the phenotype accuracy rating system used for performance assessment, these contributed only 10 out of 100 points to the phenotype accuracy rating (see Methods). 
We therefore updated the manually curated descriptions with EAGLE-AI’s findings and continued to treat the curations as the gold standard. Beyond these details in the phenotype descriptions, we did not observe instances of super-human performance from EAGLE-AI. The follow-up assessment without paper screening exposed EAGLE-AI’s limitations. On this test, data extraction performance was moderate at 72% accuracy and scoring performance was low with error of 45%. Here, EAGLE-AI struggled most with variant recall at 41% (Table S14), meaning that more than half the time it failed to extract the correct ASD-associated variant as reported in the literature. Most often, this was because the variants were reported in supplementary tables that EAGLE-AI could not access through Papers-Nexus. A “missing supplementary” flagger, which prompts human curators to retrieve and upload supplementary materials to EAGLE-AI in the event they are missing, may help address this. Given Papers-Nexus’s 66.3% PDF retrieval rate, a similar “missing paper” flagger would improve EAGLE-AI’s coverage of the literature. This is one example whereby automated tools for evidence curation may require ongoing human intervention. The tables in the papers themselves also contributed to low variant recall. Many papers present cases and their ASD-driving variants in a large table within the full-text. Since these tables are coded as images within PDFs, EAGLE-AI relies on image-to-text conversion to read the table contents, which is more error-prone when the table is in landscape orientation and reads up-to-down. We plan to apply OpenCV’s optical character recognition and image rotation tools 55 to standardize all paper tables before extraction. Regarding EAGLE-AI’s lower performance on scoring, context overload may have contributed. As the 36 follow-up papers were not screened, EAGLE-AI more often encountered critical missing information and depended on background knowledge and the web-search toolkit more. 
The quality of information from web search was variable and had the potential to confuse the scoring workforce. For instance, when scoring FMR1 — a gene canonically associated with Fragile X Syndrome, which is accompanied by autism in about 40% of male and 15% of female carriers 56 — EAGLE-AI treated Fragile X Syndrome diagnoses as equivalent to ASD. This in turn yielded erroneous scores for many of the FMR1 cases. Additionally, the follow-up paper set included review papers or purely functional studies that did not reproduce ASD-specific phenotypes, which EAGLE-AI sometimes failed to exclude. This latter phenomenon also accounts for why data extraction performance was generally lower. To refine context inputs to EAGLE-AI, we are considering deprecating the web search toolkit, grounding the model to information in the paper texts, and placing a stronger emphasis on inclusion and exclusion criteria in the data extraction workforce prompts. Furthermore, the difference between our initial and follow-up results suggested that human intervention to screen out reviews and off-topic papers may improve EAGLE-AI’s performance considerably. On a test set of 90 papers, our deterministic scoring algorithm performed comparably to the agentic scoring workforce ( Fig. 5 ) and achieved a lower scoring error of 10.1%. EAGLE’s scoring framework is rigorously rules-based 10 such that we could viably program it with if-else algorithmic logic. While LLMs are well suited for extracting data of a standardized format from scientific literature, this finding supports using traditional rules-based algorithms for evidence scoring instead of LLM agents. We intend to deprecate EAGLE-AI’s agentic scoring workforce and replace it with the deterministic algorithm, with the API calls to external resources instead integrated into the data extraction workforce. 
In our current work, we use EAGLE-AI to regularly assess genes for their role in ASD; this includes genes emerging from our own discovery research and those added to the SFARI database. Setting aside the current performance limitations, we propose that EAGLE-AI’s architecture should be able to generalize to clinical genomics evidence curation as a whole. For instance, EAGLE-AI’s workflow may be able to replace or augment the manual curation approach used by ClinGen’s gene-disease validity working groups. Notably, the human genome project initiated early efforts to digitize data at large scale and to standardize nomenclature, genome references, and databases 57 , 58 , 59 , 60 . For this reason, within the realm of biomedical annotation, artificial intelligence may perform better when applied to genetics data. Some features of EAGLE-AI are tailored to the EAGLE curation task. However, we designed EAGLE-AI with modularity in mind. To apply the workflow to a different curation task, for example to curating for a different disease or disorder, much of the architecture is reusable. For a workflow that performs only data extraction, one would need to change only the search terms, prompts to the data extraction workforce, and schema of the relational database. For each new curation task, however, the prompts and schema would require design consultation with both manual curators and domain subject-matter experts. For additional evidence scoring based on ClinGen’s semi-quantitative (“Definitive”, “Strong”, “Moderate”, “Limited”) framework 13 , a tailored scoring algorithm would likely need to be developed for each genetic disease or disorder/condition. For curation tasks whereby scoring requires considerable subjective decision-making, an agentic LLM workforce may prove more suitable. However, one might re-apply a suite of general principles (e.g. 
nonsense de-novo variants or deletions present stronger evidence for a monogenic role than inherited or missense variants), such that the development time for disease-specific scoring modules may be short. With the advent of LLMs, we expect that automated workflows for evidence curation will become more common. As a result, the human labour of curating evidence will likely transition away from manual data extraction or evidence scoring and toward designing database schemas, LLM prompts, and evidence scoring algorithms. Moreover, researchers and publishers may come to reconsider how scientific material is published, for example by more often supporting open-access and machine-readable formats. Conclusion EAGLE-AI automates, at near-human performance, the majority of a clinical genomics evidence curation workflow. This includes the data extraction, evidence scoring, and database entry steps. While LLMs are likely better suited to automating data extraction from papers, a deterministic if-else algorithm outperforms the LLM workforce on evidence scoring. Performance limitations emerge when EAGLE-AI is additionally tasked with retrieval of supplementary materials and paper screening. While further development may ameliorate these limitations, so too would human intervention at these retrieval and screening stages of curation. Methods Named entity recognition (NER) model architecture With the aim of benchmarking the LLM workforce against traditional machine learning methods, we developed a version of EAGLE-AI core that does not use LLM agentic workforces but rather depends on a bidirectional long short-term memory (BiLSTM) neural network with a conditional random-field (CRF) layer. Using PyTorch 61 , we trained the BiLSTM-CRF model to perform named entity recognition (NER) of genes, variants, and medical condition terms from the extracted literature text. 
The entity mentions are used to relevance-filter and assemble text chunks before loading them into the EAGLE-AI data extraction module. Here, the data extraction module queries a single GPT-4o API endpoint 37 for data extraction or, alternatively, a finetuned Llama 3.1-8B model 47 . The NER model consists of an embedding layer, an attention layer, a BiLSTM layer, and a CRF layer. We used pretrained BioWordVec embeddings 62 for the embedding layer, with a vocabulary size of 10,475 and where each embedding vector is 200-long. Each embedding vector is concatenated to a corresponding 150-long output from a character encoder, consisting sequentially of a linear layer and ReLU activation function. Both the embedding layer and the character encoder were further trained alongside the rest of the model. The BiLSTM layer consists of two feed-forward LSTM networks with 192-long hidden states. Its output is a tensor of emission scores, which feed into an attention layer consisting sequentially of a linear layer, Tanh activation, multiplication by a context vector, and SoftMax activation. Output from the attention layer feeds into the CRF layer; the CRF layer also receives a target tensor, which contains the true classification scores as per the labelled train/test data. The NER model labels each token as one of 31 named entities pertaining to case, variant, and phenotype information. On the test dataset, it achieved an F1 score of 56% across 201 “PHENOTYPE” entities, 87% across 26 “ASD” entities, 65% across 71 “DISEASE” entities, and 77% across 154 “GENE” entities. NER model training We trained the NER model in PyTorch 61 using the BioRED dataset, which consists of 600 clinical PubMed abstracts 63 . Tokens were generated by first splitting the text into sentences by punctuation boundaries, then separating words by whitespace and isolating leading or trailing punctuation marks as individual tokens. 
Here, the labelled entities of interest pertained to genes, genomic variants, diseases/disorders, patient case information, signs/symptoms, and experimental validation. We converted the labelling scheme from beginning, inside, outside (BIO) to a binary entity labelling scheme whereby each token is labelled as either inside an entity or outside any entity. We treated output of the CRF layer as the loss function. The model was trained for 36 epochs at a learning rate of 2e-2 and with dropout of 0.5. Llama 3.1-8B finetuning With the aim of benchmarking an LLM workforce against a single finetuned LLM, we finetuned a single Llama 3.1-8B model 47 using a JSON of manually extracted data as per the EAGLE curation guidelines described in Table 1 . The JSON contained query and target response pairs. 20 query-response pairs gave probing questions and target answers about the EAGLE extraction and scoring guidelines; 390 subsequent query-response pairs consisted of assembled paper text chunks and the human-extracted data and scores. Text was tokenized using LlamaTokenizerFast. Using a low-rank (LoRA) matrix finetuning approach, we targeted parameters in the query, key, value, and output projection tensors of the model’s attention mechanism, as well as the gate, up, and down projection tensors of the multilayer perceptron block. Finetuning was conducted over a single epoch using the SFTTrainer from the HuggingFace transformer reinforcement learning library 64 , with a cross-entropy loss function and a learning rate of 2e-4. For parameter optimization, we used paged AdamW 32-bit. Data extraction precision-recall evaluation In testing performance of EAGLE-AI core’s data extraction precision-recall, we drew from evidence extracted by EAGLE’s manual curators. Here, the aim was to test the data extraction step in isolation. We constrained the cases to only those that had a final EAGLE score and had a reported variant in DMD, SHANK1, CACNA1D, DDX3X, DDX53 , or MECP2 . 
We also screened papers for ease of machine-readability: we checked that machine-extracted text contained minimal misidentified special characters, missing words, incorrect table structures, or missing supplementary data. In total, we included 116 papers. EAGLE’s manual curation workflow is already established and described elsewhere 10 . We collected and stored the source literature from which the manual curators extracted patient/subject information, phenotype descriptions, and variant information. We stored embeddings for the documents in EAGLE-AI’s vector database and used them as input to EAGLE-AI core. In evaluating EAGLE-AI’s data extraction performance, we treated presence of an extraction as positive and absence of an extraction as negative. We evaluated the case ID extractions, variant ID extractions, and phenotype descriptions as separate categories. For each data extraction category, we computed the true positive, false positive, and false negative rate using the definitions in Table 3 . This gave us the corresponding precision, recall, and F1-score of EAGLE-AI core’s data extractions. View this table: View inline View popup Download powerpoint Table 3: EAGLE-AI core data extraction performance testing definitions We matched cases extracted by EAGLE-AI to their corresponding manually curated cases by first matching by normalized publication name and author. We achieved further case-specific matching by adding confidence scores in the event of a matching gene name, case ID, or inheritance pattern. We tested EAGLE-AI’s multi-agent workforce when using task prompts with zero input-output examples (“zero-shot”) ( Fig. 3c ) and three input-output examples (“three-shot”) ( Fig. 3d ), respectively. We also tested data extraction precision-recall of the NER model when used for relevance filtering and text chunk assembly for a given case: during testing, we coupled the NER model with either a single GPT-4o API endpoint 37 for data extraction ( Fig. 
3a ) or the finetuned Llama 3.1-8B model for data extraction ( Fig. 3b ). Evidence scoring error evaluation In evaluating EAGLE-AI core’s scoring error, we computed the symmetric mean absolute percentage error (SMAPE) as per equation [1]. The aim was to evaluate performance of the scoring step in isolation. Here, n denotes the number of genes when computing gene-wise SMAPE or the number of candidate ASD cases when computing case-wise SMAPE. We matched cases scored by EAGLE-AI to their corresponding manually curated cases by first matching by normalized publication name and author. We achieved further case-specific matching by adding confidence scores in the event of a matching gene name, case ID, or inheritance pattern. For each case extracted by EAGLE-AI, we treated its highest-confidence match as the corresponding manually curated case when applying equation [1]. We evaluated the scoring error of EAGLE-AI’s multi-agent workflow when given zero-shot and three-shot task prompts. We also evaluated scoring performance of EAGLE-AI when using the NER model augmented by regular expression pattern matching and coupled to a single GPT-4o-mini endpoint 38 for data extraction using the prompt in Table S13. Here, scoring was performed using a deterministic algorithm as per the rules in Table 1 . Whole-system evaluation on eight genes without prior screening We evaluated performance of the whole EAGLE-AI system using the agentic LLM workforces for the data extraction and scoring steps. The aim was to evaluate performance of the entire workflow, including retrieval of supplementary materials, paper screening, data extraction, scoring, and database incorporation. We also aimed to test EAGLE-AI on data never before used during design, training, or testing. Here, the workforces received three-shot prompts. The genes evaluated were CACNA1A , CASK , CDKL5 , CHAMP1 , CHD3 , FMR1 , GIGYF2 , and GRIN1 , which had never before been scored using the EAGLE framework. 
These genes were selected because some have strong evidence for association with ASD as per the SFARI database 29 , while others have relatively little evidence in the literature. We included 36 papers related to these genes; papers were included only if they were successfully downloaded by Papers-Nexus and assigned to these genes by the gene flagger module. We evaluated EAGLE-AI’s performance across 85 candidate ASD cases extracted by manual curators from the 36 papers. The manual curators extracted the gene name, paper name, case ID, phenotype description, phenotype confidence level, sex of the case, age of the case, variant ID, zygosity of the variant, variant inheritance (i.e. de-novo or inherited), and variant impact (e.g. missense). The manual curators also assigned evidence scores to each of the cases using the EAGLE standard operating procedures ( Table 1 ). The curators received the 36 papers and 8 gene names but were blinded to EAGLE-AI’s data extractions and scores for the 85 cases. We measured EAGLE-AI’s performance by manually matching EAGLE-AI’s output to the output of the curators. We matched cases by case ID; in the absence of a matching case ID, we matched cases by phenotype description, age of the case, sex of the case, and/or paper name. We matched variants by rsID 65 , ClinVar ID 66 , HGVS NM number and coding position, HGVS NC or NG number and genome position, HGVS NP number and protein position 67 , or chromosome:position:ref>alt notation relative to a reference genome. We designated matches as true positive (TP), false positive (FP), false negative (FN), or true negative (TN). Sometimes, a case and variant extracted by the manual curators matched to a case extracted by EAGLE-AI, but EAGLE-AI was confidently wrong about the variant. Under these conditions, we counted the variant as both FP and FN. 
From the count of TPs, FPs, FNs, and TNs, we computed the precision, recall, and F1 score out of 100 percent for both the cases and variants ( Table 2 ). For each of the 85 cases, we measured the phenotype accuracy rating out of 100 points. If EAGLE-AI was correct about ASD status, it received +50 points. If EAGLE-AI was correct about ASD confidence level and reason, it received +20 points. If EAGLE-AI was correct about major co-occurring conditions, i.e. epilepsy, seizures, intellectual disability, and developmental delay, it received +20 points. If EAGLE-AI was correct about detailed co-occurring conditions and testing results, e.g. other psychiatric disorders, electroencephalogram results, brain imaging results, micro or macrocephaly, or anatomical abnormalities, it received +10 points. Table 2 reports the mean phenotype accuracy rating as a percent. For each of the eight genes, we also measured the gene-wise scoring error as per equation [1], where both score EAGLEAI and score manual are summed across all cases for a given gene. Table 2 reports the mean scoring error across the eight genes. Ethics approval and consent to participate This study was approved by The Hospital for Sick Children Research Ethics Board (1000080561). Consent for publication Not applicable: all case reports contained in the present study are from published literature. Availability of data and materials Much of the data generated and analyzed during the current study are included in the published article, its supplementary information files, and the EAGLE-AI proof-of-concept data repository at https://github.com/juliandeanmoran/EAGLE-AI_poc_publicData . Additional EAGLE manual curation data used in training the models of the current study, beyond those data included in the supplementary information files, are too large to make publicly available; however they are available from the corresponding author upon reasonable request. 
Code for performance assessment is available at the public repository at https://github.com/juliandeanmoran/EAGLE-AI_poc_publicData . The underlying code and some of the training/test/validation datasets for the current study are not publicly available for proprietary reasons. Competing interests JASV serves as a consultant for NoBias Therapeutics Inc. and has received speaker fees from Henry Stewart Talks Ltd. SWS is on the Scientific Advisory Committees of Population Bio and Deep Genomics, and intellectual property from aspects of his research held at the Hospital for Sick Children is licensed to Athena Diagnostics and Population Bio. These relationships did not influence data interpretation or presentation during this study, but are still being disclosed for potential future considerations. Funding We acknowledge support from the Northbridge Chair in Paediatric Research, the SickKids Psychiatry Associates Chair in Developmental Psychopathology, the University of Toronto McLaughlin Centre, the Canada Foundation for Innovation Major Sciences Initiative, and the SickKids Foundation. Authors’ contributions VF developed Extractify, the gene flagger module, the search endpoint of Papers-Nexus, and all versions of EAGLE-AI core; he conducted bulk automated performance assessments of all EAGLE-AI core data extraction and scoring module versions. JM developed the PDF-miner endpoint of Papers-Nexus, conducted manual curation and follow-up performance assessment for agentic EAGLE-AI on eight novel genes, tested and debugged EAGLE-AI core and Extractify, assisted in automated performance assessment of the scoring modules, and was the principal writer of the manuscript. NBS and OR manually curated EAGLE-AI’s ground truth datasets and tested and gave feedback on EAGLE-AI; NBS assisted in the manual performance assessment of EAGLE-AI on eight genes. VF, OR, and NH contributed to writing the manuscript. AW contributed to developing Papers-Nexus. 
VF, MMA, and WE initiated the project and conceptualized the framework. JASV and SWS provided supervision, funding acquisition, oversight, and guidance throughout the project. All authors read, gave feedback on, and approved the manuscript. Acknowledgements We wish to acknowledge the following resources: MSSNG ( www.mss.ng ), by Autism Speaks and The Centre for Applied Genomics at The Hospital for Sick Children, Toronto, Canada; and SPARK ( www.sparkforautism.org ), by the Simons Foundation Autism Research Initiative. We also thank the participating families for their time and contributions to these databases, as well as the generosity of the donors who supported these programs. Additionally, we wish to thank all research authors whose data published under open access enabled development of EAGLE-AI. Footnotes Figure 5 was updated to include a new batch of scoring performance data for the agentic workforce. As the new data include cases that are not publicly available, we have updated our research ethics statement and acknowledgements accordingly. The paper has been reformatted in line with submission criteria to Genome Biology, the new journal we are submitting to. References 1. ↵ Satam H , et al. Next-Generation Sequencing Technology: Current Trends and Advancements . Biology . 2023 ; 12 ( 997 ). DOI: 10.3390/biology12070997 OpenUrl CrossRef 2. ↵ Stefanski A , Calle-Lopez Y , Leu C , Perez-Palma E , Pestana-Knight E . Clinical sequencing yield in epilepsy, autism spectrum disorder, and intellectual disability: a systematic review and meta-analysis . Epilepsia . 2020 ; 62 ; 143 – 151 . DOI: 10.1111/epi.16755 OpenUrl CrossRef PubMed 3. ↵ Selvanayagam T , et al. Clinical utility of genome sequencing in autism: illustrative examples from a genomic research study . J Med Genet . 2025 ; 62 : 413 – 421 . DOI: 10.1136/jmg-2024-110463 OpenUrl Abstract / FREE Full Text 4. ↵ Quick VS , Wang B , State MW. 
Leveraging large genomic datasets to illuminate the pathobiology of autism spectrum disorders . Neuropsychopharmacol . 2021 ; 46 ; 55 – 69 . DOI: 10.1038/s41386-020-0768-y OpenUrl CrossRef PubMed 5. ↵ Trost B , et al. Genomic architecture of autism from comprehensive whole-genome sequence annotation . Cell . 2022 ; 185 : 4409 – 4427 . DOI: 10.1016/j.cell.2022.10.009 OpenUrl CrossRef PubMed 6. ↵ Tammimies K , et al. Molecular diagnostic yield of chromosomal microarray analysis and whole-exome sequencing in children with autism spectrum disorder . JAMA . 2015 ; 314 : 895 – 903 . DOI: 10.1001/jama.2015.10078 OpenUrl CrossRef PubMed 7. ↵ Feliciano P , et al. Exome sequencing of 457 autism families recruited online provides evidence for autism risk genes . NPJ Genom Med . 2019 ; 4 : 19 . DOI: 10.1038/s41525-019-0093-8 OpenUrl CrossRef PubMed 8. ↵ Fu JM , et al. Rare coding variation provides insight into the genetic architecture and phenotypic context of autism . Nat Genet . 2022 ; 54 : 1320 – 1331 . DOI: 10.1038/s41588-022-01148-2 OpenUrl CrossRef PubMed 9. ↵ Hoang N , Buchanan JA , Scherer SW . Heterogeneity in clinical sequencing tests marketed for autism spectrum disorders . NPJ Genom Med . 2018 ; 3 : 27 . DOI: 10.1038/s41525-018-0066-3 OpenUrl CrossRef PubMed 10. ↵ Schaaf CP , et al. A framework for an evidence-based gene list relevant to autism spectrum disorder . Nat Rev Genet . 2020 ; 21 : 367 – 376 . DOI: 10.1038/s41576-020-0231-2 OpenUrl CrossRef PubMed 11. ↵ Srivastava S , et al. Meta-analysis and multidisciplinary consensus statement: exome sequencing is a first-tier clinical diagnostic test for individuals with neurodevelopmental disorders . Genet Med . 2019 ; 21 : 2413 – 2421 . DOI: 10.1038/s41436-019-0554-6 OpenUrl CrossRef 12. ↵ Strande NT , et al. Evaluating the clinical validity of gene-disease associations: an evidence-based framework developed by the Clinical Genome Resource . Am J Hum Genet . 2017 ; 100 : 895 – 906 . 
DOI: 10.1016/j.ajhg.2017.04.015 OpenUrl CrossRef PubMed 13. ↵ ClinGen Consortium . The Clinical Genome Resource (ClinGen): advancing genomic knowledge through global curation . Genet Med . 2025 ; 27 : 101228 . DOI: 10.1016/j.gim.2024.101228 OpenUrl CrossRef PubMed 14. ↵ Riggs ER , et al. Clinical validity assessment of genes frequently tested on intellectual disability/autism sequencing panels . Genet Med . 2022 ; 24 : 1899 – 1908 . DOI: 10.1016/j.gim.2022.05.001 OpenUrl CrossRef PubMed 15. ↵ Balzotti M , et al. Clinical validity of expanded carrier screening: Evaluating the gene-disease relationship in more than 200 conditions . Hum Mutat . 2020 ; 41 : 1365 – 1371 . DOI: 10.1002/humu.24033 OpenUrl CrossRef PubMed 16. ↵ Carter MT , Scherer SW . Autism spectrum disorder in the genetics clinic: a review . Clin Genet . 2013 ; 83 : 399 – 407 . DOI: 10.1111/cge.12101 OpenUrl CrossRef PubMed 17. ↵ Assuah FB , et al. A literature review of similarities between and among patients with autism spectrum disorder and epilepsy . Cureus . 2023 ; 15 : e33946 . DOI: 10.7759/cureus.33946 OpenUrl CrossRef 18. ↵ Etyemez S , et al. The role of intellectual disability with autism spectrum disorder and the documented co-occurring conditions: a population-based study . Autism Res . 2022 ; 15 : 2399 – 2408 . DOI: 10.1002/aur.2831 OpenUrl CrossRef PubMed 19. ↵ Vorstman JAS , Scherer SW. Contemplating syndromic autism . Genet Med . 2023 ; 25 : 100919 . DOI: 10.1016/j.gim.2023.100919 OpenUrl CrossRef PubMed 20. ↵ Vorstman JAS , et al. Autism genetics: opportunities and challenges for clinical translation . Nat Rev Genet . 2017 ; 18 : 362 – 376 . DOI: 10.1038/nrg.2017.4 OpenUrl CrossRef PubMed 21. ↵ Fernandez BA , Marshall CR , Vorstman JAS , Scherer SW . Molecular testing in autism spectrum disorder . In: Diagnostic Molecular Pathology . 2nd ed . 2024 : 291 – 301 . DOI: 10.1016/B978-0-12-822824-1.00016-X OpenUrl CrossRef 22. ↵ Kim SW , et al. 
Evaluation of familial phenotype deviation to measure the impact of de novo mutations in autism . Genome Med . 2025 ; 17 : 93 . DOI: 10.1186/s13073-025-01532-7 OpenUrl CrossRef 23. ↵ Leblond CS , et al. Meta-analysis of SHANK mutations in autism spectrum disorders: a gradient of severity in cognitive impairments . PLoS Genet . 2014 ; 10 : e1004580 . DOI: 10.1371/journal.pgen.1004580 OpenUrl CrossRef PubMed 24. ↵ Sanders SJ , et al. Progress in understanding and treating SCN2A-mediated disorders . Trends Neurosci . 2018 ; 41 : 442 – 456 . DOI: 10.1016/j.tins.2018.03.011 OpenUrl CrossRef PubMed 25. ↵ Litman A , et al. Decomposition of phenotypic heterogeneity in autism reveals underlying genetic programs . Nat Genet . 2025 ; 57 : 1611 – 1619 . DOI: 10.1038/s41588-025-02224-z OpenUrl CrossRef PubMed 26. ↵ SPARK Consortium . SPARK: a US cohort of 50,000 families to accelerate autism research . Neuron . 2018 ; 97 : 488 – 493 . DOI: 10.1016/j.neuron.2018.01.015 OpenUrl CrossRef PubMed 27. ↵ Yuen RK , et al. Whole genome sequencing resource identifies 18 new candidate genes for autism spectrum disorder . Nat Neurosci . 2017 ; 20 : 602 – 611 . OpenUrl CrossRef PubMed 28. ↵ De Rubeis S , et al. Synaptic, transcriptional and chromatin genes disrupted in autism . Nature . 2014 ; 515 : 209 – 215 . DOI: 10.1038/nature13772 OpenUrl CrossRef PubMed Web of Science 29. ↵ Abrahams BS , et al. SFARI Gene 2.0: a community-driven knowledgebase for the autism spectrum disorders (ASDs) . Mol Autism . 2013 ; 4 : 36 . DOI: 10.1186/2040-2392-4-36 OpenUrl CrossRef PubMed 30. ↵ Shi M , et al. A machine learning approach for the curation of biomedical literature . Advances in Information Retrieval . 2003 ; 597 – 604 . DOI: 10.1007/3-540-36618-0_47 OpenUrl CrossRef Web of Science 31. ↵ Varela-Vega A , Posada-Reyes A , Méndez-Cruz C . Automatic extraction of transcriptional regulatory interactions of bacteria from biomedical literature using a BERT-based approach . Database . 2024 ; baae094 . 
DOI: 10.1093/database/baae094 OpenUrl CrossRef PubMed 32. ↵ Chen Q , Du J , Allot A , Lu Z . LitMC-BERT: Transformer-based multi-label classification of biomedical literature with an application on COVID-19 literature curation . IEEE/ACM Trans Comput Biol Bioinform . 2022 ; 19 : 2584 – 2595 . DOI: 10.1109/TCBB.2022.3173562 OpenUrl CrossRef 33. ↵ Chen Q , Du J , Allot A , Lu Z . LitMC-BERT: Transformer-based multi-label classification of biomedical literature with an application on COVID-19 literature curation . IEEE/ACM Trans Comput Biol Bioinform . 2022 ; 19 : 2584 – 2595 . DOI: 10.1109/TCBB.2022.3173562 OpenUrl CrossRef 34. ↵ Zhang Y , et al. Automation of literature screening using machine learning in medical evidence synthesis: a diagnostic test accuracy systematic review protocol . Syst Rev . 2022 ; 11 : 11 . DOI: 10.1186/s13643-021-01881-5 OpenUrl CrossRef PubMed 35. ↵ Scherbakov D , Hubig N , Jansari V , Bakumenko A , Lenert LA . The emergence of large language models as tools in literature reviews: a large language model-assisted systematic review . J Am Med Inform Assoc . 2025 ; 32 : 1071 – 1086 . DOI: 10.1093/jamia/ocaf063 OpenUrl CrossRef PubMed 36. ↵ o3-mini. OpenAI ; 2024 . Accessed February 13, 2025 . Available from: https://platform.openai.com/docs/models/o3-mini 37. ↵ GPT-4o. OpenAI ; 2024 . Accessed November 26, 2024 . Available from: https://platform.openai.com/docs/models/gpt-4o 38. ↵ GPT-4o mini. OpenAI ; 2024 . Accessed November 26, 2024 . Available from: https://platform.openai.com/docs/models/gpt-4o-mini 39. ↵ Cock PJA , et al. Biopython: freely available Python tools for computational molecular biology and bioinformatics . Bioinformatics . 2009 ; 25 : 1422 – 1423 . DOI: 10.1093/bioinformatics/btp163 OpenUrl CrossRef PubMed Web of Science 40. ↵ Cholewiak SA , Ipeirotis P , Silva V , Kannawadi A. SCHOLARLY: simple access to Google Scholar authors and citation using Python [software]: Version 1.5 ; 2021 . Accessed Nov 26, 2024 . 
Available from: https://github.com/scholarly-python-package/scholarly 41. ↵ National Library of Medicine . Data from: PMC Open Access Subset. National Center for Biotechnology Information . Deposited Sept 11, 2025 . Accessed September 25, 2025 . Available from: https://pmc.ncbi.nlm.nih.gov/tools/openftlist/ 42. ↵ Piwowar H , Orr R , Priem J. Unpaywall REST API ; n.d. Accessed September 25, 2025 . Available from: https://unpaywall.org/products/api 43. ↵ Adobe Inc . Adobe PDF Services API ; 2025 . Accessed September 25, 2025 . Available from: https://developer.adobe.com/document-services/docs/overview/ 44. ↵ openpyxl contributors. openpyxl ; 2025 . Accessed September 25, 2025 . Available from: https://pypi.org/project/openpyxl/ 45. ↵ Li G , et al. CAMEL: Communicative Agents for “Mind” Exploration of Large Language Model Society . arXiv . 2023 ; 2303.17760 . OpenUrl 46. ↵ CAMEL-AI contributors . CAMEL-AI 0.2.36: Key Modules: Tools ; 2024 . Accessed September 25, 2025 . Available from: https://docs.camel-ai.org/key_modules/tools.html 47. ↵ Meta AI. Llama 3.1–8B; n.d. Accessed September 25, 2025 . Available from: https://huggingface.co/meta-llama/Llama-3.1-8B 48. ↵ McLaren W , et al. The Ensembl Variant Effect Predictor . Genome Biol . 2016 ; 17 : 122 . DOI: 10.1186/s13059-016-0974-4 OpenUrl CrossRef PubMed 49. ↵ Belmadani M , et al. VariCarta: A comprehensive database of harmonized genomic variants found in autism spectrum disorder sequencing studies . Autism Res . 2019 ; 12 : 1728 – 1736 . DOI: 10.1002/aur.2200 OpenUrl CrossRef PubMed 50. ↵ Rentzsch P , Witten D , Cooper GM , Shendure J , Kircher M . CADD: predicting the deleteriousness of variants throughout the human genome . Nucleic Acids Res . 2019 ; 47 : D886 – D894 . DOI: 10.1093/nar/gky1016 OpenUrl CrossRef PubMed 51. ↵ Ng PC , Henikoff S . SIFT: predicting amino acid changes that affect protein function . Nucleic Acids Res . 2003 ; 31 : 3812 – 3814 . 
DOI: 10.1093/nar/gkg509 OpenUrl CrossRef PubMed Web of Science 52. ↵ Adzhubei I , Jordan DM , Sunyaev SR . Predicting functional effect of human missense mutations using PolyPhen-2 . Curr Protoc Hum Genet . 2013 ; 7 :Unit 7.20. DOI: 10.1002/0471142905.hg0720s76 OpenUrl CrossRef PubMed 53. ↵ Karczewski KJ , et al. The mutational constraint spectrum quantified from variation in 141,456 humans . Nature . 2020 ; 581 : 434 – 443 . DOI: 10.1038/s41586-020-2308-7 OpenUrl CrossRef PubMed 54. ↵ Allot A , et al. Tracking genetic variants in the biomedical literature using LitVar 2.0 . Nat Genet . 2023 ; 55 : 901 – 903 . DOI: 10.1038/s41588-023-01492-4 OpenUrl CrossRef PubMed 55. ↵ Bradski G. The OpenCV Library. Dr. Dobb’s J Software Tools . 2000 ; 25 : 120 – 125 . OpenUrl 56. ↵ Richter JD , Zhao X . The molecular biology of FMRP: new insights into Fragile X Syndrome . Nat Rev Neurosci . 2021 ; 22 : 209 – 222 . DOI: 10.1038/s41583-021-00432-0 OpenUrl CrossRef PubMed 57. ↵ Bruford EA , Braschi B , Denny P , Jones TEM , Seal RL , Tweedie S . Guidelines for human gene nomenclature . Nat Genet . 2020 ; 52 : 754 – 758 . DOI: 10.1038/s41588-020-0669-3 OpenUrl CrossRef PubMed 58. ↵ Green ED , Watson JD , Collins FS . Human Genome Project: twenty-five years of big biology . Nature . 2015 ; 526 : 29 – 31 . DOI: 10.1038/526029a OpenUrl CrossRef PubMed 59. ↵ Trost B , Loureiro LO , Scherer SW . Discovery of genomic variation across a generation . Hum Mol Genet . 2021 ; 30 : R174 – R186 . DOI: 10.1093/hmg/ddab209 OpenUrl CrossRef PubMed 60. ↵ Amberger JS , Bocchini CA , Schiettecatte F , Scott AF , Hamosh A. OMIM.org: Online Mendelian Inheritance in Man (OMIM), an online catalog of human genes and genetic disorders . Nucleic Acids Res . 2015 ; 43 : D789 – D798 . DOI: 10.1093/nar/gku1205 OpenUrl CrossRef PubMed 61. ↵ Paszke A , et al. PyTorch: An imperative style, high-performance deep learning library . arXiv . 2019 ; 1912.01703 . 62. ↵ Zhang Y , Chen Q , Yang Z , Lin H , Lu Z . 
BioWordVec, improving biomedical word embeddings with subword information and MeSH . Sci Data . 2019 ; 6 : 52 . DOI: 10.1038/s41597-019-0055-0 OpenUrl CrossRef PubMed 63. ↵ Luo L , Lai P-T , Wei C-H , Arighi CN , Lu Z. BioRED: a rich biomedical relation extraction dataset . Brief Bioinform . 2022 ; 23 : bbac282 . DOI: 10.1093/bib/bbac282 OpenUrl CrossRef 64. ↵ HuggingFace. Transformer reinforcement learning (TRL) library ; n.d.. Accessed September 25, 2025 . Available from: https://huggingface.co/docs/trl/v0.21.0/en/index 65. ↵ Sherry ST , et al. dbSNP: the NCBI database of genetic variation . Nucleic Acids Res . 2001 ; 29 : 308 – 311 . DOI: 10.1093/nar/29.1.308 OpenUrl CrossRef PubMed Web of Science 66. ↵ Landrum MJ , et al. ClinVar: public archive of relationships among sequence variation and human phenotype . Nucleic Acids Res . 2014 ; 42 : D980 – D985 . DOI: 10.1093/nar/gkt1113 OpenUrl CrossRef PubMed Web of Science 67. ↵ Hart RK , et al. HGVS Nomenclature 2024: improvements to community engagement, usability, and computability . Genome Med . 2024 ; 16 : 149 . DOI: 10.1186/s13073-024-01421-5 OpenUrl CrossRef 68. Chaves LD , et al. Skewed X-chromosome inactivation in women with idiopathic intellectual disability is indicative of pathogenic variants . Mol Neurobiol . 2023 ; 60 : 3758 – 3769 . DOI: 10.1007/s12035-023-03311-0 OpenUrl CrossRef PubMed 69. Amadori E , et al. Targeted re-sequencing for early diagnosis of genetic causes of childhood epilepsy: the Italian experience from the ‘beyond epilepsy’ project . Ital J Pediatr . 2020 ; 46 : 92 . DOI: 10.1186/s13052-020-00860-1 OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted October 02, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. 
Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following EAGLE-AI: A large language model workflow for automated extraction and scoring of literature evidence linking genes to autism spectrum disorder Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share EAGLE-AI: A large language model workflow for automated extraction and scoring of literature evidence linking genes to autism spectrum disorder Vinicius Furlan , Julian Moran , Nelson B. Salazar , Olivia Rennie , Ny Hoang , Andrew Wan , Marla Mendes de Aquino , Worrawat Engchuan , Jacob A.S. Vorstman , Stephen W. Scherer medRxiv 2025.09.10.25334730; doi: https://doi.org/10.1101/2025.09.10.25334730 Share This Article: Copy Citation Tools EAGLE-AI: A large language model workflow for automated extraction and scoring of literature evidence linking genes to autism spectrum disorder Vinicius Furlan , Julian Moran , Nelson B. Salazar , Olivia Rennie , Ny Hoang , Andrew Wan , Marla Mendes de Aquino , Worrawat Engchuan , Jacob A.S. Vorstman , Stephen W. 
Scherer medRxiv 2025.09.10.25334730; doi: https://doi.org/10.1101/2025.09.10.25334730 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15228) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6599) Geriatric Medicine (668) Health Economics (997) Health Informatics (4536) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3332) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9231) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a0067dca3f6e593a',t:'MTc3OTU2NDc5Ng=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.