Can large language models reliably extract human disease genes from full-text scientific literature?

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 51,633 characters · extracted from preprint-html · click to expand
Can large language models reliably extract human disease genes from full-text scientific literature? | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Can large language models reliably extract human disease genes from full-text scientific literature? View ORCID Profile Danqing Yin , Matthew Ka Siu Leung , Darren Wan Ho Pun , Fiona Haixin Chen , Julie Yujin Kwon , Xinyi Lin , View ORCID Profile Joshua W. K. Ho doi: https://doi.org/10.1101/2025.07.27.667022 Danqing Yin 1 School of Biomedical Sciences, Li Ka Shing Faculty of Medicine, The University of Hong Kong , Hong Kong SAR, China 2 Laboratory of Data Discovery for Health, Hong Kong Science Park , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Danqing Yin Matthew Ka Siu Leung 3 ESF South Island School , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Darren Wan Ho Pun 3 ESF South Island School , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Fiona Haixin Chen 3 ESF South Island School , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Julie Yujin Kwon 3 ESF South Island School , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Xinyi Lin 1 School of Biomedical Sciences, Li Ka Shing Faculty of Medicine, The University of Hong Kong , Hong Kong SAR, China 2 Laboratory of Data Discovery for Health, Hong Kong Science Park , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Joshua W. K. Ho 1 School of Biomedical Sciences, Li Ka Shing Faculty of Medicine, The University of Hong Kong , Hong Kong SAR, China 2 Laboratory of Data Discovery for Health, Hong Kong Science Park , Hong Kong SAR, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joshua W. K. Ho For correspondence: jwkho{at}hku.hk Abstract Full Text Info/History Metrics Preview PDF Abstract Manual extraction of high-fidelity gene-disease-phenotype information from human genetics literature is a labor-intensive task that requires trained human genetics researchers to read through many primary research papers. This presents a major challenge for maintaining up-to-date human disease genetic databases. Recent exploration into large language models (LLMs) opens new directions in automating this manual process. However, most approaches depend on pre-training, finetuning, or specialized generative artificial intelligence (GenAI) tools, but there is a lack of empirical evidence to show whether commercially-available LLMs can be directly used to reliably extract gene-disease-phenotype for human genetic diseases. Herein, we perform a benchmark of the use of three zero-shot prompted LLMs, namely GPT-4, DeepSeek and Claude, without task-specific fine-tuning, to extract human genetic information directly from full text of scientific papers. Using known congenital heart diseases (CHD) genes found in the open access CHDgene database ( https://chdgene.victorchang.edu.au/ ) as the benchmark data set, GPT-4o achieved overall 88.8% extraction accuracy across 23 gene entries containing over 57 references, with 100% accuracy in gene name, 78.3% and 76.7% in disease and phenotype fields respectively. This work introduces a lightweight, easy-to-deploy, and yet robust LLM-based agent named GeneAgent, analyze sources of disagreement, and highlight the feasibility of integrating powerful LLM into genetic evidence synthesis workflows. Highlight - First systematic benchmark of LLMs for extracting human gene–disease–phenotype relationships from full-text biomedical articles - GeneAgent: a lightweight, highly accurate prompt-only LLM agent - New domain task-specific evaluation framework Introduction The growing pace of human disease genetic research has resulted in an unprecedented volume of scientific literature, the majority of it containing essential information about gene–disease associations, phenotypes, variants, and underlying pathological mechanisms. There have been various attempts to benchmark the use of generative AI systems to extract gene–disease–phenotype relationships from full text biomedical literature in human genetics. While previous study ( Chen et al., 2025 ) does not directly investigate the use of LLM in extracting gene-disease-phenotype relationship, it applies relation extraction on datasets e.g. ChemProt and DDI2013, which involves identifying relationships between protein-protein and drug-drug interactions with the use of F1-scores for relation extraction to provide benchmark for comparison with zero-shot, few-shot, fine-tuning performances of GPT-3.5, GPT-4, LLaMA 2, PMC-LLaMA on 12 Biomedical Natural Language Processing (BioNLP) datasets. Another study ( Chen et al., 2023 ) also investigated the use of ChatGPT on a broad set of BioNLP tasks, including relation extraction by evaluating GPT-3.5’s performance in extracting ChemProt, Drug-Drug Interaction (DDI) and GAD information using the Biomedical Language Understanding and Reasoning Benchmark (BLURB) framework. Such a benchmarking framework can be adapted or extended for gene-disease-phenotype relationship extraction. Another pilot study ( Jahan et al., 2024 ) also evaluated multiple LLM in six BioNLP tasks including relation extraction on 26 biomedical datasets including BC5CDR, KD-DTI and DDI with BC5CDR being closely similar to gene-disease-phenotype extraction. In addition, LORE is a framework introduced ( Li et al., 2025 ) for producing knowledge graphs of disease-gene relationships mined from PubMed abstracts with 90% mean average precision. A team further assessed the use of GPT-3.5 and GPT-4 in extracting genetic and genomic data from full-text journal articles have ( Poretsky et al., 2025 ), even though the tested literature is on wheat and barley genetics, the methodology applies to extracting gene-phenotype and disease associations from unstructured scientific literature. What’s more, an automation framework has been proposed ( Chang et al., 2024 ) for extracting disease-gene associations from literature which uses Hit Ratio to measure how well the framework ranks known disease-gene associations. However, it only focuses on abstracts due to length constraints. Afterall, in the field of human genetics, extraction of human gene–disease–phenotype information depends heavily on expert-driven, manual extraction from the scientific paper, often in unstructured text. This is often a labor-intensive and time-consuming process that does not keep pace with the rate of human disease genetic discovery ( Hein et al., 2025 ; Lee et al., 2020 ). Automation of the information extraction process using artificial intelligence (AI) methods is important. Traditional text mining in biomedical research mostly relied on classical natural language processing methods ( Rebholz-Schuhmann et al., 2012 ). These systems employed patterns, dependency parsing, and classifiers to extract genetic associations from abstracts and titles in literature repositories such as PubMed Central. While reliable, interpretable, and causally aware AI agents are yet to be developed to accelerate biomedical knowledge discovery (Gao et al, 2025; Ayesha et al, 2025 ; Li et al., 2025 ). Traditional biomedical NLP models are limited by their sentence-level scope, reliance on annotated corpus, and inability to reason over full documents or multi-hop relationships ( Zheng et al., 2024 ; Lamurias et al., 2025 ). Subsequent advances in BioBERT— a fine-tuned large pretrained language model on biomedical corpora ( Lee et al., 2020 ) has improved particularly on named entity recognition (NER) and relation extraction (RE) and full document processing. Recent emerging techniques in large language models (LLMs) such as GPT-3.5, GPT-4, and BioGPT have opened new possibilities for zero- and few-shot learning in biomedical information extraction ( Gill et al., 2024 ). These models demonstrate superficial fluency and task generalization, including for entity and relation extraction without task-specific training. Several studies have revealed the potentials of LLMs for tasks such as drug–gene–cancer triad extraction ( Lai et al., 2025 ), gene–trait retrieval from plant literature ( Poretsky et al., 2025 ), structured evidence extraction from full-text papers (Gao et al., 2025; Li et al., 2025 ) gene function extraction from full-text papers ( Kumar et al., 2025 ) as well as genetic variant prediction from unstructured text ( Ayesha et al., 2025 ). However, most of these approaches rely on retrieval-augmented generation (RAG), model pretraining, task-specific fine-tuning, or external confidence scoring pipelines—all of which involve engineering complexity, computational cost, and risks of hallucination. Despite the abovementioned technical challenges, human genetics literature itself is in nature highly sophisticated to parse. Gene names often overlap with common words (e.g., “MAP,” “ACE”), entity resolution is confounded by species-specific synonyms, and phenotype associations are frequently described implicitly or across sentences ( Poretsky et al., 2025 ). Consequently, LLMs struggle to resolve gene–phenotype relations without clearly stated contextual metadata ( Groza et al., 2024 , Neeley et al., 2025 , Kim et al., 2024 ), which is insufficient or unstructured in the source literature ( Hein et al., 2025 ). These factors affect model accuracy and curator agreement. Furthermore, LLMs are prone to hallucination, where fabricated but plausible-sounding gene–phenotype relations are generated without factual support ( Kodikara and Verspoor, 2024 ). There is a lack of systematic evaluation of the use of generative AI models to extract human disease genes from the full text of scientific papers. One main bottleneck is the lack of high-quality human-curated gene lists as a ground truth. To address this issue, we used a recently published open-access comprehensive database on human congenital heart disease (CHD) genes, called CHDgene ( Yang et al., 2022 ). CHD is a malformation of the heart present at birth with an incidence of 1% of live births. CHDgene is a comprehensive database containing a human-curated list of 189 CHD-causing genes based on human evaluation and curation of 438 publications. In this study, CHDgene serves as a gold-standard resource for benchmarking. In domains like CHD, there has not been any systematic benchmarking on LLM use in the field of human disease genetics. Our expertise in CHD and one of our authors’ involvement in development of CHDgene provides us with a unique opportunity to perform this benchmarking study. This work is the first of its kind in the field of human disease genetics. We also introduced a domain-specific benchmark framework, namely GeneAgent, to evaluate LLM-based extraction of gene–disease–phenotype associations from biomedical literature. GeneAgent, a robust, domain-constrained LLM-based agent that reliably extracts gene-disease-phenotype associations from biomedical literature using zero-shot prompting alone. Unlike prior approaches, our method requires no fine-tuning, no RAG, and no ontology integration. This framework includes a multi-dimensional evaluation schema for assessing consistency across gene, disease, and phenotype outputs. To better capture near-matches and address variability in biomedical terminology, we incorporated fuzzy and semantic matching into the side-by-side evaluation process for disease and CHD phenotype fields. The framework also includes a systematic protocol for identifying and categorizing disagreement cases—such as false positives, false negatives, and partial matches—enabling error analysis towards a more realistic and interpretable assessment of LLM performance in human genetics. Methods and materials Dataset construction and retrieval To assess the ability of LLMs to extract gene-disease-phenotype information for CHD, we leverage the CHDgene repository ( https://chdgene.victorchang.edu.au/ ),a high-quality, manually curated database. CHDgene integrates published findings of gene-level associations with congenital heart defects and serves as our benchmark for genetic evidence synthesis. We restricted our analysis to protein-coding genes listed in CHDgene, each associated with at least one PubMed-indexed primary research article included under Selected Reference section in CHDgene database. Consistency level of information extracted by three state-of-the-art, off-the-shelf, commercially available LLMs—GPT-4o, Claude-Opus-4, and DeepSeek-V3—are quantified across multiple dimensions including gene symbol, gene name, disease associated, and CHD phenotypes. We have randomly sampled 30 CHD genes from the CHDgene’s gene list, and 23 out of the 30 genes have one or more full-length articles as supporting evidence, the rest of the genes were filtered out. For each of the 23 gene, we have compiled custom webscraping scripts in Python and retrieved the following items ( Figure 1 ): The gene symbol and full name Disease annotations (e.g., “Heterotaxy”, “TOF”) Associated CHD phenotypes Indexed biomedical literature with PubMed links Download figure Open in new tab Figure 1. Overall workflow of the GeneAgent study design. Overview of the GeneAgent pipeline for benchmarking LLM-based evidence synthesis of gene– disease–phenotype relationships. The workflow includes data preparation (blue), prompt design (orange), LLM information extraction framework (purple) with result evaluation (yellow), manual error analysis (pink), and final benchmarking (green). The data are all manually verified for correctness. This benchmark data set contains 23 unique CHD-associated genes, containing a total of 57 scientific papers. This corpus formed the benchmark for evaluating the accuracy of LLM-based extraction. To construct the test set for LLM evaluation, we retrieved full-text PDFs of the selected literatures from PubMed Central. PDF files were stored and labeled with standardized filenames containing the gene symbol. Manual inspection was performed to ensure PDF quality and relevance by humans. Articles not containing original genetic data or not in English were excluded, ensuring 57 high-quality PDF documents as the final test corpus. GeneAgent design We developed a zero-shot prompt ( Figure 2 ) which is designed to emulate the structure of entries in CHDgene. The goal was to instruct the LLM to extract, in a structured and faithful manner, the gene symbol, full gene name, associated CHD phenotype(s), and disease term (if specified) from the full-text scientific paper. Download figure Open in new tab Figure 2. GeneAgent prompt design template. The prompt consisted of two parts (1) main body of GeneAgent and (2) evaluation. The GeneAgent contains the following components: A concise instruction (system message) defining the task of structured evidence synthesis An example with a known CHDgene record (e.g., “Symbol: ACVR2B; Name of gene: activin A receptor type 2B; Disease: Heterotaxy, visceral, 4; CHD Phenotype: - Atrial septal defect; - Pulmonary atresia; …”) Evaluation prompt is as below: A set of evaluation guidelines to compare the synthesized evidence extracted from literature against formal CHDgene database records No retrieval augmentation, external tools, or task-specific fine-tuning were applied. Evaluation Metrics To systematically evaluate the evidence synthesis performance of LLMs, we assessed the structured outputs along multiple dimensions aligned with the CHDgene database schema utilizing the evaluation prompt ( Figure 2 ). For each input paper, the LLM-generated evidence was compared to the gold-standard CHDgene record and scored using a multi-dimensional consistency framework. Scoring System: Consistency Evaluation Each synthesized field — including gene symbol, gene name, disease, and phenotype list — was scored independently using the following criteria: Consistent (1): The LLM output exactly matched or was semantically equivalent to the CHDgene entry. Inconsistent (0): The output contradicted the CHDgene record (e.g., incorrect gene name, unrelated disease). Unknown (-1): The output included plausible but unverifiable information not present in CHDgene or the reference paper (e.g., ambiguous synonyms, inferred phenotypes without direct evidence). In particular, for CHD Phenotype field, since it contains a list of one or more phenotype terms, was evaluated as follows: We computed the percentage of phenotypes correctly extracted compared to the curated CHDgene phenotype list. This score was treated as a continuous metric (0–100%), reflecting partial recovery of phenotype content. To assess model reliability, we recorded: True Positives: Responses containing CHD existing phenotypes supported by the CHDgene entry and the PDF content. False Positives: Responses containing CHD phenotypes unsupported by either the CHDgene entry or the PDF content. False Negatives: Cases where the reference paper clearly contained CHD phenotype information, but the LLM failed to return any structured output. Total score equals to the sum of consistency scores across all fields. Benchmark design for LLM evaluation We initially established an exact-match scoring strategy, where perfect alignment of extracted terms with the CHDgene database entries was required for positive scoring. However, recognizing inherent variations in biomedical nomenclature, we progressively enhanced our matching criteria. For the disease dimension, we implemented a fuzzy-matching algorithm using Levenshtein similarity scores (threshold ≥ 85%), accommodating minor differences such as inheritance notations (e.g., autosomal dominant denoted as “(AD)”). . In the beginning, phenotype terms were standardized by lowercasing, punctuation removal, and singular/plural harmonization (e.g., “defects” unified to “defect”). Subsequently, we implemented synonym mappings to resolve common abbreviations and acronyms (such as “VSD” to “ventricular septal defect”), thereby reducing false negatives stemming from terminological discrepancies. Phenotype consistency was quantified by calculating the proportion of successfully matched phenotype terms, with a threshold of 80% match rate designating a positive consistency count. Negative controls To ensure robustness, we tested LLM behavior on some negative control papers, which consists of 26 randomly sampled articles from PubMed Central what are not related to CHD or genetics. These samples were selected such that the correct response when presented to the system is that the system will indicate that the paper does not contain any CHDgene related information for extraction. Data and code availability All processing scripts were implemented in Python (v3.12). Our entire pipeline, including prompt templates, webscrapping scripts, evaluation notebooks and evaluated results, is made publicly available at https://github.com/hiyin/GeneAgent . Result Overall accuracy of three LLMs To assess overall performance, we aggregated consistency scores across all evaluated fields— gene name, disease association, and CHD phenotype—into a single average score per model. As shown in Figure 3 , GPT-4o achieved the highest overall consistency, with an average rate of 88.8%. This performance reflects the model’s superior alignment with the CHDgene gold-standard database across both categorical and list-based biomedical fields. Claude-Opus-4 and DeepSeek-V3 displayed comparable overall consistency rates (69.9% and 66.9%, respectively). This convergence suggests their overall extraction fidelity is limited when compared to GPT-4o. These findings underscore the importance of evaluating LLMs not only on isolated information types but also in terms of integrated biomedical comprehension. Download figure Open in new tab Figure 3. Overall consistency of LLMs in gene–disease–phenotype extraction benchmark. A bar plot showing the average consistency rate (%) of three large language models (LLMs) — GPT-4o, Claude-Opus-4, and DeepSeek-V3 — evaluated across three core categories: gene name, associated disease, and CHD phenotype. Consistency was measured against expert-curated annotations from the CHDgene database. GPT-4o achieves the highest average consistency (88.8%), followed by Claude-Opus-4 (69.9%) and DeepSeek-V3 (66.9%). Detailed analysis of LLMs With respect to multi-dimensional comparison in Figure 4 , GPT-4o demonstrated the highest consistency across all dimensions, achieving perfect accuracy in gene name or symbol recognition (100%), and strong performance in disease (78.3%) and CHD phenotype (76.7%) fields. In contrast, Claude-Opus-4 and DeepSeek-V3 both showed moderate performance in CHD phenotype extraction (75.4% and 58%, respectively) but lagged substantially in disease association recognition, with consistency rates of only 30.4% and 28.6%. GPT-4o reached 100% accuracy in name of gene field, followed by Claude-Opus-4 and DeepSeek-V3 scoring 73.9% and 81% closely. Notably, gene symbol extraction appeared relatively robust across all models, whereas disease and phenotype annotations exhibited greater variability. Download figure Open in new tab Figure 4. Model-wise consistency of gene–disease–phenotype extraction benchmarked against CHDgene gold-standard annotations. A bar plot depicting the consistency rates (%) of three leading LLMs—GPT-4o, Claude-Opus-4, and DeepSeek-V3. Consistency was assessed across four dimensions: gene names or symbol, name of gene disease associations, and CHD phenotypes, based on comparison against expert-curated CHDgene database entries. These findings suggest that while LLMs such as GPT-4o are capable of approaching expert-level consistency in certain structured tasks, extracting multi-faceted biomedical associations — especially disease and phenotypes — remains an open challenge for all models. The observed variability highlights the importance of fine-grained benchmarking strategies and domain-specific alignment in biomedical LLM deployment. In-depth error analysis: Disease field One question we ask is which diseases are most or least likely to be wrong under which model, and we detected some patterns, such as length of disease name. Our error analysis has produced both qualitative and quantitative summary on the disease labels below: It is clear that long, punctuation-rich disease names are far more error-prone (see Table 2 ). Names that tripped up ≥ 2 models are almost twice as long as universally correct ones (41 vs 20 characters; 5 vs 2 words) and much more likely to contain commas (79 % vs 20 %) or inheritance qualifiers such as “(AD)/(AR)” (86 % vs 40 %). In contrast, short, uncluttered names with minimal punctuation are reliably handled by all models. View this table: View inline View popup Table 1. Matching strategies adopted in evaluating the consistency of LLM extracted fields against CHDgene entries. View this table: View inline View popup Download powerpoint Table 2. Lexical features of the 23 benchmarked disease names, stratified by model performance. Median length (characters and words) and frequency of punctuation markers are reported for (i) the full test set, (ii) the 14 diseases that at least two LLMs failed to reproduce, and (iii) the 5 diseases that every model rendered correctly. View this table: View inline View popup Download powerpoint Table 3. Description on textual characteristics of most missed and least missed disease by LLMs. Welch two-sample t tests show that disease labels missed by ≥ 2 models are significantly longer than those rendered correctly by all three models, both in character count (p ≈ 0.033) and word count (p ≈ 0.017). There are no significances detected for the other two categories (Has commas and Has parentheses). We observe a clear monotonic relationship between disease-name length and extraction failure—-longer disease name labels, more errors: adding ∼20–25 characters raises the expected error count by one model (see Table 2 ; Figure 5 ). Error dispersion rises sharply once ≥2 models fail. When two or more systems miss the same gene, the corresponding labels are no longer just longer—they are structurally more complex, often embedding commas, semicolons, or inheritance qualifiers such as “(AD)/(AR)”. These composite strings stretch the upper whiskers of the boxLJplot beyond 100 characters. Conversely, short, canonical names remain safe. The only 4 genes that every model handled perfectly—ANKRD11, CHD7, HRAS, and KANSL1—all carry concise, single-clause disease terms. Their ground-truth labels cluster tightly around 20 characters and lack additional punctuation, underscoring that brevity and lexical familiarity protect against hallucination. Download figure Open in new tab Figure 5. Disease-name length drives extraction errors across models. A box plot that compares the character-length distribution of ground-truth disease names for genes where 0, 1, 2 or 3 large language models (GPT-4o, Claude-Opus-4, DeepSeek-V3) produced a mismatch. Red colours encode the number of models extracted wrongly, grey cross mark each disease name and red “×” symbols mark group means. In-depth error analysis: CHD Phenotypes field Next we ask which phenotypes are most or least likely to be wrong? We counted the CHD phenotypes that are false positives and false negatives. The error analysis ( Figure 6 ) indicates that large language models systematically over-predict highly prevalent septal phenotypes while under-detecting anatomically nuanced or terminologically heterogeneous phenotypes. Ventricular septal defect and patent ductus arteriosus alone constitute almost 1/3 of all FP , implying that when uncertainty arises the models default to “textbook” phenotypes. By contrast, omissions are enriched for complex entities such as heterotaxy, aberrant vascular branches, and subtle valvular malformations, revealing limited recall for less stereotypical descriptors that are likely under-represented in training data. The bidirectional presence of aortic stenosis and patent ductus arteriosus among both false positives and false negatives further underscores inconsistent decision thresholds rather than a unidirectional bias. Collectively, these trends suggest that although there is propensity for LLMs to “hallucinate” these broad commonly cited CHD phenotypes as FP when the source text mentions them tangentially, FN terms are either (i) nuanced variants of common phenotypes (e.g. mitral stenosis vs mitral regurgitation ), or (ii) rarer phenotypes that appear less often in training data. Download figure Open in new tab Figure 6. Model-specific distribution of false-positive (FP) and false-negative (FN) CHD phenotypes. Horizontal bar plots show the 15 most frequent false-positive (panel a) and false-negative (panel b) CHD phenotypes produced by the LLM ensemble when compared with the CHDgene reference. Bars are ordered by descending absolute count, with numeric labels indicating the number of errors for each phenotype. Ventricular septal defect (12) and patent ductus arteriosus (10) were the most common false positives, followed by pulmonary stenosis, atrial septal defect, and pulmonary atresia (8 each). False negatives were led by patent ductus arteriosus and heterotaxy (4 each), with aortic stenosis, mitral stenosis, and patent foramen ovale recorded three times apiece. Patent ductus arteriosus and aortic stenosis appeared in both error categories. Download figure Open in new tab Figure 7. Stacked horizontal bar chart of the 15 most frequent false negative and false-positive CHD phenotypes. Bars are partitioned by model — GPT-4o (green), Claude-Opus-4 (orange) and DeepSeek-V3 (purple) — and ranked by the combined error count for each phenotype. Numeric tick marks on the abscissa denote absolute counts contributed by the three models. Across the top-15 CHD phenotypes (89 total false positives), Claude-Opus-4 accounted for 48 errors (54 %), DeepSeek-V3 for 31 errors (35 %) and GPT-4o for 10 errors (11 %). The two most over-called phenotypes—ventricular septal defect (12) and patent ductus arteriosus (10)— were driven largely by Claude-Opus-4 (≈67 % and 50 % of their respective totals). A similar pattern was observed for other common septal anomalies (pulmonary/atrial septal defects, pulmonary atresia), whereas model contributions converged for the least frequent items (e.g. hypoplastic aortic arch, common atrium; each three total errors with roughly equal model shares). Prompt-specific gate-keeping effectiveness The zero-shot prompt we used does more than ask the model to list a gene, disease and CHD phenotype; it also lays out a set of gate-keeping rules that must be satisfied before any extraction is attempted. If the provided paper’s PDF file does not contain any human genetic content. Specifically: No gene symbols or gene full names are mentioned. No references are made to genetic diseases, including CHD. There is a complete absence of gene–disease–phenotype relationships or any molecular biology terminology relevant to CHDgene extraction. As such, the GeneAgent prompt correctly fails this document because it lacks any extractable content within the scope of human genetics or CHD. GeneAgent successfully distinguishes genetics papers from the 26 non-genetics articles by matching against human genetics signatures with 100% gate-keeping effectiveness. True genetics papers repeatedly reference gene symbols (KMT2D, SMAD4), disease, and CHD phenotypes, excluded paper lack of these markers or mentions genes only parenthetically as background. Discussion The emergence of general-purpose LLMs capable of extracting gene–disease–phenotype relationships from unstructured literature presents an inflection point in the curation of human genetics knowledge bases. In this study, we systematically benchmarked three leading commercially available LLMs — GPT-4o, Claude-Opus-4, and DeepSeek-V3 — on a curated corpus of CHD genetics articles, and assessed their capacity to replicate structured outputs across four key fields. Our findings reveal both the immediate potential and the critical limitations of current LLMs in evidence synthesis. Fully automatic evidence synthesis of human gene-disease-phenotype relationship remains a challenge All three LLMs accurately extracted gene symbols and gene names with minimal error. GPT-4o consistently outperformed Claude-Opus-4 and DeepSeek-V3 across all fields. However, accurate disease attribution—particularly fine-grained diagnoses that include inheritance patterns or CHD-specific subtypes — remained challenging for all models, with exact match rates falling below 28.6% ( Figure 4 ) even under relaxed matching criteria. This trend recapitulates findings from earlier phenotype-focused evaluations ( Groza et al., 2024 ) and underscores the limitations of generalist language models when tasked with clinically granular output. Disease and phenotype labels are difficult to extract The disease field remains the most error-prone output. We identify three dominant causes: (1) Granularity mismatch, where models omit diagnostic qualifiers (e.g., “CHARGE syndrome (AD)” simplified to “CHARGE syndrome”); (2) Lack of ontology anchoring, which prevents synonym normalization (e.g., “Heart and brain malformation syndrome(AR)” vs “SMG9-deficiency syndrome (Heart and brain malformation syndrome, OMIM #616920)”); and (3) Bias toward syndromic generalizations, likely rooted in pretraining corpora that over-represent high-level diagnostic labels ( Ji et al., 2023 ). The CHD Phenotype field errors exhibit a different but equally systematic pattern. False positives overwhelmingly target “textbook” phenotypes — ventricular septal defect (VSD), patent ductus arteriosus (PDA), and tetralogy of Fallot — regardless of whether these are emphasized in the source paper. This reflects a long-tail prior bias: the over-representation of certain phenotypes in biomedical corpora skews token probability and inflates common terms under uncertainty ( Hoffman & Kaplan, 2002 ; Groza et al., 2024 ). Conversely, false negatives are concentrated among subtle or anatomically complex malformations — heterotaxy, arch anomalies, valvular defects — often expressed in indirect or diffuse language. Importantly, many phenotype errors arise not from hallucination in the traditional sense, but from intrinsic co-reference drift. As documented in the clinical summarization literature ( Gandhi et al., 2025 ; Huang et al., 2024 ), LLMs tend to elevate secondary findings or umbrella terms when a precise causal link is under-specified in text. Our hallucination audit confirmed that >90% of “incorrect” disease and phenotype outputs were present somewhere in the article but assigned incorrect salience by the model. Truly extrinsic inventions—terms absent from the source text— were exceedingly rare, confirming prior taxonomies of hallucination in biomedical natural language generation ( Li et al., 2025 ). Divergent model behaviour: decoding mechanisms shapes error profiles While GPT-4o achieved the most balanced overall performance, model-specific decoding strategies shaped distinct error modes. Claude-Opus-4 contributed over half of all false-positive phenotypes, a consequence of its higher sampling temperature and permissive top-p settings ( Anthropic Model Card, 2025 ), which favour exhaustive recall over precision. By contrast, GPT-4o, tuned with reward models that prioritize calibration and brevity ( OpenAI, 2025 ), exhibited fewer hallucinations but occasionally under-reported less salient phenotypes. DeepSeek-V3 trailed both models in rare disease resolution, particularly in multi-clause or non-canonical expressions, suggesting that training corpus diversity governs model resilience to linguistic variability in human genetics-related terms. These distinctions matter for future application design. Claude’s aggressive generation may be preferable in workflows where high recall is acceptable, while GPT-4o offers greater accuracy in unsupervised extraction settings. Toward reliable model-assisted curation Our results suggest three practical strategies to bridge the gap between raw LLM output and clinical annotations in human genetics database: Ontology-constrained decoding . Anchoring generation to structured vocabularies such as OMIM, Orphanet, or the Human Phenotype Ontology (HPO) improves lexical normalization and reduces semantic drift ( Groza et al., 2024 ; Farquhar et al., 2024 ). Probability-based confidence gating . Biomedical hallucination detectors that leverage output log-probability variance can effectively flag low-certainty candidates for human review ( Farquhar et al., 2024 ), minimizing curation overhead. Prompt-based role restriction . Explicit instructions to exclude umbrella terms and extract only the primary CHD diagnosis mitigate over-inclusion and co-reference inflation, as also recommended in recent clinical summarization protocols ( Gandhi et al., 2025 ). Together, these strategies do not require fine-tuning and can be deployed in a zero-shot manner. Limitations and future directions Several limitations are worth considering. First, our benchmark corpus comprises only English-language papers; extending evaluation to multilingual corpora is essential in a global context as some genetic papers are written in a language outside English. Second, all experiments were performed in a zero-shot setting, improvement in performance can be expected through few-shot, chain of thought prompting, and RAG (Verspoor, 2024). We have not specifically compared our zero-shot setting to any of these settings. Third, our field set was constrained to four core identities (Symbol, Name, Disease, Phenotype); other clinically relevant variables— variant detail, inheritance mode, population-specific effects—remain unexplored. Incorporating these will be critical for end-to-end database curation pipelines. Conclusion The current generation of LLMs demonstrates strong competence in extracting structured genetic knowledge from full-text biomedical literature. Yet they remain fragile in the case of semantic nuance, co-reference dilemma, and clinical specificity. Rather than rely on model scale alone, future advances should prioritize ontology-guided evidence synthesis and context-sensitive prompting. By addressing these painpoints, we are close to building a truly scalable, trustworthy biomedical AI agent for evidence synthesis in human genetics literatures. Funding This study was supported in part by the AIR@InnoHK initiative of the Innovation and Technology Commission of the Hong Kong Special Administrative Region Government. Author contribution Conceptualization of the study including the methodology was jointly led by JWK Ho and D Yin. DWH Pun, MKS Leung, F Chen and J Kwon contributed to feasibility study and initial prompt development. D Yin developed and LLM prompts and the benchmarking framework. D Yin performed manual literature screening and completion of LLM output cross-checking. X Lin critically evaluated the results and provided feedback. D Yin and JWK Ho wrote and revised the manuscript. Acknowledgements We are grateful to the CHDgene project team members for curating the gold-standard database used in this benchmark. Funder Information Declared AIR@InnoHK References 1. ↵ Anthropic . ( 2025 ). Claude-Opus-4 Model Card (Technical Report v1.1) . 2. ↵ Ayesha , N. , Mujahid , M. , Mirdad , A. R. , Alamri , F. S. , & Khan , A. R . ( 2025 ). Large language model in healthcare for the prediction of genetic variants from unstructured text medicine data using natural language processing. Computers , Materials and Continua , 84 ( 1 ), 1883 – 1899 . doi: 10.32604/cmc.2025.063560 OpenUrl CrossRef 3. Baddour , M. , Paquelet , S. , Rollier , P. , De Tayrac , M. , Dameron , O. , & Labbé , T. ( 2024 ). Phenotypes extraction from text: Analysis and perspective in the LLM era . 2024 IEEE 12th International Conference on Intelligent Systems (IS) , 1 – 8 . doi: 10.1109/IS61756.2024.10705235 OpenUrl CrossRef 4. Bhattarai , K. , Oh , I. Y. , Moran Sierra , J. , et al. ( 2025 ). Leveraging GPT-4 for identifying cancer phenotypes in electronic health records: A performance comparison between GPT-4, GPT-3.5-turbo, Flan-T5, Llama-3-8B and spaCy methods . JAMIA Open , 7 ( 3 ), ooae060 . doi: 10.1093/jamiaopen/ooae060 OpenUrl CrossRef 5. ↵ Chang , J. , Wang , S. , Ling , C. , Qin , Z. , & Zhao , L . ( 2024 ). Gene-associated disease discovery powered by large language models (No . arXiv : 2401 . 09490 ). arXiv. doi: 10.48550/arXiv.2401.09490 OpenUrl CrossRef 6. ↵ Chen , Q. , Hu , Y. , Peng , X. , Xie , Q. , Jin , Q. , Gilson , A. , Singer , M. B. , Ai , X. , Lai , P.-T. , Wang , Z. , Keloth , V. K. , Raja , K. , Huang , J. , He , H. , Lin , F. , Du , J. , Zhang , R. , Zheng , W. J. , Adelman , R. A. , … Xu , H . ( 2025 ). Benchmarking large language models for biomedical natural language processing applications and recommendations . Nature Communications , 16 ( 1 ), 3280 . doi: 10.1038/s41467-025-56989-2 OpenUrl CrossRef PubMed 7. ↵ Chen , Q. , Sun , H. , Liu , H. , Jiang , Y. , Ran , T. , Jin , X. , Xiao , X. , Lin , Z. , Chen , H. , & Niu , Z . ( 2023 ). An extensive benchmark study on biomedical text generation and mining with ChatGPT . Bioinformatics , 39 ( 9 ), btad557 . doi: 10.1093/bioinformatics/btad557 OpenUrl CrossRef 8. ↵ Farquhar , S. , Kossen , J. , Kuhn , L. , Gal , Y. , et al. ( 2024 ). Detecting hallucinations in large language models using semantic entropy . Nature , 630 , 625 – 630 . doi: 10.1038/s41586-024-07421-0 OpenUrl CrossRef PubMed 9. ↵ Gandhi , P. , et al. ( 2025 ). A framework to assess clinical safety and hallucination rates of LLMs for medical text summarisation. npj Digital Medicine , 8 , 58 . https://www.nature.com/articles/s41746-025-01670-7 OpenUrl 10. ↵ Gill , R. , Kelnar , K. , & Duong , D . ( 2024 ). GIX: Gene interaction extraction from biomedical literature using hybrid architectures . In Proceedings of the 2024 ALTA Workshop . https://aclanthology.org/2024.alta-1.10/ 11. ↵ Groza , T. , Caufield , H. , Gration , D. , et al. ( 2024 ). An evaluation of GPT models for phenotype concept recognition . BMC Medical Informatics and Decision Making , 24 , 30 . doi: 10.1186/s12911-024-02439-w OpenUrl CrossRef 12. ↵ Hein , D. , Christie , A. , Holcomb , M. , et al. ( 2025 ). Iterative refinement and goal articulation to optimize large language models for clinical information extraction . npj Digital Medicine , 8 , 301 . doi: 10.1038/s41746-025-01686-z OpenUrl CrossRef PubMed 13. ↵ Hoffman , J. I. , & Kaplan , S . ( 2002 ). The incidence of congenital heart disease . Journal of the American College of Cardiology , 39 , 1890 – 1900 . OpenUrl FREE Full Text 14. ↵ Huang , L. , et al. ( 2024 ). A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions . ACM Transactions on Information Systems , https://arxiv.org/pdf/2311.05232 15. ↵ Jahan , I. , Laskar , M. T. R. , Peng , C. , & Huang , J . ( 2024 ). A comprehensive evaluation of large language models on benchmark biomedical text processing tasks (No . arXiv : 2310 . 04270 ). arXiv. doi: 10.48550/arXiv.2310.04270 OpenUrl CrossRef 16. ↵ Ji , Z. , Lee , N. , Frieske , R. , Yu , T. , Su , D. , Xu , Y. , Ishii , E. , Bang , Y. J. , Madotto , A. , & Fung , P . ( 2023 ). Survey of hallucination in natural language generation . ACM Computing Surveys , 55 ( 12 ), Article 248. doi: 10.1145/3571730 OpenUrl CrossRef 17. ↵ Kim , J. , Yang , J. , Wang , K. , Weng , C. , & Liu , C . ( 2024 ). Assessing the utility of large language models for phenotype-driven gene prioritization in the diagnosis of rare genetic disease . American Journal of Human Genetics , 111 ( 10 ), 2190 – 2202 . doi: 10.1016/j.ajhg.2024.08.010 OpenUrl CrossRef PubMed 18. ↵ Kodikara , N. , & Verspoor , K . ( 2024 ). Evaluating hallucination in multilingual biomedical LLMs . PLOS ONE , 19 ( 5 ), e0303231 . doi: 10.1371/journal.pone.0303231 OpenUrl CrossRef PubMed 19. ↵ Kumar , N. , & Mukhtar , M. S . ( 2025 ). An NLP-based method to mine gene and function relationships from published articles . Scientific Reports , 15 , 7503 . doi: 10.1038/s41598-025-91809-z OpenUrl CrossRef PubMed 20. ↵ Lamurias , A. , Sousa , D. F. , & Couto , F. M. ( 2025 ). Text mining for bioinformatics using biomedical literature . In S. Ranganathan , M. Cannataro , & A. M. Khan (Eds.), Encyclopedia of bioinformatics and computational biology ( 2nd ed., pp. 50 – 61 ). Elsevier . doi: 10.1016/B978-0-323-95502-7.00017-8 OpenUrl CrossRef 21. ↵ Lai , A. , Becker , B. , & Zhou , Z . ( 2025 ). Grounded LLMs for drug–gene–cancer triad extraction using literature snippets . Nature Communications , 16 , 2332 . doi: 10.1038/s41467-024-45563-x OpenUrl CrossRef PubMed 22. ↵ Lee , J. , Yoon , W. , Kim , S. , et al. ( 2020 ). BioBERT: A pre-trained biomedical language representation model for biomedical text mining . Bioinformatics , 36 ( 4 ), 1234 – 1240 . doi: 10.1093/bioinformatics/btz682 OpenUrl CrossRef PubMed 23. ↵ Li , P.-H. , Sun , Y.-Y. , Juan , H.-F. , Chen , C.-Y. , Tsai , H.-K. , & Huang , J.-H . ( 2025 ). A large language model framework for literature-based disease–gene association prediction . Briefings in Bioinformatics , 26 ( 1 ), bbaf070. doi: 10.1093/bib/bbaf070 OpenUrl CrossRef 24. Lin , A. E. , et al. ( 2000 ). Hospital-based prevalence and spectrum of heterotaxy . Journal of Pediatrics , 136 , 735 – 742 . OpenUrl 25. ↵ Neeley , M. , Qi , G. , Wang , G. , Tang , R. , Mao , D. , Liu , C. , & Hu , X . ( 2025 ). Survey and improvement strategies for gene prioritization with large language models (arXiv 2501.18794) . arXiv . doi: 10.48550/arXiv.2501.18794 OpenUrl CrossRef 26. ↵ Poretsky , E. , Blake , V. C. , Andorf , C. M. , & Sen , T. Z . ( 2025 ). Assessing the performance of generative artificial intelligence in retrieving information against manually curated genetic and genomic data . Database , 2025 , baaf011. doi: 10.1093/database/baaf011 OpenUrl CrossRef 27. Poretsky , R. , Bhandari , B. , & Sandhu , K . ( 2025 ). Assessing the performance of generative artificial intelligence in retrieving information against manually curated genetic and genomic data . Database: The Journal of Biological Databases and Curation . doi: 10.1093/database/baae052 OpenUrl CrossRef 28. ↵ Rebholz-Schuhmann , D. , Oellrich , A. , & Hoehndorf , R . ( 2012 ). Text-mining solutions for biomedical research: Enabling integrative biology . Nature Reviews Genetics , 13 , 829 – 839 . doi: 10.1038/nrg3337 OpenUrl CrossRef PubMed 29. Suhardi , R. , Shrestha , A. , & Wickramasuriya , A. ( 2025 ). Large language models for trait extraction from plant genetics literature . arXiv preprint . https://arxiv.org/abs/2412.18154 30. Wei , Z. , Wang , S. , Rong , X. , Liu , X. , & Li , H. ( 2025 ). Shadows in the attention: Contextual perturbation and representation drift in the dynamics of hallucination in LLMs . arXiv preprint arXiv:2505.16894 . https://arxiv.org/abs/2505.16894 31. ↵ Yang , A. , Alankarage , D. , Cuny , H. , Ip , E. K. K. , Almog , M. , Lu , J. , Das , D. , Enriquez , A. , Szot , J. O. , Humphreys , D. T. , Blue , G. M. , Ho , J. W. K. , Winlaw , D. S. , Dunwoodie , S. L. , & Giannoulatou , E . ( 2022 ). CHDgene: A curated database for congenital heart disease genes . Circulation: Genomic and Precision Medicine . 32. ↵ Zheng , H. , Wang , S. , & Huang , L . ( 2024 ). A comprehensive survey on document-level information extraction . In Proceedings of the Workshop on the Future of Event Detection (FuturED) (pp. 58 – 72 ). Association for Computational Linguistics . 33. ↵ OpenAI . ( 2025 ). GPT-4o System Card (Technical Report v0.9) . View the discussion thread. Back to top Previous Next Posted July 31, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Can large language models reliably extract human disease genes from full-text scientific literature? Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Can large language models reliably extract human disease genes from full-text scientific literature? Danqing Yin , Matthew Ka Siu Leung , Darren Wan Ho Pun , Fiona Haixin Chen , Julie Yujin Kwon , Xinyi Lin , Joshua W. K. Ho bioRxiv 2025.07.27.667022; doi: https://doi.org/10.1101/2025.07.27.667022 Share This Article: Copy Citation Tools Can large language models reliably extract human disease genes from full-text scientific literature? Danqing Yin , Matthew Ka Siu Leung , Darren Wan Ho Pun , Fiona Haixin Chen , Julie Yujin Kwon , Xinyi Lin , Joshua W. K. Ho bioRxiv 2025.07.27.667022; doi: https://doi.org/10.1101/2025.07.27.667022 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17697) Bioengineering (13895) Bioinformatics (41951) Biophysics (21456) Cancer Biology (18594) Cell Biology (25520) Clinical Trials (138) Developmental Biology (13381) Ecology (19903) Epidemiology (2067) Evolutionary Biology (24323) Genetics (15612) Genomics (22510) Immunology (17738) Microbiology (40401) Molecular Biology (17184) Neuroscience (88622) Paleontology (667) Pathology (2833) Pharmacology and Toxicology (4825) Physiology (7644) Plant Biology (15158) Scientific Communication and Education (2046) Synthetic Biology (4296) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00