Full text
54,885 characters
· extracted from
preprint-html
· click to expand
Agentic memory-augmented retrieval and evidence grounding for medical question-answering tasks | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Agentic memory-augmented retrieval and evidence grounding for medical question-answering tasks Shuyue Jia , Subhrangshu Bit , Varuna H. Jasodanand , Yi Liu , View ORCID Profile Vijaya B. 
Kolachalama doi: https://doi.org/10.1101/2025.08.06.25333160 Shuyue Jia 1 Department of Electrical and Computer Engineering, Boston University , Boston, MA 02215, USA MPhil Find this author on Google Scholar Find this author on PubMed Search for this author on this site Subhrangshu Bit 2 Department of Computer Science, Boston University , Boston, MA 02215, USA BS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Varuna H. Jasodanand 3 Department of Medicine, Boston University Chobanian & Avedisian School of Medicine, Boston University , Boston, MA 02215, USA BS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yi Liu 4 Faculty of Computing & Data Sciences, Boston University , Boston, MA 02215, USA MSc Find this author on Google Scholar Find this author on PubMed Search for this author on this site Vijaya B. Kolachalama 2 Department of Computer Science, Boston University , Boston, MA 02215, USA 3 Department of Medicine, Boston University Chobanian & Avedisian School of Medicine, Boston University , Boston, MA 02215, USA 4 Faculty of Computing & Data Sciences, Boston University , Boston, MA 02215, USA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Vijaya B. Kolachalama For correspondence: vkola{at}bu.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF ABSTRACT Objective To evaluate if a tool-using agent-based system utilizing large language models (LLMs) for medical question-answering (QA) tasks outperforms standalone LLMs. Methods We developed a unified, open-source LLM-based agentic system that integrates document retrieval, re-ranking, evidence grounding, and diagnosis generation to support dynamic, multi-step medical reasoning. 
Our system features a lightweight retrieval-augmented generation pipeline coupled with a cache-and-prune memory bank, enabling efficient long-context inference beyond standard LLM limits. The system autonomously invokes specialized tools, eliminating the need for manual prompt engineering or brittle multi-stage templates. We compared the agentic system against standalone LLMs on various medical QA benchmarks. Results Evaluated on five well-known medical QA benchmarks, our system outperforms or closely matches state-of-the-art proprietary and open-source medical LLMs in multiple-choice and open-ended formats. Specifically, our system achieved accuracies of 82.98% on USMLE Step 1 and 86.24% on USMLE Step 2, surpassing GPT-4’s 80.67% and 81.67%, respectively, while closely matching on USMLE Step 3 (88.52% vs. 89.78%). Conclusion Our findings highlight the value of combining tool-augmented and evidence-grounded reasoning strategies to build reliable and scalable medical AI systems. INTRODUCTION Large language models (LLMs) are transforming medical research and practice, showing promise in tasks such as medical question answering (QA) and clinical decision support 1 – 4 . However, several challenges continue to limit their reliability and scalability in real-world applications. One major concern is hallucination, which involves the generation of confident yet factually incorrect or ungrounded responses. Another issue is the limited context window of current LLMs, which restricts the amount of information they can process at once, often necessitating retrieval-augmented generation (RAG) pipelines. While RAG improves grounding, it typically incorporates only a subset of relevant evidence, which can introduce bias or lead to incomplete assessments 5 – 7 . Additionally, many diagnostic systems require manually engineered multi-stage prompts 8 – 11 , making them difficult to scale and adapt. 
To improve reliability, recent work has explored continual pretraining on medical corpora 12 – 14 , instruction fine-tuning and reinforcement learning to enhance medical reasoning 12 , 14 – 16 , and RAG frameworks for grounding model outputs in high-quality evidence 5 , 6 , 8 , 9 . Despite this progress, most systems focus on either improving reasoning or grounding, rather than jointly optimizing both. Yet, evidence-based medical practice requires sound diagnostic reasoning and alignment with high-quality clinical evidence 17 . Recent advances in medical reasoning and diagnosis using LLMs have generally progressed along three major directions. The first focuses on continual pretraining of publicly available general-purpose LLMs on domain-specific medical corpora, including textbooks, research articles, and podcast transcripts 12 – 14 , 18 . The second emphasizes instruction tuning or reinforcement learning using medical datasets, which may be manually curated or generated using systems like ChatGPT; these models are fine-tuned through supervised learning or reward feedback to improve chain-of-thought reasoning and emulate realistic doctor-patient interactions 12 , 14 – 16 . Both strategies aim to enhance the medical reasoning skills of general-purpose LLMs, yet despite gains on benchmarks, these models remain vulnerable to hallucinating factually incorrect or unsupported content. A third direction has explored RAG pipelines to address hallucination risks by grounding model outputs in retrieved medical documents 5 , 6 , 8 , 9 , 11 , improving factuality but often prioritizing retrieval without simultaneously optimizing for complex diagnostic reasoning. These observations motivate the need for unified approaches that seamlessly combine robust evidence retrieval with dynamic, multi-step medical reasoning. Medical AI agents leverage the reasoning and language capabilities of LLMs to perform complex clinical tasks, including diagnosis and decision support 19 . 
Recent work on medical AI agents has evolved in three directions. The first focuses on role simulation, where agents emulate clinical roles such as doctors, nurses, and patients in simulated environments; these multi-agent systems aim to model clinical workflows through collaborative interactions and reasoning 20 – 24 . The second centers on visual question answering, where agents are augmented with domain-specific tools, such as segmentation models for identifying salient regions in medical images and optical character recognition systems for processing textual content from clinical documents 25 , 26 . While promising, these approaches often lack explicit mechanisms for diagnostic reasoning or robust integration with large-scale medical knowledge bases. The third involves tool-augmented LLMs, where agents are equipped with capabilities such as document retrieval, function calling, and database access; however, these systems often depend on resource-intensive model retraining or rely on closed-source, paid platforms (e.g., GPT-4) 2 , 27 – 30 , limiting scalability and transparency. Current trends point toward an unmet need for flexible, lightweight, and interpretable frameworks that can dynamically orchestrate evidence gathering, reasoning, and clinical decision-making without prohibitive computational overhead. Our work addresses this emerging need by designing a modular, open, and deployment-friendly system for medical diagnosis support. To address these challenges, we present a unified, agentic system that integrates evidence retrieval, reranking, grounding, and diagnosis generation. Our system uses open-source tools to orchestrate the entire pipeline, from query analysis to final diagnosis, drawing from a comprehensive evidence base that includes PubMed abstracts and full texts, ClinicalTrials.gov entries, the New England Journal of Medicine (NEJM) case reports, medical textbooks, and curated Wikipedia content 5 , 31 – 34 . 
To efficiently manage this information, we adopted a two-stage retrieval process including coarse-grained retrieval followed by fine-grained reranking. To circumvent the limitations of LLM context windows, we introduced a cache-and-prune memory mechanism that retains high-relevance documents across reasoning steps, allowing the system to make informed decisions over extended sequences. Our contributions are summarized as follows: (i) We propose a unified, fully-automated system that integrates document retrieval and reranking, evidence grounding, and diagnosis generation through an open-source AI agent; (ii) We present a tool-augmented LLM-based agentic architecture that enables dynamic multi-step tool use, eliminating the need for manually engineered prompts or multi-stage pipelines; (iii) We introduce a cache-and-prune memory bank mechanism that efficiently extends the retention of relevant documents for evidence grounding, enhancing diagnostic accuracy and computational efficiency. METHODS Our agentic system comprises three core components ( Fig. 1 ): (1) a lightweight RAG pipeline for efficient evidence retrieval and reranking; (2) an open-source LLM-based agent that autonomously orchestrates diagnostic workflows, from retrieval to reasoning, grounding, and diagnosis generation; and (3) a cache-and-prune memory bank that preserves relevant long-context documents to improve evidence use and diagnostic accuracy. Below we provide additional details on these components. Download figure Open in new tab Figure 1. Overview of the agentic system. Our pipeline is powered by an open-source LLM-based agent that operates within a fully automated, dynamic workflow. When presented with either multiple-choice or open-ended medical questions, the agent leverages a suite of specialized tools to generate a structured comparison of answer choices or to synthesize plausible options in open-ended scenarios. 
It then dynamically assesses whether external evidence is needed to answer the question. If no external information is required, the agent proceeds directly to produce a final diagnosis. Otherwise, it initiates a retrieval process, querying a curated knowledge base to obtain the TopK relevant documents and rerank the TopR most informative sources. This evidence pool includes clinical case reports from NEJM, article abstracts from PubMed, full-text articles and textbooks from PubMed Central, clinical trials from ClinicalTrials.gov, and general content from Wikipedia. To manage long-context documents efficiently, the agent employs a cache-and-prune memory bank mechanism. It iteratively reviews B documents in ⌈ R/B ⌉ batches until sufficient information is gathered, ensuring optimal comprehension within the model’s context window. After synthesizing the selected evidence, the agent integrates key insights to deliver a grounded diagnosis. Its performance is further enhanced by an aid kit of five custom-designed tools, detailed in Supplementary Section Designed tools. Lightweight RAG pipeline We implemented a lightweight yet effective RAG pipeline to acquire relevant medical evidence tailored to patient-specific queries. This pipeline consists of two main stages: document retrieval and evidence reranking. In the retrieval stage, we utilized SPECTER , a semantic retriever trained with citation-informed objectives, which improved document-level representation, making it particularly effective in biomedical and scientific domains 5 , 35 . Denoted as ϕ , SPECTER retrieves documents by computing semantic similarity between the query representation x and document embeddings from the evidence corpus V , using L2 distance as the similarity metric: $K = \operatorname{TopK}_{d \in V} \left( -\lVert \phi(x) - \phi(d) \rVert_2 \right)$. As summarized in Table 1 , our evidence corpus includes diverse resources such as research paper abstracts and full texts, medical textbooks, clinical case reports, clinical trials, and curated Wikipedia articles. 
These are drawn from publicly accessible databases such as PubMed, PubMed Central, ClinicalTrials.gov, and Wikipedia. To refine the quality of retrieved TopK evidence, we implemented a reranking stage. Here, a quantized general text embedding model, gte-Qwen2-7B-instruct , denoted as ψ , was used to score and rank the candidate snippets at a finer granularity 36 , 37 . This ensures that the top-ranked documents are semantically aligned with the query and optimally suited for downstream diagnostic reasoning: $R = \operatorname{TopR}_{d \in K} \, \operatorname{sim}\left( \psi(x), \psi(d) \right)$, View this table: View inline View popup Download powerpoint Table 1. Performance evaluation on multiple choice medical QA benchmarks. Accuracy scores across five benchmarks: USMLE Step 1–3, MedQA, and MedExpQA. The table compares our agentic system with proprietary (GPT-4, ChatGPT) and open-source (BioMistral, OpenBioLLM, UltraMedical, PodGPT) language models. Bold and underlined values denote the best and second-best performances for each benchmark, respectively. where K represents the pool of documents retrieved from the six data sources, and R denotes the final ranked subset selected for use by the AI agent. Together, these two stages ensure that only the most relevant, high-quality evidence is forwarded for diagnostic processing. This design mitigates hallucination risks and supports accurate, grounded medical reasoning. Agent for diagnostic workflow We integrated an open-source LLM-based agent π as the core multi-step reasoning engine of our system to enable autonomous and interpretable medical decision-making. This agent orchestrates the entire diagnostic workflow, including document retrieval and reranking, patient query interpretation, evidence grounding, and diagnosis generation. We designed the agent to operate using a set of predefined tools (see Supplementary Section Designed tools), eliminating the need for manually crafted prompts or rigid, hard-coded stages. 
Each tool encapsulated a specific function, such as querying external evidence sources, grounding highly-relevant documents, or synthesizing diagnostic conclusions. This allows the agent to perform complex clinical tasks in a structured and interpretable manner. By leveraging explicit tool usage and structured reasoning, the agent interacted dynamically and efficiently with the RAG pipeline and memory bank, enabling long-context, evidence-based clinical inference. Specifically, in the initial step, given a predefined set of tools T , the patient’s background and medical query Q , and instructions I , the AI agent generates a response sequence y following an autoregressive policy: $\pi(y \mid T, Q, I) = \prod_{t=1}^{|y|} \pi\left( y_t \mid y_{<t}, T, Q, I \right)$, where $y_{<t}$ denotes the previously generated tokens up to time step t − 1. Furthermore, at each step of the multi-step reasoning process, the agent autonomously selects the most appropriate tool to address the current subtask and produces intermediate responses in a multi-turn conversational format. Let C denote the full conversation history. At each step, the agent selects an action a from the action space A . Formally, $a \sim \pi\left( \cdot \mid C, T, I \right)$, with $a \in A$. During execution, each intermediate reasoning step produced by the agent, along with any corresponding tool outputs, is appended to the conversation history C , enabling coherent multi-turn interactions. This modular tool-based design empowers the agent to flexibly respond to a wide range of clinical queries while ensuring transparency, reproducibility, and traceability throughout the diagnostic workflow. A detailed description of each tool’s output parameters is provided in Fig. 4. Unlike traditional prompt engineering approaches, the agent autonomously determines when and how to invoke each tool through multi-step reasoning. This enables transparent, step-by-step justification of clinical decisions grounded in retrieved evidence. 
Importantly, the entire workflow operates locally, preserving patient privacy and minimizing reliance on proprietary APIs or cloud-based infrastructure. Cache-and-prune memory bank mechanism To overcome the context window limitations of LLMs and ensure persistent access to relevant evidence for the final diagnostic response, we implemented a cache-and-prune memory bank mechanism. This memory module functions as an external, dynamically updated storage that retains high-relevance documents retrieved and reranked during earlier stages of the pipeline. As shown in Algorithm 1 , at each reasoning step indexed by i , the AI agent stores the grounded evidence in the memory bank M i . During the final diagnosis generation, the agent accesses M i , enabling long-horizon reasoning across multi-turn interactions. To avoid information overload, we designed a cache-and-prune mechanism that filters out outdated or unused evidence, guided by grounding tool usage patterns: $M_i = \operatorname{Prune}\left( M_{i-1} \cup \hat{R}_i \right)$, Algorithm 1 Agentic memory-augmented retrieval and evidence grounding system Download figure Open in new tab where $\hat{R}_i$ represents the top-ranked documents from each reranked batch R , and Prune(·) is a logistic filtering function that removes documents that are not grounded by the AI agent. The final diagnosis is synthesized by conditioning on the complete conversational context, task, instructions, and the curated memory bank M i : $y_{\text{final}} \sim \pi\left( \cdot \mid C, Q, I, M_i \right)$. Unlike standard RAG pipelines, which statically inject evidence into the prompt and risk truncation, our memory bank enables selective retention of key information and strategic pruning of less relevant content. This design supports broader context integration and sustained reasoning, mitigating fixed-window constraints and ensuring that only the most salient knowledge informs the agent’s output 5 . Implementation details All experiments were conducted locally on a distributed setup with four NVIDIA L40S GPUs, powered by the vLLM inference engine 38 . 
We employed Qwen2.5-72B-Instruct as the primary backbone (i.e., AI agent), with the tensor parallelism and pipeline parallelism settings configured to 4 and 1, respectively. By default, the sampling parameters were set to a temperature of 0 and top_p of 1. To address occasional issues with final answer extraction, we re-evaluated the experiments with a temperature of 0.7 and top_p of 0.8. Due to diminished instruction following capabilities after enabling the static YaRN technique, we set the maximum context window to 32,768 tokens 39 . In practice, however, we observed an effective context window limit of approximately 10,000 tokens. For each multi-turn conversation, we restricted the maximum number of tokens to 8,192. Additionally, we selected the top 3 most relevant evidence documents for the baseline model that operates without tool access. For evidence retrieval, we fixed TopK = 32 per source, resulting in 192 candidate documents from six sources. After reranking, we selected TopR = 32 documents for use by the agent in downstream tasks 5 , 6 . Lastly, the cache-and-prune memory bank operates with a default batch size B = 4 for incremental evidence integration and pruning. EXPERIMENTAL SETTINGS Database for evidence retrieval To ensure grounding in credible and up-to-date medical evidence, we assembled a comprehensive evidence corpus drawn from six trusted sources. They include peer-reviewed articles from PubMed Central, medical textbooks curated from the NLM LitArch Open Access Subset, and registered clinical trials from the National Library of Medicine at the U.S. National Institutes of Health 31 , 32 , 34 . To enhance clinical relevance and provide real-world diagnostic context, we also incorporated clinical case reports published since 2016 in NEJM 33 . We also included two supplementary sources, article abstracts and Wikipedia entries, originally curated by Xiong et al. 5 . 
Section Database for evidence retrieval in the supplementary materials includes a detailed summary and description of each source included in our evidence retrieval database. Benchmark evaluation across question formats To evaluate the performance of our agentic system, we used five widely adopted medical question answering benchmarks: the United States Medical Licensing Examination (USMLE) Step 1, Step 2, and Step 3, and the English subsets of MedQA and MedExpQA 6 , 40 , 41 . These datasets encompass a range of medical knowledge, clinical reasoning, and decision-making skills, and are well-established standards for evaluating LLMs. See Supplementary Section Experimental benchmarks and Table 3 for more details. We ran experiments in two settings to test our approach: (1) multiple-choice QA, where models choose from given answer options, and (2) open-ended QA, where models generate answers without being given choices. We compared the performance of the agent against proprietary and open-source medical LLMs. Proprietary models included OpenAI GPT-4 and GPT-3.5 (i.e., ChatGPT), while the open-source models evaluated were BioMistral (7B), OpenBioLLM (8B/70B), UltraMedical (8B/70B), and PodGPT (70B) 2 , 18 , 42 – 44 . We provided a detailed description of these models and our used prompts in Supplementary Section Backbone large language models and Used prompts. To ensure a fair comparison, we manually ran all open-source models using the vLLM serving engine and applied a consistent zero-shot direct-response prompt. This decision was based on our observation that the performance of some models tended to degrade when presented with more complex instruction prompts. We also set model-specific maximum input lengths and generation token limits to accommodate varying context window sizes. See Supplementary Section Database for evidence retrieval for more details. 
The full source code developed for this study, including all implementation and evaluation scripts, will be made publicly available on GitHub, along with detailed documentation and instructions to facilitate reproducibility. For multiple-choice QA experiments, we activated four core tools within the AI agent: perform_comparison, enable_search, relevance_analysis , and locate_evidence . Accuracy was used as the primary evaluation metric, consistent with standard practices in the field 5 , 6 , 13 , 15 , 45 . In the open-ended QA setting, we removed predefined answer options from the prompts and extended the generate_options tool by building it on top of the same four tools used in the multiple-choice setting. Performance was evaluated by cosine similarity based on two state-of-the-art embedding models: SFR-Embedding-2_R (SFR) from Salesforce Research and gte-Qwen2-7B-instruct (GTE) from Alibaba Group 36 , 46 . We also employed BERTScore’s F1 metric, calculated using Microsoft’s deberta-xlarge-mnli model, to compare the model-generated answer against ground truth 47 . See Supplementary Sections Designed tools and Evaluation models for open-ended question answering for more details. RESULTS AND DISCUSSION Evaluation of multiple-choice benchmarks Our agentic system achieved state-of-the-art performance across multiple-choice medical QA benchmarks, surpassing all evaluated models on USMLE Step 1, Step 2, and MedExpQA ( Table 1 ). Specifically, it achieved 82.98% on Step 1 and 86.24% on Step 2, representing absolute improvements of 2.31 and 4.57 percentage points, respectively, over GPT-4, which is the strongest baseline. On MedExpQA, where GPT-4 was not available, our model outperformed the next-best model (OpenBioLLM 70B at 71.20%) by a relative margin of 7.20%. For USMLE Step 3, our model reached 88.52%, narrowly trailing GPT-4 (89.78%) by only 1.26%. On MedQA, it scored 73.29%, which is 5.58% below GPT-4 but still ahead of all open-source models. 
When compared to the strongest open-source baseline, PodGPT (70B), our model demonstrated consistent and significant gains: 9.58% on Step 1, 13.76% on Step 2, 13.93% on Step 3, 8.25% on MedQA, and 15.20% on MedExpQA. Evaluation of open-ended medical questions Our agentic system achieved the strongest overall performance across all five benchmarks in the open-ended question answering setting, ranking first or second on every metric ( Table 2 ). For semantic textual similarity measured using the SFR model, it achieved the top score on four of five benchmarks, including USMLE Step 1 (0.87), Step 2 (0.85), Step 3 (0.86), and MedExpQA (0.84), while ranking second on MedQA (0.85 vs. 0.86 from OpenBioLLM 70B). When measured by the GTE model, it outperformed all baselines on USMLE Steps 1–3 (0.66, 0.62, and 0.65 respectively), and was second-best on MedQA (0.61) and MedExpQA (0.60). Similarly, our system achieved the highest or second-highest BERTScore on all benchmarks, tying for the highest score on USMLE Step 1 (0.68), Step 2 (0.67) and MedExpQA (0.65), and ranking second on USMLE Step 3 (0.70 vs. 0.71 from OpenBioLLM 70B) and MedQA (0.67 vs. 0.70 from OpenBioLLM 70B). View this table: View inline View popup Download powerpoint Table 2. Performance evaluation on open-ended medical questions. This table reports model performance without answer choices using three embedding-based evaluation metrics: semantic textual similarity scores computed by two state-of-the-art embedding models (SFR and GTE) and BERTScore. Results are shown as mean ± standard deviation across five benchmarks (USMLE Steps 1–3, MedQA, and MedExpQA). Bold indicates the highest score, and underlined indicates the second-highest score for each metric within each benchmark. Analysis of tool usage Tool usage patterns revealed that the agent adapted its strategy to the complexity of each benchmark ( Fig. 2a & Fig. 2b ). 
While perform_comparison remained a consistent first-line tool across all exams, enable_search was used selectively, indicating the agent’s discretion in deciding when external evidence was necessary to resolve clinical uncertainty. The progressively higher use of relevance_analysis and locate_evidence tools from Step 1 to Step 3 underscores the agent’s increasing reliance on iterative evidence appraisal and grounding in more advanced clinical scenarios. This aligns with the expectation that Step 3 questions, which often involve multi-system reasoning or longitudinal management, demand a deeper chain-of-thought and external validation. The wide distribution in the number of calls to these tools further supports the hypothesis that the agent’s behavior is not hardcoded but context-dependent. In particular, questions that required repeated invocations of relevance_analysis and locate_evidence likely reflected either ambiguous clinical presentations or sparse initial document matches, prompting further rounds of evidence screening. Such behavior demonstrates the value of the cache-and-prune memory mechanism, which allowed the agent to incrementally accumulate, filter, and retain salient information while pruning irrelevant context. This architecture enabled scalable reasoning over long contexts without overwhelming the model’s input window, supporting robust performance even in highly iterative diagnostic tasks. Overall, the tool usage patterns validate both the flexibility and compositional reasoning capabilities of the agent in adapting to a diverse range of clinical question formats. Download figure Open in new tab Figure 2. Tool usage statistics across USMLE benchmarks. (a) Bar plot showing the average number of times each tool was invoked per question across the USMLE Step 1, Step 2, and Step 3 benchmarks. Tools include perform_comparison, enable_search, relevance_analysis , and locate_evidence . 
(b) Stacked bar plot indicating the proportion of tool usage frequencies (from 1 to 8 calls) for relevance_analysis and locate_evidence , grouped by USMLE exam. Ablation studies We compared performance with and without tool access to evaluate the impact of incorporating tools into the agentic pipeline. Specifically, we performed evaluation using structured instructions I without tool access (w/o Tools), and using the same instructions with full access to the toolset T (Ours). As shown in Table 3 , tool integration led to performance improvements: 1.07% on USMLE Step 1, 3.67% on USMLE Step 2, and 4.91% on Step 3, with an average gain of 3.22% across all of them. These results underscore the value of equipping the agent with specialized tools. View this table: View inline View popup Download powerpoint Table 3. Impact of core components of the agentic system. Performance comparison of the agentic system with ablated versions lacking key components: tool integration, cache-and-prune memory mechanism, and external evidence search. Values for ablations indicate the relative percentage drop in accuracy compared to the full model across USMLE Step 1, Step 2, and Step 3 benchmarks. To isolate the contribution of individual components, we conducted targeted ablations. Removing the relevance_analysis and locate_evidence tools (denoted w/o Cache & Prune) resulted in an average drop of 2.36%, with performance reductions of 1.07%, 2.75%, and 3.27% on USMLE Steps 1–3, respectively, highlighting the utility of the iterative memory mechanism. When we removed the enable_search tool and the document retrieval and reranking modules ( w/o Evidence Search ), performance dropped by 4.12% on average, with declines of 2.13%, 3.67%, and 6.55% on Steps 1, 2, and 3, respectively, emphasizing the critical role of external evidence in clinical reasoning. We evaluated how the number of documents retrieved and reranked influenced the performance ( Figure 3 ). 
Accuracy generally improved with increasing context length up to TopR = 32, beyond which gains plateaued. For Step 2, performance peaked at TopR = 8 with a 7.80% improvement over GPT-4 and remained stable (5.60% gain) from TopR = 32 onward. Step 1 exhibited a similar trend, with gains peaking at 5.50% at TopR = 4 and plateauing beyond TopR = 8. In contrast, while Step 3 exhibited lower performance relative to GPT-4, its performance fluctuated slightly at lower TopR values and stabilized around −1.40% to −0.50% from TopR = 4 onward. These results highlight the effectiveness of our cache-and-prune memory bank in leveraging extended context efficiently, while also demonstrating the diminishing utility of low-ranked evidence beyond TopR = 32. Download figure Open in new tab Figure 3. Impact of evidence context length. The figure shows the relative performance change on USMLE Step 1, Step 2, and Step 3 benchmarks as a function of the number of top reranked documents (TopR) processed by the agentic system. Each point represents the performance difference relative to GPT-4. Different line styles and colors indicate the benchmark type. The y-axis shows the relative difference in accuracy, and the x-axis denotes the number of retrieved documents. Limitations and future work Despite the strong performance of our agentic system, some limitations highlight important directions for future research. First, while our system is designed as a general-purpose medical QA agent, its toolset may require domain-specific customization to handle specialized tasks, such as rare disease diagnosis or surgical decision-making. Incorporating adaptive or plug-and-play tools tailored to niche clinical domains could expand its applicability. Second, the sequential execution of tools, particularly for evidence retrieval and analysis, can introduce latency and limit scalability in real-time or high-throughput settings.
Future work will explore parallelized tool execution, caching strategies across sessions, and learned policies for tool invocation to improve computational efficiency. Third, while our evaluation covered a range of benchmarks, real-world clinical scenarios often involve ambiguous, noisy or incomplete data. Expanding evaluations to include complex settings such as NEJM clinicopathological conferences, longitudinal case reports, or multimodal inputs will be important to assess robustness in high-stakes use cases 1 , 48 . Looking ahead, we envision broader societal impacts of our work in democratizing medical expertise through accessible, open-source AI systems. However, these benefits must be pursued alongside safeguards for transparency, accountability, and patient safety. As tool-based agents become more capable, interdisciplinary collaboration between clinicians, ethicists, and technologists will be important to ensure their responsible integration into clinical workflows. CONCLUSION Our study advances the application of LLMs in medicine through an open-source agent that orchestrates retrieval, evidence refinement, and reasoned diagnosis in a cohesive workflow. By leveraging adaptive tools for on-demand processing and a memory system that selectively preserves critical insights, we enable more fluid and context-aware analysis, sidestepping common pitfalls like fixed prompts and truncated inputs. Our evaluations on diverse QA datasets reveal consistent advantages over established baselines, with marked improvements in precision for foundational and applied medical knowledge. Overall, this framework enhances diagnostic fidelity, accessibility, and adaptability through tool-based reasoning, laying a foundation for reliable, scalable AI that aligns with evolving healthcare demands and enables patient-centered innovations. Data Availability All data produced in the present study are available upon reasonable request to the authors. COMPETING INTERESTS V.B.K. 
is a co-founder and equity holder of deepPath, Inc. and CogniScreen, Inc. He also serves on the scientific advisory board of Altoida Inc. The remaining authors declare no competing interests. DATA AND CODE AVAILABILITY The clinical case data from NEJM used in this study are not publicly available and can be obtained under an exclusive licensing agreement with the NEJM Group. All other datasets used in this work, sourced from publicly accessible platforms such as PubMed Central, ClinicalTrials.gov, and the National Library of Medicine, will be released via Hugging Face ( https://huggingface.co/vkola-lab ) under a CC-BY-NC-ND license. The full source code developed for this study, including all implementation and evaluation scripts, will be made publicly available on GitHub ( https://github.com/vkola-lab ), along with detailed documentation and instructions to facilitate reproducibility. ACKNOWLEDGMENTS This project was supported by grants from the National Institute on Aging’s Artificial Intelligence and Technology Collaboratories (P30-AG073104 & P30-AG073105), and the National Institutes of Health (R01-NS142076, R01-HL159620, R01-AG062109, and R01-AG083735). REFERENCES 1. ↵ McDuff D , Schaekermann M , Tu T , Palepu A , Wang A , Garrison J , et al. Towards accurate differential diagnosis with large language models . Nature . 2025 : 1 – 7 . 2. ↵ Nori H , King N , McKinney SM , Carignan D , Horvitz E. Capabilities of GPT-4 on medical challenge problems . arXiv preprint arXiv:230313375. 2023 . 3. Hager P , Jungmann F , Holland R , Bhagat K , Hubrecht I , Knauer M , et al. Evaluation and mitigation of the limitations of large language models in clinical decision-making . Nature Medicine . 2024 ; 30 ( 9 ): 2613 – 22 . OpenUrl CrossRef PubMed 4. ↵ Sandmann S , Hegselmann S , Fujarski M , Bickmann L , Wild B , Eils R , et al. Benchmark evaluation of DeepSeek large language models in clinical decision-making . Nature Medicine . 2025 . 5. 
↵ Xiong G , Jin Q , Lu Z , Zhang A. Benchmarking Retrieval-Augmented Generation for Medicine . In: Ku L , Martins A , Srikumar V , editors. Findings of the Association for Computational Linguistics, ACL 2024, Bangkok, Thailand and virtual meeting, August 11-16, 2024 . Association for Computational Linguistics ; 2024 . p. 6233 – 51 . Available from : doi: 10.18653/v1/2024.findings-acl.372 . OpenUrl CrossRef 6. ↵ Alonso I , Oronoz M , Agerri R . MedExpQA: Multilingual benchmarking of Large Language Models for Medical Question Answering . Artificial Intelligence in Medicine . 2024 ; 155 : 102938 . Available from : doi: 10.1016/j.artmed.2024.102938 . OpenUrl CrossRef PubMed 7. ↵ Yang R , Ning Y , Keppo E , Liu M , Hong C , Bitterman DS , et al. Retrieval-augmented generation for generative artificial intelligence in health care . npj Health Systems . 2025 ; 2 ( 1 ): 2 . OpenUrl 8. ↵ Jeong M , Sohn J , Sung M , Kang J . Improving medical reasoning through retrieval and self-reflection with retrieval-augmented large language models . Bioinformatics . 2024 ; 40 ( Supplement_1 ): i119 – 29 . Available from : doi: 10.1093/bioinformatics/btae238 . OpenUrl CrossRef PubMed 9. ↵ Xiong G , Jin Q , Wang X , Zhang M , Lu Z , Zhang A. Improving retrieval-augmented generation in medicine with iterative follow-up questions. In: Biocomputing 2025: Proceedings of the Pacific Symposium . World Scientific ; 2024 . p. 199 – 214 . 10. Alzghoul R , Ayaabdelhaq A , Tabaza A , Altamimi A. CLD-MEC at MEDIQA-CORR 2024 Task: GPT-4 Multi-Stage Clinical Chain of Thought Prompting for Medical Errors Detection and Correction . In: Naumann T , Abacha AB , Bethard S , Roberts K , Bitterman DS , editors. Proceedings of the 6th Clinical Natural Language Processing Workshop, ClinicalNLP@NAACL 2024 , Mexico City, Mexico , June 21, 2024. Association for Computational Linguistics ; 2024 . p. 537 – 56 .
Available from : doi: 10.18653/v1/2024.clinicalnlp-1.52 . OpenUrl CrossRef 11. ↵ Chen Y , Sun P , Li X , Chu X . MRD-RAG: Enhancing Medical Diagnosis with Multi-Round Retrieval-Augmented Generation . arXiv preprint arXiv:250407724. 2025 . 12. ↵ Wu C , Lin W , Zhang X , Zhang Y , Xie W , Wang Y . PMC-LLaMA: Toward building open-source language models for medicine . Journal of the American Medical Informatics Association . 2024 ; 31 ( 9 ): 1833 – 43 . Available from : doi: 10.1093/jamia/ocae045 . OpenUrl CrossRef PubMed 13. ↵ Wang X , Chen N , Chen J , Hu Y , Wang Y , Wu X , et al. Apollo: An Lightweight Multilingual Medical LLM towards Democratizing Medical AI to 6B People . arXiv preprint arXiv:240303640. 2024 . Available from : doi: 10.48550/arXiv.2403.03640 . OpenUrl CrossRef 14. ↵ Qiu P , Wu C , Zhang X , Lin W , Wang H , Zhang Y , et al. Towards building multilingual language model for medicine . Nature Communications . 2024 ; 15 ( 1 ): 8384 . OpenUrl PubMed 15. ↵ Zhang H , Chen J , Jiang F , Yu F , Chen Z , Chen G , et al. HuatuoGPT, Towards Taming Language Model to Be a Doctor . In: Bouamor H , Pino J , Bali K , editors. Findings of the Association for Computational Linguistics: EMNLP 2023 , Singapore , December 6-10, 2023. Association for Computational Linguistics ; 2023 . p. 10859 – 85 . Available from : doi: 10.18653/v1/2023.findings-emnlp.725 . OpenUrl CrossRef 16. ↵ Chen J , Cai Z , Ji K , Wang X , Liu W , Wang R , et al. HuatuoGPT-o1, Towards Medical Complex Reasoning with LLMs . arXiv preprint arXiv:241218925. 2024 . Available from : doi: 10.48550/arXiv.2412.18925 . OpenUrl CrossRef 17. ↵ Subbiah V . The next generation of evidence-based medicine . Nature Medicine . 2023 ; 29 ( 1 ): 49 – 58 . OpenUrl CrossRef PubMed 18. ↵ Jia S , Bit S , Searls E , Lauber MV , Claus LA , Fan P , et al. PodGPT: An audio-augmented large language model for research and education . npj Biomedical Innovations . 2025 . 19.
↵ Luo J , Zhang W , Yuan Y , Zhao Y , Yang J , Gu Y , et al. Large Language Model Agent: A Survey on Methodology, Applications and Challenges . arXiv preprint arXiv:250321460. 2025 . Available from : doi: 10.48550/arXiv.2503.21460 . OpenUrl CrossRef 20. ↵ Yu H , Zhou J , Li L , Chen S , Gallifant J , Shi A , et al. AIPatient: Simulating Patients with EHRs and LLM Powered Agentic Workflow . arXiv preprint arXiv:240918924. 2024 . Available from : doi: 10.48550/arXiv.2409.18924 . OpenUrl CrossRef 21. Li J , Wang S , Zhang M , Li W , Lai Y , Kang X , et al. Agent Hospital: A Simulacrum of Hospital with Evolvable Medical Agents . arXiv preprint arXiv:240502957. 2024 . Available from : doi: 10.48550/arXiv.2405.02957 . OpenUrl CrossRef 22. Yan W , Liu H , Wu T , Chen Q , Wang W , Chai H , et al. ClinicalLab: Aligning Agents for Multi-Departmental Clinical Diagnostics in the Real World . arXiv preprint arXiv:240613890. 2024 . Available from : doi: 10.48550/arXiv.2406.13890 . OpenUrl CrossRef 23. Almansoori MK , Kumar K , Cholakkal H . Self-Evolving Multi-Agent Simulations for Realistic Clinical Interactions . arXiv preprint arXiv:250322678. 2025 . Available from : doi: 10.48550/arXiv.2503.22678 . OpenUrl CrossRef 24. ↵ Li H , Pan W , Rajendran S , Zang C , Wang F. TrialGenie: Empowering Clinical Trial Design with Agentic Intelligence and Real World Data . medRxiv . 2025 . Available from: https://www.medrxiv.org/content/early/2025/04/20/2025.04.17.25326033 . 25. ↵ Fallahpour A , Ma J , Munim A , Lyu H , Wang B . MedRAX: Medical Reasoning Agent for Chest X-ray . arXiv preprint arXiv:250202673. 2025 . Available from : doi: 10.48550/arXiv.2502.02673 . OpenUrl CrossRef 26. ↵ Sharma N . CXR-Agent: Vision-language models for chest X-ray interpretation with uncertainty aware radiology reporting . arXiv preprint arXiv:240708811. 2024 . Available from : doi: 10.48550/arXiv.2407.08811 . OpenUrl CrossRef 27. ↵ Gao S , Zhu R , Kong Z , Noori A , Su X , Ginder C , et al.
TxAgent: An AI Agent for Therapeutic Reasoning Across a Universe of Tools . arXiv preprint arXiv:250310970. 2025 . Available from : doi: 10.48550/arXiv.2503.10970 . OpenUrl CrossRef 28. Lu P , Chen B , Liu S , Thapa R , Boen J , Zou J . OctoTools: An Agentic Framework with Extensible Tools for Complex Reasoning . arXiv preprint arXiv:250211271. 2025 . Available from : doi: 10.48550/arXiv.2502.11271 . OpenUrl CrossRef 29. Liao Y , Jiang S , Wang Y , Wang Y . ReflecTool: Towards Reflection-Aware Tool-Augmented Clinical Agents . arXiv preprint arXiv:241017657. 2024 . Available from : doi: 10.48550/arXiv.2410.17657 . OpenUrl CrossRef 30. ↵ Goodell AJ , Chu SN , Rouholiman D , Chu LF. Large language model agents can use tools to perform clinical calculations . npj Digital Medicine . 2025 ; 8 ( 1 ). Available from : doi: 10.1038/s41746-025-01475-8 . OpenUrl CrossRef 31. ↵ National Library of Medicine (US) . PubMed Central . 2024 . National Center for Biotechnology Information, U.S. National Library of Medicine . Available from: https://www.ncbi.nlm.nih.gov/pmc/ . 32. ↵ Gillen JE , Tse T , Ide NC , McCray AT. Design, Implementation and Management of a Web-Based Data Entry System for ClinicalTrials.gov . In: Fieschi M , Coiera EW , Li YJ , editors. MEDINFO 2004 - Proceedings of the 11th World Congress on Medical Informatics , San Francisco, California, USA , September 7-11, 2004 . vol. 107 of Studies in Health Technology and Informatics. IOS Press ; 2004. p. 1466 – 70 . Available from : doi: 10.3233/978-1-60750-949-3-1466 . OpenUrl CrossRef 33. ↵ Campion EW , Scott L , Graham A , Prince JM , Morrissey S , Drazen JM. NEJM.org — 20 Years on the Web . New England Journal of Medicine . 2016 ; 375 ( 10 ): 993 – 4 . Available from: https://www.nejm.org/doi/full/10.1056/NEJMe1610607 . OpenUrl PubMed 34. ↵ National Center for Biotechnology Information (US) . About Bookshelf [Internet] .
Bethesda (MD) : National Center for Biotechnology Information (US) ; 2010 . NLM LitArch Open Access Subset. https://www.ncbi.nlm.nih.gov/books/about/openaccess/ . 35. ↵ Cohan A , Feldman S , Beltagy I , Downey D , Weld DS. SPECTER: Document-level Representation Learning using Citation-informed Transformers . In: Jurafsky D , Chai J , Schluter N , Tetreault JR , editors. Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, ACL 2020 , Online, July 5-10, 2020. Association for Computational Linguistics ; 2020. p. 2270 – 82 . Available from : doi: 10.18653/v1/2020.acl-main.207 . OpenUrl CrossRef 36. ↵ Li Z , Zhang X , Zhang Y , Long D , Xie P , Zhang M . Towards General Text Embeddings with Multi-stage Contrastive Learning . arXiv preprint arXiv:230803281. 2023 . Available from : doi: 10.48550/arXiv.2308.03281 . OpenUrl CrossRef 37. ↵ Bitsandbytes Development Team . Accessible large language models via k-bit quantization for PyTorch ; 2024 . GitHub repository . Available from: https://github.com/bitsandbytes-foundation/bitsandbytes . 38. ↵ Kwon W , Li Z , Zhuang S , Sheng Y , Zheng L , Yu CH , et al. Efficient Memory Management for Large Language Model Serving with PagedAttention . In: Flinn J , Seltzer MI , Druschel P , Kaufmann A , Mace J , editors. Proceedings of the 29th Symposium on Operating Systems Principles, SOSP 2023 , Koblenz, Germany , October 23-26, 2023. Association for Computing Machinery ; 2023. p. 611 – 26 . Available from : doi: 10.1145/3600006.3613165 . OpenUrl CrossRef 39. ↵ Peng B , Quesnelle J , Fan H , Shippole E. YaRN: Efficient Context Window Extension of Large Language Models. In: The Twelfth International Conference on Learning Representations, ICLR 2024 , Vienna, Austria , May 7-11, 2024. International Conference on Learning Representations ; 2024.
Available from: https://openreview.net/forum?id=wHBfxhZu1u . 40. ↵ Kung TH , Cheatham M , Medenilla A , Sillos C , De Leon L , Elepaño C , et al. Performance of ChatGPT on USMLE: Potential for AI-assisted medical education using large language models . PLOS Digital Health . 2023 ; 2 ( 2 ): e0000198 . OpenUrl 41. ↵ Jin D , Pan E , Oufattole N , Weng WH , Fang H , Szolovits P . What Disease Does This Patient Have? A Large-Scale Open Domain Question Answering Dataset from Medical Exams . Applied Sciences . 2021 ; 11 ( 14 ). Available from: https://www.mdpi.com/2076-3417/11/14/6421 . 42. ↵ Labrak Y , Bazoge A , Morin E , Gourraud P , Rouvier M , Dufour R. BioMistral: A Collection of Open-Source Pretrained Large Language Models for Medical Domains . In: Ku L , Martins A , Srikumar V , editors. Findings of the Association for Computational Linguistics, ACL 2024 , Bangkok, Thailand and virtual meeting, August 11-16, 2024. Association for Computational Linguistics; 2024. p. 5848 – 64 . Available from : doi: 10.18653/v1/2024.findings-acl.348 . OpenUrl 43. Meta AI. How Llama is helping Saama deliver new possibilities in personalized medicine and data-driven care ; 2025 . https://ai.meta.com/blog/saama-data-driven-care-built-with-llama . 44. ↵ Zhang K , Zeng S , Hua E , Ding N , Chen Z , Ma Z , et al. UltraMedical: Building Specialized Generalists in Biomedicine . In: Globersons A , Mackey L , Belgrave D , Fan A , Paquet U , Tomczak JM , et al. , editors. Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024 , NeurIPS 2024, Vancouver, BC, Canada , December 10 - 15, 2024; 2024. Available from: http://papers.nips.cc/paper_files/paper/2024/hash/2dfc26ce9039f00eee4aba0c54931e46-Abstract-Datasets_and_Benchmarks_Track.html . 45. ↵ Gao Y , Dligach D , Miller TA , Caskey JR , Sharma B , Churpek MM , et al.
DR.BENCH: Diagnostic Reasoning Benchmark for Clinical Natural Language Processing . Journal of Biomedical Informatics . 2023 ; 138 : 104286 . Available from : doi: 10.1016/j.jbi.2023.104286 . OpenUrl CrossRef 46. ↵ Meng R , Liu Y , Joty SR , Xiong C , Zhou Y , Yavuz S. SFR-Embedding-2: Advanced Text Embedding with Multi-stage Training ; 2024 . 47. ↵ Zhang T , Kishore V , Wu F , Weinberger KQ , Artzi Y. BERTScore: Evaluating Text Generation with BERT . In: The Eighth International Conference on Learning Representations, ICLR 2020 , Addis Ababa, Ethiopia , April 26-30, 2020. International Conference on Learning Representations ; 2020. Available from: https://openreview.net/forum?id=SkeHuCVFDr . 48. ↵ Eriksen AV , Möller S , Ryg J . Use of GPT-4 to diagnose complex clinical cases . NEJM AI . 2024 ; 1 ( 1 ): AIp2300031 . OpenUrl View the discussion thread. Back to top Previous Next Posted August 08, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Agentic memory-augmented retrieval and evidence grounding for medical question-answering tasks Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Agentic memory-augmented retrieval and evidence grounding for medical question-answering tasks Shuyue Jia , Subhrangshu Bit , Varuna H. Jasodanand , Yi Liu , Vijaya B.
Kolachalama medRxiv 2025.08.06.25333160; doi: https://doi.org/10.1101/2025.08.06.25333160 Share This Article: Copy Citation Tools Agentic memory-augmented retrieval and evidence grounding for medical question-answering tasks Shuyue Jia , Subhrangshu Bit , Varuna H. Jasodanand , Yi Liu , Vijaya B. Kolachalama medRxiv 2025.08.06.25333160; doi: https://doi.org/10.1101/2025.08.06.25333160 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (569) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1511) Epidemiology (15230) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6610) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15923) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6607) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3338) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary Care Research (712) Psychiatry and Clinical Psychology (5448) Public and Global Health (9239) Radiology and Imaging (2202) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) 
// Inline page-script residue captured by the HTML-to-text extraction (not article content).
// Self-invoking function that does nothing unless document.body already exists. It then:
//   1. creates a 1x1, absolutely positioned (top/left 0), borderless, visibility:hidden iframe
//      and appends it to document.body;
//   2. runs c() as soon as the document is no longer in the 'loading' state: immediately if
//      already past it, otherwise on DOMContentLoaded, or (when addEventListener is missing)
//      through document.onreadystatechange, calling any previously installed handler first and
//      restoring it before c() runs;
//   3. c() resolves the iframe's document and, if available, appends a <script> to its <head>
//      whose text sets window.__CF$cv$params and loads
//      '/cdn-cgi/challenge-platform/scripts/jsd/main.js' inside the iframe.
// NOTE(review): the /cdn-cgi/ path and __CF$ prefix suggest a Cloudflare challenge loader; the
// r/t parameter values look request-specific. Confirm before relying on them.
(function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a01ec9ad6a3fdf94',t:'MTc3OTgxOTU2Mw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.