Full text
39,432 characters
· extracted from
preprint-html
· click to expand
Accelerating Insight Discovery in Large Biomedical Text with Scalable Processing Framework | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Accelerating Insight Discovery in Large Biomedical Text with Scalable Processing Framework Dongeun Kim , Megan Hauptman , Matthew T. Patrick doi: https://doi.org/10.1101/2025.08.14.670384 Dongeun Kim 1 College of Literature, Science, and the Arts, University of Michigan , Ann Arbor, Michigan, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Megan Hauptman 2 Department of Dermatology, University of Michigan Medical School , Ann Arbor, Michigan, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Matthew T. Patrick 2 Department of Dermatology, University of Michigan Medical School , Ann Arbor, Michigan, USA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: mattpat{at}med.umich.edu Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Large language models are increasingly being used by dermatology professionals to support diagnostic investigation, patient education, and medical research. While these models can help manage information overload and improve efficiency, concerns persist regarding their accuracy and potential reliance on dubious sources. We introduce Quanta, a hybrid system that combines large language models with established evaluation metrics, such as cosine similarity, to enable efficient summarization and interpretation of curated research corpora. This methodology ensures that synthesized insights remain domain-specific and contextually relevant, thereby supporting clinicians and researchers in navigating the expanding digital landscape of dermatology literature. Deployed within an interactive chatbot, the tool delivers direct answers to user queries, provides cross-publication insights, and can suggest new directions for research. Comparative evaluations on benchmark datasets demonstrate improvements in accuracy, efficiency, and computational cost, with the curated document approach enhancing reliability and reducing misinformation risk. Introduction Large language models (LLMs) are a form of deep neural network, which harness advanced computational techniques and massive training data, to provide meaningful output in response to textual queries ( Kasneci et al., 2023 ). LLM usage has increased dramatically in recent years. A survey from the American Academy of Dermatology found more than half of dermatologists use LLMs every week ( Gui et al., 2024 ), with the most popular application being to investigate diagnoses and medical management. Further uses include preparing patient handouts and assisting in dermatology research ( Jin and Dobry, 2023 ). LLMs have the potential to enable dermatology researchers and clinicians to keep up to date with a rapidly increasing body of knowledge. They address information overload and facilitate greater efficiencies in work, by handling laborious and time-consuming tasks, particularly those that involve the extraction of meaningful insights from massive text corpora. However, it is important to consider these benefits in the context of any introduced risks. The vast range of data on which LLMs are trained can make it possible for irrelevant or disreputable sources to affect the results ( De Cassai and Dost, 2024 , Lechien et al., 2024 ). Furthermore, the influence of these sources can be difficult to perceive, because of the complicated and unclear ways in which LLMs select and summarize information, often combining multiple documents together. Many dermatologists are concerned about the accuracy of LLMs ( Gui et al., 2024 ), particularly since mistakes could have a negative impact on patients’ health. A study into using an LLM for novel idea generation in research also found the feasibility of these ideas (evaluated by expert review) was lower than those identified by humans ( Si et al., 2024 ). Recognizing that commonly used LLMs may include unreliable or irrelevant sources, users benefit from providing a specific set of curated and trusted literature on which to ask their questions ( Ben Abacha and Demner-Fushman, 2019 ). However, even powerful language models, such as GPT-4 and other currently available tools, are limited in the number and/or size of files that can be analyzed ( Wang et al., 2024 ). The need for effective methods to process and analyze vast amounts of textual information is becoming increasingly important, as digital resources grow in number and accessibility. We aim to address these problems by introducing a new tool (Quanta), which can efficiently process and synthesize relevant information from large-scale textual data. To extract meaningful insights from heterogeneous textual data, researchers have explored various methods, including traditional lexical similarity measures such as Cosine Similarity, Jaccard Similarity, and Term Frequency - Inverse Document Frequency (TF-IDF) ( Rinjeni et al., 2024 ). Cosine similarity is particularly effective for tasks such as finding related information (semantic search), clustering similar items, and retrieving relevant documents. It takes advantage of multiple distinct semantic dimensions to capture different aspects of meaning and, as a result, cosine similarity is well-suited for analyzing text in a way that preserves and reveals the underlying meaning of the output. However, traditional similarity approaches struggle to fully capture semantic meaning, particularly in more contextually rich documents, as they lack mechanisms for modeling word order and context. So, more advanced deep learning models have been proposed, including Bidirectional Encoder Representations from Transformers (BERT) (Gorenstein et al., 2024). These approaches take advantage of multiple layers of abstraction to learn numerical (vector) representations of each text, embedding their semantic meaning in a way that can be effectively used for computation. Yet, deep learning-based methods can be computationally expensive, particularly when handling vast datasets. To address these limitations, this research introduces a hybrid methodology that combines conventional similarity measures (e.g. cosine similarity) with transformer-based semantic models such as BERT. A key innovation of our approach is the integration of LangChain 1 to optimize query decomposition, dynamic prompt generation, and contextual refinement. By abstracting much of the implementation complexity, LangChain enables seamless integration of recent language technologies without requiring extensive low-level programming. Specifically, it supports the transformation of raw text into numerical representations suitable for semantic search, manages retrieval operations from the database of vector embeddings, and ensures that the input provided to the language model follows a consistent and structured format. This foundation contributes to a modular and scalable system design, allowing for consistently accurate and context-aware responses across diverse clinical and biomedical text sources. By allowing dermatologists and dermatology researchers to input their own carefully curated and pertinent set of literature into the system, we ensure that the synthesized information remains domain-specific and contextually relevant, while delegating the time-consuming tasks of document review to the LLM. Furthermore, we leverage ChromaDB for efficient embedding storage and fast nearest-neighbor retrieval, especially considering the usage of LangChain, which natively supports ChromaDB for efficient storage of the vector database. This integration simplifies retrieval workflows by enabling seamless coordination between embedding generation, metadata filtering, and document ranking, without the need for manual configuration or complex indexing. Results Overview of Approach We propose an integrated pipeline built upon recent technologies, as illustrated in ( Figure 1 ) . Download figure Open in new tab Figure 1. Quanta Architecture Three-stage architecture designed to enable intelligent querying over user-uploaded documents. Consists of the Preprocess Pipeline, Query Pipeline, and Response Pipeline, integrating BERT embeddings, ChromaDB, and OpenAI’s GPT-4 through LangChain. The proposed architecture is designed to enable semantic search and response generation over heterogeneous document formats using a modular, retrieval-augmented generation (RAG) framework. The system is composed of three main pipelines, which interact via shared vector representations and LangChain integration: Preprocess Pipeline, Query Pipeline, and Response Pipeline. Preprocess Pipeline The preprocess pipeline is responsible for preparing the uploaded documents for semantic indexing. Users can upload documents in various formats (DOCX, PDF, TXT), which are parsed using appropriate loaders (DocxLoader, PDFLoader, TextLoader, etc.). Then, the text is preprocessed to ensure consistency and reduce noise, by applying a series of normalization operations. These include conversion to lowercase, removal of stop words and line breaks, as well as normalization of spacing. The processed texts are then divided into semantically manageable chunks. Each chunk is embedded using a BERT-based encoder to produce fixed-size vector representations. The resulting embeddings are stored in ChromaDB, a vector database optimized for fast similarity search when used with LangChain. Query Pipeline The query pipeline processes incoming user questions and retrieves the most semantically relevant document chunks by applying content relevance filtering. This is achieved by embedding user queries using the BERT model, to ensure consistency in semantic space with the BERT-embedded document chunks. These embeddings, from user queries and documents, are then used to compute cosine similarity, and the top k (5 by default, but can be adjusted) most similar chunks are selected as context. Response Pipeline The response pipeline formulates a natural language response based on retrieved document content. A system prompt ( Supplementary Note 1 ) is formatted using LangChain’s PromptTemplate functionality and provided to OpenAI’s GPT-4 language model 2 , along with the selected chunks and original query, to ensure the response is well-structured, coherent and context-aware. This is possible as it only focuses on the most relevant contents, which is filtered and selected during the query pipeline. The output of each query is displayed via the Quanta interface, which was created using Streamlit 3 ( Figure 2 ) . LangChain plays a central role in helping us to simplify implementation complexities, as follows: (1) Embedding Integration: LangChain supports wrapper classes for generating embeddings (e.g., via Hugging Face), (2) Retrieval Abstraction: LangChain simplifies ChromaDB access, eliminating the need for direct application programming interface (API) interactions, (3) Prompt Standardization: The PromptTemplate module ensures consistency and quality in the prompts passed to the LLM. Download figure Open in new tab Figure 2. Interface for Quanta Assistant Embedding Model Selection and Sensitivity Analysis We evaluated different configurations for semantic search and information retrieval, using the Benchmarking Information Retrieval (BEIR) framework 4 . It comprises 19 datasets for retrieval tasks across a wide range of domains, including biomedical, finance, and open-domain question answering. They each have comprehensive “ground-truth” labeling that facilitates the reliable comparison of embedding methodologies. Using 5 different embedding models (see Materials and Methods ), Figure 3 presents our comparison of the Euclidean L2 (blue) and cosine similarity (orange) approach for semantic search and information retrieval performance. The cosine similarity approach consistently outperformed L2 across all metrics ( Table 3 ). When differences were present, the cosine approach achieved approximately 48.7% to 76.9% higher mean scores for Normalized Discounted Cumulative Gain (NDCG), around 7.1% higher for F1-score, roughly 33.3% higher for Recall@5, and between 20% and 100% higher for Mean Reciprocal Rank (MRR). It also demonstrated shorter average processing time, decreasing the duration by 95.6% compared to the L2 approach on average across various embedding models. This performance advantage stems from cosine similarity’s ability to capture semantic relevance more effectively by emphasizing the direction of embeddings rather than their magnitude, making it better suited for high-dimensional spaces where vector orientation encodes core semantic meaning. Download figure Open in new tab Figure 3. Comparative performance of cosine vs. L2 similarity across embedding models. Retrieval performance of various embedding models (MiniLM, MPNet, DistilUSE, NLI-BERT, SciBERT) when evaluated using five key metrics: F1-score, NDCG, MRR, Recall@5, and computational time. Of the embedding models we evaluated, NLI-BERT performed as well as or better than all other models across all metrics ( Figure 3 ), with highest mean scores for NDCG, MRR, and shorter average processing time, suggesting it to be an optimal embedding model for the Quanta retrieval pipeline. Figure 4 presents a heatmap of standard deviations for the retrieval metrics scores of each embedding model. MiniLM had the lowest average standard deviation at 0.342. However, this is just 4.8% lower than the 3rd best model, NLI-BERT (0.358), indicating only a slight difference in retrieval consistency. Given this minimal variance gap, the combination of consistent high mean scores ( Figure 3 ) and low variance makes NLI-BERT a strong candidate for robust semantic search at large-scale. Download figure Open in new tab Figure 4. Heatmap of standard deviations across metrics for each embedding model. Visualizes the stability of each tested embedding model by showing the standard deviation across retrieval metrics scores. Lower standard deviation values indicate greater consistency, shown by lighter opacity of yellow. Validating Integration of LangChain We then tested whether integrating LangChain into our vector-based retrieval pipeline could enhance semantic matching performance. This was evaluated in terms of the relevance of retrieved answers using BEIR, compared to raw vector similarity alone ( Tables 1 and 2 ) . For each metric, we conducted paired t-tests to assess the distribution of differences across query-level scores. The paired t-test results showed statistically significant improvements for LangChain, validating our hypothesis with p-values of 0.01302 (F1), 0.00920 (NDCG) and 0.01116 (Recall@5), respectively. Similarly, a Wilcoxon signed-rank test for MRR confirmed a significant effect, with a p-value of 0.01776, further supporting the hypothesis that LangChain improves semantic retrieval performance. View this table: View inline View popup Download powerpoint Table 1. Baseline BERT Results *SciBERT was used as our Baseline BERT, as it showed best performance in Figure 3 and Figure 4 . Furthermore, a graphical representation of the per-query performance scores is shown in the violin plot in Figure 5 , which emphasizes both the distribution shape and median values for each metric. In general, the LangChain system (orange) exhibits a higher median across most metrics compared to the baseline system (blue), indicating better performance. Notably, the NDCG (ranked relevance quality) distribution is skewed substantially higher for LangChain, reflecting stronger retrieval precision, where relevant items were frequently placed near the top of the ranked output. Figure 5 ’s visualization complements the statistical findings by offering a more intuitive view of LangChain’s relative effectiveness. Finally, numerical values from Table 1 and Table 2 show that although LangChain-based retrieval incurred higher latency due to deeper semantic processing, it reduced memory usage by approximately 96.98% (see Supplementary Note 2 for calculation details), significantly improving efficiency in storage and scalability. View this table: View inline View popup Download powerpoint Table 2. LangChain Results Download figure Open in new tab Figure 5. Violin plot of LangChain-integrated and baseline retrieval performances. Visualizes distribution of per-query F1-scores between the baseline vector retrieval pipeline and the LangChain-enhanced retrieval system. Each distribution reflects the spread and median value of the results across 50 test queries. Assessing Output To validate Quanta’s abilities in a real-world setting, we tested whether it could accurately locate answers from a curated set of documents. The focus was to determine if our model could retrieve correct responses when asked specific questions. After uploading six key papers on prurigo nodularis, chronic itch, and cutaneous gene dysregulation, we asked a series of research-relevant questions. The responses ( Supplementary Note 3 ) highlight the ability of our tool to provide meaningful insights, rooted in the literature, that may not be immediately obvious to the user. This demonstrates how our approach could be used to help facilitate further research, or provide valuable information in a clinical setting. Discussion By combining advanced modeling techniques (large language models, LangChain and ChromaDB) with carefully tested evaluation techniques, including cosine similarity, we have developed an effective new tool for summarizing and interpreting large, curated sets of literature. Our approach can be of great use to clinicians and researchers by reducing the cognitive and time burden of manually reviewing extensive research papers, enhancing access to domain-specific knowledge, and facilitating discovery of nuanced insights that may span across multiple publications. As our model is incorporated into an interactive chatbot system that dynamically retrieves information from research papers, it can provide users direct responses to specific questions, as well as generate new research topics. Our system precomputes document embeddings using OpenAI models and stores them in an internal document structure in memory, where cosine similarity is used to rank relevance at query time. This design allows fast retrieval, making it well-suited for interactive use cases. Furthermore, our hybrid retrieval strategy (combining dense vector similarity with metadata-based filtering) narrows down the search space before full ranking is computed, thus significantly improving our efficiency. Finally, our architecture supports content truncation and refinement, allowing us to flexibly limit the input length sent to the LLM while maintaining semantic completeness. We conducted a comparative analysis of our modeling decisions on the well-established BEIR datasets, evaluating performance in terms of accuracy, efficiency, and computational cost. Our results indicate that the model we developed can improve the synthesis of insights from large-scale research datasets. Nevertheless, alternative approaches have been developed, and it is important to consider our work in the context of previous research. Hooper et al. ( Hooper et al., 2024 ) introduced a method (Squeezed Attention) to accelerate long-context LLM inference by clustering fixed prompt keys based on semantic similarity and selectively computing attention only over the most relevant clusters. This achieves substantial improvements in speed and memory efficiency particularly for applications with long prompts; however, it requires inputs to be constrained to fixed contexts. By contrast, our approach addresses the challenge of big data by embedding content into a vector space and combining it with metadata-based filtering. This allows our approach to be more flexible, providing real-time semantic retrieval across dynamic and heterogeneous user queries for a diverse range of documents and tasks. Other implementation decisions we could have made, to address the challenges of extracting meaningful insights from massive text corpora, include the combination of cosine similarity and principal component analysis (PCA). To improve computational efficiency and reduce memory usage, cosine similarity is often used in conjunction with PCA, which compresses high-dimensional embedding vectors by projecting them onto a lower-dimensional subspace. PCA does this by identifying the directions (principal components) along which the data varies the most and retaining only the top components that capture the greatest variance. This effectively removes less informative or redundant dimensions. Consequently, efficiency is gained at the cost of semantic nuance, which can negatively impact both the accuracy and relevance of retrieved responses. Another approach is the Map-Reduce Summarization Method 5 , a pipeline designed to overcome context-length limitations. This method involves segmenting long documents into manageable chunks, summarizing each chunk individually while calculating token constraints, and then synthesizing these intermediate summaries into a final condensed summary. By summarizing in stages, this method enables the system to handle lengthy documents that would otherwise exceed input limits for large language models. While this approach effectively mitigates context window constraints, it introduces notable limitations. First, it leans toward summary-based retrieval rather than enabling access to the original full-text content, potentially omitting important contextual information. Second, because the summaries are generated independently of specific queries, the resulting outputs tend to be generalized and may lack the specificity required for rigorous research inquiries. Finally, this approach often incurs significant computational and financial costs due to repeated external API calls throughout the summarization pipeline. Nevertheless, there are some limitations of our work that future research could investigate. LLMs are known to exhibit hallucinations ( Huang et al., 2025 ), generating confident but inaccurate responses that can compromise the reliability of downstream research or clinical applications. Prompt engineering strategies, including techniques such as Chain-of-Thought (CoT) prompting ( Wei et al., 2022 ), which encourages the model to reason step-by-step before producing an answer, might offer a way to reduce hallucinations. In addition, other plausible approach could be citation-constrained prompting, where the model is explicitly required to reference specific uploaded documents as the basis for its responses, else return “insufficient evidence” if no source is found. A key advantage of our framework lies in its use of a curated and controlled set of documents, a setup that not only reduces the likelihood of misinformation but also gives researchers confidence in the trustworthiness of the source material. Moving forward, we aim to expand this approach by incorporating optional domain-specific filters (ex. restricting retrieval to clinical studies or expert-reviewed content for biomedical setting). This will further strengthen the system’s value as a transparent tool for navigating large-scale scientific literature. Materials & Methods To enhance performance and ensure an optimal embedding model tailored for large-scale dermatological research applications, a structured evaluation process was implemented. We assessed the effectiveness of various embedding models through retrieval accuracy and stability, thereby identifying optimal implementation decisions for the Quanta model, as well as confirming the advantage of using cosine similarity over the Euclidean (L2) method, and investigating the benefits of LangChain integration. Computational Setup The testing procedure was performed on standard computational hardware without the need for specialized external equipment. Yet, to ensure optimal performance and minimal latency, it is recommended to utilize a computer with at least 8GB of RAM. Selection of Embedding Models Five widely adopted embedding models were selected based on their popularity, computational efficiency, and effectiveness in capturing semantic meaning: MiniLM ( Wang et al., 2020 ), Masked and Permuted Pre-training for Language Understanding (MPNet) ( Song et al., 2020 ), DistilUniversal Sentence Encoder (DistilUSE) ( Reimers and Gurevych, 2019 ), Natural Language Inference BERT (NLI-BERT) ( Reimers and Gurevych, 2019 ), and Scientific Text BERT (SciBERT) ( Beltagy et al., 2019 ). These models were chosen due to their demonstrated capabilities in previous literature and practical usage in other Natural Language Processing (NLP) tasks, particularly in contexts requiring precise semantic understanding. Evaluation Dataset The NFCorpus dataset 6 from the BEIR framework was chosen for evaluation purposes. NFCorpus is specifically designed for information retrieval tasks, where the objective is to retrieve relevant documents based on natural language queries. It consists of over 3,000 fact-based documents, primarily drawn from the biomedical domain. Each query is paired with a set of relevance judgments, enabling the use of performance metrics such as F1-score, which requires ground truth labels for accurate evaluation. This makes NFCorpus an ideal benchmark for assessing how well a system understands and retrieves meaningful information, especially in the context of Quanta’s usage. Strong performance on this dataset suggests that the retrieval model is capable of handling complex, real-world queries - translating to more accurate and contextually relevant responses in chatbot applications (i.e. Quanta). Moreover, BEIR is known to provide a more standardized format (including documents, queries, and relevance labels), supporting reproducibility and fair comparison across retrieval methods. Evaluation Metrics We employed five key performance metrics to comprehensively assess embedding model performance ( Table 3 ). View this table: View inline View popup Download powerpoint Table 3. Furthermore, to evaluate the stability, in addition to the score calculated, we’ve recorded the standard deviation of each embedder’s performance across all metrics mentioned above. A lower standard deviation indicates greater stability and consistent performance across various scenarios. Prior to testing the effectiveness of LangChain, the requirements of the paired t-test were verified. Specifically, the observations were dependent (paired across the same query instances), and the normality of the paired differences was assessed using the Shapiro-Wilk test. While most metrics satisfied the normality assumption (p > 0.05), the MRR differences did not, with a Shapiro-Wilk p-value of 0.013. Therefore, F1 (p = 0.281), NDCG (p = 0.095), and Recall@5 (p = 0.069) were analyzed using the paired t-test, and non-parametric Wilcoxon signed-rank test was used for the MRR comparison to ensure statistical validity. Embedding Model Analysis The evaluation procedure involved the following sequential steps. First, NFCorpus data was downloaded and limited to 500 documents and 20 queries for computational feasibility. This number, however, can be adjusted; the consequence of increasing the dataset is the increase in compile time. Then, each embedding model encoded the documents and queries into vector representations. After, both cosine and L2 similarities were computed using the FAISS library. For cosine similarity, all embeddings were normalized to ensure that the similarity score (ranges from -1 (completely dissimilar) to 1 (highly similar), with 0 indicating orthogonality) was based solely on vector direction rather than magnitude, as cosine similarity assumes unit-length vectors. The means and standard deviations of the 5 metrics were calculated and recorded in a CSV file for each embedding model with each similarity approach (cosine vs. L2). This CSV file was later used to graph a comparison bar graph of the means, as well as a heatmap for the standard deviation of each score. LangChain Evaluation & Integration Upon identifying the optimal embedding model, LangChain was integrated into the architecture of Quanta to assess its potential for enhancement in semantic matching performance. The process began by loading the NFCorpus dataset, limited to 500 documents and 50 queries to ensure efficiency during testing. The identified optimal embedding model (NLI-BERT) was then used to generate document embeddings, which were stored in a Facebook AI Similarity Search (FAISS) vector index to enable efficient similarity-based retrieval. A FakeListLLM was employed to simulate the language model component within the LangChain retrieval chain. This allowed us to isolate and evaluate the performance of the retrieval mechanism itself, without introducing variability from generated outputs and additional API cost. The LangChain retrieval chain combined the FAISS retriever with a custom “stuff” document chain to aggregate and format the top 5 most similar documents based on cosine similarity. To assess LangChain’s impact, query-level retrieval performance was measured using four metrics: F1, NDCG, MRR, and Recall@5. Memory usage was tracked using Python’s tracemalloc module, and latency was measured using the time module. To determine the statistical significance of observed improvements, paired statistical tests were performed. Before applying the paired t-test, key conditions were verified: Query-level performance scores were paired across identical query instances. Specifically, the two samples correspond to the performance scores of the baseline (without LangChain) and LangChain systems, evaluated on the same set of queries via query_id, and the normality of the paired differences was assessed using the Shapiro-Wilk test, calculated with the shapiro function from the scipy.stats package. While F1, NDCG, and Recall@5 differences met the normality assumption (p > 0.05), MRR did not (p = 0.013). Consequently, the paired t-test (ttest_rel from scipy.stats) was applied to the first three metric, while the Wilcoxon signed-rank test (non-parametric) was used to evaluate MRR. Each test computed the mean difference between LangChain and baseline scores and compared it to the distribution expected under the null hypothesis (no performance improvement), yielding p-values that confirmed the statistical significance of LangChain’s improvements, and not just due to random chance. Footnotes ↵ 1 https://python.langchain.com/ ↵ 2 https://openai.com/index/gpt-4/ ↵ 3 https://streamlit.io/ ↵ 4 https://github.com/beir-cellar/beir ↵ 5 https://python.langchain.com/docs/tutorials/summarization/ ↵ 6 https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/ References ↵ Beltagy I , Lo K , Cohan A. SciBERT: A Pretrained Language Model for Scientific Text . arXiv 2019 . ↵ Ben Abacha A , Demner-Fushman D. A question-entailment approach to question answering . BMC Bioinformatics 2019 ; 20 ( 1 ): 511 . OpenUrl CrossRef PubMed ↵ De Cassai A , Dost B. Concerns regarding the uncritical use of ChatGPT: a critical analysis of AI-generated references in the context of regional anesthesia . Reg Anesth Pain Med 2024 ; 49 ( 5 ): 378 – 80 . OpenUrl FREE Full Text ↵ Gui H , Rezaei SJ , Schlessinger D , Weed J , Lester J , Wongvibulsin S , et al. Dermatologists’ Perspectives and Usage of Large Language Models in Practice: An Exploratory Survey . J Invest Dermatol 2024 ; 144 ( 10 ): 2298 – 301 . OpenUrl PubMed ↵ Hooper C , Kim S , Mohammadzadeh H , Maheswaran M , Zhao S , Palik J , et al. Squeezed Attention: Accelerating Long Context Length LLM Inference . arXiv 2024 . ↵ Huang L , Yu W , Ma W , Zhong W , Feng Z , Wang H , et al. A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions . ACM Trans Inf Syst 2025 ; 43 ( 2 ):Article 42. ↵ Jin JQ , Dobry AS . ChatGPT for healthcare providers and patients: Practical implications within dermatology . J Am Acad Dermatol 2023 ; 89 ( 4 ): 870 – 1 . OpenUrl PubMed ↵ Kasneci E , Sessler K , Küchemann S , Bannert M , Dementieva D , Fischer F , et al. ChatGPT for good? On opportunities and challenges of large language models for education . Learning and Individual Differences 2023 ; 103 : 102274 . OpenUrl ↵ Lechien JR , Briganti G , Vaira LA . Accuracy of ChatGPT-3.5 and -4 in providing scientific references in otolaryngology-head and neck surgery . Eur Arch Otorhinolaryngol 2024 ; 281 ( 4 ): 2159 – 65 . OpenUrl PubMed ↵ Reimers N , Gurevych I. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks . Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing: Association for Computational Linguistics ; 2019 . ↵ Rinjeni TP , Indriawan A , Rakhmawati NA . Matching Scientific Article Titles using Cosine Similarity and Jaccard Similarity Algorithm . Procedia Computer Science 2024 ; 234 : 553 – 60 . OpenUrl ↵ Si C , Yang D , Hashimoto T. Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with 100+ NLP Researchers . arXiv 2024 . ↵ Song K , Tan X , Qin T , Lu J , Liu T-Y. MPNet: Masked and Permuted Pre-training for Language Understanding . arXiv 2020 . ↵ Wang W , Wei F , Dong L , Bao H , Yang N , Zhou M. MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers . arXiv 2020 . ↵ Wang ZP , Bhandary P , Wang Y , Moore JH . Using GPT-4 to write a scientific review article: a pilot evaluation study . BioData Min 2024 ; 17 ( 1 ): 16 . OpenUrl PubMed ↵ Wei J , Wang X , Schuurmans D , Bosma M , Ichter B , Xia F , et al. Chain-of-thought prompting elicits reasoning in large language models . Proceedings of the 36th International Conference on Neural Information Processing Systems . New Orleans, LA, USA : Curran Associates Inc .; 2022 . p. Article 1800. View the discussion thread. Back to top Previous Next Posted August 19, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Accelerating Insight Discovery in Large Biomedical Text with Scalable Processing Framework Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Accelerating Insight Discovery in Large Biomedical Text with Scalable Processing Framework Dongeun Kim , Megan Hauptman , Matthew T. Patrick bioRxiv 2025.08.14.670384; doi: https://doi.org/10.1101/2025.08.14.670384 Share This Article: Copy Citation Tools Accelerating Insight Discovery in Large Biomedical Text with Scalable Processing Framework Dongeun Kim , Megan Hauptman , Matthew T. Patrick bioRxiv 2025.08.14.670384; doi: https://doi.org/10.1101/2025.08.14.670384 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7642) Biochemistry (17715) Bioengineering (13907) Bioinformatics (42003) Biophysics (21470) Cancer Biology (18624) Cell Biology (25533) Clinical Trials (138) Developmental Biology (13390) Ecology (19935) Epidemiology (2067) Evolutionary Biology (24356) Genetics (15617) Genomics (22529) Immunology (17753) Microbiology (40432) Molecular Biology (17200) Neuroscience (88681) Paleontology (667) Pathology (2840) Pharmacology and Toxicology (4828) Physiology (7653) Plant Biology (15171) Scientific Communication and Education (2046) Synthetic Biology (4304) Systems Biology (9826) Zoology (2271)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.