Evaluating the Efficacy of Large Language Models in Addressing Patient-Centric Inquiries in Multiple Cancers

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 58,518 characters · extracted from preprint-html · click to expand
Evaluating the Efficacy of Large Language Models in Addressing Patient-Centric Inquiries in Multiple Cancers | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Evaluating the Efficacy of Large Language Models in Addressing Patient-Centric Inquiries in Multiple Cancers Soheila Borhani , Xiaoqian Jiang doi: https://doi.org/10.1101/2025.08.05.25332968 Soheila Borhani 1 McWilliams School of Biomedical Informatics, University of Texas Health Science Center at Houston , Houston, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: soheila.borhani{at}uth.tmc.edu Xiaoqian Jiang 1 
McWilliams School of Biomedical Informatics, University of Texas Health Science Center at Houston , Houston, TX, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Background Large Language Models (LLMs) have transformed how patients access health information online. Chatbots like ChatGPT allow users to ask direct questions and receive tailored answers almost instantly. However, for LLMs to be effective, the answers they provide must be reliable and accessible to patients. Our review assessed the reliability and accessibility of LLMs in answering patient inquiries about breast, prostate, and lung cancer. Methods A systematic search of the PubMed, Embase, and Web of Science databases was conducted. Included studies were peer-reviewed original research, published in English, that evaluated one or more LLMs in answering patients’ oncology questions. To enable result aggregation, a linear transformation was applied to standardize data from studies that used different Likert scales. Results We identified three common measures of reliability (accuracy, quality, consistency), and three measures of accessibility (readability, understandability, actionability) across the thirty-six studies that met our inclusion criteria. Accuracy and quality scores showed roughly similar distributions, with median values of 79.0% and 76.5%, respectively. Consistency levels were high in the few studies that provided this data (median = 100%). Despite all LLMs having readability scores significantly below the recommended level for patient-facing materials (median = 40.4%), several studies reported substantial improvements through prompt engineering. Understandability (median = 69.0%) and particularly, actionability (median = 40.0%) scores were lower than desired. 
Conclusions Despite current limitations, LLMs hold significant potential as an assistant tool for disseminating health information to patients. Active involvement of physicians in model training and validation can help improve their performance. 1. Introduction Patients are increasingly turning to the internet for health information, with Google alone processing over a billion health-related searches daily [ 1 ]. Queries related to cancer make up a notable portion of these searches, due to the high prevalence of cancer, and the anxiety that it evokes in patients. Nearly nine in ten (89%) cancer survivors reported searching for cancer information online after receiving their diagnosis [ 2 ]. However, the quality of information available online can be quite poor. A review of the most popular cancer-related articles on Facebook, Reddit, Twitter, and Pinterest found that over 30% of them contained misleading or harmful information [ 3 ]. A similar study [ 4 ] revealed that 90% of prostate cancer content on Instagram was of low to medium quality, with 40% containing significant misinformation. Recent advances in Artificial Intelligence (AI), particularly in the domain of Large Language Models (LLMs), present a paradigm shift in how patients access health information online. Rather than using search engines to navigate numerous links and synthesize information themselves, patients can pose direct questions to LLM-powered chatbots, and receive specific, detailed, and tailored answers in a matter of seconds. The advantage of LLMs over traditional search engines is underscored by the immense popularity and rapid adoption of this new technology. ChatGPT, the first widely accessible LLM-based chatbot developed by OpenAI Inc., achieved a record-breaking 100 million users within just two months of its launch in late 2022 [ 5 ]. 
Shortly after, other technology companies followed suit by releasing their own LLM-based chatbots, as summarized visually in the timeline shown in Figure 1 . Download figure Open in new tab Figure 1. A timeline of the public releases of popular LLM-based chatbots. Despite their transformative potential, LLMs remain susceptible to inaccuracies and misinformation [ 6 , 7 ]. As generative AI models, LLMs utilize deep neural network architectures with billions of tunable parameters. These parameters, algorithmically adjusted in a process called training, enable LLMs to identify statistical patterns and word relationships in written language [ 8 ]. The data used to train LLMs is sourced from a wide and diverse array of digital content including books, scientific papers, news articles, blogs, and social media posts, to name a few. LLMs can therefore inherit any inaccuracies or biases present in their training data [ 9 ]. For example, it is estimated that the majority of the data for training LLMs originates in the U.S. [ 10 ]. As a result, cancer screening recommendations may be skewed towards U.S. standards and guidelines, potentially providing misleading information to patients outside the United States [ 11 ]. Moreover, much of the information found online about a specific cancer drug or treatment may originate from the manufacturing company, rather than authoritative medical sources, which creates a real risk of commercial bias [ 12 ]. Considering the risk of inaccuracies and misinformation in LLMs, their successful use as a source of information for cancer patients depends entirely on the reliability of their output; otherwise, they could cause widespread harm through large-scale dissemination of false information. Reliability, however, is only a necessary condition, not a sufficient one. To be beneficial to patients, the information must be presented in a manner that is easy to access and comprehend. 
In other words, LLM-generated content should be both reliable and accessible . Therefore, in this study we aimed to review the literature to determine whether the answers provided by LLMs to patients’ cancer-related questions were reliable and accessible. We focused on the three most commonly diagnosed cancers (i.e., breast, prostate, and lung cancer) which combined, account for over 40% of new cancer cases in the United States [ 13 ]. To our knowledge, this is the first systematic review to evaluate the performance of LLMs as a source of information for cancer patients. 2. Methods Using the search string displayed in Figure 2 , we conducted a search of article titles and abstracts indexed in three international databases, namely, PubMed, Embase, and Web of Science. Due to LLMs’ recency, we limited the scope of our search to studies published within the past 5 years. Initially, 496 articles were retrieved across all three databases. This number was subsequently reduced to 283 after removing duplicate records. Next, the titles and abstracts of the remaining articles were screened to exclude those that appeared prima facie irrelevant to the topic at hand. Also excluded were articles in languages other than English, publications that were not peer-reviewed, editorials, correspondences, and commentaries. The full text of the remaining articles was read to ensure they specifically addressed and assessed the use of at least one LLM in the context of answering patient questions. In total, thirty-six studies met the criteria described above, and were included in the final review. The selection process is summarized in the Preferred Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA) diagram shown in Figure 2 . Download figure Open in new tab Figure 2. PRISMA flow chart summarizing the process of identification, screening, and selection of the studies included in the review. 
Data extracted from the selected studies included: cancer site (e.g., breast, prostate, lung, or multiple), LLM type and version (e.g., ChatGPT-3.5), the questions posed to LLMs, the number and expertise of human evaluators, and how they graded the LLM responses. To enable result aggregation across studies employing different Likert scales, we applied a data conversion scheme by linearly mapping all scores onto a common 0-100 range, as described in the Results section in further detail. Python version 3.9.6 was used for linear regression analysis and data visualization. 3. Results Table 1 presents an overview of the studies included in this review, detailing, among other information, the LLMs utilized in each study along with the number and source of the questions posed to them. These questions originated from a variety of sources, including the FAQ sections of the websites of various cancer associations (e.g., [ 40 ]), questions formulated by the authors based on their clinical oncology practice and experience (e.g., [ 46 ]), questions designed by focus groups of patient advocates (e.g., [ 35 ]), and questions identified by analyzing Google Trends data (e.g., [ 28 ]). Notably, in one study [ 19 ] the researchers tasked the LLM itself to generate the most common patient questions about prostate cancer. View this table: View inline View popup Table 1. Overview of studies included in the review. Reliability measures included accuracy (Acc.), quality (Qual.), and consistency (Cons.). Accessibility measures included readability (Read.), understandability (Und.), and actionability (Act.). To illustrate the types of questions asked, a representative sample is provided in Table 2 . As can be seen in this exemplar, patient inquiries cover a wide range of topics, from general information on the symptoms and causes of each type of cancer, to various screening and treatment options. 
The interested reader can find the full list of questions for each study in the Multimedia Appendix section of this article (when made available by the authors). View this table: View inline View popup Download powerpoint Table 2. Sample patient questions used to query LLMs. In all studies, once questions were formulated, they were posed to one or more LLMs, and the AI-generated responses were evaluated by a panel of experts. Table 1 also contains information on the number and background of the expert evaluators who assessed the LLM responses on multiple criteria. These criteria fall broadly into one of two categories: reliability measures (encompassing accuracy, quality, and consistency), and accessibility measures (encompassing readability, understandability, and actionability). In what follows, we summarize the findings of the reviewed studies along each of these six dimensions. 3.1. Reliability Measures 3.1.1. Accuracy Accuracy refers to the correctness of the LLM responses. For patients to trust this new technology, the answers provided to their questions must be free of errors and misinformation. Accuracy metrics were reported in twenty-seven of the reviewed articles (see Table 1 ). Despite varied methodologies, researchers commonly used Likert scales to assess expert opinions on response accuracy. For example, Chiarelli et al. [ 18 ] used a 3-point Likert scale to label each LLM response as either inaccurate (score = 0), partially accurate (score = 1), or accurate (score = 2). Belge Bilgin et al. [ 16 ] used a 4-point Likert scale, where scores from 1 to 4 represented completely incorrect, partially correct, mostly correct, and completely correct answers, respectively. Roldan-Vasquez et al. [ 40 ] used a 5-point Likert scale where 1 represented complete inaccuracy and 5 complete accuracy, while Ye et al. [ 48 ] employed the most granular Likert scale with 7 points. 
To allow for direct comparison of results, we used the following linear transformation scheme designed to standardize Likert scores by mapping them onto a common 0-100 range: $y = \frac{x - a}{b - a} \times 100$. Here, x is the original score, a and b are the scale’s minimum and maximum values, and y denotes the transformed score that lies within the 0-100 range. This approach has been shown to minimize information loss during the transformation of data from its original scale structure [ 49 ]. Additionally, the transformed scores can be conveniently interpreted as percentages. In the left panel of Figure 3 , we illustrate the distribution of reported accuracy scores post-standardization for n=50 LLMs, noting that some of the twenty-seven studies that measured accuracy evaluated more than one LLM. The normalized accuracy scores ranged from 43.0% to 100%, with a median score of 79.0%. Download figure Open in new tab Figure 3. (Left panel) The box plot distribution of accuracy, quality, consistency, readability, understandability, and actionability of LLM responses. (Right panel) The scatter plot of Flesch Reading Ease Scale (FRES) scores and corresponding Grade Level (GL) scores for studies that reported both metrics. 3.1.2. Quality Accuracy, while essential, does not fully capture the quality of LLM responses. A factually accurate response may be deemed low-quality if it is not entirely relevant or responsive to the question, omits essential information such as risks and benefits associated with screening or treatment, or manifests biases stemming from the training data. Most reviewed studies which reported on the quality of LLM responses utilized the DISCERN tool [ 50 ] — a standardized instrument designed to assess the quality of written health information. DISCERN consists of 15 questions, each scored from 1 (low-quality) to 5 (high-quality), assessing relevance, transparency, comprehensiveness, and lack of bias in the answer provided. 
A 16th question offers an overall quality rating, also using the same 1-to-5 scale. Besides DISCERN, other studies measured quality using the Global Quality Scale (GQS) with the following scoring structure: 1 = poor quality, not useful for patients; 2 = poor quality, limited usefulness; 3 = partially useful, but missing key information; 4 = good quality, useful for patients; and, 5 = excellent quality, highly useful for patients [ 15 , 20 , 22 , 32 ]. Since both DISCERN and GQS utilize 5-point ordinal scales, their results can be mapped to a 0-100 range using the same linear transformation described previously. The distribution of the reported quality scores post-standardization is shown in the left panel of Figure 3 . The normalized quality scores ranged from 47.0% to 100%, with a median score of 76.5%. 3.1.3. Consistency LLMs may not generate the same response when asked the same question multiple times. This is largely due to the stochastic (non-deterministic) nature of text generation in these models. Specifically, most LLMs use a parameter called temperature to control the level of randomness in their output [ 51 ]. A higher temperature increases randomness, leading to more diverse responses. In contrast, a lower temperature will result in more deterministic and consistent answers. While a high temperature setting is suitable for creative tasks and applications, it can have a detrimental effect in high-stakes healthcare applications such as answering patient questions. Six of the reviewed studies evaluated consistency by repeatedly posing the same question to the LLM. Five of these six had experts rate response similarity or consistency [ 14 , 17 , 24 , 38 , 48 ]. The remaining study [ 35 ] used the intraclass correlation coefficient (ICC) of word counts as a proxy for consistency between repeated responses. The box plot of normalized consistency scores is shown in the left panel of Figure 3 . 
Excluding one outlier, the normalized consistency scores of LLMs ranged from 75.0% to 100%, with a median consistency score of 100%. 3.2. Accessibility Measures 3.2.1. Readability Readability refers to how easily LLM responses can be read and understood by patients. Even a reliable model, one which consistently produces accurate and high-quality responses, would be ineffective if the responses were not easily readable. In this context, readability is commonly measured via the Flesch Reading Ease Scale (FRES), which calculates readability based on the average sentence length and the average number of syllables per word [ 52 ]. FRES scores range from 0 to 100, with larger scores indicating better readability: 90–100 = very easy to read; 80–89 = easy to read; 70–79 = fairly easy to read; 60–69 = plain English; 50–59 = fairly difficult to read; 30–49 = difficult to read; and, FRES scores below 30 = very difficult to read. The FRES score distribution of n=28 LLMs in fifteen studies which calculated this readability measure is plotted in the left panel of Figure 3 . The reported FRES scores ranged from 24.0% to 66.4%, with a median score of 40.4%. Beyond FRES, researchers also assessed readability using metrics such as the Flesch-Kincaid grade level [ 53 ], the Gunning Fog index [ 54 ], the automated readability index [ 55 ], the Coleman-Liau index [ 56 ], and the Simplified Measure of Gobbledygook [ 57 ]. These metrics, despite differing computational methodologies, all aim to estimate the U.S. grade level required for understanding written information. For instance, a Flesch-Kincaid grade score of 8 indicates that the text is readable for someone with an eighth-grade education level. Figure 3 (right panel) displays a scatter plot comparing FRES scores and grade level (GL) scores from studies that reported both. When studies reported more than one GL score, we followed the approach recommended by Haver et al. [ 27 ] and averaged them. 
The American Medical Association (AMA) recommends that consumer health information be written at or below a sixth grade reading level to ensure accessibility for all patients, regardless of their education or literacy level [ 58 ]. As can be seen in the figure, all LLMs fell considerably outside this recommended region (highlighted in gray). Unsurprisingly, a fairly strong negative correlation was observed between the FRES and GL scores of LLM responses (R-squared = 0.69). 3.2.2. Understandability The Patient Education Materials Assessment Tool (PEMAT) is a validated instrument designed to assess the effectiveness of patient-facing health education materials [ 59 ]. It offers two distinct versions for printable (PEMAT-P) and audiovisual (PEMAT-A/V) formats. The PEMAT-P section of the instrument includes an understandability component focused on measuring patients’ comprehension of key messages via 19 yes/no questions, which are scored and reported as a percentage. The left panel of Figure 3 includes the box plot distribution of PEMAT-P understandability scores from six studies (n=12 LLMs) which measured understandability as part of their assessment. The reported understandability scores ranged from 40.0% to 91.7%, with a median score of 69.0%. 3.2.3. Actionability The second component of the PEMAT-P instrument includes seven yes/no questions designed to determine the ability of patients to identify actionable steps in the provided content. Akin to understandability scores, actionability scores are also calculated as the percentage of positive responses to the seven binary questions. As displayed in the left panel of Figure 3 , excluding one outlier, the actionability scores ranged from 20.0% to 76.0%, with a median score of 40.0%. 4. Discussion Our review focused on evaluating the performance of LLMs in addressing patients’ oncology questions, across these six dimensions: accuracy, quality, consistency, readability, understandability, and actionability. 
As shown in Figure 3 , accuracy and quality scores exhibit roughly similar distributions, with median values of 79.0% and 76.5%, respectively. Although these numbers suggest that LLMs generally provide reliable information, inaccurate and low-quality answers do occasionally occur. Such inaccuracies range from minor oversights to significant errors. For example, Coskun et al. [ 20 ] highlighted an instance where the LLM responded by stating that “a normal PSA level [is] considered to be 4 ng/mL or lower”. This statement is generally correct but fails to account for individual factors (such as age and prostate volume) that can affect a patient’s normal PSA level. Piao et al. [ 37 ] documented another instance wherein ChatGPT incorrectly stated that “surgery can ensure the complete removal of all cancer cells in the patient’s body”. In contrast to these generally correct but imprecise statements, some LLM answers were entirely incorrect or misleading. For example, Rahsepar et al. [ 38 ] and Piao et al. [ 37 ] respectively reported LLM responses containing non-existent lung (e.g., Lung-RADS 5) and breast (e.g., BI-RADS 7) imaging categories. In another study, Spuur et al. [ 42 ] found several instances of fabricated citations in LLM outputs, likely resulting from the hybridization of real references. This undesired generation of false information by LLMs is termed “hallucination” in computer science literature [ 60 ]. To avoid using anthropomorphic language in medical contexts, “fact fabrication” might be a more suitable term to describe this phenomenon [ 61 ]. Irrespective of terminology, fact fabrications or hallucinations present a substantial risk, particularly in healthcare applications. Mitigating hallucinations remains an active area of research, with several strategies demonstrating great promise. 
One such strategy is supervised fine-tuning where a pre-trained LLM’s parameters are fine-tuned using a smaller, task-specific dataset to optimize model performance [ 62 ]. Using Retrieval-Augmented Generation (RAG), the accuracy and quality of LLM responses can be improved by incorporating information from external, authoritative sources such as medical textbooks, clinical guidelines, and consensus statements [ 63 ]. Chain-of-thought prompting is another strategy that encourages the LLM to explain its reasoning process in a step-by-step sequence, making it easier to identify potential errors in the model’s logic [ 64 ]. The third dimension of reliability was consistency, or the ability of LLMs to provide consistent answers when prompted multiple times. Some researchers referred to this concept as stability [ 14 ] or reproducibility [ 17 ]. With a median consistency score of 100%, our results suggest that LLMs generally perform well in this area (see Figure 3 ). However, given that only a small fraction of the reviewed studies (6 out of 36) assessed consistency, these results should be interpreted with caution. In terms of readability, all LLMs exhibited suboptimal performance, as shown in the right panel of Figure 3 . Our results indicate that LLM outputs often use complex language which surpasses the average U.S. adult’s eighth grade reading level [ 65 ]. Fortunately, there is evidence that prompt engineering, or the deliberate and careful modification of input prompts, can improve the readability of LLM responses. For example, Musheyev et al. [ 31 ] reported that adding the phrase “Explain the following at a sixth grade reading level” to the beginning of each prompt lowered the reading level of LLM outputs by two grade levels. Similar substantial improvements in readability have been reported by other researchers through the use of appropriate prompts (see Table 3 ). View this table: View inline View popup Download powerpoint Table 3. 
The effect of prompt engineering on improving the readability of LLM answers. With median scores of 69.0% and 40.0%, respectively, understandability and particularly, actionability of LLM responses were lower than desired. It should be noted, however, that the PEMAT-P assessment tool employed to measure understandability and actionability, was not originally designed for evaluating AI-generated content. For instance, PEMAT-P includes several questions about layout, design, and inclusion of visual aids. LLM responses, being purely text-based, may be unfairly penalized on these criteria, contributing to low understandability and actionability scores. Therefore, it is necessary and timely to establish standardized methods and guidelines for evaluating LLMs in medical settings. The Chatbot Assessment Reporting Tool (CHART) is an initiative aimed at filling this gap, specifically for LLMs tasked with providing clinical advice [ 66 ]. Our review revealed some deficiencies and limitations that future studies in this area should address. First, the expert evaluation of LLM responses was rarely blinded. In most cases, evaluators knew the responses were AI-generated, and this awareness could have influenced their grading. To mitigate this risk of bias, we recommend that future studies, when feasible, mix human-generated and AI-generated responses in a blinded evaluation. Second, almost all studies used physicians exclusively to evaluate LLM responses. While this is entirely appropriate and necessary for assessing medical accuracy, patients are better suited to judge the accessibility of the answers, as they are the technology’s end-users in this scenario. Future studies should consider recruiting patients with diverse backgrounds and education levels to better assess the patient-centric measures of LLM performance. Patients’ attitudes, perspectives, and feedback could also inform prompt engineering and guide model development. 
Ideally, randomized controlled trials (RCTs) should be used to test the true efficacy of LLMs, in which real patients pose their questions to chatbots in a controlled environment. In a recent RCT study, Baumgärtner et al. [ 67 ] reported a significant reduction in the patients’ information needs following the use of the PROState cancer Conversational Agent (PROSCA). This study was not included in our review because the chatbot PROSCA does not qualify as an LLM, as it only provides fixed and pre-vetted responses. Finally, personalizing LLM interactions by integrating patients’ individual information into the prompt could be a key focus area for future investigations. Without such personalization, LLMs can only offer generalized, one-size-fits-all advice, which may not be suitable for a patient based on their unique medical history or pre-existing conditions. 5. Conclusion Our review of the data suggests that significant improvements are still needed to realize the full potential of LLMs as a source of information for cancer patients. Specifically, future advancements in LLMs must ensure they deliver personalized, hallucination-free information that is easily readable, understandable, and actionable for patients. Though still maturing, LLM-powered chatbots have already influenced how patients access medical information. As with any disruptive technology, LLMs have also been met with a certain degree of skepticism [ 68 ]. Decades ago, similar concerns were expressed about the use of computers in medicine [ 69 – 71 ]. Over time, however, the healthcare community adapted and embraced computers and electronic health records. We posit that the adoption of LLMs will follow a similar pattern. But, to achieve effective integration of LLMs, the physician’s role relative to AI must be clearly defined. It is important to remember that LLMs are not meant to replace doctors, nor can they, as some have feared or warned [ 72 ]. 
Even if AI were to reach a point where it knows every fact about cancer and can convey them accurately, it could never replicate the unique experiences that physicians gain through direct human interactions with patients [ 73 , 74 ]. Therefore, LLMs should be envisioned as assistant tools for disseminating high-quality information to patients. Physicians can play a critical role as a human-in-the-loop, providing clinical expertise and oversight during training and validation of LLMs. Using techniques such as Reinforcement Learning from human feedback [ 75 ], physicians can actively guide LLMs in learning accurately from data, helping to ensure these models uphold the core medical principle of “first, do no harm.” Data Availability All data produced in the present study are available upon reasonable request to the authors. 6. References 1. ↵ Murphy , M . ( 2019 ). Dr Google will see you now: Search giant wants to cash in on your medical queries . The Telegraph , Mar. 10, 2019 . Accessed: Mar. 15, 2025 . [Online]. Available: https://www.telegraph.co.uk/technology/2019/03/10/google-sifting-one-billion-health-questions-day/ 2. ↵ Rosenberg , J . ( 2017 ). Majority of Patients Go Online for Information Following Cancer Diagnosis, Study Finds . The American Journal of Managed Care , Dec. 5 , 2017 . Accessed: Mar. 15, 2025 . [Online]. Available: https://www.ajmc.com/view/majority-of-patients-go-online-for-information-following-cancer-diagnosis-study-finds OpenUrl 3. ↵ Johnson , S. B. , Parsons , M. , Dorff , T. , Moran , M. S. , Ward , J. H. , Cohen , S. A. , Akerley , W. , Bauman , J. , Hubbard , J. , Spratt , D. E. , Bylund , C. L. , Swire-Thompson , B. , Onega , T. , Scherer , L. D. , Tward , J. , & Fagerlin , A . ( 2022 ). Cancer Misinformation and Harmful Information on Facebook and Other Social Media: A Brief Report . Journal of the National Cancer Institute , 114 ( 7 ), 1036 – 1039 . doi: 10.1093/jnci/djab141 OpenUrl CrossRef PubMed 4. ↵ Xu , A. J. , Myrie , A. 
, Taylor , J. I. , Matulewicz , R. , Gao , T. , Pérez-Rosas , V. , Mihalcea , R. , & Loeb , S . ( 2022 ). Instagram and prostate cancer: using validated instruments to assess the quality of information on social media . Prostate cancer and prostatic diseases , 25 ( 4 ), 791 – 793 . doi: 10.1038/s41391-021-00473-7 OpenUrl CrossRef PubMed 5. ↵ Hu , K. ( 2023 ). ChatGPT sets record for fastest-growing user base – analyst note . Reuters , Feb. 2, 2023. Accessed: Mar. 15, 2025 . [Online]. Available: https://www.reuters.com/technology/chatgpt-sets-record-fastest-growing-user-base-analyst-note-2023-02-01 6. ↵ Bélisle-Pipon J. C . ( 2024 ). Why we need to be careful with LLMs in medicine . Frontiers in medicine , 11 , 1495582 . doi: 10.3389/fmed.2024.1495582 OpenUrl CrossRef PubMed 7. ↵ Han , T. , Nebelung , S. , Khader , F. , Wang , T. , Müller-Franzes , G. , Kuhl , C. , Försch , S. , Kleesiek , J. , Haarburger , C. , Bressem , K. K. , Kather , J. N. , & Truhn , D . ( 2024 ). Medical large language models are susceptible to targeted misinformation attacks . NPJ digital medicine , 7 ( 1 ), 288 . doi: 10.1038/s41746-024-01282-7 OpenUrl CrossRef PubMed 8. ↵ Borhani , R. , Borhani , S. , Katsaggelos , A. K . ( 2022 ). Fundamentals of Machine Learning and Deep Learning in Medicine . Springer . doi: 10.1007/978-3-031-19502-0 OpenUrl 9. ↵ Fang , X. , Che , S. , Mao , M. , Zhang , H. , Zhao , M. , & Zhao , X . ( 2024 ). Bias of AI-generated content: an examination of news produced by large language models . Scientific reports , 14 ( 1 ), 5224 . doi: 10.1038/s41598-024-55686-2 OpenUrl CrossRef PubMed 10. ↵ Dodge , J. , Marasovic , A. , Ilharco , G. , Groeneveld , D. , Mitchell , M. , & Gardner , M. ( 2021 ). Documenting Large Webtext Corpora: A Case Study on the Colossal Clean Crawled Corpus . Conference on Empirical Methods in Natural Language Processing . 11. ↵ Gibson , D. , Jackson , S. , Shanmugasundaram , R. , Seth , I. , Siu , A. , Ahmadi , N. , Kam , J. , Mehan , N. 
, Thanigasalam , R. , Jeffery , N. , Patel , M. I. , & Leslie , S . ( 2024 ). Evaluating the Efficacy of ChatGPT as a Patient Education Tool in Prostate Cancer: Multimetric Assessment . Journal of medical Internet research , 26 , e55939 . doi: 10.2196/55939 OpenUrl CrossRef PubMed 12. ↵ Betschart , P. , Pratsinis , M. , Müllhaupt , G. , Rechner , R. , Herrmann , T. R. , Gratzke , C. , Schmid , H. P. , Zumstein , V. , & Abt , D . ( 2020 ). Information on surgical treatment of benign prostatic hyperplasia on YouTube is highly biased and misleading . BJU international , 125 ( 4 ), 595 – 601 . doi: 10.1111/bju.14971 OpenUrl CrossRef 13. ↵ Siegel , R. L. , Giaquinto , A. N. , & Jemal , A . ( 2024 ). Cancer statistics, 2024 . CA: a cancer journal for clinicians , 74 ( 1 ), 12 – 49 . doi: 10.3322/caac.21820 OpenUrl CrossRef PubMed 14. ↵ Alasker , A. , Alsalamah , S. , Alshathri , N. , Almansour , N. , Alsalamah , F. , Alghafees , M. , AlKhamees , M. , & Alsaikhan , B . ( 2024 ). Performance of large language models (LLMs) in providing prostate cancer information . BMC urology , 24 ( 1 ), 177 . doi: 10.1186/s12894-024-01570-0 OpenUrl CrossRef 15. ↵ Bayley , E. M. , Liu , H. Y. , Bonetti , M. A. , Egro , F. M. , & Diego , E. J . ( 2025 ). ChatGPT as a Valuable Patient Education Resource in Breast Cancer Care . Annals of surgical oncology , 32 ( 2 ), 653 – 655 . doi: 10.1245/s10434-024-16369-4 OpenUrl CrossRef PubMed 16. ↵ Belge Bilgin , G. , Bilgin , C. , Childs , D. S. , Orme , J. J. , Burkett , B. J. , Packard , A. T. , Johnson , D. R. , Thorpe , M. P. , Riaz , I. B. , Halfdanarson , T. R. , Johnson , G. B. , Sartor , O. , & Kendi , A. T. ( 2024 ). Performance of ChatGPT-4 and Bard chatbots in responding to common patient questions on prostate cancer 177Lu-PSMA-617 therapy . Frontiers in oncology , 14 , 1386718 . doi: 10.3389/fonc.2024.1386718 OpenUrl CrossRef PubMed 17. ↵ Caglar , U. , Yildiz , O. , Meric , A. , Ayranci , A. , Yusuf , R. , Sarilar , O. , & Ozgor , F . 
( 2023 ). Evaluating the performance of ChatGPT in answering questions related to benign prostate hyperplasia and prostate cancer . Minerva urology and nephrology , 75 ( 6 ), 729 – 733 . doi: 10.23736/S2724-6051.23.05450-2 OpenUrl CrossRef 18. ↵ Chiarelli , G. , Stephens , A. , Finati , M. , Cirulli , G. O. , Beatrici , E. , Filipas , D. K. , Arora , S. , Tinsley , S. , Bhandari , M. , Carrieri , G. , Trinh , Q. D. , Briganti , A. , Montorsi , F. , Lughezzani , G. , Buffi , N. , Rogers , C. , & Abdollah , F . ( 2024 ). Adequacy of prostate cancer prevention and screening recommendations provided by an artificial intelligence-powered large language model . International urology and nephrology , 56 ( 8 ), 2589 – 2595 . doi: 10.1007/s11255-024-04009-5 OpenUrl CrossRef PubMed 19. ↵ Collin , H. , Keogh , K. , Basto , M. , Loeb , S. , & Roberts , M. J . ( 2024 ). ChatGPT can help guide and empower patients after prostate cancer diagnosis . Prostate cancer and prostatic diseases . Advance online publication. doi: 10.1038/s41391-024-00864-6 OpenUrl CrossRef 20. ↵ Coskun , B. , Ocakoglu , G. , Yetemen , M. , & Kaygisiz , O . ( 2023 ). Can ChatGPT, an Artificial Intelligence Language Model, Provide Accurate and High-quality Patient Information on Prostate Cancer? . Urology , 180 , 35 – 58 . doi: 10.1016/j.urology.2023.05.040 OpenUrl CrossRef PubMed 21. Erkan , A. , Koc , A. , Barali , D. , Satir , A. , Zengin , S. , Kilic , M. , Dundar , G. , & Guzelsoy , M . ( 2024 ). Can Patients With Urogenital Cancer Rely on Artificial Intelligence Chatbots for Treatment Decisions? . Clinical genitourinary cancer , 22 ( 6 ), 102206 . doi: 10.1016/j.clgc.2024.102206 OpenUrl CrossRef PubMed 22. ↵ Ferrari-Light , D. , Merritt , R. E. , D’Souza , D. , Ferguson , M. K. , Harrison , S. , Madariaga , M. L. , Lee , B. E. , Moffatt-Bruce , S. D. , & Kneuertz , P. J . ( 2024 ). 
Evaluating ChatGPT as a patient resource for frequently asked questions about lung cancer surgery-a pilot study . The Journal of thoracic and cardiovascular surgery , S0022-5223(24)00837-7. Advance online publication. doi: 10.1016/j.jtcvs.2024.09.030 OpenUrl CrossRef 23. Geantă , M. , Bădescu , D. , Chirca , N. , Nechita , O. C. , Radu , C. G. , Rascu , S. , Rădăvoi , D. , Sima , C. , Toma , C. , & Jinga , V . ( 2024 ). The Potential Impact of Large Language Models on Doctor-Patient Communication: A Case Study in Prostate Cancer . Healthcare (Basel, Switzerland) , 12 ( 15 ), 1548 . doi: 10.3390/healthcare12151548 OpenUrl CrossRef PubMed 24. ↵ Gencer A . ( 2024 ). Readability analysis of ChatGPT’s responses on lung cancer . Scientific reports , 14 ( 1 ), 17234 . doi: 10.1038/s41598-024-67293-2 OpenUrl CrossRef PubMed 25. Gummadi , R. , Dasari , N. , Kumar , D. S. , & Pindiprolu , S. K. S. S . ( 2024 ). Evaluating the Accuracy of Large Language Model (ChatGPT) in Providing Information on Metastatic Breast Cancer . Advanced pharmaceutical bulletin , 14 ( 3 ), 499 – 503 . doi: 10.34172/apb.2024.060 OpenUrl CrossRef PubMed 26. Haver , H. L. , Gupta , A. K. , Ambinder , E. B. , Bahl , M. , Oluyemi , E. T. , Jeudy , J. , & Yi , P. H . ( 2024 ). Evaluating the Use of ChatGPT to Accurately Simplify Patient-centered Information about Breast Cancer Prevention and Screening . Radiology. Imaging cancer , 6 ( 2 ), e230086 . doi: 10.1148/rycan.230086 OpenUrl CrossRef 27. ↵ Haver , H. L. , Lin , C. T. , Sirajuddin , A. , Yi , P. H. , & Jeudy , J . ( 2023 ). Use of ChatGPT, GPT-4, and Bard to Improve Readability of ChatGPT’s Answers to Common Questions About Lung Cancer and Lung Cancer Screening. AJR . American journal of roentgenology , 221 ( 5 ), 701 – 704 . doi: 10.2214/AJR.23.29622 OpenUrl CrossRef 28. ↵ Hershenhouse , J. S. , Mokhtar , D. , Eppler , M. B. , Rodler , S. , Storino Ramacciotti , L. , Ganjavi , C. , Hom , B. , Davis , R. J. , Tran , J. , Russo , G. I. , Cocci , A. 
, Abreu , A. , Gill , I. , Desai , M. , & Cacciamani , G. E . ( 2024 ). Accuracy, readability, and understandability of large language models for prostate cancer information to the public . Prostate cancer and prostatic diseases . Advance online publication. doi: 10.1038/s41391-024-00826-y OpenUrl CrossRef 29. Janopaul-Naylor , J. R. , Koo , A. , Qian , D. C. , McCall , N. S. , Liu , Y. , & Patel , S. A . ( 2024 ). Physician Assessment of ChatGPT and Bing Answers to American Cancer Society’s Questions to Ask About Your Cancer . American journal of clinical oncology , 47 ( 1 ), 17 – 21 . doi: 10.1097/COC.0000000000001050 OpenUrl CrossRef PubMed 30. Lombardo , R. , Gallo , G. , Stira , J. , Turchi , B. , Santoro , G. , Riolo , S. , Romagnoli , M. , Cicione , A. , Tema , G. , Pastore , A. , Al Salhi , Y. , Fuschi , A. , Franco , G. , Nacchia , A. , Tubaro , A. , & De Nunzio , C. ( 2025 ). Quality of information and appropriateness of Open AI outputs for prostate cancer . Prostate cancer and prostatic diseases , 28 ( 1 ), 229 – 231 . doi: 10.1038/s41391-024-00789-0 OpenUrl CrossRef PubMed 31. ↵ Musheyev , D. , Pan , A. , Gross , P. , Kamyab , D. , Kaplinsky , P. , Spivak , M. , Bragg , M. A. , Loeb , S. , & Kabarriti , A. E . ( 2024 ). Readability and Information Quality in Cancer Information From a Free vs Paid Chatbot . JAMA network open , 7 ( 7 ), e2422275 . doi: 10.1001/jamanetworkopen.2024.22275 OpenUrl CrossRef 32. ↵ Ozgor , F. , Caglar , U. , Halis , A. , Cakir , H. , Aksu , U. C. , Ayranci , A. , & Sarilar , O . ( 2024 ). Urological Cancers and ChatGPT: Assessing the Quality of Information and Possible Risks for Patients . Clinical genitourinary cancer , 22 ( 2 ), 454 – 457 .e4. doi: 10.1016/j.clgc.2023.12.017 OpenUrl CrossRef PubMed 33. Pan , A. , Musheyev , D. , Bockelman , D. , Loeb , S. , & Kabarriti , A. E . ( 2023 ). Assessment of Artificial Intelligence Chatbot Responses to Top Searched Queries About Cancer . 
JAMA oncology , 9 ( 10 ), 1437 – 1440 . doi: 10.1001/jamaoncol.2023.2947 OpenUrl CrossRef PubMed 34. Owens , O. L. , & Leonard , M . ( 2025 ). A Comparison of Prostate Cancer Screening Information Quality on Standard and Advanced Versions of ChatGPT, Google Gemini, and Microsoft Copilot: A Cross-Sectional Study. American journal of health promotion : AJHP, 8901171251316371 . Advance online publication . doi: 10.1177/08901171251316371 OpenUrl CrossRef 35. ↵ Park , K. U. , Lipsitz , S. , Dominici , L. S. , Lynce , F. , Minami , C. A. , Nakhlis , F. , Waks , A. G. , Warren , L. E. , Eidman , N. , Frazier , J. , Hernandez , L. , Leslie , C. , Rafte , S. , Stroud , D. , Weissman , J. S. , King , T. A. , & Mittendorf , E. A . ( 2025 ). Generative artificial intelligence as a source of breast cancer information for patients: Proceed with caution . Cancer , 131 ( 1 ), e35521 . doi: 10.1002/cncr.35521 OpenUrl CrossRef PubMed 36. Patel , J. M. , Hermann , C. E. , Growdon , W. B. , Aviki , E. , & Stasenko , M . ( 2024 ). ChatGPT accurately performs genetic counseling for gynecologic cancers . Gynecologic oncology , 183 , 115 – 119 . doi: 10.1016/j.ygyno.2024.04.006 OpenUrl CrossRef PubMed 37. ↵ Piao , Y. , Chen , H. , Wu , S. , Li , X. , Li , Z. , & Yang , D . ( 2024 ). Assessing the performance of large language models (LLMs) in answering medical questions regarding breast cancer in the Chinese context . Digital health , 10 , 20552076241284771 . doi: 10.1177/20552076241284771 OpenUrl CrossRef PubMed 38. ↵ Rahsepar , A. A. , Tavakoli , N. , Kim , G. H. J. , Hassani , C. , Abtin , F. , & Bedayat , A . ( 2023 ). How AI Responds to Common Lung Cancer Questions: ChatGPT vs Google Bard . Radiology , 307 ( 5 ), e230922 . doi: 10.1148/radiol.230922 OpenUrl CrossRef PubMed 39. Rogasch , J. M. M. , Metzger , G. , Preisler , M. , Galler , M. , Thiele , F. , Brenner , W. , Feldhaus , F. , Wetz , C. , Amthauer , H. , Furth , C. , & Schatka , I . ( 2023 ). 
ChatGPT: Can You Prepare My Patients for [18F]FDG PET/ CT and Explain My Reports?. Journal of nuclear medicine : official publication , Society of Nuclear Medicine , 64 ( 12 ), 1876 – 1879 . doi: 10.2967/jnumed.123.266114 OpenUrl Abstract / FREE Full Text 40. ↵ Roldan-Vasquez , E. , Mitri , S. , Bhasin , S. , Bharani , T. , Capasso , K. , Haslinger , M. , Sharma , R. , & James , T. A . ( 2024 ). Reliability of artificial intelligence chatbot responses to frequently asked questions in breast surgical oncology . Journal of surgical oncology , 130 ( 2 ), 188 – 203 . doi: 10.1002/jso.27715 OpenUrl CrossRef PubMed 41. Shah , Y. B. , Ghosh , A. , Hochberg , A. , Mark , J. R. , Lallas , C. D. , & Shah , M. S . ( 2024 ). Artificial intelligence improves urologic oncology patient education and counseling . The Canadian journal of urology , 31 ( 5 ), 12013 – 12018 . OpenUrl PubMed 42. ↵ Spuur , K. , Currie , G. , Al-Mousa , D. , & Pape , R . ( 2024 ). Suitability of ChatGPT as a Source of Patient Information for Screening Mammography . Health promotion practice , 15248399241285060 . Advance online publication. doi: 10.1177/15248399241285060 OpenUrl CrossRef 43. Stenzl , A. , Armstrong , A. J. , Rogers , E. , Habr , D. , Walz , J. , Gleave , M. , Sboner , A. , Ghith , J. , Serfass , L. , Schuler , K. W. , Garas , S. , Chari , D. , Truman , K. , & Sternberg , C. N . ( 2025 ). Evaluation of ChatGPT as a Reliable Source of Medical Information on Prostate Cancer for Patients: Global Comparative Survey of Medical Oncologists and Urologists . Urology practice , 12 ( 2 ), 229 – 240 . doi: 10.1097/UPJ.0000000000000740 OpenUrl CrossRef PubMed 44. Szczesniewski , J. J. , Ramos Alba , A. , Rodríguez Castro , P. M. , Lorenzo Gómez , M. F. , Sainz González , J. , & Llanes González , L . ( 2024 ). Quality of information about urologic pathology in English and Spanish from ChatGPT, BARD, and Copilot . Actas urologicas espanolas , 48 ( 5 ), 398 – 403 . 
doi: 10.1016/j.acuroe.2024.02.009 OpenUrl CrossRef PubMed 45. Thia , I. , & Saluja , M . ( 2024 ). ChatGPT: Is This Patient Education Tool for Urological Malignancies Readable for the General Population? . Research and reports in urology , 16 , 31 – 37 . doi: 10.2147/RRU.S440633 OpenUrl CrossRef 46. ↵ Trapp , C. , Schmidt-Hegemann , N. , Keilholz , M. , Brose , S. F. , Marschner , S. N. , Schönecker , S. , Maier , S. H. , Dehelean , D. C. , Rottler , M. , Konnerth , D. , Belka , C. , Corradini , S. , & Rogowski , P . ( 2025 ). Patient- and clinician-based evaluation of large language models for patient education in prostate cancer radiotherapy. Strahlentherapie und Onkologie : Organ der Deutschen Röntgengesellschaft … [et al] , 201 ( 3 ), 333 – 342 . doi: 10.1007/s00066-024-02342-3 OpenUrl CrossRef 47. Troian , M. , Lovadina , S. , Ravasin , A. , Arbore , A. , Aleksova , A. , Baratella , E. , & Cortale , M . ( 2025 ). An Assessment of ChatGPT’s Responses to Common Patient Questions About Lung Cancer Surgery: A Preliminary Clinical Evaluation of Accuracy and Relevance . Journal of clinical medicine , 14 ( 5 ), 1676 . doi: 10.3390/jcm14051676 OpenUrl CrossRef PubMed 48. ↵ Ye , Z. , Zhang , B. , Zhang , K. , Méndez , M. J. G. , Yan , H. , Wu , T. , Qu , Y. , Jiang , Y. , Xue , P. , & Qiao , Y . ( 2024 ). An assessment of ChatGPT’s responses to frequently asked questions about cervical and breast cancer . BMC women’s health , 24 ( 1 ), 482 . doi: 10.1186/s12905-024-03320-8 OpenUrl CrossRef PubMed 49. ↵ Murat , N . ( 2021 ). A hybrid transformation approach for common scaling on various type Likert scales in Bayesian structural equation modeling . Communications in Statistics - Theory and Methods , 51 ( 5 ), 1217 – 1231 . doi: 10.1080/03610926.2020.1853774 OpenUrl CrossRef 50. ↵ Charnock , D. , Shepperd , S. , Needham , G. , & Gann , R . ( 1999 ). DISCERN: an instrument for judging the quality of written consumer health information on treatment choices . 
Journal of epidemiology and community health , 53 ( 2 ), 105 – 111 . doi: 10.1136/jech.53.2.105 OpenUrl Abstract 51. ↵ Hinton , G. E. , Vinyals , O. , & Dean , J. ( 2015 ). Distilling the Knowledge in a Neural Network . ArXiv . https://arxiv.org/abs/1503.02531 52. ↵ Flesch , R. F . ( 1979 ). How to write plain English: a book for lawyers and consumers . Harper & Row . 53. ↵ Kincaid , J. P. , Fishburne Jr , R. P. , Rogers , R. L. , & Chissom , B. S. ( 1975 ). Derivation of new readability formulas for navy enlisted personnel . Technical Report, Naval Technical Training Command Millington TN Research Branch . 54. ↵ Gunning , R . ( 1952 ). The Technique of Clear Writing . McGraw-Hill . 55. ↵ Smith , E. A. , & Senter , R. J . ( 1967 ). Automated readability index . AMRL-TR. Aerospace Medical Research Laboratories (U.S .), 1 – 14 . 56. ↵ Coleman , M. , & Liau , T. L . ( 1975 ). A computer readability formula designed for machine scoring . Journal of Applied Psychology , 60 ( 2 ), 283 – 284 . doi: 10.1037/h0076540 OpenUrl CrossRef Web of Science 57. ↵ McLaughlin , G. H . ( 1969 ). SMOG grading: A new readability formula . Journal of Reading , 12 ( 8 ), 639 – 646 . OpenUrl Web of Science 58. ↵ Weiss , B.D. ( 2007 ). Health literacy: a manual for clinicians . American Medical Association Foundation. http://lib.ncfh.org/pdfs/6617.pdf 59. ↵ Shoemaker , S. J. , Wolf , M. S. , & Brach , C . ( 2014 ). Development of the Patient Education Materials Assessment Tool (PEMAT): a new measure of understandability and actionability for print and audiovisual patient information . Patient education and counseling , 96 ( 3 ), 395 – 403 . doi: 10.1016/j.pec.2014.05.027 OpenUrl CrossRef PubMed 60. ↵ Ji , Z. , Lee , N. , Frieske , R. , Yu , T , Su , D. , Xu , Y. , Ishii , E. , Bang , Y. , Madotto , A. , & Fung , P . ( 2023 ). Survey of Hallucination in Natural Language Generation . ACM Comput. Surv ., 55 ( 12 ), 1 – 38 . doi: 10.1145/3571730 OpenUrl CrossRef 61. ↵ Thirunavukarasu , A. J. 
, Ting , D. S. J. , Elangovan , K. , Gutierrez , L. , Tan , T. F. , & Ting , D. S. W . ( 2023 ). Large language models in medicine . Nature medicine , 29 ( 8 ), 1930 – 1940 . doi: 10.1038/s41591-023-02448-8 OpenUrl CrossRef PubMed 62. ↵ Touvron , H. , Martin , L. , Stone , K. , Albert , P. , Almahairi , A. , Babaei , Y. , Bashlykov , N. , Batra , S. , Bhargava , P. , Bhosale , S. , Bikel , D. , Blecher , L. , Canton Ferrer , C. , Chen , M. , Cucurull , G. , Esiobu , D. , Fernandes , J. , Fu , J. , Fu , W. , … Scialom , T . ( 2023 ). Llama 2: Open Foundation and Fine-Tuned Chat Models . arXiv . doi: 10.48550/arXiv.2307.09288 OpenUrl CrossRef 63. ↵ Gao , Y. , Xiong , Y. , Gao , X. , Jia , K. , Pan , J. , Bi , Y. , Dai , Y. , Sun , J. , Wang , M. , & Wang , H . ( 2024 ). Retrieval-Augmented Generation for Large Language Models: A Survey . ArXiv . doi: 10.48550/arXiv.2312.10997 OpenUrl CrossRef 64. ↵ Wei , J. , Wang , X. , Schuurmans , D. , Bosma , M. , Ichter , B. , Xia , F. , Chi , E. , Le , Q. , & Zhou , D . ( 2023 ). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models . ArXiv . doi: 10.48550/arXiv.2201.11903 OpenUrl CrossRef 65. ↵ Davis , T. C. , & Wolf , M. S . ( 2004 ). Health literacy: implications for family medicine . Family medicine , 36 ( 8 ), 595 – 598 . OpenUrl PubMed Web of Science 66. ↵ CHART Collaborative ( 2024 ). Protocol for the development of the Chatbot Assessment Reporting Tool (CHART) for clinical advice . BMJ open , 14 ( 5 ), e081155 . doi: 10.1136/bmjopen-2023-081155 OpenUrl Abstract / FREE Full Text 67. ↵ Baumgärtner , K. , Byczkowski , M. , Schmid , T. , Muschko , M. , Woessner , P. , Gerlach , A. , Bonekamp , D. , Schlemmer , H. P. , Hohenfellner , M. , & Görtz , M . ( 2024 ). Effectiveness of the Medical Chatbot PROSCA to Inform Patients About Prostate Cancer: Results of a Randomized Controlled Trial . European urology open science , 69 , 80 – 88 . doi: 10.1016/j.euros.2024.08.022 OpenUrl CrossRef PubMed 68. ↵ Ranji S. R . 
( 2024 ). Large Language Models-Misdiagnosing Diagnostic Excellence? . JAMA network open , 7 ( 10 ), e2440901 . doi: 10.1001/jamanetworkopen.2024.40901 OpenUrl CrossRef PubMed 69. ↵ Schwartz W. B . ( 1970 ). Medicine and the computer. The promise and problems of change . The New England journal of medicine , 283 ( 23 ), 1257 – 1264 . doi: 10.1056/NEJM197012032832305 OpenUrl CrossRef PubMed Web of Science 70. Anderson , J. G. , Jay , S. J. , Schweer , H. M. , & Anderson , M. M . ( 1986 ). Why doctors don’t use computers: some empirical findings . Journal of the Royal Society of Medicine , 79 ( 3 ), 142 – 144 . doi: 10.1177/014107688607900305 OpenUrl CrossRef PubMed Web of Science 71. ↵ Pelosi , A. J. , & Lewis , G . ( 1989 ). The computer will see you now . BMJ (Clinical research ed .), 299 ( 6692 ), 138 – 139 . doi: 10.1136/bmj.299.6692.138 OpenUrl FREE Full Text 72. ↵ Zilber , A . ( 2025 ). Bill Gates says AI will replace doctors, teachers within 10 years — and claims humans won’t be needed ‘for most things’. New York Post , Mar . 27 , 2025 . Accessed: Apr. 5, 2025 . [Online]. Available: https://nypost.com/2025/03/27/business/bill-gates-said-ai-will-replace-doctors-teachers-within-10-years/ OpenUrl 73. ↵ Montemayor , C. , Halpern , J. , & Fairweather , A . ( 2022 ). In principle obstacles for empathic AI: why we can’t replace human empathy in healthcare . AI & society , 37 ( 4 ), 1353 – 1359 . doi: 10.1007/s00146-021-01230-z OpenUrl CrossRef PubMed 74. ↵ Perry , A . AI will never convey the essence of human empathy . Nat Hum Behav 7 , 1808 – 1809 ( 2023 ). doi: 10.1038/s41562-023-01675-w OpenUrl CrossRef PubMed 75. ↵ Ouyang , L. , Wu , J. , Jiang , X. , Almeida , D. , Wainwright , C. L. , Mishkin , P. , Zhang , C. , Agarwal , S. , Slama , K. , Ray , A. , Schulman , J. , Hilton , J. , Kelton , F. , Miller , L. , Simens , M. , Askell , A. , Welinder , P. , Christiano , P. , Leike , J. , & Lowe , R ( 2022 ). 
Training language models to follow instructions with human feedback . ArXiv . doi: 10.48550/arXiv.2203.02155 OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted August 07, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Evaluating the Efficacy of Large Language Models in Addressing Patient-Centric Inquiries in Multiple Cancers Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Evaluating the Efficacy of Large Language Models in Addressing Patient-Centric Inquiries in Multiple Cancers Soheila Borhani , Xiaoqian Jiang medRxiv 2025.08.05.25332968; doi: https://doi.org/10.1101/2025.08.05.25332968 Share This Article: Copy Citation Tools Evaluating the Efficacy of Large Language Models in Addressing Patient-Centric Inquiries in Multiple Cancers Soheila Borhani , Xiaoqian Jiang medRxiv 2025.08.05.25332968; doi: https://doi.org/10.1101/2025.08.05.25332968 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15228) Forensic Medicine (30) 
Gastroenterology (1124) Genetic and Genomic Medicine (6599) Geriatric Medicine (668) Health Economics (997) Health Informatics (4536) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3332) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9231) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a0065d2c38e6ad07',t:'MTc3OTU2MzQ2MA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00