Full text
19,044 characters
· extracted from
preprint-html
· click to expand
Evaluating a Medical-Grade Voice AI for Patient and Caregiver Guidance: A Multi-Scenario Nurse Panel Study | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Evaluating a Medical-Grade Voice AI for Patient and Caregiver Guidance: A Multi-Scenario Nurse Panel Study Sahitya Sridhar , Rajashekar Vasantha , Supreet Deshpande , Parul Pathak doi: https://doi.org/10.1101/2025.09.18.25336107 Sahitya Sridhar 1 SynthioLabs , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: sahitya{at}synthiolabs.com Rajashekar Vasantha 1 SynthioLabs , San Francisco, CA, USA 
Find this author on Google Scholar Find this author on PubMed Search for this author on this site Supreet Deshpande 1 SynthioLabs , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Parul Pathak 1 SynthioLabs , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Medical-grade conversational AI offers the potential to extend patient support programs (PSPs) in regulated therapeutic areas, but its safety and reliability must be rigorously evaluated. We conducted a large-scale, nurse-led assessment of a voice-based AI system across 30 patient and caregiver scenarios spanning diabetes, oncology, neurology, cardiometabolic disease, and rare disorders. Nearly 1,000 U.S.-licensed nurses role-played patients or caregivers in more than 20,000 interactions, scoring the AI across five domains: clinical accuracy, empathy, communication clarity, appropriateness of advice, and compliance with evidence. The system achieved over 97% top ratings across all domains, with empathy noted most strongly in oncology and rare disease caregiving contexts, and clarity reflecting minor opportunities in pacing and call-closing behavior. Qualitative feedback emphasized tone, personalization, and regulatory compliance as consistent strengths. These findings demonstrate that voice-based AI can safely and effectively support patient and caregiver interactions, suggesting readiness for scaled deployment in pharma-led PSPs to expand after-hours coverage, provide consistent patient engagement, and generate feedback loops to inform both AI and human nurse training. 1. Introduction Digital health interventions are increasingly used to support patients outside of traditional clinical encounters. 
In particular, conversational AI offers the possibility of scaling engagement while maintaining clinical-grade safety and empathy. However, regulatory concerns around accuracy, off-label avoidance, and patient trust have limited adoption in high-stakes settings such as patient support programs (PSPs). This study aimed to rigorously evaluate a medical-grade voice AI system developed for life sciences applications. We designed a large-scale, nurse-led panel assessment to measure its performance in real-world therapeutic contexts, focusing on safety-critical domains such as accuracy, empathy, clarity, appropriateness of advice, and risk of inaccurate claims. 2. Methods 2.1 Study Design The evaluation involved 30 patient and caregiver scenarios spanning oncology, cardiometabolic, neurology, dermatology, and rare disease conditions. Scenarios were derived from common PSP interactions, including missed doses, cold-chain travel, pediatric administration, and caregiver stress management. 2.2 Participants A total of ∼1,000 U.S.-licensed nurses participated. Nurses were recruited across diverse states and specialties, with an average of 20+ years of clinical experience. Each nurse was randomly assigned scenarios to role-play as patients or caregivers. 2.3 Procedure Briefing: Nurses received standardized role-play instructions for each scenario. Interaction: Each nurse interacted live with the AI voice system in real time. Evaluation: After the interaction, nurses scored the AI across five domains: clinical accuracy, empathy, communication clarity, appropriateness of advice, and risk of inaccurate claims. Qualitative Feedback: Free-text notes were collected to capture narrative impressions and improvement suggestions. Compliance Checks: Nurses flagged any off-label promotion, lack of escalation, or inappropriate safety language. 2.4 Measures Clinical Accuracy: Concordance with product labeling and standard of care. Empathy: Validating patient emotions, offering reassurance. 
Communication Clarity: Use of plain language and logical sequencing. Appropriateness of Advice: Staying within the AI’s informational role, escalating when necessary. Compliance with Evidence: Avoiding overstatements or unfounded certainty. 2.5 Feedback-to-Improvement Cycle To ensure continuous refinement of the AI system, we implemented a structured six-step loop: ( 1 ) scenario role-play by licensed nurses, ( 2 ) evaluator feedback, ( 3 ) challenge identification, ( 4 ) model refinement, ( 5 ) retesting and validation, and ( 6 ) performance uplift. This cyclical process ( Figure 1 ) supported rapid iteration while maintaining compliance and safety standards. Download figure Open in new tab Figure 1. Feedback-to-Improvement Cycle. A six-step cycle showing how nurse role-play evaluations fed into model refinement. Steps included ( 1 ) Scenario Role-Play, ( 2 ) Evaluator Feedback, ( 3 ) Challenge Identification, ( 4 ) Model Refinement, ( 5 ) Retesting & Validation, and ( 6 ) Performance Uplift. 3. Results 3.1 Quantitative Outcomes Nurses rated each interaction across five domains using a three-point scale : Top ( 1 ): Full demonstration of the domain (e.g., clinically accurate, empathetic, or clear with no gaps). Mid ( 2 ): Acceptable performance with minor limitations (e.g., slightly rushed pacing, somewhat abrupt escalation). Low ( 3 ): Missed expectations, requiring correction (e.g., unclear instruction, missed escalation opportunity). Across 20,000+ evaluated interactions: View this table: View inline View popup 3.2 Domain Trends Accuracy & Advice: Strongest performance across all therapeutic clusters, especially in dosing and adherence contexts. Empathy: Widely praised in oncology and rare disease caregiving scenarios. Clarity: Slightly lower scores linked to rapid pacing and early call wrap-ups. 
3.3 Qualitative Strengths Regulatory Compliance: “ No off-label promotion; consistent escalation when needed.” Empathy: “ Tone was calm and reassuring.” Contextual Adaptation: “ It adjusted well to caregiver vs. patient personas.” Safety Language: “ Clear framing of knowns vs. unknowns.” 3.4 Areas for Refinement Pacing: Nurses noted occasional rapid delivery during complex instructions. Closing Behavior: Calls sometimes ended before confirming additional patient questions. 4. Discussion This study provides large-scale evidence that voice-based AI can safely and effectively support patient and caregiver interactions across diverse therapeutic areas. The high top-box scores (>97% across all domains) demonstrate consistent clinical accuracy and empathy — critical dimensions for trust in regulated healthcare contexts. Notably, qualitative nurse feedback highlighted the AI’s ability to adapt communication style depending on persona (e.g., slower and calmer with grandparents, concise with teenagers). This adaptability suggests potential for personalization at scale. Opportunities for refinement include improving pacing during procedural guidance and reinforcing structured call-closing prompts. However, these areas represented <3% of total feedback and did not undermine overall performance. Data Availability All data produced in the present study are available upon reasonable request and will be considered on a case-by-case basis by the corresponding authors. 5. Implications for PSPs The findings of this study suggest several practical applications for pharmaceutical patient support programs (PSPs). Scalability emerged as a core strength: the AI was able to handle a high volume of simulated patient interactions without loss of accuracy, pointing toward its potential as a force multiplier for nurse call centers. 
After-hours coverage is another key implication; by providing consistent, empathetic guidance outside of traditional nurse availability, the system can reduce patient anxiety and ensure continuity of support. The AI also functions as a training tool , with aggregated nurse feedback highlighting opportunities to refine human PSP scripts and educational content. Finally, the system demonstrated strong regulatory readiness , consistently aligning responses with approved product labeling and escalation standards — an essential requirement for deployment in life sciences contexts. 6. Future Work Future development of the system will focus on extending both accessibility and personalization. A multilingual rollout is planned, beginning with Spanish, French, and Mandarin, to address diverse patient populations and reduce language barriers in PSP delivery. Enhancements in behavioral personalization will allow the AI to adapt tone, pacing, and level of detail to the patient’s communication style, thereby improving trust and comprehension. Finally, deeper metrics are needed to complement evaluator scoring, including structured post-call comprehension testing to assess how well patients retain information provided during AI interactions. Together, these directions point toward a more adaptive and inclusive AI system that can expand the reach and impact of patient support services. 7. References • ↵ Bickmore TW , Schulman D , Sidner CL . Automated interventions for multiple health behaviors using conversational agents . Patient Educ Couns . 2013 ; 92 ( 2 ): 142 – 142 . OpenUrl CrossRef PubMed • ↵ Miner AS , Milstein A , Schueller S , Hegde R , Mangurian C , Linos E. Smartphone-based conversational agents and responses to questions about mental health, interpersonal violence, and physical health . JAMA Intern Med . 2016 ; 176 ( 5 ): 619 – 619 . OpenUrl PubMed • ↵ Laranjo L , Dunn AG , Tong HL , et al. Conversational agents in healthcare: a systematic review . 
J Am Med Inform Assoc . 2018 ; 25 ( 9 ): 1248 – 1248 . OpenUrl CrossRef PubMed • ↵ Bibault J-E , Chaix B , Guillemassé A , et al. A chatbot versus physicians to provide information for patients with breast cancer: blind, randomized controlled noninferiority trial . J Med Internet Res . 2019 ; 21 ( 11 ): e15787 . OpenUrl PubMed • ↵ Lin SY , Mahoney MR , Sinsky CA . Ten ways artificial intelligence will transform primary care . J Gen Intern Med . 2019 ; 34 ( 8 ): 1626 – 1626 . OpenUrl CrossRef PubMed • ↵ Topol EJ . High-performance medicine: the convergence of human and artificial intelligence . Nat Med . 2019 ; 25 ( 1 ): 44 – 44 . OpenUrl CrossRef PubMed • Kocaballi AB , Quiroz JC , Rezazadegan D , Berkovsky S , Magrabi F , Coiera E , Laranjo L. Responses of conversational agents to health and lifestyle prompts: investigation of appropriateness and presentation structures . J Med Internet Res . 2020 ; 22 ( 2 ): e15823 . OpenUrl PubMed • Bibault J-E , Petit S , Chaix B. Healthcare ex Machina: Are conversational agents ready for prime time in oncology? Clin Transl Radiat Oncol . 2019 ; 16 : 55 – 59 . OpenUrl PubMed • World Health Organization . WHO guideline: recommendations on digital interventions for health system strengthening. World Health Organization ; 2019 . • Berner ES , La Lande TJ . Overview of Clinical Decision Support Systems . In: Berner ES , ed. Clinical Decision Support Systems: Theory and Practice . Springer, Cham ; 2016 . • IQVIA Institute . Digital Health Trends 2021: Innovation, Evidence, Regulation, and Adoption. IQVIA Institute for Human Data Science ; 2021 . • Indegene . Transform patient experience with intelligent hyper-personalization. Whitepaper. Indegene ; 2021 . View the discussion thread. Back to top Previous Next Posted September 19, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. 
Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Evaluating a Medical-Grade Voice AI for Patient and Caregiver Guidance: A Multi-Scenario Nurse Panel Study Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Evaluating a Medical-Grade Voice AI for Patient and Caregiver Guidance: A Multi-Scenario Nurse Panel Study Sahitya Sridhar , Rajashekar Vasantha , Supreet Deshpande , Parul Pathak medRxiv 2025.09.18.25336107; doi: https://doi.org/10.1101/2025.09.18.25336107 Share This Article: Copy Citation Tools Evaluating a Medical-Grade Voice AI for Patient and Caregiver Guidance: A Multi-Scenario Nurse Panel Study Sahitya Sridhar , Rajashekar Vasantha , Supreet Deshpande , Parul Pathak medRxiv 2025.09.18.25336107; doi: https://doi.org/10.1101/2025.09.18.25336107 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical 
Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9220) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffd7015cad006fb',t:'MTc3OTQ2OTg2Mw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.