Full text
80,630 characters
· extracted from
preprint-html
· click to expand
Towards Richer AI-Assisted Psychotherapy Note-Making and Performance Benchmarking | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Towards Richer AI-Assisted Psychotherapy Note-Making and Performance Benchmarking View ORCID Profile Prottay Kumar Adhikary , Sahajpreet Singh , View ORCID Profile Suruchi Singh , View ORCID Profile Panna Sharma , View ORCID Profile Pankhuri Soni , View ORCID Profile Rashmi Choudhary , View ORCID Profile Charu Saxena , View ORCID Profile Prachi Chauhan , View ORCID Profile Swati Kedia Gupta , View ORCID Profile Koushik Sinha Deb , View ORCID Profile Salam Michael Singh , View ORCID Profile Tanmoy Chakraborty doi: https://doi.org/10.1101/2025.06.25.25330252 Prottay Kumar Adhikary 1 Department of Electrical Engineering, Indian Institute of Technology , Delhi, New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Prottay Kumar Adhikary Sahajpreet Singh 4 School of Computing, National University of Singapore , Singapore Find this author on Google Scholar Find this author on PubMed Search for this author on this site Suruchi Singh 3 Department of Psychiatry, All India Institute of Medical Sciences , New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Suruchi Singh Panna Sharma 3 Department of Psychiatry, All India Institute of Medical Sciences , New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Panna Sharma Pankhuri Soni 3 Department of Psychiatry, All India Institute of Medical Sciences , New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Pankhuri Soni Rashmi Choudhary 3 Department of Psychiatry, All India Institute of Medical Sciences , New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Rashmi Choudhary Charu Saxena 3 Department of Psychiatry, All India Institute of Medical Sciences , New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Charu Saxena Prachi Chauhan 3 Department of Psychiatry, All India Institute of Medical Sciences , New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Prachi Chauhan Swati Kedia Gupta 3 Department of Psychiatry, All India Institute of Medical Sciences , New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Swati Kedia Gupta Koushik Sinha Deb 3 Department of Psychiatry, All India Institute of Medical Sciences , New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Koushik Sinha Deb Salam Michael Singh 1 Department of Electrical Engineering, Indian Institute of Technology , Delhi, New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Salam Michael Singh Tanmoy Chakraborty 1 Department of Electrical Engineering, Indian Institute of Technology , Delhi, New Delhi, India 2 Yardi School of Artificial Intelligence, Indian Institute of Technology , Delhi, New Delhi, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tanmoy Chakraborty For correspondence: tanchak{at}iitd.ac.in Abstract Full Text Info/History Metrics Data/Code Preview PDF ABSTRACT Psychotherapy note-making is crucial for effective patient care. However, traditional formats such as SOAP (Subjective, Objective, Assessment, and Plan) and BIRP (Behavior, Intervention, Response, and Plan) often fail to capture the nuanced complexities of therapeutic sessions, as they primarily focus on surface-level details and lack a comprehensive understanding of the patient’s history, mental status, and therapeutic process. While recent advances in Artificial Intelligence (AI) and Large Language Models (LLMs) show promise in clinical documentation, their application in psychotherapy note summarisation remains unexplored. We present iCARE ( i dentifiers, C hief Concerns and Clinical History, A ssessment and Analysis, R isk and Crisis, E ngagement and Next Steps), a comprehensive framework for AI-assisted psychotherapy documentation that addresses these limitations. iCARE comprises of 17 clinically relevant aspects, developed collaboratively with mental health professionals, and aligned with established guidelines. We further introduce PATH ( P sychotherapy A spects and T reatment H istory summary), a novel dataset of annotated therapy sessions. Through extensive benchmarking with 11 LLMs, including both open and closed-source models, we evaluate their performance across different note-taking aspects using automatic and human evaluation metrics. Our results show that closed-source models like Gemini Pro and GPT4o-mini excel in various aspects, with Gemini Pro achieving superior human evaluation scores. Notably, all models struggle with temporal reasoning and complex therapeutic interpretations. The findings suggest that current LLMs can assist in basic documentation but require improvements in handling longitudinal therapeutic relationships and aspects that require deeper clinical understanding and interpretative reasoning. This work advances mental health care documentation while emphasising the need for continued clinical expertise in psychotherapy note summarisation. Introduction Documenting mental health through psychotherapy notes is a vital part of the therapeutic process, supporting effective communication between therapists and clients while ensuring adherence to ethical and legal standards. Thoughtful documentation not only monitors client progress and informs treatment planning, but also reflects the depth and evolution of the therapeutic relationship. With enhanced awareness and an increase in treatment-seeking, there is a growing need to re-examine traditional note-taking practices and develop more comprehensive frameworks that capture both static and dynamic aspects of patient information and therapeutic processes over time. Structured clinical note-writing came into the forefront with the development of the Problem-Oriented Medical Records (POMR) system by Dr. Lawrence Weed in the 1960s 1 – 4 . Designed to standardise and organise complex clinical information, POMR soon led to the introduction of SOAP (Subjective, Objective, Assessment, Plan) 5 notes, which offered a systematic way to document clinical information. Adaptations of this structured approach led to the development of other formats like DAP (Data, Assessment, Plan) 6 , BIRP (Behaviour, Intervention, Response, Plan) 7 , and DART (Description, Assessment, Response, Treatment) 8 , 9 . as shown in Table 1 . While these structured formats provide therapists with outlines that facilitate documentation and promote accountability, they share a significant drawback of overlooking the subtle emotional and contextual nuances of the patient, and therapist’s exchange. As a result, critical therapeutic insights can be lost and the evolving dynamics of the therapeutic relationship can be underrepresented. View this table: View inline View popup Download powerpoint Table 1. Commonly used psychotherapy note-making formats. To answer the question, “can a computer take psychiatric history?” , a computer program was designed in 1983 to collect personal history from patients admitted to a general psychiatry ward 10 . Since then, several attempts have been made to automate and help psychotherapeutic history-taking. In addition, attempts to educate and establish guidelines for psychologists to make decisions about professional record keeping have also evolved over time 11 . Accurate and comprehensive record keeping is vital in ensuring high-quality care and maintaining secure, detailed, and organised clinical records by medical professionals 12 . Maintaining separate psychotherapy notes from general patient records has been emphasised to protect client privacy and adhere to ethical standards 13 . At the same time, research has highlighted the lack of structured documentation guidelines in specialised areas like marital and family therapy 14 . The transition to Electronic Health Records (EHRs) marked a significant shift in clinical documentation practices. While EHRs introduced benefits like improved accessibility and standardised formats. However, they also brought new ethical considerations and potential challenges in maintaining the quality and confidentiality of clinical notes 15 . A systematic review of taking medical history using EHRs revealed both the advantages and complexities of digital record-keeping systems 16 . Efforts to enhance documentation efficiency led to innovations like the APSO (Assessment, Plan, Subjective, Objective) format, which demonstrated higher healthcare provider satisfaction than the traditional SOAP notes 17 , suggesting that reordering information can improve usability. In psychotherapy, note-taking practices must balance thorough documentation with client confidentiality. Recent studies have explored practical aspects of note-writing in mental health settings using video scenarios, providing insights into the practical aspects of clinical documentation in mental health settings 18 and examined the nuances of psychotherapy documentation 19 , discussing ethical and practical considerations that therapists must navigate. Integrating artificial intelligence (AI) has opened new frontiers in psychotherapy documentation. Recent research has demonstrated AI’s potential through scoping reviews of patient information summarization 20 , highlighting the potential of AI to improve the efficiency and quality of clinical documentation. Advanced frameworks have emerged, including knowledge-infused abstractive summarisation for clinical diagnostic interviews to generate concise summaries while preserving essential information 21 . Further advancements include using large language models (LLMs) for psychotherapy session summarisation by structuring the conversations along multiple dimensions 22 , pushing the application of AI in the realm of psychotherapy documentation 23 . AI-based systems have also been developed to support clinicians by streamlining note-taking processes 24 , with LLMs showing promise in improving productivity and enabling automatic generation of clinical notes 25 . These innovations suggest that LLMs are poised to transform how psychotherapy documentation is approached. Despite these advancements, significant gaps remain in psychotherapy documentation. Current formats struggle to capture the nuanced and longitudinal nature of therapeutic relationships. Existing formats fail to capture the depth and evolving nature of therapeutic relationships, often lacking the flexibility to accommodate varied clinical approaches or the detail needed to reflect the emotional and contextual nuances of therapist-patient interactions. Moreover, while AI shows promise in clinical documentation, its application in psychotherapy note-taking remains largely unexplored, with no standardised frameworks for evaluation or comprehensive datasets for training and assessment. To address these limitations, we make several contributions as follows ( Figure 1 presents an overview of our entire methodology): Download figure Open in new tab Figure 1. An overview of the methodology employed in this study. (A) The formation of the PATH dataset illustrates the sequential steps involved in preprocessing raw psychotherapy conversation data to prepare it for annotation. This includes data cleaning, segmentation into meaningful units, and structuring according to therapeutic contexts. (B) The annotation process follows the proposed Psychotherapy Note-Taking format, ensuring systematic labelling of dialogue acts, therapeutic techniques, and session metadata. This structured annotation framework aligns with clinical best practices and enhances interpretability. (C) Benchmarking leveraging diverse prompting techniques applied to multiple state-of-the-art language models. These prompts are tailored to elicit model responses that align with task-specific requirements. (D) Evaluation metrics are employed to assess model performance rigorously, incorporating quantitative metrics alongside qualitative assessments to ensure alignment with clinical and therapeutic standards. First, we present iCARE ( i dentifiers, C hief Concerns and Clinical History, A ssessment and Analysis, R isk and Crisis, E ngagement and Next Steps), a novel and a clinically grounded note-taking framework for psychotherapy documentation, designed in alignment with the Mental Healthcare Act, 2017 26 . It incorporates both intake and progress components and provides a structured yet flexible alternative to traditional formats. It comprises 17 clinically validated aspects that cover the complete therapeutic process in collaboration with mental health professionals. This framework captures essential elements such as patient demographics, clinical identifiers, temporal progression (past sessions, current status, future plans), clinical assessments, diagnostic information, therapeutic techniques and their effectiveness, and crisis markers. The framework seamlessly integrates with existing clinical guidelines while providing mechanisms for tracking therapeutic progress and treatment outcomes. Second, using our framework, we introduce PATH ( P sychotherapy A spects and T reatment H istory summary), a novel dataset of annotated therapy sessions. The dataset is based on pre-recorded counselling videos between therapy-patient conversations from YouTube. We employed comprehensive transcription and diarization, followed by the validation of the transcription and annotation to the iCARE format by clinical experts. The dataset features rich annotations covering speaker diarization with precise timestamps and annotated aspects. Our multi-stage validation process ensures the highest data quality, making it suitable for AI applications in the clinical domain. Third, through extensive benchmarking using 11 LLMs (both open and closed source), we provide valuable insights into the current state of AI capabilities in mental health documentation. Our benchmarking evaluation by the experts demonstrates that while closed-source models like Gemini Pro and GPT4o-mini excel in different aspects, with Gemini Pro achieving superior human evaluation scores, all models struggle with temporal reasoning and complex therapeutic interpretations. Our findings indicate that while current LLMs are capable of supporting basic mental health documentation tasks, they exhibit limitations in capturing the longitudinal nuances and interpretative depth inherent in therapeutic relationships. This work establishes a foundation for the future development of more sophisticated documentation tools and underscores the enduring need for clinical expertise in accurately summarizing and contextualizing psychotherapy notes. iCARE : Our Proposed Psychotherapy Note-Taking Format To address the limitations inherent in traditional psychotherapy note-making formats, we propose iCARE , a comprehensive, clinician-centered documentation framework that aligns with the Mental Health Care Act, 2017 (MHCA) 26 of India and the guidelines set forth by the American Psychological Association (APA). This structured yet flexible format promotes thorough documentation while allowing practitioners to create nuanced narratives that reflect the complexities of each therapeutic session. By including diverse categories such as therapist reflections, psychotherapy techniques used, and longitudinal tracking of clinical progress, iCARE aims to improve both therapeutic techniques and outcomes over time. Its adaptable structure allows therapists to tailor documentation to their specific practice needs while adhering to established guidelines, ultimately enhancing the quality of clinical records and therapy outcomes for both therapists and clients. Although SOAP, BIRP, and DAP formats offer general frameworks to document therapy sessions, they often fail to capture the full complexity of psychotherapeutic care. Originating primarily for medical note-taking and interdisciplinary communication, these formats provide brief, high-level summaries and often omit psychotherapy-specific elements such as the type of therapy delivered, specific techniques used during sessions, crisis markers, clinician reflections, and the use of assessments across sessions. iCARE explicitly addresses these gaps by encompassing structured fields for patient particulars, referral information, therapist identifiers, crisis markers, past session summaries, therapy type, techniques used, issues discussed, and therapist reflections. Many of these aspects are often assumed to be retrievable from hospital records but are inconsistently documented, undermining clinical care and research efforts. Table 2 shows that iCARE systematically covers a wider and clinically richer set of aspects than traditional formats, establishing a more comprehensive foundation for psychotherapy documentation. View this table: View inline View popup Download powerpoint Table 2. Comparison between traditional note-taking formats (mentioned in Table 1 ) and iCARE . Existing formats often lack fields for key psychotherapy aspects, while iCARE provides a comprehensive structure covering identifiers, clinical history, therapeutic process, and planning. A major strength of iCARE lies in its flexibility and modular design, allowing clinicians to map detailed session notes into simpler formats like SOAP or BIRP when needed, while preserving the richness and continuity essential for therapeutic work. By integrating structured history-taking, psychotherapy interventions, crisis markers, therapist reflections, and longitudinal tracking into one system, iCARE bridges the gap between clinical depth and record-keeping convenience. A comparative table is provided to highlight how iCARE extends the capabilities of traditional formats, offering a more holistic framework for psychotherapy documentation. There are 17 major aspects in the proposal based on which a psychotherapy session is summarised, which are given below: Patient Particulars include information related to the patient, which covers socio-demographic details such as name, age, sex, contact information, etc. Clinical Identifiers is associated with the patient’s clinical records and hospital interactions. These included hospital name, ID, OPD/inpatient/Telepsychiatry/Clinic, and other relevant information. Referral Information enlists the reasons for referral, the referee’s name, department and hospital, and any other relevant notes regarding the previous treatment. Therapist Information details the therapist/clinician providing the counselling session, including the therapist’s name, clinical department, current hospital, and other relevant information. Past Session Information is documentation and notes from previous therapy sessions that can provide context for ongoing treatment. This includes the date of past sessions, topics discussed, challenges, and any treatment adjustments made. Presenting Complaints (Symptoms) are the description of “Presenting Symptoms” and “Chief Complaints” and their duration. History is the comprehensive overview of a patient’s personal, medical, and mental health background relevant to the major presenting complaints. It also includes, but is not limited to, past traumas and significant life events that may affect the patient’s current mental state. Crisis Markers are the indicators that suggest that the patient may be at risk of harming themselves or others. This can include recent suicidal thoughts or attempts, self-harm, abuse, etc. Current Mental Status Examination is an assessment of the most recent mental state (i.e., in the current counselling session), which includes the observation of the patient’s mood, thought processes, cognitive function, and other behaviours. Psychotherapy Type defines the type of psychotherapy used in the treatment. Some examples are Cognitive Behavioural Therapy (CBT), Dialectical Behaviour Therapy (DBT), psychodynamic therapy, and humanistic therapy. Also, a short explanation of the choice of therapy type is provided. Psychotherapy Technique refers to methods employed within the chosen psychotherapy type. This includes cognitive restructuring, exposure therapy, mindfulness practices, role-playing, and other specific interventions. Assessments are used to assess the patient’s mental health and treatment progress, including psychological assessments, standardised tests (e.g. Beck Depression Inventory), clinical evaluations, etc. Issues Discussed in Current Session refers to special topics and concerns addressed during the most recent session. It includes detailed notes on the main issues discussed, progress made, etc. Reflections by the Therapist are the insights and observations made by the therapist regarding the session. This includes notes on the major issues and any new challenges, etc. Clinical Diagnosis by Reviewer includes reviews conducted by qualified psychologists, updated diagnoses, and recommendations based on the review findings. Action Plan is a structured plan for ongoing treatment and goals for the patient. Therapy goals, planned interventions, timelines, follow-up actions, etc., are some of the significant components of the action plan. Next Session Details is the information related to the date, time, location, mode of therapy (in person or telehealth), and homework (if any) required for the next session. PATH : Our Proposed Dataset Our study is based on the HOPE dataset 27 , which consists of pre-recorded counselling videos between therapy-patient conversations from YouTube. However, we observed some instances of incomplete transcripts in the HOPE dataset. To address this limitation, we re-transcribe and diarize the audio content from these videos. Dialogue Transcripts We first downloaded the YouTube videos and extracted the audio, ensuring compatibility with the transcription tool. Specifically, for the transcription and speaker diarization, we used WhisperX 28 , a speech transcription model based on OpenAI’s Whisper 29 . WhisperX is an extension of OpenAI’s Whisper model that integrates speaker diarization and word-level timestamp generation. It combines the robust transcription capabilities of Whisper with additional processing for speaker identification and precise timing information, utilising voice activity detection and forced phoneme alignment. Transcribing and diarizing videos in the HOPE dataset using WhisperX involves the following steps: (i) Audio Preprocessing: The audio is prepared for input into the WhisperX model. (ii) Transcription: WhisperX generates a transcript of the audio content. (iii) Speaker Diarization: The tool identifies and separates different speakers in the audio using pyannote 30 in the backend. (iv) Word-level Timestamp Generation: WhisperX assigns precise timestamps to each transcribed word. (v) Speaker assignment: Transcribed segments are associated with identified speakers. To this end, we curated a comprehensive representation of the therapist-patient conversations in a structured format, which includes a word-level timestamp of each utterance demarcating the starting and the end time, speaker tag (either patient or therapist) and the transcribed text. Furthermore, to ensure the highest quality of our dataset, we conducted rigorous manual validation and post-processing phases after automatic transcription and diarization. This process involved carefully verifying the transcriptions against the original videos, identifying and correcting utterance errors, and confirming the correct speaker assignment for each utterance. We meticulously detected and transcribed any missed text and refined utterances to ensure coherence and accuracy while preserving the original timestamps to maintain temporal integrity. Additionally, we removed any erroneously repeated utterances during the automatic transcription process. This way, we aimed to ensure the dataset’s quality after the automatic process. Annotating as per iCARE Psychotherapy Note Taking Format To ensure clinical utility and relevance, we followed a collaborative development process with mental health professionals, both psychiatrists and clinical psychologists. A shared consensus emerged around the need to first document psychotherapy sessions in the form of a structured clinical history, which is a format that captures both the baseline intake data, followed by a session-wise progress in a unified and comprehensive manner. While widely used note-taking templates like SOAP, DAP and BIRP offer structured formats for ongoing therapy documentation, they are more suited for tracking session-wise progress after the initial clinical formulation has been established. These formats tend to emphasise session-specific reflections and treatment adjustments rather than grounding the therapist in a contextualised understanding of the patient. They fall short when used as standalone formats for initial documentation. Structured clinical history formats, which are usually taught in psychiatry and medical training, help capture the important information about a patient required across the treatment process. These types of notes are easier for clinicians to assess, since writing and evaluating structured histories is a routine part of clinical training. Therefore, summarisation into structured history occupies the sweet niche between brevity and relevance and will be helpful in a broader range of clinical situations. We manually summarised the session dialogues into structured history notes to generate a gold standard for comparison. Two clinical psychologists and a psychiatrist generated these notes under the supervision of a faculty psychiatrist with 16 years of clinical experience and a faculty psychologist with 10 years of clinical experience. To this end, we present the multimodal dataset, PATH annotated according to our proposed framework, iCARE . Table 3 presents the statistics of the PATH dataset. View this table: View inline View popup Download powerpoint Table 3. Dataset Statistics. A quantitative overview of session-level and utterance-level details from therapy conversations, with parameters including total conversation count, utterance distribution between patients and therapists, and average talk time. View this table: View inline View popup Download powerpoint Table 4. Aspect-specific performance ranking of LLMs. We show top three models on aggregated scores of BLEU, METEOR, ROUGE-L, BERTScore, BLEURT and InfoLM w.r.t each aspect based on one-shot prompting. Results We experimented using our PATH dataset containing 174 conversations and 27,200 utterances structured according to the 17 aspects in the iCARE framework. We employed 11 state-of-the-art LLMs, including both open (Gemma 31 , Llama 3.1 32 , Llama 3 32 , MentalLLama 33 , Mistral 34 , Orca 2 35 , Phi 3 36 , Vicuna 37 , and Zephyr 38 ) and closed-source (GPT4o-mini 39 , and Gemini Pro 40 ) models, utilizing zero-shot and one-shot prompting techniques (see Methods for the detailed description). We employed both the automatic and human evaluation metrics. The automatic evaluation uses a combination of lexical metrics (BLEU 41 , METEOR 42 , ROUGE-L 43 ) and semantic metrics (BERTScore 44 , BLUERT 45 , InfoLM 46 ) to assess model performance comprehensively (see Section S1.1 in Supplementary Information for the detailed information on the automatic evaluation metrics). All the scores are reported without applying lowercasing or additional preprocessing to the predictions or references. Additionally, human evaluation by clinical experts provides insights into practical utility across five key dimensions: accuracy, relevance, comprehensiveness, clarity, and safety (see Methods for their detailed descriptions). We start this section by reporting the automatic scores and observations, followed by the human evaluation results. Automatic Evaluation We present our findings on the performance of various language models in generating psychotherapy notes across different aspects of the counselling session. Our analysis covers zero-shot and one-shot settings using a range of evaluation metrics. Full automatic evaluation scores are provided in the Supplementary Information (see Table S2 for zero-shot results and Table S3 for one-shot results). Improvement from Zero-Shot to One-Shot Across both the zero-shot (no example is given in the prompt) and one-shot (one example is given in the prompt) settings, the closed-source models viz GPT4o-mini and Gemini Pro outperformed the open-source models. In the zero-shot setting, GPT4o-mini and Gemini Pro consistently outperformed the other LLMs in most aspects. For example, in the ‘Assessments’ aspect, GPT4o-mini achieved a ROUGE-L score of 0.854 and a BERTScore of 0.972. Similarly, Gemini Pro gives a competitive performance with scores of 0.841 and 0.971 for the ROUGE-Land BERTScore, respectively. Mistral and Orca 2 demonstrated competitive performance among open-source models, particularly in aspects like ‘ Current Mental Status Examination ’ where Mistral scored 0.213 ROUGE-L and 0.870 BERTScore, while Orca 2 achieved scores of 0.182 and 0.825, respectively. These scores highlight that, despite these open-source models lagging behind the stronger closed-source models, they can generate meaningful content for complex psychological aspects. Most models struggled with aspects like ‘Next Session Details’ and ‘Past Session Information’ . For instance, in ‘Next Session Details’ GPT4o-mini scored a BLEU score of zero and 2.881 InfoLM scores, while Mistral scored a BLEU score of zero and 2.416 InfoLM scores. The one-shot prompting led to notable improvements across the models in most aspects. For example, in GPT4o-mini, the performance in ‘Clinical Identifiers’ improved from ROUGE-L of 0.593 to 0.658 and BERTScore from 0.907 to 0.926. A similar trend was observed in Gemini Pro with its ROUGE-L for the same aspect improving from 0.256 to 0.442 and BERTScore from 0.822 to 0.868. This improvement is not limited to closed-source models; Mistral, an open-source model, also showed improvement in several aspects, providing competitive performance with closed-source models. However, there is a performance drop in some aspects. For example, in ‘Assessments,’ GPT4o-mini’s ROUGE-L decreased from 0.854 to 0.790, though it maintained a strong BERTScore (dropping slightly from 0.972 to 0.966). Mistral observed a similar performance drop in ‘Current Mental Status Examination,’ where its ROUGE-L score dropped from 0.213 to 0.191. However, its semantic structure remained relatively stable, as indicated by the BERTScore (0.870 to 0.863). These contrasting results show that the one-shot generally improves performance, but its effectiveness varies depending on the particular aspect of the psychotherapy notes. Performance Across Different Aspects of Psychotherapy Note We observed varying performances across different aspects of the psychotherapy notes, with some categories showing improvements while others presented some challenges. Notably, most models demonstrated higher performance in three aspects, viz ‘Patient Particulars,’ ‘Clinical Identifiers,’ and ‘Assessments’ ; meanwhile, ‘Next Session Details,’ ‘Past Session Information,’ and ‘History’ posed as challenging aspects. In ‘Patient Particulars,’ both GPT4o-mini and Gemini Pro achieved strong semantic scores, with Gemini Pro slightly outperforming GPT4o-mini with a higher ROUGE-L score (0.281 over 0.239) while maintaining similar BERTScores (0.856 and 0.851 respectively) in a one-shot setting. The models demonstrated a significant performance in ‘Clinical Identifiers,’ where GPT4o-mini achieved a ROUGE-L score of 0.658 and a high BERTScore of 0.926, followed by Gemini Pro with scores of 0.442 and 0.868, respectively. Similarly, the models showcased the summarising capabilities in ‘Assessments’ . Both achieved notably high performance, with GPT4o-mini achieving a ROUGE-L of 0.790 and a BERTScore of 0.966, while Gemini Pro followed closely with 0.785 and 0.940, respectively. These results suggest that the models can handle structured, factual information that requires less temporal or contextual reasoning. In contrast, some aspects proved consistently challenging for all models. The models struggled with temporal and forward-looking aspects of the notes such as the ‘Next Session Details’ . GPT4o-mini and Gemini Pro achieved a BLEU score of zero. Although Gemini Pro showed a higher alignment with the reference semantics indicated by a lower InfoLM score (1.372 compared to GPT4o-mini’s 2.805). Similarly, the Past Session Information aspect also challenged the models. Both GPT4o-mini and Gemini Pro again scored a BLEU score of zero and high InfoLM scores (3.004 for GPT4o-mini and 2.837 for Gemini Pro), indicating substantial deviation from gold reference texts. The ‘History’ aspect observed similar challenging low performance. These models with current in-context learning capabilities can handle structured, present-focused information. However, they still struggled significantly across the temporal and historical aspects, revealing the limitations of these models. Figure 2 presents a detailed visualisation of the performance patterns. The weighted scores across various aspect categories indicate the consistent superiority of closed-source models in interpretative tasks while also underscoring the common challenges all model architectures face in temporal reasoning tasks. Download figure Open in new tab Figure 2. Performance analysis of the top three LLMs (GPT4o-mini, Gemini Pro, and Mistral) across the different aspects of psychotherapy note generation. The first five plots from a to e show weighted scores (30% METEOR, 30% ROUGE-L, 40% BERTScore) for each aspect within categories: Basic Information, Temporal & Historical, Clinical Assessment, Session Content, and Treatment Planning. The final plot (f) presents the aggregated performance across all categories to give a holistic overview of the models’ effectiveness in handling different aspects. Note: Each category comprises specific aspects as follows: Basic Information (Patient Particulars (PP), Clinical Identifiers (CI), Therapist Information (TI), Referral Information (RI)); Temporal & Historical (Past Session Information (PS), Next Session Details (NS), History (HI)); Clinical Assessment (Current Mental Status Examination (CMS), Assessments (AS), Clinical Diagnosis by Reviewer (CD), Crisis Markers (CM)); Session Content (Presenting Complaints/Symptoms (PC), Issues Discussed in Current Session (ID), Reflections by Therapist (RT)); and Treatment Planning (Psychotherapy Type (PT), Psychotherapy Technique (PTC), Action Plan (AP)). Metric-specific Observations Our analyses across different evaluation metrics revealed nuanced insights into model performance. Our evaluation encompassed lexical similarity and semantic alignment with the reference. Across all models, there is a lower lexical similarity score, as highlighted by a consistently low BLEU score across all aspects, indicating the models’ challenges in achieving exact phrase matches with the reference. This is evident in ‘Psychotherapy Type,’ where GPT4o-mini and Gemini Pro achieved BLEU scores of only 0.225 and 0.205, respectively, in one-shot settings. However, the models demonstrated notably better performance with a relatively flexible lexical matching metric such as METEOR and ROUGE-L, where Gemini Pro achieved METEOR and ROUGE-L scores of 0.414 and 0.427, respectively against the BLEU score of 0.205 for the same aspect. In contrast, semantic similarity scores are higher than lexical metric scores across all the models’ aspects. BERTScore, in particular, showed strong semantic alignment in most aspects. This is evident in the ‘Assessments’ aspect, where both GPT4o-mini and Gemini Pro achieved significantly high scores, with GPT4o-mini achieving 0.966 and Gemini Pro following closely at 0.940 in contrast to the substantial lower lexical similarity scores of 0.398 and 0.400 measured by METEOR. However, BLUERT scores revealed areas for improvement, particularly in fluency and relevance. This is illustrated in ‘Crisis Markers’ with GPT4o-mini scoring −0.511, while Gemini Pro improved with a positive score of 0.298, suggesting room for improvement in fluency and relevance. InfoLM scores varied widely for all models, with improvements observed in zero-shot to one-shot settings, further implying that providing examples helps models to better align with the semantic expectations of psychotherapy notes. These multi-metric analyses suggest that the models at this current state struggle with exact phrase matching of human-written notes, as indicated by relatively low BLEU scores. However, the models demonstrate a more robust capability to retain the text’s underlying semantic meaning and clinical significance of the content, as evidenced by higher BERTScore and ROUGE-L scores. As visualised in Figure 3(a-c) , this pattern of stronger semantic performance compared to lexical metrics is consistently observed across all three top-performing models. The radar plot in Figure 3(d) further emphasises this trend while highlighting the relative strengths of each model across different aspects of psychotherapy note summarisation. Download figure Open in new tab Figure 3. Metric-specific observations across different models. Metric comparison (METEOR, ROUGE-L, and BERTScore) for (a) GPT4o-mini, (b) Gemini Pro, and (c) Mistral across 17 aspects of psychotherapy notes in one-shot setting. (d) The Radar plot comparing the weighted average scores of METEOR, ROUGE-L, and BERTScore of all models across the aspects. Note: Aspects are abbreviated as follows: Patient Particulars (PP), Clinical Identifiers (CI), Referral Information (RI), Therapist Information (TI), Past Session Information (PS), Presenting Complaints/Symptoms (PC), History (HI), Crisis Markers (CM), Current Mental Status Examination (CMS), Psychotherapy Type (PT), Psychotherapy Technique (PTC), Assessments (AS), Issues Discussed in Current Session (ID), Reflections by Therapist (RT), Clinical Diagnosis by Reviewer (CD), Action Plan (AP), and Next Session Details (NS). Model-specific Observations The performance of individual models reveals distinct patterns and capabilities. GPT4o-mini demonstrated superior overall performance in most aspects, particularly in structured information. This is evident with the overall highest scores in the one-shot setting, where GPT4o-mini achieved the highest ROUGE-L and BERTScore values in the following aspects: 0.658 and 0.926 in ‘Clinical Identifiers’ , 0.791 and 0.966 for ‘Assessments’ , and 0.720 and 0.941 for ‘Referral Information ‘ , respectively. Apart from these, the model also observed performance gain from zero-shot to one-shot settings. Meanwhile, Gemini Pro showed dominance in the narrative and interpretative aspects. Notably, the model demonstrated the highest score in ‘Crisis Markers’ with ROUGE-L and BERTScore values of 0.733 and 0.912 in a one-shot setting, respectively, and a strong performance in ‘Clinical Diagnosis’ (BERTScore of 0.926). Furthermore, Gemini Pro often achieved better BLUERT scores than other models, suggesting better fluency and coherence in its generation. For instance, it achieved a positive BLUERT score of 0.298 for ‘Crisis Markers’ while other models gave negative scores in the one-shot setting. Mistral, an open-source model, gave strong competition to the closed-source counterparts, viz GPT4o-mini and Gemini Pro. It gave a comparable one-shot performance (ROUGE-L = 0.205, BERTScore = 0.851) with GPT4o-mini (ROUGE-L = 0.238, BERTScore = 0.851) but fell short of Gemini Pro (ROUGE-L = 0.280, BERTScore = 0.856) in ‘Patient Particulars’ . However, the performance gap widened significantly in more complex aspects, such as ‘Reflections by Therapist’ , which requires understanding the therapeutic process, interpretative reasoning, and combining insights from the session. Among the other open-source models, Orca 2 and Zephyr gave a decent performance in aspects such as ‘Assessments’ and ‘Psychotherapy Type’ , which required lesser interpretative reasoning and insight understandability from the session. Llama 3 and Llama 3.1 demonstrated consistent but moderate performances across the metrics and limited improvement using a one-shot setting. Gemma and MentalLLaMA exhibited the poorest performance with consistently low scores. Human Evaluation Our human evaluation result reported in Table 5 reveals interesting patterns that complement and contrast the automatic metric results. Expert clinicians evaluated the generated notes across five key dimensions: Accuracy, Relevance, Comprehensiveness, Clarity, and Safety, using a 5-point Likert scale. For this task, we chose the top three performing models across the aspects viz. GPT4o-mini, Gemini Pro, and Mistral in the one-shot setting in terms of their automatic scores. The result reported in Table 5 indicates that GPT4o-mini outperforms the other models across all human evaluation metrics. The human assessment aligns with specific automatic scores, particularly in aspects requiring clinical understanding. For example, GPT4o-mini gives superior performance in ‘Crisis Markers’ (ROUGE-L: 0.733, BERTScore: 0.912‘) and ‘Clinical Diagnosis’ (BERTScore: 0.926), which corresponds to the high human evaluation scores in Safety (3.04) and Accuracy (2.90). This suggests that the model effectively captures clinically sensitive information while maintaining safety standards. Interestingly, Gemini Pro provided a competitive performance with GPT4o-mini in the automatic evaluation. However, it received lower human evaluation scores across all dimensions than GPT4o-mini and Mistral (which is a smaller and open-source model). This disparity highlights that strong performance in lexical and semantic similarity metrics does not necessarily translate into better practical clinical utility, as judged by human experts. Mistral, the open-source model, gave competitive human evaluation scores with GPT4o-mini. Interestingly, it even surpassed Gemini Pro across all the human evaluation metrics. This highlights the potential of smaller and open-sourced LLMs in the medical domain, which requires further exploration in the future. Hitherto, the multi-faceted evaluation results suggest that while models excel in different aspects of note summarisation, their capabilities vary significantly based on the complexity and nature of the aspect or the content. Combining automatic and human evaluation provides a holistic overview of model performance, highlighting that stronger automatic performance does not always reflect its practical clinical utility. This underscores the importance of human expert evaluation in assessing AI systems in the mental health domain. View this table: View inline View popup Download powerpoint Table 5. Expert evaluation result of the top three models obtained from Table 4 based on the human evaluation parameter viz. accuracy, relevance, conciseness, clarity and safety. Discussion Our comprehensive analysis reveals several key insights about the capabilities and limitations of LLMs in generating psychotherapy notes. The performance patterns across different aspects highlight that, despite the LLMs’ potential in generating the notes, their capabilities vary significantly based on the complexity and nature of the aspects. Both closed and open-source models perform adequately in basic documentation tasks like recording patient particulars and clinical identifiers; this suggests that clinical documentation requiring basic patient information is within reach of current LLM capabilities. However, the performance gap between closed and open-source models becomes more significant in aspects that require deeper clinical understanding and interpretative reasoning. All models face consistent challenges with handling the past session information and future action plans, suggesting fundamental limitations in current LLMs regarding temporal reasoning. Furthermore, this limitation indicates that while LLMs can effectively capture clinical observations, they may need architectural improvements to better handle therapeutic relationships’ longitudinal nature. We also observed that the models demonstrate varying strengths in different aspects. For example, Gemini Pro excels in ‘Crisis Markers’ and ‘Clinical Diagnosis’ , while GPT4o-mini excels in ‘Assessments’ , suggesting that different models have different areas of expertise and can be utilized in an ensemble manner for comprehensive psychotherapy documentation. In this work, we make several contributions. First, we introduce iCARE , a novel framework for the psychotherapy note-taking format that addresses the limitations of traditional formats by collaborating with clinical experts. Our 17-aspect framework provides a more nuanced and comprehensive approach to documenting therapeutic sessions. This framework captures the essential clinical information and accommodates psychotherapy practice’s complex, longitudinal, and interpretative nature. Second, our extensive benchmarking of various LLMs provides valuable insights into the current state of AI capabilities in mental health documentation. The evaluation across multiple metrics and aspects offers a detailed understanding of where these models excel and where they need improvement. This understanding is crucial for the technical development and clinical implementation of AI-assisted documentation systems. Looking forward, we lay the groundwork for more specialised and effective AI-assisted note-making in psychotherapy. The detailed model performance analysis across different aspects can guide technical development and clinical implementation strategies. Although the results show the promising capabilities of the LLMs in their current setup, the overall analysis indicates that current LLMs should be viewed as assistive tools rather than replacements for clinical judgment in psychotherapy notes summarisation. To conclude, our framework and findings contribute to improving mental health, particularly by enhancing psychotherapy notes summarisation while maintaining clinical validity and therapeutic value. As LLMs evolve, the structured format we have developed – iCARE and the proposed dataset – PATH , can serve as a benchmark to assess future models and improvements in this area. In this work, we emphasise that while AI can significantly aid in the notes summarisation, the complex nature of psychotherapy requires the experts’ careful consideration and continuous involvement to improve and develop a better way to integrate these AI tools into clinical practice. Methods Experimental Setup We formulate the counselling therapy conversation data report filling as a summarisation task. Specifically, we benchmark the proposed dataset using existing LLMs to summarise each component/aspect of the counselling conversation. Benchmarking Models To examine the capabilities of LLMs in automated summary generation, we benchmark open-source and closed-source models. Since transcripts often contain many tokens, we use the latest LLMs that offer comparatively larger context windows to produce more context-aware responses. We select open-source models in the 7-8 billion parameter range for a fair and balanced comparison. On the closed-source side, we assess recent models from OpenAI and Google. Table 6 provides a detailed description of the models used. View this table: View inline View popup Download powerpoint Table 6. Description of the models employed ( # denotes the closed-source models). Prompting Techniques LLMs show impressive in-context learning (ICL) capabilities across various NLP tasks 47 , 48 . To use their ability to understand natural language, we utilize both open-source and closed-source LLMs (as outlined in Table 6 ) with zero-shot and one-shot prompting methods. For summary generation, we use the default model generation parameters, with the exception that for open-source models, the maximum number of new tokens is set to 100. For prompt design, a collaborative team of psychiatrists and a computer science graduate worked together to create self-exploratory prompts that specify task details and desired formats. The full set of prompts employed in the experiments is detailed in Supplementary Information , Table S1 . Evaluation We carry out a two-step evaluation strategy to benchmark the proposed dataset with parts of automated and human evaluation on different metrics and measures. We conduct experiments with multiple prompting methods and LLMs for 17 aspects. Automatic Evaluation We explore lexical and semantic-based methods to evaluate the generated responses automatically. Although lexical-based metrics have limitations, we still employ them because they help assess elements like extracted nouns (e.g., patient details) and ensure a certain level of format matching. To this end, we use BLEU 41 , ROUGE 43 , and METEOR 42 scores. However, for more nuanced aspects—like evaluating summaries related to patient history, reflections, and other context-rich information—we need to capture the semantic meaning of the responses. For this purpose, we utilise BERTScore 44 , BLUERT 45 , and Infolm 46 using the Fisher-Rao distance 49 . Unless stated otherwise, all scores are represented as real numbers ranging from 0 to 1, with 1 being the highest possible score. See Section S1.1 in Supplementary Information for the detailed information on the automatic evaluation metrics. Human Evaluation Human evaluation and supervision remain essential in developing clinical notes derived from the large language model (LLM) in healthcare, particularly psychotherapy. Although LLMS can synthesise information efficiently, it sometimes fails to prioritise information with clinical importance. Human evaluators, particularly those with clinical experience, can assess whether the model-generated responses align with the therapeutic goals of each session. This ensures that the content meets the technical criteria and resonates with clinical care’s nuanced and often subjective nature. This step is crucial in psychotherapy, where the therapeutic relationship and emotional factors are fundamental to effective care. Clinical history, which is comprehensive yet concise, is a skill mastered over many years. Even the most advanced LLMs, such as new residents, often miss critical details when summarising complex patient histories or therapy sessions. Detecting omissions or highlighting subtle and fleeting events in patients’ life stories makes clinical documentation more meaningful. The astute clinician is ever vigilant for the presence of crisis markers (like abuse or self-harm), comorbidities (of other medical and substance use disorders) and the absence of symptoms of similar disorders. Human supervisors ensure the generated summaries cover all necessary elements and present a holistic view of the patient’s progress. In healthcare, providing incorrect or misleading information can severely affect patient outcomes. Factual accuracy is therefore crucial for generating trust between the LLM agent and the clinician to ensure any clinical usefulness of such machine translations and history notes. Human reviewers help mitigate such risks by cross-checking the accuracy and precision of AI-generated content against clinical records and guidelines, enhancing the system’s trustworthiness. Human evaluation ensures the safety and ethical integrity of the content and, therefore, aids in developing AI systems that provide reliable and clinically meaningful notes. In psychotherapy, automated tools alone often fall short of capturing the complexities and nuances of human experiences and interactions. Automated summaries usually give equal weightage to all parts of a client-therapist conversation, which seldom happens in real life. Psychotherapy is not a discussion between two parties to be summarised. At its core, psychotherapy is a reflective exploration, where progress is made in fleeting significant moments when the therapist and the client “understand” one another. Such moments are found in words and the silences between them 50 . Human supervision can help identify these nuances of communication and make the clinical history rich in meaning. For our evaluation framework, we identified five key metrics (dimensions) based on established healthcare AI reporting guidelines. While several guidelines exist for AI in healthcare, including CLAIM (Checklist for Artificial Intelligence in Medical Imaging) 51 , STARD-AI (Standards for Reporting of Diagnostic Accuracy Studies-AI) 52 , CONSORTAI (Consolidated Standards of Reporting Trials-AI) 53 , and MI-CLAIM (Minimum Information about Clinical Artificial Intelligence Modelling) 54 . However, none of these guidelines explicitly addresses standards for the human evaluation of LLMs in healthcare. Therefore, we come up with the following evaluation metrics: Accuracy: Measures how factually correct and precise the LLM’s response is. The responses should be free from errors and provide the correct information. Rated on a scale from least to most accurate. Relevance: Evaluates how well the LLM’s response aligns with the user’s query, ensuring it directly addresses the question. Irrelevant or off-topic information should be minimised and rated from least to most relevant. Comprehensiveness: Assesses whether the response covers all key aspects of the user’s query, offering a complete and thorough explanation when needed and rated from least to complete fully. Clarity: Focuses on how clear and understandable the response is, ensuring the content is easy to follow and free from ambiguity. Rated from not clear at all to fully clear and understandable. Safety & Trustworthiness: Ensures the response is safe, reliable and does not provide harmful or misleading information. The LLM should maintain user trust by offering fair and accurate responses. Rated from completely unsafe to fully safe and trustworthy. All metrics are rated on a Likert scale from 1 to 5, where 1 is the lowest and 5 is the highest. For evaluation, three LLM outputs and one gold expert summary were provided to each invited expert (clinical psychologist), who was blinded to the models and the human summary. The result that our LLM outputs are equal to or sometimes even better than human-coded summary validated our assumption of using structured history sections as building blocks for clinical summary generation. Data Availability All conversation transcripts used in this study and all source code for benchmarking experiments are publicly available in https://github.com/proadhikary/iCARE/. https://github.com/proadhikary/iCARE Ethics and Inclusion Statement This research included local researchers throughout all stages of the research process. Roles and responsibilities were mutually agreed upon by all collaborators prior to the commencement of the study. No health, safety, security, or other risks were posed to participants or researchers during the course of this research. Data Access and Ethics This research was conducted using publicly available counselling conversations, ensuring no involvement of human subjects, and was therefore exempt from additional ethical review. Data and Code Availability All conversation transcripts used in this study and all source code for benchmarking experiments are publicly available in https://github.com/proadhikary/iCARE/ . Author contributions statement The contributions of individual authors are as follows: Conceptualisation: SKG, KSD, TC; Data curation: SRS, PS, PKS, PKA; Experimentation, Software, Visualisation: SS and PKA, Formal Analysis and Investigation: SRS, RC, CS, PC; Supervision and Validation: KSD, TC; Funding Acquisition: TC; Project Administration: TC, KSD; Writing: All authors. Supplementary Information S1 Methods S1.1 Instruction Prompts Table S1 presents the instructional prompts for summarising notes based on the definitions of each aspect. These prompts were curated in consultation with experts. View this table: View inline View popup Table S1. Instruction prompts used for psychotherapy note generation. Detailed prompts designed in collaboration with mental health professionals for each aspect of the SynapNote framework. Each prompt specifies the format, required fields, and example outputs to guide the LLMs in generating structured psychotherapy notes. The prompts were crafted to capture essential clinical information while maintaining standardization across different therapeutic contexts. S1.2 Automatic Evaluation Metrics The following are brief definitions and key details of the automatic evaluation metrics used in our study: BLEU (Bilingual Evaluation Understudy): BLEU 41 was initially developed for machine translation. It measures the precision of n-gram matches between a candidate summary and reference summaries. It calculates how many n-grams in the candidate appear in the references, with a brevity penalty to avoid favouring very short summaries. While simple and widely used, BLEU doesn’t account for recall and struggles with semantic equivalence. ROUGE (Recall-Oriented Understudy for Gisting Evaluation): ROUGE 43 focuses on recall by measuring the overlap between the generated and reference summaries. ROUGE-N measures n-gram overlap, and ROUGE-L measures the longest common subsequence. METEOR (Metric for Evaluation of Translation with Explicit Ordering): METEOR 42 improves upon BLEU by considering both precision and recall, emphasising recall. It handles stemming, synonymy, and word order. METEOR uses WordNet 55 to identify synonyms and applies alignment between candidate and reference texts, making it more correlated with human judgments than BLEU for many tasks. BERTScore 44 : It leverages contextual embeddings from BERT 56 to compute similarity scores between candidate and reference summaries. It aligns words in the two texts based on the cosine similarity of their embeddings, then aggregates these similarities using precision, recall, and F1 measures. This allows it to capture semantic similarity beyond exact matches, addressing a significant limitation of n-gram-based metrics. BLEURT 45 : It is a learned evaluation metric based on BERT fine-tuned on human judgments. It starts with a pre-trained BERT model and further trains it on synthetic data and human ratings, enabling it to better align with human evaluations. BLEURT shows a strong correlation with human judgments across various text generation tasks. Infolm 46 : InfoLM is an information-theoretic evaluation framework that uses language model embeddings to measure both adequacy (content preservation) and fluency. It calculates similarity between texts using information-theoretic measures like Kullback-Leibler divergence 57 on contextualised embeddings. This approach allows it to capture deeper semantic relationships between candidate and reference summaries. S2 Results We provide results for all models across the different aspects of the notes. Table S2 and Table S3 present the results for the zero-shot and the one-shot settings, respectively. View this table: View inline View popup Table S2. Zero-shot results across different aspects of psychotherapy note summarization. The table reports the comprehensive zero-shot results of 11 LLMs using six automatic metrics: BLEU (exact matching), METEOR and ROUGE-L (flexible lexical matching), BERTScore (semantic similarity), BLUERT (fluency and relevance), and InfoLM (semantic distance). The scores demonstrate varying model capabilities across 17 different aspects of psychotherapy notes, with closed-source models (GPT4o mini, Gemini Pro) generally outperforming open-source alternatives. View this table: View inline View popup Table S3. One-shot results across different aspects of psychotherapy note summarization. Detailed comparison of model performance when provided with a single example for each aspect of psychotherapy note summarization. The table presents scores across six evaluation metrics (BLEU, METEOR, ROUGE-L, BERTScore, BLUERT, and InfoLM) for all 11 models on 17 aspects. Compared to zero-shot results ( Table S2 ), most models show significant improvements, particularly in structured aspects. Acknowledgement The authors would like to express their sincere gratitude to the following Clinical Psychologists for their valuable time, expertise, and thoughtful evaluations that greatly contributed to the quality and rigor of this work: Dr. Ayushi Sobhani, Dr. Ojasvi Meena, Dr. Sumit Bakshi, Dr. Manish Shukla, Dr. Sonia Puar, Dr. Medhavi Sood, Dr. Aishwarya Raj, Dr. Anusha Thakur, Dr. Jemima Jacob, Dr. Akanksha, Dr. M. Kaur, Dr. Vidushi Sharma, Dr. Shreyanshi Sata, Ms Shireen Chaturvedi, Mr. Rishabh Jain, Ms. Paakhi Srivastava, Mr. Anurag Sharma, Ms. Farhin Mustaque, Ms. Neha Singh, Mr. Ratnakar, Mr. Nitish Kumar and Mr. R. K. Meena. Tanmoy Chakraborty acknowledges the support of Tower Research Capital Markets toward using machine learning for social good and Rajiv Khemani Young Faculty Chair Professorship in Artificial Intelligence. Footnotes ↵ * The work was done when he was a Research Associate in the Department of Electrical Engineering, Indian Institute of Technology, Delhi, New Delhi, India References 1. ↵ Gayatri Jadav , U. Problem Oriented Medical Record — physio-pedia.com . https://www.physio-pedia.com/Problem_Oriented_Medical_Record . [Accessed 26-04-2025 ]. 2. Shukor , A. R . An alternative paradigm for evidence-based medicine: Revisiting lawrence weed, md’s systems approach . The Perm. J . 21 , 16 – 147 , DOI: 10.7812/TPP/16-147 ( 2017 ). https://www.thepermanentejournal.org/doi/pdf/10.7812/TPP/16-147 . OpenUrl CrossRef 3. Kahan . The Note is Dead, Long Live the Note”: Assessing the Past, Present, and Future of Medical Charting . https://roundtablejournal.org/2022/03/02/the-note-is-dead-long-live-the-note-assessing-the-past-present-and-future-of-medical-charting/ ( 2022 ). [Accessed 26-04-2025 ]. 4. ↵ Wright , A. , Sittig , D. F. , McGowan , J. , Ash , J. S. & Weed , L. L . Bringing science to medicine: an interview with larry weed, inventor of the problem-oriented medical record . J. Am. Med. Informatics Assoc . 21 , 964 – 968 ( 2014 ). OpenUrl CrossRef PubMed 5. ↵ Podder , V. , Lew , V. & Ghassemzadeh , S. SOAP notes . In StatPearls ( StatPearls Publishing , Treasure Island (FL) , 2024 ). 6. ↵ Medesk . Dap notes for mental health professionals . https://www.medesk.net/en/blog/how-to-write-dap-notes/ . 7. ↵ Boyles , O. Birp notes guide for mental health professionals . https://www.icanotes.com/2021/08/18/birp-notes-guide/ ( 2021 ). 8. ↵ Management , O . Can we return to the ABCs of crafting a medical record note? | MDedge — mdedge.com . https://www.mdedge.com/obgmanagement/article/247109/practice-management/can-we-return-abcs-crafting-medical-record-note/page/0/2 ( 2021 ). [Accessed 26-04-2025 ]. 9. ↵ Jaroudi , S. & Payne , J. D . Remembering lawrence weed: A pioneer of the soap note. Acad . Medicine 94 ( 2019 ). 10. ↵ Carr , A. C. , Ghosh , A. & Ancill , R . Can a computer take a psychiatric history? Psychol . medicine 13 , 151 – 158 ( 1983 ). OpenUrl 11. ↵ Association , A. P. et al. Record keeping guidelines . The Am. Psychol. 62 , 993 – 1004 ( 2007 ). OpenUrl PubMed 12. ↵ Association , A. P. , et al. Ethical principles of psychologists and code of conduct . ( 2016 ). 13. ↵ DeLettre , J. L. & Sobell , L. C . Keeping psychotherapy notes separate from the patient record . Clin. Psychol. & Psychother. An Int. J. Theory & Pract . 17 , 160 – 163 ( 2010 ). OpenUrl 14. ↵ Harris , S. M. et al. Are clinical records really that important? the dearth of research and practice guidelines in mft literature. The Am . J. Fam. Ther . 37 , 373 – 387 ( 2009 ). OpenUrl 15. ↵ Layman , E. J . Ethical issues and the electronic health record . The health care manager 27 , 165 – 176 ( 2008 ). OpenUrl PubMed 16. ↵ Lino , L. & Martins , H . Medical history taking using electronic medical records: a systematic review . Int. J. Digit. Heal . 1 , 12 ( 2021 ). OpenUrl 17. ↵ Lin , C.-T. , McKenzie , M. , Pell , J. & Caplan , L . Health care provider satisfaction with a new electronic progress note format: Soap vs apso format . JAMA internal medicine 173 , 160 – 162 ( 2013 ). OpenUrl PubMed 18. ↵ Martin , K. , Bickle , K. , Ricciardelli , R. & Lok , J . Exploration of note writing by mental health nurses using a video scenario . J. Clin. Nurs . 32 , 2672 – 2683 ( 2023 ). OpenUrl PubMed 19. ↵ Tudor , K. & Gledhill , K. Notes on notes . Ata: J. Psychother. Aotearoa New Zealand 26 , 123 – 144 ( 2022 ). OpenUrl 20. ↵ Keszthelyi , D. , Gaudet-Blavignac , C. , Bjelogrlic , M. , Lovis , C. et al. Patient information summarization in clinical settings: scoping review . JMIR Med. Informatics 11 , e44639 ( 2023 ). OpenUrl 21. ↵ Manas , G. , et al. Knowledge-infused abstractive summarization of clinical diagnostic interviews: Framework development study . JMIR Mental Heal . 8 , e20865 ( 2021 ). OpenUrl 22. ↵ Srivastava , A. , Joshi , S. , Chakraborty , T. & Akhtar , M. S. Knowledge planning in large language models for domain-aligned counseling summarization . arXiv preprint arXiv:2409.14907 ( 2024 ). 23. ↵ Adhikary , P. K. , et al. Exploring the efficacy of large language models in summarizing mental health counseling sessions: Benchmark study . JMIR Mental Heal . 11 , e57306 ( 2024 ). OpenUrl 24. ↵ Han , J. et al. Ascleai: A llm-based clinical note management system for enhancing clinician productivity . In Extended Abstracts of the CHI Conference on Human Factors in Computing Systems , 1 – 7 ( 2024 ). 25. ↵ Yuan , D. , et al. A continued pretrained llm approach for automatic medical note generation . In NAACL (Short Papers) ( 2024 ). 26. ↵ The mental healthcare act , 2017 . https://www.indiacode.nic.in/bitstream/123456789/2249/1/A2017-10.pdf . Accessed: 2025-04-26 . 27. ↵ Malhotra , G. , Waheed , A. , Srivastava , A. , Akhtar , M. S. & Chakraborty , T . Speaker and time-aware joint contextual learning for dialogue-act classification in counselling conversations . In Proceedings of the Fifteenth ACM International Conference on Web Search and Data Mining, WSDM ‘ 22 , 735 – 745 , DOI: 10.1145/3488560.3498509 ( Association for Computing Machinery , New York, NY, USA , 2022 ). OpenUrl CrossRef 28. ↵ Bain , M. , Huh , J. , Han , T. & Zisserman , A . Whisperx: Time-accurate speech transcription of long-form audio . arXiv preprint arXiv:2303.00747 ( 2023 ). 29. ↵ Krause , A. Radford , A. et al. Robust speech recognition via large-scale weak supervision . In Krause , A. et al. (eds.) Proceedings of the 40th International Conference on Machine Learning, vol. 202 of Proceedings of Machine Learning Research , 28492 – 28518 ( PMLR , 2023 ). 30. ↵ Bredin , H. pyannote.audio 2.1 speaker diarization pipeline: principle, benchmark, and recipe . In Proc. INTERSPEECH 2023 ( 2023 ). 31. ↵ Team , G. et al. Gemma: Open models based on gemini research and technology . arXiv preprint arXiv:2403.08295 ( 2024 ). 32. ↵ Dubey , A. , et al. The llama 3 herd of models . arXiv preprint arXiv:2407.21783 ( 2024 ). 33. ↵ Yang , K. et al. Mentallama: Interpretable mental health analysis on social media with large language models . In Proceedings of the ACM Web Conference 2024, WWW ’24 , 4489 – 4500 , DOI: 10.1145/3589334.3648137 ( Association for Computing Machinery , New York, NY, USA , 2024 ). OpenUrl CrossRef 34. ↵ Jiang , A. Q. , et al. Mistral 7b . arXiv preprint arXiv:2310.06825 ( 2023 ). 35. ↵ Mitra , A. , et al. Orca 2: Teaching small language models how to reason . arXiv preprint arXiv:2311.11045 ( 2023 ). 36. ↵ Abdin , M. , et al. Phi-3 technical report: A highly capable language model locally on your phone . arXiv preprint arXiv:2404.14219 ( 2024 ). 37. ↵ Oh , A. Zheng , L. et al. Judging llm-as-a-judge with mt-bench and chatbot arena . In Oh , A. et al. (eds.) Advances in Neural Information Processing Systems , vol. 36 , 46595 – 46623 ( Curran Associates, Inc. , 2023 ). OpenUrl 38. ↵ Tunstall , L. , et al. Zephyr: Direct distillation of lm alignment . arXiv preprint arXiv:2310.16944 ( 2023 ). 39. ↵ Achiam , J. , et al. Gpt-4 technical report . arXiv preprint arXiv:2303.08774 ( 2023 ). 40. ↵ Reid , M. et al. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context . arXiv preprint arXiv:2403.05530 ( 2024 ). 41. ↵ Papineni , K. , Roukos , S. , Ward , T. & Zhu , W.-J . Bleu: a method for automatic evaluation of machine translation . In Proceedings of the 40th annual meeting of the Association for Computational Linguistics , 311 – 318 ( 2002 ). 42. ↵ Banerjee , S. & Lavie , A . Meteor: An automatic metric for mt evaluation with improved correlation with human judgments . In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and/or summarization , 65 – 72 ( 2005 ). 43. ↵ Lin , C.-Y. & Hovy , E . Automatic evaluation of summaries using n-gram co-occurrence statistics . In Proceedings of the 2003 human language technology conference of the North American chapter of the association for computational linguistics , 150 – 157 ( 2003 ). 44. ↵ Yuan , W. , Neubig , G. & Liu , P . Bartscore: Evaluating generated text as text generation . Adv. neural information processing systems 34 , 27263 – 27277 ( 2021 ). OpenUrl 45. ↵ Sellam , T. , Das , D. & Parikh , A . Bleurt: Learning robust metrics for text generation . In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics , 7881 – 7892 ( 2020 ). 46. ↵ Colombo , P. J. A. , Clavel , C. & Piantanida , P . Infolm: A new metric to evaluate summarization amp; data2text generation . Proc. AAAI Conf. on Artif. Intell . 36 , 10554 – 10562 , DOI: 10.1609/aaai.v36i10.21299 ( 2022 ). OpenUrl CrossRef 47. ↵ Kojima , T. , Gu , S. S. , Reid , M. , Matsuo , Y. & Iwasawa , Y . Large language models are zero-shot reasoners . Adv. neural information processing systems 35 , 22199 – 22213 ( 2022 ). OpenUrl 48. ↵ Brown , T. B . Language models are few-shot learners . arXiv preprint arXiv:2005.14165 ( 2020 ). 49. ↵ Colombo , P. J. A. , Clavel , C. & Piantanida , P . Infolm: A new metric to evaluate summarization & data2text generation . In Proceedings of the AAAI conference on artificial intelligence , vol. 36 , 10554 – 10562 ( 2022 ). OpenUrl 50. ↵ Montgomery , M. R. , Luca , M. & Gordon-Finlayson , A . The shifting sound of silence: A constructivist grounded theory. Couns . Psychother. Res . 24 , 275 – 285 , DOI: 10.1002/capr.12654 ( 2024 ). https://onlinelibrary.wiley.com/doi/pdf/10.1002/capr.12654 . OpenUrl CrossRef 51. ↵ Mongan , J. , Moy , L. & Kahn , C. E . Checklist for artificial intelligence in medical imaging (claim): A guide for authors and reviewers . Radiol. Artif. Intell. 2 , e200029 , DOI: 10.1148/ryai.2020200029 ( 2020 ). PMID: 33937821 , 10.1148/ryai.2020200029. OpenUrl CrossRef PubMed 52. ↵ Sounderajah , V. et al. Developing a reporting guideline for artificial intelligence-centred diagnostic test accuracy studies: the stard-ai protocol . BMJ Open 11 , DOI: 10.1136/bmjopen-2020-047709 ( 2021 ). https://bmjopen.bmj.com/content/11/6/ e047709.full.pdf. OpenUrl Abstract / FREE Full Text 53. ↵ Liu , X. et al. Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the consort-ai extension. Nat . Medicine 26 , 1364 – 1374 , DOI: 10.1038/s41591-020-1034-x ( 2020 ). OpenUrl CrossRef PubMed 54. ↵ Norgeot , B. et al. Minimum information about clinical artificial intelligence modeling: the mi-claim checklist . Nat. Medicine 26 , 1320 – 1324 , DOI: 10.1038/s41591-020-1041-y ( 2020 ). OpenUrl CrossRef PubMed 55. ↵ Miller , G. A. WordNet: A lexical database for English . In Human Language Technology: Proceedings of a Workshop held at Plainsboro, New Jersey, March 8-11 , 1994 ( 1994 ). 56. ↵ Burstein , J. , Doran , C. Solorio , T. Devlin , J. , Chang , M.-W. , Lee , K. & Toutanova , K. BERT: Pre-training of deep bidirectional transformers for language understanding . In Burstein , J. , Doran , C. & Solorio , T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , Volume 1 (Long and Short Papers), 4171 – 4186 , DOI: 10.18653/v1/N19-1423 ( Association for Computational Linguistics , Minneapolis, Minnesota , 2019 ). OpenUrl CrossRef 57. ↵ Kullback , S. & Leibler , R. A. On information and sufficiency . The annals mathematical statistics 22 , 79 – 86 ( 1951 ). OpenUrl View the discussion thread. Back to top Previous Next Posted June 25, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Towards Richer AI-Assisted Psychotherapy Note-Making and Performance Benchmarking Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Towards Richer AI-Assisted Psychotherapy Note-Making and Performance Benchmarking Prottay Kumar Adhikary , Sahajpreet Singh , Suruchi Singh , Panna Sharma , Pankhuri Soni , Rashmi Choudhary , Charu Saxena , Prachi Chauhan , Swati Kedia Gupta , Koushik Sinha Deb , Salam Michael Singh , Tanmoy Chakraborty medRxiv 2025.06.25.25330252; doi: https://doi.org/10.1101/2025.06.25.25330252 Share This Article: Copy Citation Tools Towards Richer AI-Assisted Psychotherapy Note-Making and Performance Benchmarking Prottay Kumar Adhikary , Sahajpreet Singh , Suruchi Singh , Panna Sharma , Pankhuri Soni , Rashmi Choudhary , Charu Saxena , Prachi Chauhan , Swati Kedia Gupta , Koushik Sinha Deb , Salam Michael Singh , Tanmoy Chakraborty medRxiv 2025.06.25.25330252; doi: https://doi.org/10.1101/2025.06.25.25330252 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Psychiatry and Clinical Psychology Subject Areas All Articles Addiction Medicine (570) Allergy and Immunology (863) Anesthesia (301) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1511) Epidemiology (15231) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6610) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15924) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6608) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3338) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary Care Research (712) Psychiatry and Clinical Psychology (5449) Public and Global Health (9240) Radiology and Imaging (2203) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a02063af5e40f047',t:'MTc3OTgzNjM1Nw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.