Scalable depression monitoring with smartphone speech: a multimodal benchmark and topic analysis

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 56,579 characters · extracted from preprint-html · click to expand
Scalable depression monitoring with smartphone speech: a multimodal benchmark and topic analysis | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Scalable depression monitoring with smartphone speech: a multimodal benchmark and topic analysis View ORCID Profile Daniel Emden , View ORCID Profile Maike Richter , View ORCID Profile Astrid Chevance , View ORCID Profile Ramona Leenings , View ORCID Profile Julian Herpertz , Lara Gutfleisch , View ORCID Profile Anna Fleuchaus , Rogério Blitz , View ORCID Profile Vincent L. Holstein , View ORCID Profile Janik Goltermann , View ORCID Profile Nils R. 
Winter , Jennifer Spanagel , View ORCID Profile Susanne Meinert , View ORCID Profile Tiana Borgers , Kira Flinkenflügel , View ORCID Profile Frederike Stein , View ORCID Profile Nina Alexander , View ORCID Profile Hamidreza Jamalabadi , View ORCID Profile Jonathan Repple , View ORCID Profile Christian Dobel , View ORCID Profile Elisabeth J. Leehr , View ORCID Profile Ronny Redlich , View ORCID Profile Ulrich Ebner-Priemer , View ORCID Profile Igor Nenadić , View ORCID Profile Tilo Kircher , View ORCID Profile Udo Dannlowski , Tim Hahn , View ORCID Profile Nils Opel doi: https://doi.org/10.1101/2025.07.17.25331744 Daniel Emden 1 Department of Psychiatry and Psychotherapy, Jena University Hospital , Jena, Germany 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Daniel Emden Maike Richter 3 Department of Psychiatry and Neuroscience, Campus Benjamin Franklin, Charité–Universitätsmedizin Berlin , Berlin, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Maike Richter Astrid Chevance 4 Université Paris Cité and Université Sorbonne Paris Nord, INSERM INRAE, Centre for Research in Epidemiology and Statistics , Paris, France 5 Centre d’Epidémiologie Clinique, AP-HP, Hôpital Hôtel Dieu , Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Astrid Chevance Ramona Leenings 1 Department of Psychiatry and Psychotherapy, Jena University Hospital , Jena, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ramona Leenings Julian Herpertz 1 Department of Psychiatry and Psychotherapy, Jena University Hospital , Jena, Germany 3 Department of Psychiatry and Neuroscience, Campus Benjamin Franklin, Charité–Universitätsmedizin 
Berlin , Berlin, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Julian Herpertz Lara Gutfleisch 1 Department of Psychiatry and Psychotherapy, Jena University Hospital , Jena, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Anna Fleuchaus 1 Department of Psychiatry and Psychotherapy, Jena University Hospital , Jena, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Anna Fleuchaus Rogério Blitz 1 Department of Psychiatry and Psychotherapy, Jena University Hospital , Jena, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Vincent L. Holstein 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany 6 Department of Psychiatry, Harvard Medical School , Boston, MA, USA 7 Stanley Center for Psychiatric Research, Broad Institute of MIT and Harvard , Cambridge, MA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Vincent L. Holstein Janik Goltermann 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Janik Goltermann Nils R. Winter 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nils R. 
Winter Jennifer Spanagel 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Susanne Meinert 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Susanne Meinert Tiana Borgers 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tiana Borgers Kira Flinkenflügel 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Frederike Stein 8 Department of Psychiatry and Psychotherapy, Philipps-University of Marburg , Marburg, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Frederike Stein Nina Alexander 8 Department of Psychiatry and Psychotherapy, Philipps-University of Marburg , Marburg, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nina Alexander Hamidreza Jamalabadi 8 Department of Psychiatry and Psychotherapy, Philipps-University of Marburg , Marburg, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hamidreza Jamalabadi Jonathan Repple 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany 9 Department of Psychiatry, Psychosomatic Medicine and Psychotherapy, University Hospital Frankfurt , Frankfurt am Main, Germany 10 Cooperative Brain Imaging Center - CoBIC, Goethe University Frankfurt , Frankfurt am Main, Germany Find this author on Google Scholar Find this author on PubMed Search 
for this author on this site ORCID record for Jonathan Repple Christian Dobel 11 Department of Otorhinolaryngology, Institute of Phoniatry and Pedaudiology, Jena University Hospital , Jena, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Christian Dobel Elisabeth J. Leehr 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Elisabeth J. Leehr Ronny Redlich 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany 12 Department of Psychology, Martin Luther University Halle-Wittenberg , Halle, Germany 13 Center for Intervention and Research on adaptive and maladaptive brain Circuits underlying mental health (C-I-R-C) , Jena-Magdeburg-Halle, Germany 14 German Center for Mental Health (DZPG) , Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ronny Redlich Ulrich Ebner-Priemer 15 Mental mHealth Lab, Institute of Sports and Sports Science, Karlsruhe Institute of Technology , Karlsruhe, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ulrich Ebner-Priemer Igor Nenadić 8 Department of Psychiatry and Psychotherapy, Philipps-University of Marburg , Marburg, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Igor Nenadić Tilo Kircher 8 Department of Psychiatry and Psychotherapy, Philipps-University of Marburg , Marburg, Germany 16 Center for Mind, Brain and Behavior (CMBB), Philipps-University of Marburg , Marburg, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tilo Kircher Udo Dannlowski 2 Institute for Translational Psychiatry, University of 
Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Udo Dannlowski Tim Hahn 2 Institute for Translational Psychiatry, University of Münster , Münster, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nils Opel 3 Department of Psychiatry and Neuroscience, Campus Benjamin Franklin, Charité–Universitätsmedizin Berlin , Berlin, Germany 14 German Center for Mental Health (DZPG) , Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nils Opel For correspondence: nils.opel{at}charite.de Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Objective, scalable biomarkers are needed for continuous monitoring of major depressive disorder (MDD). Smartphone-collected speech is promising, yet extracting clinically useful signals remains difficult. We analysed 3,151 weekly voice diaries from 284 German-speaking adults (128 MDD, 156 controls) and regressed Beck Depression Inventory (BDI) scores. Sentence embeddings from the open-source 8-billion-parameter Qwen3-8B model predicted scores with MAE = 4.45 and $R^2$ = 0.35, explaining 16 percentage points more variance than the best traditional feature set (TF-IDF). Adding lexical–prosodic or TF-IDF features provided only marginal improvement (best MAE = 4.39). To interpret the embeddings we applied BERTopic and uncovered ten coherent themes; BDI scores peaked for “Persistent Low Mood” and “Pain Distress”, confirming clinical relevance. Large-language-model embeddings therefore capture the dominant signal of depression severity in everyday speech and, paired with interpretable topic analysis, offer a privacy-preserving, scalable route to digital mental-health phenotyping. 1 Introduction Depression is a leading contributor to the global burden of disease, defined by its chronic and recurrent nature.
Consequently, long-term observation is crucial for prevention, yet its clinical course is typically monitored through infrequent, clinic-based self-reports [ 1 , 2 ]. This “snapshot” view often misses the day-to-day fluctuations that can signal relapse or treatment response. Digital phenotyping seeks to close that gap by continuously harvesting data from personal devices, turning everyday behaviour into continuous mental-health read-outs [ 3 , 4 ]. Among the many candidate signals, spoken language is unique: it is produced spontaneously, carries rich semantic content, and embeds prosodic cues that mirror affect and cognition [ 5 , 6 ]. Voice diaries recorded in naturalistic (ecologically valid) contexts therefore offer a non-invasive lens on mood that can scale to the population level. Early speech-based studies for depression focused on handcrafted acoustic markers or word-count approaches. While informative, these shallow features capture only limited facets of meaning and often fail to generalise across speakers and conditions. The paradigm shift towards transformer-based language models [ 7 ] now provides embeddings from large, pre-trained language models that compress nuanced semantics into dense vectors. These methods are increasingly being applied to mental health analysis [ 8 , 9 ]. Yet few works have systematically benchmarked such embeddings against traditional lexical and acoustic pipelines on naturalistic data. Equally important is the challenge of interpretability. Black-box predictions risk limited uptake in clinical settings unless researchers can articulate why a model outputs a high severity score [ 10 ]. Topic modelling methods such as BERTopic [ 11 ], an algorithm that automatically analyzes text to identify underlying discussion topics, bridge this gap by clustering embeddings into coherent themes whose prevalence can be related back to symptoms. 
This dual focus on performance plus insight is critical if digital phenotyping tools are to inform therapeutic decision-making rather than merely flag anomalies [ 12 ]. While promising, research in speech-based depression assessment has historically faced key limitations. Much of the foundational work has focused on classifying individuals with major depression versus healthy controls, often using speech elicited in controlled, laboratory settings (e.g., reading tasks or structured interviews) [ 13 , 14 ]. This approach, while crucial for establishing a signal, does not capture the day-to-day symptom variability within a clinical population. More recent digital phenotyping studies have begun to leverage data from personal devices, but often remain cross-sectional or have yet to robustly model symptom severity from speech collected naturalistically over time [ 15 , 16 ]. The present study directly addresses these gaps. We advance this agenda in three ways. First, and most critically, we analyze a unique dataset of 3,151 weekly voice diaries from 284 participants (128 with major depressive disorder, 156 healthy controls), allowing us to move beyond simple case-control classification and instead model the nuanced, longitudinal fluctuations of symptom severity as they occur in a real-world clinical context. Second, we benchmark a multimodal feature set (classical lexical and basic prosodic metrics, TF-IDF, openSMILE acoustics, and Qwen3-8B sentence embeddings, a state-of-the-art 8-billion-parameter model) using a consistent support-vector regression protocol. Third, we pair the best-performing embeddings with BERTopic to expose the linguistic themes that drive prediction and to quantify their relationship with Beck Depression Inventory (BDI) scores. Drawing on the ability of large language models to capture deep contextual information from text, we hypothesize that (i) Qwen3-8B embeddings will outperform all lexical and acoustic baselines in predicting continuous BDI scores.
We further hypothesize that (ii) the resulting topic structure will reveal clinically coherent themes, contrasting expressions of internal distress with descriptions of external events, whose prevalence explains a substantial fraction of variance in concurrently measured symptom severity (BDI). By combining language representations from large-scale, billion-parameter models with unsupervised thematic analysis on a real-world corpus, our work aims to deliver a scalable and interpretable pipeline for continuous, voice-based depression monitoring. 2 Methods 2.1 Participants The data for this study were drawn from a longitudinal project that used the ReMAP (Remote Monitoring Application in Psychiatry) smartphone application to acquire weekly speech samples from a cohort of both healthy controls (HC) and individuals with a lifetime or current diagnosis of Major Depressive Disorder (MDD). The full dataset consists of 3,151 weekly speech samples contributed by 284 unique participants (HC: 156; MDD: 128), collected between May 2019 and April 2025 over a mean period of 151 days per participant (median = 17 days, IQR = 0-137 days). On average, each participant provided 11.1 samples (SD = 28.5); however, the number of contributions was highly variable, with a median of 2 samples per participant (IQR = 1-7) and a maximum of 282 samples from a single individual. The study design, recruitment procedures, and the feasibility of digital data acquisition via the ReMAP smartphone application have been detailed in previous publications [ 17 , 18 ]. Briefly, participants were recruited from ongoing mental health cohort studies. More than two-thirds of the participants in this dataset are part of the Marburg-Münster Affective Disorders Cohort Study (MACS) [ 19 , 20 ], with data collected at two sites (Marburg and Münster, Germany) using identical study protocols. 
The MDD group comprised both acutely depressed individuals and those with a lifetime diagnosis who were in remission at the time of participation, thus including a wide range of depression symptom severity. The presence or absence of a lifetime mental disorder was confirmed using the Structured Clinical Interview for DSM-IV (SCID-IV; [ 21 ]) as part of the original cohort protocols. The primary outcome measure for this analysis, depressive symptom severity, was the temporally closest Beck Depression Inventory (BDI) score collected within a ±7 day window of each speech sample. Demographic and clinical characteristics of the sample are presented in Table 1 . View this table: View inline View popup Download powerpoint Table 1. Demographic and Clinical Characteristics of the Study Sample and Group Differences. MDD = Major Depressive Disorder; BDI = Beck Depression Inventory; SD = Standard Deviation. P-values from independent t-tests for continuous variables and Chi-squared tests for categorical variables. 2.2 Data Acquisition Data were collected using the ReMAP application, a native smartphone app developed for both iOS and Android platforms. The data acquisition protocol has been detailed in previous publications [ 18 ]. Following informed consent, participants installed the application on their personal smartphones, following a bring-your-own-device approach. The ReMAP app prompts users for active data collection, which includes weekly completion of the Beck Depression Inventory (BDI) and weekly voice samples. For the voice data collection central to this study, participants received randomized prompts during the day, asking them to speak freely for 1 to 3 minutes in response to the question “How did you feel during the last week?”, presented in German. 2.3 Feature Extraction and Processing To create a comprehensive feature set for modelling, we processed both the audio and the transcribed text from each speech sample.
2.3.1 Speech-to-Text Transcription and Preprocessing All audio samples were first transcribed locally using the open-source OpenAI Whisper (large-v2) model to generate verbatim transcripts [ 22 ]. From this raw text, a second, normalized version was created for specific downstream tasks. This cleaning process, performed using spaCy, involved converting text to lowercase, removing stop words and punctuation, and lemmatizing tokens. 2.3.2 Acoustic Feature Extraction From the raw audio files, we extracted two well-established, high-dimensional acoustic feature sets using the Python port of the openSMILE toolkit: the comprehensive ComParE functionals set (6,373 features; [ 23 ]) and the psychologically relevant eGeMAPS v02 set (88 features; [ 24 ]). 2.3.3 Text-Based Feature Extraction We extracted three distinct types of text-based features: Lexical and Basic Prosodic Metrics: A set of 18 features computed from the transcripts and audio timings, including metrics such as word count, lexical diversity, speech rate, and pause ratios. The complete list of lexical and prosodic metrics is provided in Table 2 . TF-IDF Features: Term Frequency–Inverse Document Frequency vectors calculated from 1- and 2-word n-grams in the normalized transcripts; they weight each term by its rarity across the corpus, thus highlighting words and phrases that uniquely characterise an individual diary. Sentence Embeddings: State-of-the-art semantic representations were generated using the Qwen3-8B embeddings model. This open-source model was selected as its performance is comparable to leading proprietary models on public benchmarks (MTEB Leaderboard, checked 16 June 2025), while ensuring that sensitive participant data could be processed locally.
To tailor the embeddings to our specific task, each verbatim transcript was prepended with the instruction: “Generate a German embedding to analyse emotional and cognitive states from this weekly voice diary for a depression severity regression task.” This ensures the model generates representations optimized for our goal while preserving the full contextual and grammatical structure of the speech. View this table: View inline View popup Download powerpoint Table 2. List of Lexical and Prosodic Metrics 2.4 Predictive Modelling Predictive-modelling pipeline. We predicted the severity of depressive symptoms with a machine-learning pipeline that (1) standardised each feature matrix, (2) reduced dimensionality by principal-component analysis (PCA), and (3) fitted a support-vector regression (SVR) model. The ground-truth label was the Beck Depression Inventory (BDI) score recorded in the ReMAP app within ±7 days of the corresponding voice diary. A separate pipeline was trained for every single-modal feature set and for each predefined multimodal combination. Software. All analyses were run in Python 3.11 using scikit-learn 1.6; PCA and SVR were GPU-accelerated with cuML 25.02. Cross-validation and hyper-parameter tuning. Model performance was estimated with nested, participant-stratified five-fold cross-validation (GroupKFold in both outer and inner loops). The outer loop produced unbiased test estimates, whereas the inner loop tuned the SVR kernel, regularisation strength (C), ε-insensitive loss width, and the number of retained PCA components. Hyper-parameter search was orchestrated by Optuna’s median-pruner algorithm. Performance is reported as mean absolute error (MAE) and explained variance ($R^2$). Baselines. As points of reference we fitted (i) a DummyRegressor with strategy=“mean” for all regression tasks and (ii) a DummyClassifier with strategy=“stratified” for the HC vs. MDD supplementary classification.
Supplementary analyses. To contextualise the regression results we additionally (i) trained a binary classifier to distinguish healthy controls (HC) from participants with major depressive disorder (MDD), reporting balanced accuracy and AUROC, and (ii) repeated the full regression pipeline on the MDD subgroup alone to verify that performance reflected intra-group symptom variation rather than diagnostic differences. Statistical significance. Significance was assessed with 1,000 label-shuffling permutations per feature set and 1,000 paired permutations comparing the leading models. Two-tailed p-values were computed as $(r + 1)/(N + 1)$, and all permutations were executed within the outer cross-validation folds to preserve the nested data-splitting structure. 2.5 Topic Modelling To gain insight into the linguistic themes associated with depression severity, we conducted an unsupervised topic analysis using BERTopic [ 11 ] on the best-performing text embeddings (Qwen3-8B). The BERTopic pipeline employed UMAP (Uniform Manifold Approximation and Projection) for dimensionality reduction and HDBSCAN for clustering. The topic modelling was performed on the lemmatized transcripts to ensure coherent keyword representations. The resulting topics were manually labelled based on an inspection of each topic’s dominant keywords and a review of its most representative documents. After automatically reducing the number of topics to merge semantically similar clusters, we performed a series of statistical tests. A Kruskal-Wallis H-test was used to assess overall differences in BDI scores across topics, followed by pairwise Mann-Whitney U tests (with Benjamini-Hochberg FDR correction) to identify which specific themes differed significantly. Cliff’s Delta (δ) was calculated as a non-parametric measure of effect size. 3 Results 3.1 Predictive Modelling We evaluated a range of feature modalities for their ability to predict continuous BDI scores ( Table 3 ).
All feature-based models substantially outperformed the dummy regressor (MAE = 6.24). The model based solely on Qwen3-8B sentence embeddings achieved an MAE of 4.45 and an $R^2$ of 0.35, decisively outperforming every classical single-modality alternative (best classical: TF–IDF, MAE = 5.25, $R^2$ = 0.19). View this table: View inline View popup Download powerpoint Table 3. Predictive Performance of Feature Sets for BDI Score Regression. All models are Support Vector Regressors with hyperparameters tuned via 5-fold nested cross-validation. MAE = Mean Absolute Error (lower is better); $R^2$ = Coefficient of Determination (higher is better); SD = Standard Deviation across outer folds. Best single modality and multimodal combination are highlighted in bold. Adding shallow features yielded small but consistent gains. Two combinations, namely embeddings + lexical–prosodic metrics and embeddings + TF–IDF, both yielded modest performance improvements (best: MAE = 4.39, $R^2$ = 0.35). Paired permutation tests confirmed that each of these multimodal models was significantly superior to embeddings alone ( p = 0.021 and p = 0.018, respectively), although the absolute improvements were modest (ΔMAE < 0.07). No other fusion, acoustic or otherwise, matched this level of benefit. Label-shuffling permutation tests (1,000 iterations per feature set) verified that every reported model performed above chance ( p < 0.001). Effect sizes mirrored the descriptive results: embeddings delivered a large performance leap over the strongest classical baseline (ΔMAE = 0.80, Δ$R^2$ = 0.16), whereas the multimodal additions provided only incremental refinement. To benchmark the robustness of our findings, we conducted two validity checks. First, we trained binary classifiers to distinguish participants with Major Depressive Disorder (MDD) from healthy controls (HC).
Results (see Table 4 ) showed that the best-performing models combined Qwen3-8B embeddings with either lexical–prosodic or eGeMAPS acoustic features, each achieving a balanced accuracy of 0.70. The embeddings alone reached 0.68, markedly surpassing the strongest classical baseline (TF–IDF, 0.62) and all acoustic-only models, confirming that the semantic information captured by large-scale embeddings carries a strong, readily extractable signal of depression status. Label-shuffling permutation tests confirmed that every classification model performed significantly above chance ( p < 0.05). View this table: View inline View popup Download powerpoint Table 4. Classification performance (mean (SD) across outer folds) for distinguishing participants with major depressive disorder from healthy controls. Balanced accuracy and area under the ROC curve (AUC-ROC) are averaged across outer folds of nested cross-validation; the dummy baseline predicts the majority class. Second, to test whether our models were simply separating healthy controls from patients rather than tracking symptom variation within a clinical cohort, we repeated the entire predictive-modelling pipeline using only the 1,271 diaries contributed by the 128 MDD participants ( Table 5 ). In this more challenging setting the Qwen3-8B embeddings were the only feature set to demonstrate predictive power, achieving an MAE of 6.01 and an $R^2$ of 0.15. In contrast, all other feature modalities (lexical, acoustic, and TF-IDF) performed at, or slightly below, the dummy baseline (MAE = 7.37; $R^2$ = –0.06). View this table: View inline View popup Download powerpoint Table 5. Predictive Performance within MDD Cohort Only (N=128). All models are Support Vector Regressors with hyperparameters tuned via 5-fold nested cross-validation. MAE = Mean Absolute Error; $R^2$ = Coefficient of Determination. Best performance is highlighted in bold.
3.2 Topic Modelling To understand the linguistic content driving the predictive performance, we conducted an exploratory topic analysis using BERTopic. This data-driven approach yielded 10 distinct, interpretable topics within the speech diaries that we labelled according to the dominant keywords and most representative documents (see Table 6 ). A Kruskal-Wallis H-test revealed a highly significant association between BDI scores and these discovered topics (H = 663.92, p < .001). View this table: View inline View popup Download powerpoint Table 6. Topic Summary and BDI Statistics. Topics are ordered from highest to lowest Mean BDI score. Documents (n) = number of voice diary entries assigned to each topic; Subjects (N) = number of unique participants contributing to each topic. The outlier topic has been excluded from this analysis. Post-hoc pairwise comparisons (Mann-Whitney U tests with FDR correction) confirmed that specific topics were strongly linked to symptom severity. In particular, themes reflecting emotional distress such as “Persistent Low Mood” (Mean BDI = 14.03) and “Pain Distress” (Mean BDI = 12.24) were associated with the highest depression scores. The most pronounced difference was found between “Persistent Low Mood” and the activity-focused topic “Planning & Leisure” (Mean BDI = 0.67), which yielded the largest effect size in the analysis (Cliff’s δ = 0.99, p < .001). The distribution of BDI scores for each topic is visualized in Figure 1 . To visualize the relationships between these themes, we computed their pairwise semantic similarity, revealing the distinct topical structure of the diaries ( Figure 2 ). Download figure Open in new tab Figure 1. Distribution of Beck Depression Inventory (BDI) scores across 10 machine-learning-derived topics. The topics are arranged as violin plots along the horizontal axis in descending order of their mean BDI score. This ordering reveals a clear severity gradient.
The leftmost topics, such as “Persistent Low Mood” and “Pain Distress”, are visibly associated with higher BDI scores, whereas the rightmost topics, centered on daily activities like “Planning & Leisure”, correspond to lower scores. Download figure Open in new tab Figure 2. Pairwise cosine similarities among the ten BERTopic themes are displayed as a hierarchically ordered heat-map (Qwen3-8B embeddings; diagonal fixed at 1.0). A compact high-similarity block (deep blue) links the routine narratives Routine Stress, Social Joy and Casual Updates, while a second, looser block groups the domestic-routine themes Planning & Leisure and Travel & Partner. In contrast, the health-related topics diverge markedly: Rehab Progress shows only moderate affinity to Pain Distress, and Rebuilding Stability occupies the most isolated position in the matrix, underscoring its distinct lexical profile. The pale band separating these regions confirms the semantic distance between day-to-day event descriptions and more introspective health reflections, complementing the super-cluster pattern observed in the UMAP projection. To quantify the predictive utility of these themes alone, we trained a separate regression model using only the topic probabilities as features. This topic-only model was able to predict BDI scores with an MAE of 5.68 ± 0.77 on 5-fold cross-validation, demonstrating that the discovered topics themselves capture a substantial and clinically meaningful portion of the variance in depression severity. 4 Discussion This study developed and evaluated a pipeline for predicting depressive symptom severity from repeated, naturalistic voice diaries. Our findings yielded two principal insights: first, that large-scale sentence embeddings significantly outperform classical text and acoustic features for this task; and second, that an exploratory topic analysis of these embeddings can reveal clinically coherent themes that are strongly associated with depression severity. 
Our primary predictive finding is the clear superiority of the Qwen3-8B sentence embeddings over all classical feature approaches. This result strongly supports the growing consensus that the semantic content of speech, as captured by large language models, is the dominant signal for detecting depressive states. While adding either lexical–prosodic or TF–IDF features yielded slightly better scores (best: embeddings + TF–IDF, MAE = 4.39; R 2 = 0.35), this represents only a modest improvement, suggesting that embeddings capture the vast majority of the predictive signal. The small additional contribution from features like speech duration and pause ratios warrants further investigation, but it may indicate that basic speech timing and word rarity provide complementary, non-semantic information. Our findings clearly demonstrate that modern language model representations have fundamentally shifted the landscape of speech-based depression detection. Our sensitivity analysis, which evaluated the model exclusively on the MDD cohort, further strengthens this conclusion. This result strongly suggests that while many features can distinguish between healthy and depressed states, only the rich semantic representations from the Qwen3-8B embeddings are capable of capturing the nuanced variance in symptom severity within a clinical population. A central contribution of this study is to show that high-performance models can be made interpretable rather than remaining inscrutable “black boxes”. By applying BERTopic to the embeddings, we moved beyond prediction and towards interpretation. Our data-driven approach independently uncovered topics that align closely with established clinical knowledge. The strong association between higher BDI scores and themes like “Pain Distress” and “Persistent Low Mood” suggests our model is sensitive to both somatic complaints and the language of internal coping or distress. 
This finding corroborates previous work showing that themes of self-focus and negative cognitive patterns are robust linguistic markers of depression [ 25 – 27 ]. Crucially, the ability of a separate model to predict BDI scores using only these topic probabilities confirms their clinical relevance. This dual approach of using embeddings for prediction and topic modelling for interpretation offers a powerful model for future research. It allows for the development of accurate screening tools while also providing a pathway for clinicians to understand the potential drivers of a high predicted score, thereby fostering trust and aiding therapeutic dialogue [ 28 ]. The present findings highlight the potential of using speech samples collected in naturalistic settings via participants’ personal devices, combined with advanced natural language processing techniques, for monitoring depressive symptoms. This approach offers a scalable and ecologically valid method for digital phenotyping in mental health. Future research should investigate the generalizability of these findings across diverse populations, contexts, and technological platforms. Given the potential utility of such tools, further work is warranted that includes patient and public involvement and integrates the perspectives of caregivers and clinicians. This would help identify feasible and acceptable pathways for implementing speech-based monitoring in both research and clinical practice. Although not directly assessed in this study, it is worth considering the possibility that speech-based assessments may be perceived as less burdensome than traditional methods, such as text-based patient-reported outcomes or ecological momentary assessments. The act of recording speech can be conveniently integrated into daily activities, a concept already applied in emerging clinical decision support systems such as DAX Copilot 1 , which could potentially improve user engagement and adherence. 
Several limitations should be acknowledged. First, our study relies on self-reported BDI scores as the primary outcome measure. While we previously demonstrated high correspondence between smartphone-derived self-reports via ReMAP and external clinical ratings of depression severity [ 17 ], the reliance on self-report remains a methodological consideration. Second, our findings are based on a German-speaking cohort recruited primarily from two metropolitan areas (Marburg and Münster) using a bespoke research application (ReMAP). The specific demographics and clinical characteristics of this sample may therefore limit the broader generalizability of our findings to other languages, cultures, and healthcare systems. Third, our analysis is fundamentally correlational in nature; while we can demonstrate strong associations between linguistic themes and BDI scores, we cannot establish causal relationships between specific language patterns and depressive symptoms. Finally, the naturalistic data collection approach, while ecologically valid, introduces variability in recording conditions and speech content that may affect model performance in more controlled clinical settings. This study demonstrates that modern, open-source language models can predict depression severity from repeated, smartphone-collected voice diaries. We have shown that this predictive power is not opaque; it is driven by interpretable, clinically relevant linguistic themes that can be discovered directly from the data. By combining state-of-the-art language model embeddings with data-driven interpretation, this work represents a meaningful step towards developing scalable, non-invasive, and understandable tools for monitoring mental health in the real world. 
Data Availability As data was derived from multiple study protocols with different limitations, the potential sharing of data would need to be discussed and evaluated on a case-by-case basis with the senior authors and different study leaders. Author Contributions TH and NO had the initial idea of the ReMAP app and designed the project from which these data are drawn. DE conceived and designed the study, developed the ReMAP app, implemented all preprocessing and modelling code, performed statistical analyses, created visualisations and wrote the main manuscript text. MR, AC, RL, JH, VH and EL contributed to study design, data interpretation and substantive manuscript revisions. LG, AF, RB, JG, NW, JS, SM, TB, KF, FS, NA and HJ coordinated participant recruitment, collected speech diaries and associated clinical data, performed data quality assurance and edited the manuscript. JR, CD, RR, UEP, IN, TK, UD, TH and NO provided clinical oversight, project supervision and edited the manuscript. All authors provided critical feedback and approved the final version. DE and NO are the guarantors of the study. Acknowledgements This work was funded in part by the consortia grants from the German Research Foundation (DFG) FOR 2107 and SFB/TRR 393 (project grant no 521379614), as well as the DYNAMIC center, funded by the LOEWE program of the Hessian Ministry of Science and Arts (grant number: LOEWE1/16/519/03/09.001(0009)/98). 
Footnotes Title shortened to remove "non-invasive" for concision; author list expanded (adds Jonathan Repple and Christian Dobel) and multiple affiliations corrected; Abstract rewritten with updated metrics (MAE = 4.45, R2 = 0.35) and new comparison highlighting a 16-point gain over TF-IDF instead of the earlier "modest improvement over embeddings" wording; Methods section now specifies software versions, Optuna search space and 1,000-iteration permutation tests; Results section integrates binary HC-vs-MDD classification (Table 4) and within-MDD sensitivity analysis (Table 5) that were previously in the Supplement; wording throughout adjusted for clarity, consistent terminology; all tables and figures renumbered and Figure 2 updated (heatmap now in main text); supplementary material removed. ↵ 1 Microsoft Dragon Copilot | Microsoft Cloud for Healthcare. Available at: https://www.microsoft.com/en-us/health-solutions/clinical-workflow/dragon-copilot References 1. ↵ Sartorius , N . The economic and social burden of depression. eng . The Journal of Clinical Psychiatry 62 Suppl 15 , 8 – 11 . issn: 0160-6689 ( 2001 ). OpenUrl CrossRef PubMed Web of Science 2. ↵ König , H. et al. The excess costs of depression: a systematic review and meta-analysis. en . Epidemiology and Psychiatric Sciences 29 . Publisher: Cambridge University Press (CUP) . issn: 2045-7960, 2045-7979. doi: 10.1017/s2045796019000180 . https://www.cambridge.org/core/product/identifier/S2045796019000180/type/journal_article ( 2025 ) (2020). OpenUrl CrossRef 3. ↵ Melcher , J. et al. Digital phenotyping for mental health of college students: a clinical review. en . Evidence Based Mental Health 23 . Publisher: BMJ , 161 – 166 . issn: 1362-0347, 1468-960X. doi: 10.1136/ebmental-2020-300180 . https://mentalhealth.bmj.com/lookup/doi/10.1136/ebmental-2020-300180 ( 2025 ) (Nov. 2020). OpenUrl CrossRef 4. ↵ Bufano , P. et al. Digital Phenotyping for Monitoring Mental Disorders: Systematic Review. en . 
Journal of Medical Internet Research 25 . Publisher: JMIR Publications Inc ., e46778 . issn: 1438-8871. doi: 10.2196/46778 . https://www.jmir.org/2023/1/e46778 ( 2025 ) (Dec. 2023). OpenUrl CrossRef PubMed 5. ↵ Riad , R. et al. Automated Speech Analysis for Risk Detection of Depression, Anxiety, Insomnia, and Fatigue: Algorithm Development and Validation Study. en . Journal of Medical Internet Research 26 . Publisher: JMIR Publications Inc ., e58572. issn: 1438-8871. doi: 10.2196/58572 . https://www.jmir.org/2024/1/e58572 ( 2025 ) (Oct. 2024). OpenUrl CrossRef 6. ↵ Wankhade , M. et al. A survey on sentiment analysis methods, applications, and challenges. en . Artificial Intelligence Review 55 . Publisher: Springer Science and Business Media LLC , 5731 – 5780 . issn: 0269-2821, 1573-7462. doi: 10.1007/s10462-022-10144-1 . https://link.springer.com/10.1007/s10462-022-10144-1 ( 2025 ) (Oct. 2022). OpenUrl CrossRef 7. ↵ Devlin , J. , et al. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding en. in Proceedings of the 2019 Conference of the North ( Association for Computational Linguistics, Minneapolis, Minnesota , 2019 ), 4171 – 4186 . doi: 10.18653/v1/N19-1423 . http://aclweb.org/anthology/N19-1423 (2025). OpenUrl CrossRef 8. ↵ Hossain , M. M. et al. Multi task opinion enhanced hybrid BERT model for mental health analysis. en . Scientific Reports 15 . Publisher: Springer Science and Business Media LLC . issn: 2045-2322. doi: 10.1038/s41598-025-86124-6 . https://www.nature.com/articles/s41598-025-86124-6 ( 2025 ) (Jan. 2025). OpenUrl CrossRef 9. ↵ Pourkeyvan , A. et al. Harnessing the Power of Hugging Face Transformers for Predicting Mental Health Disorders in Social Networks . IEEE Access 12 . Publisher: Institute of Electrical and Electronics Engineers (IEEE) , 28025–28035. issn: 2169-3536. doi: 10.1109/access.2024.3366653 . https://ieeexplore.ieee.org/document/10438433/ ( 2025 ) (2024). OpenUrl CrossRef 10. ↵ Bey , R. et al. 
Natural language processing of multi-hospital electronic health records for public health surveillance of suicidality. en . npj Mental Health Research 3 . Publisher: Springer Science and Business Media LLC . issn: 2731-4251. doi: 10.1038/s44184-023-00046-7 . https://www.nature.com/articles/s44184-023-00046-7 ( 2025 ) (Feb. 2024). OpenUrl CrossRef 11. ↵ Grootendorst , M. BERTopic: Neural topic modeling with a class-based TF-IDF procedure Version Number: 1. 2022 . doi: 10.48550/ARXIV.2203.05794 . https://arxiv.org/abs/2203.05794 (2025). OpenUrl CrossRef 12. ↵ Irving , J. et al. Using Natural Language Processing on Electronic Health Records to Enhance Detection and Prediction of Psychosis Risk. en . Schizophrenia Bulletin 47 . Publisher: Oxford University Press (OUP) , 405 – 414 . issn: 0586-7614, 1745-1701. doi: 10.1093/schbul/sbaa126 . https://academic.oup.com/schizophreniabulletin/article/47/2/405/5918729 ( 2025 ) (Mar. 2021). OpenUrl CrossRef 13. ↵ Cummins , N. et al. A review of depression and suicide risk assessment using speech analysis. en . Speech Communication 71 . Publisher: Elsevier BV , 10 – 49 . issn: 0167-6393. doi: 10.1016/j.specom.2015.03.004 . https://linkinghub.elsevier.com/retrieve/pii/S0167639315000369 ( 2025 ) (July 2015). OpenUrl CrossRef 14. ↵ Gratch , Jonathan et al. The Distress Analysis Interview Corpus of human and computer interviews in Proceedings of LREC 2014 ( European Language Resources Association (ELRA) , Reykjavik, Iceland , 2014 ), 3123 – 3128 . 15. ↵ Low , D. M. et al. Automated assessment of psychiatric disorders using speech: A systematic review. en . Laryngoscope Investigative Otolaryngology 5 . Publisher: Wiley , 96 – 116 . issn: 2378-8038, 2378-8038. doi: 10.1002/lio2.354 . https://onlinelibrary.wiley.com/doi/10.1002/lio2.354 ( 2025 ) (Feb. 2020). OpenUrl CrossRef 16. ↵ Torous , J. et al. New Tools for New Research in Psychiatry: A Scalable and Customizable Platform to Empower Data Driven Smartphone Research. en . 
JMIR Mental Health 3 . Publisher: JMIR Publications Inc ., e16. issn: 2368-7959. doi: 10.2196/mental.5165 . http://mental.jmir.org/2016/2/e16/ ( 2025 ) (May 2016). OpenUrl CrossRef PubMed 17. ↵ Goltermann , J. et al. Smartphone-Based Self-Reports of Depressive Symptoms Using the Remote Monitoring Application in Psychiatry (ReMAP): Interformat Validation Study. en . JMIR Mental Health 8 . Publisher: JMIR Publications Inc ., e24333. issn: 2368-7959. doi: 10.2196/24333 . https://mental.jmir.org/2021/1/e24333 ( 2025 ) (Jan. 2021). OpenUrl CrossRef 18. ↵ Emden , D. et al. Technical feasibility and adherence of the Remote Monitoring Application in Psychiatry (ReMAP) for the assessment of affective symptoms. en . Journal of Affective Disorders 294 , 652 – 660 . issn: 01650327. doi: 10.1016/j.jad.2021.07.030 . https://linkinghub.elsevier.com/retrieve/pii/S0165032721007096 ( 2025 ) (Nov. 2021). OpenUrl CrossRef 19. ↵ Vogelbacher , C. et al. The Marburg-Münster Affective Disorders Cohort Study (MACS): A quality assurance protocol for MR neuroimaging data. en . NeuroImage 172 , 450 – 460 . issn: 10538119. doi: 10.1016/j.neuroimage.2018.01.079 . https://linkinghub.elsevier.com/retrieve/pii/S105381191830079X ( 2025 ) (May 2018). OpenUrl CrossRef 20. ↵ Kircher , T. et al. Neurobiology of the major psychoses: a translational perspective on brain structure and function—the FOR2107 consortium. en . European Archives of Psychiatry and Clinical Neuroscience 269 , 949 – 962 . issn: 1433-8491. doi: 10.1007/s00406-018-0943-x . 10.1007/s00406-018-0943-x ( 2025 ) (Dec. 2019). OpenUrl CrossRef 21. ↵ First , Michael B. et al. Structured Clinical Interview for DSM-IV Axis I Disorders (SCID-I) ( American Psychiatric Press , 1997 ). 22. ↵ Radford , A. , et al. Robust Speech Recognition via Large-Scale Weak Supervision arXiv:2212.04356 [eess]. Dec. 2022 . doi: 10.48550/arXiv.2212.04356 . http://arxiv.org/abs/2212.04356 (2025). OpenUrl CrossRef 23. ↵ Eyben , F. , et al. 
Opensmile: the munich versatile and fast open-source audio feature extractor en. in Proceedings of the 18th ACM international conference on Multimedia ( ACM, Firenze Italy , Oct. 2010 ), 1459 – 1462 . isbn: 9781605589336. doi: 10.1145/1873951.1874246 . https://dl.acm.org/doi/10.1145/1873951.1874246 (2025). OpenUrl CrossRef 24. ↵ Eyben , F. et al. The Geneva Minimalistic Acoustic Parameter Set (GeMAPS) for Voice Research and Affective Computing . IEEE Transactions on Affective Computing 7 , 190 – 202 . issn: 1949-3045. doi: 10.1109/TAFFC.2015.2457417 . http://ieeexplore.ieee.org/document/7160715/ ( 2025 ) (Apr. 2016). OpenUrl CrossRef 25. ↵ Rude , S. et al. Language use of depressed and depression-vulnerable college students. en . Cognition & Emotion 18 . Publisher: Informa UK Limited , 1121 – 1133 . issn: 0269-9931, 1464-0600. doi: 10.1080/02699930441000030 . http://www.tandfonline.com/doi/abs/10.1080/02699930441000030 ( 2025 ) (Dec. 2004). OpenUrl 26. Ramirez-Esparza , N. et al. The Psychology of Word Use in Depression Forums in English and in Spanish: Testing Two Text Analytic Approaches . Proceedings of the International AAAI Conference on Web and Social Media 2 . Publisher: Association for the Advancement of Artificial Intelligence (AAAI) , 102–108. issn: 2334-0770, 2162-3449. doi: 10.1609/icwsm.v2i1.18623 . https://ojs.aaai.org/index.php/ICWSM/article/view/18623 ( 2025 ) (Sept. 2021). OpenUrl CrossRef 27. ↵ De Choudhury , M. et al. Predicting Depression via Social Media . Proceedings of the International AAAI Conference on Web and Social Media 7 . Publisher: Association for the Advancement of Artificial Intelligence (AAAI) , 128 – 137 . issn: 2334-0770, 2162-3449. doi: 10.1609/icwsm.v7i1.14432 . https://ojs.aaai.org/index.php/ICWSM/article/view/14432 ( 2025 ) (Aug. 2021). OpenUrl CrossRef 28. ↵ Jim , J. R. et al. Recent advancements and challenges of NLP-based sentiment analysis: A state-of-the-art review. en . Natural Language Processing Journal 6 . 
Publisher: Elsevier BV , 100059 . issn: 2949-7191. doi: 10.1016/j.nlp.2024.100059 . https://linkinghub.elsevier.com/retrieve/pii/S2949719124000074 ( 2025 ) (Mar. 2024). OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted August 03, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Scalable depression monitoring with smartphone speech: a multimodal benchmark and topic analysis Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Scalable depression monitoring with smartphone speech: a multimodal benchmark and topic analysis Daniel Emden , Maike Richter , Astrid Chevance , Ramona Leenings , Julian Herpertz , Lara Gutfleisch , Anna Fleuchaus , Rogério Blitz , Vincent L. Holstein , Janik Goltermann , Nils R. Winter , Jennifer Spanagel , Susanne Meinert , Tiana Borgers , Kira Flinkenflügel , Frederike Stein , Nina Alexander , Hamidreza Jamalabadi , Jonathan Repple , Christian Dobel , Elisabeth J. Leehr , Ronny Redlich , Ulrich Ebner-Priemer , Igor Nenadić , Tilo Kircher , Udo Dannlowski , Tim Hahn , Nils Opel medRxiv 2025.07.17.25331744; doi: https://doi.org/10.1101/2025.07.17.25331744 Share This Article: Copy Citation Tools Scalable depression monitoring with smartphone speech: a multimodal benchmark and topic analysis Daniel Emden , Maike Richter , Astrid Chevance , Ramona Leenings , Julian Herpertz , Lara Gutfleisch , Anna Fleuchaus , Rogério Blitz , Vincent L. Holstein , Janik Goltermann , Nils R. 
Winter , Jennifer Spanagel , Susanne Meinert , Tiana Borgers , Kira Flinkenflügel , Frederike Stein , Nina Alexander , Hamidreza Jamalabadi , Jonathan Repple , Christian Dobel , Elisabeth J. Leehr , Ronny Redlich , Ulrich Ebner-Priemer , Igor Nenadić , Tilo Kircher , Udo Dannlowski , Tim Hahn , Nils Opel medRxiv 2025.07.17.25331744; doi: https://doi.org/10.1101/2025.07.17.25331744 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Psychiatry and Clinical Psychology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6600) Geriatric Medicine (668) Health Economics (997) Health Informatics (4538) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (541) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3333) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9232) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology 
(265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00dd4cc5ed1df88',t:'MTc3OTY0MTc2MA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00