Automated Detection of Speech Disorders in Parkinson’s Disease using Deep Convolutional Neural Networks: A Pilot Study

doi:10.1101/2025.07.18.25331737

Automated Detection of Speech Disorders in Parkinson’s Disease using Deep Convolutional Neural Networks: A Pilot Study

2025 · doi:10.1101/2025.07.18.25331737

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 39,775 characters · extracted from preprint-html · click to expand

Automated Detection of Speech Disorders in Parkinson’s Disease using Deep Convolutional Neural Networks: A Pilot Study | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Automated Detection of Speech Disorders in Parkinson’s Disease using Deep Convolutional Neural Networks: A Pilot Study Sara A. Jones , Jeremy Cosgrove , He Wang , Ryan K. Mathew doi: https://doi.org/10.1101/2025.07.18.25331737 Sara A. 
Jones 1 The University of Leeds Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jeremy Cosgrove 2 Leeds Teaching Hospitals NHS Trust Find this author on Google Scholar Find this author on PubMed Search for this author on this site He Wang 3 University College London Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: r.k.mathew{at}leeds.ac.uk Ryan K. Mathew 4 University of Leeds and Leeds Teaching Hospitals NHS Trust Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: r.k.mathew{at}leeds.ac.uk Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background Patients with Parkinson’s disease (PD) frequently exhibit deficits in functional communication due to the presence of speech disorders associated with dysarthria that can be characterized by monotony of pitch (or fundamental frequency), reduced loudness, irregular rate of speech, imprecise consonants, and changes in voice quality. This pilot study investigates the application of a speech classifier based on deep-convolutional neural networks (CNNs) for aiding early diagnosis of PD. Methods In this study, we analyse the performance capabilities of two audio feature extraction techniques and associated model architectures: low-level time-frequency based features classified using a Support Vector Machine (SVM); and classifying log mel-spectrograms of segmented audio signals using varying depths of Deep Convolutional Neural Networks. The models were trained using an open-source data set comprised of 73 audio recordings of continuous dialogue from 37 subjects, including 16 people with PD (5 females and 11 males) and 21 healthy controls (19 females and 2 males), who were required to perform two speech production tasks. 
Results The experimental results show that the deep CNN model, trained on the log mel-spectrograms of 5-second segmented audio signals, can successfully differentiate PD subjects from healthy controls (HC) with a mean accuracy of 84.7%, sensitivity of 87.9% and specificity of 89.4%, thus demonstrating its potential for aiding early diagnosis of PD in a clinical setting. The saliency maps show that the deep CNN model can distinguish between PD participants and healthy controls by detecting centralised, low-frequency regions of the spectrograms representing the speech of PD subjects, whereas a larger range of frequencies is detected in the spectrograms representing speech from healthy controls. 1 Introduction Parkinson’s disease (PD) is a chronic, progressive, incurable neurodegenerative disorder resulting from the degeneration and subsequent loss of dopaminergic neurons within the substantia nigra, part of the basal ganglia within the brain. The disease, which affects approximately 0.2 % (incidence of 15-20 per 100,000) of the general population in the UK [ 1 ], is currently diagnosed based on motor parkinsonism - the presence of bradykinesia (slowness of movement) in combination with either rigidity or resting tremor [ 2 ]. However, approximately 50–70 % of nigrostriatal dopaminergic neurons have degenerated before neurologists and geriatricians are able to establish the diagnosis of PD according to these widely accepted clinical diagnostic criteria [ 3 ]. There is substantial research attention on the early detection of PD, as this would allow more focused clinical trials and earlier delivery of a disease modifying medication, if and when this becomes available. 
Prior studies indicate that between 70 % and 90 % [ 4 ][ 5 ] of individuals with PD exhibit hypokinetic dysarthria (HD), a motor speech disorder characterised by a reduction in speech intensity (hypophonia), altered voice quality (dysphonia), flattened pitch inflection (monopitch), loss of stress (monoloudness), festination, breathiness, irregular pauses and hesitancy of speech [ 6 ]. Evidence suggests that voice dysfunction is the earliest indicator of motor impairment and may manifest 6-7 years before a clinical diagnosis of PD; i.e., preceding the onset of other motor features [ 7 ]. Therefore, automated acoustic analysis is considered by many researchers as an effective, non-invasive tool for PD screening and early detection. Current approaches to detecting neurodegenerative disorders through speech analysis predominantly focus on extracting perturbed features from repeated phonations of sustained vowels [ 8 ][ 9 ][ 10 ] to minimise acoustic variability between participants (e.g. lexical variations) and facilitate robust feature extraction. Whilst sustained phonation may be beneficial in eliciting some aspects of dysphonia, recent studies [ 11 ][ 12 ] suggest that they lack the high-level prosodic information (e.g. variations of fundamental frequency, pauses and phonation onsets) required to detect the wide range of articulatory deficits related to hypokinetic dysarthria. Pompili et al. [ 11 ] propose that a more articulatory demanding speech task than sustained phonation is required to capture subtle changes of speech due to neurodegeneration and therefore extract features obtained from natural voice production (running speech) in which subjects are instructed to speak a pre-devised sentence or passage containing representative linguistic units. 
The work presented here focuses on two complex speech production tasks, spontaneous dialogue and text-dependent running speech, which require sustained attention and therefore have a tendency to elicit articulatory deficits in the speech of PD subjects compared with simple tasks (e.g. repetition of isolated words and vowels). To the best of our knowledge, there are no studies that specifically focus on comparing complex speech production tasks in terms of their utility for automatic PD discrimination. Several studies have utilised time-frequency based features extracted from speech signals to classify between PD and control subjects, including jitter (temporal perturbation of the fundamental frequency), shimmer (temporal perturbation of the amplitude of the signal), pitch, amplitude perturbation quotient (APQ), noise-to-harmonic ratios (NHR), proportion of the vocalic duration and degree of voiced breaks [ 13 ][ 14 ]. Whilst short-term fluctuations in periodic vocal samples of phonatory signals can be quantified using low-level acoustic descriptors (LLD) such as jitter and shimmer, they may be unsuitable for severely disordered voices, which are typically characterised by poor periodicity and from which a period of sustained phonation is harder to extract [ 15 ]. In recent years, as an alternative to using classical feature extraction methods to model pathological speech, deep learning methods have been successfully implemented to evaluate specific phenomena in speech, including the detection and monitoring of PD speech disorders [ 16 ][ 17 ]. Korzekwa et al. [ 17 ] proposed a novel deep learning approach for the detection and reconstruction of dysarthric speech in which the authors use a Recurrent Convolutional Neural Network (RCNN) to encode the input mel-spectrogram, reconstruct the spectrogram from a low-dimensional dysarthric latent space and encoded text and apply a logistic classification model to predict the probability of dysarthric speech. 
By contrast, a study into the ability of Convolutional Neural Networks (CNN) to learn representative features directly from raw audio signals reveals that whilst the networks are able to autonomously discover frequency decompositions from raw audio, this method does not outperform spectrogram-based approaches [ 18 ]. Piczak [ 19 ] demonstrated that by representing an audio input using time-frequency representations (i.e., log-scaled mel-spectrograms), it is possible to directly utilise architectures that were initially developed for image data processing, such as CNNs, to classify environmental and urban audio samples. However, a 2-layered network is considered shallow in comparison to networks used by other applications, which is a shortcoming of the architecture used by [ 19 ]. The ability of convolutional networks to generate highly accurate results is dependent on the depth of the network, hence using deeper networks could improve the accuracy of vocal disorder detection. Therefore we intend to demonstrate the capabilities of deep learning in the detection of vocal disorders when deeper networks are used and demonstrate that an increase in the number of layers leads to an increase in the ability of the network in discriminating between PD and control subjects. This study focuses on identifying the optimal audio feature extraction technique and model architecture for PD vocal disorder detection in spontaneous dialogue and text-dependent running speech. Our contributions can hence be summarized as follows: We compare two complex speech production tasks (spontaneous dialogue and text-dependent running speech) in terms of their utility for automatic PD discrimination. 
We analyse the performance capabilities of two audio feature extraction techniques and associated model architectures: low-level time-frequency based features classified using a Support Vector Machine (SVM); and classifying mel-spectrograms of segmented audio signals using varying depths of Deep Convolutional Neural Networks (DCNNs). We study how the depth (the number of layers) of the deep neural networks affects the performance of PD vocal disorder detection capability of the network. We evaluate and compare the saliency maps generated by the Deep Convolutional Neural Network in identifying regions of the time-frequency waveform that provide an indication of hypokinetic dysarthria. 2 Methods The Mobile Device Voice Recordings at King’s College London (MDVR-KCL) dataset used in this paper is open-source [ 20 ], therefore a separate ethics review was not required for the use of open-source data in secondary data analysis. The original study was reviewed and approved by the London - Dulwich Research Ethics Committee (REC reference 17/LO/0909). All subjects signed written informed consent to participate in this study. 2.1 Dataset Description and Audio Pre-processing The Mobile Device Voice Recordings at King’s College London (MDVR-KCL) data set [ 20 ] consists of 73 audio recordings of continuous dialogue from 37 subjects, including 16 people with PD (5 females and 11 males) and 21 healthy controls (HC) (19 females and 2 males). Subjects were required to perform two speech production tasks; to recite a passage from the fable “The North Wind and the Sun” and to engage in spontaneous dialog with the test executor about places of interest, local traffic, or personal interests. The participants were interviewed in a standard examination room, measuring approximately 10 × 10 m and with a reverberation time of 0.5 seconds. The audio signals were recorded using a Motorola Moto G4 Smartphone at a sampling rate of 44.1 kHz with 16-bit resolution. 
Subjects held the recording device adjacent to their preferred ear, such that the in-built microphone was in direct proximity to their mouth, therefore one can assume that all recordings were performed within the reverberation radius and that the recordings can be considered as “clean”. The length of the recordings varies from 73 to 221 seconds, with an average duration of 143 seconds. The audio samples were assessed by expert neurologists and the speech intelligibility of each participant was scored according to the speech components of the Unified Parkinson’s Disease Rating Scale (UPDRS)-II (Part 5), UPDRS-III (Part 18) and Hoehn & Yahr, which are five-point standard scales that measure the severity and symptom progression of PD. The clinical and demographic characteristics of the participants are presented in Table 1 . View this table: View inline View popup Download powerpoint Table 1. Demographic and clinical characteristics of subjects. Note : n = number of participants, PD = Parkinson’s Disease, M = Male, F = Female, H-Y = Hoehn & Yahr, UPDRS = Unified Parkinson’s Disease Rating Scale. Data is expressed as mean ± standard deviation. During pre-processing, we manually removed all audio segments containing dialogue spoken by the test executor, which may have otherwise negatively affected the performance of the classifiers in distinguishing between PD and HC participants. In order to augment the size of the data set, we employed a sliding window approach [ 21 ] [ 22 ] [ 23 ] to divide the original audio signal into 5 and 10 second segments with a 50% overlap in the temporal dimension between two consecutive segments, thereby generating 1575 five second segments and 806 ten second segments. Since any element of the audio signal is contained within two consecutive segments, with the exception of the first and last segments, any information loss at a boundary of a segment is recovered by referencing to the adjacent overlapping segment. 
The length of segment was chosen as this duration was deemed sufficient to capture slow-changing, long-range temporal features of the speech waveform. 2.2 Feature Extraction 2.2.1 Low-Level Acoustic Features We extracted 24 low-level quantitative vocal parameters from each audio segment using Parselmouth [ 24 ], a Python library which provides a wide range of algorithms related to the acoustic analysis, manipulation, and synthesis of speech signals. To assess the auditory features of vocal dysfunction in PD, we examined parameters relating to: rate of vocal fold vibration (fundamental frequency, F 0 , or pitch), frequency perturbation (jitter), amplitude perturbation (shimmer), changes in vocal quality (harmonics-to-noise ratio, HNR), and aperiodicity (degree of voicelessness, DUV), amongst others. Functionals characterizing statistical and temporal properties of these low-level descriptors were also computed. Linguistic features have not been extracted since the purpose of this study was to analyse the auditory characteristics of vocal dysfunction in PD, regardless of the language used. 2.2.2 Mel-Spectrogram Feature Representation In this approach, input audio signals are represented as spectrograms in which the time and frequency axes are considered as spatial dimensions so that a CNN model, commonly used in image recognition tasks, can be applied. Although any spectrogram variant is applicable, we employed the log mel-scaled spectrogram, as it reduces the resolution of high band frequencies and more accurately resembles the auditory perception capabilities of humans. Utterances shorter than the length of interval (i.e. 5 or 10 seconds) are zero-padded. The mel-scaled spectrogram is extracted as a feature from the audio segments using the Python Librosa library [ 25 ] with the original sampling frequency of 44.1 kHz, 1024 Fast Fourier Transform (FFT) points, 128 mel-bins, and a hop-length of 512 frames. 
The amplitude of the mel-spectrogram is scaled logarithmically, and the scaled mel-spectrogram is resized in the temporal dimension to fit the input size required by the CNN classifier (224 × 224 pixels). 2.3 Model Architectures In this study, we analyse the performance capabilities of two audio feature extraction techniques and associated model architectures: low-level time-frequency based features classified using a Support Vector Machine (SVM); and classifying mel-spectrograms of segmented audio signals using varying depths of Deep Convolutional Neural Networks. We employed two variants of the densely connected convolutional network (DenseNet) [ 26 ] architecture, DenseNet-121 and DenseNet-169, to classify the mel-spectrograms described in Section 2.2.2. The architectures of the DenseNet models are provided in Table 2 . DenseNet consists of a series of densely connected convolutional and transitional blocks in which the output feature maps of each layer are concatenated with the output feature maps of all successive layers, thereby providing variety in the inputs of the subsequent layers. Each layer is connected to both the previous layer and all preceding layers in a feed-forward fashion. DenseNets have several compelling advantages: they alleviate the vanishing-gradient problem, strengthen feature propagation and encourage feature re-use, whilst substantially reducing the number of network parameters. View this table: View inline View popup Download powerpoint Table 2. Sizes of outputs and convolutional kernels for different DenseNet architectures. FC: Fully Connected, conv: convolution. This table has been adapted from [ 26 ]. We compared the performance capabilities of the Deep CNNs against traditional Machine learning methodologies, whereby a Principal Component Analysis (PCA)/Support Vector Machine (SVM) based classifier is employed to classify the 24 low-level quantitative vocal descriptors detailed in Section 2.2.1. 
By calculating the eigenvectors of the covariance matrix of the original inputs, PCA linearly transforms a high-dimensional input vector into a low-dimensional input vector whose components are uncorrelated. The optimal parameters of the Radial Basis Function (RBF) kernel SVM were identified using a grid search approach and C=100 and Gamma = 0.1 exhibited the least classification error. We trained each DenseNet model for 50 epochs using batch sizes of 8 and applied the Adam optimizer [ 27 ] with an initial learning rate of l r =0.001, which was reduced by a factor of 0.01 when no improvement in validation loss (binary cross-entropy) for five consecutive epochs was observed, to ensure that the model did not overfit during the training process. To prevent group data leakage, we randomly apportioned the data into training, validation and test sets, with an approximate 70 % -10 % -20 % split, ensuring that audio segments originating from the same participant were not contained in both the testing and training data sets. To evaluate the performance of the models in terms of their ability to distinguish between PD and HC participants, we calculated four performance metrics: Accuracy, Sensitivity/Recall, Specificity and the Matthews Correlation Coefficient (MCC) [ 28 ]. 2.4 Saliency Maps To interpret the network predictions, we generated saliency maps using a gradient-based back-propagation method to visualise which regions of the input mel-spectrogram image influenced the output prediction and reveal which vocal characteristics are most indicative of PD. To highlight class-relevant pixels, the convolutional neural network propagates the output back to the input image space by calculating the gradient of the class score S class with respect to the input pixels, where S class is usually taken to be the activation of the neuron in the output layer encoding the class of interest. 
3 Results and Discussion By examining the experimental results presented in Table 3 , it can be observed that the DenseNet-169 network architecture conducted on the log-mel-spectrograms of 5-second segmented audio signals achieves the best performance, with a mean sensitivity of 87.9 % (SD = 5.6 % ) and specificity of 89.4 % (SD = 7.3 % ). The classification performance of the networks degrades as the length of the audio segment increases, which implies that 10-second segments are perhaps too long to capture subtle vocal microperturbations characteristic of PD. Furthermore, since the sizes of the input log-mel spectrogram images are constant (224×224 pixels), regardless of the duration of the audio segment, segments of a longer duration may lose temporal resolution, thereby reducing their utility for automatic PD discrimination. Learned audio features using deep CNNs marginally outperform the low-level hand-crafted audio features classified with the PCA-SVM model, which suggests that whilst short-term fluctuations in periodic vocal samples of phonatory signals can be quantified using low-level acoustic descriptors (LLD), they may be unsuitable for severely disordered voices, which are typically characterised by poor periodicity and from which a period of sustained phonation is harder to extract. View this table: View inline View popup Download powerpoint Table 3. Results for the three large language models trained on ASR-generated transcripts generated using WhisperX (including variable pauses and disfluencies). The results obtained for 10 test runs of each model are presented in the following format: Mean ±Standard Deviation. MCC: Matthews Correlation Coefficient, DN: DenseNet. The performance of the DenseNet models diminishes as the depth of the network decreases, thereby demonstrating that an increase in the number of layers leads to an increase in the ability of the network in discriminating between PD and control subjects. 
The classification performance of the three models is dependent upon the nature of the complex speech-production tasks; although the best results are achieved on the text-dependent running speech task, the results achieved with spontaneous dialogue are not significantly dissimilar, which may be because the motor-planning process required to produce spontaneous speech is more complex than monologue tasks and contributes to the appearance of articulation errors and disfluency in the speech of PD patients. By observing the saliency maps presented in Figure 1 , it is apparent that the DenseNet-169 classifier is able to distinguish between PD participants and healthy controls by detecting centralised, low-frequency regions of the spectrograms representing the speech of PD subjects, whereas a larger range of frequencies is detected in the spectrograms representing speech from healthy controls. By categorising the PD cohort into two subsets according to the subject’s H&Y score (I–II mild; III–IV severe), it is apparent that there is an equal proportion of participants with early-stage PD (n = 8) as late-stage PD (n=8). The ability of the network in distinguishing between the voices of healthy controls and subjects with early and late-stage Parkinson’s disease is reflected by the final probability distribution generated by the softmax layer for each class, such that high probabilities are associated with late-stage PD (i.e. HY:III-IV) and vice versa. Download figure Open in new tab Fig. 1. Log mel-spectrogram and saliency map generated by the DenseNet-169 model of a 5s audio segment of the phrase “ .. immediately the traveller took off his cloak.” (A) Log Mel-Spectrogram - Parkinson’s Disease (B) Saliency Map - Parkinson’s Disease. (C) Log Mel-Spectrogram - Healthy Control. (D) Saliency Map - Healthy Control. 
4 Conclusion and Future Works This work presents an approach that combines audio processing and deep learning techniques in order to develop an assistive tool that can differentiate between the speech of subjects with PD and healthy controls. The limitations of this study include the small sample size, unbalanced gender distribution of subjects and limited additional clinical metadata. Although promising, the findings of this pilot study must be validated using larger, gender-balanced, prospectively-collected datasets, with greater clinical metadata detail, before this technology could be used as a diagnostic decision support tool in a clinical setting. Future works ought to consider other potential uses of speech and audio analysis techniques for the detection of brain-based pathologies, such as brain tumours or Alzheimer’s, by incorporating the Dense CNN mel-spectrogram classifier proposed in this paper with Natural Language Processing techniques and patient Electronic Health Record (EHR) data in order to develop a multimodal diagnostic tool. Furthermore, the data used in this study was collected in an acoustically controlled environment, therefore these findings could be extended to more realistic acoustic conditions, which would extend the potential of the proposed technology for use in more practical settings. Data availability The Mobile Device Voice Recordings at King’s College London (MDVR-KCL) data set used in this study is publicly available at https://zenodo.org/record/2867216#.YFSECGT7SdZ . Credit Authorship Contribution Statement Study concept and design: S.A.J, H.W, R.K.M. Model development, analysis, and interpretation of results: S.A.J, H.W, R.K.M. Drafting of the manuscript: S.A.J, H.W, R.K.M. Critical revision of the manuscript: S.A.J, J.C, H.W, R.K.M. Corresponding Authors Correspondence to Ryan Mathew or He Wang. Conflict of Interest Statement The authors declare no competing interests. References [1]. ↵ Parkinson’s UK . 
The incidence and prevalence of parkinson’s in the uk. results from the clinical practice research datalink , 2017 . [2]. ↵ Ronald B. Postuma , Daniela Berg , Matthew Stern , Werner Poewe , C. Warren Olanow , Wolfgang Oertel , José Obeso , Kenneth Marek , Irene Litvan , Anthony E. Lang , Glenda Halliday , Christopher G. Goetz , Thomas Gasser , Bruno Dubois , Piu Chan , Bastiaan R. Bloem , Charles H. Adler , and Günther Deuschl . Mds clinical diagnostic criteria for parkinson’s disease . Movement Disorders , 30 ( 12 ): 1591 – 1601 , October 2015 . Publisher Copyright: © 2015 International Parkinson and Movement Disorder Society. Copyright: Copyright 2015 Elsevier B.V., All rights reserved. OpenUrl CrossRef PubMed [3]. ↵ Michal Wegrzynowicz , Dana Bar-On , Laura Calo , Oleg Anichtchik , Mariangela Iovino , Jing Xia , Sergey Ryazanov , Andrei Leonov , Armin Giese , Jeffrey W. Dalley , Christian Griesinger , Uri Ashery , and Maria Grazia Spillantini . Depopulation of dense α-synuclein aggregates is associated with rescue of dopamine neuron dysfunction and death in a new parkinson’s disease model . Acta Neuropathol , 138 : 575 – 595 , 2019 . OpenUrl CrossRef PubMed [4]. ↵ Lena Hartelius and Peter Svensson . Speech and swallowing symptoms associated with parkinson’s disease and multiple sclerosis: a survey . Folia Phoniatrica et Logopaedica , 46 : 9 – 17 , 1994 . OpenUrl CrossRef PubMed Web of Science [5]. ↵ Jeri A. Logemann , Hilda B. Fisher , Benjamin Boshes , and Eugene R. Blonsky . Frequency and cooccurrence of vocal tract dysfunctions in the speech of a large sample of parkinson patients . Journal of Speech and Hearing Disorders , 43 : 47 – 57 , February 1978 . OpenUrl CrossRef PubMed Web of Science [6]. ↵ Andrew Ma , Kenneth K. Lau , and Dominic Thyagarajan . Voice changes in parkinson’s disease: What are they telling us? Journal of Clinical Neuroscience , 72 : 1 – 7 , January 2020 . OpenUrl CrossRef PubMed [7]. 
↵ Seyed-Mohammad Fereshtehnejad , Chun Yao , Amelie Pelletier , Jacques Y Montplaisir , Jean-François Gagnon , and Ronald B Postuma . Evolution of prodromal parkinson’s disease and dementia with lewy bodies: a prospective study . Brain , 142 : 2051 – 2067 , 2019 . OpenUrl CrossRef PubMed [8]. ↵ Daria Hemmerling , J. R. Orozco-Arroyave , A. Skalski , J. Gajda , and E. Nöth . Automatic detection of parkinson’s disease based on modulated vowels . In Annual conference of the international speech communication association (INTERSPEECH) , pages 1190 – 1194 , 2016 . [9]. ↵ Liaqat Ali , Ce Zhu , Mingyi Zhou , and Yipeng Liu . Early diagnosis of Parkinson’s disease from multiple voice recordings by simultaneous sample and feature selection . Expert Systems with Applications , 2019 . [10]. ↵ Athanasios Tsanas , Max A. Little , Patrick E. McSharry , Jennifer Spielman , and Lorraine O. Ramig . Novel speech signal processing algorithms for high-accuracy classification of parkinson’s disease . IEEE Transactions on Biomedical Engineering , 59 : 1264 – 1271 , May 2012 . OpenUrl CrossRef PubMed [11]. ↵ Anna Pompili , Alberto Abad , Paolo Romano , Isabel P Martins , Rita Cardoso , Helena Santos , Joana Carvalho , Isabel Guimarães , and Joaquim J Ferreira . Automatic Detection of Parkinson’s Disease: An Experimental Analysis of Common Speech Production Tasks Used for Diagnosis . In Kamil Ekštein and Václav Matoušek , editors, Text, Speech, and Dialogue , pages 411 – 419 . Springer International Publishing , 2017 . [12]. ↵ Juan Rafael Orozco-Arroyave , Juan Camilo Vásquez-Correa , Jesús Francisco Vargas-Bonilla , R Arora , N Dehak , PS Nidadavolu , H Christensen , F Rudzicz , M Yancheva , H Chinaei , A Vann , N Vogler , T Bocklet , M Cernak , J Hannink , and Elmar Nöth . NeuroSpeech: An open-source software for Parkinson’s speech analysis . Digital Signal Processing , 77 : 207 – 221 , 2018 . OpenUrl [13]. 
↵ Luca Parisi , Narrendar RaviChandran , and Marianne Lyne Manaog . Feature-driven machine learning to improve early diagnosis of Parkinson’s disease . Expert Systems with Applications , 110 : 182 – 190 , 2018 . OpenUrl [14]. ↵ Mehrbakhsh Nilashi , Othman Ibrahim , Hossein Ahmadi , Leila Shahmoradi , and Mohammadreza Farahmand . A hybrid intelligent system for the prediction of Parkinson’s Disease progression using machine learning techniques . Biocybernetics and Biomedical Engineering , 38 ( 1 ): 1 – 15 , 2018 . OpenUrl [15]. ↵ Victoria S Lee , Xiao Ping Zhou , Douglas A Rahn , Emily Q Wang , and Jack J Jiang . Perturbation and nonlinear dynamic analysis of acoustic phonatory signal in Parkinsonian patients receiving deep brain stimulation . Journal of communication disorders , 41 ( 6 ): 485 – 500 , 2008 . OpenUrl PubMed [16]. ↵ J.C. Vásquez-Correa , J. R. Orozco-Arroyave , and E. Nöth . Convolutional neural network to model articulation impairments in patients with Parkinson’s disease . In Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH , 2017 . [17]. ↵ Daniel Korzekwa , Roberto Barra-Chicote , Bozena Kostek , Thomas Drugman , and Mateusz Lajszczak . Interpretable Deep Learning Model for the Detection and Reconstruction of Dysarthric Speech . 2019 . [18]. ↵ Sander Dieleman and Benjamin Schrauwen . End-to-end learning for music audio . In ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings , 2014 . [19]. ↵ Karol J. Piczak . Environmental sound classification with convolutional neural networks . In IEEE International Workshop on Machine Learning for Signal Processing, MLSP , 2015 . [20]. ↵ Hagen Jaeger , Dhaval Trivedi , and Michael Stadtschnitzer . Mobile Device Voice Recordings at King’s College London (MDVR-KCL) from both early and advanced Parkinson’s disease patients and healthy controls , May 2019 . [21]. ↵ S. Mirzaei , M. El Yacoubi , S. Garcia-Salicetti , J. 
Boudy , C. Kahindo , V. Cristancho-Lacroix , H. Kerhervé , and A.S. Rigaud . “Two-stage feature selection of voice parameters for early Alzheimer’s disease prediction”. IRBM , 39 : 430 – 435 , 2018 . OpenUrl [22]. ↵ Diana Torres-Boza , Meshia Cédric Oveneke , Fengna Wang , Dongmei Jiang , Werner Verhelst , and Hichem Sahli . Hierarchical sparse coding framework for speech emotion recognition . Speech Communication , 99 : 80 – 89 , 2018 . OpenUrl [23]. ↵ Chung-Cheng Chiu , Wei Han , Yu Zhang , Ruoming Pang , Sergey Kishchenko , Patrick Nguyen , Arun Narayanan , Hank Liao , Shuyuan Zhang , Anjuli Kannan , et al. A comparison of end-to-end models for long-form speech recognition . In 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU) , pages 889 – 896 . IEEE , 2019 . [24]. ↵ Yannick Jadoul , Bill Thompson , and Bart de Boer . Introducing Parselmouth: A Python interface to Praat . Journal of Phonetics , 71 : 1 – 15 , 2018 . OpenUrl CrossRef [25]. ↵ Brian McFee , Colin Raffel , Dawen Liang , Daniel Ellis , Matt McVicar , Eric Battenberg , and Oriol Nieto . librosa: Audio and Music Signal Analysis in Python . In Proceedings of the 14th Python in Science Conference , 2015 . [26]. ↵ Gao Huang , Zhuang Liu , Laurens Van Der Maaten , and Kilian Q. Weinberger . Densely connected convolutional networks . In Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017 , 2017 . [27]. ↵ Diederik P. Kingma and Jimmy Lei Ba . Adam: A method for stochastic optimization . In 3rd International Conference on Learning Representations, ICLR 2015 - Conference Track Proceedings , 2015 . [28]. ↵ B. W. Matthews . Comparison of the predicted and observed secondary structure of T4 phage lysozyme . BBA - Protein Structure , 1975 . View the discussion thread. Back to top Previous Next Posted July 18, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. 
NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Automated Detection of Speech Disorders in Parkinson’s Disease using Deep Convolutional Neural Networks: A Pilot Study Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Automated Detection of Speech Disorders in Parkinson’s Disease using Deep Convolutional Neural Networks: A Pilot Study Sara A. Jones , Jeremy Cosgrove , He Wang , Ryan K. Mathew medRxiv 2025.07.18.25331737; doi: https://doi.org/10.1101/2025.07.18.25331737 Share This Article: Copy Citation Tools Automated Detection of Speech Disorders in Parkinson’s Disease using Deep Convolutional Neural Networks: A Pilot Study Sara A. Jones , Jeremy Cosgrove , He Wang , Ryan K. 
Mathew medRxiv 2025.07.18.25331737; doi: https://doi.org/10.1101/2025.07.18.25331737 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neurology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4426) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15222) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6589) Geriatric Medicine (667) Health Economics (997) Health Informatics (4525) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (971) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9221) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (711) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffec5705e5541e2',t:'MTc3OTQ4Mzg0NQ=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-06-21T16:06:39.831647+00:00