Full text
45,841 characters
· extracted from
preprint-html
· click to expand
Quantifying Device Type and Handedness Biases in a Remote Parkinson’s Disease AI-Powered Assessment | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Quantifying Device Type and Handedness Biases in a Remote Parkinson’s Disease AI-Powered Assessment Zerin Nasrin Tumpa , Md Rahat Shahriar Zawad , Lydia Sollis , Shubham Parab , Irene Chen , View ORCID Profile Peter Washington doi: https://doi.org/10.1101/2025.01.03.25319964 Zerin Nasrin Tumpa 1 University of Hawaii Find this author on Google Scholar Find this author on PubMed Search for this author on this site Md Rahat Shahriar Zawad 1 University of Hawaii Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lydia Sollis 1 University of Hawaii Find this author on Google Scholar Find this author on PubMed Search for this author on this site Shubham Parab 2 New York University Find this author on Google Scholar Find this author on PubMed Search for this author on this site Irene Chen 3 UC Berkeley – UCSF Find this author on Google Scholar Find this author on PubMed Search for this author on this site Peter Washington 4 University of California , San Francisco Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Peter Washington For correspondence: peter.washington{at}ucsf.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Early detection of Parkinson’s Disease (PD) can enable early access to care, improving patient outcomes. We investigate the use of machine learning to predict PD using data recorded from a web application measuring structured mouse and keypress data through tests assessing finger and hand movement patterns. We evaluate the impact of demographic bias and bias related to device type and handedness, which are particularly relevant to our application. We collected data from 251 participants (99 PD, 152 Non-PD). Using a random forest model, we observed an 84% F1 score, 86% sensitivity, and 92% specificity. When examining only F1-score differences across groups, no significant bias appears. However, conducting a more in-depth analysis using algorithmic fairness metrics uncovers bias regarding the positive prediction and error rates. In particular, we found that sex and ethnicity have no statistically significant impact on receiving a PD prediction. However, we observe biases regarding device type and dominant hand in terms of receiving a PD diagnosis, as evidenced by disparate impact and equalized odds fairness metrics. This work demonstrates that remote digital health diagnostics using consumer devices like desktops or laptops can exhibit nontraditional yet significant biases concerning understudied factors in algorithmic fairness such as device type and handedness. Introduction PD is the fastest-growing neurological disorder in the world. It is projected that the number of individuals aged 50 years or older with PD in the USA will rise by 128 percent from about 340,000 in 2005 to around 610,000 by 2030 2 . Numerous studies have emphasized the possibility of diagnosing PD through motor patterns such as verbal fluency 3 , voice signals-voice disorders 4 , 5 , speech 6 , handwritten trace 7 , and gait patterns 8 . Finger tapping is one of the most reliable tests for motor performance 9 . The necessity for lab equipment and specialist supervision has made the detection process more complex and less cost-effective 10 . We, therefore, aim to develop a flexible and cost-effective method for detecting PD using machine learning applied to mouse/trackpad and keyboard data. Our pilot study 11 explored the feasibility of an earlier version of our web application, which demonstrated the potential for remote PD disease screening using digital motor assessments. Our current study builds upon these findings by refining the data collection framework, expanding participant recruitment from a small sample to over 250 participants, and systematically evaluating bias with respect to several factors. We collected data from participants who engaged in structured interactive tasks on a remotely accessible web application that we created. Each assessment consisted of a series of mouse tracing and keyboard pressing tasks to measure motor movement. For the key press tasks, metrics such as response time, accuracy, and accidental key presses from unintended movements were recorded. For the mouse trace tasks, where participants were asked to trace straight and curved paths displayed on-screen using a trackpad or mouse, we measured the stability and precision of two-dimensional hand movements, providing insights into motor control and coordination of the lower arm. While prior studies have explored remote digital health assessments for PD 12 – 17 , our study delves into issues of algorithmic fairness in remote PD assessments in particular and consumer digital health informatics more broadly by exploring the performance of machine learning models across diverse demographic groups in a web application for remote PD assessment. By examining factors such as sex, race, device type, and dominant hand using multiple fairness metrics, we aim to understand and highlight potential biases in the system’s predictions. We note in particular that exploring algorithmic fairness with respect to device type and dominant hand are understudied aspects of consumer digital health informatics. Results Novel Remotely Curated Dataset We recruited 251 individuals, providing a comprehensive dataset to analyze the effectiveness of our remotely accessible web-based application for detecting PD. Of these participants, 99 were diagnosed with PD and 152 were Non-PD controls ( Figure 1a ). Download figure Open in new tab Figure 1: Data distributions before any upsampling. (a) The dataset is class imbalanced, with approximately 39.4% PD patients and 60.6% Non-PD participants. (b) The total number of male participants is much higher than that of female participants, with differing class balances across sex groups. (c) Most of the participants used a Windows device. Two hundred-one participants used Windows, whereas only 41 users used a Mac, and eight used Linux. (d) Within the White group, Windows is the predominant device type, with 150 users, compared to only 28 users on Mac. Similarly, in the Non-White group, Windows usage remains higher, with 45 users, while Mac usage is much lower, at 12 users. This pattern indicates a strong preference for Windows devices across racial groups, although the White group has a significantly larger sample size overall. (e) Of participants with PD, 51 are White, followed by Black or African American, with 33 responses. White participants consist of 134 responses in the Non-PD group, with only six responses from Black or African American participants. (f) Distribution of race by disease status, specifically comparing White and Non-White groups. There was significant underrepresentation of many groups in our dataset, and we sought to study the effect of this imbalance on disparities in performance across groups. The sex distribution included 39 females and 212 males ( Figure 1b ). Participants utilized various devices to access the web-based application: 201 used Windows desktops, 41 used Mac desktops, and 8 used Linux desktops ( Figure 1c ). The device type distribution was roughly similar between White and Non-White groups ( Figure 1d ). The heatmap in Figure 1e shows how individuals are distributed across different racial categories based on their PD status. The majority of participants were White (185). There were 39 Black or African American participants. Smaller groups included 20 American Indian or Alaska Native individuals, 4 Native Hawaiian or other Pacific Islander individuals, and 2 Asian participants. While the distribution of White and Non-White groups was balanced within the PD group, the Non-PD group was heavily skewed towards White participants ( Figure 1f ). Most of the participants were right-handed, with 176 individuals identifying as right-handed and seventy-five as left-handed ( Figure 1g ). The remotely accessible application included seven structured tasks: three involving mouse exercises, three involving keyboard interactions, and one testing working memory. Each task consisted of three levels of increasing difficulty. The mouse-based tasks involved tracing straight lines and spirals, while the keyboard-based tasks included single-letter presses, multiple-letter presses, and randomized sequences, forming a comprehensive set of activities designed to test different motor and cognitive skills. The working memory game assessed cognitive function. From these interactions, we extracted 80 features across five major categories. Model Selection We first sought to select high-performing models for subsequent analyses. Using manually engineered features, we tested six classical machine-learning models for predicting PD. Each model was evaluated using 5-fold cross-validation on the training set, and the performance of the models was recorded using the mean F1 score across five folds ( Figure 2 ). The dataset in each fold was split into 70% training and 30% testing data. Download figure Open in new tab Figure 2: Comparison of classical machine learning models. The results are generated by performing cross-validation on our training data only. We report the mean with standard deviation as error bars across all the folds. Although all the experimental models exhibited promising performance during this model section stage, the LightGBM classifier outperformed the others by a small margin, achieving a 95.2% F1 score with a 4.19% standard deviation.The Random Forest, XGBoost, and LightGBM classifiers were selected to test their performance further on held-out test data. The 30% test data were used to make predictions using the three trained models ( Table 1 , Figure 3 ). View this table: View inline View popup Download powerpoint Table 1: Performance of the top-performing ML models on held-out test data. Download figure Open in new tab Figure 3: ROC curves and corresponding AUC scores for the random forest, LightGBM, and XGBoost classifiers. During the training phase, the three top-performing models were further tested on the held-out data to report their ROC curve and AUROC scores. The performance differences are minimal when comparing the ROC curves and the area under the curve (AUC). The Random Forest model achieved the highest AUC score of 94%, followed by XGBoost at 93% and LightGBM at 91%. Model Bias and Fairness To evaluate bias, we used traditional accuracy metrics like F1-score, sensitivity, specificity, and AUC as well as fairness-specific metrics such as Disparate Impact, Equal Opportunity, and Equalized Odds. Because we measured these metrics as a ratio across groups, a value of 1 for these fairness metrics indicates perfect fairness, with equal treatment of the privileged and unprivileged groups. Values less than 1 signify biases against the unprivileged group, while values greater than 1 indicate potential biases favoring the unprivileged group. We selected male, White, Mac, and right-handed individuals as the privileged groups in the study ( Table 2 ). While the selection of unprivileged groups corresponded to underrepresentation in our dataset in most cases, we chose to set Mac users as the privileged group despite underrepresentation due to the homogeneity of Mac devices – a contrast to Windows devices created by a wide range of manufacturers. View this table: View inline View popup Download powerpoint Table 2: Our selection of “Privileged” and “Unprivileged” groups in terms of the calculations for common algorithmic fairness metrics. Tables 3.1 and 3.2 display the F1-score, sensitivity, specificity, specificity, and AUROC differences between groups for four attributes: sex, race, device type, and dominant hand. Table 3.1 shows these metrics following upsampling by race, while Table 3.2 provides the values before any resampling was applied. View this table: View inline View popup Table 3.1: Model performance after race upsampling on each group based on four protected attributes: sex, race, device, and dominant hand. View this table: View inline View popup Table 3.2: Model performance before race upsampling on each group based on four protected attributes: Sex, race, device, and dominant hand. Notably, an unusual result highlights how underrepresentation can mask underlying biases in machine learning. Before upsampling, the Non-White group exhibited higher F1 scores than the White group, which was counter-intuitive given the underrepresentation of Non-White data. Closer examination revealed that the “Non-White Non-PD” group had substantially less data than the “White Non-PD” group, likely resulting in an inflated F1 score for the Non-White group overall. Figure 1f illustrates the distribution of participants by race and disease status, clearly highlighting the underrepresentation of the "Non-White Non-PD" group. This is a salient example of how the underrepresentation of certain groups in machine-learning datasets can mask underlying biases. To address this imbalance, we increased the data in the “Non-White Non-PD” group to bring it closer in size to the White group using resampling with replacement. After race resampling, we observed that the F1 score gap between White and Non-White groups significantly narrowed, with no significant differences between the groups after upsampling ( Table 5.1 ). All sensitive categories no longer yielded statistically significant differences in F1 scores ( Figure 4 ). View this table: View inline View popup Download powerpoint Table 4.1: Bias metrics of the models based on the demographic groups from the cross-validation with bootstrap sampling after race upsampling. All metrics are reported as ratios between the privileged and underprivileged groups. View this table: View inline View popup Table 4.2: Bias metrics of the models based on the demographic groups from the cross-validation with bootstrap sampling before race upsampling. All metrics are reported as ratios between the privileged and underprivileged groups. View this table: View inline View popup Download powerpoint Table 5.1: Binary indicator of whether a statistically significant difference in F1-score is observed using a t-test between groups for each sensitive attribute ( after race upsampling). View this table: View inline View popup Download powerpoint Table 5.2: Binary indicator of whether a statistically significance difference in precision is observed using a t-test between groups for each sensitive attribute ( after race upsampling). View this table: View inline View popup Download powerpoint Table 5.3: Binary indicator of whether a statistically significance difference in sensitivity is observed using a t-test between groups for each sensitive attribute ( after race upsampling). View this table: View inline View popup Table 5.4: Binary indicator of whether a statistically significance difference in specificity is observed using a t-test between groups for each sensitive attribute ( after race upsampling). View this table: View inline View popup Download powerpoint Table 5.5: Binary indicator of whether a statistical difference in F1-score is observed using a t-test between groups for each sensitive attribute ( before race upsampling). View this table: View inline View popup Download powerpoint Table 5.6: Binary indicator of whether a statistically significance difference in precision is observed using a t-test between groups for each sensitive attribute ( before race upsampling). View this table: View inline View popup Download powerpoint Table 5.7: Binary indicator of whether a statistically significance difference in sensitivity is observed using a t-test between groups for each sensitive attribute ( before race upsampling). View this table: View inline View popup Table 5.8: Binary indicator of whether a statistically significance difference in specificity is observed using a t-test between groups for each sensitive attribute ( before race upsampling). Download figure Open in new tab Figure 4: F1 score comparisons between the privileged and unprivileged groups for all protected attributes after race upsampling. No significant differences exist between the two groups in any category after upsampling by race. Most algorithms achieved disparate impact scores somewhat close to 1 for sex and race after race resampling ( Table 4.1 ). However, due to consistently high standard deviations, we cannot make any conclusive statements about whether sex or race significantly influences the models’ distribution of predictions towards any particular group. Even after race upsampling, the models exhibit significant biases for two interesting attributes: device and dominant hand, with disparate impact values ranging from 0.38 to 0.70 and tighter standard deviations than for race or sex. This indicates that the unprivileged groups (Windows, left hand) are 38% to 70% less likely to receive a PD diagnosis from the model than the privileged groups (Mac, right hand). Discussion Our PD assessment encompasses a novel approach to PD assessment. While prior PD studies have focused on speech data 4 , 18 , 19 , gait 8 , and handwriting 20 , our approach uniquely leverages data collected from a web application, enabling participants to contribute data remotely. This method aligns with the growing need for self-administered and non-invasive telemonitoring applications, which offer an accessible and convenient alternative to clinical examinations that are often challenging for elderly patients 21 , 22 . Moreover, prior studies have explored keyboard typing and touchscreen interactions for PD detection 23 , 24 , but few have addressed the issue of algorithmic biases 25 . These biases, often stemming from tiny sample sizes for underrepresented groups 26 , can compromise the fairness of machine learning models. Although our models were generally performant, our analyses uncovered critical performance disparities, particularly concerning device type and dominant hand. Despite the goal of using our PD classification website to promote affordable and equitable screening, these biases highlight the need for quantifying biases – including non-obvious biases – in consumer digital health informatics. Our results highlight the need for careful selection of AI bias metrics. Despite showing biases toward privileged groups in terms of receiving a PD diagnosis for device type and dominant hand, the models exhibited strong performance in terms of Equal Opportunity across all demographic groups, with values ranging between 0.87 and 1.22. However, this metric only captures the true positive rate (sensitivity) across groups and does not account for false positive rates or overall fairness. The Equalized Odds metric highlighted substantial imbalances. In some cases, F1 scores were higher for one group than another, but AUROC showed the opposite trend. This discrepancy often occurs when a group’s specificity decreases significantly, affecting AUROC more than F1 scores. Conversely, there were instances where AUROC scores were higher despite lower F1 scores. In these cases, variations in precision mirrored those in F1 scores, while specificity and sensitivity remained strong, contributing to the higher AUROC values. We note a few prominent limitations of our study. One significant limitation is that our model relied on data collected at a single point in time, thus failing to capture the progression of PD over time or symptomatic fluctuations. As PD is a degenerative condition, symptoms can vary due to factors such as medication, stress, and daily activities. Additionally, our study does not include detailed symptom quantification or information about participants’ medication usage or timing, critical factors that can significantly influence symptom severity and, thus, model predictions. We did not control for these factors. In future work, we aim to address these limitations by collecting longitudinal data and controlling medication timing and dose. In these future studies, it will be necessary to consider differences in symptom presentation variability across demographic groups, with the temporal aspect potentially introducing additional biases that were not explored in this work. Methods Study Design and Participant Recruitment We recruited individuals aged 50 and above. The recruitment pool comprised both adults diagnosed with PD and healthy controls. To facilitate recruitment, we partnered with local organizations dedicated to PD support and advocacy, including through recruitment efforts at the 2023 and 2024 Hawaii Parkinson’s Association (HPA) Symposiums. We invited participants to join the study using a web application accessible through a public link. The application provided detailed instructions about the research and its activities. Before commencing any activities, all participants provided informed consent electronically. Figure 5 shows the overall procedure for collecting participant data, preprocessing the dataset to train the models, and evaluating the models for real-world application. The participants used the website to play different games, which required them to use the mouse to click and trace different objects and press keyboard buttons at various times during the game. These mouse trace data and keyboard press data were stored in a cloud database along with the participant’s demographic data ( Figure 5a ). The stored data were then analyzed and engineered to extract features from the mouse trace and keyboard press data. Finally, the dataset was split into 70% training and 30% testing data ( Figure 5b ). The dataset was trained using six common machine learning classification algorithms using 5-fold cross-validation, and the results were compared for further testing and evaluation on held-out data ( Figure 5c ). Afterward, the trained models were tested for bias in different demographic groups ( Figure 6 ). Download figure Open in new tab Figure 5: Study workflow. (a) We recorded hand movement data through a structured assessment on a publicly available website. The data are stored in a cloud database for further analysis. (b) Features are extracted from the hand movement data and organized into a tabular format. The features are preprocessed to remove any empty records. The categorical features were transformed using one-hot encoding, and numerical features were normalized using min-max scaling. (c) We trained the model with six machine learning algorithms using 5-fold cross-validation. Download figure Open in new tab Figure 6: Bias evaluations. We quantified biases concerning sex, race, device type, and handedness. Data Collection Procedure We recorded the participant data using a web-based assessment by evaluating motor functions ( Figure 7 ). This evaluation was conducted through short and structured tasks designed to assess various aspects of user interactions. Download figure Open in new tab Figure 7: User interface of the web assessment. The user starts by filling out a consent form and records their demographic, health, and medication information. The users are then presented with shapes and required to trace the shape with their mouse/trackpad. This is followed by a series of games where the user is asked to press a key on the keyboard immediately when prompted. The game ends with a memory test we did not incorporate into our analysis. The mouse-based tasks included: Straight-Line Tracing : Participants were tasked with tracing straight lines on the screen to evaluate hand stability and precision. This task required participants to maintain consistent control and accuracy, reflecting fine motor skills. Curved Line Tracing : Participants traced spirals and other curved shapes during this activity, offering a more complex evaluation of hand stability and motor coordination. Randomized Clicking Tasks : Participants clicked on targets appearing at random locations on the screen in order to measure reaction time, hand-eye coordination, and accuracy. The keyboard-based tasks consisted of: Single-Letter Presses : Participants were instructed to press designated keys immediately upon receiving a visual prompt, assessing their motor response speed and accuracy. Multiple-Letter Presses : This task involved replicating key sequences, evaluating participants’ ability to execute complex motor tasks and process sequential information. In this phase, the task was confined to only two letters, and the user only needed to press one of the two letters that appear on the screen to pass. Randomized Sequences : Participants completed tasks involving sequences of randomized key presses, testing their adaptability, coordination, and cognitive flexibility. We also added a Memory Game to assess cognitive function through progressively challenging levels. However, we did not use the cognitive assessment data in this study, and thus we do not include details about the assessment. Data Up-sampling The initial dataset used in our study comprised 152 healthy individuals (60.6%) and 99 participants diagnosed with PD (39.4%). While the model initially achieved a strong F1 score, its sensitivity remained low, around 50%, which is particularly concerning in healthcare applications where detecting true positive cases is crucial. To address this issue and to improve the sensitivity of the models, we applied the Synthetic Minority Over-sampling Technique (SMOTE) to balance the dataset by up-sampling the PD patient data (updated data distribution shown in Figure 8 ). Download figure Open in new tab Figure 8: Data distribution after applying SMOTE. Moreover, there was a significant imbalance in the racial distribution within the PD label, with 131 White participants compared to only 15 Non-White participants. To mitigate this imbalance, we applied a random resampling technique to increase the number of Non-White participants in the PD category (updated data distribution without SMOTE shown in Figure 9 and with SMOTE shown in Figure 10 ). Download figure Open in new tab Figure 9: Data distribution after randomly resampling Non-White data points with no PD (before SMOTE). Download figure Open in new tab Figure 10: Data distribution after randomly resampling Non-White data points with no PD (after SMOTE). Bootstrapped Sampling We apply bootstrapped sampling with 100 samples in each iteration to generate error bars. In each bootstrapped sampling step, the sample set consists of an equal number of data points from both groups in each category while calculating the group-wise metrics for that category, e.g., sex, race, device, and dominant hand. For example, when calculating the F1 scores, sensitivity, specificity, precision, and AUROC for sex, the sample set was constructed by taking 50% of the sample from the Male subset and 50% from the Female subset using resampling with replacement. Finally, we report the mean of the metrics. Feature Engineering We engineered 80 features related to Hand Stability - Straight Line Tracing, Hand Stability - Curved Line Tracing, Response Times and Accuracy, Dominant Hand Movement, and Wrong Clicks, all engineered from the raw data. Table 6 summarizes some of the key engineered features we used in our study to detect PD. View this table: View inline View popup Table 6: List of engineered features. Evaluation Metrics To evaluate the performance of our PD prediction model, we used a combination of standard performance metrics and fairness metrics to assess both accuracy and equity across demographic groups. This section provides a detailed description of these metrics. Standard Performance Metrics We compute the F1-score, sensitivity (recall), precision, and specificity for each group across all demographic categories to evaluate model performance. Precision represents the proportion of correctly identified positive cases out of all predicted positive cases. Sensitivity (recall) measures the proportion of true positive (TP) cases accurately determined by the model. At the same time, specificity quantifies the proportion of true negative (TN) cases correctly classified, where true negatives refer to actual negatives correctly identified. In these calculations, false positives (FP) represent cases incorrectly classified as positive, and false negatives (FN) are actual positives misclassified as unfavorable. The F1-score combines precision and recall into a single metric, calculated as their weighted harmonic mean. The formulas for these metrics are: Fairness Metrics To ensure equitable outcomes across protected groups, we employed the following fairness metrics: Disparate impact Disparate Impact (DI) is a fairness metric used to evaluate equity in model outcomes by comparing the rates of positive outcomes between a privileged and an unprivileged group 29 ,30 . It assesses whether the unprivileged group is disproportionately less likely to receive favorable outcomes than the privileged group. DI is calculated as the ratio of the proportion of positive outcomes for the unprivileged group to that of the privileged group, defined as: TP u and TP p represent true positives while FP u and FP p denote false positives for the unprivileged (u) and privileged (p) groups. N u and N p denote the total number of individuals in the unprivileged and privileged groups, respectively. To achieve fairness, the Disparate Impact (DI) value must equal 1, indicating equal rates of positive predictions for both groups 29 . A value below 1 signals potential bias against the unprivileged group, while a value above 1 suggests a potential bias favoring the unprivileged group. Equal Opportunity Equal Opportunity (EO) is a fairness concept requiring that privileged and unprivileged groups achieve equal true positive rates (TPR) 31 . This ensures that individuals from all demographic groups who qualify for a positive outcome are equally likely to receive it. EO is mathematically defined as the ratio of the TPRs between unprivileged and privileged groups and is expressed as: An Equal Opportunity (EO) value of 1 represents perfect fairness, signifying that true positive rate (TPR) are identical across both privileged and unprivileged groups. A value less than 1 indicates that the unprivileged group has a lower TPR than the privileged group, suggesting potential bias against the unprivileged group in receiving favorable outcomes. Conversely, a value greater than 1 indicates that the unprivileged group has a higher TPR than the privileged group, which may reflect a bias favoring the unprivileged group. Equalized Odds Equalized Odds require that both true positive rates (TPRs) and false positive rates (FPRs) are equal across privileged and unprivileged groups 31 , ensuring that the model’s predictions regarding successes and errors are fair. By extending the concept of Equal Opportunity, Equalized Odds ensures that predictions do not disproportionately favor or disadvantage any group. It can be mathematically represented as the ratio of TPRs and FPRs between the unprivileged and privileged groups: An Equalized Odds value of 1 represents ideal fairness, signifying uniform error distribution and success rates across all demographic groups. Deviations from this standard, whether below or above 1, indicate disparities in the model’s predictions, pointing to potential biases in true positive rates (TPRs), false positive rates (FPRs), or both. Data Availability All data produced in the present study are available upon reasonable request to the authors. Ethical Considerations This project received approval from the University of Hawaii Institutional Review Board (IRB), protocol #2022-00857. Acknowledgments This research was partly funded by the National Institutes of Health (NIH) Agreement No. 1OT2OD032581-01. The views and conclusions in this document are those of the authors and should not be interpreted as representing the NIH’s official policies, either expressed or implied. The language and grammar of this manuscript were revised using AI tools such as ChatGPT and Grammarly. However, the authors thoroughly refined the language after the stylistic revisions provided by these tools. Footnotes We have tightened the narrative and writing style of the manuscript. References 1. Dorsey , E. R. , Sherer , T. , Okun , M. S. & Bloem , B. R . The emerging evidence of the Parkinson pandemic . J. Parkinson’s Dis . 8 , S3 – S8 ( 2018 ). doi: 10.3233/JPD-181474 OpenUrl CrossRef 2. ↵ Dorsey , E. R. et al. Projected number of people with Parkinson disease in the most populous nations, 2005 through 2030 . Neurology 68 , 384 – 386 ( 2007 ). doi: 10.1212/01.wnl.0000247740.47667.03 OpenUrl CrossRef PubMed 3. ↵ Almalaq , A. , Dai , X. , Zhang , J. , Hanrahan , S. & Nedrud , J . Causality graph learning on cortical information flow in Parkinson’s disease patients during behaviour tests . Proc. 49th Asilomar Conf. Signals, Syst. Comput . 925 – 929 ( 2015 ). 4. ↵ Benba , A. , Jilbab , A. , Hammouch , A. & Sandabad , S . Voiceprints analysis using MFCC and SVM for detecting patients with Parkinson’s disease . Proc. IEEE Int. Conf. Electr. Inf. Technol. (ICEIT ) 300 – 304 ( 2015 ). doi: 10.1109/EITech.2015.7163000 OpenUrl CrossRef 5. ↵ Sakar , C. O. et al. A comparative analysis of speech signal processing algorithms for Parkinson’s disease classification and the use of the tunable-factor wavelet transform . Appl. Soft Comput . 74 , 255 – 263 ( 2019 ). doi: 10.1016/j.asoc.2018.10.019 OpenUrl CrossRef 6. ↵ Alqahtani , E. J. , Alshamrani , F. H. , Syed , H. F. & Olatunji , S. O . Classification of Parkinson’s disease using NNge classification algorithm . Proc. 21st Saudi Comput. Soc. Natl. Comput. Conf. (NCC) 1 – 7 ( 2018 ). doi: 10.1109/NCG.2018.8593087 OpenUrl CrossRef 7. ↵ Pereira , C. R. et al. A new computer vision-based approach to aid the diagnosis of Parkinson’s disease . Comput. Biol. Med . 136 , 79 – 88 ( 2016 ). doi: 10.1016/j.compbiomed.2016.09.006 OpenUrl CrossRef 8. ↵ Abdulhay , E. et al. Gait and tremor investigation using machine learning techniques for the diagnosis of Parkinson’s disease . Future Gener. Comput. Syst . 83 , 366 – 373 ( 2018 ). doi: 10.1016/j.future.2018.02.009 OpenUrl CrossRef 9. ↵ Asanza , V. et al. Classification of subjects with Parkinson’s disease using finger tapping dataset . IFAC-PapersOnLine 54 , 376 – 381 ( 2021 ). doi: 10.1016/j.ifacol.2021.08.321 OpenUrl CrossRef 10. ↵ Adams , W. R . High-accuracy detection of early Parkinson’s disease using multiple characteristics of finger movement while typing . PLoS One 12 , e0188226 ( 2017 ). doi: 10.1371/journal.pone.0188226 OpenUrl CrossRef PubMed 11. ↵ Parab , S. , Boster , J. & Washington , P . Parkinson Disease Recognition Using a Gamified Website: Machine Learning Development and Usability Study . JMIR Form. Res . 7 , e49898 ( 2023 ). doi: 10.2196/49898 OpenUrl CrossRef 12. ↵ Kassavetis , P. et al. Developing a tool for remote digital assessment of Parkinson’s disease . Mov. Disord. Clin. Pract . 3 , 59 – 64 ( 2016 ). OpenUrl PubMed 13. Papapetropoulos , S. , Mitsi , G. & Espay , A. J . Digital health revolution: is it time for affordable remote monitoring for Parkinson’s disease? Front. Neurol . 6 , 34 ( 2015 ). 14. Gatsios , D. et al. Feasibility and utility of mHealth for the remote monitoring of Parkinson’s disease: ancillary study of the PD_manager randomized controlled trial . JMIR mHealth uHealth 8 , e16414 ( 2020 ). OpenUrl 15. Negi , A. S. , et al. Remote Real Time Digital Monitoring fills a Critical Gap in the Management of Parkinson’s disease . medRxiv ( 2024 ). 16. Hoffman , S. L. et al. Comprehensive real-time remote monitoring for Parkinson’s disease using Quantitative DigitoGraphy . npj Parkinson’s Dis . 10 , 137 ( 2024 ). OpenUrl PubMed 17. ↵ Zawad , M. R. S. , Tumpa , Z. N. , Sollis , L. , Parab , S. & Washington , P . Deep Learning Prediction of Parkinson’s Disease using Remotely Collected Structured Mouse Trace Data . medRxiv ( 2024 ). doi: 10.1101/2024.10.27.24316195 OpenUrl Abstract / FREE Full Text 18. ↵ Neharika , D. B. & Anusuya , S . Machine learning algorithms for detection of Parkinson’s disease using motor symptoms: Speech and tremor . Int. J. Recent Technol. Eng . 8 , 47 – 50 ( 2020 ). OpenUrl 19. ↵ Sakar , B. E. et al. Collection and analysis of a Parkinson speech dataset with multiple types of sound recordings . IEEE J. Biomed. Health Inform . 17 , 828 – 834 ( 2013 ). doi: 10.1109/JBHI.2013.2245674 OpenUrl CrossRef 20. ↵ Al-Wahishi , A. , Belal , N. & Ghanem , N . Diagnosis of Parkinson’s disease by deep learning techniques using handwriting dataset . Proc. Int. Symp. Signal Process. Intell. Recognit. Syst . ( 2020 ). 21. ↵ Isenkul , M. , Sakar , B. & Kursun , O . Improved spiral test using digitized graphics tablet for monitoring Parkinson’s disease . Proc. 2nd Int. Conf. e-Health Telemed. (ICEHTM-2014) 5 , ( 2014 ). doi: 10.1109/ICEHTM.2014.7057397 OpenUrl CrossRef 22. ↵ Matarazzo , M. et al. Remote monitoring of treatment response in Parkinson’s disease: The habit of typing on a computer . Mov. Disord . 34 , 1488 – 1495 ( 2019 ). doi: 10.1002/mds.27772 OpenUrl CrossRef PubMed 23. ↵ Arroyo-Gallego , T. et al. Detecting motor impairment in early Parkinson’s disease via natural typing interaction with keyboards: Validation of the neuroQWERTY approach in an uncontrolled at-home setting . J. Med. Internet Res . 20 , e89 ( 2018 ). doi: 10.2196/jmir.9462 OpenUrl CrossRef PubMed 24. ↵ Bernardo , L. S. et al. Modified SqueezeNet architecture for Parkinson’s disease detection based on keypress data . Biomedicines 10 , 2746 ( 2022 ). doi: 10.3390/biomedicines10112746 OpenUrl CrossRef PubMed 25. ↵ Makarious , M. B. et al. Multi-modality machine learning predicting Parkinson’s disease . npj Parkinson’s Dis . 8 , 35 ( 2022 ). doi: 10.1038/s41531-022-00290-3 OpenUrl CrossRef 26. ↵ Paul , S. et al. Bias investigation in artificial intelligence systems for early detection of Parkinson’s disease: A narrative review . Diagnostics 12 , 166 ( 2022 ). doi: 10.3390/diagnostics12010166 OpenUrl CrossRef PubMed 27. Briscoe , J. & Gebremedhin , A. Facets of Disparate Impact: Evaluating Legally Consistent Bias in Machine Learning . Proc. 33rd ACM Int. Conf. Inf. Knowl. Manag . ( 2024 ). 28. Caton , S. & Haas , C . Fairness in machine learning: A survey . ACM Comput. Surv . 56 , 1 – 38 ( 2024 ). OpenUrl 29. ↵ Mehrabi , N. et al. A survey on bias and fairness in machine learning . ACM Comput. Surv . 54 , 1 – 35 ( 2021 ). OpenUrl View the discussion thread. Back to top Previous Next Posted February 14, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Quantifying Device Type and Handedness Biases in a Remote Parkinson’s Disease AI-Powered Assessment Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Quantifying Device Type and Handedness Biases in a Remote Parkinson’s Disease AI-Powered Assessment Zerin Nasrin Tumpa , Md Rahat Shahriar Zawad , Lydia Sollis , Shubham Parab , Irene Chen , Peter Washington medRxiv 2025.01.03.25319964; doi: https://doi.org/10.1101/2025.01.03.25319964 Share This Article: Copy Citation Tools Quantifying Device Type and Handedness Biases in a Remote Parkinson’s Disease AI-Powered Assessment Zerin Nasrin Tumpa , Md Rahat Shahriar Zawad , Lydia Sollis , Shubham Parab , Irene Chen , Peter Washington medRxiv 2025.01.03.25319964; doi: https://doi.org/10.1101/2025.01.03.25319964 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4422) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15219) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6587) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1367) Health Systems and Quality Improvement (1611) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6587) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5437) Public and Global Health (9218) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1195) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ff77362fb1309d6',t:'MTc3OTQwNzA4Mw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.