A Deep Lightweight Convolutional Neural Network for Detecting Artifacts in Continuous EEG Signals

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 45,314 characters · extracted from preprint-html · click to expand
A Deep Lightweight Convolutional Neural Network for Detecting Artifacts in Continuous EEG Signals | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search A Deep Lightweight Convolutional Neural Network for Detecting Artifacts in Continuous EEG Signals View ORCID Profile Evans Nyanney , View ORCID Profile Parthasarathy D. 
Thirumala , Shyam Visweswaran , View ORCID Profile Zhaohui Geng doi: https://doi.org/10.1101/2025.10.28.25338681 Evans Nyanney a Ohio University, Department of Industrial and Systems Engineering, 1 Ohio University Drive , 276 Stocker Center, Athens, 45701, OH, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Evans Nyanney For correspondence: en596624{at}ohio.edu Parthasarathy D. Thirumala b University of Pittsburgh, Department of Neurological Surgery , 3550 Terrace Street, Pittsburgh, 15261, PA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Parthasarathy D. Thirumala Shyam Visweswaran c University of Pittsburgh, Department of Biomedical Informatics , 5607 Baum Boulevard, Pittsburgh, 15232, PA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zhaohui Geng a Ohio University, Department of Industrial and Systems Engineering, 1 Ohio University Drive , 276 Stocker Center, Athens, 45701, OH, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zhaohui Geng Abstract Full Text Info/History Metrics Data/Code Preview PDF ABSTRACT Objective This study aimed to develop and validate a system of specialized deep lightweight convolutional neural networks (CNN) to accurately detect specific artifact classes and demonstrate their advantage over traditional rule-based methods. Methods Three distinct CNN systems were trained on the Temple University Hospital EEG Artifact Corpus to identify eye movement, muscle-related and non-physiological artifacts, with each system optimized for an ideal temporal window size. The performance of the proposed system was compared with standard rule-based clinical detection methods in a held-out test set. 
Results The CNN systems significantly outperformed rule-based methods, with F1-score improvements ranging from +11.2% to +44.9%. Importantly, the results revealed distinct optimal temporal window lengths for each artifact type: 20s for eye movements (ROC AUC 0.975), 5s for muscle activity (Accuracy 93.2%), and 1s for non-physiological artifacts (F1-score 77.4%). Conclusion The results show that specialized, artifact-specific CNNs provide a more consistent and accurate solution for automated EEG artifact detection than traditional rule-based approaches. Significance This study establishes a new benchmark for automated EEG quality control by validating one of the first open-source, specialized CNN systems for three distinct artifact classes, achieving both high sensitivity and specificity. 1. Introduction Continuous scalp electroencephalography (EEG) remains the primary diagnostic tool for detecting seizures, status epilepticus, and cerebral ischemia in critical care settings. In these environments, most events are non-convulsive or subtle, making continuous monitoring essential for identifying cases that are not clinically apparent but require immediate intervention ( Sharma, Nunes and Alkhachroum, 2022 ; Claassen, Mayer, Kowalski, Emerson and Hirsch, 2004 ; Mumtaz, Rasheed and Irfan, 2021 ; Prakash and Kumar, 2024 ). However, the diagnostic accuracy of EEG is limited by signal contamination from both physiological sources (such as eye movements, muscle activity, chewing, and shivering) and non-physiological sources (including electrode pops, static, and lead artifacts) that not only obscure or mimic pathological EEG patterns but also create substantial workload for neurologists, EEG technologists, and critical care physicians ( Jiang, Bian and Tian, 2019 ; Britton, Frey, Hopp et al., 2016 ; Urigüen and Garcia-Zapirain, 2015 ; Islam, Rastegarnia and Yang, 2016 ). 
The issue of EEG signal contamination is exacerbated by the increased use of rapid application EEG caps and head-bands, especially when applied by untrained personnel on patients with scalp abnormalities( Amin, Nascimento, Karakis, Schomer and Benbadis, 2023 ) Foundational understanding of EEG artifact characteristics was established through seminal works characterizing spectral properties and topographical distributions of various types of contamination ( Goncharova, McFarland, Vaughan and Wolpaw, 2003 ; Whitham, Pope, Fitzgibbon, Lewis, Clark, Loveless, Broberg, Wallace, DeLosAngeles, Lillie, Hardy, Fronsko, Pulbrook and Willoughby, 2007 ). This research has led to the development of automated detection frameworks like FASTER ( Nolan, Whelan and Reilly, 2010 ). While traditional approaches including blind-source separation, adaptive filtering, and wavelet decompositions provide partial solutions, they have typically been validated only on small laboratory datasets and lack adaptability across diverse patient populations and recording conditions ( Islam et al., 2016 ; Saba-Sadiya, Chantland, Alhanai, Liu and Ghassemi, 2021 ; Kalita, Deb and Das, 2024 ). Recent deep learning approaches have shown promise but still face challenges in clinical deployment due to the incorrect “one-size-fits-all” assumption that artifact characteristics are similar across subjects and tasks ( Yu, Li, Zhou, Wang and Wang, 2024 ; Delorme, Sejnowski and Makeig, 2007 ; Prasad, Chanamallu and Prasad, 2021 ). Our objectives in addressing these challenges are as follows: (1) to develop distinct deep convolutional neural network (CNN) models for eye movements, muscle artifacts, and non-physiological artifacts; (2) to evaluate whether these artifact-specific models outperform single-model approaches; (3) to compare CNN-based methods with rule-based clinical standards; and (4) to provide open-source tools and evaluation frameworks for clinical validation. 
We hypothesize that models tailored to specific artifacts leveraging the unique characteristics of each type will offer improved diagnostic accuracy. Upon completion, we will release our models, evaluation framework, data splits, and software code under open-source license to facilitate integration of robust artifact removal in future EEG studies and real-time monitoring systems.( See Fig. 1 ). Download figure Open in new tab Figure 1: Examples of three main EEG artifact types used in this study (a) Eye movement artifacts showing smooth, low-frequency deflections with moderate amplitude, (b) muscle artifacts showing high-frequency, irregular activity with higher amplitude, and (c) non-physiological artifacts presenting as sudden, transient spikes characteristic of electrode pops ( Amin et al., 2023 ). 2. Materials and Methods 2.1. Study Design and Reporting Framework This study follows a diagnostic accuracy design for developing and validating artifact detection algorithms. We adhere to the TRIPOD+AI (Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis + Artificial Intelligence) guidelines for prediction model development (Collins, Moons, Dhiman et al., 2024) and STARD (Standards for Reporting of Diagnostic Accuracy Studies) recommendations for diagnostic test accuracy studies (Bossuyt, Reitsma, Bruns et al., 2015). 2.2. Data Description The development and evaluation datasets used in this study were obtained from the Temple University Hospital (TUH) EEG Corpus( Obeid and Picone, 2016 ), specifically the edf/01_tcp_ar subset, which represents routine clinical care data collected during standard EEG monitoring procedures from 2002 to 2016. 
We selected this dataset for several reasons: (1) it features expert-annotated artifact labels with high agreement among the annotators ( κ > 0.8) ( The TUH EEG Corpus, 2020 ), (2) it offers a diverse range of artifact types and recording conditions, representative of real clinical settings, and (3) it follows standardized collection procedures that ensure data quality and consistency. The study setting was Temple University Hospital, a tertiary care academic medical center in Philadelphia, Pennsylvania, USA. The dataset comprises 310 curated recordings from patients undergoing routine clinical EEG monitoring across various units, including the epilepsy monitoring unit, the neurointensive care units, and the general neurology units. These recordings were collected during standard clinical workflows, including epilepsy monitoring, sleep studies, and neurological assessments, making the developed models applicable to real-world clinical scenarios. Participants were included in the study if they: (1) underwent routine clinical EEG monitoring at Temple University Hospital between 2002 and 2016, (2) had recordings of sufficient duration and quality for artifact analysis, and (3) had comprehensive artifact annotations completed by expert neurophysiologists. No specific exclusion criteria were used beyond standard clinical contraindications for EEG monitoring. All recordings were collected during standard clinical care, with patients receiving routine medical treatments as indicated by their clinical conditions. The artifact annotations were completed by expert neurophysiologists who were blinded to the automated detection algorithms. 2.2.1. Artifact Distribution and Characteristics The data set contains a total of 158,884 artifact annotations across 19 categories (see Table 1 ), with muscle (30.4%), eye movement (23.9%), and electrode (20.1%) artifacts representing the primary sources of contamination. 
Both individual and combined artifacts (e.g., eye + muscle, chewing + electrode) provide a realistic representation of complex patterns encountered in clinical practice. Artifact durations range from brief electrode pops (mean 0.165 hours) to prolonged muscle artifacts (mean 0.260 hours), which capture the diverse temporal characteristics important for developing detection models that can effectively handle both transient and continuous artifacts. View this table: View inline View popup Download powerpoint Table 1 Distribution of EEG artifact types in the TUH dataset with mean duration in hours. 2.2.2. Data Preprocessing The raw EEG data from the TUH EEG Corpus required systematic pre-processing to prepare it for training the artifact detection model. We processed 150 EEG recordings that were selected from the 310 available recordings from the edf/01_tcp_ar files. We standardized the variable sampling rates, which ranged from 250 to 1000 Hz, and the channel configurations, which varied from 27 to 36 channels, to uniform specifications. This standardization was done while preserving the original temporal characteristics essential for deep learning models (see Fig. 2 ). Download figure Open in new tab Figure 2: EEG preprocessing pipeline. The process involves: (a) loading EDF files with raw EEG data from 310 recordings (sample of 150), (b) loading CSV annotations with artifact labels, (c) standardization and preprocessing to 250 Hz and 22 channels with seizure file exclusions, (d) segmentation into adaptive non-overlapping windows, (e) preparation of three binary datasets with 60/20/20 patient-level splits and focal loss parameters, and (f) formatting into 3D tensors with global normalization for direct CNN input. 2.2.3. 
Signal Standardization All recordings underwent systematic standardization, which included the following steps:(1) resampling to 250 Hz, (2) converting to a standardized 22-channel bipolar montage using canonical electrode pairs (FP1-F7, F7-T3, T3-T5, T5-O1, FP2-F8, F8-T4, T4-T6, T6-O2, A1-T3, T3-C3, C3-CZ, CZ-C4, C4-T4, T4-A2, FP1-F3, F3-C3, C3-P3, P3-O1, FP2-F4, F4-C4, C4-P4, P4-O2), and (3) applying bandpass filtering (1-40 Hz) and notch filtering (50/60 Hz) to remove line noise and artifacts outside the desired frequency range. Although the standard clinical EEG range extends to 70 Hz, the 1-40 Hz range was chosen to specifically target cerebral activity and minimize potential signal contamination by high-frequency muscle (EMG) artifacts, which could otherwise lead to an inflated false positive rate. 2.2.4. Referencing and Normalization After filtering, we applied average referencing to reduce common-mode noise and removed DC offsets by subtracting the mean value from each channel. We conducted global normalization using RobustScaler across all channels and timepoints, preserving relative amplitude relationships between channels while standardizing the input range for deep learning model training ( Lawhern, Solon, Waytowich, Gordon, Hung and Lance, 2018 ). These preprocessing steps are important to improve the signal-to-noise ratio and to scale the data appropriately for stable model training. 2.2.5. Adaptive Segmentation and Labeling We segmented each recording into non-overlapping windows of 1, 3, 5, 10, 20, or 30 seconds to create consistent inputs for CNN training. This method maximizes sample independence and preserves the temporal resolution needed for artifact detection. Conducting a structured evaluation across multiple durations is an important step, as the optimal temporal window for detecting an artifact can vary significantly depending on its characteristic timescale. 
For each segment, we extracted the corresponding artifact category from the annotation files and mapped the original 19 categories to six artifact classes and a non-artifact class (seven classes): eye movement (EYEM, 0), muscle activity (MUSC, 1), electrode artifact (ELEC, 2), chewing (CHEW, 3), shivering (SHIV, 4), non-physiological artifact (ELPP, 5), and non-artifact segments (6). This adaptive approach captures both brief artifact events (including sub-second events within a 1-second window) and longer patterns, ensuring appropriate temporal resolution for each type of artifact. 2.2.6. Dataset Preparation We prepared three datasets with binary classes for the detection of specific artifacts as follows. Segments consisting of seizure and background categories were excluded during preprocessing. Eye movements This dataset included segments of class eye movement (0) that were labeled positive and segments of class non-artifact (6) that were labeled negative, focusing on transient, low-frequency frontal activity (blinks, saccades) distinct from EMG and non-physiological noise. Muscle-related artifacts This dataset included segments of class muscle artifact (1), chewing (3), and shivering (4) that were labeled positive and segments of class non-artifact (6) that were labeled negative, due to shared broad-band, high-frequency content and similar impact on signal readability. Non-physiological artifacts This dataset included segments of class electrode artifact (2) and non-physiological artifact (5) that were labeled positive and segments of class non-artifact (6) that were labeled negative, to capture step changes, flatlines, impedance pops, and line-related contamination. Each dataset used the standardized 22-channel bipolar montage with non-overlapping segments at 250 Hz. 
The dataset included 105 patients with 150 recordings, split into training (63 patients, 60%), validation (21 patients, 20%), and test (21 patients, 20%) sets using patient-level splitting to prevent data leakage. Class imbalance was handled using focal loss (α = 0.25, γ = 2.0), a technique that gives more weight to hard-to-classify segments ( Lin, Goyal, Girshick, He and Dollár, 2017 ). To ensure an unbiased evaluation, thresholds were selected on the validation set to maximize Youden’s J statistic and then applied unchanged to the test set( Powers, 2011 ). 2.3. Deep Learning Method The detection of artifacts in EEG signals presents significant challenges due to complex temporal patterns and the high-dimensionality of the data. Traditional rule-based methods, which rely on predefined thresholds and frequency-domain features, often struggle to capture the complex temporal dependencies associated with EEG artifacts ( Delorme et al., 2007 ). However, increasing model complexity does not necessarily lead to improved accuracy; it can even cause performance decline without solving the high variance-high bias problem. To address these issues, we developed a deep lightweight CNN architecture for EEG artifact detection that achieves better performance while using fewer trainable parameters. The architecture employs multiple convolutional layers that progressively extract hierarchical features from raw EEG signals, while using depthwise separable convolutions, reduced filter sizes, and efficient channel configurations to minimize computational complexity and memory requirements. Our framework uses a single CNN designed for one-dimensional temporal EEG data. It incorporates convolutional layers, max-pooling layers, and global average pooling layers to extract features from the raw temporal information and classify them into various artifact categories.. All layer parameters are optimized together by minimizing classification error across the training set. 
The convolutional layer operates across the temporal dimension of the input EEG segments, generating activation maps that capture temporal features relevant to artifact detection. The weights $\omega_{ij}$ represent the filters connecting the input channels ( $i$ ) and the output feature maps ( $j$ ), while $\beta_j$ represents bias. The convolution operation is: $y_j = \sum_i \omega_{ij} * x_i + \beta_j$, where $x_i$ is the $i$-th input channel, $y_j$ is the $j$-th output feature map, and $*$ denotes one-dimensional convolution along the temporal axis. We used ReLU activation for its non-linearity, computational speed, and ability to handle non-negative inputs: $\mathrm{ReLU}(x) = \max(0, x)$. Max-pooling reduces the temporal size of feature maps, which helps decrease the number of parameters and computational costs while preventing overfitting. Global average pooling computes the average value of each feature map across the temporal dimension, creating a single feature vector and serving as a regularizer ( Lin, Chen and Yan, 2013 ). The framework was optimized through an iterative process balancing computational complexity and performance, using patient-level splitting with cross-validation to evaluate and select the best configuration. 2.3.1. Loss Function and Optimization We used the Adam optimizer with focal loss to address severe class imbalance inherent in EEG artifact detection by assigning higher weights to hard-to-classify examples: $\mathrm{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \log(p_t)$, where $p_t$ is the predicted probability for the true class, $\alpha_t$ is the weighting factor for class $t$ , and $\gamma$ is the focusing parameter. We use α = 0.25 and γ = 2.0 to effectively handle class imbalance while maintaining focus on hard-to-classify examples. 2.3.2. Model Architecture and Training Inputs were multichannel EEG segments in a standardized 22-channel bipolar montage. The temporal length T was adaptive and inferred at runtime, which allowed the same design to operate across different segment lengths without structural changes. 
The deployed model used a lightweight one-dimensional architecture: Conv1D layer (16 filters, kernel size 5, ReLU, same padding), MaxPooling1D (pool size 2), global average pooling over the temporal dimension, and a two-layer classifier (Dense 16, ReLU; Dense 1, sigmoid). This lightweight design without BatchNormalization or Dropout was chosen based on empirical performance comparisons( Lawhern et al., 2018 ). 2.3.3. Training Supervised training used binary targets per classifier with mini-batch optimization (batch size 128, maximum 200 epochs) and shuffled batches. Learning-rate scheduling was based on validation performance, retaining the best weights by validation loss. Decision thresholds were selected on the validation set to maximize Youden’s J statistic, fixed specificity, or maximize the true positive rate (TPR) with the false positive rate FPR ≤ 0.10, then applied unchanged to the test set( Powers, 2011 ). Model fitting used GPU acceleration when available, with CPU preprocessing. Random seeds were fixed for numerical reproducibility, with training proceeding on CPU when a GPU was unavailable, using unchanged hyperparameters. 2.4. Performance Analyses We compared the performance of the CNN with rule-based artifact detection methods across three binary classification tasks: eye movement, muscle artifact, and nonphysiological artifact detection. We computed standard classification metrics (accuracy, precision, recall, F1-score, sensitivity, specificity) and specialized metrics for imbalanced datasets, with F1-score, which serves as the primary optimization criterion during threshold selection ( Saito and Rehmsmeier, 2015 ). 
For performance evaluation, we utilized two key metrics: AUC ROC (Area Under the Receiver Operating Characteristic Curve), which measures the model’s ability to distinguish between artifact and clean segments across all classification thresholds, with values ranging from 0 to 1 where 1.0 indicates perfect classification, and pROC@0.1 (Partial ROC at 0.1 False Positive Rate), which evaluates performance specifically at low false positive rates by measuring the area under the ROC curve up to a false positive rate of 0.1, which gives insight into the model’s performance under strict false positive constraints. For each classifier, we implemented dual-threshold optimization on the validation set: (1) F1-optimal thresholds determined by maximizing the harmonic mean of precision and recall from precision-recall curves, and three alternative strategies: (2) thresholds maximizing Youden’s J statistic (sensitivity + specificity - 1), (3) thresholds that obtained 95% specificity to prioritize low FPRs, and (4) thresholds that maximized the TPR subject to the constraint FPR ≤ 0.1. All optimizations were performed on the validation sets to prevent test data overfitting. 2.4.1. Prevalence-Adjusted Metrics To address the imbalance of the data set, we computed the prevalence adjusted area under the precision recall curve (PR AUC) by adjusting the precision values to a target prevalence of 5%: $\mathrm{Precision}_{\mathrm{adj}} = \frac{\pi \cdot \mathrm{TPR}}{\pi \cdot \mathrm{TPR} + (1 - \pi) \cdot \mathrm{FPR}}$, where π = 0.05 represents target prevalence, TPR is true positive rate, and FPR is false positive rate. Additionally, we calculated partial ROC-AUC for FPR ≤ 0.1, normalized by the FPR limit to provide clinically relevant performance measures focused on low FPRs ( He and Garcia, 2009 ). 2.4.2. Comparison with Standard Methods Performance comparisons between CNN and rule-based methods were conducted using all evaluation metrics. 
Performance differences were evaluated using a threshold-based approach, with differences > 0.01 considered meaningful for distinguishing between methods, while differences < 0.01 were considered ties, indicating equivalent performance. Analysis was performed separately for each artifact detection task. 3. Results 3.1. Segment-level Artifact Detection Eye Movement Detection CNN-based eye movement detection across different segment sizes achieved ROC AUC values ranging from 0.959 to 0.975, with peak performance in 20-second segments (see Table 2 ). The 5-second segment provided an optimal balance between accuracy (92.5%) and specificity (94.0%), while the 20-second segments achieved the highest ROC AUC (0.975) and F1 score (0.905). The 30-second segment demonstrated superior precision (95.1%) and pROC@0.1 (0.088), which indicates excellent performance at low false positive rates. The result indicates that eye movement artifacts benefit from a longer temporal context for reliable detection. View this table: View inline View popup Download powerpoint Table 2 Eye Movement - Window Optimization Muscle Artifact Detection CNN-based muscle artifact detection results varied significantly with segment length, with ROC AUC values ranging from 0.931 to 0.977 (see Table 3 ). Intermediate segment lengths (3-10 seconds) outperformed both shorter and longer segments, with 3-second segments achieving the highest ROC AUC (0.977) and 5-second segments providing peak accuracy (93.2%). The 10-second segment shows superior F1-score (0.863) and pROC@0.1 (0.089) performance. Unlike eye movements, muscle artifacts showed optimal detection in moderate temporal context, with performance declining at very short (1 second) and long (20-30 second) durations. View this table: View inline View popup Download powerpoint Table 3 Muscle Artifact - Window Optimization. 
Non-Physiological Artifact Detection CNN-based detection of non-physiological artifacts was highly dependent on the size of the segment, with performance varying significantly between the durations tested (see Table 4 ). A clear pattern emerged in which shorter segments (1-3 seconds) consistently outperformed longer ones and performance declined as the segment size increased. Peak performance was achieved with 1-second segments, producing the best results for precision, specificity, F1 score, and pROC@0.1 . This confirms that the transient nature of non-physiological events requires shorter temporal segments for optimal detection. View this table: View inline View popup Download powerpoint Table 4 Non-Physiological - Window Optimization. 3.2. Effect of Window Size on Detection We conducted an optimization of the segment size to determine the ideal temporal resolution for each artifact classifier. Our main focus was on specificity ( Goncharova et al., 2003 ; Whitham et al., 2007 ; Nolan et al., 2010 ), while we also considered the F1-score, ROC AUC, and accuracy as secondary metrics. The eye movement detector achieved optimal performance with 20-second segments (ROC AUC: 0.975, F1-score: 90.5%) (see Table 2 ). The Muscle artifact detector performed best with 5-second segments (accuracy: 93.2%, specificity: 96.0%) (see Table 3 ). The Non-physiological artifact detector achieved optimal performance with 1-second segments (ROC AUC: 0.950, accuracy: 96.1%, specificity: 98.2%, F1-score: 77.4%) (see Table 4 ). The results highlight specific temporal requirements for different types of artifacts: non-physiological artifacts are transient and are most effectively detected with a 1-second context, muscle artifacts require a moderate context of 5 seconds, while eye movements benefit from a longer temporal context of 20 seconds. Using Youden’s index for optimal threshold selection, our final models show strong performance across all artifact types (see Table 5 ). 
The eye movement detector achieved 94.2% sensitivity and 89.3% specificity, the muscle artifact detector reached 89.5% sensitivity and 95.1% specificity, while the non-physiological detector attained 87.8% sensitivity and 88.9% specificity. Figure 3 shows the confusion matrices for each detector at their optimal configurations, which illustrates the classification performance on the test set. View this table: View inline View popup Download powerpoint Table 5 Final Model Performance Summary at Optimal Window Sizes Download figure Open in new tab Figure 3: Confusion matrices for artifact detectors using optimal window sizes with Youden’s index thresholds. 3.3. Comparative Analysis Table 6 compares the performance of the CNN and rule-based methods across all types of artifacts and window sizes. For the eye movement detector, the CNN showed improvements in the F1 score from +11.2% (10-second windows) to +15.9% (1-second windows). The CNN had higher accuracy, precision, recall, and F1-score across all window sizes, with rule-based methods only better in specificity at 10-second windows. View this table: View inline View popup Download powerpoint Table 6 Performance metrics (%) for Deep Lightweight CNN vs Rule-Based artifact detection across six temporal windows. Best values per metric in bold with color coding. Muscle artifact detection showed the most significant performance differences between the methods. The CNN had substantial improvements in the F1 score from +24.5% to +34.8%, with substantial improvements in 30-second windows. Although rule-based methods had higher specificity across all window sizes, the CNN showed superior performance in accuracy, precision, recall, and F1-score, with the performance gap larger at increased window sizes. Non-physiological artifact detection showed the largest overall performance differences, with F1 score improvements of +23.2% (30-second windows) to +44.9% (1-second windows). 
The CNN demonstrated superior performance across all metrics, with the performance advantage smaller at longer window sizes but still substantial throughout. These consistent improvements suggest that deep learning approaches better capture the complex temporal patterns present in EEG artifacts. 4. Discussion We compare our findings to established literature on EEG artifact detection. Goncharova et al. identified muscle artifacts by power spectral analysis in the 20-50 Hz range but reported difficulties with specificity in clinical settings ( Goncharova et al., 2003 ). Whitham et al. investigated the frequency properties of muscle artifacts using spectral-based methods and achieved moderate performance in artifact detection ( Whitham et al., 2007 ). Nolan et al. created the FASTER framework, which uses variance thresholds for automated artifact rejection. This framework reflects current clinical standards, boasting an accuracy rate of 85-90%, although it has limited data on specificity ( Nolan et al., 2010 ). Compared to these rule-based methods, our system performs better in terms of overall performance. We report a high specificity of 96.0%-98.2% while achieving a competitive sensitivity of 74.6%-86.5% across artifact types, making it appropriate for real-world applications. Rule-based methods in our evaluation achieved high specificity for muscle artifacts (95.3%-99.3%) but showed poor sensitivity (34.7%-46.9%) and substantially lower F1-scores. Our approach shows F1-score improvements of +11.2% to +44.9% in all types of artifacts. Traditional spectral and variance-based methods showed particularly poor performance for non-physiological artifacts, which is problematic as it leads to significant loss of valuable EEG information. The length of the right window matters: 20s (eye), 5s (muscle), 1s (non-phys). A single general model performs worse than per-artifact models. For clinical use, our models can provide real-time predictions using sliding window approaches. 
Even though models are trained on longer windows (e.g., 20 seconds), they can still give predictions every second by looking back over the required window length. This addresses clinical needs for frequent artifact detection while keeping the performance benefits of longer temporal contexts during training. A key contribution is that we are among the first to develop a CNN model with high sensitivity and specificity for each of the three artifact classes. A limitation of this work is that it is based on a single dataset, and the model needs to be evaluated on other datasets to ensure generalizability. 5. Conclusion We developed a deep lightweight CNN system for automated detection of three artifact classes: eye movement, muscle activity, and non-physiological artifacts. The system consists of three binary detectors that use different time windows and focal loss to handle class imbalance. We evaluated the system through window size tests and compared it with rule-based methods. The CNN system showed better performance than rule-based methods across all artifact types. The high specificity achieved by the CNN system is particularly important for clinical applications, as it minimizes false alarms and reduces unnecessary interruptions in patient monitoring. By making our artifact-specific models and evaluation framework publicly available, we provide a valuable resource for the neurophysiology research community. In future work, artifact detection can be used with other EEG detectors, such as seizure or slowing detectors, to improve clinical monitoring systems. Data Availability The data analyzed are from the Temple University Hospital (TUH) EEG Corpus, specifically the edf/01_tcp_ar subset (routine clinical EEG, 2002-2016). Access is available to qualified researchers upon registration and acceptance of the dataset's data-use terms; we are not authorized to redistribute raw TUH data. 
https://isip.piconepress.com/projects/nedc/html/tuh_eeg/ CRediT authorship contribution statement Evans Nyanney: Conceptualization, Methodology, Software, Writing - Original Draft. Parthasarathy D. Thirumala: Supervision, Validation, Writing - Review & Editing. Shyam Visweswaran: Supervision. Zhaohui Geng: Methodology, Formal analysis. References ↵ Amin , U. , Nascimento , F.A. , Karakis , I. , Schomer , D. , Benbadis , S.R. , 2023 . Normal variants and artifacts: Importance in eeg interpretation . Epileptic Disorders 25 , 591 – 648 . doi: 10.1002/epd2.20040 . OpenUrl CrossRef PubMed Bossuyt , P.M. , Reitsma , J.B. , Bruns , D.E. , et al. , 2015 . STARD 2015: updated list of essential items for reporting diagnostic accuracy studies . BMJ 351 , h5527 . OpenUrl FREE Full Text ↵ Britton , J.W. , Frey , L.C. , Hopp , J.L. , et al. , 2016 . Electroencephalography (EEG): An Introductory Text and Atlas of Normal and Abnormal Findings in Adults, Children, and Infants . American Epilepsy Society, Chicago . ↵ Claassen , J. , Mayer , S.A. , Kowalski , R.G. , Emerson , R.G. , Hirsch , L.J. , 2004 . Detection of electrographic seizures with continuous EEG monitoring in critically ill patients . Neurology 62 , 1743 – 1748 . OpenUrl CrossRef PubMed Collins , G.S. , Moons , K.G.M. , Dhiman , P. , et al. , 2024 . TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods . BMJ 384 , e078378 . OpenUrl ↵ Delorme , A. , Sejnowski , T. , Makeig , S. , 2007 . Enhanced detection of artifacts in EEG data using higher-order statistics and independent component analysis . NeuroImage 34 , 1443 – 1449 . OpenUrl CrossRef PubMed Web of Science ↵ Goncharova , I.I. , McFarland , D.J. , Vaughan , T.M. , Wolpaw , J.R. , 2003 . EMG contamination of EEG: spectral and topographical characteristics . Clinical Neurophysiology 114 , 1580 – 1593 . OpenUrl CrossRef PubMed Web of Science ↵ He , H. , Garcia , E.A. , 2009 . 
Learning from imbalanced data . IEEE Transactions on Knowledge and Data Engineering 21 , 1263 – 1284 . OpenUrl CrossRef ↵ Islam , M.K. , Rastegarnia , A. , Yang , Z. , 2016 . Methods for artifact detection and removal from scalp EEG: A review . Neurophysiologie Clinique 46 , 287 – 305 . OpenUrl PubMed ↵ Jiang , X. , Bian , G.B. , Tian , Z. , 2019 . Removal of artifacts from EEG signals: A review . Sensors 19 , 987 . OpenUrl PubMed ↵ Kalita , B. , Deb , N. , Das , D. , 2024 . AnEEG: leveraging deep learning for effective artifact removal in EEG data . Scientific Reports 14 , 24234 . OpenUrl PubMed ↵ Lawhern , V.J. , Solon , A.J. , Waytowich , N.R. , Gordon , S.M. , Hung , C.P. , Lance , B.J. , 2018 . EEGNet: a compact convolutional neural network for EEG-based brain-computer interfaces . Journal of Neural Engineering 15 , 056013 . OpenUrl PubMed ↵ Lin , M. , Chen , Q. , Yan , S. , 2013 . Network in network . arXiv preprint arXiv:1312.4400 arXiv: 1312.4400 . ↵ Lin , T.Y. , Goyal , P. , Girshick , R. , He , K. , Dollár , P. , 2017 . Focal loss for dense object detection, in: Proceedings of the IEEE international conference on computer vision , pp. 2980 – 2988 . ↵ Mumtaz , W. , Rasheed , S. , Irfan , A. , 2021 . Review of challenges associated with the EEG artifact removal methods . Biomedical Signal Processing and Control 68 , 102741 . OpenUrl ↵ Nolan , H. , Whelan , R. , Reilly , R.B. , 2010 . FASTER: Fully automated statistical thresholding for EEG artifact rejection . Journal of Neuroscience Methods 192 , 152 – 162 . OpenUrl CrossRef PubMed Web of Science ↵ Obeid , I. , Picone , J. , 2016 . The Temple University Hospital EEG Data Corpus . Frontiers in Neuroscience 10 , 196 . OpenUrl PubMed ↵ Powers , D.M.W. , 2011 . Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation . Journal of Machine Learning Technologies 2 , 37 – 63 . OpenUrl ↵ Prakash , V. , Kumar , D. , 2024 . 
Artifact detection and removal in EEG: A review of methods and contemporary usage , in: Advances in Artificial-Business Analytics and Quantum Machine Learning , Springer . pp. 263 – 274 . ↵ Prasad , D.S. , Chanamallu , S.R. , Prasad , K.S. , 2021 . Mitigation of ocular artifacts for EEG signal using improved earth worm optimization-based neural network and lifting wavelet transform . Computer Methods in Biomechanics and Biomedical Engineering 24 , 551 – 578 . OpenUrl PubMed ↵ Saba-Sadiya , S. , Chantland , E. , Alhanai , T. , Liu , T. , Ghassemi , M.M. , 2021 . Unsupervised EEG artifact detection and correction . Frontiers in Digital Health 2 , 608920 . OpenUrl PubMed ↵ Saito , T. , Rehmsmeier , M. , 2015 . The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets . PLOS ONE 10 , e0118432 . OpenUrl CrossRef PubMed ↵ Sharma , S. , Nunes , M. , Alkhachroum , A. , 2022 . Adult critical care electroencephalography monitoring for seizures: A narrative review . Frontiers in Neurology 13 , 951286 . OpenUrl PubMed ↵ The TUH EEG Corpus , 2020 . The Temple University Artifact Corpus: An annotated corpus of EEG artifacts . DOI: 10.13026/9njx-6322 . OpenUrl CrossRef ↵ Urigüen , J.A. , Garcia-Zapirain , B. , 2015 . EEG artifact removal: state of the art and guidelines . Journal of Neural Engineering 12 , 031001 . OpenUrl PubMed ↵ Whitham , E.M. , Pope , K.J. , Fitzgibbon , S.P. , Lewis , T. , Clark , C.R. , Loveless , S. , Broberg , M. , Wallace , A. , DeLosAngeles , D. , Lillie , P. , Hardy , A. , Fronsko , R. , Pulbrook , A. , Willoughby , J.O. , 2007 . Scalp electrical recording during paralysis: quantitative evidence that EEG frequencies above 20 Hz are contaminated by EMG . Clinical Neurophysiology 118 , 1877 – 1888 . doi: 10.1016/j.clinph.2007.04.027 . OpenUrl CrossRef PubMed ↵ Yu , Y. , Li , Y. , Zhou , Y. , Wang , Y. , Wang , J. , 2024 . 
A learnable and explainable wavelet neural network for EEG artifacts detection and classification . IEEE Transactions on Neural Systems and Rehabilitation Engineering 32 , 3358 – 3368 . OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted October 29, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A Deep Lightweight Convolutional Neural Network for Detecting Artifacts in Continuous EEG Signals Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A Deep Lightweight Convolutional Neural Network for Detecting Artifacts in Continuous EEG Signals Evans Nyanney , Parthasarathy D. Thirumala , Shyam Visweswaran , Zhaohui Geng medRxiv 2025.10.28.25338681; doi: https://doi.org/10.1101/2025.10.28.25338681 Share This Article: Copy Citation Tools A Deep Lightweight Convolutional Neural Network for Detecting Artifacts in Continuous EEG Signals Evans Nyanney , Parthasarathy D. 
Thirumala , Shyam Visweswaran , Zhaohui Geng medRxiv 2025.10.28.25338681; doi: https://doi.org/10.1101/2025.10.28.25338681 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neurology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9219) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffba2398d6b58d3',t:'MTc3OTQ1MDk0NQ=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00