A Unified Flexible Large Polysomnography Model for Sleep Staging and Brain Disorder Diagnosis

doi:10.1101/2024.12.11.24318815

A Unified Flexible Large Polysomnography Model for Sleep Staging and Brain Disorder Diagnosis

2024 · doi:10.1101/2024.12.11.24318815

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 108,108 characters · extracted from preprint-html · click to expand

A Unified Flexible Large Polysomnography Model for Sleep Staging and Brain Disorder Diagnosis | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search A Unified Flexible Large Polysomnography Model for Sleep Staging and Brain Disorder Diagnosis View ORCID Profile Guifeng Deng , Mengfan Niu , Shuying Rao , Yuxi Luo , Jianjia Zhang , Junyi Xie , Zhenghe Yu , Wenjuan Liu , Junhang Zhang , Sha Zhao , Gang Pan , Xiaojing Li , Wei Deng , Wanjun Guo , Yaoyun Zhang , View ORCID Profile Tao Li , View ORCID Profile Haiteng Jiang doi: https://doi.org/10.1101/2024.12.11.24318815 Guifeng Deng 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China 2 College of Biomedical Engineering & Instrument Science, Zhejiang University , Hangzhou, 310058, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Guifeng Deng Mengfan Niu 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Shuying Rao 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China 2 College of Biomedical Engineering & Instrument Science, Zhejiang University , Hangzhou, 310058, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yuxi Luo 3 School of Biomedical Engineering, Shenzhen Campus of Sun Yat-sen University , Shenzhen, 518100, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jianjia Zhang 3 School of Biomedical Engineering, Shenzhen Campus of Sun Yat-sen University , Shenzhen, 518100, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Junyi Xie 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zhenghe Yu 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Wenjuan Liu 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Junhang Zhang 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sha Zhao 4 MOE Frontier Science Center for Brain Science and Brain-machine Integration, State Key Lab of Brain-Machine Intelligence, Zhejiang University , Hangzhou 311121, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gang Pan 4 MOE Frontier Science Center for Brain Science and Brain-machine Integration, State Key Lab of Brain-Machine Intelligence, Zhejiang University , Hangzhou 311121, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Xiaojing Li 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China 4 MOE Frontier Science Center for Brain Science and Brain-machine Integration, State Key Lab of Brain-Machine Intelligence, Zhejiang University , Hangzhou 311121, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Wei Deng 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China 4 MOE Frontier Science Center for Brain Science and Brain-machine Integration, State Key Lab of Brain-Machine Intelligence, Zhejiang University , Hangzhou 311121, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Wanjun Guo 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China 4 MOE Frontier Science Center for Brain Science and Brain-machine Integration, State Key Lab of Brain-Machine Intelligence, Zhejiang University , Hangzhou 311121, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yaoyun Zhang 5 School of Biomedical Informatics, University of Texas at Dallas , 800 W Campbell Rd, Richardson, TX 75080, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Tao Li 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China 4 MOE Frontier Science Center for Brain Science and Brain-machine Integration, State Key Lab of Brain-Machine Intelligence, Zhejiang University , Hangzhou 311121, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tao Li For correspondence: litaozjusc{at}zju.edu.cn h.jiang{at}zju.edu.cn Haiteng Jiang 1 Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, School of Brain Science and Brain Medicine, and Liangzhu Laboratory, Zhejiang University School of Medicine , Hangzhou, 310058, China 4 MOE Frontier Science Center for Brain Science and Brain-machine Integration, State Key Lab of Brain-Machine Intelligence, Zhejiang University , Hangzhou 311121, China 6 Department of Psychiatry and Mental Health, Wenzhou Medical University , Wenzhou 325035, Zhejiang Province, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Haiteng Jiang For correspondence: litaozjusc{at}zju.edu.cn h.jiang{at}zju.edu.cn Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Sleep disorders affect billions worldwide, yet clinical polysomnography (PSG) analysis remains hindered by labor-intensive manual scoring and limited generalizability of automated sleep staging tools across heterogeneous protocols. We present LPSGM, a large-scale PSG model designed to address two critical challenges in sleep medicine: cross-center generalization and adaptable diagnosis of neuropsychiatric disorders. Trained on 220,500 hours of multi-center PSG data (24,000 full-night recordings from 16 public datasets), LPSGM integrates domain-adaptive pre-training, flexible channel configurations, and a unified architecture to mitigate variability in equipment, montages, and populations during sleep staging while enabling downstream fine-tuning for brain disorder detection. In prospective validation, LPSGM achieves expert-level consensus in sleep staging (κ = 0.845 ± 0.066 vs. inter-expert κ = 0.850 ± 0.102) and matches the performance of fully supervised models on two independent private cohorts. When fine-tuned for sleep disorder diagnosis, LPSGM achieved 80.47% accuracy on the large-scale MNC dataset (773 subjects) for a three-class classification (Healthy Control vs. T1 Narcolepsy vs. Other Hypersomnia). The model also demonstrated strong cross-institutional generalizability, with an AUC of 0.8791 on independent cohorts for a binary (Normal vs. Abnormal) classification. While depression screening on smaller datasets showed perfect accuracy in controlled settings, larger-scale validation is necessary. By bridging automated sleep staging with real-world clinical deployment, LPSGM establishes a scalable framework for integrated sleep and brain disorder diagnostics. The code and pre-trained model are publicly available at https://github.com/Deng-GuiFeng/LPSGM to advance reproducibility and translational research in sleep medicine. Introduction Sleep occupies nearly one-third of human life and serves as a critical determinant of overall health 1 , 2 . The escalating global burden of sleep disorders—including insomnia, obstructive sleep apnea (OSA), and narcolepsy—represents a major public health challenge. Epidemiological studies indicate that OSA affects approximately one billion adults worldwide 3 , while insomnia disorder impacts 10% of the adult population, with an additional 20% experiencing transient symptoms 4 . Polysomnography (PSG), the gold-standard tool for sleep assessment, captures multimodal physiological signals—including electroencephalogram (EEG), electrooculogram (EOG), and electromyogram (EMG) to analyze sleep architecture according to American Academy of Sleep Medicine (AASM) guidelines 5 , which classify sleep into wakefulness (W), non-rapid eye movement (NREM: N1–N3), and rapid eye movement (REM) stages. By mapping these stages, PSG reveals how deviations in sleep architecture—such as shortened REM latency in narcolepsy 6 , 7 or fragmented sleep, reduced slow-wave activity and disinhibition REM sleep in MDD 8 – 11 —serve as biomarkers for neuropsychiatric conditions, positioning it as a dual-purpose tool for both sleep staging and brain disorder diagnostics. Conventional manual PSG scoring requires clinicians to annotate 30-second epochs, a process requiring approximately two hours per full-night recording (7–9 hours of PSG data). This labor-intensive method suffers from significant inter-rater variability due to subjective interpretation of ambiguous patterns, undermining diagnostic consistency across clinical setting 12 – 16 . Machine learning advances have partially addressed these inefficiencies through automated sleep staging 17 – 34 . Early approaches relied on hand-engineered features and traditional machine learning models (e.g., support vector machines, random forests), but struggled to capture complex patterns of sleep physiology 17 – 23 . Subsequent deep learning frameworks—notably convolutional and recurrent neural networks (CNNs/RNNs)—enabled end-to-end learning of hierarchical representations directly from raw physiological data, achieving accuracy comparable to human experts 24 – 34 . However, current studies prioritize intra-dataset performance using fixed PSG channel configurations, exhibiting marked performance degradation in cross-domain settings due to variability in PSG protocols and population demographics. Transfer learning strategies 28 – 32 —including domain adaptation 28 , 29 , 32 and adversarial training 31 —improve out-of-distribution (OOD) generalization but persistently underperform models trained on target-domain data. These limitations underscore the urgent need for robust, generalizable frameworks capable of adapting to heterogeneous clinical environments. Despite recent progress, clinical adoption of automated sleep staging systems remains constrained by two interrelated challenges: domain shift and data scarcity. PSG heterogeneity—driven by variability in recording protocols, equipment, and populations—contravenes the independent and identically distributed (IID) assumption critical to conventional deep learning, while the labor-intensive nature of manual annotation impedes scalable dataset curation. Although numerous public sleep datasets exist 6 , 35 – 52 , their collective utility is diminished by inconsistent signal montages and channel configurations ( Supplementary Table 2 ). Drawing inspiration from recent advances in large language models (LLMs) 53 – 55 and EEG foundation models 56 – 58 , we propose that a unified framework integrating multi-center PSG data through domain-adaptive pre-training could mitigate domain shifts while unlocking downstream clinical utility. Here, we present LPSGM, a large PSG model designed to overcome two critical barriers in sleep medicine: (1) cross-center generalization in automated sleep staging and (2) adaptable PSG-based diagnosis of brain disorders. Our framework integrates three core innovations: (i) Unified training protocol for harmonizing heterogeneous datasets with variable montages and channel configurations, (ii) Flexible inference architecture that dynamically adapts to diverse clinical PSG setups, (iii) Hybrid pre-training on 220,500 hours of PSG data (24,000 recordings) aggregated from 16 public datasets. LPSGM achieves performance parity with fully supervised models trained on two independent private datasets (HANG7 and SYSU; see Supplementary Methods for additional details). In prospective validation, it matches human expert consensus (model vs. experts: κ = 0.845 ± 0.066; inter-expert agreement: κ = 0.850 ± 0.102), demonstrating clinical-grade reliability. When fine-tuned for brain disorder diagnosis, LPSGM demonstrates robust cross-dataset generalizability, achieving strong performance in sleep disorder classification on a large-scale multi-center dataset (773 subjects) and maintaining diagnostic reliability when transferred to independent clinical cohorts without additional training. By unifying robust cross-domain generalization with clinical adaptability, LPSGM bridges the gap between automated sleep staging and real-world deployment, enabling scalable analysis of sleep physiology and its neuropsychiatric correlates. Results Cross-Center Classification Performance of Sleep Staging We evaluated LPSGM’s cross-center sleep staging performance on the HANG7 and SYSU datasets against two baselines ( Table 1 ). Baseline 1 (lower bound) involved direct application of a model trained on one center’s data to another without adaptation, while Baseline 2 (upper bound) used models fully trained on target-center data via five-fold cross-validation. LPSGM-Small outperformed LPSGM in baseline comparisons and was selected as the reference baseline. LPSGM achieved near-upper-bound performance on both datasets. For HANG7, LPSGM reached 85.68% accuracy, 82.88% macro-F1, and a kappa of 0.8138 (99.6%, 99.9%, and 99.5% of Baseline 2, respectively). On SYSU, it attained 84.13% accuracy, 77.88% macro-F1, and 0.7789 kappa (97.1%, 96.7%, and 95.7% of Baseline 2), demonstrating robust generalization across clinical centers. View this table: View inline View popup Download powerpoint Table 1: Comparison of the sleep staging results between LPSGM and baselines on two target private centers. Additional validation on three public datasets—the MESA 71 and two subsets from the MASS (MASS-SS1 and MASS-SS3) 70 —further confirmed LPSGM’s robust generalization capabilities, achieving accuracy of 83.74%, 79.98%, and 84.97% respectively, demonstrating consistent performance across diverse populations, age groups, and recording protocols ( Supplementary Fig. 2 ). Notably, sleep staging performance exhibited variation across subject subgroups ( Fig. 2b ), suggesting potential demographic- or pathology-dependent effects on model generalizability. LPSGM achieved highest accuracy in healthy cohorts (88.99% on HANG7; 86.15% on SYSU) but declined slightly in populations with sleep or neurological disorders—notably, to 87.11% (hypersomnia) and 82.45% (narcolepsy) on HANG7, and 78.36% (MDD) on SYSU. This may reflect atypical sleep architecture in clinical populations. Download figure Open in new tab Fig. 1: Overview of the LPSGM framework. Panel (a): Data harmonization schematizes the aggregation of 220,500 hours of PSG data from 16 public datasets and 2 independent clinical cohorts, spanning diverse geographic populations and recording protocols. Panel (b): Cross-center generalization outlines the training-evaluation pipeline: LPSGM is pre-trained on multi-center public datasets, validated for cross-domain sleep staging on two unseen private datasets, and fine-tuned for downstream tasks including sleep disorder diagnosis and MDD screening. Panel (c): Analytical validation details the study’s three-pronged evaluation: (1) a prospective clinical trial benchmarking LPSGM against expert consensus, (2) interpretability analysis to decode decision-making patterns, and (3) ablation studies quantifying the contribution of key components. Download figure Open in new tab Fig. 2: Performance of LPSGM on Sleep Staging. (a) Hypnogram comparison of a full-night PSG recording from the HANG7 dataset. The top panel illustrates sleep stages predicted by LPSGM, while the bottom panel shows the expert-annotated reference hypnogram. (b) Boxplots summarizing distributions of accuracy (Acc), macro-F1 (MF1), and Cohen’s κ across subject groups in the HANG7 and SYSU datasets. (c) Trade-off between model performance and inference speed under varying channel configurations (8C, 4C, 2C, and 1C; C = channels) on the HANG7 dataset. The x-axis denotes inference time per full-night recording, and the y-axis reports performance metrics (Acc, MF1, κ). (d) Five-fold cross-validation results on the HANG7 and SYSU datasets, comparing models trained with and without large-scale hybrid pre-training. Blue circles denote models trained exclusively on target-center data (without pre-training), while red squares represent models pre-trained on public datasets before target-center fine-tuning (with pre-training). During the inference stage, LPSGM dynamically balances accuracy and computational efficiency by adjusting input channels without altering the model structure. Fig. 2c demonstrates the trade-off between performance metrics and inference time per recording across different EEG channel configurations (8C, 4C, 2C, 1C) obtained from the HANG7 dataset. When using eight channels (8C), LPSGM achieves the highest accuracy of 85.68%, macro-F1 of 82.88%, and kappa of 81.38%, with an inference time of about 4 seconds per recording. As the number of channels decreases, the inference time reduces while there is a corresponding drop in performance metrics. When further reducing to two channels (2C), the accuracy is 84.23±0.49%, macro-F1 81.53±0.46%, kappa 79.57±0.61%, with the inference time dropping to about 1 second per recording. Using a single channel (1C), the model’s accuracy drops more significantly to 82.03±0.74%, macro-F1 to 79.21±0.67%, and kappa to 76.76±0.90%, but the inference time is the fastest at approximately 0.5 seconds per recording. See Supplementary Table 5 for comprehensive performance metrics across all tested channel configurations. In addition to cross-center generalization, we investigate the role of large-scale hybrid pre-training by comparing LPSGM models with and without pre-training through five-fold cross-validation (Fig.2d). The pre-trained model—initially developed on 220,500 hours of public, multi-center polysomnography (PSG) data—demonstrated superior performance following fine-tuning on target-center datasets. Quantitative analysis revealed consistent improvements across all evaluation metrics: pre-training enhanced accuracy, macro-F1 score, and Cohen’s kappa by 2.04%, 1.91%, and 2.67%, respectively, for the SYSU dataset, while comparable improvements of 1.27%, 1.57%, and 1.62% were observed for the HANG7 dataset. These systematic enhancements not only validate the efficacy of the pre-trained LPSGM framework for domain adaptation but also demonstrate its practical advantages, including faster convergence and significantly better performance compared to models trained from scratch. Comparison with State-of-the-Art Sleep Staging Methods To our knowledge, LPSGM constitutes the largest PSG training framework to date among existing sleep staging models, a scale we hypothesize underlies its superior generalization performance. We systematically evaluated LPSGM against contemporary methods using standardized dataset sizes, categorizing comparison approaches as either non-transfer-based or transfer-based according to their original published methodologies. Non-transfer-based methods included: DeepSleepNet 24 (a classical CNN-BiLSTM network for extracting local features and learning transition rules), TinySleepNet 25 (a classical model based on CNN and RNN with fewer model parameters), U-Time 33 (a fully-CNN encoder-decoder architecture for time series segmentation applied to sleep staging), and AttnSleep 34 (a hybrid architecture composed of a multi-resolution CNN and multi-head self-attention with causal convolutions). Transfer-based methods comprised SleepDG 32 (a combination of CNN and Transformer architectures that incorporates a proposed multi-level feature alignment technique to extract domain-invariant features) and RobustSleepNet 30 (channel-adaptive architecture trained on heterogeneous configurations). Notably, only RobustSleepNet and LPSGM supported flexible channel configurations. To ensure fair comparison, we standardized input channels to the overlapping C3-M2 and E1-M2 montages across all methods except RobustSleepNet and LPSGM, which were evaluated under both 2-channel (2C) and 8-channel (8C) configurations. We implemented these methods based on their public code and default hyper-parameters under our cross-center protocol. LPSGM (8C) achieved state-of-the-art performance across all metrics ( Table 2 ), outperforming both LPSGM (2C) and comparator methods. This demonstrates the value of multi-channel EEG integration through complementary signal information. Remarkably, LPSGM (2C) maintained superior performance over alternatives, highlighting the efficacy of Transformer-based channel encoding and attention mechanisms compared to conventional channel-stacking convolutional approaches. In non-transfer methods, we observed a model size-performance correlation, implying capacity limitations in large-scale training scenarios. Among transfer-based methods, SleepDG approached LPSGM (2C) performance, while RobustSleepNet’s compact architecture (0.2M parameters) exhibited dataset-specific learning patterns, evidenced by pronounced HANG7-SYSU performance discrepancies, suggesting constrained generalizability. View this table: View inline View popup Table 2: Performance comparison with existing sleep staging methods. Performance of Brain Disorder Diagnosis To assess LPSGM’s diagnostic capabilities beyond sleep staging, we conducted fine-tuning experiments for brain disorder classification across two independent clinical datasets with distinct patient populations and recording protocols. The MNC dataset comprised 773 PSG recordings from six international cohorts spanning three diagnostic categories: non-narcolepsy controls ( n =310), Type 1 narcolepsy patients ( n =254), and patients with other hypersomnia conditions ( n =209) (Supplementary Methods, Supplementary Table 3 ). The HANG7 dataset included 127 subjects from a single clinical center: 33 healthy controls, 51 narcolepsy patients (13 Type 1, 38 Type 2), and 43 patients with hypersomnia symptoms associated with anxiety and depression (Supplementary Methods). We evaluated three fine-tuning strategies—Full Fine-tune, Partial Fine-tune, and Joint Fine-tune—against Train from Scratch baselines across multiple classification tasks, with particular emphasis on cross-dataset generalization performance. On the MNC dataset, three-class classification among the diagnostic categories demonstrated clear advantages for fine-tuning approaches. At the subject level, Full Fine-tune achieved 80.47% accuracy (κ = 0.7043), outperforming Partial Fine-tune at 77.75% accuracy (κ = 0.6643) and Train from Scratch at 72.70% accuracy (κ = 0.5880) ( Fig. 3a ). Binary classification tasks provided additional validation of LPSGM’s discriminative capacity. For Normal (Non-narcolepsy Healthy Control) versus Abnormal (T1 Narcolepsy and Other Hypersomnia) discrimination, Full Fine-tune achieved subject-wise AUC of 0.9663, compared to Partial Fine-tune (0.9442) and Train from Scratch (0.9394) ( Fig. 3b ). Type 1 narcolepsy identification against Non-T1 Narcolepsy Control demonstrated Full Fine-tune reaching subject-wise AUC of 0.9115, with Partial Fine-tune achieving 0.9065 and Train from Scratch attaining 0.8403 ( Fig. 3c ). Download figure Open in new tab Fig. 3: Performance evaluation of LPSGM after fine-tuning for sleep disorder diagnosis on MNC and HANG7 datasets. Three fine-tuning strategies were compared: Full Fine-tune (all parameters updated), Partial Fine-tune (only disease classifier updated), and Joint Fine-tune (simultaneous optimization for sleep staging and disease diagnosis), alongside Train from Scratch baseline. All evaluations used five-fold cross-validation unless specified otherwise. (a) Three-class classification on MNC dataset distinguishing Non-narcolepsy Healthy Controls, T1 Narcolepsy, and Other Hypersomnia, with performance metrics shown for both sample-wise (left) and subject-wise (right) evaluations. (b) Binary classification ROC analysis on MNC dataset for Normal (Non-narcolepsy) versus Abnormal (T1 Narcolepsy and Other Hypersomnia) discrimination, displaying sample-wise (left) and subject-wise (right) receiver operating characteristic curves with corresponding area under curve (AUC) values. (c) Binary classification ROC analysis on MNC dataset for T1 Narcolepsy identification, comparing Non-T1 Narcolepsy Control (Non-narcolepsy and Other Hypersomnia) versus T1 Narcolepsy, with sample-wise (left) and subject-wise (right) performance. (d) Binary classification ROC analysis on HANG7 dataset for Normal (Healthy Control) versus Abnormal (Narcolepsy and Hypersomnia) discrimination, including both within-dataset five-fold cross-validation and cross-dataset transfer evaluation (Fine-tune from MNC curve), demonstrating model generalizability across independent clinical cohorts. (e) Representative example of LPSGM diagnostic workflow showing overnight hypnogram with sleep stages automatically scored by LPSGM (top), and corresponding epoch-wise diagnostic predictions alternating between Normal and Abnormal classifications (bottom). Subject-level diagnosis is determined through majority voting of epoch-wise predictions across the entire recording. Note that joint fine-tuning was not applicable to MNC dataset due to incomplete sleep stage annotations. See Supplementary Fig. 3 - 7 for detailed confusion matrices for all classification tasks and fine-tuning approaches. Cross-dataset evaluation provided direct assessment of generalization capabilities critical for clinical deployment. Models fine-tuned on MNC for Normal versus Abnormal classification were directly applied to HANG7 without additional training, achieving subject-wise AUC of 0.8791 ( Fig. 3d ). This cross-institutional performance validates LPSGM’s capacity to maintain diagnostic accuracy across different patient populations, recording equipment, and clinical protocols. Within-dataset evaluation on HANG7 for Normal (Healthy Control) versus Abnormal (Narcolepsy and Hypersomnia) classification confirmed superior performance across fine-tuning approaches: Joint Fine-tune achieved subject-wise AUC of 0.9146, Full Fine-tune reached 0.9055, and Partial Fine-tune attained 0.8607, all substantially exceeding Train from Scratch performance at 0.8253 ( Fig. 3d ). The availability of complete sleep stage annotations in HANG7 enabled Joint Fine-tune evaluation, which maintained simultaneous sleep staging and disease classification capabilities. Subject-wise evaluation consistently yielded superior performance compared to sample-wise metrics across all tasks, reflecting the clinical practice of diagnostic assessment based on comprehensive overnight recordings rather than individual epoch classifications ( Fig. 3e ). In addition to the sleep disorder diagnostic tasks, we evaluated LPSGM’s capacity for depression screening on two additional datasets. On the SYSU dataset comprising healthy controls ( n =20) and patients with major depressive disorder ( n =24), LPSGM demonstrated exceptional performance with Joint Fine-tune achieving perfect subject-wise accuracy (100%) and AUC of 1. However, performance declined on the larger APPLES dataset (928 healthy, 163 depressed subjects), where the classification target was depression history. With Full Fine-tune, subject-wise accuracy was 72.0% and the AUC was 0.7236. Detailed results for depression screening tasks are provided in Supplementary Fig. 8 . Prospective Validation of LPSGM for Clinical Sleep Staging We conducted a prospective validation study at the HANG7 center (protocol: Fig. 1c ) to assess the clinical applicability of LPSGM. Three board-certified sleep experts independently scored twenty full-night PSG recordings, randomly selected over a one-month period. LPSGM predictions were generated for the same recordings, with comparative analyses evaluating: 1) Inter-expert variability: Each expert’s annotations vs. consensus derived from the other two; 2) Model-expert agreement: LPSGM predictions vs. full expert consensus. Classification metrics ( Table 3 ) employed probabilistically adjusted accuracy and Cohen’s κ (Supplementary Methods). Consensus labels were determined via epoch-wise majority voting, with tied probabilities evenly distributed. LPSGM achieved 89.2% accuracy (κ = 0.845), closely matching expert consensus (88.7% accuracy, κ = 0.850). Two-tailed t -tests revealed no significant differences between LPSGM and experts (accuracy: p = 0.74; κ: p = 0.60). Notably, LPSGM exhibited lower variability (SD: 4.6% accuracy, 6.6% κ) than individual experts (SD: 9.7% accuracy, 10.2% κ), highlighting superior consistency. View this table: View inline View popup Download powerpoint Table 3: Comparison of classification metrics between LPSGM and experts. Hypnogram metrics ( Table 4 ) quantified sleep architecture through stage transitions, categorized into: 1) Sleep latency: Time to enter specific stages. 2) Sleep duration: Total time per stage. 3) Stage distributions: Proportion relative to total sleep time. Mean absolute error (MAE) was calculated for each scorer against consensus contributors. For model-expert comparisons, MAE reflected LPSGM’s deviation from individual experts, while inter-expert MAE captured pairwise differences. Statistical analysis identified significant discrepancies in N2 latency ( p < 0.05) and N1 duration/distribution ( p < 0.01), suggesting that while LPSGM robustly captures global sleep architecture, refining transient N1/N2 transition detection could further enhance clinical utility. View this table: View inline View popup Table 4: Mean absolute error (MAE) comparison of hypnogram metrics between LPSGM and experts. Interpretability Analysis of LPSGM The black-box nature of deep learning models remains a critical barrier to clinical adoption, where transparency in decision-making is essential. To decode LPSGM’s inference mechanisms, we applied gradient-weighted class activation mapping (Grad-CAM) 59 , a post hoc interpretability method that generates spatial heatmaps highlighting input regions most salient to model predictions. Analyses focused on the final convolutional layer of the Epoch Encoder’s dual-branch CNN ( Fig. 5b ), with activation maps resampled to EEG resolution and aggregated across branches to visualize attention patterns during sleep staging. Figure 4 illustrates Grad-CAM results for six representative 30-second EEG epochs. Panels a–d showcase correct classifications, with attention peaks aligning with clinical biomarkers per AASM guidelines: (a) Posterior dominant rhythm (PDR) in occipital channels (>50% epoch coverage), pathognomonic of wakefulness. (b) Frontal slow-wave activity diagnostic of N3 sleep. (c) Low-amplitude mixed-frequency (LAMF) EEG with slow eye movements (SEM), indicative of N1 sleep. (d) Rapid eye movements (REMs) confirming REM sleep. In contrast, panels e–f depict misclassifications where activation patterns diverge from established biomarkers— either misallocating attention (e.g., transient artifacts) or lacking discriminative features for ambiguous epochs. Download figure Open in new tab Fig. 4: Grad-CAM visualizations of LPSGM predictions for sleep staging. The figure presents six 30-second EEG epochs analyzed using Grad-CAM to highlight model-attended regions across eight EEG channels (F3, F4, C3, C4, O1, O2, E1, E2). Panels (a)-(d) depict correctly classified epochs, while panels (e)-(f) show misclassifications. Highlighted regions indicate features the model deemed relevant for classification. Black boxes mark physician-identified features consistent with AASM sleep staging criteria. (a) Posterior dominant rhythm (PDR) in occipital channels (>50% of the epoch), indicative of Wake. (b) Slow wave activity in frontal channels (>20% of the epoch), characteristic of N3. (c) Absence of PDR, low-amplitude mixed-frequency (LAMF) EEG, and slow eye movements (SEM), consistent with N1. (d) Rapid eye movements (REMs), indicative of REM sleep. (e)-(f) Misclassified epochs where model attention These visualizations provide two key insights into LPSGM’s interpretability: (1) the model successfully attends to clinically relevant EEG features, confirming its biological plausibility, yet (2) reveals limitations in processing ambiguous or complex patterns, suggesting areas for future improvement. Ablation Study To validate key architectural decisions in LPSGM, we performed systematic ablation analyses focusing on two core components: channel & temporal encoding approaches, and feature fusion strategies. These innovations, described in detail in the Methods section, are designed to enhance the model’s ability to process heterogeneous PSG datasets. We evaluated three encoding variants (no encoding, addition-based encoding, and concatenation-based encoding) alongside two fusion approaches (channel averaging versus CLS token-based fusion). As shown in Table 5 , concatenation-based encoding paired with CLS token fusion achieved optimal performance. The baseline (no encoding) exhibited the weakest results, while addition-based encoding showed partial improvement. This hierarchy underscores the necessity of preserving spatiotemporal information in Transformer architectures. We posit that addition-based encoding underperforms due to dimensional conflation during feature integration, which degrades discriminative signal retention. In contrast, concatenation prevents dimensional overlap, enabling robust representation of heterogeneous inputs. CLS token fusion outperformed channel averaging by replacing static linear aggregation with dynamic, attention-driven weighting. While naive averaging indiscriminately combines features, CLS tokens leverage self-attention to emphasize diagnostically salient patterns, thereby preserving feature distinctiveness while adaptively prioritizing clinically relevant biomarkers. View this table: View inline View popup Download powerpoint Table 5: Performance comparison of different channel & temporal encoding and feature fusion methods. To determine the optimal architecture for clinical deployment, we evaluated LPSGM’s performance across varying numbers of Transformer blocks ( N = 1–6). As shown in Table 6 , increasing N yielded divergent outcomes: the HANG7 cohort exhibited consistent performance gains (accuracy: +2.32% from N = 1 to N = 6), while SYSU showed non-linear improvement, peaking at N = 5 (+3.90% vs. N = 1). This dataset-specific response suggests differing sensitivities to model complexity, likely arising from variations in data heterogeneity between clinical centers. View this table: View inline View popup Download powerpoint Table 6: Performance comparison of different number of Transformer block. Balancing computational efficiency and performance, we selected N = 4 as the optimal configuration. This choice retained >99.5% of maximum achievable accuracy while reducing memory demands by 30.8% (9.9M parameters vs. 14.3M for N = 6). The diminishing returns beyond N = 5 highlight the necessity of architectural parsimony for real-world clinical applications, where resource constraints and deployment scalability are paramount. Discussion LPSGM represents a transformative advance in automated sleep staging and brain disorder diagnosis through PSG. Trained on 220,500 hours of PSG data (24,000 full-night recordings aggregated across 16 public datasets) 6 , 35 – 52 , LPSGM achieves robust cross-center generalization, matching the performance of models trained exclusively on target-center data. This capability stems from its ability to approximate real-world clinical diversity: multi-center training aggregates heterogeneous sampling distributions arising from variations in population demographics, recording protocols, and annotation criteria. While individual datasets offer limited snapshots of clinical reality, their integration yields a comprehensive approximation of true data distributions, enabling near-optimal performance (99.6% and 97.1% of center-specific accuracy on independent HANG7 and SYSU datasets, respectively). These results highlight coordinated multi-institutional data sharing as a critical pathway to enhance robustness and accelerate clinical adoption of automated sleep staging technologies. LPSGM’s dynamic channel adaptability addresses the inherent heterogeneity of PSG protocols across clinical environments, enabling scalable sleep assessment tailored to diverse diagnostic needs. In specialized sleep laboratories, the model maximizes precision by leveraging multi-channel EEG configurations to detect subtle neurological or psychiatric abnormalities. Conversely, in resource-constrained settings—such as primary care clinics or home-based monitoring—LPSGM maintains reliability even with reduced channel setups, facilitating accessible preliminary assessments and longitudinal compliance. This flexibility bridges the gap between high-resolution diagnostics and real-world practicality while establishing a foundation for future integration of multimodal physiological signals (e.g., ECG, respiratory metrics), which could refine diagnostic accuracy and enhance pathophysiological insights into sleep disorders. Interpretability remains foundational to deploying deep learning models in clinical workflows 60 – 62 . Grad-CAM 59 analyses reveal that LPSGM prioritizes EEG features aligned with clinical biomarkers during sleep staging—for example, posterior dominant rhythms (PDR) for wakefulness and slow-wave activity for N3 sleep. This congruence with expert-defined criteria (AASM guidelines 5 ) fosters trust in LPSGM’s decision-making, as visualized in attention maps from the final convolutional layer of the Epoch Encoder ( Fig. 5b ). However, misclassifications correlate with divergent attention patterns, such as transient artifact focus or ambiguous EEG signatures (e.g., N1/N2 transitions). These cases highlight opportunities to refine temporal context modeling and artifact resilience. Future iterations could integrate clinician feedback loops during training and embed real-time saliency overlays into user interfaces, enabling practitioners to validate model attention against clinical intuition 63 , 64 . Such transparency enhancements are critical for adoption in routine workflows, bridging the gap between algorithmic outputs and actionable clinical insights. Download figure Open in new tab Fig. 5: Overall architecture of LPSGM. (a) LPSGM consists of an Epoch Encoder, Sequence Encoder, and Classifier, designed for both sleep staging and disorder diagnosis. (b) The Epoch Encoder employs a dual-branch CNN to extract local intra-epoch features from each 30-second PSG segment, using small and large convolutional filters to capture high- and low-frequency EEG features, respectively. (c) The Sequence Encoder consists of a series of N Transformer blocks to capture temporal dependencies across epochs in the sleep sequence. Each Transformer block consists of multi-head self-attention (MSA), feed-forward networks (FFN), and layer normalization (LN). (d) Padding and masking strategy implemented to handle samples with varying numbers of EEG channels, ensuring compatibility across different PSG datasets. Sleep and mental health are bidirectionally linked 65 – 68 through neurochemical and circadian pathways, with disrupted sleep exacerbating psychiatric symptoms while mental disorders often dysregulate sleep architecture. Our evaluation of LPSGM for brain disorder diagnosis yielded encouraging yet nuanced results across different clinical contexts. For sleep-related disorders, LPSGM demonstrated robust performance in narcolepsy and hypersomnia classification on the large-scale MNC dataset (773 subjects), achieving subject-wise accuracy of 80.47% for three-class classification and AUC of 0.9663 for binary Normal versus Abnormal discrimination. Critically, the model maintained diagnostic reliability when transferred from MNC to the independent HANG7 dataset without additional training (AUC=0.8791), validating cross-institutional generalizability essential for clinical deployment across diverse healthcare settings. Depression screening presented more complex findings requiring careful interpretation. On the SYSU dataset, LPSGM achieved exceptional performance with perfect subject-wise accuracy (100%) for Healthy versus Depressive classification. However, performance declined substantially on the larger APPLES dataset (subject-wise accuracy = 72.0%, AUC = 0.7236). Several factors may explain this discrepancy. First, the SYSU dataset comprised distinctly contrasting populations—healthy controls and patients with major depressive disorder—representing extreme phenotypic differences that facilitate classification. Second, APPLES employed a surrogate depression label derived from medical history records rather than formal clinical diagnosis, introducing potential labeling inaccuracies. Third, APPLES exclusion criteria specifically excluded patients with Hamilton Depression Rating Scale (HAMD) scores of 24 or greater, suggesting that even depression-positive subjects exhibited relatively mild symptomatology. Finally, the APPLES cohort consisted primarily of obstructive sleep apnea patients, whose sleep architecture is inherently disrupted by frequent arousals and respiratory events, potentially masking depression-specific sleep pattern alterations and complicating PSG-based mood disorder detection. These depression screening results require cautious interpretation and warrant validation through larger-scale studies with standardized clinical assessments and diverse patient populations. Nevertheless, our findings demonstrate LPSGM’s potential as a foundation for PSG-based brain disorder screening, establishing an important initial framework for integrating sleep physiology with psychiatric diagnosis. While LPSGM demonstrates promising performance, several limitations merit consideration. First, hypnogram analysis revealed opportunities for refinement: transient N1/N2 transitions—critical for assessing sleep fragmentation—were less reliably detected by LPSGM compared to expert consensus. Future iterations could prioritize temporal context modeling (e.g., attention to stage progression patterns) to improve resolution of these clinically salient dynamics. Second, while LPSGM achieves strong performance with multi-channel configurations, accuracy drops in single-channel setups—a critical barrier for home or resource-limited applications. Future work should prioritize optimizing feature extraction algorithms for minimal-channel configurations to enhance accessibility. Additionally, current disease classification operates at the sequence level, processing samples comprising L consecutive epochs as described in Methods. While this temporal context proves sufficient for sleep staging tasks, brain disorder diagnosis may require analysis of longer contextual windows to detect subtle pathological sleep patterns that manifest across extended time scales. Furthermore, to advance clinical utility, subsequent studies could extend LPSGM’s scope to other disorders such as sleep apnea and anxiety disorders, while integrating multimodal signals (e.g., chin electromyography, heart rate variability) to disentangle comorbidities and refine diagnostic specificity. Finally, adaptation for wearable devices could facilitate broader access to PSG-based monitoring, enabling scalable, real-world at-home sleep assessments that bridge diagnostic gaps in underserved populations. In conclusion, this work presents LPSGM as a unified, scalable framework for automated PSG analysis, combining robust cross-domain sleep staging with a novel extension to brain disorder diagnosis. By harmonizing large-scale multi-center PSG data and enabling adaptive channel configurations, the model bridges critical gaps between technical innovation and clinical applicability. To facilitate further research and clinical translation, we release both the source code and pre-trained models, offering the scientific community a foundation for advancing generalizable, clinically adaptable PSG technologies. Methods Overview LPSGM operates as a sequence-to-sequence model for automatic sleep staging using multichannel PSG signals ( Fig. 5a ). The architecture comprises three core components: an Epoch Encoder that extracts local intra-epoch features, a Sequence Encoder that models global inter-epoch dependencies via self-attention mechanisms, and a Classifier that assigns sleep stages to sequential 30-second PSG epochs. Epoch Segmentation To address heterogeneous channel configurations, PSG signals are segmented into 30-second epochs. For a multi-channel PSG sequence containing L epochs, each epoch comprises C i channels and n = 30 × f s samples. Segmentation is performed across temporal and channel dimensions, yielding L × C i single-channel segments . This standardization enables consistent input formatting for downstream encoding and classification. Epoch Encoder Each single-channel segment is processed by a dual-branch convolutional neural network (CNN) inspired by DeepSleepNet 24 . The two branches employ small and large filters to capture high- and low-frequency features, respectively. Each branch consists of four convolutional layers (performing 1D convolution, batch normalization and ReLU activation sequentially), two max pooling layers for downsampling, and a global average pooling layer. Outputs from both branches are concatenated along the feature dimension, resulting in a 512-dimensional feature ∈ ℝ d ( d = 512). Details on the filter sizes, number of filters, stride sizes, and pooling sizes are provided in Fig. 5b . Channel & Temporal Encoding Unlike recurrent neural networks (RNNs) that process sequences sequentially, Transformer architectures lack inherent positional encoding mechanisms for capturing temporal or channel-specific information. To address this, we introduce channel and temporal embeddings inspired by LaBraM 58 , but replace additive fusion with concatenation to preserve dimensional distinctiveness ( Fig. 5c ). Specifically, we maintain two embedding sets: a channel embedding set CE = { ce 1 , ce 2 , …, ce | C | } and a temporal embedding set TE = { te 1 , te 2 , …, te L }. Let d ce and d te denote the dimension of the embedding vectors in the CE and TE , respectively. For each feature vector from epoch encoder, we concatenate its corresponding channel and temporal embeddings based on its channel c and temporal position l within the sequence. This results in an embedded feature vector , described by the following equation: where ⊕ denotes the concatenation operation along the feature dimension, is the original feature vector from the epoch encoder, ce c is the channel embedding from the channel embedding list CE , and te l is the temporal embedding from the temporal embedding list TE . The channel and temporal embedding list CE and TE are learnable parameters of the model, optimized during the training process. Padding & Masking In cross-dataset hybrid training, where input sequences maintain a fixed temporal length ( L epochs) but exhibit variability in channel counts ( C i ), we address hardware limitations in batch processing through a padding and masking strategy adapted from natural language processing (NLP). Following segmentation and encoding, feature sequences of variable length L × C i are standardized by unfolding each batch of B sequences into C i × T i matrices ( T i = L , fixed to 10-minute epochs). These matrices are padded with zero vectors to a uniform dimension of C max × T max , where C max and T max are the maximum number of channels and temporal length within the batch. A binary mask is generated to distinguish valid data from padding: where M ij = 1 indicates that the j -th element of the i -th sequence is padding while M ij = 0 indicates valid data. The mask serves dual roles: suppressing padded positions during Transformer self-attention computations to prevent spurious feature interactions, and isolating loss calculations exclusively to valid data points during backpropagation. By enforcing T i = L , we ensure temporal consistency across sequences while enabling efficient batch training. Insertion of CLS token To generate unified representations for variable-length sequences, we introduce the classification (CLS) token, a technique commonly used in Transformer-based architectures. Specifically, L learnable CLS tokens are prepended to each feature sequence ẽ l . These tokens share the dimensionality of the encoded features ( d + d ce + d te ), thereby extending the sequence to with a length of L × ( C + 1). The CLS tokens are initialized randomly and optimized during training to capture global sequence characteristics. Sequence Encoder The sequence encoder processes the padded and masked feature sequences (including CLS tokens) to extract global temporal dependencies from multi-channel, multi-epoch sleep data. Built on the Transformer architecture, it comprises N identical Transformer blocks, each containing a multi-head self-attention (MSA) layer and a feed-forward network (FFN), interleaved with layer normalization (LN). Let E 0 denote the input feature sequence and E out the output sequence. The encoding process for each Transformer block (indexed l = 1, …, N ) is defined as: The final output is computed as: The sequence retains its original dimensions through this process. To generate classification features E cls , the first L elements of E out (corresponding to CLS tokens) are extracted: Classifier Two classifiers are used: one for sleep staging and another for disease diagnosis. Each classifier consists of a fully connected layer with a softmax function. The sleep staging classifier is a five-class classifier responsible for assigning each epoch to one of the five sleep stages. It takes E cls as input and outputs the probabilities for the five sleep stages corresponding to each of the L epochs. The disease diagnosis classifier is a binary classifier designed to predict the presence or absence of a specific disorder. This classifier takes the average feature vector of the sequence, E cls , computed by averaging the features across the L epochs, as input, and outputs the probability indicating the likelihood of the presence or absence of the disorder. Hybrid Training and Cross-Center Testing on Sleep Staging Task We conducted hybrid training on source domain consisting of 16 public datasets and evaluate the performance of our approach on 2 target domain datasets. We create a validation set by stratified random sampling of 10% from each public dataset. The model is evaluated on this validation set after every epoch, and the best performing model parameters are saved. The length of the sleep sequence L is set to 20, meaning that the context length of the considered sequence is 10 minutes. The feature dimension d is set to 512. The dimension of channel encoding d ce and temporal encoding d te are set to 32 and 64, respectively. The number of Transformer Block is set to 4, the number of heads h is 8, and the dimension of feed-forward network is 608. We use the weighted cross-entropy (WCE) function as the loss function for the sleep staging task: where denotes the probability that x i actually belongs to the k -th stage, and denotes the probability that x i is predicted to the k -th stage. The category weight w k is set to the normalized value of the reciprocal of the k -th stage proportion in the training set. We implemented LPSGM based on PyTorch. The model is trained using Adam optimizer with default setting, the weight decay is set to 1e-3 and the mini-batch size is set to 512. The training epochs is 35, with a warm-up phase of 15 epochs. The learning rate undergoes linear increase from 0 to 1e-4 during the warm-up phase, followed by decay according to a cosine annealing strategy to 1e-6. During training, we employed a data augmentation technique that randomly drops channels to prevent the model from over-relying on specific channels or too many channels. Specifically, each channel in a sample x i with C i channels has a 50% chance of being dropped. The model was trained on one machine with Intel Xeon Gold 6330 CPU and two NVIDIA A800 GPUs. Fine-tuning for Disorder Diagnosis Tasks In addition to the sleep staging task, we further fine-tune the model for brain disorder diagnosis tasks, using the cross-entropy loss function. We investigate three fine-tuning paradigms: partial fine-tuning, full fine-tuning, and joint fine-tuning, and compare them against a model trained from scratch. Partial fine-tuning: In this paradigm, we freeze all modules of the pre-trained sleep staging model except for the classifier, which is modified and fine-tuned specifically for the disorder diagnosis task. Full fine-tuning: In this paradigm, both the classifier and the entire model are fine-tuned for the disorder diagnosis task. Joint fine-tuning: This paradigm simultaneously fine-tunes both the sleep staging task and the disorder diagnosis task in parallel. The training procedure for all fine-tuning paradigms consists of 10 epochs, whereas training from scratch involves 30 epochs. In all four experimental paradigms, the initial learning rate lr 0 for the disorder diagnosis classifier is set to 1e-3, while for all other modules, the initial learning rate is set to 1e-5. The AdamW optimizer is used, and the learning rate is adjusted using a cosine annealing strategy. Data Availability See Supplementary Methods and Supplementary Tables 2 and 3 for a detailed description of the data. The APPLES 35 , 36 , STAGES 35 , ABC 35 , 37 , HOMEPAP 35 , 52 , SHHS 35 , 38 , PATS 35 , 39 , 40 , CHAT 35 , 41 , CCSHS 35 , 42 , CFS 35 , 43 , MNC 6 , 35 , NCHSDB 35 , 44 , MROS 35 , 45 , MESA 71 and MNC 6 , 35 datasets are provided by the National Sleep Research Resource with appropriate deidentification. Permission and access for these datasets can be obtained via the online portal: https://www.sleepdata.org . The SVUH 46 , HMC 47 , P2018 48 and CAP 49 datasets are available from PhysioNet at https://physionet.org/content/ucddb/1.0.0/ , https://physionet.org/content/hmc-sleep-staging/1.1/ , https://physionet.org/content/challenge-2018/1.0.0/ and https://physionet.org/content/capslpdb/1.0.0/ . The ISRUC 50 dataset can be accessed from https://sleeptight.isr.uc.pt/ . DOD-H and DOD-O datasets 51 can be downloaded at https://github.com/Dreem-Organization/dreem-learning-open . MASS 70 dataset can be accessed from https://borealisdata.ca/dataverse/MASS . Access to the HANG7 and SYSU dataset 69 is governed by data-use agreements, and it is therefore not publicly available. Code Availability The source code and pre-trained models of our proposed LPSGM is made available in GitHub: https://github.com/Deng-GuiFeng/LPSGM . Author Contributions G.D. and H.J. designed the methodology. J.Z. and Y.Z provided critical insights into the methodological design. G.D. conducted the experiments and performed the primary analysis. G.D. and M.N. carried out the interpretability analysis. S.R., J.X., S.Z., and G.P. contributed to the analysis and interpretation of results. M.N., and Z.Y. collected, organized, and preprocessed the HANG7 dataset. Y.L. collected, organized, and preprocessed the SYSU dataset. W.L., X.L., W.D., W.G., and T.L. led the prospective study. Competing Interests The authors declare no competing interests. Additional Information Supplementary Information Supplementary Methods Datasets To the best of our knowledge, this work represents the most extensive use of data for training a large-scale sleep staging model to date. We aggregated approximately 24,000 full-night PSG recordings, encompassing roughly 220,500 hours of sleep data, from 16 publicly available datasets as source domains for training. A summary of the public datasets is provided below. The Apnea Positive Pressure Long-term Efficacy Study (APPLES) is a multi-center dataset that includes overnight PSG recordings from 1,104 patients with obstructive sleep apnea syndrome (OSAS). The dataset includes four EEG and two EOG channels, and was scored using Rechtschaffen and Kales (R&K) criteria. For depression screening evaluation, we utilized the depression medical history variable (depressionmedhxhp) from the physician’s history and physical form, which categorizes subjects as: None ( n =841), Resolved ( n =87), Ongoing ( n =163), and Accidental skip ( n =7). Subjects with ongoing depression were classified as the Depression group, while those with None or Resolved status were classified as Healthy controls, excluding subjects with accidental skip entries. The Danish Center for Sleep Medicine (DCSM) dataset consists of 255 randomly selected and fully anonymized overnight lab-based PSG recordings from patients visiting the DCSM for the diagnosis of non-specific sleep related disorders. The dataset includes six EEG and two EOG channels, and was scored according to the AASM criteria. The Dreem Open Dataset (DOD) consists of two subsets, DOD-H and DOD-O. DOD-H comes from French Armed Forces Biomedical Research Institute’s (IRBA), and contains PSG recordings from 25 healthy volunteers. DOD-O comes from the Stanford Sleep Medicine Center, and contains PSG recordings from 56 OSAS patients. Both datasets contain twelve and eight EEG channels, respectively. For experimentation, we specifically selected the three and five channels that overlap with the AASM criteria. Both contain two EOG channels and were scored according to the AASM criteria. Haaglanden Medisch Centrum (HMC) dataset consists of 151 randomly selected whole-night PSG of different sleep disorders. The dataset includes four EEG and two EOG channels, and was scored according to AASM criteria. The Institute of Systems and Robotics, University of Coimbra (ISRUC) dataset consists of 126 PSG recordings from the Sleep Medicine Center of the Hospital of the University of Coimbra, Portugal. The dataset comprises three groups of data. Data in group one concerning 100 subjects, with one recording session per subject. Data in group two is gathered from 8 subjects and two recording sessions were performed per subject. Data in group three is collected from one recording session related to 10 healthy subjects. The dataset includes six EEG and two EOG channels and was scored according to the AASM criteria. The St. Vincent’s University Hospital (SVUH) dataset contains 25 full overnight PSG with suspected sleep-disordered breathing. The dataset contains two EEG and two EOG channels and was scored according to R&K criteria. P2018 (You Snooze You Win: The PhysioNet/Computing in Cardiology Challenge 2018) dataset was contributed by the Massachusetts General Hospital’s (MGH) Computational Clinical Neurophysiology Laboratory (CCNL), and the Clinical Data Animation Laboratory (CDAC). The dataset consists of 994 training examples and 989 test examples, with only the training data having labels publicly available. We only use the training data, which includes 6 EEG and 1 EOG channels, and uses the AASM criteria for sleep staging. The Stanford Technology Analytics and Genomics in Sleep (STAGES) dataset was collected on 1500 patients evaluated for sleep disorders from six centers. The dataset contains six EEG and two EOG channels and is annotated based on AASM criteria. The Apnea, Bariatric surgery, and CPAP (ABC) dataset includes 80 patients with severe OSAS, with six EEG and two EOG channels, being scored based on AASM criteria. The Nationwide Children’s Hospital Sleep DataBank (NCHSDB) has 3,984 pediatric sleep studies on 3,673 unique patients conducted at NCH in Columbus, Ohio, USA between 2017 and 2019. The dataset includes six EEG and two EOG channels and using AASM criteria. The Home Positive Airway Pressure (HOMEPAP) study was a multi-center dataset that enrolled 373 patients with suspected moderate and severe OSAS. Subjects were randomized to lab-based and home-based management. We only use the lab-based subset as it includes the channels we need. The lab-based subset includes six EEG and two EOG channels with AASM criteria annotation. The Childhood Adenotonsillectomy Trial (CHAT) is a multi-center dataset that enrolled 1447 children with mild to moderate OSAS. The dataset includes six EEG and two EOG channels with AASM criteria annotation. The Cleveland Children’s Sleep and Health Study (CCSHS) dataset consists of 515 PSG recordings from 907 children aged between 8 and 11 years old. The dataset includes two EEG and two EOG channels, and was scored according to AASM criteria. The Cleveland Family Study (CFS) is a family-based study of sleep apnea worldwide, compring 730 overnight PSG from 2284 indivisuals. The dataset includes two EEG and two EOG channels with AASM criteria. The MROS dataset enrolled 5994 men 65 years or older at six clinical centers. The dataset consists of 3929 PSG recordings with two EEG and two EOG channals, and was scored according to AASM criteria. The Sleep Heart Health Study (SHHS) is a multi-center cohort study implemented by the National Heart Lung & Blood Institute, consisting of two subset. SHHS-1 and SHHS-2 contain 5793 and 2651 overnight PSG, respectively. The dataset contains two EEG and EOG channels and was scored according to R&K criteria. We test the model on two private datasets from different clinical centers to evaluate the cross-center generalization performance of our methods. The discription of each dataset is provided below. The HANG7 dataset was acquired from the Affiliated Mental Health Center & Hangzhou Seventh People’s Hospital, Zhejiang University School of Medicine. Data were randomly selected from the sleep department and collected between October 2018 and July 2022 using the Australian Compumedics Grael polysomnography system. Data collection was conducted at Zhejiang University with Institutional Review Board approval, and written consent was obtained from all subjects or their caregivers. The dataset comprises PSG recordings from 127 subjects (73 females and 54 males, aged 11-57 years with mean age 23.1 ± 9.9 years), including 33 healthy controls, 51 patients diagnosed with narcolepsy (13 with type 1 and 38 with type 2), and 43 patients exhibiting hypersomnia symptoms associated with anxiety and depression but not meeting the diagnostic criteria for narcolepsy. The dataset includes six EEG and two EOG channels sampled at a frequency of 512 Hz. PSG recordings were scored by experienced clinicians according to AASM criteria. The SYSU dataset includes two groups: 20 healthy controls and 24 patients with major depressive disorder (MDD). Data were collected using the Compumedics Profusion EEG recording system with Neuvo amplifier. All subjects were equipped with the same device to collect overnight PSG signals. This study received approval from the Ethics Committee of Guangdong 999 Brain Hospital (approval number: 2020-010-059). The experiments involving healthy individuals were conducted at the sleep laboratory of Sun Yat-sen University, encompassing 80 PSG recordings from 20 healthy undergraduate and graduate students (10 males and 10 females, mean age 21.9 ± 1.2 years) sampled at 500 Hz. Data collection involved 4 consecutive nights per participant, and all participants were normal sleepers without sleep disorders (Pittsburgh Sleep Quality Index: 5.2 ± 3.0, sleep efficiency: 91.05% ± 4.35%). The MDD dataset comprised 24 PSG recordings from 24 MDD patients (13 males and 11 females, mean age 21.4 ± 7.1 years), who were diagnosed by two experienced psychiatrists based on the criteria of Diagnostic and Statistical Manual of Mental Disorders, Fourth Edition (DSM-IV). The scores of 24-item Hamilton Depression Scale (HAMD-24) and the self-rating Depression Scale (SDS) were 22.6±6.20 and 65.57±9.53, respectively. All patients were without drug abuse, suicide risk, pregnancy, present or history of head injuries, seizures, or epilepsy. The EEG signals were sampled at 256 Hz. Both groups included six EEG and two EOG channels and were scored according to the AASM scoring manual by two well-trained sleep technologists. To evaluate the performance of our fine-tuned model for disease diagnosis, we utilized the Mignot Nature Communications (MNC) dataset, a multi-center collection of raw polysomnography data from 10 different cohorts recorded at 12 sleep centers across 3 continents. In this study, we utilized six cohorts from the MNC dataset, encompassing a total of 773 PSG recordings. These cohorts include the Chinese Narcolepsy Cohort (CNC), Danish Hypersomnia Cohort (DHC), French Hypersomnia Cohort (FHC), Italian Hypersomnia Cohort (IHC), Korean Hypersomnia Cohort (KHC), and the Stanford Sleep Cohort (SSC). For our analysis, we used only the raw PSG signals and diagnostic labels from this dataset, without utilizing hypnogram annotations. The dataset comprises three diagnostic categories: non-narcolepsy controls ( n =310), Type 1 narcolepsy patients ( n =254), and patients with other hypersomnia conditions ( n =209). Type 1 narcolepsy diagnosis was confirmed through clinical criteria including cataplexy and/or cerebrospinal fluid hypocretin-1 levels ≤ 110 pg/ml. The other hypersomnia group included patients with Type 2 narcolepsy, idiopathic hypersomnia, and other forms of excessive daytime sleepiness. The detailed diagnostic distribution across cohorts is provided in Supplementary Table 4 . To further evaluate sleep staging performance across diverse populations and recording protocols, we additionally tested LPSGM on three public datasets: the Multi-Ethnic Study of Atherosclerosis (MESA) and two subsets from the Montreal Archive of Sleep Studies (MASS). The Multi-Ethnic Study of Atherosclerosis (MESA) Sleep dataset comprises 2,033 overnight unattended polysomnography recordings collected between 2010-2012 from a multi-ethnic population including African American, White/Caucasian, Hispanic, and Chinese-American participants. The dataset contains one EEG channel and two EOG channels, with sleep staging performed according to AASM criteria. The Montreal Archive of Sleep Studies (MASS) is an open-access database of laboratory-based polysomnography recordings. MASS comprises five subsets (SS1-SS5), from which we selected SS1 and SS3 as they employ 30-second epoch annotations consistent with clinical practice, while other subsets use 20-second epochs. MASS-SS1 comprises 53 overnight PSG recordings, while MASS-SS3 includes 62 overnight PSG recordings. Both subsets contain six EEG and two EOG channels, and were scored according to AASM criteria. Preprocessing For all PSG datasets, we selected eight EEG/EOG channels recommended by the AASM criteria for sleep staging. No other channels or data types present in the datasets were utilized. The chosen eight channels included F3-M2, F4-M1, C3-M2, C4-M1, O1-M2, O2-M1, E1-M2, and E2-M1. All the PSG recordings were initially subjected to a fourth-order bandpass filter (0.3 Hz to 35 Hz) and subsequently resampled at a rate of 100 Hz. Finally, Z-score normalization was applied individually to each channel of every PSG recording: where x represents a single PSG recording, C denotes the set of channels for that recording. The samples were clamped to the range [−10,10] after Z-score normalization to minimize the impact of outliers. We adhered to the current AASM sleep staging standards. For datasets originally labeled according to the R&K standards, we followed the conventional approach of merging stages N3 and N4 into a single N3 stage. Additionally, we removed any sleep epochs without labels, which typically indicated sensor detachment, sleep interruptions, or other anomalies. After removing such segments, the data was divided into two distinct segments at that specific point. Table 2 shows the class distribution across the datasets. Channel Standardization and Indexing A key objective of LPSGM is to flexibly process PSG recordings from heterogeneous sources (summarized in Supplementary Table 2 ), which vary in both the number of available channels and their specific montages. The model achieves this flexibility not by enforcing a strict channel order, but by mapping available channels to a fixed set of channel identities. This process ensures that channel-specific information is correctly interpreted by the model, regardless of the input configuration. The model internally maintains eight learnable channel embedding vectors { ce 1 , ce 2 , …, ce 8 }, as described in the Methods. These embeddings were randomly initialized and optimized during training to represent the unique characteristics of the eight AASM-recommended channels selected for this study: F3-M2, F4-M1, C3-M2, C4-M1, O1-M2, O2-M1, E1-M2 and E2-M1. During preprocessing for any given dataset, we implemented a protocol to map the dataset’s provided channels to these eight model-specific identities: EEG Channel Mapping (F3, F4, C3, C4, O1, O2): For each of the six target EEG locations, we first identified the corresponding channel in the dataset’s metadata. AASM-Referenced: If the channel was already referenced according to AASM guidelines (e.g., F3-M2, F3-A2, F4-M1, F4-A1), it was directly mapped to its corresponding model identity (e.g., F3-A2 was mapped to the F3-M2 identity). Globally-Referenced: If the channel used a global reference (e.g., F3, F3-REF), we checked for the presence of AASM reference signals (M1, M2, A1, A2) in the recording file. If available, we computationally re-referenced the channel by calculating the differential (e.g., F3 signal minus M2 signal) to create the target channel. If AASM reference signals were not available, we directly adopted the globally-referenced channel (e.g., F3) and mapped it to its corresponding model identity (e.g., F3-M2). EOG Channel Mapping (E1, E2): The same principles were applied to the EOG channels. An additional rule was implemented for datasets providing a differential EOG channel (e.g., E1-E2, LOC-ROC); in such cases, the signal was mapped to the E1-M2 model identity. This identity-based mapping, rather than a fixed-order input, is foundational to the model’s design. Before the Epoch Encoder’s features are fed into the Sequence Encoder (Transformer), each channel’s feature vector is concatenated with its corresponding learnable embedding (e.g., ce 3 for C3-M2). The Transformer architecture, being inherently permutation-invariant, processes these tagged feature vectors in parallel. This mechanism, combined with the Padding & Masking strategy (see Methods), allows the model to inherently manage variable numbers of input channels, seamlessly handling both multi-channel and minimal-channel configurations without structural modification or retraining. For a comprehensive implementation of the preprocessing scripts used for each dataset, please refer to our publicly available code repository. Classification Metrics with Probabilistic Extensions Let n be the total number of 30-s epochs retained, C be the number of sleep-stage classes. For each epoch i = 1, …, n are the true-label and predicted-probability vectors (each in [0,1] C and summing to 1). Probabilistic Accuracy The probabilistic accuracy is an extended metric that calculates the average match between the true and predicted probabilistic distributions across all samples. It quantifies how well the predicted probabilities align with the true probabilities on a per-sample basis. This metric essentially computes the weighted sum of the element-wise product of the true and predicted probability distributions for each sample, then averages these values across all samples. Probabilistic Cohen’s Kappa The probabilistic Cohen’s Kappa extends the traditional Cohen’s Kappa to accommodate probabilistic predictions. It measures the agreement between the true and predicted distributions, accounting for the possibility of multiple correct classes per sample. First construct the soft confusion matrix and denote its total weight N = ∑ a , b M ab . Then the observed agreement is and the chance agreement Finally, This extension allows for a more nuanced assessment of model performance when dealing with probabilistic predictions and scenarios where multiple classes may be partially correct for a given sample. It should be noted that when the probability distributions are deterministic (i.e., the probability is concentrated in a single class for each sample, such as [1,0,0], [0,1,0], or [0,0,1]), the probabilistic accuracy and Cohen’s Kappa reduce to their traditional counterparts. In such cases, the probabilistic metrics yield identical results to the conventional accuracy and Cohen’s Kappa, ensuring consistency with standard evaluation practices when dealing with hard classifications. Supplementary Figures Download figure Open in new tab Supplementary Figure 1: Cross-center sleep staging confusion matrices of LPSGM. Panel (a) presents the confusion matrices of LPSGM on the HANG7 and SYSU datasets. The values in each cell represent the number of 30-second epochs classified into each sleep stage, with darker shades indicating higher counts. Precision and recall scores for each sleep stage are reported below. Panel (b) shows confusion matrices for different subject groups within the HANG7 dataset. Panel (c) displays confusion matrices for different subject groups within the SYSU dataset. Download figure Open in new tab Supplementary Figure 2: Cross-center sleep staging confusion matrices of LPSGM on the (a) MESA, (b) MASS-SS1 and (c) MASS-SS3 datasets. The values in each cell represent the number of 30-second epochs classified into each sleep stage, with darker shades indicating higher counts. Precision and recall scores for each sleep stage are reported below. MESA achieved overall accuracy of 83.74%, macro-F1 of 78.61%, and Cohen’s kappa of 77.37%. MASS-SS1 achieved overall accuracy of 79.98%, macro-F1 of 77.98%, and Cohen’s kappa of 73.23%. MASS-SS3 achieved overall accuracy of 84.97%, macro-F1 of 82.83%, and Cohen’s kappa of 78.75%. Per-class F1 scores for MESA were: W (94.64%), N1 (57.35%), N2 (79.74%), N3 (70.17%), R (91.15%). Per-class F1 scores for MASS-SS1 were: W (89.61%), N1 (64.98%), N2 (81.66%), N3 (65.38%), R (88.30%). Per-class F1 scores for MASS-SS3 were: W (90.66%), N1 (61.74%), N2 (87.43%), N3 (82.14%), R (92.20%). Download figure Open in new tab Supplementary Figure 3: Confusion matrices for three-class classification on MNC dataset distinguishing Non-narcolepsy Control, T1 Narcolepsy, and Other Hypersomnia. Results are shown for Full Fine-tune, Partial Fine-tune, and Train from Scratch approaches, evaluated at both sample-wise (left panels) and subject-wise (right panels) levels. Each matrix displays absolute prediction counts with precision and recall values for individual classes presented below the confusion matrices. Download figure Open in new tab Supplementary Figure 4: Confusion matrices for binary classification on MNC dataset discriminating Normal (Non-narcolepsy) versus Abnormal (T1 Narcolepsy and Other Hypersomnia). Results are presented for Full Fine-tune, Partial Fine-tune, and Train from Scratch approaches, with evaluation at both sample-wise (left panels) and subject-wise (right panels) levels. Matrices show absolute prediction counts for each true-predicted label combination. Download figure Open in new tab Supplementary Figure 5: Confusion matrices for binary classification on MNC dataset identifying T1 Narcolepsy, comparing Non-T1 Control (Non-narcolepsy and Other Hypersomnia) versus T1 Narcolepsy. Results are displayed for Full Fine-tune, Partial Fine-tune, and Train from Scratch approaches, evaluated at both sample-wise (left panels) and subject-wise (right panels) levels. Matrices present absolute prediction counts for true versus predicted classifications. Download figure Open in new tab Supplementary Figure 6: Confusion matrices for binary classification on HANG7 dataset discriminating Normal (Healthy Control) versus Abnormal (Narcolepsy and Hypersomnia). Results are shown for Joint Fine-tune, Full Fine-tune, Partial Fine-tune, and Train from Scratch approaches, with evaluation at both sample-wise (left panels) and subject-wise (right panels) levels. Matrices display absolute prediction counts demonstrating within-dataset cross-validation performance. Download figure Open in new tab Supplementary Figure 7: Confusion matrices for cross-dataset transfer evaluation from MNC to HANG7, showing binary classification performance for Normal versus Abnormal discrimination without additional fine-tuning on the target dataset. Results demonstrate the generalizability of models fine-tuned on MNC dataset when directly applied to HANG7 cohort, evaluated at both sample-wise (left panel) and subject-wise (right panel) levels. Download figure Open in new tab Supplementary Figure 8: LPSGM performance for depression screening across datasets. Performance evaluation for Healthy versus Depressive classification on the SYSU dataset, comparing Train from Scratch, Partial Fine-tune, Full Fine-tune, and Joint Fine-tune approaches through both accuracy metrics (a) and receiver operating characteristic (ROC) curve analysis (b), with results displayed at sample-wise (left panels) and subject-wise (right panels) levels and corresponding area under the curve (AUC) values for all fine-tuning strategies. (c) Performance evaluation for depression screening on the APPLES dataset using Full Fine-tune approach, showing accuracy, sensitivity, and specificity metrics (left) and ROC curve analysis (right) at both sample-wise and subject-wise levels. Depression classification was based on ongoing depression status from medical history records in the APPLES dataset. Download figure Open in new tab Supplementary Figure 9: Grad-CAM visualizations of LPSGM predictions for subjects lacking a posterior dominant rhythm (PDR). According to AASM guidelines, a subset of individuals (approximately 10%) do not generate an alpha rhythm upon eye closure, necessitating reliance on alternative biomarkers for scoring Wakefulness and Stage N1. This figure displays representative 30-second epochs from such subjects, where heatmaps (magenta) indicate regions with high positive contribution to the model’s decision. (a) Stage W: The model directs its attention to the EOG channels (E1/E2), highlighting high-amplitude waveforms characteristic of Eye Blinks. (b) Stage W: The model focuses on Rapid Eye Movements (REMs) in the EOG channels (E1/E2), which serve as an alternative criterion for scoring Wakefulness in the absence of alpha rhythm. (c) Stage N1: Lacking alpha attenuation as a marker, the model identifies N1 by focusing on Slow Eye Movements (SEM) in the EOG channels and Vertex Sharp Waves (V Waves) in the central EEG (C4). (d) Stage N2: The model consistently attends to characteristic K-complexes and Sleep Spindles, demonstrating that the detection of these phasic events remains robust independent of the background rhythm. Supplementary Tables View this table: View inline View popup Download powerpoint Supplementary Table 1: Definitions of hypnogram metrics. View this table: View inline View popup Supplementary Table 2: Overview of sleep staging datasets used in our experiments. View this table: View inline View popup Supplementary Table 3: Number of sleep stages of the datasets after preprocessing. View this table: View inline View popup Download powerpoint Supplementary Table 4: Subject distribution across diagnostic categories in the MNC dataset cohorts. View this table: View inline View popup Download powerpoint Supplementary Tabel 5: LPSGM sleep staging performance across different channel configurations on the HANG7 dataset. Supplementary Notes Acknowledgements The Apnea Positive Pressure Long-term Efficacy Study (APPLES) was supported by the National Heart, Lung, and Blood Institute (U01HL68060). This research has been conducted using the STAGES - Stanford Technology, Analytics and Genomics in Sleep Resource funded by the Klarman Family Foundation. The investigators of the STAGES study contributed to the design and implementation of the STAGES cohort and/or provided data and/or collected biospecimens, but did not necessarily participate in the analysis or writing of this report. The full list of STAGES investigators can be found at the project website. The Apnea, Bariatric surgery, and CPAP study (ABC Study) was supported by National Institutes of Health grants R01HL106410 and K24HL127307. Philips Respironics donated the CPAP machines and supplies used in the perioperative period for patients undergoing bariatric surgery. The Home Positive Airway Pressure study (HomePAP) was supported by the American Sleep Medicine Foundation 38-PM-07 Grant: Portable Monitoring for the Diagnosis and Management of OSA. The Sleep Heart Health Study (SHHS) was supported by National Heart, Lung, and Blood Institute cooperative agreements U01HL53916 (University of California, Davis), U01HL53931 (New York University), U01HL53934 (University of Minnesota), U01HL53937 and U01HL64360 (Johns Hopkins University), U01HL53938 (University of Arizona), U01HL53940 (University of Washington), U01HL53941 (Boston University), and U01HL63463 (Case Western Reserve University). The Pediatric Adenotonsillectomy Trial for Snoring (PATS) study was supported by the U.S. National Institutes of Health, National Heart Lung and Blood Institute (1U01HL125307, 1U01HL125295). The Childhood Adenotonsillectomy Trial (CHAT) was supported by the National Institutes of Health (HL083075, HL083129, UL1-RR-024134, UL1 RR024989). The Cleveland Children’s Sleep and Health Study (CCSHS) was supported by grants from the National Institutes of Health (RO1HL60957, K23 HL04426, RO1 NR02707, M01 Rrmpd0380-39). The Cleveland Family Study (CFS) was supported by grants from the National Institutes of Health (HL46380, M01 RR00080-39, T32-HL07567, RO1-46380). The Mignot Nature Communications research was mostly supported by a grant from Jazz Pharmaceuticals to E.M. Additional funding came from: NIH grant R01HL62252 (to P.E.P.); Ministry of Science and Technology 2015CB856405 and National Foundation of Science of China 81420108002,81670087 (to F.H.); H. Lundbeck A/S, Lundbeck Foundation, Technical University of Denmark and Center for Healthy Aging, University of Copenhagen (to P.J. and H.B.D.S). Additional support was provided by the Klarman Family, Otto Mønsted, Stibo, Vera & Carl Johan Michaelsens, Knud Højgaards, Reinholdt W. Jorck and Hustrus and Augustinus Foundations (to A.N.O.). NCH Sleep DataBank was supported by the National Institute of Biomedical Imaging and Bioengineering of the National Institutes of Health under Award Number R01EB025018. The National Heart, Lung, and Blood Institute provided funding for the ancillary MrOS Sleep Study, “Outcomes of Sleep Disorders in Older Men,” under the following grant numbers: R01 HL071194, R01 HL070848, R01 HL070847, R01 HL070842, R01 HL070841, R01 HL070837, R01 HL070838, and R01 HL070839. The Mignot Nature Communications research was mostly supported by a grant from Jazz Pharmaceuticals to E.M. Additional funding came from: NIH grant R01HL62252 (to P.E.P.); Ministry of Science and Technology 2015CB856405 and National Foundation of Science of China 81420108002,81670087 (to F.H.); H. Lundbeck A/S, Lundbeck Foundation, Technical University of Denmark and Center for Healthy Aging, University of Copenhagen (to P.J. and H.B.D.S). Additional support was provided by the Klarman Family, Otto Mønsted, Stibo, Vera & Carl Johan Michaelsens, Knud Højgaards, Reinholdt W. Jorck and Hustrus and Augustinus Foundations (to A.N.O.). The National Sleep Research Resource was supported by the National Heart, Lung, and Blood Institute (R24 HL114473, 75N92019R002). The National Sleep Research Resource was supported by the U.S. National Institutes of Health, National Heart Lung and Blood Institute (R24 HL114473, 75N92019R002). Acknowledgements This work was supported by STI2030-Major Projects (2022ZD0212400, 2021ZD0200404) National Natural Science Foundation of China (82371453), Key R&D Program of Zhejiang(2024C03006,2024C04024, 2024ZY01010), Fundamental Research Funds for the Central Universities (2025ZFJH01-01) and the Construction Fund of Key Medical Disciplines of Hangzhou (2025HZGF10) Footnotes we have added two large datasets to validate sleep disorder diagnosis (n=773) and depression screening (n=1091). Besides, we have tested LPSGM on three additional public datasets: MESA (n=2,033) and MASS (SS1 and SS3, n=115). Reflecting this coverage of both neurological and psychiatric conditions, we have updated the manuscript title. We have undertaken a substantial revision to address all comments raised by the reviewers. References 1. ↵ Maquet , P . The role of sleep in learning and memory . Science 294 , 1048 – 1052 ( 2001 ). OpenUrl Abstract / FREE Full Text 2. ↵ Irwin , M. R . Why sleep is important for health: a psychoneuroimmunology perspective . Annu. Rev. Psychol . 66 , 143 – 172 ( 2015 ). OpenUrl CrossRef PubMed 3. ↵ Benjafield , A. V. et al. Estimation of the global prevalence and burden of obstructive sleep apnoea: a literature-based analysis . Lancet Respir. Med . 7 , 687 – 698 ( 2019 ). OpenUrl PubMed 4. ↵ Morin , C. M. & Jarrin , D. C . Epidemiology of insomnia: prevalence, course, risk factors, and public health burden . Sleep Med. Clin . 17 , 173 – 191 ( 2022 ). OpenUrl CrossRef PubMed 5. ↵ American Academy of Sleep Medicine . AASM Manual for the Scoring of Sleep and Associated Events: Rules, Terminology and Technical Specifications . 3rd edn ( American Academy of Sleep Medicine , Darien, IL , 2023 ). 6. ↵ Stephansen , J. B. et al. Neural network analysis of sleep stages enables efficient diagnosis of narcolepsy . Nat. Commun . 9 , 5229 ( 2018 ). OpenUrl CrossRef PubMed 7. ↵ Quinnell , T. G. & Smith , I. E . Narcolepsy, idiopathic hypersomnolence and related conditions . Clin. Med . 11 , 282 ( 2011 ). OpenUrl FREE Full Text 8. ↵ Lopez , J. et al. Reduced sleep spindle activity in early-onset and elevated risk for depression . J. Am. Acad. Child Adolesc. Psychiatry 49 , 934 – 943 ( 2010 ). OpenUrl CrossRef PubMed 9. Riemann , D. , Krone , L. B. , Wulff , K. & Nissen , C. Sleep, insomnia, and depression . Neuropsychopharmacol. 45 , 74 – 89 ( 2020 ). OpenUrl CrossRef PubMed 10. Kupfer , D. J . REM latency: a psychobiologic marker for primary depressive disease . Biol. Psychiatry 11 , 159 – 174 ( 1976 ). OpenUrl PubMed Web of Science 11. ↵ Kupfer , D. & Foster , F. G . Interval between onset of sleep and rapid-eye-movement sleep as an indicator of depression . Lancet 300 , 684 – 686 ( 1972 ). OpenUrl CrossRef 12. ↵ Rosenberg , R. S. & Van Hout , S . The American Academy of Sleep Medicine Inter-scorer Reliability Program: Sleep Stage Scoring . Journal of Clinical Sleep Medicine 09 , 81 – 87 ( 2013 ). OpenUrl 13. Zhang , X. et al. Process and outcome for international reliability in sleep scoring . Sleep Breath 19 , 191 – 195 ( 2015 ). OpenUrl PubMed 14. Danker-Hopfe , H. et al. Interrater reliability for sleep scoring according to the Rechtschaffen & Kales and the new AASM standard . J. Sleep Res . 18 , 74 – 84 ( 2009 ). OpenUrl CrossRef PubMed 15. MacLean , A. W. , Lue , F. & Moldofsky , H . The reliability of visual scoring of alpha EEG activity during sleep . Sleep 18 , 565 – 569 ( 1995 ). OpenUrl PubMed 16. ↵ Kim , Y. D. et al. Agreement of visual scoring of sleep stages among many laboratories in Japan: effect of a supplementary definition of slow wave on scoring of slow wave sleep . Psychiatry Clin. Neurosci . 47 , 91 – 97 ( 1993 ). OpenUrl 17. ↵ Hassan , A. R. & Bhuiyan , M. I. H . Automatic sleep stage classification . In 2015 2nd Int. Conf. on Electr. Inf. Commun. Tech. (EICT) 211 – 216 ( IEEE , 2015 ). 18. Rahman , M. A. et al. Optimization of sleep stage classification using single-channel EEG signals . In 2019 4th Int. Conf. on Electr. Inf. Commun. Tech. (EICT) 1 – 6 ( IEEE , 2019 ). 19. Hassan , A. R. & Bhuiyan , M. I. H . A decision support system for automatic sleep staging from EEG signals using tunable Q-factor wavelet transform and spectral features . J. Neurosci. Methods 271 , 107 – 118 ( 2016 ). OpenUrl CrossRef PubMed 20. Fraiwan , L. , Lweesy , K. , Khasawneh , N. , Wenz , H. & Dickhaus , H . Automated sleep stage identification system based on time–frequency analysis of a single EEG channel and random forest classifier . Comput. Meth. Programs Biomed . 108 , 10 – 19 ( 2012 ). OpenUrl 21. Şen , B. , Peker , M. , Çavuşoğlu , A. & Çelebi , F. V. A comparative study on classification of sleep stage based on EEG signals using feature selection and classification algorithms . J. Med. Syst . 38 , 1 – 21 ( 2014 ). OpenUrl CrossRef PubMed 22. Zhao , S. et al. Evaluation of a single-channel EEG-based sleep staging algorithm . Int. J. Environ. Res. Public Health 19 , 2845 ( 2022 ). OpenUrl PubMed 23. ↵ Touil , M. , Bahatti , L. & El Magri , A . Sleep’s depth detection using electroencephalogram signal processing and neural network classification . J. Med. Artif. Intell . 5 , 9 ( 2022 ). OpenUrl 24. ↵ Supratak , A. , Dong , H. , Wu , C. & Guo , Y . DeepSleepNet: a model for automatic sleep stage scoring based on raw single-channel EEG . IEEE Trans. Neural Syst. Rehabil. Eng . 25 , 1998 – 2008 ( 2017 ). OpenUrl CrossRef PubMed 25. ↵ Supratak , A. & Guo , Y . TinySleepNet: an efficient deep learning model for sleep stage scoring based on raw single-channel EEG . In 2020 42nd Annu. Int. Conf. IEEE Eng. Med. Biol. Soc. (EMBC) 641 – 644 ( IEEE , 2020 ). 26. Biswal , S. et al. Expert-level sleep scoring with deep neural networks . J. Am. Med. Inform. Assoc . 25 , 1643 – 1650 ( 2018 ). OpenUrl CrossRef PubMed 27. Phan , H. et al. XSleepNet: multi-view sequential model for automatic sleep staging . IEEE Trans. Pattern Anal. Mach. Intell . 44 , 5903 – 5915 ( 2022 ). OpenUrl CrossRef PubMed 28. ↵ Yoo , C. , Lee , H. W. & Kang , J.-W . Transferring structured knowledge in unsupervised domain adaptation of a sleep staging network . IEEE J. Biomed. Health Inform . 26 , 1273 – 1284 ( 2022 ). OpenUrl PubMed 29. ↵ Fan , J. et al. Unsupervised domain adaptation by statistics alignment for deep sleep staging networks . IEEE Trans. Neural Syst. Rehabil. Eng . 30 , 205 – 216 ( 2022 ). OpenUrl PubMed 30. ↵ Guillot , A. & Thorey , V . RobustSleepNet: transfer learning for automated sleep staging at scale . IEEE Trans. Neural Syst. Rehabil. Eng . 29 , 1441 – 1451 ( 2021 ). OpenUrl CrossRef PubMed 31. ↵ Deng , Z. et al. Adversarial training helps transfer learning via better representations . Adv. Neural Inf. Process. Syst . 34 , 25179 – 25191 ( 2021 ). OpenUrl 32. ↵ Wang , J. et al. Generalizable sleep staging via multi-level domain alignment . Proc. AAAI Conf. Artif. Intell . 38 , 265 – 273 ( 2024 ). OpenUrl 33. ↵ Perslev , M. , et al. U-Time: a fully convolutional network for time series segmentation applied to sleep staging . Adv. Neural Inf. Process. Syst. 32 , – ( 2019 ). 34. ↵ Eldele , E. et al. An attention-based deep learning approach for sleep stage classification with single-channel EEG . IEEE Trans. Neural Syst. Rehabil. Eng . 29 , 809 – 818 ( 2021 ). OpenUrl CrossRef PubMed 35. ↵ Zhang , G.-Q. et al. The National Sleep Research Resource: towards a sleep data commons . J. Am. Med. Inform. Assoc . 25 , 1351 – 1358 ( 2018 ). OpenUrl CrossRef PubMed 36. ↵ Quan , S. F. et al. The association between obstructive sleep apnea and neurocognitive performance—The Apnea Positive Pressure Long-term Efficacy Study (APPLES) . Sleep 34 , 303 – 314 ( 2011 ). OpenUrl PubMed Web of Science 37. ↵ Bakker , J. P. et al. Gastric banding surgery versus continuous positive airway pressure for obstructive sleep apnea: a randomized controlled trial . Am. J. Respir. Crit. Care Med . 197 , 1080 – 1083 ( 2018 ). OpenUrl CrossRef PubMed 38. ↵ Quan , S. F. et al. The sleep heart health study: design, rationale, and methods . Sleep 20 , 1077 – 1085 ( 1997 ). OpenUrl CrossRef PubMed Web of Science 39. ↵ Wang , R. et al. Pediatric Adenotonsillectomy Trial for Snoring (PATS): protocol for a randomised controlled trial to evaluate the effect of adenotonsillectomy in treating mild obstructive sleep-disordered breathing . BMJ Open 10 , e033889 ( 2020 ). OpenUrl Abstract / FREE Full Text 40. ↵ Redline , S. et al. Adenotonsillectomy for snoring and mild sleep apnea in children: a randomized clinical trial . JAMA 330 , 2084 – 2095 ( 2023 ). OpenUrl CrossRef PubMed 41. ↵ Marcus , C. L. et al. A randomized trial of adenotonsillectomy for childhood sleep apnea . N. Engl. J. Med . 368 , 2366 – 2376 ( 2013 ). OpenUrl CrossRef PubMed Web of Science 42. ↵ Rosen , C. L. et al. Prevalence and risk factors for sleep-disordered breathing in 8-to 11-year-old children: association with race and prematurity . J. Pediatr . 142 , 383 – 389 ( 2003 ). OpenUrl CrossRef PubMed Web of Science 43. ↵ Redline , S. et al. The familial aggregation of obstructive sleep apnea . Am. J. Respir. Crit. Care Med . 151 , 682 – 687 ( 1995 ).. OpenUrl CrossRef PubMed Web of Science 44. ↵ Lee , H. et al. A large collection of real-world pediatric sleep studies . Sci. Data 9 , 421 ( 2022 ). OpenUrl PubMed 45. ↵ Blackwell , T. et al. Associations between sleep architecture and sleep-disordered breathing and cognition in older community-dwelling men: the osteoporotic fractures in men sleep study . J. Am. Geriatr. Soc . 59 , 2217 – 2225 ( 2011 ). OpenUrl CrossRef PubMed 46. ↵ Goldberger , A. L. et al. PhysioBank, PhysioToolkit, and PhysioNet: components of a new research resource for complex physiologic signals . Circulation 101 , e215 – e220 ( 2000 ). OpenUrl Abstract / FREE Full Text 47. ↵ Alvarez-Estevez , D. & Rijsman , R. M. Haaglanden Medisch Centrum sleep staging database (version 1.0.1) . PhysioNet ( 2021 ). 48. ↵ Ghassemi , M. et al. You snooze, you win: the PhysioNet/Computing in Cardiology Challenge 2018 . In 2018 Comput. Cardiol. Conf. (CinC) 45 , 1 – 4 ( 2018 ). OpenUrl 49. ↵ Terzano , M. G. et al. Atlas, rules, and recording techniques for the scoring of cyclic alternating pattern (CAP) in human sleep . Sleep Med . 2 , 537 – 554 ( 2001 ). OpenUrl CrossRef PubMed Web of Science 50. ↵ Khalighi , S. et al. ISRUC-Sleep: a comprehensive public dataset for sleep researchers . Comput. Meth. Programs Biomed . 124 , 180 – 192 ( 2016 ). OpenUrl 51. ↵ Guillot , A. et al. Dreem open datasets: multi-scored sleep datasets to compare human and automated sleep staging . IEEE Trans. Neural Syst. Rehabil. Eng . 28 , 1955 – 1965 ( 2020 ). OpenUrl CrossRef PubMed 52. ↵ Rosen , C. L. et al. A multisite randomized trial of portable sleep studies and positive airway pressure autotitration versus laboratory-based polysomnography for the diagnosis and treatment of obstructive sleep apnea: the HomePAP study . Sleep 35 , 757 – 767 ( 2012 ). OpenUrl CrossRef PubMed 53. ↵ Achiam , J. , et al. GPT-4 technical report . Preprint at doi: 10.48550/arXiv.2303.08774 ( 2023 ). OpenUrl CrossRef 54. Lewis , M. et al. BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension . In Proc. 58th Annu. Meet. Assoc. Comput. Linguist . 7871 – 7880 ( ACL , 2020 ). 55. ↵ Burstein , J. , Doran , C. Solorio , T. Devlin , J. , Chang , M.-W. , Lee , K. & Toutanova , K. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding . In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) (eds Burstein , J. , Doran , C. & Solorio , T. ) 4171 – 4186 ( Association for Computational Linguistics, Minneapolis, Minnesota , 2019 ). 56. ↵ Wang , J. , Zhao , S. , Luo , Z. , Zhou , Y. , Jiang , H. , Li , S. , Li , T. & Pan , G . CBraMod: A Criss– Cross Brain Foundation Model for EEG Decoding . In Proceedings of the 13th International Conference on Learning Representations ( 2025 ). 57. Yi , K. et al. Learning topology-agnostic EEG representations with geometry-aware modeling . Adv. Neural Inf. Process. Syst . 36 ( 2024 ). 58. ↵ Jiang , W.-B. , Zhao , L.-M. & Lu , B.-L . Large brain model for learning generic representations with tremendous EEG data in BCI . In The Twelfth International Conference on Learning Representations ( 2024 ). 59. ↵ Selvaraju , R. R. et al. Grad-CAM: visual explanations from deep networks via gradient-based localization . In 2017 IEEE Int. Conf. Computer Vision (ICCV) 618 – 626 ( IEEE , 2017 ). 60. ↵ Esser-Skala , W. & Fortelny , N . Reliable interpretability of biology-inspired deep neural networks . NPJ Syst. Biol. Appl . 9 , 50 ( 2023 ). OpenUrl PubMed 61. Sapoval , N. et al. Current progress and open challenges for applying deep learning across the biosciences . Nat. Commun . 13 , 1728 ( 2022 ). OpenUrl CrossRef PubMed 62. ↵ Mamede , S. & Schmidt , H. G . Making large language models into reliable physician assistants . Nat. Med . 31 , 1071 – 1072 ( 2025 ). OpenUrl PubMed 63. ↵ Nasarian , E. , Alizadehsani , R. , Acharya , U. R. & Tsui , K.-L . Designing interpretable ML system to enhance trust in healthcare: a systematic review to proposed responsible clinician– AI–collaboration framework. Info . Fusion 108 , 102412 ( 2024 ). OpenUrl 64. ↵ Phan , H. et al. SleepTransformer: Automatic sleep staging with interpretability and uncertainty quantification . IEEE Trans. Biomed. Eng . 69 , 2456 – 2467 ( 2022 ). OpenUrl CrossRef PubMed 65. ↵ Alvaro , P. K. , Roberts , R. M. & Harris , J. K . A systematic review assessing bidirectionality between sleep disturbances, anxiety, and depression . Sleep 36 , 1059 – 1068 ( 2013 ) OpenUrl CrossRef PubMed Web of Science 66. Scott , A. J. , Webb , T. L. , Martyn-St James , M. , Rowse , G. & Weich , S . Improving sleep quality leads to better mental health: a meta-analysis of randomised controlled trials . Sleep Med. Rev . 60 , 101556 ( 2021 ). OpenUrl CrossRef PubMed 67. Tahmasian , M. et al. The interrelation of sleep and mental and physical health is anchored in grey-matter neuroanatomy and under genetic control . Commun. Biol. 3 , 171 ( 2020 ). OpenUrl PubMed 68. ↵ Meyer , N. , et al. The sleep-circadian interface: a window into mental disorders . Proc. Natl. Acad. Sci. U.S.A. 121 , e2214756121 ( 2024 ). OpenUrl CrossRef PubMed 69. ↵ Wei , Y. , Zhu , Y. , Zhou , Y. , Yu , X. & Luo , Y . Automatic sleep staging based on contextual scalograms and attention convolution neural network using single-channel EEG . IEEE J. Biomed. Health Inform . 28 , 801 – 811 ( 2024 ). OpenUrl PubMed 70. ↵ O’Reilly , C. , Gosselin , N. , Carrier , J. & Nielsen , T . Montreal Archive of Sleep Studies: an open-access resource for instrument benchmarking and exploratory research . J. Sleep Res . 23 , 628 – 635 ( 2014 ). OpenUrl CrossRef PubMed 71. ↵ Chen , X. et al. Racial/ethnic differences in sleep disturbances: the Multi-Ethnic Study of Atherosclerosis (MESA) . Sleep 38 , 877 – 888 ( 2015 ). OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted November 27, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A Unified Flexible Large Polysomnography Model for Sleep Staging and Brain Disorder Diagnosis Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A Unified Flexible Large Polysomnography Model for Sleep Staging and Brain Disorder Diagnosis Guifeng Deng , Mengfan Niu , Shuying Rao , Yuxi Luo , Jianjia Zhang , Junyi Xie , Zhenghe Yu , Wenjuan Liu , Junhang Zhang , Sha Zhao , Gang Pan , Xiaojing Li , Wei Deng , Wanjun Guo , Yaoyun Zhang , Tao Li , Haiteng Jiang medRxiv 2024.12.11.24318815; doi: https://doi.org/10.1101/2024.12.11.24318815 Share This Article: Copy Citation Tools A Unified Flexible Large Polysomnography Model for Sleep Staging and Brain Disorder Diagnosis Guifeng Deng , Mengfan Niu , Shuying Rao , Yuxi Luo , Jianjia Zhang , Junyi Xie , Zhenghe Yu , Wenjuan Liu , Junhang Zhang , Sha Zhao , Gang Pan , Xiaojing Li , Wei Deng , Wanjun Guo , Yaoyun Zhang , Tao Li , Haiteng Jiang medRxiv 2024.12.11.24318815; doi: https://doi.org/10.1101/2024.12.11.24318815 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (574) Allergy and Immunology (865) Anesthesia (304) Cardiovascular Medicine (4462) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (611) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15251) Forensic Medicine (31) Gastroenterology (1132) Genetic and Genomic Medicine (6621) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4564) Health Policy (1372) Health Systems and Quality Improvement (1617) Hematology (544) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15938) Intensive Care and Critical Care Medicine (1107) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6643) Nursing (346) Nutrition (1001) Obstetrics and Gynecology (1149) Occupational and Environmental Health (957) Oncology (3350) Ophthalmology (981) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1698) Pharmacology and Therapeutics (694) Primary Care Research (714) Psychiatry and Clinical Psychology (5465) Public and Global Health (9259) Radiology and Imaging (2212) Rehabilitation Medicine and Physical Therapy (1372) Respiratory Medicine (1199) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (533) Surgery (715) Toxicology (100) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a03dc098fe7b7885',t:'MTc4MDE0NDI0OQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00