Learning Residual-based Biomarkers of Cognitive Health via Self-Supervised Learning on EEG State Transitions

doi:10.1101/2025.09.21.677598

Learning Residual-based Biomarkers of Cognitive Health via Self-Supervised Learning on EEG State Transitions

2025 · doi:10.1101/2025.09.21.677598

preprint OA: closed CC-BY-4.0

📄 Open PDF Full text JSON View at publisher

Full text 99,210 characters · extracted from preprint-html · click to expand

Learning Residual-based Biomarkers of Cognitive Health via Self-Supervised Learning on EEG State Transitions | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Learning Residual-based Biomarkers of Cognitive Health via Self-Supervised Learning on EEG State Transitions View ORCID Profile Thomas Tveitstøl , View ORCID Profile Mats Tveter , View ORCID Profile Christoffer Hatlestad-Hall , View ORCID Profile Hugo L Hammer , View ORCID Profile Ira R J Hebold Haraldsen doi: https://doi.org/10.1101/2025.09.21.677598 Thomas Tveitstøl a Department of Neurology, Oslo University Hospital , Oslo, Norway b Institute of Clinical Medicine, Faculty of Medicine, University of Oslo , Oslo, Norway Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Thomas Tveitstøl Mats Tveter a Department of Neurology, Oslo University Hospital , Oslo, Norway b Institute of Clinical Medicine, Faculty of Medicine, University of Oslo , Oslo, Norway Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mats Tveter Christoffer Hatlestad-Hall a Department of Neurology, Oslo University Hospital , Oslo, Norway Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Christoffer Hatlestad-Hall Hugo L Hammer c Department of Computer Science, Oslo Metropolitan University , Oslo, Norway d Department of Holistic Systems , SimulaMet, Oslo, Norway Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hugo L Hammer Ira R J Hebold Haraldsen a Department of Neurology, Oslo University Hospital , Oslo, Norway Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ira R J Hebold Haraldsen Abstract Full Text Info/History Metrics Preview PDF Abstract Deep learning (DL) models have achieved impressive performance in EEG-based prediction tasks, but they often lack interpretability, limiting their clinical utility. In this study, we introduce a novel self-supervised learning (SSL) framework inspired by neurophysiological reactivity. Our approach models healthy EEG transitions between ocular states by predicting an EEG-derived feature under eyes-open conditions, using features from eyes-closed recordings. The residual between observed and predicted values quantifies deviations from normative brain dynamics and serves as a candidate biomarker. To improve clinical relevance, we propose two optimisation strategies that promote residuals predictive of pathology. We evaluated the framework using healthy cohorts (LEMON, Dortmund Vital) for self-supervised training, and the AI-Mind cohort for downstream prediction of plasma p-tau217 levels, a proxy for cognitive pathology. Despite extensive hyperparameter optimisation, predictive performance remained poor across all methods, including baseline models, suggesting limitations in the downstream proxy or input signal. Nonetheless, our approach provides a transparent methodology for transition-based EEG biomarker discovery grounded in self-supervised learning. 1. Introduction Artificial intelligence (AI) is expected to play an increasingly important role in healthcare and medicine [ 1 ]. The AI-Mind project aims to develop predictive tools for identifying individuals with mild cognitive impairment (MCI) who are at risk of progressing to dementia [ 2 ]. Electroencephalography (EEG) provides valuable information for such models, as it captures electrical brain activity at the synaptic level with high temporal resolution [ 3 ]. Furthermore, deep learning (DL), a subfield of AI that processes data through multiple non-linear layers, is of high interest due to its strong ability to learn complex patterns in a datadriven manner, also beyond clinically recognised features [ 4 ]. DL has demonstrated strong performance in pattern recognition, with significant advancements in natural language processing, computer vision, and speech recognition [ 5 , 6 , 7 , 8 ]. Moreover, applying DL to EEG data has shown promising results for predictive modelling [ 9 , 10 , 11 , 12 ]. However, from a neurophysiological perspective, factors beyond empirical risk minimisation are important. Although DL models can extract complex patterns from EEG, their outputs are often not biologically interpretable. Developing DL methods that yield interpretable biomarkers is therefore an interesting research direction. To address this, we propose a self-supervised learning (SSL) framework that models normative transitions between EEG ocular states - an approach inspired by reactivity , a well-established neurophysiological phenomenon related to arousal fluctuation, intrinsic attention allocation, and visual stimuli exposure [ 13 , 14 , 15 , 16 ]. Specifically, our framework learns normative relationships between resting-state EEG during eyes-closed and eyes-open conditions. Distinct from common SSL-based approaches, we model expectation values of neural activity rather than learning latent feature representations. Furthermore, our approach addresses additional aspects beyond empirical risk minimisation, as we estimate expected neural responses under healthy conditions and compute residuals. The utilisation of these residuals, which represent deviations from healthy expectation values, results in a more transparent and neurophysiologically grounded output. Therefore, our framework builds on the neurophysiological concept of reactivity, which refers to characteristic transitions in brain activity that occur in response to stimuli, such as when a subject opens or closes their eyes [ 17 ]. In SSL, the DL model is trained on a pretext task , using pseudo-targets which are generated from the data itself rather than human-collected targets [ 18 , 19 ]. An example from computer vision is the jigsawpuzzle, where the model learns to identify the relative positioning of shuffled image tiles [ 20 ]. Several SSL techniques have been explored for EEG analysis. These include signal transformation recognition, where the pretext task requires to identify the transformation applied to the signal [ 21 , 22 , 23 , 24 , 25 ]. Masked graph autoencoders have been investigated with channel masking [ 26 , 27 ] and frequency masking [ 26 ]. Furthermore, contrastive learning has been used with various data augmentation techniques [ 28 , 29 , 24 , 30 , 31 , 10 ]. Masked temporal and spatial recognition were employed in [ 32 ], where Gaussian noise was added to an event related potential (ERP) component or a brain region as part of the pretext tasks. In [ 33 ], the model was trained to determine whether the EEG windows had been rearranged, while in [ 34 ], the pretext task involved predicting the next EEG slice. While many SSL-based approaches aim to produce general-purpose feature representations, some studies have tailored their pretext tasks to specific downstream applications. For instance, tailoring the pretext task to the downstream task was made on sleep EEG [ 31 ], as the design of the pretext task required an understanding of the sleep cycle. In a particularly comprehensive evaluation, demonstrated the effectiveness of SSL in sleep staging in the few-labels regime. Furthermore, a review on deep representation learning for brain-computer interfaces (BCI) identified the main motivations for its use, including performance improvement, learning robustness to noise, learning invariances, learning in small-data regimes, integrating heterogeneous components, and uncovering underlying structures in EEG data [ 36 ]. While current SSL methodology has proven effective for its purposes, we argue that there are additional avenues to explore for their application to EEG. The key distinction of our framework compared to previous works, is that, rather than learning latent vector representations or general-purpose embedding vectors, we aim to model expectation values relevant to cognitive health, and use the residuals as biomarkers. The key contributions of this study are: We propose a novel SSL-based framework that learns normative EEG response patterns across ocular states and uses the deviation (residual) from these learned expectations as a potential biomarker. We introduce two optimisation strategies which explicitly promotes residuals that carry relevant information rather than strict empirical risk minimisation on the pretext task. We highlight that these strategies are not bounded by our application, but can readily be applied to any residual-based biomarker, such as the brain age gap. We perform transparent and well-documented testing of our designed methods through comprehensive hyperparameter optimisation. 2. Methods This section is structured as follows: First, we introduce a generalised SSL-based framework for learning cross-ocular state expectation values on EEG data. Second, we propose two methods for deriving maximally relevant residuals by (1) supervised hyperparameter optimisation (HPO) and (2) semi-supervised multi-task learning (MTL). Third, we describe the specific pretext tasks used in our experiments, and outline additional experimental details such as datasets, hyperparameters (HPs) and HP distributions (HPDs), and baselines. All code is publicly available at https://github.com/thomastveitstol/SSL-Features-EEG . 2.1 Learning normative cross-ocular EEG relationships via SSL Our framework is conceptually simple: predict a chosen scalar feature of the EEG in one ocular state using selected EEG features from the other ocular state. This serves as the pretext task. Then, this residual is used for predictive modelling on a downstream task of interest. Figure 1 illustrates this framework. Download figure Open in new tab Figure 1. Overview of the proposed framework. In the pretext task, features are extracted from the EEG in the ocular state O 1 , and used to predict a scalar feature/pseudo target, y p , calculated from the EEG in the ocular state O 2 . In the downstream task, the residual ϵ , which represents a deviation from the expectation value, is used for predictive modelling. While our figure illustrates a linear regression model, this approach can be extended to other machine learning models. The first design choice in our framework is to select which ocular state should serve as input to the DL 2 model, O 1 ∈ {EC, EO }, and which ocular state the DL model will aim to predict a feature of, O 2 ∈ {EC, EO} 3 . To clarify, the EEG features from O 1 are used to predict a scalar EEG feature derived from O 2 . The EEG features, , may focus on specific parts of the signals, such as a given frequency band through band-pass filtering, regions of interest through channel selection, the covariance matrix, or the EEG itself. In parallel, a scalar feature of choice is computed from the EEG during O 2 , denoted y p ∈ ℝ. This scalar feature will serve as the pseudo-target of the pretext task. The core idea is that by designing and training a model to predict y p from , with , intuitive expectation values can be learned. Finally, the residual between observed and predicted value, ϵ = y p − ŷ p , represents the deviation from expectation, which is hypothesised to reflect neurophysiologically meaningful deviations from normative brain responses and can be used as an interpretable biomarker for pathology prediction. The isolated objective of the pretext task is to solve the following optimisation problem: where θ is the model parameters. The selection of the features and pseudo target y p determines the interpretation of the learned expectation value. For example, if the feature extraction involves computing the beta-filtered signal from frontal channels during the EO state, and the pseudo target is the correlation between the alpha-filtered channels Pz and C3 during the EC state, the expectation value represents: “Given EEG data from betafiltered frontal channels during eyes open, what is the expected alpha correlation between Pz and C3 during eyes closed?”. 2.2 Optimising the predictive value of the residuals for pathological decoding The main principle of our framework is to generate residuals that are both interpretable and useful for predictive modelling in a selected downstream task. This makes the modelling of f (·) and optimisation of θ in Eqs. 1 and 2 less straightforward, as in practice, the aim is not to minimise the residuals but to ensure that they carry relevant information. Since maximising performance on the pretext task does not necessarily increase the clinical utility of the residuals, we propose two methods for explicitly learning such meaningful residuals: Supervised HPO, where the signal to the HPO algorithm is the predictive value of the residuals. This is quantified by how well a machine learning (ML) model can predict the downstream target using the residual. While we employed linear regression in our experiments and provide further details on its implementation in this section, this approach can be extended to other models. Semi-supervised MTL where the first task is to predict the pseudo-target and the second task is to predict the downstream target from the residual. The key difference between optimisation method 1 and 2 is that method 1 optimises the HPs to yield predictive residuals, while method 2 can also leverage gradient-based multi-task/multi-objective optimisation algorithms. 2.2.1. Supervised HPO for maximising the predictive value of the residual In supervised HPO, Eq. 2 is directly optimised for each trial during HPO, but model selection and the signal to the HPO algorithm is based on the predictive value of the residual. That is, we consider a predictive model f (; h, θ ), where h ∈ ℋ denotes an HPC, and ℋ= H 1 × · · · × H n represents the HP configuration space. The HPO algorithm aims to find the optimal HPC h ∗ by solving Here, , and are the regression coefficients fitted by ordinary least squares after training on the pretext task ( Eq. 2 ). The term containing the pseudo-target is included to account for any predictive signal in y p itself. Without this adjustment, the HPO process might favour HPCs that produce constant predictions for ŷ p , causing the residual ϵ to trivially replicate the pseudo-target. For HPO algorithms that iteratively refine HPCs based on prior performance, such as tree-structured parzen estimator (TPE) [ 37 ], supervised HPO can be implemented by passing linear regression performance scores such as MSE, MAE , or R 2 to the HPO algorithm. 2.2.2. Semi-supervised MTL for maximising the predictive value of the residual A limitation of the optimisation method introduced in Sec. 2.2.1 is that it is restricted to only fitting the HPs to find residuals of high predictive value for the downstream task. An alternative method is to formulate our problem as a multi-task learning/multi-objective optimisation problem, where the first task is to solve the pretext task ℒ p ( Eq. 2 ), and the second task is to minimise the error on the downstream task ℒ d . Gradient-based algorithms from MTL and multi-objective optimisation may be applied to jointly optimise these two tasks simultaneously. To impose our framework of making downstream predictions from learned residuals, the two objectives are formulated as By jointly optimising these objectives, we are explicitly trying to learn the transition trends which are maximally relevant for pathology decoding. We implemented five gradient based methods for MTL, which were (1) equal weighting of the loss functions, (2) GradNorm [ 38 ], (3) multiple-gradient descent algorithm (MGDA) [ 39 , 40 ], (4) PCGrad [ 41 ], and (5) uncertainty weighting [ 42 ]. Below, these methods are briefly explained. Equal weighting The simplest multi-task learning strategy is to weigh the losses of the tasks equally. This was implemented by using the sum of the pretext and downstream loss, ℒ tot = ℒ p + ℒ d , and effectively treat the problem as a single-task optimisation problem. GradNorm [ 38 ] GradNorm applies gradient normalisation by dynamically altering gradient magnitudes. It was motivated by task imbalances and task dominance manifesting as imbalances in the magnitude of the backpropagated gradients. It uses adaptive weighted summation on the form L tot = ∑ i w i L i , where w i changes per training step, and the correct balance is obtained when the tasks are trained at similar rates [ 38 ]. As proposed in the original work, GradNorm was only applied on the final shared layer. MGDA [ 39 , 40 ] MGDA is a gradient based and multi-objective optimisation based algorithm which aims to find a Pareto optimal solution. In MGDA, gradients are first applied on task specific parameters, followed by finding a direction of common descent, and applying this solution to the shared parameters. As the number of tasks in our study was equal to two, ℒ p and ℒ d , we did not rely on the application of the Frank-Wolfe algorithm, and directly applied Algorithm 1 in [ 40 ] instead. PCGrad [ 41 ] PCGrad is an algorithm designed to mitigate conflicting gradients across tasks by preventing interfering components to be applied to the gradient-based parameter update. When two gradients are in conflict, a gradient surgery step is applied by projecting the gradients of each task onto the normal plane of the other. Uncertainty weighting [ 42 ] The uncertainty weighting approach developed in [ 42 ] weights the loss functions by using homoscedastic uncertainty. By formulating the problem as maximum likelihood estimation, it introduces a learnable noise parameter per task to adaptively learn the relative weighting of each loss based on task uncertainty. 2.3. Experiments 2.3.1. Pretext tasks and downstream task In this study, we considered pretext tasks aimed at learning the expected log band power during the eyes-open condition, given pre-processed EEG data from the eyes-closed condition. The frequency range used to compute band power was treated as a categorical HP, with the following options: δ (1–4 Hz), θ (4–7 Hz), α (7–12 Hz), β (12–30 Hz), and γ (30–45 Hz). We used the downstream target as the log-transformed concentration of tau phosphorylated at threonine 217 (p-tau217), which was selected as a surrogate target variable due to its established association with cognitive decline and dementia development [ 43 ]. 2.3.2. Data For training on the pretext tasks, we used resting-state EEG data from the LEMON dataset [ 44 ] and the Dortmund Vital study [ 45 , 46 ]. For the downstream task, we used resting-state EEG data and p-tau217 levels from the AI-Mind study [ 2 ]. We used three epochs for each participant, with varying input length (see Sec. 2.3.5). Below, we provide a brief overview of these datasets; full documentation can be found in the original works. Note also that the choice of using LEMON, Dortmund Vital, or both, for training on the pretext task was not predetermined. Instead, dataset selection for the pretext task was treated as an HP to optimise. LEMON We used the pre-processed version of the LEMON dataset [ 44 ], which contains EEG data from N = 203 healthy participants. Data collection took place in Leipzig, Germany, using a 62-channel active ActiCAP electrode system. Since the eye electrode was not used in this study, we retained 61 scalp electrodes, following the extended international 10-20 system. Missing channels were interpolated using the “MNE” interpolation method in MNE-Python [ 47 ]. Dortmund Vital The dataset from the Dortmund Vital study [ 45 , 46 , 48 ] contains EEG data from N = 608 participants. However, as one participant did not fulfill the required three epochs after applying autoreject, the sample size of Dortmund Vital used in this study was N = 607. Participants met exclusion criteria including (1) neurological diseases, (2) cardiovascular, oncological, and eye diseases (3) psychiatric and affective disorders, (4) head injuries, head surgery, and head implants, (5) use of psychotropic drugs and neuroleptics, and (6) limited physical fitness and mobility. Furthermore, their study population was representative in terms of age distribution, genetics, cognitive performance parameters, and occupation, though it differed from the general German population in gender distribution and educational qualifications. EEG recordings were performed using a 64-channel elastic cap (10-20 system), a BrainVision Brainamp DC amplifier, and BrainVision Recorder software (BrainProducts GmbH). For our study, we used EEG data recorded prior to the two-hour block of cognitive experimental tasks. AI-Mind We used data from the AI-development set, which is a subset of the AI-Mind dataset [ 2 ]. This datasets contains participants with mild cognitive impairment, collected at Oslo University Hospital (Oslo, Norway), Helsinki University Hospital (Helsinki, Finland), Universidad Complutense de Madrid (Madrid, Spain), Catholic University of the Sacred Heart (Rome, Italy), and IRCCS San Raffaele (Rome, Italy). The EEG data used for this study was restricted to the first visit of the AI-Mind study. Furthermore, the p-tau 217 levels were also collected at the first visit, at the MCI stage. After excluding participants for whom a p-tau 217 measurement was unavailable, and restricting the dataset to only include participants with at least one available EEG recording from the eyes open condition and at least one available recording from the eyes closed condition, the sample sizes were Norway ( N NO = 152), Finland ( N FI = 66), Spain ( N SP = 158), and Italy ( N IT = 110), giving a total of N = 486 participants. It is important to note that this dataset does not represent the full AI-Mind study data, as a test hold-out set remains unavailable to AI research and development for a final unbiased evaluation of the AI-Mind tools. The EEG was recorded using 126 scalp electrodes prewired in an elastic cap (ANT neuro WaveguardTM), with electrode positioning following the 10-5 system derived from the extended international 10-20 system. Ethical approval for the AI-Mind study was granted to each data controller according to national guidelines. Please see [ 2 ] for details. 2.2.3. Pre-processing Input data Our EEG pre-processing pipeline is shown in Figure 2 . All pre-processing steps were implemented using tools from MNE-Python [ 47 ] and autoreject [ 49 ]. We employed HPO of certain steps of the pre-processing pipeline, resulting in multiple pre-processed versions of the input data. The first step of the pipeline was to remove the first 30 seconds of the data, as artefacts are more common in the start of a recording session. The data was band-pass filtered between (1 − 45)Hz, and after visual inspection of the power spectra, a notch filter at 50Hz was applied to the Dortmund Vital and AI-Mind datasets. Subsequently, the EEG data was epoched into four different copies, with duration d ∈ {5 s , 10 s , 20 s , 30 s } . For each copy, autoreject [ 49 ] was applied for channel repair and epoch removal. We used three epochs per participant, and participants which did not satisfy this criteria after autoreject was applied, was removed from the dataset. Interpolation/channel removal to the Standard 10-20 with 19, 32, 64, and 126 channels was employed 4 . Furthermore, we used both spherical spline and biophysical interpolation. The EEG data was re-sampled to 90Hz, 135Hz, and 180Hz. Finally, average referencing was applied. Download figure Open in new tab Figure 2. Pre-processing pipeline for creating multiple versions of the EEG data to be used as input to the DL models. The rectangles in blue indicates that the pre-processing step contains hyperparameters which are part of the hyperparameter optimisation. For these hyperparameters, we used the configuration space composed of input durations d ∈ {5 s , 10 s , 20 s , 30 s }, channel systems of the extended standard 10-20 system with number of channels in c ∈ {19, 32, 64, 126 }, m spline, biophysical, and f ∈ {90Hz, 135Hz, 180Hz }. Note, however, that neither channel system nor interpolation method were relevant if region-based pooling was used. Pseudo-targets The first steps of the pipeline for computing band power as pseudo-targets included data pre-processing. This was made by (1) cropping out the first 30 seconds and the last 15 seconds of data, (2) band-pass filtering between 1 and 45Hz, (3) applying a notch filter at 50Hz (4) re-sampling to 180Hz (5) epoching the data to 5-second non-overlapping segments (6) applying autoreject, and (7) applying average referencing. Subsequently, PSDs were computed per channel using the Welch method and averaged across epochs. Band-power was calculated by Simpson integration per channel and frequency band ω ∈ { δ, θ, α, β, γ }with δ = (1 − 4)Hz, θ = (4 − 7)Hz, α = (7 − 12)Hz, β = (12 − 30)Hz, and γ = (30 − 45)Hz . To avoid a high potential impact of bad channels, we used the median band power across channels. The bandpower values were multiplied by a factor of 10 12 , and transformed by a log-10 transformation. Finally, the pseudo-targets were z-normalised to obtain zero mean and unit standard deviation on the pretext training data. Downstream targets For the semi-supervised MTL-based optimisation (as well as baseline DL models, see Sec. 2.4), the downstream targets were z-normalised to obtain zero mean and unit standard deviation on the downstream training data. An inverse scaling was applied to the model outputs prior to computing metrics. 2.3.4. Data splits and model selection Supervised HPO For training on the pretext tasks, we performed a single random split, reserving 20% of the subjects for validation. The epoch with the highest performance (as measured by R 2 on the validation set) was selected, and its corresponding model state was used for the HPC evaluation. To evaluate a single HPC, we first partitioned the subjects from the downstream task into two sets, ( S HPCE , S test ), with 70% allocated to S HPCE and 30% to S test . This partitioning remained fixed for all trials. To provide feedback to the HPO algorithm, we performed 50 random 80/20 splits of S HP CE . For each split, a linear regression model was trained using ordinary least squares (OLS) on 80% of the data and evaluated on the remaining 20%. The median R 2 validation score across these splits was used as the signal to the HPO algorithm. S test remained separated from the HPO process, ensuring an unbiased final evaluation of our methods. Semi-supervised MTL To obtain unbiased and comparable results with the supervised HPO strategy, we isolated the same test set as for supervised HPO. For each trial, model training and model state selection was performed by a single random split, reserving 20% of the subjects for validation. All epochs that were Pareto optimal (as measured by R 2 on the validation set for both tasks) were stored, and the model state which had the highest smallest R 2 score on the two tasks was used for the HPC evaluation. 2.3.5. HP distributions Figure 3 shows a shallow overview of the HPs and the corresponding HPDs used in our experiments. See Appendix A.1 for a complete overview. One guideline for selecting HPDs was to use fairly broad configuration spaces, which was motivated by the effective HP dimensionality often being considerably smaller than the number of HPs, and random search being robust in such scenarios [ 50 ]. A second guiding principle was to avoid configurations which vastly increased the runtime. While the majority of HPDs were pre-determined, some HPDs were adjusted due to high runtime and memory considerations. For this purpose, different HPDs were explored through trial and error on (1) age prediction using LEMON as the pretext dataset and Dortmund Vital for the downstream task, and (2) a fake downstream target which only contained noise, using LEMON and Dortmund Vital as pretext datasets and AI-Mind as the downstream dataset. All HPO was made automatic, with no opportunity for the researchers to make post-hoc adjustments to further refine the HPDs and initiate new experiments. Download figure Open in new tab Figure 3. A shallow overview of the HPs and corresponding distributions. HPs marked with * contain additional HPs which are not shown in this figure. HPs marked with ** are conditional HPs. A complete overview is documented in Appendix A.1. 2.4. Experimental setups and baselines This work was performed on the TSD (Tjenester for Sensitive Data) facilities, owned by the University of Oslo, operated and developed by the TSD service group at the University of Oslo IT-Department (UiO IT). We used the High Performance Computing resources of Colossus, the high performance computing cluster provided as part of TSD by UiO IT. HPO was carried out using Optuna [ 51 ], setting both the ‘group’ and ‘multivariate’ arguments to True. All DL models were trained for 500 epochs, employing early stopping with a patience set to 50. For all models not employing semi-supervised MTL, patience refers to the number of consecutive epochs without improvement in the validation set’s R 2 score before early termination is triggered. For models employing semi-supervised MTL, patience refers to the number of consecutive epochs without finding a new Pareto-optimal solution, as evaluated by the R 2 score on the pretext validation set and the R 2 score on the downstream validation set. 2.4.1. Linear regression with band-power features The first baseline was to employ a linear regression model to predict the target variables from band-power. The features were the same as the pseudo-targets, log band-power of { δ, θ, α, β, γ }frequency bands from the eyes open condition. 2.4.2. Downstream training with DL We used DL models to predict the downstream target without any form of pre-training, using the same HPDs as described in Sec. 2.3.5. Furthermore, we added ocular state as an HP, such that the input data could either be from eyes open or eyes closed. 500 random startup trials and 500 trials employing TPE was employed. 2.4.3. Pretraining and fine-tuning Although the primary aim of this study involved using the deviations from healthy expectation values as features for predictive modelling, we also investigated fine-tuning the models on the downstream task. The same HPDs as detailed in Sec. 2.3.5 were used. The loss function and the HPs of the Adam optimiser were considered separate for the pretext and downstream task, that is, their values were sampled independently. We used 500 random startup trials and 500 trials employing TPE. 2.4.4. Supervised HPO: Simple linear regression with residuals Here, we used the derived framework to learn expectation values on healthy EEG data, and using the residuals for predictive modelling on the downstream task. Only a single residual was used, and the frequency band that was used in generating the pseudo-target was considered an HP. We re-used the 500 random trials from Sec. 2.4.3 and continued optimising the HPs with 500 TPE-based trials. 2.4.5. Supervised HPO: Multivariable linear regression with residuals As an attempt to combine the learned residuals from Sec. 2.4.4 into a multivariable linear model, we combined the residuals that resulted from training with the different log band-power features as pseudo-targets. Here, one residual per frequency band was used for all trials. We added trials by (1) combining the highest performing HPCs (as evaluated by R 2 on the downstream task) per frequency band, (2) 2500 trials following a sampling strategy which selects one random experiment per frequency band, and (3) 2500 trials following a sampling strategy which performs weighted sampling per frequency band. In (3), the weights were computed as softmax probabilities , where p i is the probability that the model of experiment i is selected, s i is the validation R 2 score on the downstream task, and T ∈ ℝ is a temperature scaling which is selected such that the top 25% is selected 75% of the times. 2.4.6. MTL for learning residuals We used HPO by 500 random trials followed by 500 trials employing multi-objective TPE [ 37 , 51 , 52 ]. 3. Results 3.1. Downstream predictive performance Figure 4 shows the performance scores for all trials for the six different experiment-types from Sec. 2.4, including R 2 and Pearson’s r. See Fig. A.6 for explained variance regression score, Spearman’s rho, and concordance correlation coefficient. After model selection, the results show negative R 2 scores and weak correlations. Specifically, the performances scores were: linear regression with band-power features ( R 2 = − 0.02, Pearson’s r =0.18), downstream DL training ( R 2 = − 0.25, Pearson’s r = 0.14), pretraining and fine-tuning ( R 2 = − 0.29, Pearson’s r = 0.05), simple supervised HPO ( R 2 = − 0.10, Pearson’s r = 0.07), multivariable supervised HPO ( R 2 = − 0.05, Pearson’s r = 0.23), and semi-supervised MTL ( R 2 = − 0.24, Pearson’s r = 0.17). Additionally, even the highest performing models on the test set (which are clearly biased performance estimates) did not reach strong performance on the downstream task, suggesting that variance during model selection is not the limiting factor. In sum, neither our framework nor the baselines were able to reach high predictive performance for p-tau217 levels. Download figure Open in new tab Figure 4. Test scores for all experiments and trials. The star indicates the test scores after model selection. Notably, all selected models obtained poor results, as indicated by negative R 2 scores and weak correlations. See Fig. A.6 for similar plots with explained variance ratio score, Spearman’s rho, and concordance correlation coefficient 3.2. Estimated Pareto front Figure 5 shows the performance on the downstream task plotted against performance on the pretext task, for multiple regression metrics. The majority of solutions that were Pareto-optimal across all epochs and trials were obtained using band power from the β band. However, as the downstream performance scores were poor, this appears to result from high predictive values on the pretext task. Download figure Open in new tab Figure 5. Performance scores on the downstream test set and pretext validation set for different metrics. Here, all Pareto optimal scores (as computed by R 2 score on the downstream and pretext validation sets) per trial is plotted. The stars indicate that the solution was Pareto optimal as estimated on the validation sets across all trials, hence representing the estimated Pareto front during model selection. The downstream performance scores of the estimated Pareto front were poor, as indicated by low R 2 scores and weak correlations. In contrast, the performance scores on the pretext task was substantially better, achieving high R 2 scores and correlations. The majority of the estimated Pareto optimal solutions used the β power as pseudo-target. Performance scores below selected thresholds were truncated ( R 2 < − 1.0 and Pearson’s r < − 0.2). See Fig. A.7 for similar plots with explained variance ratio score, Spearman’s rho, and concordance correlation coefficient. The performance scores on the pretext task were high, ranging from to and Pearson’s r min = 0.81 to Pearson’s r max = 0.97 for the estimated Pareto-optimal solutions. On the downstream task, these models obtained performances scores ranging from to and from Pearson’s r min = 0.01 to Pearson’s r max = 0.23. Therefore, high performance scores on the pretext task did not translate into high downstream performance scores. This suggests that the model was able to capture healthy patterns of electrophysiological reactivity, but that deviations from these healthy patterns were not predictive of neurodegeneration. 4. Discussion This work designed and tested a novel SSL-based framework which learns healthy ocular state transitioning patterns. Furthermore, it used two optimisation strategies which explicitly aim to find the healthy transitioning trends which also breaks for pathological data, to promote residuals of high predictive value in pathology decoding. The framework is conceptually simple - computing deviations from learned expectation values. This may increase its impact for biomarker discovery, and be a step towards more neurophysiologically grounded DL modelling. A connection to the pathology is indeed of high importance for developing biomarkers [ 53 , 54 ]. We hypothesised that the main advantage of our framework over traditional DL-based predictive modelling was its potential biological insights, but unlikely to reach traditional DL performance scores. From a technical perspective, imposing our framework represents a major constraint for empirical risk minimisation, hence, the baselines which applied DL models directly on the downstream task were unlikely to be surpassed, but in practice serve as an upper bound instead. Additionally, we hypothesised that an advantage to traditionally computed reactivity was that the state transitions could be learned in a data-driven manner, potentially discovering reactivity patterns that are currently unknown and recognised. Indeed, alpha reactivity stands out as the most documented reactivity pattern for dementia-related cognitive health [ 14 , 13 , 55 ]. However, to maintain high exploration in our experiments, we did not explicitly filter out the alpha signal, and we also considered other frequency band-powers as pseudo-targets. Instead, we allowed the DL models to perform such feature extraction through data-driven learning. Additionally, results from computer vision has shown that the first layer of deep neural networks occasionally learns features similar to Gabor filters [ 56 ], and such filters are also explicitly modelled by the GREEN architecture [ 57 ]. Our work is distinct from traditional SSL methods because our framework aims to learn patterns of healthy transitioning patterns and use the deviation from expectation value as a biomarker, rather than learning latent vector representations. Common pretext tasks for SSL applied to data EEG data, however, typically focus on learning representations through contrastive or predictive objectives, such as masking, shuffling, time- and frequency contrasting, and signal transformation recognition [31, 30, 26, 27, 28, 29, 24, 21, 22, 23, 25]. While effective for downstream classification [ 35 ], these approaches do not explicitly model normative dynamics nor capture deviations that may signal pathology. In contrast, our method leverages the actual-to-predicted residual as a measure of abnormality, offering interpretable markers that are better tied to physiological state changes. Adapting DL techniques for EEG biomarker discovery is an interesting area of research. A notable example is the GREEN architecture, which incorporates learnable Gabor wavelets and Riemannian geometry to embed inductive biases directly into the network design [ 57 ]. A key distinction between GREEN and our approach lies in how and where inductive bias is introduced: GREEN modifies the model architecture itself, whereas our framework embeds neurophysiological priors through reactivity-inspired pretext tasks during the self-supervised training phase and enforces residual-based biomarkers. Importantly, as demonstrated in this study, architectural innovations and novel training-phase strategies are not mutually exclusive and may be combined to enhance both biomarker interpretability and predictive performance. Despite thorough HPO, our developed framework did not achieve high predictive performance for p-tau217 levels. However, by using the same HPDs (and including ocular state as an HP) and employing downstream training only, these baselines obtained similar poor results. This suggests that the lack of predictive performance may not be due to the framework itself, but from limitations in the data. Specifically, resting-state EEG may not encode signal components that are reliably associated with the pathological processes approximated by p-tau217 levels at the MCI stage. Furthermore, the use of p-tau217 as a surrogate marker introduces additional complexity, as it is not a direct neural signal but rather a downstream biomolecular correlate of neurodegeneration. 4.1. Connections to reactivity Reduced reactivity in posterior alpha rhythms during the transition from eyes-closed to eyes-open conditions has been linked to cognitive pathology. For instance, reduced posterior alpha reactivity has been associated with Parkinson’s disease dementia [ 14 ], and similar reductions have been observed in patients with dementia due to Lewy body and Alzheimer’s disease, compared to healthy controls [ 13 , 55 ]. Notably, alpha reactivity has also demonstrated predictive value for amyloid-beta positivity beyond established clinical measures [ 58 ], and some evidence suggests that EEG reactivity is preserved in healthy ageing [ 59 ]. In addition, alpha reactivity appears to be sensitive to vestibular dysfunction [ 60 ], and reduced alpha reactivity has been linked to compromised integrity of the cholinergic system [ 61 , 62 ]. These findings highlight the relevance of alpha reactivity as a biomarker, but also reveal a reliance on hand-crafted EEG features. In contrast, our framework offers a data-driven approach for learning transition patterns between brain states, such as the eyes-closed to eyes-open condition. By using a self-supervised objective that models normative reactivity and captures deviations as residuals, our method offers a flexible way to discover latent, potentially unknown patterns that may correspond to underlying pathology. This is further enhanced by optimisation strategies that explicitly encourage the discovery of transition patterns that are both representative of healthy neural functioning and sensitive to pathological deviations. This approach complements existing physiologically informed biomarkers and opens a path toward improved data-driven biomarker discovery. 4.2. MTL for learning relevant residuals MTL is traditionally motivated by goals such as improving predictive performance, enhancing generalisation, learning from partially annotated data, and reducing computational cost [63, 64, 65, 66, 67, 68, 69]. The core idea is that jointly solving multiple tasks using a shared encoder can lead to better representations than training separate models for each task. While our objective differs from these typical motivations, algorithms from the MTL literature are readily applicable. Specifically, we employed MTL not to jointly solve multiple downstream tasks, but to guide the model toward producing interpretable biomarkers that are also predictive of pathological outcomes. Rather than enhancing task performance directly, the goal is to shape residuals in a way that captures clinically relevant deviations from normative brain dynamics. The idea behind formulating our problem using both a pretext and a downstream objective (and applying appropriate gradient-based algorithms from MTL literature) may intuitively be understood through a thought experiment, using alpha power as an example pseudo-target. Imagine that there are multiple features derived from eyes-closed recordings which may be used to predict the alpha power during eyes open, including, e.g., alpha power, theta/delta power ratio, beta/gamma cross-frequency correlations. Among all these possible transitioning patterns, one is not necessarily interested in the feature(s) which obtains the smallest prediction error on this pretext task, but rather to look for the transitioning patterns which are sensitive to abnormalities due to the presence of pathology. Using machine learning with hand-crafted features, this may be approached by domain knowledge and hypothesis-driven research, where the features are specified by the researcher. However, as one has limited control over which features are learned by DL models, imposing such feature-restrictive inductive biases is often not feasible. We therefore argue that for DL models, it is particularly crucial that the optimisation strategies are aligned with our actual goal. The two-task formulation makes the problem fundamentally more challenging, as we now have two optimisation problems without any clear way of comparing different Pareto optimal solutions. However, we argue that solely solving the pretext task is an incomplete optimisation strategy 5 , and that the added complexity represents a more realistic reflection of biomarker discovery, where both biological groundedness and predictive performance must be balanced. Importantly, this approach is not limited to the reactivity- and SSL-based framework developed in this study. The underlying principle, using MTL to increase the predictive value of residuals, can be extended to any residual-based biomarker. Its most straightforward application is to formulate age prediction as a multi-objective optimisation problem, in which the model simultaneously minimises age prediction error and maximises the association between the age gap and a target pathology. Following this formulation (as mathematically expressed in Eqs. 6 and 7), MTL algorithms may help uncover age-related trends that are particularly sensitive to disease processes. This broader perspective suggests that MTL can be a powerful tool for enhancing residual-based biomarkers across diverse domains of EEG and biomedical analysis. 4.3. Future work Our framework may be generalised beyond ocular reactivity. As reactivity has been explored for different purposes than dementia research, it would be interesting to test our methods on such data. An interesting domain of application may be consciousness impairment, where preserved reactivity has been linked to an increased odds of survival [ 15 ]. In consciousness impairment, other stimuli such as auditory and nociceptive stimuli has been used [ 15 ], which impose different requirements for the datasets used in the pretext task. Additionally, reactivity has shown high predictive value of neurological outcome after cardiac arrest [ 16 ], and would therefore be of high interest for further investigation of our approach. We highlight that, although our framework was formulated in the cross-ocular state setting, the extension to other EEG recording paradigms is trivial. By re-defining O 1 as the ‘relaxed state’ and O 2 as the ‘stimulated state’, such as O 2 being the state in which auditory stimuli was applied, our methods are still valid and applicable. Although our experiments only implemented DL-based models for the SSL-based framework, this is not a requirement for using our proposed methods. ML techniques based on hand-crafted feature extraction are indeed compatible with our framework, and may in particular be beneficial for explainability, further enhancing the derived neurophysiological insight. Alternatively, inspecting explainable DL models such as GREEN [ 57 ] could be of high interest. In both our framework and our experiments, the pseudo-targets were pre-selected. It is conceivable, however, that the pseudo-target may benefit from being learned as part of the gradient-based optimisation. That is, to use and add θ 2 to the optimisation in Eqs. 6 and 7. For maintaining the neurophysiological insight however, this approach requires that the model g (·) is sufficiently interpretable, so that an interpretation of the expectation value and corresponding residual can be made. 5. Conclusion In this study, we designed a novel self-supervised learning based framework for modelling patterns of healthy transitions from eyes-closed to eyes-open resting-state EEG. The residual, representing deviation from healthy expectation value, is used as a candidate biomarker. We hypothesised that this framework could yield more biologically interpretable outputs compared to traditional DL methods, and that currently unknown reactivity patterns could be learned in a data-driven manner. To promote residuals that are sensitive to pathological abnormalities, we applied two optimisation strategies which involves maximising the predictive value of the residual. Neither our proposed framework nor our baselines achieved satisfactory predictive performance for ptau-217 levels, which were used as a surrogate marker of cognitive health. Therefore, the main limitation is likely due to insufficient signal components in the EEG that are strongly associated with the pathological processes approximated by p-tau217 levels at the MCI stage. Hence, this study represents a methodological contribution to the development of interpretable, transition-based EEG biomarkers through self-supervised learning, but its clinical utility remains empirically unverified. Future studies with more sensitive downstream markers are needed to validate the utility of our framework in clinical decision-making. Author contributions Thomas Tveitstøl : Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Software, Visualization, Writing – original draft, Writing – review & editing Mats Tveter : Investigation, Writing – review & editing Christoffer Hatlestad-Hall : Writing – review & editing Hugo L Hammer : Supervision, Writing – review & editing Ira R J Hebold Haraldsen : Funding Acquisition, Writing – review & editing Acknowledgments This project has received funding from the European Union’s Horizon 2020 research and innovation programme under grant agreement No 964220. This publication reflects views of the authors and the European Commission is not responsible for any use that may be made of the information it contains. Appendix Appendix A.1. Hyperparameter distributions In this section, we document all the HPDs that were used. Note that the sampling distributions only hold during the initial random search, not for the trials employing TPE. Appendix A.1.1. Datasets for the pretext task The datasets that were used for training on the pretext tasks were either LEMON, Dotmund Vital, or both, sampled with equal probabilities. Appendix A.1.2. Pre-processing The HPs involved in the pre-processing of the data were sampling rate and input length. Additionally, the selection of channels system and the interpolation method were conditional HPs constrained to trials employing interpolation for handling the varied electrode configurations. The corresponding HPDs are detailed in table A.1 . View this table: View inline View popup Download powerpoint Table A1. Sampling distributions for the pre-processing steps. U { a 1 , a 2 , …, a n } denotes uniform sampling over the discrete set { a 1 , a 2 , …, a n } Appendix A.1.3. DL architectures We considered five DL architectures, which were Inception network [ 70 ], ShallowFBCSPNet [ 71 ], Deep4Net [ 71 ], TCN [ 72 ], and GREEN [ 57 ]. Below, we provide a brief overview of each architecture. For complete descriptions, see the original works. In addition, we outline the sampling distributions of their HPs as used in our experiments. For all architectures, the channels were normalised to 0 mean and unit standard deviation before its forward pass, with 50% probability. Inception network [ 70 ] The main building block of Inception network is the Inception module, which performs convolution with kernels of different sizes and aggregates the resulting feature maps by concatenation. Three Inception modules in series builds a single residual block. Inception network was originally composed of two such residual blocks, connected by a shortcut layer. The number of residual blocks was sampled from the logarithmic distribution d ∼ LogU ℕ [ 1 , 12 ]. Furthermore, the number of convolutional filters of the inception modules was sampled from f ∼ LogU ℕ {4, 64}. Deep4Net [ 71 ] Deep4Net consists of four convolutional-max pooling blocks. The first block includes both temporal and spatial filters, while the subsequent three blocks use convolution and max-pooling, with the number of convolutional units doubling for each successive block. Padding on the last three convolutional layers was added. This was added to allow for a smaller number of input time steps. The kernel length for all convolutions, with the exception of the spatial filter in the first convolutional block, was fixed equal and sampled from l ∼ U ℕ [ 5 , 15 ]. The number of filters for the temporal and the spatial convolutional layers in the first convolutional block was fixed equal and sampled from f 1 ∼ U ℕ [ 10 , 40 ]. The subsequent layers were computed as f i = 2 f i −1 ∀ i ∈ {2, 3, 4}. Drop-out on the last layer was applied, and sampled from drop-out ∼ U [0, 0.5]. ShallowFBCSPNet [ 71 ] ShallowFBCSPNet was inspired by the filterbank common spatial pattern pipeline. Similar to Deep4Net, its first two layers comprise separate temporal and spatial convolutional layers. These are followed by a squaring non-linearity activation function, a mean pooling layer, and a logarithmic activation function. The kernel length of the temporal convolution in the first layer was sampled from l ∼ U ℕ [ 5 , 45 ]. As for Deep4Net, the number of filters for the temporal and the spatial convolutional layers in the first convolutional block was fixed equal. The number of filters was sampled from f 1 ∼ U ℕ [ 20 , 60 ], the number of strides of the mean pooling layer was t ∼ U ℕ [ 5 , 25 ], and the kernel length of the pooling layer was computed as 5 t . Drop-out on the last layer was sampled from drop-out ∼ U [0, 0.5]. TCN [ 72 ] TCN employs dilated convolutional 1D filters to obtain exponentially large receptive fields. Furthermore, it uses residual modules which consist of, repeated twice, (1) a dilated causal convolution with weight normalisation, (2) Relu activation function, (3) and a spatial drop-out layer operating per channel. To ensure that the input and output tensors can be added together at the skip connection layer, a convolution with a kernel size of 1 × 1 is applied, if needed, to the input tensor to match the feature dimension of the output tensor. Originally, the TCN architecture makes one prediction per time step. To obtain a scalar output, we added a global average pooling layer in the temporal dimension prior to the output layer. The dilation factor was fixed at d = 2. The number of residual blocks was sampled from U ℕ [ 1 , 5 ]. The number of filters per residual block was sampled from LogU ℕ [ 4 , 128]. Furthermore, the kernel size in the temporal dimension was sampled from U ℕ [ 3 , 8 ]. Finally, the drop-out rate was sampled from U [0, 0.5]. GREEN [ 57 ] The Gabor Riemann EEGNet (GREEN) is a recently developed architecture which integrates domain knowledge by combining DL with spectral analysis and geometric operations. The first module of the network is parametrised convolution , which are wavelets on the form , where σ ∈ ℝ + is the standard deviation of the Gaussian window and f ∈ ℝ + is the frequency of the complex sinusoid. Here, σ and f are treated as trainable parameters. The second module of GREEN is the pooling layer , which transforms the wavelet-transformed signals to non-temporal features. The original work implemented pooling layers which compute (1) the real covariance matrix, (2) the real covariance matrix with cross-frequency interactions, (3) pairwise phase-locking values, (4) cross-frequency phase-locking value, (5) a combination of real covariance matrix and pairwise phase-locking, and (5) a combination of cross-frequency covariance matrix and cross-frequency phase-locking value. To mitigate potential numerical instabilities due to overestimation of the range of eigenvalues in the sample covariance matrices, a covariance shrinkage operation was applied as a subsequent step. Furthermore, a BiMap layer was implemented to mitigate issues with rank deficient covariance matrices. This is followed by a rectified eigenvalue (ReEig) layer, and a logarithm mapping. Finally, a fully connected module is applied. The number of complex wavelets used in the parameterised convolution module was sampled from the distribution LogU ℕ [ 1 , 10 ]. For initialisation of σ and f , we used either the Morlet parametrisation or random initialisation, sampled with equal probabilities. The lower and upper bound of the initialisations of f where f min = 1, f min = 2 5.5 . The upper and lower bound of the full width half maximum (FWHM) where FWHM max = 2 1 , FWHM min = 2 −4.5 . The strides of the convolution was sampled from U ℕ [ 1 , 5 ]. The kernel width was sampled from U [2 s , 4 s ]. For the pooling layer, we used either the real covariance matrix, cross-frequency covariance matrix, pairwise phase-locking value, cross-frequency phase-locking value, a combination of real covariance matrix and pairwise phase-locking, or a combination of cross-frequency covariance matrix and cross-frequency phaselocking value, sampled with equal probabilities. When using either two of the phase-locking pooling layers, we sampled the ‘reg’ HP from reg LogU [10 −7 , 10 −5 ] In the shrinkage layer, we initialised the shrinkage parameter by sampling from the distribution U [− 5, − 1]. For the BiMap layer, the output dimension was computed as [ x ·(#input channels − 1)] where [·] is the rounding operator, with x ∼ U [0.5, 1.0]. For the ReEig layer, the eigenvalue regularisation parameter was sampled from LogU [10 −5 , 10 −3 ]. In the LogMap layer, we used either identity or the running log-Euclidean mean as reference, sampled with equal probabilities. The momentum for in the update rule ( Eq. 7 in [ 57 ]) was sampled from U [0.8, 1.0]. If the matrix passed to the LogMap layer was ill-conditions, hence failing to compute eigenvalues, the trial was pruned. The number of units for the input layer of the fully connected module was sampled from U ℕ [ 16 , 128], whereas the number of hidden layers was sampled from U ℕ [ 1 , 3 ]. The number of units for a given layer was computed as half of its prior. Finally, the drop-out rate was sampled from U [0, 0.5]. Appendix A.1.4. Varied electrode configurations To allow for varied electrode configurations with different numbers of channels, we used interpolation or region-based pooling (RBP) [ 73 ], sampled with equal probabilities. When interpolation was used, the interpolation method and channel system was sampled from the distributions described in table A.1. When using RBP, the algorithm for splitting the montage into different regions was the same as in the original work. This includes to repeatedly compute the centroid of 2D projections of channel positions, and generate k angles which partitions the current region such that the newly formed regions contains the same number of electrodes. To ensure that all channel systems were compatible with all montage splits, we used the intersection of electrodes to fit the montage splits. The split vectors, which determines all the k - values per region split, were k 1 = (2, 2, 2, 2, 2, 2, 2) T , k 2 = (3, 3, 3, 3, 3, 3, 3, 3) T , k 3 = (2, 3, 2, 3, 2, 3, 2, 3, 2) T , k 4 = (4, 3, 2, 3, 4, 3, 2, 3, 4) T . For a single montage split, the split vector was sampled with equal probabilities from the discrete set { k 1 , k 2 , k 3 , k 4 }. The stopping criterion, which prevents the generation of a new split if the number of electrodes is below this threshold, was sampled from U ℕ [ 1 , 6 ]. Finally, the total number of montage splits was sampled from m ∼ LogU ℕ [ 1 , 8 ]. We considered three pooling methods for aggregating channels within a region; (1) averaging, (2) channel attention, and (3) channel attention with a head region. As in the original work, the attention mechanism relied on feature extraction similar to ROCKET [ 74 ], which involved applying random convolutional kernels followed by computing the mean and maximum value of the feature maps. We allowed for the use of multiple pooling modules , which uses parameter sharing across the montage splits it operates on. The use of a single pooling module was employed with a 50% probability. Otherwise, the number of pooling modules was computed as p = max (1, [ xm ]), with x ∼ U [0, 0.5]. The number of montage splits per pooling module was allocated by initialising p bins with unit cardinality and incrementing a randomly selected bin m – p times. The size of the i -th bin was used as the number of montage splits for the i -th pooling module. When using pooling modules with a channel attention mechanism, the number of kernels was sampled from LogU ℕ [ 10 , 500], and their maximum receptive field was sampled from LogU ℕ [15, 100]. For channel attention with a head region, the dimensionality of the search vector was sampled from LogU ℕ [ 4 , 32 ]. Finally, the parameter sharing of the embedding functions in Eqs. (3-4) and (6-7) in [ 73 ] were used with a 50% probability. Appendix A.1.5. Loss and optimiser The Adam optimiser was used for training the DL models on their pretext tasks. The learning rate was sampled from η ∼ LogU ℕ [10 −5 , 10 −2 ]. Furthermore, the beta values were sampled from β 1 ∼ U [0.7, 1] and β 2 ∼ U [0.7, 1], and the epsilon value for numerical stability was sampled from ϵ ∼ LogU ℕ [10 −9 , 10 −7 ]. The batch size was fixed to 64. The loss function was MAE or MSE , sampled with equal probabilities. If the training included both LEMON and Dortmund Vital, a sample re-weighting was employed with a 50% probability. This included weighting the subjects according to the size of the dataset they belong to by , where τ is an HP which determines the power of re-weighting ( τ = 0 corresponds to equal weighting of all subjects, while τ = 1 corresponds to equal weighting of all datasets), N k ∈ ℕis the training sample size of the k -th dataset and α is a normalisation constant which ensures weighted averaging [ 75 ]. For the trials employing this re-weighting, τ was sampled from the distribution τ ∈ U [0, 1]. Appendix A.1.6. Algorithms for multi-task learning In semi-supervised MTL, the two tasks were jointly optimised using (1) equal weighting of the loss functions, (2) GradNorm [ 38 ], (3) MGDA [ 39 , 40 ], (4) PCGrad [ 41 ], or (5) uncertainty weighting [ 42 ], with equal probabilities. In GradNorm, we used stochastic gradient decent on the weights w i with a learning rate sampled from LogU [10 −3 , 10 −1 ]. Furthermore, the α HP which is related to the training rate balance between tasks was sampled from α ∼ U [0, 3] Appendix A.2. Additional performance scores Fig A.6 shows performance scores for all trials for the six different experiment-types from Sec. 2.4, including explained variance regression score, Spearman’s rho, and concordance correlation coefficient. After model selection, the performance scores were linear regression with band-power features (explained variance = 0.03, Spearman’s rho = 0.17, concordance correlation = 0.08), downstream DL training (explained variance = − 0.25, Spearman’s rho = 0.12, concordance correlation = 0.13), pretraining and fine-tuning (explained variance = − 0.20, Spearman’s rho = 0.02, concordance correlation = 0.04), simple supervised HPO (explained variance = − 0.04, Spearman’s rho = 0.01, concordance correlation = 0.04), multivariable supervised HPO (explained variance = 0.01, Spearman’s rho = 0.17, concordance correlation = 0.16), and semi-supervised MTL (explained variance = −0.09, Spearman’s rho = 0.14, concordance correlation = 0.12). Download figure Open in new tab Figure A6. Test scores for all experiments and trials. The star indicates the test scores after model selection. Here, explained variance refers to the offset-invariant explained variance regression score as computed by the ‘explained_variance_score’ func- tion in sklearn [ 76 ]. Notably, all selected models obtained negative R 2 scores, low explained variance regression scores, and weak correlations. Fig. A.7 shows the performance on the downstream task plotted against performance on the pretext task, for explained variance regression score, Spearman’s rho, and concordance correlation coefficient. The Pareto-front, representing the scores of the models that were Pareto-optimal across all epochs and trials on validation R 2 scores, obtained pretext test scores ranging from explained variance min = 0.64 to explained variance max = 0.94, Spearman’s rho min = 0.72 to Spearman’s rho max = 0.97, and concordance correlation min = 0.79 to concordance correlation max = 0.97. On the downstream task, the test performance scores ranged from explained variance min = − 0.53 to explained variance max = 0.04, Spearman’s rho min = − 0.08 to Spearman’s rho max = 0.25, and concordance correlation min = 0.01 to concordance correlation max = 0.18. Download figure Open in new tab Figure A7. Performance scores on the downstream test set and pretext validation set for different metrics. Here, all Pareto optimal scores (as computed by R 2 score on the downstream and pretext validation sets) per trial is plotted. The stars indicate that the solution was Pareto optimal as estimated on the validation sets across all trials, hence representing the estimated Pareto front during model selection. Performance scores below selected threshold were truncated (Explained variance < −1.0, concordance correlation < −0.2, and Spearman’s rho < −0.2). Funder Information Declared EU Horizon 2020 , 964220 Footnotes ↵ 1 These authors share last authorship. ↵ 2 Our framework is not restricted to DL, but can be used with any prediction model. For consistency with our experiments, however, we will assume a DL model throughout the methods section. ↵ 3 Setting O 1 = O 2 removes the “cross-ocular” aspect but is unproblematic for those interested in learning within-ocular-state expectation values. ↵ 4 when region based pooling was used for handling varied electrode configurations, no interpolation was employed ↵ 5 in our framework, solving the downstream task without solving the pretext task would also be incomplete, as it would imply that the residual cannot meaningfully be interpreted as a deviation from an expectation value, leading to a lack of biological insight. References [1]. ↵ P. Rajpurkar , E. Chen , O. Banerjee , E. J. Topol , Ai in health and medicine , Nature medicine 28 ( 1 ) ( 2022 ) 31 – 38 . OpenUrl CrossRef PubMed [2]. ↵ I. H. Haraldsen , C. Hatlestad-Hall , C. Marra , H. Renvall , F. Maestú , J. Acosta-Hernández , S. Alfonsin , V. Andersson , A. Anand , V. Ayllón , A. Babic , A. Belhadi , C. Birck , R. Bruña , N. Caraglia , C. Carrarini , E. Christensen , A. Cicchetti , S. Daugbjerg , R. Di Bidino , A. Diaz-Ponce , A. Drews , G.M. Giuffrè , J. Georges , P. Gil-Gregorio , D. Gove , T. M. Govers , H. Hallock , M. Hietanen , L. Holmen , J. Hotta , S. Kaski , R. Khadka , A. S. Kinnunen , A. M. Koivisto , S. Kulashekhar , D. Larsen , M. Liljeström , P. G. Lind , A. Marcos Dolado , S. Marshall , S. Merz , F. Miraglia , J. Montonen , V. Mäntynen , A.R. Øksengård , J. Olazarán , T. Paajanen , J. M. Peña , L. Peña , D. l. Peniche , A. S. Perez , M. Radwan , F. Ramírez-Toraño, A. Rodríguez-Pedrero , T. Saarinen , M. Salas-Carrillo , R. Salmelin , S. Sousa , A. Suyuthi , M. Toft , P. Toharia , T. Tveitstøl , M. Tveter , R. Upreti , R. J. Vermeulen , F. Vecchio , A. Yazidi , P. M. Rossini , Intelligent digital tools for screening of brain connectivity and dementia risk estimation in people affected by mild cognitive impairment: the ai-mind clinical study protocol , Frontiers in Neurorobotics 17 ( 2024 ). doi: 10.3389/fnbot.2023.1289406 . URL https://www.frontiersin.org/articles/10.3389/fnbot.2023.1289406 OpenUrl CrossRef [3]. ↵ A. Biasiucci , B. Franceschiello , M. M. Murray , Electroencephalography , Current Biology 29 ( 3 ) ( 2019 ) R80 – R85 . doi: 10.1016/j.cub.2018.11.052 . URL https://www.sciencedirect.com/science/article/pii/S0960982218315513 OpenUrl CrossRef PubMed [4]. ↵ Y. LeCun , Y. Bengio , G. Hinton , Deep learning , Nature 521 ( 7553 ) ( 2015 ) 436 – 444 . OpenUrl CrossRef PubMed [5]. ↵ I. Guyon , U. V. Luxburg , S. Bengio , H. Wallach , R. Fergus , S. Vishwanathan , R. Garnett A. Vaswani , N. Shazeer , N. Parmar , J. Uszkoreit , L. Jones , A. N. Gomez , L. u. Kaiser , I. Polosukhin , Attention is all you need , in: I. Guyon , U. V. Luxburg , S. Bengio , H. Wallach , R. Fergus , S. Vishwanathan , R. Garnett (Eds.), Advances in Neural Information Processing Systems , Vol. 30 , Curran Associates, Inc ., 2017 . URL https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf [6]. ↵ F. Pereira , C. Burges , L. Bottou , K. Weinberger A. Krizhevsky , I. Sutskever , G. E. Hinton , Imagenet classification with deep convolutional neural networks , in: F. Pereira , C. Burges , L. Bottou , K. Weinberger (Eds.), Advances in Neural Information Processing Systems , Vol. 25 , Curran Associates, Inc ., 2012 . URL https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf [7]. ↵ K. He , X. Zhang , S. Ren , J. Sun , Deep residual learning for image recognition , in: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , 2016 , pp. 770 – 778 . doi: 10.1109/CVPR.2016.90 . OpenUrl CrossRef [8]. ↵ G. Hinton , L. Deng , D. Yu , G. E. Dahl , A.-r. Mohamed , N. Jaitly , A. Senior , V. Vanhoucke , P. Nguyen , T. N. Sainath , B. Kingsbury , Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups , IEEE Signal Processing Magazine 29 ( 6 ) ( 2012 ) 82 – 97 . doi: 10.1109/MSP.2012.2205597 . OpenUrl CrossRef [9]. ↵ Y. Roy , H. Banville , I. Albuquerque , A. Gramfort , T. H. Falk , J. Faubert , Deep learning-based electroencephalography analysis: a systematic review , Journal of Neural Engineering 16 ( 5 ) ( 2019 ) 051001 . doi: 10.1088/1741-2552/ab260c . URL https://dx.doi.org/10.1088/1741-2552/ab260c OpenUrl CrossRef PubMed [10]. ↵ S. Montazeri , M. Airaksinen , P. Nevalainen , V. Marchi , L. Hellström-Westas , N. J. Stevenson , S. Vanhatalo , An automated bedside measure for monitoring neonatal cortical activity: a supervised deep learning-based electroencephalogram classifier with external cohort validation , The Lancet Digital Health 4 ( 12 ) ( 2022 ) e884 – e892 . OpenUrl [11]. ↵ J. Tveit , H. Aurlien , S. Plis , V. D. Calhoun , W. O. Tatum , D. L. Schomer , V. Arntsen , F. Cox , F. Fahoum , W. B. Gallentine , E. Gardella , C. D. Hahn , A. M. Husain , S. Kessler , M. A. Kural , F. A. Nascimento , H. Tankisi , L. B. Ulvin , R. Wennberg , S. Beniczky , Automated interpretation of clinical electroencephalograms using artificial intelligence , JAMA Neurology 80 ( 8 ) ( 2023 ) 805 – 812 . arXiv: https://jamanetwork.com/journals/jamaneurology/articlepdf/2806244/jamaneurology\_tveit\_2023\_oi\_230036\_1691786999.53858.pdf , doi: 10.1001/jamaneurol.2023.1645 . URL https://doi.org/10.1001/jamaneurol.2023.1645 OpenUrl CrossRef PubMed [12]. ↵ D. Mansilla , J. Tveit , H. Aurlien , T. Avigdor , V. Ros-Castello , A. Ho , C. Abdallah , J. Gotman , S. Beniczky , B. Frauscher , Generalizability of electroencephalographic interpretation using artificial intelligence: An external validation study , Epilepsia 65 ( 10 ) ( 2024 ) 3028 – 3037 . arXiv: https://onlinelibrary.wiley.com/doi/pdf/10.1111/epi.18082 , doi: 10.1111/epi.18082 . URL https://onlinelibrary.wiley.com/doi/abs/10.1111/epi.18082 OpenUrl CrossRef PubMed [13]. ↵ C. Babiloni , I. Lorenzo , R. Lizio , S. Lopez , F. Tucci , R. Ferri , A. Soricelli , F. Nobili , D. Arnaldi , F. Famà , C. Buttinelli , F. Giubilei , V. Cipollini , M. Onofrj , F. Stocchi , L. Vacca , P. Fuhr , U. Gschwandtner , G. Ransmayr , D. Aarsland , L. Parnetti , M. Marizzoni , F. D’Antonio , C. De Lena , B. Güntekin , E. Yıldırım , L. Hanoğlu , G. Yener , D.H. Gündüz , J. P. Taylor , J. Schumacher , I. McKeith , G. B. Frisoni , M. F. De Pandis , L. Bonanni , C. D. Percio , G. Noce , Reactivity of posterior cortical electroencephalographic alpha rhythms during eyes opening in cognitively intact older adults and patients with dementia due to alzheimer’s and lewy body diseases , Neurobiology of Aging 115 ( 2022 ) 88 – 108 . doi: 10.1016/j.neurobiolaging.2022.04.001 . URL https://www.sciencedirect.com/science/article/pii/S0197458022000707 OpenUrl CrossRef [14]. ↵ C. Babiloni , G. Noce , F. Tucci , D. Jakhar , R. Ferri , S. Panerai , V. Catania , A. Soricelli , M. Salvatore , F. Nobili , D. Arnaldi , F. Famà , C. Buttinelli , F. Giubilei , M. Onofrj , F. Stocchi , L. Vacca , F. Radicati , P. Fuhr , U. Gschwandtner , G. Ransmayr , L. Parnetti , M. Marizzoni , F. D’Antonio , G. Bruno , C. De Lena , B. Güntekin , E. Yıldırım , L. Hanoğlu , G. Yener , D. Hünerli , J.-P. Taylor , J. Schumacher , I. McKeith , G. B. Frisoni , A. Antonini , F. Ferreri , L. Bonanni , M. F. De Pandis , C. Del Percio , Poor reactivity of posterior electroencephalographic alpha rhythms during the eyes open condition in patients with dementia due to parkinson’s disease , Neurobiology of Aging 135 ( 2024 ) 1 – 14 . doi: 10.1016/j.neurobiolaging.2023.11.010 . URL https://www.sciencedirect.com/science/article/pii/S0197458023002725 OpenUrl CrossRef PubMed [15]. ↵ E. Azabou , V. Navarro , N. Kubis , M. Gavaret , N. Heming , A. Cariou , D. Annane , F. Lofaso , L. Naccache , T. Sharshar , Value and mechanisms of EEG reactivity in the prognosis of patients with impaired consciousness: a systematic review , Critical Care 22 ( 1 ) ( 2018 ) 184 . OpenUrl [16]. ↵ S. Benghanem , M. Paul , J. Charpentier , S. Rouhani , O. Ben Hadj Salem , L. Guillemet , S. Legriel , W. Bougouin , F. Pène , J. D. Chiche , J.-P. Mira , F. Dumas , A. Cariou , Value of eeg reactivity for prediction of neurologic outcome after cardiac arrest: Insights from the parisian registry , Resuscitation 142 ( 2019 ) 168 – 174 . doi: 10.1016/j.resuscitation.2019.06.009 . URL https://www.sciencedirect.com/science/article/pii/S0300957219302229 OpenUrl CrossRef [17]. ↵ M. Admiraal , A. van Rootselaar , J. Horn , International consensus on eeg reactivity testing after cardiac arrest: Towards standardization , Resuscitation 131 ( 2018 ) 36 – 41 . doi: 10.1016/j.resuscitation.2018.07.025 . URL https://www.sciencedirect.com/science/article/pii/S0300957218303691 OpenUrl CrossRef [18]. ↵ R. Krishnan , P. Rajpurkar , E. J. Topol , Self-supervised learning in medicine and healthcare , Nature Biomedical Engineering 6 ( 12 ) ( 2022 ) 1346 – 1352 . OpenUrl PubMed [19]. ↵ A. Jaiswal , A. R. Babu , M. Z. Zadeh , D. Banerjee , F. Makedon , A survey on contrastive self-supervised learning , Technologies 9 ( 1 ) ( 2021 ). doi: 10.3390/technologies9010002 . URL https://www.mdpi.com/2227-7080/9/1/2 OpenUrl CrossRef [20]. ↵ B. Leibe , J. Matas , N. Sebe , M. Welling M. Noroozi , P. Favaro , nsupervised learning of visual representations by solving jigsaw puzzles , in: B. Leibe , J. Matas , N. Sebe , M. Welling (Eds.), Computer Vision – ECCV 2016 , Springer International Publishing , Cham , 2016 , pp. 69 – 84 . [21]. ↵ X. Wang , Y. Ma , J. Cammon , F. Fang , Y. Gao , Y. Zhang , Self-supervised eeg emotion recognition models based on cnn , IEEE Transactions on Neural Systems and Rehabilitation Engineering 31 ( 2023 ) 1952 – 1962 . doi: 10.1109/TNSRE.2023.3263570 . OpenUrl CrossRef [22]. ↵ H. Cunlin , Y. Ye , X. Nenggang , Self-supervised motor imagery eeg recognition model based on 1-d mtcnn-lstm network , Journal of Neural Engineering 21 ( 3 ) ( 2024 ) 036014 . doi: 10.1088/1741-2552/ad48bc . URL https://dx.doi.org/10.1088/1741-2552/ad48bc OpenUrl CrossRef [23]. ↵ Y. Zeng , J. Lin , Z. Li , Z. Xiao , C. Wang , X. Ge , C. Wang , G. Huang , M. Liu , Adaptive node feature extraction in graph-based neural networks for brain diseases diagnosis using self-supervised learning , NeuroImage 297 ( 2024 ) 120750 . doi: 10.1016/j.neuroimage.2024.120750 . URL https://www.sciencedirect.com/science/article/pii/S1053811924002477 OpenUrl CrossRef PubMed [24]. ↵ Y. Li , S. Luo , H. Zhang , Y. Zhang , Y. Zhang , B. Lo , Mtclss: Multi-task contrastive learning for semisupervised pediatric sleep staging , IEEE Journal of Biomedical and Health Informatics 27 ( 6 ) ( 2023 ) 2647 – 2655 . doi: 10.1109/JBHI.2022.3213171 . OpenUrl CrossRef [25]. ↵ W. Ko , S. Jeong , S.-K. Song , H.-I. Suk , Eeg-oriented self-supervised learning with triple information pathways network , IEEE Transactions on Cybernetics 54 ( 11 ) ( 2024 ) 6495 – 6508 . doi: 10.1109/TCYB.2024.3410844 . OpenUrl CrossRef [26]. ↵ G. Li , N. Chen , Y. Niu , Z. Xu , Y. Dong , J. Jin , H. Zhu , Mslte: multiple self-supervised learning tasks for enhancing eeg emotion recognition , Journal of Neural Engineering 21 ( 2 ) ( 2024 ) 024003 . doi: 10.1088/1741-2552/ad3c28 . URL https://dx.doi.org/10.1088/1741-2552/ad3c28 OpenUrl CrossRef [27]. ↵ Z. Fu , H. Zhu , Y. Zhao , R. Huan , Y. Zhang , S. Chen , Y. Pan , Gmaeeg: A self-supervised graph masked autoencoder for eeg representation learning , IEEE Journal of Biomedical and Health Informatics 28 ( 11 ) ( 2024 ) 6486 – 6497 . doi: 10.1109/JBHI.2024.3443651 . OpenUrl CrossRef [28]. ↵ C. Yang , C. Xiao , M. B. Westover , J. Sun , Self-supervised electroencephalogram representation learning for automatic sleep staging: Model development and evaluation study , JMIR AI 2 ( 2023 ) e46769 . doi: 10.2196/46769 . URL https://ai.jmir.org/2023/1/e46769 OpenUrl CrossRef [29]. ↵ C. Zhao , W. Wu , H. Zhang , R. Zhang , X. Zheng , X. Kong , Sleep stage classification via multi-view based self-supervised contrastive learning of eeg , IEEE Journal of Biomedical and Health Informatics 28 ( 12 ) ( 2024 ) 7068 – 7077 . doi: 10.1109/JBHI.2024.3432633 . OpenUrl CrossRef [30]. ↵ E. Alsentzer , M. B. A. McDermott , F. Falck , S. K. Sarkar , S. Roy , S. L. Hyland M. N. Mohsenvand , M. R. Izadi , P. Maes , Contrastive representation learning for electroencephalogram classification , in: E. Alsentzer , M. B. A. McDermott , F. Falck , S. K. Sarkar , S. Roy , S. L. Hyland (Eds.), Proceedings of the Machine Learning for Health NeurIPS Workshop, Vol. 136 of Proceedings of Machine Learning Research , PMLR , 2020 , pp. 238 – 253 . URL https://proceedings.mlr.press/v136/mohsenvand20a.html [31]. ↵ H. Banville , O. Chehab , A. Hyvärinen , D.-A. Engemann , A. Gramfort , Uncovering the structure of clinical eeg signals with self-supervised learning , Journal of Neural Engineering 18 ( 4 ) ( 2021 ) 046020 . doi: 10.1088/1741-2552/abca18 . URL https://dx.doi.org/10.1088/1741-2552/abca18 OpenUrl CrossRef [32]. ↵ H. Li , J. Tang , W. Li , W. Dai , Y. Liu , Z. Zhou , Multi-task collaborative network: Bridge the supervised and self-supervised learning for eeg classification in rsvp tasks , IEEE Transactions on Neural Systems and Rehabilitation Engineering 32 ( 2024 ) 638 – 651 . doi: 10.1109/TNSRE.2024.3357863 . OpenUrl CrossRef [33]. ↵ Y. Ou , S. Sun , H. Gan , R. Zhou , Z. Yang , An improved self-supervised learning for eeg classification , Mathematical Biosciences and Engineering 19 ( 7 ) ( 2022 ) 6907 – 6922 . doi: 10.3934/mbe.2022325 . URL https://www.aimspress.com/article/doi/10.3934/mbe.2022325 OpenUrl CrossRef [34]. ↵ Y. He , Z. Lu , J. Wang , S. Ying , J. Shi , A self-supervised learning based channel attention mlp-mixer network for motor imagery decoding , IEEE Transactions on Neural Systems and Rehabilitation Engineering 30 ( 2022 ) 2406 – 2417 . doi: 10.1109/TNSRE.2022.3199363 . OpenUrl CrossRef [35]. ↵ E. Eldele , M. Ragab , Z. Chen , M. Wu , C.-K. Kwoh , X. Li , Self-supervised learning for labelefficient sleep stage classification: A comprehensive evaluation , IEEE Transactions on Neural Systems and Rehabilitation Engineering 31 ( 2023 ) 1333 – 1342 . doi: 10.1109/TNSRE.2023.3245285 . OpenUrl CrossRef [36]. ↵ P. Guetschel , S. Ahmadi , M. Tangermann , Review of deep representation learning techniques for brain–computer interfaces , Journal of Neural Engineering 21 ( 6 ) ( 2024 ) 061002 . doi: 10.1088/1741-2552/ad8962 . URL https://dx.doi.org/10.1088/1741-2552/ad8962 OpenUrl CrossRef [37]. ↵ J. Shawe-Taylor , R. Zemel , P. Bartlett , F. Pereira , K. Weinberger J. Bergstra , R. Bardenet , Y. Bengio , B. Kégl , Algorithms for hyper-parameter optimization , in: J. Shawe-Taylor , R. Zemel , P. Bartlett , F. Pereira , K. Weinberger (Eds.), Advances in Neural Information Processing Systems , Vol. 24 , Curran Associates, Inc ., 2011 . URL https://proceedings.neurips.cc/paper_files/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf [38]. ↵ J. Dy , A. Krause Z. Chen , V. Badrinarayanan , C.-Y. Lee , A. Rabinovich , GradNorm: Gradient normalization for adaptive loss balancing in deep multitask networks , in: J. Dy , A. Krause (Eds.), Proceedings of the 35th International Conference on Machine Learning, Vol. 80 of Proceedings of Machine Learning Research , PMLR , 2018 , pp. 794 – 803 . URL https://proceedings.mlr.press/v80/chen18a.html [39]. ↵ J.-A. Désidéri , Multiple-gradient descent algorithm (mgda) for multiobjective optimization , Comptes Rendus Mathematique 350 ( 5 ) ( 2012 ) 313 – 318 . doi: 10.1016/j.crma.2012.03.014 . URL https://www.sciencedirect.com/science/article/pii/S1631073X12000738 OpenUrl CrossRef [40]. ↵ S. Bengio , H. Wallach , H. Larochelle , K. Grauman , N. Cesa-Bianchi , R. Garnett O. Sener , V. Koltun , Multi-task learning as multi-objective optimization , in: S. Bengio , H. Wallach , H. Larochelle , K. Grauman , N. Cesa-Bianchi , R. Garnett (Eds.), Advances in Neural Information Processing Systems , Vol. 31 , Curran Associates, Inc ., 2018 . URL https://proceedings.neurips.cc/paper_files/paper/2018/file/432aca3a1e345e339f35a30c8f65edce-Paper.pdf [41]. ↵ H. Larochelle , M. Ranzato , R. Hadsell , M. Balcan , H. Lin T. Yu , S. Kumar , A. Gupta , S. Levine , K. Hausman , C. Finn , Gradient surgery for multi-task learning , in: H. Larochelle , M. Ranzato , R. Hadsell , M. Balcan , H. Lin (Eds.), Advances in Neural Information Processing Systems , Vol. 33 , Curran Associates, Inc ., 2020 , pp. 5824 – 5836 . URL https://proceedings.neurips.cc/paper_files/paper/2020/file/3fe78a8acf5fda99de95303940a2420c-Paper.pdf OpenUrl [42]. ↵ R. Cipolla , Y. Gal , A. Kendall , Multi-task learning using uncertainty to weigh losses for scene geometry and semantics , in: 2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition , 2018 , pp. 7482 – 7491 . doi: 10.1109/CVPR.2018.00781 . OpenUrl CrossRef [43]. ↵ S. Palmqvist , S. Janelidze , Y. T. Quiroz , H. Zetterberg , F. Lopera , E. Stomrud , Y. Su , Y. Chen , G. E. Serrano , A. Leuzy , N. Mattsson-Carlgren , O. Strandberg , R. Smith , A. Villegas , D. Sepulveda-Falla , X. Chai , N. K. Proctor , T. G. Beach , K. Blennow , J. L. Dage , E. M. Reiman , O. Hansson , Discriminative accuracy of plasma phospho-tau217 for alzheimer disease vs other neurodegenerative disorders , JAMA 324 ( 8 ) ( 2020 ) 772 – 781 . arXiv: https://jamanetwork.com/journals/jama/articlepdf/2768841/jama\_palmqvist\_2020\_oi\_200077\_1597950344.77871.pdf , doi: 10.1001/jama.2020.12134 . OpenUrl CrossRef PubMed [44]. ↵ A. Babayan , M. Erbey , D. Kumral , J. D. Reinelt , A. M. Reiter , J. Röbbig , H. L. Schaare , M. Uhlig , A. Anwander , P.-L. Bazin , et al. , A mind-brain-body dataset of mri, eeg, cognition, emotion, and peripheral physiology in young and old adults , Scientific data 6 ( 1 ) ( 2019 ) 1 – 21 . OpenUrl PubMed [45]. ↵ E. Wascher , D. Schneider , P. D. Gajewski , S. Getzmann , “resting-state eeg data before and after cognitive activity across the adult lifespan and a 5-year follow-up” ( 2024 ). doi : doi: 10.18112/openneuro.ds005385.v1.0.0 . OpenUrl CrossRef [46]. ↵ P. D. Gajewski , S. Getzmann , P. Bröde , M. Burke , C. Cadenas , S. Capellino , M. Claus , E. Genç , K. Golka , J. G. Hengstler , T. Kleinsorge , R. Marchan , M. A. Nitsche , J. Reinders , C. van Thriel , C. Watzl , E. Wascher , Impact of biological and lifestyle factors on cognitive aging and work ability in the dortmund vital study: Protocol of an interdisciplinary, cross-sectional, and longitudinal study , JMIR Res Protoc 11 ( 3 ) ( 2022 ) e32352 . doi: 10.2196/32352 . URL https://www.researchprotocols.org/2022/3/e32352 OpenUrl CrossRef [47]. ↵ A. Gramfort , M. Luessi , E. Larson , D. A. Engemann , D. Strohmeier , C. Brodbeck , R. Goj , M. Jas , T. Brooks , L. Parkkonen , M.S. Hämäläinen , MEG and EEG data analysis with MNE-Python , Frontiers in Neuroscience 7 ( 267 ) ( 2013 ) 1 – 13 . doi: 10.3389/fnins.2013.00267 . OpenUrl CrossRef PubMed [48]. ↵ S. Getzmann , P. D. Gajewski , D. Schneider , E. Wascher , Resting-state EEG data before and after cognitive activity across the adult lifespan and a 5-year follow-up , Sci. Data 11 ( 1 ) ( 2024 ) 988 . OpenUrl [49]. ↵ M. Jas , D. A. Engemann , Y. Bekhti , F. Raimondo , A. Gramfort , Autoreject: Automated artifact rejection for meg and eeg data , NeuroImage 159 ( 2017 ) 417 – 429 . doi: 10.1016/j.neuroimage.2017.06.030 . URL https://www.sciencedirect.com/science/article/pii/S1053811917305013 OpenUrl CrossRef PubMed [50]. ↵ J. Bergstra , Y. Bengio , Random search for hyper-parameter optimization , J. Mach. Learn. Res . 13 (null) ( 2012 ) 281 – 305 . OpenUrl [51]. ↵ T. Akiba , S. Sano , T. Yanase , T. Ohta , M. Koyama , Optuna: A next-generation hyperparameter optimization framework , in: Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , 2019 . [52]. ↵ Y. Ozaki , Y. Tanigaki , S. Watanabe , M. Onishi , Multiobjective tree-structured parzen estimator for computationally expensive optimization problems , in: Proceedings of the 2020 Genetic and Evolutionary Computation Conference, GECCO ‘20, Association for Computing Machinery, New York, NY, USA , 2020 , pp. 533 – 541 . doi: 10.1145/3377930.3389817 . URL https://doi.org/10.1145/3377930.3389817 OpenUrl CrossRef [53]. ↵ S. Ng , S. Masarone , D. Watson , M. R. Barnes , The benefits and pitfalls of machine learning for biomarker discovery , Cell and Tissue Research 394 ( 1 ) ( 2023 ) 17 – 31 . OpenUrl CrossRef PubMed [54]. ↵ P. Monllor , A. Cervera-Ferri , M.-A. Lloret , D. Esteve , B. Lopez , J.-L. Leon , A. Lloret , Electroencephalography as a non-invasive biomarker of alzheimer’s disease: A forgotten candidate to substitute csf molecules? , International Journal of Molecular Sciences 22 ( 19 ) ( 2021 ). doi: 10.3390/ijms221910889 . URL https://www.mdpi.com/1422-0067/22/19/10889 OpenUrl CrossRef PubMed [55]. ↵ C. Babiloni , R. Lizio , F. Vecchio , G. B. Frisoni , M. Pievani , C. Geroldi , F. Claudia , R. Ferri , B. Lanuzza , P. M. Rossini , Reactivity of cortical alpha rhythms to eye opening in mild cognitive impairment and alzheimer’s disease: an eeg study , Journal of Alzheimer’s Disease 22 ( 4 ) ( 2010 ) 1047 – 1064 , pMID: 20930306 . arXiv: https://journals.sagepub.com/doi/pdf/10.3233/JAD-2010-100798 , doi: 10.3233/JAD-2010-100798 . URL https://journals.sagepub.com/doi/abs/10.3233/JAD-2010-100798 OpenUrl CrossRef PubMed [56]. ↵ Z. Ghahramani , M. Welling , C. Cortes , N. Lawrence , K. Weinberger J. Yosinski , J. Clune , Y. Bengio , H. Lipson , How transferable are features in deep neural networks? , in: Z. Ghahramani , M. Welling , C. Cortes , N. Lawrence , K. Weinberger (Eds.), Advances in Neural Information Processing Systems , Vol. 27 , Curran Associates, Inc ., 2014 . URL https://proceedings.neurips.cc/paper_files/paper/2014/file/532a2f85b6977104bc93f8580abbb330-Paper.pdf [57]. ↵ J. Paillard , J. F. Hipp , D. A. Engemann , Green: A lightweight architecture using learnable wavelets and riemannian geometry for biomarker exploration with eeg signals , Patterns ( 2025 ) 101182 doi: 10.1016/j.patter.2025.101182 . URL https://www.sciencedirect.com/science/article/pii/S2666389925000303 OpenUrl CrossRef [58]. ↵ S. Chae , J. Park , M. S. Byun , D. Yi , J. H. Lee , G. H. Byeon , H. W. Suk , H. Choi , J. E. Park , D. Y. Lee , Decreased alpha reactivity from eyes-closed to eyes-open in non-demented older adults with alzheimer’s disease: A combined eeg and [18f]florbetaben pet study , Journal of Alzheimer’s Disease 77 ( 4 ) ( 2020 ) 1681 – 1692 , pMID: 32925046 . arXiv: https://journals.sagepub.com/doi/pdf/10.3233/JAD-200442 , doi: 10.3233/JAD-200442 . URL https://journals.sagepub.com/doi/abs/10.3233/JAD-200442 OpenUrl CrossRef PubMed [59]. ↵ R. J. Barry , F. M. De Blasio , Eeg differences between eyes-closed and eyes-open resting remain in healthy ageing , Biological psychology 129 ( 2017 ) 293 —304. doi: 10.1016/j.biopsycho.2017.09.010 . URL https://ro.uow.edu.au/articles/journal_contribution/EEG_differences_between_eyes-closed_and_eyes-open_resting_remain_in_healthy_ageing/27733818/1/files/50491716.pdf OpenUrl CrossRef PubMed [60]. ↵ R. T. Ibitoye , P. Castro , T. J. Ellmers , D. N. Kaski , A. M. Bronstein , Vestibular loss disrupts visual reactivity in the alpha eeg rhythm , NeuroImage: Clinical 39 ( 2023 ) 103469 . doi: 10.1016/j.nicl.2023.103469 . URL https://www.sciencedirect.com/science/article/pii/S2213158223001602 OpenUrl CrossRef [61]. ↵ L. Wan , H. Huang , N. Schwab , J. Tanner , A. Rajan , N. B. Lam , L. Zaborszky , C.-s. R. Li , C. C. Price , M. Ding , From eyes-closed to eyes-open: Role of cholinergic projections in ec-to-eo alpha reactivity revealed by combining eeg and mri , Human Brain Mapping 40 ( 2 ) ( 2019 ) 566 – 577 . arXiv: https://onlinelibrary.wiley.com/doi/pdf/10.1002/hbm.24395 , doi: 10.1002/hbm.24395 . URL https://onlinelibrary.wiley.com/doi/abs/10.1002/hbm.24395 OpenUrl CrossRef [62]. ↵ J. Schumacher , A. J. Thomas , L. R. Peraza , M. Firbank , R. Cromarty , C. A. Hamilton , P. C. Donaghy , J. T. O’Brien , J.-P. Taylor , EEG alpha reactivity and cholinergic system integrity in lewy body dementia and alzheimer’s disease , Alzheimer’s Research & Therapy 12 ( 1 ) ( 2020 ) 46 . OpenUrl [63]. R. Caruana , Multitask learning , Machine Learning 28 ( 1 ) ( 1997 ) 41 – 75 . OpenUrl [64]. Y. Zhao , X. Wang , T. Che , G. Bao , S. Li , Multi-task deep learning for medical image computing and analysis: A review , Computers in Biology and Medicine 153 ( 2023 ) 106496 . doi: 10.1016/j.compbiomed.2022.106496 . URL https://www.sciencedirect.com/science/article/pii/S0010482522012045 OpenUrl CrossRef [65]. W.-H. Li , X. Liu , H. Bilen , Learning multiple dense prediction tasks from partially annotated data , in: 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) , 2022 , pp. 18857 – 18867 . doi: 10.1109/CVPR52688.2022.01831 . OpenUrl CrossRef [66]. Y. Zhang , Q. Yang , A survey on multi-task learning , IEEE Transactions on Knowledge and Data Engineering 34 ( 12 ) ( 2022 ) 5586 – 5609 . doi: 10.1109/TKDE.2021.3070203 . OpenUrl CrossRef [67]. S. Ruder , An overview of multi-task learning in deep neural networks ( 2017 ). arxiv: 1706.05098 . URL https://arxiv.org/abs/1706.05098 [68]. O. Alabi , T. Vercauteren , M. Shi , Multitask learning in minimally invasive surgical vision: A review , Medical Image Analysis 101 ( 2025 ) 103480 . doi: 10.1016/j.media.2025.103480 . URL https://www.sciencedirect.com/science/article/pii/S1361841525000283 OpenUrl CrossRef [69]. S. Vandenhende , S. Georgoulis , W. Van Gansbeke , M. Proesmans , D. Dai , L. Van Gool , Multi-task learning for dense prediction tasks: A survey , IEEE Transactions on Pattern Analysis and Machine Intelligence 44 ( 7 ) ( 2022 ) 3614 – 3633 . doi: 10.1109/TPAMI.2021.3054719 . OpenUrl CrossRef [70]. ↵ H. Ismail Fawaz , B. Lucas , G. Forestier , C. Pelletier , D. F. Schmidt , J. Weber , G. I. Webb , L. Idoumghar , P.-A. Muller , F. Petitjean , Inceptiontime: Finding alexnet for time series classification , Data Mining and Knowledge Discovery ( 2020 ). [71]. ↵ R. T. Schirrmeister , J. T. Springenberg , L. D. J. Fiederer , M. Glasstetter , K. Eggensperger , M. Tangermann , F. Hutter , W. Burgard , T. Ball , Deep learning with convolutional neural networks for eeg decoding and visualization , Human Brain Mapping 38 ( 11 ) ( 2017 ) 5391 – 5420 . arXiv: https://onlinelibrary.wiley.com/doi/pdf/10.1002/hbm.23730 , doi: 10.1002/hbm.23730 . URL https://onlinelibrary.wiley.com/doi/abs/10.1002/hbm.23730 OpenUrl CrossRef PubMed [72]. ↵ S. Bai , J. Z. Kolter , V. Koltun , An empirical evaluation of generic convolutional and recurrent networks for sequence modeling , arxiv: 1803.01271 ( 2018 ). [73]. ↵ T. Tveitstøl , M. Tveter , A. S. Pérez T. , C. Hatlestad-Hall , A. Yazidi , H. L. Hammer , I. R. J. Hebold Haraldsen , Introducing region based pooling for handling a varied number of eeg channels for deep learning models , Frontiers in Neuroinformatics 17 ( 2024 ). doi: 10.3389/fninf.2023.1272791 . URL https://www.frontiersin.org/journals/neuroinformatics/articles/10.3389/fninf.2023.1272791 OpenUrl CrossRef [74]. ↵ A. Dempster , F. Petitjean , G. I. Webb , ROCKET: Exceptionally fast and accurate time series classification using random convolutional kernels , Data Mining and Knowledge Discovery 34 ( 5 ) ( 2020 ) 1454 – 1495 . OpenUrl CrossRef [75]. ↵ T. Tveitstøl , M. Tveter , C. Hatlestad-Hall , H. L. Hammer , D. Engemann , I. Haraldsen , Assessing the robustness of deep learning based brain age prediction models across multiple eeg datasets , bioRxiv ( 2025 ). arXiv: https://www.biorxiv.org/content/early/2025/05/21/2025.05.20.655022.full.pdf , doi: 10.1101/2025.05.20.655022 . URL https://www.biorxiv.org/content/early/2025/05/21/2025.05.20.655022 OpenUrl Abstract / FREE Full Text [76]. ↵ F. Pedregosa , G. Varoquaux , A. Gramfort , V. Michel , B. Thirion , O. Grisel , M. Blondel , P. Prettenhofer , R. Weiss , V. Dubourg , J. Vanderplas , A. Passos , D. Cournapeau , M. Brucher , M. Perrot , E. Duchesnay , Scikit-learn: Machine learning in Python , Journal of Machine Learning Research 12 ( 2011 ) 2825 – 2830 . OpenUrl View the discussion thread. Back to top Previous Next Posted September 21, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Learning Residual-based Biomarkers of Cognitive Health via Self-Supervised Learning on EEG State Transitions Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Learning Residual-based Biomarkers of Cognitive Health via Self-Supervised Learning on EEG State Transitions Thomas Tveitstøl , Mats Tveter , Christoffer Hatlestad-Hall , Hugo L Hammer , Ira R J Hebold Haraldsen bioRxiv 2025.09.21.677598; doi: https://doi.org/10.1101/2025.09.21.677598 Share This Article: Copy Citation Tools Learning Residual-based Biomarkers of Cognitive Health via Self-Supervised Learning on EEG State Transitions Thomas Tveitstøl , Mats Tveter , Christoffer Hatlestad-Hall , Hugo L Hammer , Ira R J Hebold Haraldsen bioRxiv 2025.09.21.677598; doi: https://doi.org/10.1101/2025.09.21.677598 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neuroscience Subject Areas All Articles Animal Behavior and Cognition (7637) Biochemistry (17705) Bioengineering (13899) Bioinformatics (41970) Biophysics (21463) Cancer Biology (18605) Cell Biology (25526) Clinical Trials (138) Developmental Biology (13385) Ecology (19911) Epidemiology (2067) Evolutionary Biology (24329) Genetics (15615) Genomics (22514) Immunology (17743) Microbiology (40424) Molecular Biology (17194) Neuroscience (88650) Paleontology (667) Pathology (2835) Pharmacology and Toxicology (4827) Physiology (7648) Plant Biology (15160) Scientific Communication and Education (2046) Synthetic Biology (4302) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-23T02:00:01.238055+00:00

License: CC-BY-4.0