Full text
66,724 characters
· extracted from
preprint-html
· click to expand
A robust framework for harmonising health measures across international cohorts: Evidence from the COVID-19 pandemic | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search A robust framework for harmonising health measures across international cohorts: Evidence from the COVID-19 pandemic James Lian , Pedro F Zuccolo , Omid V. Ebrahimi , Daniel Fatori , Abhaya Adlakha , Juan F. De La Hoz , Younga H. Lee , Adriana Carneiro , Isabela M. Benseñor , Paulo A. Lotufo , Alessandra C. Goulart , Justin D. Tubbs , Devon Watts , Yu Zhou , Lorenza Dall’Aglio , Mihael Cudic , Morgane Kuenzi , CGMHC Consortium , Ronald C. 
Kessler , Vikram Patel , André Brunoni , Jordan W. Smoller , Sarah Bauermeister doi: https://doi.org/10.1101/2025.09.09.25335409 James Lian 1 Dementias Platform UK, Department of Psychiatry, University of Oxford, Warneford Hospital , Oxford OX3 7JX, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: james.lian{at}psych.ox.ac.uk Pedro F Zuccolo 2 Departamento de Psiquiatria, Faculdade de Medicina FMUSP, Universidade de Sao Paulo , Sao Paulo, BR Find this author on Google Scholar Find this author on PubMed Search for this author on this site Omid V. Ebrahimi 3 Department of Experimental Psychology, University of Oxford , Oxford, UK 4 Department of Psychiatry, University of Oxford , Oxford, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Daniel Fatori 2 Departamento de Psiquiatria, Faculdade de Medicina FMUSP, Universidade de Sao Paulo , Sao Paulo, BR 5 Laboratorio de Psicopatologia e Terapeutica Psiquiatrica LIM-23, Instituto de Psiquiatria, Hospital das Clinicas HCFMUSP, Faculdade de Medicina, Universidade de Sao Paulo , Sao Paulo, SP, BR Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abhaya Adlakha 1 Dementias Platform UK, Department of Psychiatry, University of Oxford, Warneford Hospital , Oxford OX3 7JX, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Juan F. De La Hoz 6 Department of Psychiatry, Massachusetts General Hospital , Boston, Massachusetts, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Younga H. 
Lee 6 Department of Psychiatry, Massachusetts General Hospital , Boston, Massachusetts, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Adriana Carneiro 2 Departamento de Psiquiatria, Faculdade de Medicina FMUSP, Universidade de Sao Paulo , Sao Paulo, BR Find this author on Google Scholar Find this author on PubMed Search for this author on this site Isabela M. Benseñor 7 Centro de Pesquisa Clínica e Epidemiológica, Hospital Universitário, Universidade de São Paulo , BR Find this author on Google Scholar Find this author on PubMed Search for this author on this site Paulo A. Lotufo 7 Centro de Pesquisa Clínica e Epidemiológica, Hospital Universitário, Universidade de São Paulo , BR Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alessandra C. Goulart 7 Centro de Pesquisa Clínica e Epidemiológica, Hospital Universitário, Universidade de São Paulo , BR 8 Departamento de Epidemiologia, Faculdade de Saúde Pública, Universidade de São Paulo , BR Find this author on Google Scholar Find this author on PubMed Search for this author on this site Justin D. 
Tubbs 6 Department of Psychiatry, Massachusetts General Hospital , Boston, Massachusetts, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Devon Watts 6 Department of Psychiatry, Massachusetts General Hospital , Boston, Massachusetts, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yu Zhou 6 Department of Psychiatry, Massachusetts General Hospital , Boston, Massachusetts, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lorenza Dall’Aglio 6 Department of Psychiatry, Massachusetts General Hospital , Boston, Massachusetts, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mihael Cudic 6 Department of Psychiatry, Massachusetts General Hospital , Boston, Massachusetts, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Morgane Kuenzi 1 Dementias Platform UK, Department of Psychiatry, University of Oxford, Warneford Hospital , Oxford OX3 7JX, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ronald C. Kessler 9 Department of Health Care Policy, Harvard Medical School , Boston, MA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Vikram Patel 10 Department of Global Health & Social Medicine, Harvard Medical School , Boston, MA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site André Brunoni 2 Departamento de Psiquiatria, Faculdade de Medicina FMUSP, Universidade de Sao Paulo , Sao Paulo, BR Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jordan W. 
Smoller 6 Department of Psychiatry, Massachusetts General Hospital , Boston, Massachusetts, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sarah Bauermeister 1 Dementias Platform UK, Department of Psychiatry, University of Oxford, Warneford Hospital , Oxford OX3 7JX, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Cross-national research on health trajectories requires harmonised measures that are valid and comparable. However, measurement scales often differ between cohorts in item content and cultural context. We present a structured framework to harmonise mental health measures, using depression and anxiety symptoms across two longitudinal cohorts that used different self-report measures with heterogeneous item codings: ELSA-UK and ELSA-Brasil. Data were collected before, during, and after the COVID-19 pandemic, providing a natural experiment for examining temporal changes in mental health. We applied a theory-driven strategy to align item content across cohorts relying on a priori assumptions of cross-national item equivalence and binary comparability. This involved: (1) mapping items to DSM-5 symptom domains via expert review; (2) transforming response formats into harmonised ordinal indicators; (3) leveraging the Harmony AI tool to identify semantically equivalent items; and (4) establishing measurement invariance across waves and cohorts using multi-group confirmatory factor analysis (MGCFA). Within this framework, scalar invariance was achieved for depression in both longitudinal and cross-cohort models, enabling meaningful latent mean comparisons between cohorts. For anxiety, scalar invariance was supported within but not across cohorts, likely due to the limited number of conceptually matching items. 
These findings highlight that the success of harmonisation relies on the quality and conceptual alignment of available items. Our results demonstrate the feasibility of robust cross-cultural comparisons and provide a methodological template for future harmonisation efforts in global health research. Introduction Internationally harmonised health data have become increasingly essential for addressing key global health research questions [ 1 ]. In epidemiological research, meaningful comparisons across countries, time points, and populations rely on the availability of harmonised measures [ 2 , 3 ]. Harmonisation enables pooled analyses, enhances statistical power, and facilitates generalisability of findings. It also expands the range of contextual and demographic variation in exposures, such as socioeconomic status or public health policies, by integrating diverse samples from multiple settings [ 4 – 6 ]. However, in the field of mental health, where assessment tools often vary in content, structure, and cultural interpretation, creating comparable measures across studies remains a significant methodological challenge [ 7 ]. Several initiatives have published guidelines to promote standardised harmonisation practices [ 4 ] and successful examples are emerging [ 2 , 4 , 8 – 10 ]. Despite this progress, harmonising mental health data across heterogeneous cohorts remains challenging. In many cases, harmonisation is restricted to studies using identical instruments, such as the HRS International Family of Studies, which employs the CES-D across member cohorts [ 11 ]. When cohorts rely on different instruments, harmonisation strategies typically include retrospective item mapping, variable recoding, or the application of item response theory (IRT) [ 12 , 13 ]. 
These strategies, while valuable, face limitations: they may depend heavily on subjective decisions about item equivalence, impose strong assumptions such as unidimensionality, or fail to evaluate whether constructs are measured equivalently across time and groups [ 3 , 14 ]. To address these limitations, researchers have increasingly called for harmonisation frameworks that explicitly test measurement equivalence across diverse populations (Ebrahimi et al., 2024). Multi-group confirmatory factor analysis (MGCFA) offers such a framework, providing a rigorous psychometric method for assessing whether latent constructs are measured consistently across cohorts, cultures, and timepoints [ 15 ]. MGCFA involves fitting a series of increasingly constrained models — configural, metric, and scalar invariance — to evaluate whether observed differences reflect genuine variation in underlying constructs rather than artefacts of measurement bias or cultural interpretation. Importantly, MGCFA offers a scalable and statistically principled foundation for global harmonisation of health constructs, particularly when studies differ in language, format, or item selection (Putnick & Bornstein, 2016; Van de Schoot et al., 2015). Although prior work has examined measurement invariance either longitudinally or cross-sectionally, few harmonisation strategies address both dimensions simultaneously (Ebrahimi, 2024). Without evidence that a measure functions consistently across time and across populations, observed differences may conflate true change with artefacts of measurement [ 16 ]. Moreover, standardisation of scales between cohorts with different instruments is rarely attempted in a way that also explicitly assesses the similarity of item functioning. 
To our knowledge, no prior study has combined expert-driven item mapping, data-driven semantic matching, harmonisation of heterogeneous response formats, and formal invariance testing across both time and cohorts in a single methodological pipeline. The COVID-19 pandemic provides a unique natural experiment for testing harmonisation methods due to its global reach, profound psychological impact, and variability in public health responses. While extensive research has documented health effects during the pandemic [ 17 – 22 ], relatively little is known about cross-national differences in mental health trajectories during this period. Countries varied widely in the timing, duration, and stringency of containment policies [ 23 ], contributing to variability in mental health responses. Vulnerable groups, such as women, young people, and individuals from socioeconomically disadvantaged backgrounds, were disproportionately affected [ 24 – 29 ]. Moreover, mental health trends fluctuated over time, often peaking during initial lockdowns and attenuating thereafter [ 30 – 36 ]. These unique circumstances provide an opportunity to test whether harmonisation methods can yield valid, comparable mental health indicators across countries with different instruments, contexts, and pandemic responses. Objectives As part of the COVID Global Mental Health Consortium (CGMHC), we aimed to develop and apply a systematic harmonisation framework to generate depression and anxiety measures that are comparable both longitudinally and cross-culturally, despite differences in item content and response formats across instruments. Data came from two culturally and socioeconomically distinct cohorts: the English Longitudinal Study of Ageing (ELSA-UK) [ 37 ] and the Brazilian Longitudinal Study of Ageing (ELSA-Brasil) [ 38 , 39 ]. 
Using these studies, we implemented a multi-step process combining expert item mapping, AI-based semantic similarity matching, and multi-group confirmatory factor analysis (MGCFA) to evaluate measurement invariance across time and cohorts. Our objective was to test the feasibility of this framework by deriving harmonised latent factor scores for depression and anxiety, and to highlight both its potential and its limitations for future cross-national health research. Methods Data sources Participating cohorts This study draws on two longitudinal population-based cohorts: the ELSA-Brasil COVID-19 Mental Health Cohort study, and the English Longitudinal Study of Ageing (ELSA-UK). Despite their similar names, these studies differ substantially in measurement instruments, cultural context, and data collection procedures. The ELSA-Brasil cohort is a prospective study of 15,105 participants from six major Brazilian cities, established to investigate the clinical and sociodemographic determinants of chronic diseases and mortality in a middle-income country context [ 38 ]. Participants are university employees (active or retired), aged 35-74 years, and free from major neurocognitive disorders at enrolment. Nested within the larger study is ELSA Brasil COVID-19. During the pandemic, participants from the São Paulo site (n = 4,191) were invited to complete online assessments of mental health, performed in four waves: c1 (May–July 2020), c2 (July– September 2020), c3 (October–December 2020), and c4 (April–June 2021) [ 28 , 39 ]. A total of 2,691 individuals who completed at least one wave were included in the analysis. The ELSA-UK cohort, launched in 2002, follows a nationally representative sample of adults aged 50 and older living in England, tracking health, social, and economic changes in later life. 
For this study, we used data from wave 9 (2018–2019), two COVID-19 sub-studies (June–July 2020 and November–December 2020), and wave 10 (2021–2023), comprising 11,004 participants. For both cohorts, we selected waves from 2018–2023 that were available in the DPUK portal and contained comparable mental health measures, rather than aligning assessments by calendar date. This ensured that harmonisation focused on measurement equivalence, even though data collection periods differed between countries. Figure 1 depicts the timing of assessments included in these analyses. Ethical approval was obtained from relevant local committees for all waves and cohorts. Download figure Open in new tab Figure 1 Timing of data collection around the COVID-19 pandemic. Note. ELSA Brasil waves (N = 2,691): 1a) May–July 2020, 2a) July–September 2020, 3a) October–December 2020, and 4a) April–June 2021. ELSA UK waves (N = 11,004): 1b) 2018–2019, 2b) June–July 2020, 3b) November–December 2020, and 4b) 2021–2023. Setting All data were accessed and analysed via the Dementias Platform UK (DPUK) Data Portal, a secure, cloud-based research environment equipped with pre-installed statistical tools [ 40 ]. DPUK can only be accessed by approved users and data is never downloaded or analysed outside this environment. R version 4.1 was used for analysis. Analyses were conducted between December 2024 and May 2025. Mental health measures Mental health symptoms in the ELSA-Brasil COVID-19 study were assessed using the Brazilian version of the Depression, Anxiety, and Stress Scale – 21 items (DASS-21) [ 41 ]. This instrument was designed to measure symptoms related to depression, anxiety, and stress in the past week. It consists of 21 self-report items rated on a four-point Likert scale (0 = “strongly disagree” to 3 = “totally agree”), based on the frequency or intensity of experienced symptoms. 
The scale includes three subscales—depression, anxiety, and stress—each composed of seven items; scores can be computed separately for each domain or summed into a total score (range: 0–63), with higher values indicating greater symptom severity. Questions from DASS-21 were asked in Brazilian Portuguese and were translated to English for this study. Mental health in the ELSA UK cohort was assessed using the 8-item version of the Center for Epidemiologic Studies Depression Scale (CES-D-8) [ 42 ], the Generalized Anxiety Disorder Scale (GAD-7) [ 43 ], the Control, Autonomy, Self-Realization and Pleasure Scale (CASP-12) [ 44 ], and a four-item personal well-being questionnaire (ONS-4) [ 45 ]. The CES-D-8 is a brief self-report scale designed to measure the frequency of depressive symptoms over the past week. In contrast to the original CES-D-20 that is rated on a four-point Likert scale, the CES-D-8 uses a dichotomous (yes/no) format to reduce participant burden and confusion [ 46 ]. The GAD-7 aims to evaluate symptoms of generalised anxiety disorder over the past two weeks. It includes 7 items rated on a four-point Likert scale (0 = “not at all” to 3 = “nearly every day”). The total score ranges from 0 to 21, with higher scores reflecting more severe anxiety symptoms. The CASP-12 was developed to assess quality of life in older adults, focusing on control, autonomy, self-realisation, and pleasure. It comprises 12 items rated on a four-point Likert scale (1-4, Often-Never, with some items negatively worded and reversed when computing scores), where higher scores indicate better quality of life. The ONS-4 assesses personal well-being using four measures: Life satisfaction, Worthwhile (i.e., subjective sense of purpose and meaning in life), Happiness, and Anxiety. The four questions are on an 11-point scale from 0 to 10, where 0 is “not at all” and 10 is “completely”. 
Harmonisation pipeline To facilitate cross-cohort comparisons, a team of subject matter experts (PFZ, JL, DF, and AA) conducted a 5-step process of retrospective harmonisation of mental health measures, as shown in Figure 2 . The experts are clinical psychologists and/or hold doctorates in psychiatric epidemiology. Download figure Open in new tab Figure 2 Harmonisation pipeline. 1. Variable selection and symptom level alignment Data dictionaries from each cohort were reviewed to identify and extract all measures assessing depression and anxiety. Symptom categories for these conditions were defined based on DSM-5-TR criteria for major depressive disorder and generalised anxiety disorder. All relevant items were then compiled in a spreadsheet to allow for symptom-level and item-level alignment (see supplementary). Two independent teams (DF and PFZ; JL and AA) reviewed the individual items of each scale and mapped them to specific depression and anxiety symptom categories (e.g., “fatigue”, “low energy”, “worry”, or “irritability”). Items were considered similar if they used comparable wording (e.g., “I felt that life was meaningless” and “I feel that my life has meaning”) or if they were interpreted as referring to the same symptom (e.g., “I felt down-hearted and blue” and “I felt depressed”). Items that were judged to capture multiple symptoms (e.g., “depressed mood” and “anhedonia”) were coded under all applicable categories. Discrepancies between the two groups were resolved by a third independent rater with extensive clinical and psychometric experience (AC) (see supplementary). 2. Scale standardisation A major challenge in the harmonisation process was the variation in the number of response levels across instruments (binary or Likert-scale). 
For example, the CES-D (used in ELSA-UK) adopted a binary response format (e.g., 0 = “No”, 1 = “Much of the time during the past week”), while the DASS-21 (used in ELSA-Brasil) relied on a four-point Likert scale (from 0 = “Did not apply at all” to 3 = “Applied a lot, or most of the time”). To enable cross-cohort comparability, we adopted a conservative approach by aligning all items to the lowest common response structure, which led to all items being binarised. Specifically, CES-D items were retained in binary format (1 = “Much of the time”), whereas DASS-21 items were dichotomised using a predefined threshold ( Table 1 ). View this table: View inline View popup Download powerpoint Table 1 Item recoding thresholds for DASS, CES-D, CASP, GAD, and QOL questionnaires. 3. Item matching To fit MGCFA models, each cohort must have an equal number of items representing each construct (e.g., depression and anxiety), although the specific item wording can differ if the items are conceptually comparable. We used the Harmony online harmonisation tool [ 2 , 47 ] to assess semantic similarity between items across ELSA-UK and ELSA-Brasil. Harmony uses the Sentence Bidirectional Encoder Representations from Transformers (SBERT) model [ 48 ], a deep learning model optimised for natural language inference, to calculate the cosine similarity (Hcos) between each pair of items, that is, the cosine of the angle between the items’ embedding vectors in multidimensional space, with values closer to 1 indicating that the items are more semantically alike, and values closer to 0 indicating little semantic overlap. We did not apply a rigid minimum Hcos threshold for inclusion because high semantic similarity scores do not always guarantee conceptual equivalence in a clinical/psychometric context, and vice versa. 
Instead, the Harmony scores were used as a starting point to identify candidate matches, which were then reviewed by domain experts against DSM-5 symptom definitions and the broader measurement context (e.g., scale format, intended construct). This review allowed the inclusion of some lower-scoring pairs (e.g., Hcos ≈ 0.30 for “lack of motivation or apathy”), when the items were judged to represent the same symptom domain despite differences in wording or translation nuance. Conversely, some higher-scoring pairs were excluded if they were judged to differ conceptually (e.g., “not feeling worth much as a person” vs. “not feeling one’s life is worthwhile”). Where multiple candidate items were available for the same symptom category, the highest-scoring pair that also met expert conceptual criteria was selected to construct the harmonised scale for each cohort. 4. Measurement invariance testing After selecting harmonised items, we tested whether the depression and anxiety constructs were measured equivalently across cohorts and over time using multi-group confirmatory factor analysis (MGCFA). Measurement invariance evaluates whether the relationships between latent constructs (e.g., depression) and their observed indicators are comparable (i) longitudinally within each cohort and (ii) cross-sectionally between cohorts. Without establishing invariance, any observed differences between groups may reflect measurement artefacts rather than true differences in the latent construct [ 14 , 49 ]. MGCFA proceeds by fitting a sequence of increasingly constrained models: Configural invariance tests whether groups share the same underlying factor structure, that is, whether the same number of factors and the same set of items load on each factor. This serves as a baseline for further invariance testing. Metric invariance constrains the unstandardised factor loadings (metric coefficients) to be equal across groups. 
This ensures that a one-unit change in the latent variable corresponds to the same expected change in each observed item in all groups, establishing a common measurement scale. Standardised loadings may still differ because of differences in item or factor variances [ 50 , 51 ]. Achieving metric invariance permits valid comparisons of associations involving the latent construct (e.g., regressions). Scalar invariance adds equality constraints on item thresholds (for categorical indicators) or intercepts (for continuous indicators), ensuring that group differences in observed means reflect true differences in the latent construct rather than measurement bias. Scalar invariance is required to compare latent means [ 52 ]. Strict invariance further constrains residual variances to be equal across groups. While difficult to achieve, it is useful in certain contexts, such as when comparing factor means from dichotomous items [ 53 , 54 ]. When full invariance cannot be established, partial measurement invariance (PMI) can be applied by relaxing specific constraints (e.g., freeing thresholds for some items) while retaining others, provided that at least two invariant items anchor each factor [ 49 ]. The highest level of invariance attained dictates the types of valid cross-group comparisons: configural and metric invariance allow for comparing associations, whereas scalar (full or partial) invariance is required for comparing latent means (Yoon & Kim, 2014). 5. Harmonised measure extraction Once partial or full measurement invariance was established, we applied a two-step procedure to derive harmonised latent scores for subsequent analyses. These scores served as cross-cohort, longitudinally comparable measures of depression and anxiety. Factor scores estimate an individual’s standing on an underlying latent construct, typically on a standardised scale (mean = 0, SD = 1). 
When measurement invariance holds, these scores can be meaningfully compared across groups and time points. Scores at the distributional extremes indicate stronger associations with the latent factor, whereas values near zero reflect weaker symptom endorsement. By constraining measurement parameters across groups and waves during estimation, scores are placed on a shared metric, thereby supporting valid comparisons of mental health trajectories across methodologically distinct cohorts [ 55 ]. Statistical analyses First, we assessed longitudinal measurement invariance within each cohort separately: ELSA-Brasil (COVID waves 1–4) and ELSA-UK (wave 9, COVID 1, COVID 2, and wave 10). For each construct, we tested a sequence of increasingly constrained models: (1) a configural model to examine whether the same factor structure held across waves; (2) a metric model to assess equality of unstandardised factor loadings; and (3) a scalar model to evaluate equality of both loadings and item thresholds across time. We then tested cross-cohort measurement invariance using MGCFA on the wave-specific item sets retained from the longitudinal models. This ensured that the cross-cohort models were informed by the longitudinal invariance results. For each construct, configural, metric, and scalar models were fitted across the two cohorts using identical matched item sets. Cross-cohort models were estimated for each wave separately rather than collapsing or averaging across time points, allowing us to examine invariance within specific temporal contexts. Model fit was evaluated using standard CFA fit indices: RMSEA, CFI, TLI, and SRMR [ 56 ]. Following Hu and Bentler’s two-index approach, acceptable fit was defined as RMSEA ≤ .06; SRMR ≤ .08; CFI/TLI ≥ .95, with values of CFI/TLI > .90 considered adequate [ 57 ]. All models were estimated in lavaan (Rosseel, 2012) using the WLSMV estimator, which is suitable for binary indicators [ 58 ]. 
Analysis scripts are available at: https://github.com/cgmhc/cgmhc_harmonisation . Results Table 2 presents baseline descriptive statistics for each cohort. The mean age was 69 years in ELSA-UK and 61 years in ELSA-Brasil. Educational attainment was higher in ELSA-Brasil, with 95% of participants educated at high school level or above, compared with 70% in ELSA-UK. View this table: View inline View popup Download powerpoint Table 2 Descriptive statistics of demographics across ELSA-UK and ELSA-Brasil cohorts. Table 3 summarises the Harmony item-matching process. Five matching item pairs were identified for depression (depressed mood, anhedonia, worthlessness, hopelessness, apathy) and four for anxiety (worry, general anxiety, irritability, restlessness). Cosine similarity (Hcos) scores varied across matches, with final selections based on both semantic similarity and conceptual alignment with DSM-5 symptom domains. Cut-offs used to standardise and recode response categories across cohorts are shown in Table 1 . View this table: View inline View popup Table 3 Items matched across cohorts by semantic similarity using the Harmony tool. Table 4 shows longitudinal measurement invariance results within each cohort. In ELSA-Brasil, configural, metric, and scalar models for both depression and anxiety demonstrated excellent fit (RMSEA ≤ .06; CFI/TLI ≥ .95), supporting full scalar invariance across four COVID-19 waves. In ELSA-UK, all depression models showed acceptable fit, with only a small decline in fit indices from configural to scalar models. Anxiety models demonstrated excellent fit at all levels of invariance, supporting full scalar invariance across time. View this table: View inline View popup Table 4 Fit indices for longitudinal measurement invariance of depression and anxiety within ELSA-Brasil (N = 2,636) and ELSA-UK (N = 13,695) cohorts. Table 5 summarises cross-cohort invariance testing. 
For depression, acceptable fit was maintained from configural through scalar models, indicating that latent mean comparisons between ELSA-UK and ELSA-Brasil are valid. For anxiety, configural and metric models fit well, but the scalar model failed to converge, suggesting that latent mean comparisons for anxiety across cohorts may be biased by measurement non-equivalence. View this table: View inline View popup Download powerpoint Table 5 Fit indices for cross-cohort measurement invariance models of depression and anxiety in ELSA-Brasil (N = 2,636) and ELSA-UK (N = 13,695) cohorts. Discussion Our study illustrates the feasibility of retrospectively harmonising psychological symptom measures across culturally distinct ageing cohorts using a structured, theory-driven approach. We combined expert item mapping with AI-enabled semantic matching and MGCFA with the aim of harmonising cohorts with substantial differences in item content, response scales, and cultural context. In summary, we were able to derive harmonised depression and anxiety constructs from the ELSA-UK and ELSA-Brasil longitudinal studies. Crucially, we found that valid latent depression scores could be estimated comparably across time and between countries. Within each cohort, both depression and anxiety showed full longitudinal measurement invariance, indicating that the underlying constructs were measured equivalently over successive waves. Moreover, our cross-cohort analysis demonstrated scalar invariance for depression across the UK and Brazil samples, implying that group differences in the latent depression mean are meaningful despite the use of different instruments [ 59 ]. In practical terms, this means we can compare average depression levels between countries without confounding by measurement artefacts. These findings extend prior harmonisation work.
For example, the CLOSER collaboration successfully applied factor-analytic harmonisation to psychological distress measures across six British cohorts (McElroy et al., 2020). Our results build on this by pushing beyond a single-country context to show that a more stringent, generalisable framework can work across diverse populations. Similarly, studies in other domains have highlighted the value of advanced statistical harmonisation for cross-national comparisons. For instance, Vonk et al. [ 60 ] harmonised cognitive measures between the US and India, finding that linking items via CFA enabled valid cross-country comparisons of cognitive function. In the same spirit, Fong and Chan [ 61 ] demonstrated that a depressive symptom scale (EURO-D) achieved (approximate) scalar invariance across 27 European countries, supporting its use for cross-cultural comparisons. Together, these examples underscore that harmonisation methods can facilitate joint analysis of cohorts in different nations, allowing researchers to disentangle true population differences from measurement bias. A key step in our approach was the use of an AI-based semantic matching tool (Harmony) to aid item harmonisation. Prior work has shown that such NLP-based tools can greatly reduce the manual effort in harmonisation: they quickly calculate high-potential item matches, which experts can then verify [ 47 ]. In our case, Harmony helped identify candidate matches across English and Portuguese scales, streamlining the mapping process. Nevertheless, we emphasise that semantic similarity does not guarantee conceptual equivalence. For example, Harmony gave a moderate similarity score (cosine ≈ 0.52) to the pair “not feeling worth much as a person” and “not feeling one’s life is worthwhile.” Although both involve the notion of worth, an expert might note that the former taps self-esteem or self-concept, whereas the latter reflects a sense of purpose or meaning [ 62 ]. Such nuances are not distinguished by the embedding algorithm.
Therefore, we adopted a hybrid strategy: automated matching to flag likely pairs, followed by clinician review to adjudicate subtle conceptual differences. This balance allowed us to gain efficiency from the algorithm while preserving domain fidelity. It is worth noting that tools like Harmony depend on the quality of the input text and metadata: different translations, ambiguity in item wording, or context differences can affect semantic scores. In practice, researchers should interpret the AI suggestions in light of substantive knowledge, using them as a guide rather than a final determinant. Our harmonisation framework opens the door to robust cross-national analyses of mental health trajectories. With a common latent metric for depression, we can investigate how symptom courses diverge between countries, and what contextual factors drive those differences. For example, the UK and Brazil adopted markedly different COVID-19 policies, and harmonised data will allow a direct comparison of how lockdown stringency or social support measures affected depression trends [ 63 ]. More broadly, harmonised measures can link mental health trajectories to country-specific variables such as economic indicators, healthcare access, or public health interventions. Beyond the COVID-19 context, the same approach could be applied to other collective stressors. For example, natural or manmade disasters are known to have long-term psychological impacts, often compounding one another [ 64 , 65 ]. By harmonising data from different nations affected by successive events (e.g. wildfires, floods, economic crises), researchers could compare resilience and vulnerability factors internationally. In sum, cross-cohort harmonisation provides a powerful tool to ask global questions about mental health determinants, policy effects, and recovery patterns. We recognise several limitations with our approach. 
Although we achieved configural and metric invariance for anxiety, our cross-cohort scalar model for anxiety did not converge. This likely reflects the smaller number and greater heterogeneity of overlapping anxiety items available between ELSA-UK and ELSA-Brasil. Insufficient item coverage can preclude full invariance: with only a handful of anxiety items, the model lacked power to align thresholds across groups. This failure highlights a general point: successful harmonisation requires a reasonably large pool of well-matched items. When instruments are too dissimilar or sparse, one may only achieve partial invariance. In practice, partial invariance can sometimes be acceptable, with guidelines suggesting that having at least half of the indicators invariant may allow for meaningful comparisons [ 66 ]. In extreme cases, problematic items might need to be dropped or given less weight in scoring [ 67 ]. In our harmonisation, none of the depression items had to be omitted, but researchers should be prepared to iterate: dropping or adjusting items is a valid strategy when invariance is violated. Ultimately, our anxiety results illustrate the difficulty of retrospectively harmonising constructs that were measured inconsistently across studies. Furthermore, it is important to acknowledge the assumptions and trade-offs in our approach. Our framework is anchored in a priori decisions about which items captured the same symptom domains in both countries. These judgments assumed that, despite different wordings and answer formats, each paired item tapped an equivalent underlying construct. We also collapsed response categories into binary indicators to allow direct pooling; this simplification undoubtedly sacrificed some information in exchange for comparability [ 68 ]. Thus, the validity of our harmonised scores depends critically on the soundness of these initial decisions.
Future users of this framework should be transparent about such choices and consider sensitivity analyses (e.g., testing alternative coding schemes) when feasible. Alternative harmonisation strategies may also be considered to address some of the limitations of our approach, though each comes with its own trade-offs and assumptions. For example, when item-level data are unavailable, integrative data analysis (IDA) can pool raw scores across studies by rescaling or linking composite measures [ 69 ]. Similarly, test equating and linking methods [ 70 ] rely on anchor items or external calibration samples to align scales, potentially reducing the reliance on expert-driven dichotomisation. Meta-analytic approaches [ 71 ] provide another option when only study-level results are available, though these limit the ability to model within-person change or test measurement invariance directly. Additionally, multiple-group item response theory (IRT) provides an alternative harmonisation approach by testing differential item functioning (DIF) to evaluate whether items function equivalently across groups, thereby placing respondents on a common latent scale [ 72 , 73 ]. While more granular at the item level than MGCFA, it requires strong unidimensionality assumptions and larger sample sizes, and thus comes with its own trade-offs. In summary, our structured harmonisation framework proved efficient and robust for depression: despite different instruments and languages, we derived a common latent depression metric that behaves consistently. For our anxiety construct, we were able to detect that invariance was not met, thus avoiding forcibly harmonising conceptually inconsistent data. However, the challenge with converging anxiety underscores the limits of retrospective harmonisation, especially when instruments diverge. These issues would only multiply as more cohorts or countries enter the analysis, reducing the chance of finding matching items across all datasets. 
Therefore, future studies would benefit from prospective harmonisation efforts: agreeing on core symptom domains and standardised item sets (e.g. using DSM-based criteria or well-validated global mental health modules) at the design phase. Adopting common measurement frameworks or shared modules across international studies would minimise data loss and enhance comparability from the outset. Until then, our results show that careful, methodical harmonisation, combining expert knowledge, NLP tools, and factor analysis, can nevertheless yield meaningful cross-national mental health metrics. Data Availability All data produced in the present study are available via application to Dementia Platforms UK (DPUK). https://github.com/cgmhc/cgmhc_harmonisation Competing interests The authors declare no conflicts of interest. Acknowledgements This study was funded by the National Institute of Mental Health – 1RF1MH134638. PFZ has received funds from Fundação de Amparo à Pesquisa do Estado de São Paulo (FAPESP, grant number 2024/17532-0). DF has received funds from Wellcome Leap 1kD. ELSA-Brasil was funded by National Council for Scientific and Technological Development (CNPq) (Wave 1: BA 01 06 021200; ES 01 06 0300-00; MG 01 06 0278-00; 01 06 0071-00; RS 01 06 0010-00; SP 01 06 0115-00; FAPESP 2020/01476-2). We would also like to acknowledge the CGMHC cohort PI’s for their involvement in the project. Footnotes ↵ * joint first author ↵ ^ joint senior author References [1]. ↵ O’Connor M , Spry E , Patton G , Moreno-Betancur M , Arnup S , Downes M , et al. Better together: Advancing life course research through multi-cohort analytic approaches . Adv Life Course Res 2022 ; 53 : 100499 . doi: 10.1016/j.alcr.2022.100499 . OpenUrl CrossRef PubMed [2]. ↵ McElroy E , Wood T , Bond R , Mulvenna M , Shevlin M , Ploubidis GB , et al. Using natural language processing to facilitate the harmonisation of mental health questionnaires: a validation study using real-world data . 
BMC Psychiatry 2024 ; 24 : 530 . doi: 10.1186/s12888-024-05954-2 . OpenUrl CrossRef PubMed [3]. ↵ Black MH , Buitelaar J , Charman T , Ecker C , Gallagher L , Hens K , et al. Conceptual framework for data harmonisation in mental health using the International Classification of Functioning, Disability and Health: an example with the R2D2-MH consortium . BMJ Ment Health 2024 ; 27 . doi: 10.1136/bmjment-2024-301283 . OpenUrl Abstract / FREE Full Text [4]. ↵ Fortier I , Raina P , Van den Heuvel ER , Griffith LE , Craig C , Saliba M , et al. Maelstrom Research guidelines for rigorous retrospective data harmonization . Int J Epidemiol 2017 ; 46 : 103 – 5 . doi: 10.1093/ije/dyw075 . OpenUrl CrossRef PubMed [5]. Fortier I , Doiron D , Burton P , Raina P . Invited commentary: consolidating data harmonization--how to obtain quality and applicability? Am J Epidemiol 2011 ; 174 : 261 – 4 ; author reply 265–6. doi: 10.1093/aje/kwr194 . OpenUrl CrossRef PubMed Web of Science [6]. ↵ Bauermeister S , Phatak M , Sparks K , Sargent L , Griswold M , McHugh C , et al. Evaluating the harmonisation potential of diverse cohort datasets . Eur J Epidemiol 2023 ; 38 : 605 – 15 . doi: 10.1007/s10654-023-00997-3 . OpenUrl CrossRef PubMed [7]. ↵ Cheng C , Messerschmidt L , Bravo I , Waldbauer M , Bhavikatti R , Schenk C , et al. A general primer for data harmonization . Sci Data 2024 ; 11 : 152 . doi: 10.1038/s41597-024-02956-3 . OpenUrl CrossRef [8]. ↵ Wey TW , Doiron D , Wissa R , Fabre G , Motoc I , Noordzij JM , et al. Overview of retrospective data harmonisation in the MINDMAP project: process and results . J Epidemiol Community Health 2021 ; 75 : 433 – 41 . doi: 10.1136/jech-2020-214259 . OpenUrl Abstract / FREE Full Text [9]. Moltrecht B , Villanova do Amaral J , Salum GA , Miguel EC , Rohde LA , Ploubidis GB , et al. 
Social connection and its prospective association with adolescent internalising and externalising symptoms: an exploratory cross-country study using retrospective harmonisation . J Child Psychol Psychiatry 2024 . doi: 10.1111/jcpp.14080 . OpenUrl CrossRef [10]. ↵ Maelstrom Research . Maelstrom Research 2024 . http://www.maelstrom-research.org/ (accessed December 21, 2024). [11]. ↵ Lee J , Wilkens J , Phillips D , Knapp D , Nichols E . Recommended Contents for Maximizing Harmonization Potential for the Health and Retirement Study and its International Network of Studies . CESR-SCHAEFFER WORKING PAPER SERIES 2024 . [12]. ↵ Mc Elroy E , Villadsen A , Patalay P , Goodman A , Richards M , Northstone K , et al. Harmonisation and measurement properties of mental health measures in six British cohorts 2020 . [13]. ↵ Kennedy E , Vadlamani S , Lindsey HM , Lei P-W , Jo-Pugh M , Thompson PM , et al. Bridging big data in the ENIGMA consortium to combine non-equivalent cognitive measures . Sci Rep 2024 ; 14 : 24289 . doi: 10.1038/s41598-024-72968-x . OpenUrl CrossRef PubMed [14]. ↵ Putnick DL , Bornstein MH . Measurement invariance conventions and reporting: The state of the art and future directions for psychological research . Dev Rev 2016 ; 41 : 71 – 90 . doi: 10.1016/j.dr.2016.06.004 . OpenUrl CrossRef PubMed [15]. ↵ King K , Allum N , Stoneman P , Cernat A . Estimating measurement equivalence of the 12-item General Health Questionnaire across ethnic groups in the UK . Psychol Med 2023 ; 53 : 1778 – 86 . doi: 10.1017/S0033291721003408 . OpenUrl CrossRef PubMed [16]. ↵ Welzel C , Brunkert L , Kruse S , Inglehart RF . Non-invariance? An overstated problem with misconceived causes . Sociol Methods Res 2023 ; 52 : 1368 – 400 . doi: 10.1177/0049124121995521 . OpenUrl CrossRef [17]. ↵ Santomauro DF , Mantilla Herrera AM , Shadid J , Zheng P , Ashbaugh C , Pigott DM , et al. 
Global prevalence and burden of depressive and anxiety disorders in 204 countries and territories in 2020 due to the COVID-19 pandemic . Lancet 2021 ; 398 : 1700 – 12 . doi: 10.1016/S0140-6736(21)02143-7 . OpenUrl CrossRef PubMed [18]. Cénat JM , Farahi SMMM , Dalexis RD , Darius WP , Bekarkhanechi FM , Poisson H , et al. The global evolution of mental health problems during the COVID-19 pandemic: A systematic review and meta-analysis of longitudinal studies . J Affect Disord 2022 ; 315 : 70 – 95 . doi: 10.1016/j.jad.2022.07.011 . OpenUrl CrossRef PubMed [19]. Miao R , Liu C , Zhang J , Jin H . Impact of the COVID-19 pandemic on the mental health of children and adolescents: A systematic review and meta-analysis of longitudinal studies . J Affect Disord 2023 ; 340 : 914 – 22 . doi: 10.1016/j.jad.2023.08.070 . OpenUrl CrossRef PubMed [20]. Phiri P , Ramakrishnan R , Rathod S , Elliot K , Thayanandan T , Sandle N , et al. An evaluation of the mental health impact of SARS-CoV-2 on patients, general public and healthcare professionals: A systematic review and meta-analysis . EClinicalMedicine 2021 ; 34 : 100806 . doi: 10.1016/j.eclinm.2021.100806 . OpenUrl CrossRef PubMed [21]. Leung CMC , Ho MK , Bharwani AA , Cogo-Moreira H , Wang Y , Chow MSC , et al. Mental disorders following COVID-19 and other epidemics: a systematic review and meta-analysis . Transl Psychiatry 2022 ; 12 : 205 . doi: 10.1038/s41398-022-01946-6 . OpenUrl CrossRef PubMed [22]. ↵ Robinson E , Sutin AR , Daly M , Jones A . A systematic review and meta-analysis of longitudinal cohort studies comparing mental health before versus during the COVID-19 pandemic in 2020 . J Affect Disord 2022 ; 296 : 567 – 76 . doi: 10.1016/j.jad.2021.09.098 . OpenUrl CrossRef PubMed [23]. ↵ Hale T , Angrist N , Kira B , Petherick A , Phillips T , Webster S. Variation in government responses to COVID-19 2020 . [24]. ↵ Almeida M , Shrestha AD , Stojanac D , Miller LJ . 
The impact of the COVID-19 pandemic on women’s mental health . Arch Womens Ment Health 2020 ; 23 : 741 – 8 . doi: 10.1007/s00737-020-01092-2 . OpenUrl CrossRef PubMed [25]. Zuccolo PF , Casella CB , Fatori D , Shephard E , Sugaya L , Gurgel W , et al. Children and adolescents’ emotional problems during the COVID-19 pandemic in Brazil . Eur Child Adolesc Psychiatry 2022 . doi: 10.1007/s00787-022-02006-6 . OpenUrl CrossRef [26]. Hawrilenko M , Kroshus E , Tandon P , Christakis D . The Association Between School Closures and Child Mental Health During COVID-19 . JAMA Netw Open 2021 ; 4 : e2124092 . doi: 10.1001/jamanetworkopen.2021.24092 . OpenUrl CrossRef [27]. Hawks JL . Editorial: The Impact of the COVID-19 Pandemic on Racial Disparities in Pediatric Mental Health . J Am Acad Child Adolesc Psychiatry 2023 ; 62 : 398 – 9 . doi: 10.1016/j.jaac.2022.12.015 . OpenUrl CrossRef PubMed [28]. ↵ Fatori D , Suen P , Bacchi P , Afonso L , Klein I , Cavendish BA , et al. Trajectories of common mental disorders symptoms before and during the COVID-19 pandemic: findings from the ELSA-Brasil COVID-19 Mental Health Cohort . Soc Psychiatry Psychiatr Epidemiol 2022 ; 57 : 2445 – 55 . doi: 10.1007/s00127-022-02365-0 . OpenUrl CrossRef PubMed [29]. ↵ Lee YH , Liu Z , Fatori D , Bauermeister JR , Luh RA , Clark CR , et al. Association of everyday discrimination with depressive symptoms and suicidal ideation during the COVID-19 pandemic in the all of Us Research Program . JAMA Psychiatry 2022 ; 79 : 898 – 906 . doi: 10.1001/jamapsychiatry.2022.1973 . OpenUrl CrossRef PubMed [30]. ↵ Iob E , Frank P , Steptoe A , Fancourt D . Levels of Severity of Depressive Symptoms Among At-Risk Groups in the UK During the COVID-19 Pandemic . JAMA Netw Open 2020 ; 3 : e2026064 . doi: 10.1001/jamanetworkopen.2020.26064 . OpenUrl CrossRef [31]. Fancourt D , Steptoe A , Bu F . 
Trajectories of anxiety and depressive symptoms during enforced isolation due to COVID-19 in England: a longitudinal observational study . Lancet Psychiatry 2021 ; 8 : 141 – 9 . doi: 10.1016/S2215-0366(20)30482-X . OpenUrl CrossRef PubMed [32]. Bu F , Steptoe A , Fancourt D . Depressive and anxiety symptoms in adults during the COVID-19 pandemic in England: A panel data analysis over 2 years . PLoS Med 2023 ; 20 : e1004144 . doi: 10.1371/journal.pmed.1004144 . OpenUrl CrossRef PubMed [33]. Magnúsdóttir I , Lovik A , Unnarsdóttir AB , McCartney D , Ask H , Kõiv K , et al. Acute COVID-19 severity and mental health morbidity trajectories in patient populations of six nations: an observational study . Lancet Public Health 2022 ; 7 : e406 – 16 . doi: 10.1016/S2468-2667(22)00042-1 . OpenUrl CrossRef [34]. Ebrahimi OV , Bauer DJ , Hoffart A , Johnson SU . A critical period for pandemic adaptation: The evolution of depressive symptomatology in a representative sample of adults across a 17-month period during COVID-19 . J Psychopathol Clin Sci 2022 ; 131 : 881 – 94 . doi: 10.1037/abn0000786 . OpenUrl CrossRef PubMed [35]. Pierce M , McManus S , Hope H , Hotopf M , Ford T , Hatch SL , et al. Mental health responses to the COVID-19 pandemic: a latent class trajectory analysis using longitudinal UK data . Lancet Psychiatry 2021 ; 8 : 610 – 9 . doi: 10.1016/S2215-0366(21)00151-6 . OpenUrl CrossRef PubMed [36]. ↵ Batterham PJ , Calear AL , McCallum SM , Morse AR , Banfield M , Farrer LM , et al. Trajectories of depression and anxiety symptoms during the COVID 19 pandemic in a representative Australian adult cohort . Med J Aust 2021 : mja2.51043 . doi: 10.5694/mja2.51043 . OpenUrl CrossRef [37]. ↵ Steptoe A , Breeze E , Banks J , Nazroo J . Cohort profile: the English longitudinal study of ageing . Int J Epidemiol 2013 ; 42 : 1640 – 8 . doi: 10.1093/ije/dys168 . OpenUrl CrossRef PubMed Web of Science [38]. 
↵ Schmidt MI , Duncan BB , Mill JG , Lotufo PA , Chor D , Barreto SM , et al. Cohort Profile: Longitudinal Study of Adult Health (ELSA-Brasil) . Int J Epidemiol 2015 ; 44 : 68 – 75 . doi: 10.1093/ije/dyu027 . OpenUrl CrossRef PubMed [39]. ↵ Brunoni AR , Suen PJC , Bacchi PS , Razza LB , Klein I , Dos Santos LA , et al. Prevalence and risk factors of psychiatric symptoms and diagnoses before and during the COVID-19 pandemic: findings from the ELSA-Brasil COVID-19 mental health cohort . Psychol Med 2023 ; 53 : 446 – 57 . doi: 10.1017/S0033291721001719 . OpenUrl CrossRef [40]. ↵ Dementias Platform UK 2024 . https://www.dementiasplatform.uk/about-us/about-us (accessed September 27, 2024). [41]. ↵ Vignola RCB , Tucci AM . Adaptation and validation of the depression, anxiety and stress scale (DASS) to Brazilian Portuguese . J Affect Disord 2014 ; 155 : 104 – 9 . doi: 10.1016/j.jad.2013.10.031 . OpenUrl CrossRef PubMed [42]. ↵ O’Halloran AM , Kenny RA , King-Kallimanis BL . The latent factors of depression from the short forms of the CES-D are consistent, reliable and valid in community-living older adults . Eur Geriatr Med 2014 ; 5 : 97 – 102 . doi: 10.1016/j.eurger.2013.12.004 . OpenUrl CrossRef [43]. ↵ Spitzer RL , Kroenke K , Williams JBW , Löwe B . A brief measure for assessing generalized anxiety disorder: the GAD-7: The GAD-7 . Arch Intern Med 2006 ; 166 : 1092 – 7 . doi: 10.1001/archinte.166.10.1092 . OpenUrl CrossRef PubMed Web of Science [44]. ↵ Wiggins RD , Netuveli G , Hyde M , Higgs P , Blane D . The evaluation of a self-enumerated scale of quality of life (CASP-19) in the context of research on ageing: A combination of exploratory and confirmatory approaches . Soc Indic Res 2008 ; 89 : 61 – 77 . doi: 10.1007/s11205-007-9220-5 . OpenUrl CrossRef Web of Science [45]. ↵ Personal well-being user guidance 2025 . https://www.ons.gov.uk/peoplepopulationandcommunity/wellbeing/methodologies/personalwellbeingsurveyuserguide (accessed May 28, 2025). [46]. 
↵ Turvey CL , Wallace RB , Herzog R . A revised CES-D measure of depressive symptoms and a DSM-based measure of major depressive episodes in the elderly . Int Psychogeriatr 1999 ; 11 : 139 – 48 . doi: 10.1017/s1041610299005694 . OpenUrl CrossRef PubMed [47]. ↵ Ploubidis G , Moltrecht B , McElroy E , Hoffmann MS , Wood T . Harmony – A global platform for contextual harmonisation, translation and cooperation in mental health research . OSF ; 2022 . doi: 10.17605/OSF.IO/BCT6K . OpenUrl CrossRef [48]. ↵ Reimers N , Gurevych I. Sentence-BERT: Sentence embeddings using Siamese BERT-networks . arXiv [csCL] 2019 . [49]. ↵ Van De Schoot R , Schmidt P , De Beuckelaer A , Lek K , Zondervan-Zwijnenburg M . Editorial: Measurement invariance . Front Psychol 2015 ; 6 : 1064 . doi: 10.3389/fpsyg.2015.01064 . OpenUrl CrossRef [50]. ↵ Muthén B , Christoffersson A . Simultaneous factor analysis of dichotomous variables in several groups . Psychometrika 1981 ; 46 : 407 – 19 . doi: 10.1007/bf02293798 . OpenUrl CrossRef Web of Science [51]. ↵ Sass DA , Schmitt TA . Testing Measurement and Structural Invariance . Handbook of Quantitative Methods for Educational Research , Rotterdam : SensePublishers ; 2013 , p. 315 – 45 . doi: 10.1007/978-94-6209-404-8_15 . OpenUrl CrossRef [52]. ↵ Sass DA . Testing measurement invariance and comparing latent factor means within a confirmatory factor analysis framework . J Psychoeduc Assess 2011 ; 29 : 347 – 63 . doi: 10.1177/0734282911406661 . OpenUrl CrossRef Web of Science [53]. ↵ Meredith W , Teresi JA . An essay on measurement and factorial invariance . Med Care 2006 ; 44 : S69 – 77 . doi: 10.1097/01.mlr.0000245438.73837.89 . OpenUrl CrossRef PubMed Web of Science [54]. ↵ Tse WW-Y , Lai MHC , Zhang Y . Does strict invariance matter? Valid group mean comparisons with ordered-categorical items . Behav Res Methods 2024 ; 56 : 3117 – 39 . doi: 10.3758/s13428-023-02247-6 . OpenUrl CrossRef PubMed [55]. ↵ Lai MHC , Tse WW-Y . 
Are factor scores measurement invariant? Psychol Methods 2024 . doi: 10.1037/met0000658 . OpenUrl CrossRef [56]. ↵ Hu L-T , Bentler PM . Fit indices in covariance structure modeling: Sensitivity to underparameterized model misspecification . Psychol Methods 1998 ; 3 : 424 – 53 . doi: 10.1037/1082-989x.3.4.424 . OpenUrl CrossRef Web of Science [57]. ↵ Schreiber JB , Nora A , Stage FK , Barlow EA , King J . Reporting Structural Equation Modeling and Confirmatory Factor Analysis Results: A Review . The Journal of Educational Research 2006 ; 99 : 323 – 38 . doi: 10.3200/JOER.99.6.323-338 . OpenUrl CrossRef Web of Science [58]. ↵ Kiliç A , Uysal İ , Atar B . Comparison of confirmatory factor analysis estimation methods on binary data . Int J Assess Tools Educ 2020 ; 7 : 451 – 87 . doi: 10.21449/ijate.660353 . OpenUrl CrossRef [59]. ↵ Harry ML , Coley RY , Waring SC , Simon GE . Evaluating the cross-cultural measurement invariance of the PHQ-9 between American Indian/Alaska Native adults and diverse racial and ethnic groups . J Affect Disord Rep 2021 ; 4 : 100121 . doi: 10.1016/j.jadr.2021.100121 . OpenUrl CrossRef PubMed [60]. ↵ Vonk JMJ , Gross AL , Zammit AR , Bertola L , Avila JF , Jutten RJ , et al. Cross-national harmonization of cognitive measures across HRS HCAP (USA) and LASI-DAD (India) . PLoS One 2022 ; 17 : e0264166 . doi: 10.1371/journal.pone.0264166 . OpenUrl CrossRef PubMed [61]. ↵ Fong TCT , Chan RTH . Longitudinal measurement invariance of EURO-D scale across 27 countries in SHARE wave 8 and wave 9: A cross-country alignment study . J Affect Disord 2025 ; 390 : 119852 . doi: 10.1016/j.jad.2025.119852 . OpenUrl CrossRef [62]. ↵ Ammerman BA , Burke TA , Jacobucci R , McClure K . How we ask matters: The impact of question wording in single-item measurement of suicidal thoughts and behaviors . Prev Med 2021 ; 152 : 106472 . doi: 10.1016/j.ypmed.2021.106472 . OpenUrl CrossRef PubMed [63]. 
↵ Yoo JY , Dutra SVO , Fanfan D , Sniffen S , Wang H , Siddiqui J , et al. Comparative analysis of COVID-19 guidelines from six countries: a qualitative study on the US, China, South Korea, the UK, Brazil, and Haiti . BMC Public Health 2020 ; 20 : 1853 . doi: 10.1186/s12889-020-09924-7 . OpenUrl CrossRef PubMed [64]. ↵ Leppold C , Gibbs L , Block K , Reifels L , Quinn P . Public health implications of multiple disaster exposures . Lancet Public Health 2022 ; 7 : e274 – 86 . doi: 10.1016/S2468-2667(21)00255-3 . OpenUrl CrossRef [65]. ↵ Lian , J. , Anstey , K.J. , Eramudugolla , R. , Kim , S. , Draper , G . The PATH Through Life Project: The early impact of COVID-19 and lockdowns on health outcomes for a cohort of older adults. ACT Government, Canberra City , Australian Capital Territory 2023 . [66]. ↵ Little TD . Longitudinal Structural Equation Modeling (Methodology in the Social Sciences) . The Guilford Press ; 2024 . [67]. ↵ Zhao X , Coxe S , Sibley MH , Zulauf-McCurdy C , Pettit JW . Harmonizing depression measures across studies: A tutorial for data harmonization . Prev Sci 2023 ; 24 : 1569 – 80 . doi: 10.1007/s11121-022-01381-5 . OpenUrl CrossRef PubMed [68]. ↵ Purgato M , Barbui C . Dichotomizing rating scale scores in psychiatry: a bad idea? Epidemiol Psychiatr Sci 2013 ; 22 : 17 – 9 . doi: 10.1017/S2045796012000613 . OpenUrl CrossRef PubMed [69]. ↵ Curran PJ , Hussong AM . Integrative data analysis: the simultaneous analysis of multiple data sets . Psychol Methods 2009 ; 14 : 81 – 100 . doi: 10.1037/a0015914 . OpenUrl CrossRef PubMed Web of Science [70]. ↵ Kolen MJ , Brennan RL . Test equating, scaling, and linking: Methods and practices . 3rd ed . New York, NY : Springer ; 2014 . doi: 10.1007/978-1-4939-0317-7 . OpenUrl CrossRef [71]. ↵ Cheung MW-L , Cheung SF . Random-effects models for meta-analytic structural equation modeling: review, issues, and illustrations . Res Synth Methods 2016 ; 7 : 140 – 55 . doi: 10.1002/jrsm.1166 . 
OpenUrl CrossRef PubMed [72]. ↵ Muthén B , Asparouhov T . IRT studies of many groups: the alignment method . Front Psychol 2014 ; 5 : 978 . doi: 10.3389/fpsyg.2014.00978 . OpenUrl CrossRef [73]. ↵ Kim ES , Yoon M . Testing measurement invariance: A comparison of multiple-group categorical CFA and IRT . Struct Equ Modeling 2011 ; 18 : 212 – 28 . doi: 10.1080/10705511.2011.557337 . OpenUrl CrossRef Web of Science View the discussion thread. Back to top Previous Next Posted September 12, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A robust framework for harmonising health measures across international cohorts: Evidence from the COVID-19 pandemic Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A robust framework for harmonising health measures across international cohorts: Evidence from the COVID-19 pandemic James Lian , Pedro F Zuccolo , Omid V. Ebrahimi , Daniel Fatori , Abhaya Adlakha , Juan F. De La Hoz , Younga H. Lee , Adriana Carneiro , Isabela M. Benseñor , Paulo A. Lotufo , Alessandra C. Goulart , Justin D. Tubbs , Devon Watts , Yu Zhou , Lorenza Dall’Aglio , Mihael Cudic , Morgane Kuenzi , CGMHC Consortium , Ronald C. Kessler , Vikram Patel , André Brunoni , Jordan W. 
Smoller , Sarah Bauermeister medRxiv 2025.09.09.25335409; doi: https://doi.org/10.1101/2025.09.09.25335409 Share This Article: Copy Citation Tools A robust framework for harmonising health measures across international cohorts: Evidence from the COVID-19 pandemic James Lian , Pedro F Zuccolo , Omid V. Ebrahimi , Daniel Fatori , Abhaya Adlakha , Juan F. De La Hoz , Younga H. Lee , Adriana Carneiro , Isabela M. Benseñor , Paulo A. Lotufo , Alessandra C. Goulart , Justin D. Tubbs , Devon Watts , Yu Zhou , Lorenza Dall’Aglio , Mihael Cudic , Morgane Kuenzi , CGMHC Consortium , Ronald C. Kessler , Vikram Patel , André Brunoni , Jordan W. Smoller , Sarah Bauermeister medRxiv 2025.09.09.25335409; doi: https://doi.org/10.1101/2025.09.09.25335409 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Epidemiology Subject Areas All Articles Addiction Medicine (569) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1510) Epidemiology (15230) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6608) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15923) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6607) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3336) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (664) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary 
Care Research (712) Psychiatry and Clinical Psychology (5448) Public and Global Health (9237) Radiology and Imaging (2201) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a018a7852bc1ad07',t:'MTc3OTc1NTI0OQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.