Latent Transition Analysis for Longitudinal Studies of Post-Acute Infection Syndromes: A Multinational Investigation of Post-COVID-19 Condition

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 105,764 characters · extracted from preprint-html · click to expand
Latent Transition Analysis for Longitudinal Studies of Post-Acute Infection Syndromes: A Multinational Investigation of Post-COVID-19 Condition | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Latent Transition Analysis for Longitudinal Studies of Post-Acute Infection Syndromes: A Multinational Investigation of Post-COVID-19 Condition View ORCID Profile Roy Gusinow , Anna Górska , Lorenzo Maria Canziani , Iris Lopes-Rafegas , Carolina Alvarez Garavito , Adriana Tami , Elisa Gentilotti , Elisa Sicuri , Cédric Laouénan , Jade Ghosn , Aline-Marie Florence , Nadhem Lahfej , Fulvia Mazzaferri , Lidia Del Piccolo , Maddalena Giannella , Alice Toschi , Michela Di Chiara , Maria 
Giulia Caponcello , Zaira R. Palacios-Baena , Karin I. Wold , Elisa Rossi , Evelina Tacconelli , View ORCID Profile Jan Hasenauer , the ORCHESTRA study group doi: https://doi.org/10.1101/2025.09.01.25334817 Roy Gusinow 1 The Life and Medical Sciences Institute (LIMES), University of Bonn , Germany 2 Bonn Center for Mathematical Life Sciences, University of Bonn , Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Roy Gusinow Anna Górska 3 Division of Infectious Diseases, Department of Diagnostics and Public Health, University of Verona , Verona, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lorenzo Maria Canziani 3 Division of Infectious Diseases, Department of Diagnostics and Public Health, University of Verona , Verona, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Iris Lopes-Rafegas 4 ISGlobal , Barcelona, Spain 5 Facultat de Medicina i Ciències de la Salut, Universitat de Barcelona (UB) , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site Carolina Alvarez Garavito 1 The Life and Medical Sciences Institute (LIMES), University of Bonn , Germany 2 Bonn Center for Mathematical Life Sciences, University of Bonn , Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Adriana Tami 6 University of Groningen, University Medical Center Groningen, Department of Medical Microbiology and Infection Prevention , Groningen, The Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site Elisa Gentilotti 3 Division of Infectious Diseases, Department of Diagnostics and Public Health, University of Verona , Verona, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Elisa Sicuri 4 
ISGlobal , Barcelona, Spain 5 Facultat de Medicina i Ciències de la Salut, Universitat de Barcelona (UB) , Barcelona, Spain 7 LSE Health, London School of Economics and Political Science , London, UK 8 Centro de Investigação em Saúde de Manhiça (CISM) , Manhiça, Mozambique Find this author on Google Scholar Find this author on PubMed Search for this author on this site Cédric Laouénan 9 APHP Nord, Hôpital Bichat, Service des Maladies Infectieuses , Paris F75018, France 10 Université Paris Cité, INSERM UMR 1137 IAME, Paris F75018, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jade Ghosn 9 APHP Nord, Hôpital Bichat, Service des Maladies Infectieuses , Paris F75018, France 10 Université Paris Cité, INSERM UMR 1137 IAME, Paris F75018, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aline-Marie Florence 9 APHP Nord, Hôpital Bichat, Service des Maladies Infectieuses , Paris F75018, France 10 Université Paris Cité, INSERM UMR 1137 IAME, Paris F75018, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nadhem Lahfej 11 APHP Nord, Hôpital Bichat, Department of Epidemiology Biostatistics and Clinical Research , Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Fulvia Mazzaferri 3 Division of Infectious Diseases, Department of Diagnostics and Public Health, University of Verona , Verona, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lidia Del Piccolo 12 Department of Neurosciences, Biomedicine and Movement Sciences, University of Verona , Verona, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Maddalena Giannella 13 Department of Medical and Surgical Sciences, Alma Mater Studiorum, University of Bologna , Bologna, Italy 14 Infectious 
Diseases Unit, Department for Integrated Infectious Risk Management, IRCCS Azienda Ospedaliero-Universitaria di Bologna , Bologna, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alice Toschi 14 Infectious Diseases Unit, Department for Integrated Infectious Risk Management, IRCCS Azienda Ospedaliero-Universitaria di Bologna , Bologna, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Michela Di Chiara 14 Infectious Diseases Unit, Department for Integrated Infectious Risk Management, IRCCS Azienda Ospedaliero-Universitaria di Bologna , Bologna, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Maria Giulia Caponcello 15 Unidad Clínica de Enfermedades Infecciosas y Microbiología, Hospital Universitario Virgen Macarena; De- partamento de Medicina, Universidad de Sevilla, Instituto de Biomedicina de Sevilla (IBiS)/CSIC , Seville, Spain 16 CIBERINFEC, Instituto de Salud Carlos III , Madrid, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zaira R. Palacios-Baena 15 Unidad Clínica de Enfermedades Infecciosas y Microbiología, Hospital Universitario Virgen Macarena; De- partamento de Medicina, Universidad de Sevilla, Instituto de Biomedicina de Sevilla (IBiS)/CSIC , Seville, Spain 16 CIBERINFEC, Instituto de Salud Carlos III , Madrid, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site Karin I. 
Wold 6 University of Groningen, University Medical Center Groningen, Department of Medical Microbiology and Infection Prevention , Groningen, The Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site Elisa Rossi 17 CINECA Interuniversity Consortium , Bologna, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Evelina Tacconelli 3 Division of Infectious Diseases, Department of Diagnostics and Public Health, University of Verona , Verona, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jan Hasenauer 1 The Life and Medical Sciences Institute (LIMES), University of Bonn , Germany 2 Bonn Center for Mathematical Life Sciences, University of Bonn , Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jan Hasenauer For correspondence: jan.hasenauer{at}uni-bonn.de Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Post-Acute Infectious Syndromes (PAISs) refer to the symptoms persisting for months after an initial infection. PAISs are multi-systemic conditions, and thus clinical research studies often collect rich, multi-modal datasets – including demographic information, patients’ comorbidities, acute-phase presentation, and observations over extended periods that can include clinical data, patient-reported outcomes (PROMs) or experimental data. Yet, the complexity of the datasets and lack of the precise clinical case definition pose difficulties in comprehensive and meaningful analyses of such datasets. Studies on Post-COVID-19 condition (PCC) represent the most prominent example of this challenge. In this work, we outline a flexible and interpretable framework for modelling and analysing data from longitudinal studies of PAIS using Latent Transition Analysis (LTA). 
The framework enables the identification of distinct disease phenotypes and the patient-level analysis of transitions between them, without relying on predefined clinical categorisations. Furthermore, we introduce a method for incorporating covariate information, which enables exploration of how patient characteristics influence disease trajectories. We apply this methodology to the ORCHESTRA dataset, composed of individuals affected by SARS-CoV-2 infection from multiple European centres. Longitudinal data were collected at SARS-CoV-2 infection, and at 6, 12, 18, and 24 months of follow-up from patients in France, Italy, Spain, and the Netherlands. In total, 5,094 patients were clinically assessed, reporting symptoms and quality-of-life information. Our model identifies distinct PCC phenotypes and uncovers complex transition patterns among them. We found that advanced age and pre-existing chronic respiratory conditions significantly increased the likelihood of transitioning to more severe PCC states in later timepoints. Conversely, male sex and exposure to later SARS-CoV-2 variants were associated with a higher probability of complete recovery. Our study highlights how LTA can enhance the interpretability of complex, time-resolved clinical data, support personalized patient monitoring and management, and accelerate therapeutic development — not only for PCC but also for other PAISs in both pandemic and inter-pandemic settings. Introduction Understanding and managing the long-term health consequences of acute diseases is a critical component of public health. A number of infectious pathogens can cause Post-Acute Infection Syndromes (PAIS), a source of unexplained chronic disability with diverse and often severe symptoms lasting for months after the resolution of the initial acute infection [ 1 ]. Longitudinal studies of PAIS are instrumental in identifying lasting effects, tracking symptom trajectories, and guiding interventions. 
These syndromes present substantial challenges to both research and clinical care due to their protracted nature, heterogeneity, and often unclear pathophysiology. One impactful example of PAIS is Post-COVID Condition (PCC), also known as Long COVID or Post-COVID-19 Syndrome. According to the World Health Organization, PCC is defined as the presence of symptoms lasting more than two months following a confirmed or probable SARS-CoV-2 infection, without an alternative diagnosis [ 2 ]. It is estimated that more than 65 million people globally are affected by PCC [ 3 ], making it a major health and societal burden. Current efforts to delineate PCC phenotypes rely on symptom clustering techniques, typically applied to cross-sectional data. These studies have consistently identified symptom-based phenotypes such as Chronic Fatigue, Respiratory, Pain, Neurosensorial, and Gastrointestinal clusters [ 4 , 5 ]. A complementary line of research has employed severity-based clustering, grouping patients by symptom burden rather than type [ 6 , 7 ]. Incorporation of Health-Related Quality of Life (HRQoL) measures has further highlighted the substantial impact of PCC on daily functioning [ 8 , 9 ]. Risk factor analyses suggest higher susceptibility among women, unvaccinated individuals, patients with diabetes or chronic respiratory disease, and those with severe acute illness [ 10 ]. However, these clustering approaches face important limitations. Clusters are often defined independently at each timepoint as the condition can present differently in patients from one timepoint to the next, making it difficult to track the progression of disease states or to compare clusters across assessments. Symptom profiles can fluctuate, resulting in false negative observations, variable cluster composition and inconsistent phenotype definitions [ 11 ]. 
While unsupervised learning methods such as hierarchical clustering help address some of this subjectivity [ 12 , 13 ], they still fail to capture the temporal dynamics of patient trajectories or the influence of individual-level characteristics over time. To overcome these limitations, researchers have turned to Latent Transition Analysis (LTA) — a type of Hidden Markov Model (HMM) designed to infer unobservable (latent) states and estimate transition probabilities between any two states over successive timepoints [ 14 ]. LTA is particularly well-suited for studying PAISs, where symptom presentation is presumptively dynamic and the underlying disease structure is not directly observable. LTA has already been applied to PCC in several recent studies. It was used to investigate trajectories of quality-of-life impairments [ 15 ], explore psychological distress patterns [ 16 ], and study the evolution of pulmonary function after infection [ 17 ]. These efforts demonstrated the utility of LTA, yet they were constrained by modest cohort sizes (largest: 1467 patients) and a small number of latent states (5 states), limited observational modalities (only binary or continuous data), and restricted model parameterisations (number of covariates), reducing their capacity to generalize findings or support individual-level predictions. In this study, we demonstrate how LTA, capable of considering a large number of latent states and a set of covariates and estimated in an efficient manner, can be used for the extensive modelling and model-based assessment of data from PAIS studies. The flexible framework enables the data-driven identification of latent disease phenotypes, the modelling of temporal disease progression, and the analysis of covariate-dependent transition dynamics, without relying on predefined clinical categorisations. 
To support personalised patient-level insights, we further describe a filtering-based approach that improves the prediction accuracy of individual symptom trajectories. While the methodology is broadly applicable to a wide range of chronic and post-infectious conditions, we showcase it by studying PCC using a partially published dataset collected within the ORCHESTRA project ( Figure 2 ). This rich, multinational cohort includes longitudinal clinical, symptom, and quality-of-life data from over 5,000 microbiologically confirmed SARS-CoV-2 patients across four countries, with follow-up extending up to 24 months. Download figure Open in new tab Figure 1. Illustration of Latent Transition Analysis for Longitudinal Studies of Post-Acute Infection Syndromes, using the PCC ORCHESTRA Study as an Example. (A) Data collection process which provides information about the health state for each patient (recorded symptoms and HRQoL measurements) at different timepoints. (B) Parameterisation of model candidates and model selection. (C) Interpretation of model states (using symptoms and HRQoL scores) and (D) relationship between states for chosen 7-State Model. (E) Model-based quantification of the impact of patient covariates and their relation to observed symptoms. Percentages next to the coloured circles represent the conditional probabilities of the identified states given patient covariates. (F) Prediction of trajectories of health states from the acute infection. As new observations are recorded throughout, trajectories are updated. Download figure Open in new tab Figure 2. ORCHESTRA Long-Term Sequelae Cohort Dataset. (A) Prospective cohorts included in ORCHESTRA study; Servicio Andaluz de Salud (SAS), INSERM, University Medical Center Groningen (UMCG), University of Verona (UNIVR) and University of Bologna (UNIBO). (B) Stacked SARS-CoV-2 infection timeline of patients, separated by level of care. 
(C) Number of enrolled patients vaccinated for SARS-CoV-2 before acute infection over time. (D) Age and sex distributions. (E) Distribution of COVID-19 pharmacological treatment, including immunomodulators (i.e., tocilizumab, ruxolitinib, adalimumab, baricitinib, or tacrolimus) and monoclonal antibodies (i.e., bamlanivimab, bamlanivimab plus etesevimab, and casirivimab plus imdevimab) measured at the acute phase. (F) SF-36 mental and physical component scores across the recorded months after acute infection. Black dashed line at 50 indicates the norm-based scoring mean of the 1998 US general population (mean=50, SD=10). (G) Stacked bar charts of the 9 most commonly reported PCC-related symptoms over time. Total number of available patients is shown at the top of each bar, with percent occurrence of symptoms shown within each bar. In total, the dataset contained observations for 5094 individuals ( Supplementary Table 7 ). The LTA framework is tailored for heterogeneous longitudinal data and for the incorporation of demographic and clinical covariates into the model’s transition structure. It allows for the identification of robust PCC phenotypes and transition pathways, as well as the interpretation of key risk and protective factors governing disease progression and recovery. Our approach provides a transferable template for investigating other PAISs, offering both mechanistic insight into the hidden patient health progressions and practical utility of risk and protective factors for clinical monitoring and therapeutic development. Results A Flexible Latent Transition Analysis Framework for Long-Term Sequelae Studies To provide a comprehensive and scalable assessment of PAISs, we propose a flexible and interpretable LTA framework. 
The framework builds on established Hidden Markov Modelling implementations [ 18 , 19 ], addressing the specific challenges of longitudinal cohort studies with heterogeneous data types (i.e., binary and continuous), as well as incomplete observations that arise over long follow-up periods. A key innovation of our approach lies in a parsimonious parameterisation of the covariate-dependent transition structure. Rather than modelling the full matrix of covariate effects separately for each transition probability, which would lead to an unmanageable number of parameters, patient characteristics are projected onto a low-dimensional scalar representation that modulates the entire transition matrix in an interpretable and computationally efficient way. This significantly reduces overfitting risk while preserving individual-level heterogeneity. We conducted a comprehensive simulation-based validation study to evaluate the robustness of parameter recovery, predictive performance, and covariate interpretability across a range of controlled scenarios. These experiments confirm the framework’s reliability under varying data sparsity, symptom noise, and covariate effects, providing confidence in its application to real-world longitudinal data. In addition to characterising state transitions, our framework supports state filtering for individual patients, making use of prior symptom and HRQoL history to improve future symptom and HRQoL predictions. By applying a recursive state update procedure, we dynamically refine patient-level latent state probabilities at each timepoint and generate predictions for both binary and continuous variables. Full methodological details, including model specification, estimation procedures, and evaluation metrics, are provided in the Methods section. The full implementation of the framework is provided, including model fitting, prediction, and visualisation routines, available on Github and archived on Zenodo . 
Clinical Characterisation of the ORCHESTRA Long-Term Sequelae Cohort In order to assess the proposed framework for LTA in PAISs, we analysed data from the ORCHESTRA Long-Term Sequelae Cohort. This cohort of individuals with confirmed SARS-CoV-2 infection was previously established as part of the EU-funded ORCHESTRA project ([ 20 ]) and comprises six prospective subcohorts from 56 centres across five countries ( Figure 2A ). The cohort included adult patients ( > 14 years old) with laboratory-confirmed SARS-CoV-2 infection. At baseline, data collection included demographic characteristics, comorbidities, clinical severity, ICU admission, and early antiviral and monoclonal antibody treatment ( Figure 2B–E , Supplementary Table 7 ). HRQoL measures through the Short Form Health Survey 36 (SF-36) were collected during each follow-up. Poor HRQoL was defined as a score below 50 ( Figure 2F , Supplementary Table 9 ). In addition, we focus on 9 PCC-related symptoms (ageusia, anosmia, arthralgia, cough, dyspnoea, fatigue, headache, memory loss and myalgia) recorded at each timepoint ( Figure 2G and Supplementary Table 8 ). In total, the dataset contained observations from 5094 individuals, all of whom had data at acute phase and at least one follow-up ( Supplementary Table 7 ). The dataset includes 1796 patients for whom analysis of the 12-month follow-up data was published before [ 21 ], while data from the 18- and 24-month visits are presented here for the first time. The assessment of the newly collected data revealed a minor drop in patient participation from the 12- to 18-month follow-up (from 2,495 to 2,120), while 628 patients completed a 24-month visit ( Supplementary Figure 1 ). A total of 419 patients were assessed at all five timepoints, with most participants having assessments at both acute infection and the first follow-up at 6 months. 
Notably, there is a non-negligible number of patients with non-standard follow-up patterns, including a subgroup of 274 individuals assessed at 18 months who had data only from the acute stage prior to that assessed time period. Data-informed Latent Transition Analysis Identifies Initial COVID-19 Infection and Post-COVID Syndrome States To comprehensively characterise the disease dynamics and individual symptom trajectories associated with PCC based on the ORCHESTRA Long-Term Sequelae Cohort, we applied our LTA framework, using a fully connected HMM allowing for transitions between all pairs of latent health states, without imposing any a priori structure on the symptom profiles or progression paths ( Figure 3 ). Each latent state is defined by a set of emission probabilities for binary symptoms and expected values for continuous HRQoL scores, enabling joint modelling of discrete symptom and continuous HRQoL data. The model also captures patient-level heterogeneity through covariate-dependent transition dynamics, with state identities and trajectories inferred directly from the data via maximum likelihood estimation. Download figure Open in new tab Figure 3. LTA Model for ORCHESTRA Cohort. (A) Model with 7 latent states: the intensity of the arrow colour indicates the average transition probability over the patients in the cohort. Each state has an associated emission probability to each of the nine PCC-related symptoms, as well as Gaussian distributions describing physical (left) and mental (right) HRQoL scores. (B) Heatmap showing the probability of reporting one of the 9 PCC-related symptoms given the state with its associated Gaussian distributions of the SF-36 HRQoL physical and mental component scores. (C) Mean probability of state at each timepoint. (D) Initial and transition state probabilities for three example patients; men infected during the fourth COVID-19 wave, men older than 60, and women over 60. 
The assessment of the models with 4 to 8 latent states revealed a high degree of structural similarity ( Supplementary Figure 3 ), as well as similar covariate impact ( Supplementary Figure 4 ). Model selection reveals that the 7-state model is most appropriate ( Supplementary Figure 2 ). This model provided clinical interpretations for each state. States 1-2 were exclusively observed during the acute phase of infection, while States 3-7 were predominantly observed in follow-up timepoints ( Figure 3A ). At the extremes, we identified the Healthy state (State 3) and the Severe Symptom state (State 7) based on emission probabilities ( Figure 3B ). The Healthy state is characterised by very low symptom probabilities ( < 15% for fatigue, < 10% for all other symptoms), indicating complete recovery from the initial COVID-19 infection. This interpretation is further supported by notably high mean SF-36 scores for both physical (54.31) and mental (57.62) components, reflecting above-average HRQoL. In contrast, the Severe Symptom state displays consistently high probabilities across all symptoms and notably lower SF-36 scores, indicative of significantly compromised HRQoL. In addition to these extreme states, the LTA reveals the Sensorial PCC state (State 4), the Fatigue PCC state (State 6), and the Respiratory PCC state (State 5) ( Figure 3B ). The Sensorial PCC state is specifically characterised by elevated probabilities of anosmia (81%) and ageusia (89%), second only to the Severe Symptom state. Notably, this Sensorial PCC state predominantly impacts physical HRQoL but has minimal influence on mental health scores. Fatigue PCC is marked by elevated probabilities of fatigue (85%), along with intermediate probabilities (30–60%) for all other symptoms except anosmia and ageusia. 
The Respiratory PCC state shows slightly reduced physical and mental HRQoL scores (46.86 and 47.72, respectively) compared to the Healthy state, with notable increases in symptom probabilities for fatigue (from 6% to 31%) and dyspnoea (from 14% to 43%). The analysis of the mean initial state distribution and transition dynamics reveals that two states exclusively emerge during the acute infection phase ( Figure 3C ). The Acute Respiratory state (State 1) and Acute Moderate state (State 2) are characterised by high probabilities of cough (74% and 77%, respectively), dyspnoea (57% and 60%), fatigue (54% and 81%), and headache (16% and 48%), with Acute Moderate additionally characterised by arthralgia (49%) and myalgia (91%). The HRQoL distributions associated with both states have extremely large uncertainty in their means and standard deviations, which is expected, given that no HRQoL data were collected at the acute timepoint ( Supplementary Tables 2 and 3 ). From 6 months onwards, the Healthy and Respiratory PCC phenotypes become dominant, with Sensorial PCC, Fatigue PCC, and Severe Symptom phenotypes having lower but non-negligible probabilities (5.97%, 17.03%, and 2.32%, respectively, at 24 months). Generally, patients tend to enter the Healthy state, indicating recovery, as time progresses (from 32.6% at 6 months to 39.2% at 24 months), mainly due to decreases in the probability of the Respiratory PCC state (from 41.5% at 6 months to 35.5% at 24 months) ( Supplementary Table 6 ). LTA reveals that patient characteristics influence initial state and state transitions in distinct ways. Across three patient profiles — chosen to cover the spectrum of disease presentation — the Acute Respiratory and Acute Moderate states, as well as the Severe Symptom state, are notably prevalent at the initial infection stage. This highlights the considerable number of patients experiencing extensive symptoms ( Figure 3D ). 
The transition probabilities indicate that patients have a high likelihood of remaining within the Healthy, Respiratory PCC, Sensorial PCC, or Fatigue PCC states, suggesting that patients who transition into these states generally remain stable over subsequent follow-up periods. Given the initial prominence of the Acute Respiratory, Acute Moderate and Severe Symptom states, the likelihood of transitioning out of these states during follow-ups is particularly important as subsequent states typically exhibit high persistence. For instance, men infected during the fourth wave show a large probability of rapidly transitioning to the Healthy state, despite initially having a higher probability of starting in the Severe Symptom state (28%). Conversely, women older than 60 years show higher initial probabilities for the Acute Respiratory and Acute Moderate states (47% and 24%, respectively) and tend to move into Respiratory PCC and Fatigue PCC states during later follow-ups. These patients also exhibit higher probabilities of remaining within these PCC states, suggesting a comparatively lower chance of recovery. Our analysis revealed that LTA based on flexible HMMs allows for the data-driven definition of Acute Infection and PCC states. Model selection allows for robust assessment of the number of distinct states, and subsequent model simulations can be used to identify transition patterns between states. Cohort Level Dynamics Captured across Follow-up Symptoms and Health-Related Quality of Life Metrics In order to assess the model’s ability to accurately describe the observed cohort data, we conducted forward simulation using the patient characteristics of the ORCHESTRA dataset (see Forward Simulations). Since the true state of each patient is unobserved, our evaluation focuses on the distribution of symptoms, the HRQoL values, and their correlation structure. 
The comparison of the model simulations with real observed data for the nine reported symptoms reveals an overall good agreement. For most symptoms and timepoints, the range of model simulations overlaps with 2 × standard error of the mean derived from the observed data ( Figure 4A ). A discrepancy occurs in the predicted prevalence of dyspnoea at 24 months, where simulations typically forecast a decrease, while the observed data indicate an increase. This anomaly could reflect potential sampling bias at the 24-month follow-up, which is also evident from the broader confidence intervals. Similarly, the memory loss symptom exhibits a minor increase from 12 to 18 months but a slight decrease at 24-months, suggesting potential fluctuations or biases in symptom reporting over extended follow-up periods. Download figure Open in new tab Figure 4. Comparison of Model and Data at the Cohort Level. (A) Percentage of the cohort reporting a specific symptom at Months 0 to 24. The observed dataset is indicated in orange (bold dashed line: mean; shaded area: 2 × standard error of the mean) and the results of 1000 model simulations are indicated in purple (bold line: mean; thin lines: individual simulations). (B) Reported SF-36 physical and mental component scores (orange) and distribution from model simulations (purple) for timepoints after acute infection. (C) Dimension reduction for the vectors containing symptom information and SF-36 physical and mental component scores. The reported datapoints (orange) and simulation results by the model (purple) are indicated in the space of the first two principal components of each patient. The assessment of the distribution of the HRQoL scores revealed strong agreement between averaged model simulations and observed data across all timepoints ( Figure 4B ). 
Despite the complex structure of the mental health scores, characterised by a peak around 60 and a long tail extending toward lower values, the simulations are able to accurately capture this dynamic. Indeed, the model predicts that distinct segments of these distributions correspond to specific latent states ( Supplementary Figure 5 ). The pronounced clustering around 60, accompanied by a secondary cluster around 45, is driven primarily by patients transitioning into Healthy and Respiratory PCC states. Conversely, patients classified within the Respiratory PCC state commonly report scores around 50 for both the mental and physical components, indicating a stable but mildly impacted HRQoL. Over time, patients do recover in both physical (51% of patients with a score above 50 at Month 6 to 56% at Month 24) and mental components (60% of patients > 50 at Month 6 to 65% at Month 24) of the SF-36. Inspection of the Principal Component Analysis (PCA) plots of the cohort data shows that the projected datapoints are not easily separable between timepoints, implying that it is difficult to discern underlying clusters ( Figure 4C ). Yet, the LTA provides a model with symptom observations which map to the first two principal components. These are similar in structure to the real observation, displaying the ability to capture underlying latent mechanisms which would normally not be revealed by a standard factor analysis [ 22 – 24 ]. More so, the PCA loading values of the first principal component also match closely between the dataset and simulations, implying the impact of symptoms on the underlying patterns is also similar ( Supplementary Figure 6 ). In addition, comparisons of the Pearson correlation coefficients between symptoms and HRQoL variables produced by the dataset and averaged simulation also yielded similar results ( Supplementary Figure 7 ). 
Our assessment of the model at the cohort level revealed a good agreement between simulations and observed data, as the individual variables, the dimensional reduction results, and the correlation structure between observed variables are properly captured. LTA Model Confirms Known Risk Factors for Severe Post-COVID Condition As the model accurately describes the cohort-level dynamics, we next assessed the role of patient characteristics. While it is known that patient characteristics affect the chance of developing PCC after the acute infection [ 10 ], there is limited information on how they impact specific patient trajectories. LTA allows for a dependency of initial state and transition probabilities on patient characteristics. Specifically, we use a hierarchical model, which begins by using estimated weight vectors $\boldsymbol{\rho}_{\text{initial}}$ and $\boldsymbol{\rho}_{\text{trans}}$ to model the dependence of intermediate variables on the characteristics of each patient. These intermediate variables, $r_{\text{initial}}$ and $r_{\text{trans}}$, then modulate the corresponding initial state and transition probabilities which impact the patient trajectories. This hierarchical approach reduces the number of parameters compared to a model accounting for a direct dependence of initial state and transition probabilities on the patient characteristics. By inspecting how given patient characteristics change the probability of being in specific states, we uncover the role that varying these intermediate values plays in ultimately driving patient state trajectories throughout the follow-up. The meaning of directionality for the transition probability covariates is interpreted by inspecting the steady state distribution of the corresponding transition matrix constructed ( Figure 5B ). Worsening, defined as a decreased probability of being in the Healthy state and an increased probability of being in the Respiratory or Fatigue PCC states, is correlated with female sex, age 41-60 and age > 60, corticosteroid therapy, and chronic respiratory disease ( Figure 5A ). 
Infections at later waves are associated with higher probabilities of entering the Healthy state. The state distribution shown corresponds to the distribution of states in the distant future, which is shown to be similar in pattern to that at the 24-month follow-up ( Supplementary Figure 10 ). Download figure Open in new tab Figure 5. Patient Trajectories and their Dependence on Covariates. (A) Forest plot for dependence of transition probabilities on the covariates. The point estimates and the 95% confidence band are indicated. Coloured points are indicative of statistically significant (p-value ≤ 0.05) covariates leading to improving states if green (increased probability of Healthy state), while red indicates worsening states (increased probability of Respiratory/Fatigue PCC states). (B) Probability of state occurrence of patients with varying characteristics, with vertical lines representing an example male patient, aged 15 to 30 and infected during the first wave, to indicate their respective probability distributions compared to a female with the same characteristics. (C) The 20 most frequently observed patient trajectories with corresponding percentage frequency after 1000 simulations using the patients of the real ORCHESTRA dataset. (D) Predicted probability of being in the Healthy State of various patients with 95% confidence bands computed from simulations. (E) Predicted time for patients to transition from any of the PCC states to the Healthy state after 1000 simulations. The model shows that individual covariates do not meaningfully change the probabilities of being in a particular state at the acute stage of infection, as all covariate estimates have low statistical significance with large confidence bands ( Supplementary Figure 11 ). However, the model is able to identify a distinct pattern of how covariates impact the stationary state distribution, despite this not being explicitly determined by the LTA model. 
There is a clear progression of improving stationary state distributions, as the Fatigue PCC state dominates initially for negative values of $r_{\text{trans}}$, with a higher probability of entering the Healthy state as the value increases ( Figure 5B ). Our model aligns with previous key findings on PCC [ 21 ]; that is, being female (0.41 ± 0.36), older age (0.40 ± 0.18 and 0.5 ± 0.45 for ages 41-60 and > 60, respectively) and chronic respiratory disease (0.33 ± 0.29) lead to an increased risk of developing PCC. The third and fourth waves (-0.35 ± 0.32 and -0.35 ± 0.33, respectively) had significantly better recovery, compared to the first wave ( Figure 5A and Supplementary Table 5 ). The significance of corticosteroids used to treat COVID-19 at the acute infection (0.25 ± 0.23), indicating worsening health, adds to the growing complexity of the impact of the treatment for PCC. Indeed, as corticosteroid use was limited to patients with respiratory failure [ 25 ], the interpretability of the impact in the final model remains limited. To understand the actual phenotype trajectories of patients, rather than mean state pathways, we examined the frequency of observed state trajectories within the ORCHESTRA cohort population ( Figure 5C ). The assessment shows that the most common trajectories are dominated by starting in one of the Acute states and transitioning to either the Healthy or Respiratory PCC states, as most patients are not expected to develop severe PCC symptoms. Furthermore, after entering the Healthy, Respiratory PCC, Fatigue PCC or Sensorial PCC states, no additional change of state is usually observed. Patients in the Acute Respiratory state followed by the Respiratory PCC state either enter the Healthy state or the Fatigue PCC state. The Sensorial PCC state was uncommon in the ORCHESTRA dataset but stable, as patients either began in the Sensorial PCC state at the acute stage or transitioned into it from the Severe Symptom state. 
Notably, the Respiratory and Fatigue PCC states did not appear for this type of trajectory at any timepoint. Importantly, in the 20 most frequent trajectories, the Severe Symptom state occurs once during the acute infection and remains uncommon during the follow-up. Simulations of the model also provide predictions for the impact of patient characteristics on recovery probability and the recovery time. Notably, the combination of male sex, ages 15-30 and infected at the fourth wave gives a high probability of being in the Healthy state at 24 months after acute infection (72.2%) while females aged over 60 had a remarkably lower probability (26.6%) ( Figure 5D ). We also calculated the time to transition to the Healthy state, indicating recovery, for patients who begin at either Fatigue PCC, Respiratory PCC or Severe Symptom states ( Figure 5E ). On average, females over 60 required more than double the time to recover (89.46 months), compared to males aged 15-30 infected at the fourth wave (20.81 months). Overall, our in-depth analysis of the data-driven model reveals that it recapitulates known risk factors. Furthermore, the impact of risk factors on the initial and transition probabilities can be clearly interpreted. Most notably, it is found that patient Healthy and PCC states remain stable over time, rarely moving between follow-ups. Hidden Markov Model Enables Longitudinal Data Integration for the Improvement and Personalisation of Predictions As the LTA provides an interpretable description of observations and risk factors at the cohort level, we next assessed its ability to provide personalised, patient-level predictions. Given the dynamic nature of the model, we consider not only the task of forecasting patient trajectories based on observations from the acute phase — a task previously recognized as challenging [ 21 ] — but also how additional observations along the patient trajectory can improve predictions for subsequent timepoints. 
This approach represents a filtering problem, leveraging the dynamic characteristics of the model and sequential patient data, without the need to re-estimate the entire model (see Utilising Patient History and Evaluation Metrics). We expect that the incorporation of symptom observations at intermediate timepoints increases confidence in identifying the correct state at those moments, thus enhancing the accuracy of state predictions for future visits ( Figure 6A ). Download figure Open in new tab Figure 6. Prediction Accuracy of Model for Individual Patients. (A) Illustration of the prediction task for the physical score of a patient, using information up to different timepoints (individual rows). The predictions for different timepoints are indicated using different colours. As the patient’s score (vertical dashed lines) is used in updating the current state probabilities, the probability distributions of the prediction change with the addition of new information. For this particular patient, the characteristics suggest a high probability of full recovery; yet, the recording of a poor SF-36 physical score at 6 and 12 months indicates that the patient is on a different trajectory, resulting in an update to the prediction for later timepoints. (B) The percentage of maximum achievable AUROC values of the 9 PCC-related symptoms at each timepoint after acute infection. Patient trajectories and predictions are updated by increasingly using previous observations up to the previous timepoint. (C) The average logarithm of the probability of mental/physical score given the estimated probability density for each patient across the timepoints and expressed as a percentage of the maximum achievable mean logarithmic score. The model demonstrates a good predictive performance of the ORCHESTRA patient cohort. In general, the best performance is achieved when the full information about the timepoint is utilised ( Figures 6B-C ). 
There are only a few occurrences in which additional information decreases performance, such as the prediction of the presence of coughing at month 24 using symptom and HRQoL measurements up until month 12 versus month 18 (80% to 76%). These exceptions are probably related to sampling noise and the stochasticity of the underlying process, as they occur only at Month 24 – the timepoint for which the fewest patients have been evaluated. We assess the model’s capability of predicting symptoms at each timepoint, updating the model to use all observations up to the evaluated follow-up. Across all timepoints, the LTA model predicts symptoms with an average Area Under the Receiver Operating Characteristic curve (AUROC) of 0.69 when evaluated on the entire ORCHESTRA dataset, while 5-fold cross-validation of the same model yielded a mean value of 0.65 ( Supplementary Figures 8 and 9 ). In summary, our model not only provides strong agreement with the ORCHESTRA cohort dataset on an aggregated level, but also good agreement in predicting patient-specific symptoms and HRQoL scores in both in-sample and out-of-sample datasets. The model accommodates the addition of the most recent information using a filtering scheme rather than a complete re-estimation, which would be computationally burdensome. Discussion Understanding the long-term consequences of infectious diseases requires comprehensive and flexible data analysis approaches, especially in situations where prior clinical knowledge is limited or evolving. This is particularly critical in the early stages of emerging diseases, such as during pandemics, when the underlying biological mechanisms, disease states, and symptom trajectories are not yet well understood. However, even in more established contexts, reliance on predefined disease states or symptom clusters can introduce bias and obscure subtle or atypical patterns present in complex data. 
There is a growing need for unbiased, data-driven methods that allow health states to be inferred directly from observed data, with methods that can robustly characterise disease heterogeneity and capture the evolving dynamics of PAISs such as Post-COVID Condition (PCC). In this work, we demonstrate how Latent Transition Analysis — a powerful, well-established modelling framework based on Hidden Markov Models — can be extended and applied to long-term sequelae studies in a flexible and generalisable way. By enabling the (i) joint modelling of binary and continuous data, (ii) incorporation of trajectories with missing observations, and (iii) analysis of large numbers of covariates, LTA allows for the identification of latent health states and transition pathways, without requiring prior assumptions about disease structure or temporal dynamics. We apply the LTA framework to the ORCHESTRA Long-Term Sequelae Cohort, one of the most comprehensive cohorts with up to 24 months of post-infection follow-up. The newly added 18- and 24-month assessments significantly expand the temporal resolution of the dataset, although as expected, participation rates decline over time. While the decline in participation numbers is a limitation of the study, with only 419 individuals completing all five timepoints, the flexibility of our LTA framework allows us to leverage these uneven data patterns without the need for imputation. Hence, it provides a principled way to extract signal despite missingness. Through systematic model selection and covariate-aware inference, we show that a fully connected 7-state HMM effectively captures the latent dynamics of PCC. The model recovers distinct phenotypic states, including a Healthy, Severe Symptom, and intermediate states such as Sensorial, Fatigue, and Respiratory PCC — each with specific symptom profiles and HRQoL patterns. The model also identifies two Acute-phase states, which are only present during the early stages of infection. 
Notably, the symptom clusters of the Acute Infection and PCC states as well as the corresponding trajectories emerge in a fully data-driven fashion, without any clinical labelling or hard-coded structure, demonstrating the plausibility of our modelling approach. Our analysis confirms known features of PCC while offering novel insights. LTA largely recreates the earlier PCA-derived phenotypes [ 21 ], and the overall picture emerging from the scientific literature [ 10 , 26 ]. In addition to the established research, the LTA revealed that fatigue underpins all PCC phenotypes. While our previous analysis identified the Sensorial PCC state with elevated rates of anosmia and ageusia at 12 months, the LTA model helps us to find that the state is also associated with reduced physical but preserved mental HRQoL, suggesting that sensory impairments may not significantly impact mental wellbeing. Conversely, the Fatigue PCC state involves broader symptom involvement and more marked reductions in both mental and physical HRQoL. We also observe that transitions between latent states are influenced by individual characteristics, such as age, sex, and the infection wave. While male patients from later waves tend to recover more rapidly, older individuals and females exhibit greater persistence in PCC states, especially the Fatigue and Respiratory PCCs. In comparison to earlier publications which identified similar risk factors [ 10 ], the LTA describes in detail how these characteristics specifically impact patients’ trajectories throughout the follow-up. Even if current definitions of PCC include fluctuation of symptoms [ 2 ], the main change in LTA states is the resolution of PCC. Importantly, our model captures these individualised, covariate-dependant disease trajectories without introducing a prohibitive number of parameters. 
A key contribution of this work is the parsimonious parameterisation of covariate effects, which avoids the rapid growth in the number of parameters typically associated with covariate-dependent HMMs. Indeed, a classical model accounting for the dependence of all initial state and transition probabilities on all covariates would have required more than 1000 parameters to be estimated during model selection and could not have been handled using our available computational infrastructure. By projecting individual covariates into a low-dimensional scalar summary, we achieve interpretable, patient-specific transition probabilities while maintaining computational efficiency. The model’s predictive performance is strong, with good AUROC scores and reliable forecasts of both binary symptoms and continuous HRQoL metrics at future timepoints. While other clustering models of PCC measure performance on the ability to identify formed clusters [ 27 ], which in themselves are estimated via unsupervised algorithms, our model accounts for the uncertainty in the formation of clusters in the emergence of latent states in order to predict observable outcomes at each timepoint. Despite its strengths, our study is subject to several limitations. Firstly, there are data collection constraints due to pandemic stress on health systems. In addition, patient attrition at the 18- and 24-month follow-ups reduces the sample size for late-stage trajectory analysis, increasing uncertainty in those periods, highlighting the difficulties in maintaining an active follow-up in an observational study. We assume a missing completely-at-random (MCAR) pattern in the dropout process so if healthier patients are disproportionately underrepresented in late follow-ups, this could bias our trajectory estimates; this assumption is given by the fact that the rate of healthy patients in the last follow-up is decreasing from the previous one. 
Secondly, the model is also subject to the typical non-identifiability issues of HMMs such as label switching [ 28 ], which may overinflate uncertainty in covariate estimates. However, careful initialisation via multiple warm optimisation starts and consistent interpretation across varying number of latent states helped to mitigate these effects. Lastly, it is important to note that HRQoL estimates per state and timepoint represent the HRQoL of patients within each state; however, they should not be interpreted as a consequence of the state itself, as both initial states and subsequent trajectories may be endogenous to HRQoL. The underlying clinical status of individuals before infection, which is typically well captured by HRQoL, likely influenced both the acute presentation and subsequent PCC trajectories. As pre-infection HRQoL data are not available, this potential source of confounding cannot be fully accounted for. Our open-source Julia implementation provides researchers with tools to fit, interpret, and visualize latent transition models with customizable covariates and state structures. It includes simulation tools, diagnostic routines, and an interactive visualization interface, enabling researchers to explore how changing covariate inputs alters individual trajectories. This makes our framework a ready-to-use solution for other PAIS studies, particularly those involving incomplete, multi-modal clinical data. In short, the LTA paradigm manages to combine the benefit of both unsupervised clustering methods and clinically guided grouping of patient observations used for single timepoints, on longitudinal datasets. Unlike classical techniques, LTA facilitates a more holistic approach to PAIS phenotype identification, as the relationship between observations and hidden states is defined in an intuitive manner, allowing better accessibility and interpretability for trained clinicians while not sacrificing the necessary complexity of the model. 
Despite inherent limitations of our PCC study, LTA enabled an analysis of patient-level trajectories over time. The framework supports patient-level prediction of future state trajectories and recovery times at any point after the acute infection phase, by leveraging the distribution of symptoms observed at previous timepoints. LTA constitutes an unbiased, data-driven method that allows health states to be inferred directly from observed data, that can robustly characterise disease heterogeneity and capture the evolving dynamics of long-term conditions. Thus, this generalised approach is not specific to disease progressions of PCC in the ORCHESTRA cohort study only but offers practical utility for clinical monitoring and therapeutic development of all longitudinal PAIS studies characterised by a broad spectrum of symptoms that persist or emerge after the initial resolution of infection. These syndromes present substantial challenges to both research and clinical care due to their protracted nature, heterogeneity, and often unclear pathophysiology. The framework is easily extendable to general PAIS studies that include relevant biomarkers, patient-reported outcomes, and structured assessments such as clinical scales or omics profiles, which enables the identification of multimodal latent phenotypes that reflect disease biology across diverse data types. Future work may also include time-varying covariates or explicit modelling of interventions (e.g., post-infection vaccination or rehabilitation), enabling causal inference about recovery-modifying treatments. Conclusions In summary, we present an extended LTA framework capable of uncovering the hidden dynamics of the progression of post-acute infection syndromes in a data-driven, unbiased manner. 
Applied to the ORCHESTRA cohort, the model identifies meaningful PCC phenotypes, explains patient heterogeneity in recovery, and provides predictive insight into individual trajectories — despite challenges such as missing data and model ambiguity. We identified seven states describing Acute Infection and PCC and the most common trajectories, being able to predict PCC resolution based on patient-level covariates. By eliminating the need for prior disease knowledge or symptom grouping assumptions, our approach is particularly valuable in emerging or poorly understood conditions, offering a robust and generalizable tool for post-infection surveillance, cohort studies, and future pandemic preparedness. Methods Study Design and Participants The ORCHESTRA Long-term Sequelae Cohort (CT registration number: NCT05097677 ) comprises five prospective subcohorts from 56 centres across four countries (France, Italy, the Netherlands, and Spain). Eligible participants included both hospitalised and non-hospitalised patients aged over 14 years, with laboratory-confirmed SARS-CoV-2 infection, enrolled after providing written informed consent. Participants were systematically followed at 6, 12, 18, and 24 months after infection in outpatient clinics or at the patients’ homes for all the centres. Each follow-up involved clinical assessments by qualified medical personnel and extensive laboratory testing; additional testing was performed if clinically needed. Nasopharyngeal swabs were collected at baseline for diagnosis and to identify the variant of concern (VoC) and were repeated only if positive after 30 days from the initial diagnosis. VoC typing and serological analyses were performed at the Antwerp laboratory or in local laboratories following standardised protocols. Local cohort datasets that started before December 2020 were homogenised and standardised as previously described [ 29 , 30 ]. 
Data collected at baseline included date of symptom onset and diagnosis, duration of symptoms, demographic characteristics, comorbidities, clinical presentation, hospitalisation, admission to ICU, treatment, and post-acute infection complications. Recommendations for early antiviral treatment (e.g., anti-SARS-CoV-2 therapy within the first five days of onset of symptoms, according to national recommendations) included three anti-SARS-CoV-2 spike monoclonal antibodies (bamlanivimab, bamlanivimab/etesevimab, and casirivimab/imdevimab). HRQoL was assessed through the physical component score and the mental component score of the SF-36 questionnaire [ 31 , 32 ] at 6, 12, 18, and 24 months after infection. Study data were collected and managed using the REDCap electronic data capture tool (Research Electronic Data CAPture) [ 33 ]. Since the cohorts in France and the Netherlands started before the ORCHESTRA project was financed (in February and March 2020, respectively), data from these two cohorts went through a post-data collection harmonisation process under the supervision of the Charité – Universitätsmedizin Berlin and the Centre Informatique National de l’Enseignement Supérieur [ 29 , 34 ]. The SF-36 questionnaires were scored using the PRO CoRE software developed by QualityMetric, which applies 1998 US norms. The threshold for suboptimal scores (50) represents the norm-based scoring mean of the 1998 US general population (mean=50, SD=10). Scores below this threshold indicate below-average HRQoL compared to the reference population. Hidden Markov Model We assume that a patient’s collection of observations (occurrence of symptoms and HRQoL scores) is driven by the patient’s respective unobserved PCC phenotype/severity, which may change over time as patients move towards more severe or healthy states. We model this process by a discrete-time, discrete-space hidden Markov model (HMM) with $N \in \mathbb{Z}^{+}$ latent states. 
Let $S_i$ be the $i$-th latent state, where $i \in \{1, 2, \dots, N\}$, with $T$ as the timepoint of the last observation. The probability of the state occurring at a given discrete timepoint, $t \in \{1, 2, \dots, T\}$, is $P(q_t = S_i)$, where $q_t$ denotes the state occupied at time $t$. Importantly, a patient may only occupy one state at any time. Consider one patient with $L$ individual observations indexed by $l \in \{1, 2, \dots, L\}$ (such as anosmia or mental HRQoL score), so that $\mathbf{O}_t$ is the observed vector at time $t$, where $\mathbf{O}_t = (O_{t,1}, O_{t,2}, \dots, O_{t,L})$ explicitly. For the same given patient, the patient characteristics are encapsulated in two covariate vectors: one, $\mathbf{x}_{\text{initial}}$, which is used to determine the initial state probability distribution for the given patient’s covariates, and the other, $\mathbf{x}_{\text{trans}}$ (of length $K_{\text{trans}}$), which impacts the patient’s corresponding transition state probabilities. The HMM allows us to model the progression of PCC as the movement of PCC severity state from one timepoint to the next in 6-month intervals. The $i$-th state is responsible for a particular probability, $b_{il}$, of manifesting the $l$-th discrete observed symptom. The continuous HRQoL scores are modelled as coming from a Gaussian distribution with mean, $\mu_{il}$, and variance, $\sigma_{il}$, associated with the $i$-th state, $$O_{t,l} \mid q_t = S_i \sim \mathcal{N}(\mu_{il}, \sigma_{il}).$$ When considering the PCC study presented in this work, for each given state, $S_i$, there is an associated probability of a patient manifesting each of the $L = 11$ observations, composed of 9 binary symptoms and 2 HRQoL scores (physical and mental). We note that the distributions of both the binary symptom observations and the continuous HRQoL scores, conditional on the state, are independent of time. As the severity state of a patient can change within the follow-up time period, there is an associated transition probability, $a_{ij}$, which is the probability of moving from the $i$-th state to the $j$-th state, dependent on a patient’s associated covariates, $\mathbf{x}_{\text{trans}}$. These transition probabilities are parameterised by a multinomial logistic regression function with regression coefficients $\boldsymbol{\beta}_{ij}$. 
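In order to capture the unique hidden state trajectory of each patient as well as ensure interpretability of covariate effects on the transition probabilities, we introduce the vector, $\boldsymbol{\rho}_{\text{trans}}$, which reduces the effect of the vector of covariates, $\mathbf{x}_{\text{trans}}$, to a single value, $r_{\text{trans}}$, through the inner product $r_{\text{trans}} = \boldsymbol{\rho}_{\text{trans}}^{\top} \mathbf{x}_{\text{trans}}$. This scalar value is then used as the single independent variable in the regression function. Thus, the transition probabilities are defined as, $$a_{ij} = \frac{\exp\left(\beta^{(0)}_{ij} + \beta^{(1)}_{ij}\, r_{\text{trans}}\right)}{1 + \sum_{k \neq i} \exp\left(\beta^{(0)}_{ik} + \beta^{(1)}_{ik}\, r_{\text{trans}}\right)} \quad (j \neq i), \qquad a_{ii} = \frac{1}{1 + \sum_{k \neq i} \exp\left(\beta^{(0)}_{ik} + \beta^{(1)}_{ik}\, r_{\text{trans}}\right)},$$ where $\beta^{(0)}_{ij}$ and $\beta^{(1)}_{ij}$ are the intercept and slope coefficients and the reference state is the $i$-th state. We note that for a standard multinomial function, the number of estimated parameters associated with the transition probabilities would be $N(N-1)K_{\text{trans}}$, where $K_{\text{trans}}$ is the number of transition covariates. Our hierarchical modelling approach reduces the number of parameters to $2N(N-1) + K_{\text{trans}}$ when considering sufficiently large models ($N, K_{\text{trans}} \geq 3$). This parameterisation reduces the complexity of possible trajectories of the patient as there is no longer a unique set of regression coefficients for each covariate for each of the $N - 1$ non-reference states. However, the scalar value $r_{\text{trans}}$ now gives a more interpretable impact on the transition probabilities, $a_{ij}$, as the variation of a single value affects the entire transition matrix for a given patient and their covariates. A similar parametrisation is used for the initial state distribution with $\boldsymbol{\rho}_{\text{initial}}$, $r_{\text{initial}} = \boldsymbol{\rho}_{\text{initial}}^{\top} \mathbf{x}_{\text{initial}}$ and regression coefficients $\gamma^{(0)}_{i}, \gamma^{(1)}_{i}$, which calculates the probability of being in the $i$-th state at the first timepoint, $\pi_i = P(q_1 = S_i)$, using $i = 1$ as the reference state (Supplementary 1.1.1). For ease of notation, we write the complete set of parameters, which define the reduced covariate-driven hidden Markov model, as $$\theta = \left\{ \boldsymbol{\rho}_{\text{initial}},\, \boldsymbol{\rho}_{\text{trans}},\, \gamma^{(0)}_{i},\, \gamma^{(1)}_{i},\, \beta^{(0)}_{ij},\, \beta^{(1)}_{ij},\, b_{il},\, \mu_{il},\, \sigma_{il} \right\},$$ where we ensure $b_{il}, \mu_{il} \in [0, 1]$ and $\sigma_{il} \geq 0$ (Supplementary 1.1.2). Model Fitting and Selection To fit a discrete-time, discrete-space HMM, the Baum-Welch algorithm is often utilised, being a special case of an expectation-maximisation (EM) algorithm [ 35 , 36 ], in order to estimate the model parameters which best describe the observational sequences of all patients. 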
There have been extensions made [ 18 ] to define explicit update steps for each parameter when the initial and transition state probabilities are constructed from a collection of multinomial functions. Rather than deriving the explicit update rule of an EM algorithm for our covariate-dependent model, we construct the likelihood function efficiently using the forward pass of the forward-backward algorithm [ 37 ], and utilise fast automatic differentiation tools to find the best model parameters with a gradient-based optimiser. Using the iterative rule from the forward algorithm, the likelihood function is built as follows, $\alpha_1(j) = \pi_j\, b_j(\mathbf{O}_1)$ and $\alpha_t(j) = \left[\sum_{i=1}^{N} \alpha_{t-1}(i)\, a_{ij}\right] b_j(\mathbf{O}_t)$ for $t = 2, \dots, T$, where $\pi_j$ is the initial probability of the j -th state and $b_j(\mathbf{O}_t)$ is the probability of the observation vector $\mathbf{O}_t$ given the j -th state. The iterative rule of the forward algorithm builds up the probability of the observed sequence step by step. At each timepoint t , the forward probability, α t ( j ), represents the joint probability of being in the j -th state and having observed all data up to t . This is obtained by summing over all possible predecessor states, weighting their forward probabilities by the transition probability a ij and multiplying by the probability of observing a new collection of symptoms and HRQoL scores, $b_j(\mathbf{O}_t)$ (Supplementary 1.1.2). By carrying this recursion forward, the algorithm accumulates the probability of all possible state paths consistent with the observations. At the final timepoint T , every possible path through the model has been incorporated, so summing α T ( j ) across all states yields the total probability of the observed data under the model, which is equivalent to the likelihood. Thus, the likelihood of the set of parameters, θ , of the HMM given the observational sequence for all times, for a single patient is, $\mathcal{L}(\theta) = P(\mathbf{O}_1, \dots, \mathbf{O}_T \mid \theta) = \sum_{j=1}^{N} \alpha_T(j)$. We introduce two penalty terms which assist in finding strong starting parameters necessary for reaching the optimal model.
First, optimisation can be prone to degenerate solutions where infinite variance is estimated for the Gaussian distributions associated with each state, σ il , leading to numerical instabilities, which are typically seen in Gaussian mixture models [ 38 ]. One proposed solution is to add another regularisation term dependent on the variance and scaled by the number of observations [ 39 ], thereby punishing large variances while preventing attraction towards zero as well. Second, to ensure $\boldsymbol{\rho}^{\text{initial}}$ and $\boldsymbol{\rho}^{\text{trans}}$ are consistently scaled with magnitude equal to 1, we introduce L 2 regularisation on the norm, $(\lVert \boldsymbol{\rho} \rVert_2 - 1)^2$, for both vectors as a soft constraint for consistent interpretability across different models. Thus, the regularised log likelihood consists of the log-likelihood together with the two penalty terms, weighted by λ 1 and λ 2 , where the first term, $\sum_{d} \log \sum_{j=1}^{N} \alpha_T^{(d)}(j)$, is the log-likelihood of the dataset, 𝒟 , consisting of d ∈ { 1, 2, …, |𝒟| } patients, where $\alpha_T^{(d)}(j)$ is now the forward path probability of the d -th patient. We note that 𝒞 ⊂ { 1, …, L } is the set of indices corresponding to continuous observations. Although LTA models are typically applied to complete datasets where imputation methods may be employed, our model is fitted using a Full-Information Maximum Likelihood (FIML) approach in order to handle missing observations (either partially or completely missing) at any timepoint, assuming the observations are missing completely at random (MCAR). This is done by marginalising out the contribution to the likelihood when an observation is not known, which is equivalent to skipping the contribution entirely as no new information is added to the forward path probability (Supplementary 1.1.4). Our FIML approach allows efficient usage of all information available within the ORCHESTRA Long Term Sequelae dataset without subjecting the model parameter estimates to possible biases introduced by selected imputation methods.
To estimate the optimal parameters, we employ the Broyden–Fletcher–Goldfarb–Shanno (BFGS) algorithm for optimisation [ 40 ], with gradient calculations completed using automatic differentiation [ 41 ]. A single optimisation run is completed in a 2-stage approach, where the parameters are first estimated with the regularisation terms set to λ 1 = 10 and λ 2 = 1. After convergence, the estimated parameter vector is used as the starting parameter vector for fitting, this time without the penalty terms. This process leads to a reduced number of degenerate solutions found, as well as easier comparisons of $\boldsymbol{\rho}^{\text{initial}}$ and $\boldsymbol{\rho}^{\text{trans}}$ across runs, as the magnitudes are roughly consistent. Latin Hypercube sampling was used to generate initial starting points for the majority of parameters [ 42 ], except for the mean and standard deviation of the Gaussian distributions, which were chosen from the clusters found in an N -means clustering step of the continuous variables, which ensures adequate starting values. Uncertainty of the estimated parameters is obtained via the associated variance-covariance matrix, which is calculated by taking the inverse of the Fisher Information matrix. The latter is obtained as the negative of the matrix of second derivatives (Hessian) of the log-likelihood function evaluated at the parameter estimates [ 43 ]. Model selection was completed by first estimating the null HMM model for a given number of latent states, without the influence of covariates in the initial and transition state probabilities, using the 2-stage optimisation approach with 20 multistarts each. 20 starts were chosen due to available computation resources but were seen to be sufficient in finding similar solutions across starts. Age and sex covariates are then added to describe the initial and transition probabilities and the parameters are re-estimated with a warm start using the estimated parameters for the best fitting null model for each number of states, N .
A forward selection approach is then made, where a single covariate is introduced to the age/sex model, and a similar re-estimation is completed using a warm start. The corresponding Bayesian Information Criterion (BIC) value [ 44 ] is calculated after estimating the new covariate model, and the covariate is selected and added to the base model if it returns the lowest BIC value. This process is repeated until the improvement in BIC is less than 10 for all remaining candidate covariates. This was completed for 5 models (4 to 8 latent states) and we chose the final model using the BIC as there is strong support for its usage in identifying the most suitable number of latent states [ 45 – 47 ]. Hospitalisation was excluded from the selection process as it is strongly correlated with unrecorded medical complications, making the clinical meaning difficult to interpret. In comparing models varying in size from N = 4 to 8 states, it was found that the N = 7 model gave the smallest BIC value in comparison to both the null (no covariates used) and forward-selected model groups ( Supplementary Figure 2 ). To evaluate model performance of the final 7-state model, the chosen set of covariates was used in a 5-fold cross-validation procedure with 20 multistarts, each with a cold starting parameter vector and not utilising the 2-stage approach. Forward Simulations Using the fitted 7-state model, we performed 1000 simulations based on the dataset for the ORCHESTRA Cohort, by utilising each patient’s covariates and predicting their corresponding symptom reports and HRQoL scores at each timepoint. We then compared these forward simulated trajectories against the observed data at each follow-up month to evaluate the model’s performance in capturing both aggregate population-level statistics and predicted state probability distributions.
Utilising Patient History and Evaluation Metrics The latent transition model with the set of estimated parameters, $\hat{\theta}$, can predict the probability of the l− th observation occurring at a given timepoint t , $P(O_{tl} \mid \hat{\theta}) = \sum_{i=1}^{N} P(O_{tl} \mid q_t = S_i)\, P(q_t = S_i \mid \hat{\theta})$, where $q_t$ is the latent state at t , which is computed by finding the patient’s corresponding state distribution at t , utilising the initial state distribution and transition probabilities. However, if we would like to use previous observations, $\mathbf{O}_{1:t-1}$, in order to inform the prediction, then $P(O_{tl} \mid \mathbf{O}_{1:t-1}, \hat{\theta}) = \sum_{i=1}^{N} P(O_{tl} \mid q_t = S_i)\, P(q_t = S_i \mid \mathbf{O}_{1:t-1}, \hat{\theta})$. So, we may improve our prediction of $O_{tl}$ by simply updating a patient’s state distribution, which utilises the observations at timepoints 1 to t − 1 (Supplementary 1.1.3). If we expect our observations to be informative of future timepoints, then the predictive capability should likewise improve, with this having the benefit of utilising newly added patient history without complete re-estimation of the model parameters. In order to evaluate how good these predictions of the observations are using the model and corresponding updates, the AUROC value is calculated for the binary observations. For the set of 9 PCC-related symptoms, at each timepoint we calculate an associated AUROC value using patient observations up until and including the timepoint before evaluation. In order to evaluate how well the density is predicted for the continuous HRQoL scores, we calculate the mean logarithmic score for the mental and physical questionnaire scores at each timepoint, $\frac{1}{|\mathcal{D}|} \sum_{d} \log \hat{p}_t(x_d)$, where $\hat{p}_t$ is the predicted density at timepoint t and x d is the observed value of the d− th patient. Similarly, we may utilise the distribution of states at time t , now informed by observations up to and including t , to predict observations at the very same timepoint, $P(O_{tl} \mid \mathbf{O}_{1:t}, \hat{\theta}) = \sum_{i=1}^{N} P(O_{tl} \mid q_t = S_i)\, P(q_t = S_i \mid \mathbf{O}_{1:t}, \hat{\theta})$. Thus, we can obtain the maximum AUROC and mean log score values achievable by the estimated model, as the observations at t provide the best determination of the states at t , if we do not consider future observations informing the probability of the states.
Thus, we express the performance of our predictions as a percentage of the maximum achievable AUROC for binary symptoms, and maximum achievable mean log score for the continuous observations. Implementation R 4.3.3 was primarily used for the preprocessing of the dataset. All data formatting for input into the model, simulation, model implementation, model selection, performance evaluation and exportation of results were conducted in Julia version 1.10.1 [ 48 ], primarily using the Optim package for the optimisation procedure [ 49 ]. Plotted figures were created in Python 3.10.12 using Matplotlib 3.6.3. The implementation can be found online at https://github.com/roygusinow/LTA . Data Availability All data produced are available online at https://orchestra-cohort.eu/data-portal/ . Funding The ORCHESTRA project has received funding from the European Union’s Horizon Europe Research and innovation programme under Grant Agreement No. 101016167. Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or of the European Research Executive Agency (REA). Neither the European Union nor the granting authority can be held responsible for them. Funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany’s Excellence Strategy - EXC 2151 - 390873048. Authorship Contributions Conceptualisation: Lorenzo Maria Canziani, Elisa Gentilotti, Anna Górska, Roy Gusinow, Jan Hasenauer, Evelina Tacconelli. Methodology: Carolina Alvarez Garavito, Anna Górska, Roy Gusinow, Jan Hasenauer, Iris Lopes-Rafegas, Elisa Sicuri, Evelina Tacconelli. Software: Anna Górska, Roy Gusinow, Iris Lopes-Rafegas. Patient Recruitment and Data Collection: Maria Giulia Caponcello, Michela Di Chiara, Aline-Marie Florence, Elisa Gentilotti, Jade Ghosn, Maddalena Giannella, Cédric Laouénan, Nadhem Lahfej, Fulvia Mazzaferri, Zaira R. 
Palacios-Baena, Lidia Del Piccolo, Adriana Tami, Alice Toschi, Karin I. Wold and the ORCHESTRA Study Group. Data Management and Curation: Lorenzo Maria Canziani, Maria Giulia Caponcello, Michela Di Chiara, Elisa Gentilotti, Anna Górska, Roy Gusinow, Nadhem Lahfej, Elisa Rossi, Adriana Tami. Writing - Original Draft: Lorenzo Maria Canziani, Anna Górska, Roy Gusinow, Jan Hasenauer, Iris Lopes-Rafegas, Evelina Tacconelli. Writing - Review & Editing: All Authors. Visualisation: Lorenzo Maria Canziani, Anna Górska, Roy Gusinow, Jan Hasenauer, Iris Lopes-Rafegas. Declaration of Interests The authors report no conflict of interest. 1 Supplementary This section is a supplementary to the paper, ” Latent Transition Analysis for Longitudinal Studies of Post-Acute Infection Syndromes: A Multinational Investigation of Post-COVID-19 Condition” and contains further mathematical derivations of the model, supplementary tables, as well as supplementary figures to provide added details presented in the paper. 1.1 Derivations This section serves as a supplement to the Methods section of the paper. 1.1.1 Notation The three key components of interest in defining the HMM model are as follows. Let π i be the probability of being in a state at the initial timepoint, t = 0 such that . We wish to make the initial state distribution conditioned on a patient’s covariates, by parameterising via a multinomial function again with and the first state chosen as the reference. However, noting that all states except one have a corresponding parameter set β , the total number of parameters to define the initial distribution is ( N − 1)( K initial + 1). In order to reduce the complexity of the model while still estimating the general effect of a patient’s characteristics, we collapse the covariate vector to a scalar value, r initial by taking the inner product with parameter vector, , such that . 
This value is then used as the single aggregated independent variable within the multinomial function, and the number of parameters is 2( N − 1) + K initial and notably does not scale multiplicatively with the number of states and covariates within the model. Let a be the matrix of transition probabilities between states, such that is the probability of moving from the i− th state to j− th state in a single timestep. We note that these probabilities remain independent of the timepoint t . No restrictions are placed on the transition probabilities between states, so the latent states are fully connected and there is a non-zero probability of jumping from any given state to any other state within a single timestep. This importantly includes the possibility to remain within a state. Similarly to how we parameterised the initial state distribution, the probability of moving into a j -th state from the i -th state is described by the following function for each state, where reduces the set of covariates. This ensures that transition matrix a is row-stochastic and dependant on a patient’s covariates. The total number of parameters now becomes 2 N ( N − 1) + K trans , thus the computational complexity does not increase greatly when more transition covariates are added to the model. Lastly, each state has an associated probability of manifesting the l -th component of the observation vector, , which depends only on being in the state at the given timepoint t . So, we define the probability of manifesting an observation from the i -th state as noting that the relation between the observation and state is independent of the timepoint. If the observation is binary, such as the occurrence of symptoms like anosmia, we model it as being drawn from a Bernoulli distribution with probability b il . In the case of continuous observations like the HRQoL scores, we assume it to be drawn from a Gaussian distribution with associated mean µ il and standard deviation, σ il . 
Thus we have, Because b ij , µ il ∈ [0, 1] as the scores have been normalised and σ il ∈ [0, ∞ ), we express these quantities as, In order to calculate corresponding confidence intervals at a specified significance level, α , we simply take the inverse functions, where z 1 −α/ 2 is the z-score. So, the complete set of parameters is now . 1.1.2 Forward Algorithm for HMM Likelihood Computation Rather than deriving the explicit update step for our model, we derive the likelihood function and then utilise a BFGS to fit the parameters which give the largest likelihood. To do this efficiently, we take the forward section of the forward-backward algorithm to construct the likelihood using dynamic programming. We denote the forward path probability of being in the j -th state at time t , with all observations up until that time as, Considering the i− th state, S i , we demonstrate the case where there is a vector of observations, . We note is the probability of the l -th observation at time t given the i − th state. For the initialisation then, as we consider observations of different symptoms conditionally independent of each other. Similarly, the iterative rule of α t is changed to, Terminating the iteration at the last timepoint, t = T and summing across the states thus yields the likelihood of sample sequence, Reformulating the algorithm now, such that the small unit is in log space. From the likelihood, the algorithm for computing the full likelihood of a dataset, 𝒟 consisting of d patients and given model parameters, θ is Thus, for a given sample, we have the initialisation step, and for the general update rule, 1.1.3 Probability of Latent State given Multiple Observations (Filtering) We may also calculate the distribution of states given the previous observation in order to update model predictions. 
Noting that, $P(q_t = S_j \mid \mathbf{O}_{1:t}) = \frac{P(q_t = S_j, \mathbf{O}_{1:t})}{P(\mathbf{O}_{1:t})} = \frac{\alpha_t(j)}{\sum_{i=1}^{N} \alpha_t(i)}$, where $q_t$ is the latent state at time t , $\mathbf{O}_{1:t}$ are the observations up to t and α t ( j ) is the forward path probability, using the definition of conditional probability and seeing that the total probability of the observed data comes from summing the joint probabilities from each state. We may also then calculate the distribution of future observations given all observations up to a given time t , $P(\mathbf{O}_{t+1} \mid \mathbf{O}_{1:t}) = \sum_{j=1}^{N} P(\mathbf{O}_{t+1} \mid q_{t+1} = S_j) \sum_{i=1}^{N} a_{ij}\, P(q_t = S_i \mid \mathbf{O}_{1:t})$, using the chain rule and definition of the emission and transition probabilities. Thus, we may compute the distribution of observations by updating the state distribution at that timepoint, given observations at the previous timepoints. 1.1.4 Handling Missing Observations using Full-Information Maximum Likelihood Consider the case where an observation, $O_{tl}$, is missing. We can marginalise out the effect by $\int P(O_{tl} \mid q_t = S_i)\, dO_{tl} = 1$, as we note that a probability distribution integrates (or, for binary observations, sums) to one by definition. Thus the forward update in the case of a missing observation is equivalent to ignoring the observation in the regular forward update. Rather than computing the integral directly at each step, we can simply redefine, $P(O_{tl} \mid q_t = S_i) := 1$ whenever $O_{tl}$ is missing. Thus, with a minor adjustment, we may compute the likelihood efficiently with missing observations, rather than relying on imputation-based methods, which may bias estimation results. 1.1.5 Steady State The steady state distribution, $\boldsymbol{\pi}^{*}$, of a row-stochastic transition probability matrix, a , is defined as $\boldsymbol{\pi}^{*} = \boldsymbol{\pi}^{*} a$. That is, the steady state distribution remains unchanged after transitioning from one timepoint to another. Noting that when our Markov chain is aperiodic and irreducible, there exists a unique steady state distribution, given by the left eigenvector of a with eigenvalue equal to 1, normalised to sum to one. 1.2 Estimated Model Parameters View this table: View inline View popup Download powerpoint Supplementary Table 1: Probability (%) of observing each symptom given the latent state. View this table: View inline View popup Download powerpoint Supplementary Table 2: Parameter Estimates of Means and Standard Deviation of Normal Distribution per state for Physical Component Scores.
View this table: View inline View popup Download powerpoint Supplementary Table 3: Parameter Estimates of Means and Standard Deviation of Normal Distribution per state for Mental Component Scores. View this table: View inline View popup Download powerpoint Supplementary Table 4: Parameter Estimates of View this table: View inline View popup Download powerpoint Supplementary Table 5: Parameter Estimates of 1.3 Mean State Probabilities View this table: View inline View popup Download powerpoint Supplementary Table 6: Mean State Probabilities (%) View this table: View inline View popup Supplementary Table 7: Patient summary statistics of the ORCHESTRA patient population across time of initial infection and follow-up timepoints. View this table: View inline View popup Supplementary Table 8: Summary of symptoms frequencies of the ORCHESTRA patient population across time of initial infection and follow-up time points. Missing patients are those that had a follow-up but the symptom was not recorded. View this table: View inline View popup Download powerpoint Supplementary Table 9: Summary of SF-36 physical and mental QoL questionairre measurement of the ORCHESTRA patient population across time of initial infection and follow-up timepoints. Supplementary Figures Download figure Open in new tab Supplementary Figure 1: Missingness pattern of the follow-up across the whole cohort study. Download figure Open in new tab Supplementary Figure 2: Comparing BIC Values between Null and Forward-selected Covariate Models against the 7 State Covariate Model. Error bars indicate highest and lowest values across 20 multistart runs. Download figure Open in new tab Supplementary Figure 3: Emission matrices of Forward-selected Model for varying number of states ( N ). Newly added states are highlighted with a black border. Download figure Open in new tab Supplementary Figure 4: Scatter plot of parameter estimates of and for 20 multistarts using all covariates. 
The influence of covariate is consistent across varying number of states ( N ). More blue lines indicates parameters with a larger likelihood value. Download figure Open in new tab Supplementary Figure 5: Population-level predictions of SF-36 Physical and Mental HRQoL scores. Download figure Open in new tab Supplementary Figure 6: Heatmaps comparing the loading values of the first component for the dataset and model simulation (average across 1000 simulations). Download figure Open in new tab Supplementary Figure 7: Heatmaps comparing the Pearson Correlation values of the nine symptoms and HRQoL scores for the dataset and model simulation (average across 1000 simulations). Download figure Open in new tab Supplementary Figure 8: AUROC and Mean Log Values of Symptoms and SF-36 Scores respectively. Download figure Open in new tab Supplementary Figure 9: 5-Fold Cross Validation Results of AUROC and Mean Log Values of Symptoms and SF-36 Scores respectively. Download figure Open in new tab Supplementary Figure 10: Comparing 24 Months and Steady State Distributions for varying r trans . Download figure Open in new tab Supplementary Figure 11: Acute phase covariates impact on the initial probability of states. 
Appendix ORCHESTRA Study Group University of Verona (UNIVR) Gaia Maccarrone, Ruth Joanna Davies, Stefania Vitali, Mariana Nunes Pinho Guedes, Giorgia Tomassini, Alessandra Nazeri, Massimo Mirandola, Matteo Morra, Jacopo Garlasco, Andrea Leonardi, Elena Carrara, Alessia Savoldi, Maria Diletta Pezzani, Marcella Sibani, Giorgia Franchina, Chiara Konishi De Toffoli, Marco Meroi, Daniele Fasan, Pasquale De Nardo, Alessandro Visentin, Elda Righi, Chiara Perlini, Maddalena Armellini, Enrico Gibbin, Matilde Rocchi, Alessandro Castelli, Federico Lattanzi, Carmine Cutone, Lucia Bonato, Anna Maria Azzini, Giada Fasani, Lorenza Lambertenghi, Michela Conti, Giulia Rosini, Filippo Cioli Puviani, Andrea Sartori, Salvatore Hermes Dall’O’, Chiara Zanchi, Laura Rovigo, Lorenzo Tavernaro, Rebecca Scardellato, Francesco Luca, Anna Giulia Salvadori, Andrea Volpe, Maria Mongardi, Simona Sorbello, Miriam Emiliani, Raffaella Cordioli, Alessio Esposito, Erica Sodano, Riccardo Cecchetto, Davide Gibellini, Concetta Sciammarella, Elena Addis, Benedetta Barana, Claudio Micheletto, Nicolò Cardobi, Gianluca Vantini, Gloria Mazzali, Giovanni Stabile, Maddalena Marcanti, Marco Pattaro Zonta, Debora Calì, Anna Mason, Cinzia Perlini, Paolo Gisondi, Maria Paola Cecchini, Gianluigi Zanusso, Salvatore Monaco . University of Bonn : Manuel Huth, Clemens Peiter. CINECA : Salvatore Cataudella, Chiara Dellacasa. University Medical Center Groningen (UMCG) : Gerolf de Boer, Bernardina T. F. van der Gun, María F. Vincenti-González, Alida C. M. Veloo, Daniele Pantano, Margriet van der Meer, Lilli Gard, Erley F. Lizarazo, Marjolein Knoester, Alex W. Friedrich, Hubert G. M. Niesters. Servicio Andaluz de Salud (SAS) : Jesús Rodríguez-Baño. María Isabel Garcia Sánchez, Ana Belén Hidalgo Céspedes (Biobanco del sistema sanitario Público de Andalucía Nodo del Hospital Universitario Virgen Macarena.Sevilla, España); Aurora Aleman Rodriguez, Lola Cubero Aranda; Paula Olivares Navarro, Sandra De la Rosa Riestra; José M. 
Bravo- Ferrero. University of Bologna (UNIBO) : Natascia Caroccia, Cecilia Bonazzetti, Beatrice Tazza, Zeno Igor Adrien Pasquini, Domenico Marzolla, Giacomo Fornaro, Fabio Trapani, Lorenzo Marconi, Luciano Attard, Sara Tedeschi, Silvia Vituliano, Liliana Gabrielli, Tiziana Lazzarotto. Footnotes ↵ + The ORCHESTRA Study Group is listed in the Appendix * These authors share last authorship ( evelina.tacconelli{at}univr.it , jan.hasenauer{at}uni-bonn.de ) References 1. ↵ Choutka , J. et al. Unexplained post-acute infection syndromes . Nature Medicine 28 , 911 – 923 . ISSN: 1546-170X . doi: 10.1038/s41591-022-01810-6 ( 2022 ). OpenUrl CrossRef PubMed 2. ↵ Soriano , J. B. et al. A clinical case definition of post-COVID-19 condition by a Delphi consensus . The Lancet Infectious Diseases 22 , e102 – e107 . ISSN: 1473-3099 . https://www.sciencedirect.com/science/article/pii/S1473309921007039 ( 2022 ). OpenUrl CrossRef PubMed 3. ↵ Davis , H. E. et al. Long COVID: major findings, mechanisms and recommendations . Nature Reviews Microbiology 21 , 133 – 146 ( 2023 ). OpenUrl CrossRef PubMed 4. ↵ Goldhaber , N. H. et al. Deep dive into the long haul: analysis of symptom clusters and risk factors for post-acute sequelae of COVID-19 to inform clinical care . International Journal of Environmental Research and Public Health 19 , 16841 ( 2022 ). OpenUrl 5. ↵ Ito , F. et al. Cluster analysis of long COVID in Japan and association of its trajectory of symptoms and quality of life . BMJ open respiratory research 11 ( 2024 ). 6. ↵ Fischer , A. et al. Long COVID classification: findings from a clustering analysis in the predi-COVID cohort study . International Journal of Environmental Research and Public Health 19 , 16018 ( 2022 ). OpenUrl 7. ↵ Thaweethai , T. et al. Development of a definition of postacute sequelae of SARS-CoV-2 infection . Jama 329 , 1934 – 1946 ( 2023 ). OpenUrl CrossRef PubMed 8. ↵ Malesevic , S. et al. 
Impaired health-related quality of life in long-COVID syndrome after mild to moderate COVID-19 . Scientific Reports 13 , 7717 . ISSN: 2045-2322 . doi: 10.1038/s41598-023-34678-8 ( 2023 ). OpenUrl CrossRef PubMed 9. ↵ Carlile , O. , Briggs , A. , Henderson , A. D. , et al. Impact of long COVID on health-related quality-of-life: an OpenSAFELY population cohort study using patient-reported outcome measures (OpenPROMPT) . The Lancet Regional Health – Europe 40 . ISSN: 2666-7762 . doi: 10.1016/j.lanepe.2024.100908 ( 2024 ). OpenUrl CrossRef 10. ↵ Greenhalgh , T. et al. Long COVID: a clinical update . The Lancet 404 , 707 – 724 . ISSN: 0140-6736 . doi: 10.1016/S0140-6736(24)01136-X ( 2024 ). OpenUrl CrossRef PubMed 11. ↵ van den Houdt , S. C. , Slurink , I. A. & Mertens , G. Long COVID is not a uniform syndrome: Evidence from person-level symptom clusters using latent class analysis . Journal of Infection and Public Health 17 , 321 – 328 . ISSN: 1876-0341 . https://www.sciencedirect.com/science/article/pii/S1876034123004616 ( 2024 ). OpenUrl 12. ↵ Kenny , G. et al. Identification of Distinct Long COVID Clinical Phenotypes Through Cluster Analysis of Self-Reported Symptoms . Open Forum Infectious Diseases 9 , ofac060 . ISSN: 2328-8957 . eprint: https://academic.oup.com/ofid/article-pdf/9/4/ofac060/42754772/ofac060.pdf . doi: 10.1093/ofid/ofac060 ( Mar . 2022 ). OpenUrl CrossRef PubMed 13. ↵ Niewolik , J. et al. Cluster analysis of long COVID symptoms for deciphering a syndrome and its long-term consequence ( 2024 ). 14. ↵ Graham , J. W. et al. Modeling transitions in latent stage-sequential processes: A substance use prevention example . Journal of Consulting and Clinical Psychology 59 , 48 – 57 . doi: 10.1037/0022-006X.59.1.48 ( 1991 ). OpenUrl CrossRef PubMed Web of Science 15. ↵ Wisk , L. E. et al. Association of SARS-CoV-2 With Health-related Quality of Life 1 Year After Illness Using Latent Transition Analysis . Open Forum Infectious Diseases 12 , ofaf278 ( 2025 ). 
OpenUrl PubMed 16. ↵ Li , H. et al. Change in psychological distress and associated factors among Hong Kong young adults in post-COVID-19 era: a latent transition analysis . Social Psychiatry and Psychiatric Epidemiology . ISSN: 1433-9285 . doi: 10.1007/s00127-025-02912-5 ( 2025 ). OpenUrl CrossRef 17. ↵ Chimera , D. et al. COVID-19 pulmonary phenotypes and longitudinal patterns in the first wave of the pandemic . Respiratory Medicine 237 , 107952 . ISSN: 0954-6111 . https://www.sciencedirect.com/science/article/pii/S0954611125000149 ( 2025 ). OpenUrl PubMed 18. ↵ Bartolucci , F. , Farcomeni , A. & Pennoni , F. Latent Markov models: a review of a general framework for the analysis of longitudinal data with covariates . TEST 23 , 433 – 465 . ISSN: 1863-8260 . doi: 10.1007/s11749-014-0381-7 ( 2014 ). OpenUrl CrossRef 19. ↵ Jackson , C. Multi-State Models for Panel Data: The msm Package for R . Journal of Statistical Software 38 , 1 – 28 . https://www.jstatsoft.org/index.php/jss/article/view/v038i08 ( 2011 ). OpenUrl 20. ↵ Azzini , A. M. et al. How European Research Projects Can Support Vaccination Strategies: The Case of the ORCHESTRA Project for SARS-CoV-2 . Vaccines 11 . ISSN: 2076-393X . https://www.mdpi.com/2076-393X/11/8/1361 ( 2023 ). 21. ↵ Gentilotti , E. et al. Clinical phenotypes and quality of life to define post-COVID-19 syndrome: a cluster analysis of the multinational, prospective ORCHESTRA cohort . eClinicalMedicine 62 . ISSN: 2589-5370 . doi: 10.1016/j.eclinm.2023.102107 ( 2023 ). OpenUrl CrossRef PubMed 22. ↵ Kenny , G. et al. Identification of Distinct Long COVID Clinical Phenotypes Through Cluster Analysis of Self-Reported Symptoms . Open Forum Infectious Diseases 9 , ofac060 . ISSN: 2328-8957 . eprint: https://academic.oup.com/ofid/article-pdf/9/4/ofac060/42754772/ofac060.pdf . doi: 10.1093/ofid/ofac060 ( Mar . 2022 ). OpenUrl CrossRef PubMed 23. Klein , J. et al. Distinguishing features of long COVID identified through immune profiling . 
Nature 623 , 139 – 148 . ISSN: 1476-4687 . doi: 10.1038/s41586-023-06651-y ( 2023 ). OpenUrl CrossRef PubMed 24. ↵ Torrell , G. et al. Characterisation, symptom pattern and symptom clusters from a retrospective cohort of Long COVID patients in primary care in Catalonia . BMC Infectious Diseases 24 , 82 . ISSN: 1471-2334 . doi: 10.1186/s12879-023-08954-x ( 2024 ). OpenUrl CrossRef PubMed 25. ↵ The RECOVERY Collaborative Group. Dexamethasone in Hospitalized Patients with Covid-19 . New England Journal of Medicine 384 , 693 – 704 . eprint: https://www.nejm.org/doi/pdf/10.1056/NEJMoa2021436 . https://www.nejm.org/doi/full/10.1056/NEJMoa2021436 ( 2021 ). OpenUrl CrossRef PubMed 26. ↵ Vlaming-van Eijk , L. E. et al. Post-COVID-19 condition: clinical phenotypes, pathophysiological mechanisms, pathology, and management strategies . The Journal of Pathology 266 , 369 – 389 . eprint: https://pathsocjournals.onlinelibrary.wiley.com/doi/pdf/10.1002/path.6443 . https://pathsocjournals.onlinelibrary.wiley.com/doi/abs/10.1002/path.6443 ( 2025 ). OpenUrl PubMed 27. ↵ Venkatraman , S. et al. Clusters of long COVID among patients hospitalized for COVID-19 in New York City . BMC Public Health 24 , 1994 . ISSN: 1471-2458 . doi: 10.1186/s12889-024-19379-9 ( 2024 ). OpenUrl CrossRef PubMed 28. ↵ Spezia , L. Reversible jump and the label switching problem in hidden Markov models . Journal of Statistical Planning and Inference 139 , 2305 – 2315 . ISSN: 0378-3758 . https://www.sciencedirect.com/science/article/pii/S0378375808004217 ( 2009 ). OpenUrl 29. ↵ Rinaldi , E. et al. Harmonization and standardization of data for a pan-European cohort on SARS-CoV-2 pandemic . NPJ Digital Medicine 5 , 75 ( 2022 ). OpenUrl PubMed 30. ↵ Stellmach , C. et al. Creation of Standardized Common Data Elements for Diagnostic Tests in Infectious Disease Studies: Semantic and Syntactic Mapping . J Med Internet Res 26 , e50049 . ISSN: 1438-8871 . https://www.jmir.org/2024/1/e50049 ( 2024 ). OpenUrl PubMed 31. 
↵ Hays , R. D. , Sherbourne , C. D. & Mazel , R. M. The rand 36-item health survey 1.0 . Health economics 2 , 217 – 227 ( 1993 ). OpenUrl CrossRef PubMed 32. ↵ Ware Jr , J. E. SF-36 health survey update . Spine 25 , 3130 – 3139 ( 2000 ). OpenUrl CrossRef PubMed Web of Science 33. ↵ Harris , P. A. et al. The REDCap consortium: Building an international community of software platform partners . Journal of Biomedical Informatics 95 , 103208 . ISSN: 1532-0464 . https://www.sciencedirect.com/science/article/pii/S1532046419301261 ( 2019 ). OpenUrl CrossRef PubMed 34. ↵ Tacconelli , E. et al. Challenges of data sharing in European Covid-19 projects: A learning opportunity for advancing pandemic preparedness and response . The Lancet Regional Health–Europe 21 ( 2022 ). 35. ↵ Baum , L. E. et al. A Maximization Technique Occurring in the Statistical Analysis of Proba-bilistic Functions of Markov Chains . The Annals of Mathematical Statistics 41 , 164 – 171 . ISSN: 00034851, 21688990 . http://www.jstor.org/stable/2239727 (2025) ( 1970 ). OpenUrl CrossRef 36. ↵ Welch , L. R. Hidden Markov models and the Baum-Welch algorithm . IEEE Information Theory Society Newsletter 53 , 10 – 13 ( 2003 ). OpenUrl 37. ↵ Rabiner , L. A tutorial on hidden Markov models and selected applications in speech recognition . Proceedings of the IEEE 77 , 257 – 286 ( 1989 ). OpenUrl CrossRef 38. ↵ Biernacki , C. & Chrétien , S. Degeneracy in the maximum likelihood estimation of univariate Gaussian mixtures with EM . Statistics Probability Letters 61 , 373 – 382 . ISSN: 0167-7152 . https://www.sciencedirect.com/science/article/pii/S0167715202003966 ( 2003 ). OpenUrl 39. ↵ Chen , J. & Tan , X. Inference for multivariate normal mixtures . Journal of Multivariate Analysis 100 , 1367 – 1383 . ISSN: 0047-259X . https://www.sciencedirect.com/science/article/pii/S0047259X08002728 ( 2009 ). OpenUrl 40. ↵ Fletcher , R. in Practical Methods of Optimization 44 – 79 ( John Wiley Sons, Ltd , 2000 ). 
ISBN: 9781118723203 . eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1002/9781118723203.ch3 . https://onlinelibrary.wiley.com/doi/abs/10.1002/9781118723203.ch3 . 41. ↵ Rall , L. B. The Arithmetic of Differentiation . Mathematics Magazine 59 . Full publication date: Dec ., 1986, 275 – 282 . doi: 10.2307/2689402 ( 1986 ). OpenUrl CrossRef 42. ↵ Iman , R. L. in Wiley StatsRef: Statistics Reference Online ( John Wiley & Sons, Ltd , 2014 ). ISBN: 9781118445112 . eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1002/9781118445112.stat03803 . https://onlinelibrary.wiley.com/doi/abs/10.1002/9781118445112.stat03803 . 43. ↵ Lehmann , E. L. & Casella , G. Theory of Point Estimation 2nd. ISBN: 0-387-98502-6 ( Springer , New York , 1998 ). 44. ↵ Schwarz , G. Estimating the Dimension of a Model . The Annals of Statistics 6 . Full publication date: Mar ., 1978 , 461 – 464 . http://www.jstor.org/stable/2958889 (1978). OpenUrl 45. ↵ Nylund , K. L. , Asparouhov , T. & Muthén , B. O. Deciding on the Number of Classes in Latent Class Analysis and Growth Mixture Modeling: A Monte Carlo Simulation Study . Structural Equation Modeling: A Multidisciplinary Journal 14 , 535 – 569 . eprint: https://doi.org/10.1080/10705510701575396 . doi: 10.1080/10705510701575396 ( 2007 ). OpenUrl CrossRef 46. Bollen , K. A. et al. BIC and Alternative Bayesian Information Criteria in the Selection of Structural Equation Models . Structural Equation Modeling: A Multidisciplinary Journal 21 . PMID: 31360054, 1 – 19 . eprint : doi: 10.1080/10705511.2014.856691 . https://doi.org/10.1080/10705511.2014.856691 ( 2014 ). OpenUrl CrossRef PubMed 47. ↵ Cai , J. et al. Bayesian Diagnostics of Hidden Markov Structural Equation Models with Missing Data . Multivariate Behavioral Research 53 , 151 – 171 . ISSN: 0027-3171 . doi: 10.1080/00273171.2017.1407233 ( 2018 ). OpenUrl CrossRef PubMed 48. ↵ Bezanson , J. et al. Julia: A fresh approach to numerical computing . SIAM Review 59 , 65 – 98 . 
https://epubs.siam.org/doi/10.1137/141000671 ( 2017 ). OpenUrl CrossRef 49. ↵ Mogensen , P. K. & Riseth , A. N. Optim: A mathematical optimization package for Julia . Journal of Open Source Software 3 , 615 ( 2018 ). OpenUrl View the discussion thread. Back to top Previous Next Posted September 02, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Latent Transition Analysis for Longitudinal Studies of Post-Acute Infection Syndromes: A Multinational Investigation of Post-COVID-19 Condition Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Latent Transition Analysis for Longitudinal Studies of Post-Acute Infection Syndromes: A Multinational Investigation of Post-COVID-19 Condition Roy Gusinow , Anna Górska , Lorenzo Maria Canziani , Iris Lopes-Rafegas , Carolina Alvarez Garavito , Adriana Tami , Elisa Gentilotti , Elisa Sicuri , Cédric Laouénan , Jade Ghosn , Aline-Marie Florence , Nadhem Lahfej , Fulvia Mazzaferri , Lidia Del Piccolo , Maddalena Giannella , Alice Toschi , Michela Di Chiara , Maria Giulia Caponcello , Zaira R. Palacios-Baena , Karin I. 
Wold , Elisa Rossi , Evelina Tacconelli , Jan Hasenauer , the ORCHESTRA study group medRxiv 2025.09.01.25334817; doi: https://doi.org/10.1101/2025.09.01.25334817 Share This Article: Copy Citation Tools Latent Transition Analysis for Longitudinal Studies of Post-Acute Infection Syndromes: A Multinational Investigation of Post-COVID-19 Condition Roy Gusinow , Anna Górska , Lorenzo Maria Canziani , Iris Lopes-Rafegas , Carolina Alvarez Garavito , Adriana Tami , Elisa Gentilotti , Elisa Sicuri , Cédric Laouénan , Jade Ghosn , Aline-Marie Florence , Nadhem Lahfej , Fulvia Mazzaferri , Lidia Del Piccolo , Maddalena Giannella , Alice Toschi , Michela Di Chiara , Maria Giulia Caponcello , Zaira R. Palacios-Baena , Karin I. Wold , Elisa Rossi , Evelina Tacconelli , Jan Hasenauer , the ORCHESTRA study group medRxiv 2025.09.01.25334817; doi: https://doi.org/10.1101/2025.09.01.25334817 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Epidemiology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) 
Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9219) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffb07918a7058f4',t:'MTc3OTQ0NDYxMA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00