Inferring rheumatoid arthritis disease activity status from the electronic health records across health systems to enable real-world data studies

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 56,102 characters · extracted from preprint-html · click to expand
Inferring rheumatoid arthritis disease activity status from the electronic health records across health systems to enable real-world data studies | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Inferring rheumatoid arthritis disease activity status from the electronic health records across health systems to enable real-world data studies David Cheng , Xuan Wang , Gregory C. McDermott , Jennifer S. 
Hanberg , Zoe Love , Katherine Zhong , Mary Jeffway , Jue Hou , Vidul Panickan , Rahul Sangar , Ying Qi , Connor Melley , Lauren Costa , Dakota Feil , Rachael Matty , Dana Weisenfeld , Abisayo Animashaun , Aimee Schreiner , Sara Morini , Lauren Rusnak , Andrew Cagan , Misti Paudel , J. Michael Gaziano , Brian Sauer , Michael Weinblatt , Joshua Baker , Bryant England , View ORCID Profile Yuk-Lam Ho , Kelly Cho , Paul Monach , Grant W. Cannon , Nancy Shadick , Ted R. Mikuls , Tianxi Cai , View ORCID Profile Katherine P. Liao doi: https://doi.org/10.1101/2025.11.13.25340003 David Cheng 1 Biostatistics Center, Massachusetts General Hospital , Boston, MA, USA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Xuan Wang 2 Division of Biostatistics, Department of Population Health Sciences, University of Utah , Salt Lake City, UT, USA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gregory C. McDermott 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jennifer S. 
Hanberg 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Zoe Love 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Katherine Zhong 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA BS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mary Jeffway 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jue Hou 5 Division of Biostatistics and Health Data Science, University of Minnesota , Minneapolis, MN, USA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Vidul Panickan 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA 6 Department of Biomedical Informatics, Harvard Medical School , Boston, MA, USA MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rahul Sangar 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ying Qi 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site Connor Melley 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA BS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lauren Costa 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA 
MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dakota Feil 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rachael Matty 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site Dana Weisenfeld 7 Marcus Institute of Aging Research , Hebrew SeniorLife, Boston, MA, USA MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abisayo Animashaun 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aimee Schreiner 8 Medicine Service, VA Nebraska-Western Iowa Health Care System , Omaha, NE, USA 9 Division of Rheumatology and Immunology, University of Nebraska Medical Center , Omaha, NE, USA MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sara Morini 6 Department of Biomedical Informatics, Harvard Medical School , Boston, MA, USA MA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lauren Rusnak 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Andrew Cagan 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Misti Paudel 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site J. 
Michael Gaziano 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MPH, MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Brian Sauer 10 Salt Lake City VA Health Care System , Salt Lake City, UT, USA 11 Division of Rheumatology, University of Utah , Salt Lake City, UT, USA PhD, MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Michael Weinblatt 12 Division of Medicine, Brigham and Women’s Hospital , Boston, MA, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Joshua Baker 13 Division of Rheumatology, University of Pennsylvania , Philadelphia, PA, USA 14 Department of Biostatistics , Epidemiology, and Informatics, University of Pennsylvania , Philadelphia, PA, USA 15 Corporal Michael J. Crescenz VA Medical Center , Philadelphia, PA, USA4 MD, MSCE Find this author on Google Scholar Find this author on PubMed Search for this author on this site Bryant England 8 Medicine Service, VA Nebraska-Western Iowa Health Care System , Omaha, NE, USA 9 Division of Rheumatology and Immunology, University of Nebraska Medical Center , Omaha, NE, USA MD, PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yuk-Lam Ho 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Yuk-Lam Ho Kelly Cho 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Paul Monach 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA MD, PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Grant W. 
Cannon 10 Salt Lake City VA Health Care System , Salt Lake City, UT, USA 11 Division of Rheumatology, University of Utah , Salt Lake City, UT, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nancy Shadick 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA MD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ted R. Mikuls 8 Medicine Service, VA Nebraska-Western Iowa Health Care System , Omaha, NE, USA 9 Division of Rheumatology and Immunology, University of Nebraska Medical Center , Omaha, NE, USA MD MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site Tianxi Cai 6 Department of Biomedical Informatics, Harvard Medical School , Boston, MA, USA ScD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Katherine P. Liao 3 Division of Rheumatology, Brigham and Women’s Hospital , Boston, MA, USA 4 MAVERIC, VA Boston Healthcare System , Boston, MA, USA 6 Department of Biomedical Informatics, Harvard Medical School , Boston, MA, USA MD, MPH Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Katherine P. Liao For correspondence: Katherine.Liao{at}va.gov Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Objective Disease activity plays a central role in rheumatoid arthritis (RA) clinical studies. However, RA disease activity is inconsistently recorded in real-world electronic health records (EHR) data limiting the generation of real-world evidence (RWE). This study aimed to develop and validate scalable machine learning (ML) models to infer RA disease activity from EHR data. 
Methods We conducted studies from EHR data from Mass General Brigham (MGB) and the Veterans Affairs (VA); both have RA registries with prospectively collected disease activity score 28 (DAS28). The features for the algorithm were extracted from the EHR including structured data, e.g., ICD codes and narrative data using natural language processing (NLP). Machine learning models were trained on the registry-collected DAS28. We tested within-institution trained model performance and across systems transportability. The association between inferred disease activity and major adverse cardiovascular events (MACE) was tested with stratified Cox models to test face-validity. Results We studied 1105 MGB and 2631 VA RA patients. Models with structured data achieved an AUC of 0.68-0.70; models incorporating structured and NLP achieved higher performance (AUC=0.843, MGB; 0.833, VA). Cross-site validation demonstrated reduced transportability (AUC=0.679, MGB→VA; 0.718, VA→MGB), due to differences in the important features. Within institution, inferred disease activity was significantly associated with increased risk for incident MACE (MGB: HR=1.12; VA: HR=1.14). Conclusion RA disease activity can be inferred at scale from within-institution EHR data, though cross-institution performance is limited. The inferred disease activity replicated the association between RA and MACE and supports its use in future studies to generate RWE. Introduction Measurement of disease activity in patients with rheumatoid arthritis (RA) is a key variable in clinical and epidemiologic research and is recommended as part of clinical management [ 1 ]. RA disease activity is measured by several validated indices such as the Disease Activity Score with 28-joint counts based on C-reactive protein (DAS28-CRP) [ 2 ]. 
In studies relying upon electronic health records (EHR) data, measures of disease activity are often not directly observed, although EHRs capture data on patient symptoms, physical examinations, labs, and medications that correlate with disease activity. The lack of systematically collected disease activity data to use as an exposure, outcome, or confounder is a major barrier in leveraging large-scale real-world data (RWD) from EHRs to generate real-world evidence in RA. For example, RA disease activity is frequently used as an outcome in drug trials, including those used for regulatory approvals in the United States and Europe. Studies have demonstrated associations between higher RA disease activity as an exposure with safety and effectiveness outcomes such as major adverse cardiovascular events (MACE) [ 3 , 4 ]. RA treatment guidelines have also suggested treat-to-target approaches whereby clinicians are encouraged to formally assess disease activity measures and intensify therapy until achieving low disease activity or remission [ 1 , 5 ]. Models for predicting disease activity using EHR data in specific settings have been previously attempted. An early study using natural language processing (NLP) leveraged concepts identified from clinical notes along with lab values to infer disease activity at rheumatology clinic visits [ 1 , 6 ]. Later work has demonstrated that models based on concepts from notes alone can achieve good performance when trained on a large sample based on registry data [ 8 ]. Another study linking claims to EHR data has found that the addition of lab data from EHRs significantly improved the performance of models for disease activity [ 9 ]. The ability to forecast future disease activity using structured data at a given clinical visit, including demographics, medications, labs and prior disease activity measures has also been reported [ 10 ]. 
Other recent works have developed models that incorporated patient-reported outcomes [ 11 ] and specifically for patients treated with biologics [ 12 ]. Despite these existing models, there is a lack of recent algorithms for EHR data that incorporate both structured and narrative data and allow for disease activity to be inferred at specific times during patient follow-up, which would enable use of disease activity as either an outcome, exposure, or covariate in studies with longitudinal follow-up. In this study, we focus on retrospectively inferring disease activity status using both structured and narrative data from clinical notes in EHRs from the Mass General Brigham (MGB) system and the Department of Veterans Affairs (VA) [ 13 , 14 ]. Both MGB and the VA have RA registries linked to EHRs: the Brigham Rheumatoid Arthritis Sequential Study (BRASS) and VA Rheumatoid Arthritis Registry (VARA). Both registries prospectively collected DAS28-CRP. The linkage between the registry and EHR enables the creation of a platform to train algorithms to predict disease activity collected in registries using EHR data. The objective of this study was to develop models that allow for time-specific inferences over specified months during patients’ EHR follow-up. We tested the feasibility of this approach at MGB and the VA, training models for each institution and testing the transportability of the models both within and across institutions. We hypothesized that the addition of NLP to structured EHR data would significantly improve the accuracy for inferring disease activity. Additionally, we sought to replicate the known association between disease activity and future MACE as an example of a downstream application using inferred disease activity [ 15 , 16 ]. 
Patients and Methods Patients We identified patients in the EHRs at each institution with an ICD-9/10 code for RA (ICD-9: 714.X and ICD-10: M06.9 and local codes) who were participants of the corresponding registry study at each site. Individuals enrolled in the Brigham and Women’s Rheumatoid Arthritis Sequential Study (BRASS) [ 17 ] with prospectively collected data from 2003-2019 were linked with data from the MGB EHR. For VA, we studied individuals enrolled in the Veteran Affairs Rheumatoid Arthritis registry (VARA) using data from 1999-2020 [ 18 ] with data linked to the VA EHR. Patients with ≥1 measurement of DAS28-CRP at a registry visit were included in the study population for the analysis [ 19 ]. The relationships between the registries, EHR and an overview of the study design are outlined in Figure 1a . In the downstream analysis assessing associations of inferred disease activity with subsequent MACE, we identified a broader RA population from the EHRs using a validated multimodal RA phenotyping algorithm [ 20 ] that was separately applied at both institutions. RA patients in that analysis were identified as those with a predicted probability for having RA above a threshold such that the positive predictive value (PPV) is ≥90% at each site. Download figure Open in new tab Figure 1. (a) Overview of relationships between the institution specific registries, BRASS and VARA, which contain DAS28-CRP data that are consistently collected through research visits, electronic health record (EHR) based RA cohorts identified using a phenotyping pipeline (light green circle), and the overall EHR data; and (b) the study design to develop models for disease activity within and across institutions. Observed Disease Activity Status The DAS28-CRP was prospectively assessed at registry visits in BRASS and VARA. 
We identified months in which DAS28-CRP was assessed for patients in the study sample and dichotomized DAS28-CRP at 3.2 to classify patients to moderate or high disease activity (DAS28-CRP >3.2) versus remission or low disease activity (DAS28-CRP ≤3.2) in the month. When multiple assessments were recorded within a month, we took the median of available measurements and then dichotomized to define disease activity status in the month. EHR Features We extracted data on patients’ demographics and structured data, i.e., codes related to RA including diagnosis, procedure, medication, and laboratory codes. These codes were obtained from a knowledge network that identifies codes related to phenotypes in the EHR [ 21 – 24 ]. Additionally, available data on 4 laboratory values considered important in assessing RA and disease activity were extracted: C-reactive protein (CRP), erythrocyte sedimentation rate (ESR), antibodies to cyclic-citrullinated peptide (anti-CCP), and rheumatoid factor (RF). All available laboratory data were initially extracted for all patients on all dates. For ESR and CRP, the data were aggregated into monthly counts of all available codified data based on observed dates. Laboratory values were then summarized based on the median values in months with a measurement. Data harmonization was required for CRP which were reported in both mg/dL and mg/L. RF and anti-CCP status were defined as positive if an individual had a positive test in the month. All clinical notes were processed using natural language processing (NLP) to identify mentions of RA-and disease activity-related concepts from the narrative clinical notes. A dictionary of RA-related concepts was created through the Online Narrative and Codified feature Search Engine (ONCE) tool generated by the knowledge network mentioned above [ 20 ]. A separate NLP dictionary was developed to identify terms associated with disease activity. 
We performed a manual review of rheumatology notes, and sections of text that were informative for determining disease activity were flagged. Named entity recognition was applied to identify clinical terms from this text. The terms were mapped to NLP concepts and associated concept unique identifiers (CUI) were identified using the Unified Medical Language System. All concepts not already present in the RA dictionary generated by ONCE were retained in the disease activity NLP dictionary. The monthly counts of mentions of concepts from both the RA and disease activity dictionaries were then obtained based on the date of the notes. We refer to the counts of codes as codified features , the codified and lab value data as structured features , and the counts of NLP concepts as the narrative features . A comprehensive list of the structured features can be found in Supplemental Table 1 , and NLP features in Supplemental Table 2. Modeling approach for inferring RA disease activity The approach for inferring disease activity is illustrated in Figure 2a . For each month in which DAS28-CRP was measured in the registry (target month), we further summarized the EHR features that are dated to be within 3 months (i.e., ≥ (target month - 3 months) and ≤ (target month + 3 months)). The codified and narrative features were aggregated into total counts of specific codes and CUIs incurred within this time window, respectively. The CRP and ESR data were further summarized by the median value over the window, and the anti-CCP and RF were summarized by presence or absence of a positive test over the window. Missing CRP and ESR values were imputed by single imputation using a LASSO model given other screened features (see the following paragraph) in target months with no observed lab value. All these features were then used to train a model to predict the disease activity status in the target months. Download figure Open in new tab Figure 2. 
(a) Approach for training time-specific disease activity inferences over specified months during EHR follow-up, and (b) study design for applying inferred RA disease activity to replicate the association between disease activity and MACE in RA. The 1st RA code refers to the first RA diagnosis code observed in EHR follow-up. Patients with at least one rheumatology clinic visit within 1 year after the 1st RA code are included in the analysis. Those with any MACE events observed prior to the 1st rheumatology clinic visit during EHR follow-up were excluded. To identify features that are potentially relevant to disease activity based on the existing data, feature screening was performed. This was done by fitting a multivariable logistic regression model for moderate-high disease activity at target months against each individual candidate feature while additionally adjusting for age, sex, race, and healthcare utilization, where healthcare utilization is defined as the number of days with any ICD code [ 25 ] recorded during EHR follow-up. Codified and narrative features with p-values ≤0.1 and prevalence >5% were then selected to be features used for training the disease activity algorithm ( Supplemental Figures 1-4 ). We used target months before 2009 for feature screening and reserved the data from 2009 onward for training and validation. This approach allowed us to separately leverage historical data for screening and 10 years of contemporary data for training and validation to avoid overfitting. We fitted an ensemble of machine learning models using Super Learner [ 26 ] to estimate the probability of having moderate-high disease activity status (DAS28-CRP > 3.2) at each target month given the EHR features: the demographic (age, sex, race), healthcare utilization, laboratory value (ESR, CRP, RF and anti-CCP), and screened codified and narrative features. The ensemble included the mean, random forest [ 27 ], XGBoost [ 28 ], and neural network [ 29 ]. 
A weighted average of the estimated probabilities based on each component of the ensemble is taken to obtain the final probability estimates. These estimated probabilities measure how likely a patient is to have moderate or high disease activity. The probabilities can then be dichotomized at different thresholds to yield classification of patients into moderate-high versus remission-low disease activity status. When averaged across a sample of patients, the mean predicted probabilities represent an estimate of the prevalence of moderate-high disease activity status in the corresponding population. Evaluation of RA disease activity phenotyping model We evaluated the phenotyping model when training and validating using data from the same institution. We also evaluated the performance when training in one institution and externally validating in another institution ( Figure 1b ). For within-institution evaluations, we randomly sampled 80% of the target months for training the phenotyping model and used the remaining 20% as an independent validation sample. All screened features available at each site were considered as candidate features for training and validation. For cross-institution evaluations, we trained the model using all target months from one institution and validated in target months from the other institution, restricting the set of screened features to those common to both sites. To assess the incremental impact of different sets of features on the phenotyping performance in terms of discrimination, we evaluated the area under the receiver operating curve (AUC) in the validation sample for 5 models trained on: (1) codified features only, (2) codified features plus lab values, (3) codified data plus narrative features, (4) codified features plus both lab values and narrative features, and (5) codified features plus both lab values and narrative features that are common to both sites. 
We additionally assessed the phenotyping performance in terms of calibration by plotting the observed proportion with moderate-high disease activity (i.e., the prevalence of moderate-high disease activity) for patients within each decile of the predicted probability of moderate-high disease activity. For both the within- and cross-institution evaluations, we assessed the AUC and calibration plots in the corresponding validation samples. The relative importance of features for the algorithm was ranked using the SHapley Additive exPlanations (SHAP) values, which were calculated in terms of each feature’s contribution to reducing Mean Absolute Error (MAE, Supplemental Figures 1-4 ). The SHAP value provides a way to attribute the contribution of each feature toward the prediction outcome of a model [ 30 ]. Application of inferred disease activity in an association study with major adverse cardiovascular events (MACE) To demonstrate the application of the disease activity algorithm in a downstream analysis and assess its face validity, we tested the known association between higher disease activity and the risk of major adverse cardiovascular events (MACE) in RA [ 31 ]. We conducted this analysis in both MGB and VA data and restricted the populations to patients with at least one rheumatology clinic visit within 1 year after the first available RA diagnosis code ( Figure 2b ). Patients with diagnosis codes for MACE at any time prior to the first rheumatology clinic visit were excluded. We applied the model trained using target months linked to the respective registry at each institution to the broader cohort of RA patients in the EHR data in the same institution ( Figure 1a ). The predicted probability of moderate-high disease activity was inferred for the month of the first rheumatology clinic encounter within 1 year after the first RA diagnosis code (index date). 
The inferred moderate-high vs remission-low disease activity status was defined by dichotomizing the predicted probabilities at thresholds of 0.35 for MGB and 0.36 for VA, to match the prevalences of moderate-high disease activity status observed in BRASS and VARA. MACE events were ascertained based on the presence of diagnosis and procedure codes for myocardial infarction, ischemic stroke, use of coronary artery bypass graft, percutaneous coronary intervention, percutaneous transluminal coronary angioplasty, or stents [ 32 ] ( Supplemental Table 3 ). Time-to-MACE was censored at the month of either the last structured or narrative feature. Kaplan-Meier analysis was performed to estimate the unadjusted MACE-free survival among those with inferred moderate-high vs remission low disease activity status. A stratified Cox model was fit to further assess this association after adjustment for patients’ age, sex, and self-reported race and stratifying the baseline hazard by calendar year of the rheumatology clinic visit. Results We identified 1,105 participants in BRASS who were followed in the MGB system and had at least one measurement of DAS28-CRP in 5,072 distinct post-2009 target months ( Table 1 ). The mean age in the month of the first post-2009 BRASS target month was 57.7 years, 82.4% of patients were female, and 87.1% were White; 85.7% were seropositive. Among VARA patients, 2,631 patients had a measurement of DAS28-CRP in 31,440 distinct post-2009 target months. The mean age in the month of the first post-2009 VARA visit was 64.6 years, 12.1% of patients were female, and 76.8% were White; 88.6% were seropositive. The proportion of target months in which patients were in remission, low, moderate and high disease activity were similar in BRASS and VARA. In the majority of target months, 59.8% of target months in BRASS and 60.6% in VARA, patients were in remission. 
There were 22.0% of target months in BRASS and 22.9% in VARA in which patients had moderate disease activity. View this table: View inline View popup Table 1. Characteristics at the baseline visit for BRASS or VARA for the phenotyping analysis, and for the baseline for the MACE analysis. When separately trained and validated within MGB and the VA, the phenotyping models exhibited AUCs of 0.676 at MGB and 0.702 at the VA when using codified data only ( Table 2 ). The AUC improved to 0.756 at MGB and 0.736 at the VA when additionally incorporating lab values and to 0.821 at MGB and 0.832 at the VA when additionally incorporating narrative features. The best model performance was observed when all codified and narrative features were included with an AUC of 0.843 at MGB and 0.833 at the VA. View this table: View inline View popup Download powerpoint Table 2. Within- and cross-institution AUC (95% CI in bracket) by feature set. In cross-institution evaluations, applying the model trained using features common to both MGB and VA to an external site, the performance exhibited considerable degradation. Even when including both narrative and lab features, the AUC was 0.679 when trained at MGB and validated at the VA; the AUC was 0.718 when trained at VA and validated at MGB. While CRP, ESR, age and prednisone use were among the top 5 most important features at both institutions, the order of importance differed at MGB vs the VA ( Supplemental Figures 1-4 ). The observed probabilities of moderate-high disease activity exhibited relatively close agreement with mean predicted probabilities when models were trained on data from the same institution ( Figure 3 ). There was some systematic under-estimation of the probability of moderate-high activity when training the model at the VA and validating at MGB. Conversely, there was over-estimation of the probability of moderate-high disease activity in most deciles when training the model at MGB and validating at the VA. 
Download figure Open in new tab Figure 3. Within- and cross-institution calibration plots at MGB and VA. In the analysis of associations between disease activity and MACE, we identified 16,553 patients at MGB and 72,847 at the VA with a rheumatology clinic visit within 1 year after the first RA diagnosis code without a prior MACE event. For the MGB population, at the index date, the mean age was 60.6 years, 78.5% were female, 68.9% seropositive, and 44.9% had an inferred moderate-high disease activity status with the chosen cutoff. For the VA population, at the index date, the mean age was 61.9 years, 13.4% were female, 73.3% seropositive, and 15.9% had an inferred moderate-high disease activity status with the chosen cutoff ( Table 1 ). Patients with inferred moderate-high disease activity exhibited a higher risk for MACE at both MGB and VA ( Figure 4 ). In Cox analyses adjusted for baseline covariates (age, sex, race) and stratified by calendar year, inferred moderate-high disease activity status was also significantly associated with higher hazard of subsequent MACE at both MGB (HR=1.12, 95% CI 1.00-1.25) and VA (HR=1.14, 95% CI 1.08-1.21). Download figure Open in new tab Figure 4. Survival curves with 95% CIs (dotted lines) demonstrating the association between inferred moderate-high vs remission-low disease activity and incident MACE at (a) MGB and (b) VA, with HR=1.12 (95% CI 1.00-1.25) at MGB and HR=1.14 (95% CI 1.08-1.21) at the VA for inferred moderate-high disease activity and subsequent MACE in Cox analyses, adjusted for baseline covariates and stratified by calendar year. Discussion In this study, we demonstrate that disease activity can be inferred at scale with reasonable performance using data readily available in most EHRs. Comprehensively incorporating longitudinal structured and narrative features from EHRs enabled time-specific inferences about disease activity, which is an essential variable to generate RWE from RWD studies in RA. 
The framework outlined in this study provides a means to approximate disease activity data over available follow-up in EHR-based cohorts, which would otherwise only be available in prospective longitudinal studies. While the data are sufficiently informative to infer disease activity within an institution, the informativeness of features may vary across institutions, leading to a lack of transportability of the algorithms. However, inferred disease activity from models trained within the same institution achieved sufficient accuracy to replicate known associations with subsequent risk of MACE. As observed in prior studies, models trained using only codified features achieved only modest AUCs with or without the use of machine learning [ 33 – 34 ]. Additionally, incorporating laboratory values into these models led to substantial improvements in their AUCs, despite high rates of missingness at both institutions. We applied simple single-imputation procedures to address missing values, which improved model performance. Adding NLP narrative features to models with codified features led to the largest improvements in the AUC. This finding supports the clinical intuition that the description of a patient’s disease status in the notes provides orthogonal information needed to infer disease activity that is not available in lab results or diagnosis codes [ 34 ]. In addition to good discrimination performance reflected in the AUC, models with the full set of structured and narrative data were also able to achieve close agreement between the predicted and observed probabilities of moderate-high disease activity. Algorithms trained at one institution were not readily transportable to a different institution. The lack of transportability was not surprising given significant differences in patient composition, with older male patients at the VA and predominantly female patients at MGB. 
Potential differences in the importance of specific codes may be due in part to inherent differences in the healthcare systems. MGB is reimbursed by patients’ private insurance, while Veterans’ healthcare is covered by the VA. Caution is thus warranted in general when transporting models trained at one institution to external institutions. Ideally, models trained at one institution would be re-trained or calibrated for use at other institutions. The performance of the within-institution algorithms suggests that the model development process itself can generally be repeated at other institutions with a registry linked to EHR to obtain an algorithm with reasonable accuracy, although the vast majority of EHRs won’t have a linked registry. Using the inferred measures of disease activity, we replicated known associations between higher disease activity and future risk for MACE in large EHR cohorts [ 35 ]. This demonstrates how these measures can be used for downstream analyses in large EHR populations in which disease activity at specific time points would otherwise be unavailable. In addition to being an exposure, as in this analysis, these measures can also be used as outcomes or used to define patient populations of interest. Calibration may be warranted to ensure that such analyses are free of bias from use of inferred measures [ 36 ]. There were limitations in our study. When incorporating NLP features into our models, we used simple “one-hot” encoding to indicate the presence/absence of terms in proximal narrative notes. Improvements in performance can likely be achieved using word embeddings that represent terms in context [ 37 ]. The rapid advancement in large language models (LLMs) also provides opportunities to infer disease activity from EHR features, although likely at a much higher cost and with a greater need for resources. 
For example, development of in-house LLMs generally requires large datasets, on-premise servers with graphics processing units, Health Insurance Portability and Accountability Act- (HIPAA-) compliant infrastructure, and teams of specialists to develop and maintain the models [ 38 ]. Future work will explore the use of LLMs as an option. Our models were trained using data from patients who participated in registry studies, who may differ from the overall RA population at each institution. These differences could bias the trained models towards disease activity levels observed among sub-populations more likely to participate in these registries. Further validation among broader RA populations at each institution is needed to confirm the performance in the overall populations. We used DAS28-CRP to measure disease activity in this investigation, as it is a validated measure widely used in clinical studies. Nevertheless, DAS28-CRP may not be fully reflective of disease activity that is observed clinically. Different results may be expected if other measures or criteria are used. While linkage to external claims or death records may partially address outcome under-ascertainment, the possibility of missed MACE events outside the VA/MGB systems remains a limitation. We provide a framework, method, and algorithm to infer disease activity in a manner that can be readily adopted in EHR-based clinical studies, expanding opportunities to study the effect of treatments with large RWD. The methods allow for inference of disease activity at specified time-points throughout a patient’s follow-up. This advancement addresses an important need for studies that require larger populations to study outcomes such as MACE, or studies comparing other safety and effectiveness outcomes, including less common adverse events. Future work includes utilizing inferred disease activity for large-scale trial emulation studies in RA using RWD. 
Data Availability All data produced in the present work are contained in the manuscript and supplementary materials. Identifiable patient-level data cannot be shared to protect subject privacy. Funding This study was supported by the NIH R01AR080193, P30 AR072577, K24AR086342. Conflicts GM: Research supported by Boehringer Ingelheim MW: Research supported by Bristol Myers Squibb, consulting for: (Aclaris, Amgen, Anaptysbio, Artiva Bio, Bristol Myers Squibb, Biohaven, Curie Bio, Deep Cure, Forward Therapeutics, Gilead, Ignite, Janux Therapeutics, Johnson and Johnson, Lilly, Lifordi, Marvel Bio, Matchpoint, Merck, Neutrolis, Novartis, Roche, Santa Ana, Sana, Sci Rhom, Set Point, Surf Therapeutics, Thymmunz, Xencorp, ZuraBio), and receives options from: (Canfite, Inmedix, Scipher) NS: received BRASS funding from Janssen TM: Research funding from Horizon (Amgen); consulting for Merck, UCB, Horizon, Olatec Therapeutics KPL: Consulting for Merck Acknowledgements The opinions expressed in this article are those of the authors and do not necessarily represent those of the Department of Veterans Affairs or the United States government. References 1. ↵ Fraenkel , L. , Bathon , J.M. , England , B.R. , et al. , 2021 . 2021 American College of Rheumatology guideline for the treatment of rheumatoid arthritis . Arthritis & Rheumatology , 73 ( 7 ), pp. 1108 – 1123 . OpenUrl PubMed 2. ↵ England , B. R. , Tiong , B. K. , Bergman, et al. ( 2019 ). 2019 Update of the American College of Rheumatology Recommended Rheumatoid Arthritis Disease Activity Measures . Arthritis care & research , 71 ( 12 ), 1540 – 1555 . OpenUrl PubMed 3. ↵ Crowson , C.S. , Rollefstad , S. , Ikdahl , E. , et al. , 2018 . Impact of risk factors associated with cardiovascular outcomes in patients with rheumatoid arthritis . Annals of the rheumatic diseases , 77 ( 1 ), pp. 48 – 54 . OpenUrl Abstract / FREE Full Text 4. ↵ Solomon , D.H. , Reed , G.W. , Kremer , J.M. , et al. , 2015 . 
Disease activity in rheumatoid arthritis and the risk of cardiovascular events . Arthritis & rheumatology , 67 ( 6 ), pp. 1449 – 1455 . OpenUrl PubMed 5. ↵ Smolen , J.S. , Landewé , R.B. , Bergstra , S.A. , et al. , 2023 . EULAR recommendations for the management of rheumatoid arthritis with synthetic and biological disease-modifying antirheumatic drugs: 2022 update . Annals of the rheumatic diseases , 82 ( 1 ), pp. 3 – 18 . OpenUrl Abstract / FREE Full Text 6. ↵ Curtis , J. R. , Baddley , J. W. , Yang, et al. ( 2011 ). Derivation and preliminary validation of an administrative claims-based algorithm for the effectiveness of medications for rheumatoid arthritis . Arthritis research & therapy , 13 ( 5 ), R155 . OpenUrl 7. Lin , C. , Karlson , E.W. , Canhao , H. , Miller , T.A. , Dligach , D. , Chen , P.J. , Perez , R.N.G. , Shen , Y. , Weinblatt , M.E. , Shadick , N.A. and Plenge , R.M ., 2013 . Automatic prediction of rheumatoid arthritis disease activity from the electronic medical records . PloS one , 8 ( 8 ), p. e69932 . OpenUrl CrossRef PubMed 8. ↵ Spencer , A.K. , Bandaria , J. , Leavy , M.B. , et al. , 2021 . Validation of a machine learning approach to estimate Clinical Disease Activity Index Scores for rheumatoid arthritis . RMD open , 7 ( 3 ), p. e001781 . OpenUrl Abstract / FREE Full Text 9. ↵ Feldman , C.H. , Yoshida , K. , Xu , Cet al , 2019 . Supplementing claims data with electronic medical records to improve estimation and classification of rheumatoid arthritis disease activity: a machine learning approach . ACR Open Rheumatology , 1 ( 9 ), pp. 552 – 559 . OpenUrl PubMed 10. ↵ Norgeot , B. , Glicksberg , B.S. , Trupin , L. , et al. , 2019 . Assessment of a deep learning model based on electronic health record data to forecast clinical outcomes in patients with rheumatoid arthritis . JAMA network open , 2 ( 3 ), pp. e190606 – e190606 . OpenUrl 11. ↵ Curtis , J.R. , Su , Y. , Black , S. , et al. , 2022 . 
Machine Learning Applied to Patient-Reported Outcomes to Classify Physician-Derived Measures of Rheumatoid Arthritis Disease Activity . ACR Open Rheumatology , 4 ( 12 ), pp. 995 – 1003 . OpenUrl PubMed 12. ↵ Koo , B.S. , Eun , S. , Shin , K. , et al. , 2021 . Machine learning model for identifying important clinical features for predicting remission in patients with rheumatoid arthritis treated with biologics . Arthritis Research & Therapy , 23 , pp. 1 – 10 . OpenUrl CrossRef PubMed 13. ↵ Iannaccone , C.K. , Lee , Y.C. , Cui , J. , et al. , 2011 . Using genetic and clinical data to understand response to disease-modifying anti-rheumatic drug therapy: data from the Brigham and Women’s Hospital Rheumatoid Arthritis Sequential Study . Rheumatology , 50 ( 1 ), pp. 40 – 46 . OpenUrl CrossRef PubMed Web of Science 14. ↵ Mikuls , T.R. , Baker , J.F. , Cannon , G.W. , et al. , 2025 , February. The Veterans Affairs Rheumatoid Arthritis Registry: A unique population in rheumatoid arthritis research. In Seminars in Arthritis and Rheumatism (Vol. 70, p. 152580). WB Saunders . 15. ↵ Solomon , D.H. , Reed , G.W. , Kremer , J.M. , et al. , 2015 . Disease activity in rheumatoid arthritis and the risk of cardiovascular events . Arthritis & rheumatology , 67 ( 6 ), pp. 1449 – 1455 . OpenUrl PubMed 16. ↵ Navarro-Millán , I. , Yang , S. , DuVall , S.L. , et al. , 2016 . Association of hyperlipidaemia, inflammation and serological status and coronary heart disease among patients with rheumatoid arthritis: data from the National Veterans Health Administration . Annals of the rheumatic diseases , 75 ( 2 ), pp. 341 – 347 . OpenUrl Abstract / FREE Full Text 17. ↵ Iannaccone , C.K. , Lee , Y.C. , Cui , J. , et al. , 2011 . Using genetic and clinical data to understand response to disease-modifying anti-rheumatic drug therapy: data from the Brigham and Women’s Hospital Rheumatoid Arthritis Sequential Study . Rheumatology , 50 ( 1 ), pp. 40 – 46 . OpenUrl CrossRef PubMed Web of Science 18. 
↵ Mikuls , T.R. , Baker , J.F. , Cannon , G.W. , et al. , 2025 , February. The Veterans Affairs Rheumatoid Arthritis Registry: A unique population in rheumatoid arthritis research. In Seminars in Arthritis and Rheumatism (Vol. 70, p. 152580). WB Saunders . 19. ↵ England , B.R. , Tiong , B.K. , Bergman , M.J. , et al. , 2019 . 2019 update of the American College of Rheumatology recommended rheumatoid arthritis disease activity measures . Arthritis care & research , 71 ( 12 ), pp. 1540 – 1555 . OpenUrl PubMed 20. ↵ Xiong , X. , Sweet , S.M. , Liu , M. , et al. , 2023 . Knowledge-driven online multimodal automated phenotyping system. medRxiv , pp.2023-09. 21. ↵ Hong , C. , Rush , E. , Liu , M. , et al. , 2021 . Clinical knowledge extraction via sparse embedding regression (KESER) with multi-center large scale electronic health record data . NPJ digital medicine , 4 ( 1 ), p. 151 . OpenUrl PubMed 22. https://phewascatalog.org/phewas/#home 23. https://hcup-us.ahrq.gov/ 24. ↵ https://hcup-us.ahrq.gov/toolssoftware/ccs_svcsproc/ccssvcproc.jsp 25. ↵ Wei , W.Q. , Bastarache , L.A. , Carroll , R.J. , et al. , 2017 . Evaluating phecodes, clinical classification software, and ICD-9-CM codes for phenome-wide association studies in the electronic health record . PloS one , 12 ( 7 ), p. e0175508 . OpenUrl CrossRef PubMed 26. ↵ Laan , M.V.D. , Polley , E. and Hubbard , A ., 2007 . Super Learner Statistical Applications in Genetics and Molecular Biology . Super learner. Statis-tical applications in genetics and molecular biology , 6 . 27. ↵ Wright , M.N. and Ziegler , A ., 2015 . ranger: A fast implementation of random forests for high dimensional data in C++ and R . arXiv preprint arXiv:1508.04409. 28. ↵ Chen , T ., 2015 . Xgboost: extreme gradient boosting . R package version 0.4-2, 1(4). 29. ↵ Kuhn M , Falbel D ( 2024 ). brulee: High-Level Modeling Functions with ’torch’. R package version 0.3.0 , https://brulee.tidymodels.org/ , https://github.com/tidymodels/brulee . 30. 
↵ Lundberg , S.M. and Lee , S.I ., 2017 . A unified approach to interpreting model predictions . Advances in neural information processing systems , 30 . 31. ↵ Crowson , C.S. , Rollefstad , S. , Ikdahl , E. , et al. , 2018 . Impact of risk factors associated with cardiovascular outcomes in patients with rheumatoid arthritis . Annals of the rheumatic diseases , 77 ( 1 ), pp. 48 – 54 . OpenUrl Abstract / FREE Full Text 32. ↵ Liao , K.P. , Liu , J. , Lu , B. , et al. , 2015 . Association between lipid levels and major adverse cardiovascular events in rheumatoid arthritis compared to non–rheumatoid arthritis patients . Arthritis & Rheumatology , 67 ( 8 ), pp. 2004 – 2010 . OpenUrl PubMed 33. ↵ Feldman , C.H. , Yoshida , K. , Xu , C. , et al. , 2019 . Supplementing claims data with electronic medical records to improve estimation and classification of rheumatoid arthritis disease activity: a machine learning approach . ACR Open Rheumatology , 1 ( 9 ), pp. 552 – 559 . OpenUrl PubMed 34. ↵ Sauer , B.C. , Teng , C.C. , Accortt , N.A. , et al. , 2017 . Models solely using claims-based administrative data are poor predictors of rheumatoid arthritis disease activity . Arthritis Research & Therapy , 19 ( 1 ), p. 86 . OpenUrl PubMed 35. ↵ Crowson , C.S. , Rollefstad , S. , Ikdahl, E.,et al., 2018 . Impact of risk factors associated with cardiovascular outcomes in patients with rheumatoid arthritis . Annals of the rheumatic diseases , 77 ( 1 ), pp. 48 – 54 . OpenUrl Abstract / FREE Full Text 36. ↵ Cheng , D. , Ananthakrishnan , A.N. and Cai , T ., 2021 . Robust and efficient semi-supervised estimation of average treatment effects with application to electronic health records data . Biometrics , 77 ( 2 ), pp. 413 – 423 . OpenUrl CrossRef PubMed 37. ↵ Rasmy , L. , Xiang , Y. , Xie , Z. , et al. , 2021 . Med-BERT: pretrained contextualized embeddings on large-scale structured electronic health records for disease prediction . NPJ digital medicine , 4 ( 1 ), p. 86 . 
OpenUrl CrossRef PubMed 38. ↵ Burns , M.L. , Chen , S.Y. , Tsai , C.A. , et al. , 2025 . Generative AI costs in large healthcare systems, an example in revenue cycle . NPJ Digital Medicine , 8 ( 1 ), p. 579 . OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted November 17, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Inferring rheumatoid arthritis disease activity status from the electronic health records across health systems to enable real-world data studies Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Inferring rheumatoid arthritis disease activity status from the electronic health records across health systems to enable real-world data studies David Cheng , Xuan Wang , Gregory C. McDermott , Jennifer S. Hanberg , Zoe Love , Katherine Zhong , Mary Jeffway , Jue Hou , Vidul Panickan , Rahul Sangar , Ying Qi , Connor Melley , Lauren Costa , Dakota Feil , Rachael Matty , Dana Weisenfeld , Abisayo Animashaun , Aimee Schreiner , Sara Morini , Lauren Rusnak , Andrew Cagan , Misti Paudel , J. Michael Gaziano , Brian Sauer , Michael Weinblatt , Joshua Baker , Bryant England , Yuk-Lam Ho , Kelly Cho , Paul Monach , Grant W. Cannon , Nancy Shadick , Ted R. Mikuls , Tianxi Cai , Katherine P. 
Liao medRxiv 2025.11.13.25340003; doi: https://doi.org/10.1101/2025.11.13.25340003 Share This Article: Copy Citation Tools Inferring rheumatoid arthritis disease activity status from the electronic health records across health systems to enable real-world data studies David Cheng , Xuan Wang , Gregory C. McDermott , Jennifer S. Hanberg , Zoe Love , Katherine Zhong , Mary Jeffway , Jue Hou , Vidul Panickan , Rahul Sangar , Ying Qi , Connor Melley , Lauren Costa , Dakota Feil , Rachael Matty , Dana Weisenfeld , Abisayo Animashaun , Aimee Schreiner , Sara Morini , Lauren Rusnak , Andrew Cagan , Misti Paudel , J. Michael Gaziano , Brian Sauer , Michael Weinblatt , Joshua Baker , Bryant England , Yuk-Lam Ho , Kelly Cho , Paul Monach , Grant W. Cannon , Nancy Shadick , Ted R. Mikuls , Tianxi Cai , Katherine P. Liao medRxiv 2025.11.13.25340003; doi: https://doi.org/10.1101/2025.11.13.25340003 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Rheumatology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6600) Geriatric Medicine (668) Health Economics (997) Health Informatics (4536) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (541) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3332) Ophthalmology (974) Orthopedics 
(369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9232) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00c928d1e2b52ad',t:'MTc3OTYyODU2MQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00