Full text
48,929 characters
· extracted from
preprint-html
· click to expand
Real-world data from the Japanese National Health Insurance System enable fine phenotyping in a 14K-scale population-based study | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Real-world data from the Japanese National Health Insurance System enable fine phenotyping in a 14K-scale population-based study View ORCID Profile Yayoi Otsuka-Yamasaki , Naoyuki Nishiya , Yoichi Sutoh , Reiko Aoyama , Kozo Tanno , Motoki Nakao , Shohei Komaki , Shiori Minabe , Hideki Ohmomo , Koichi Asahi , Yasushi Ishigaki , Makoto Sasaki , Atsushi Shimizu doi: https://doi.org/10.1101/2025.10.06.25337397 Yayoi Otsuka-Yamasaki 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 2 Division of Biomedical Information Analysis, Institute for Biomedical Sciences, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Yayoi Otsuka-Yamasaki Naoyuki Nishiya 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 3 Division of Integrated Information for Pharmaceutical Sciences, Department of Clinical Pharmacy, School of Pharmacy, Iwate Medical University 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yoichi Sutoh 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 2 Division of Biomedical Information Analysis, Institute for Biomedical Sciences, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Reiko Aoyama 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kozo Tanno 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 4 Department of Hygiene and Preventive Medicine, School of Medicine, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Motoki Nakao 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 5 Department of Pharmacology, School of Medicine, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 6 Department of Cardiovascular Medicine, Faculty of Medicine and Graduate School of Medicine, Hokkaido University , Kita 15, Nishi 7, Kita-ku, Sapporo 060-8638, Hokkaido, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Shohei Komaki 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 2 Division of Biomedical Information Analysis, Institute for Biomedical Sciences, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Shiori Minabe 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 2 Division of Biomedical Information Analysis, Institute for Biomedical Sciences, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Hideki Ohmomo 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 2 Division of Biomedical Information Analysis, Institute for Biomedical Sciences, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Koichi Asahi 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 7 Division of Nephrology and Hypertension, Department of Internal Medicine, School of Medicine, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yasushi Ishigaki 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 8 Division of Diabetes, Metabolism and Endocrinology, Department of Internal Medicine, School of Medicine, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Makoto Sasaki 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 9 Division of Ultrahigh field MRI, Institute for Biomedical Sciences, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site Atsushi Shimizu 1 Iwate Tohoku Medical Megabank Organization, Disaster Reconstruction Center, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan 2 Division of Biomedical Information Analysis, Institute for Biomedical Sciences, Iwate Medical University , 1-1-1, Idaidori, Yahaba 028-3694, Iwate, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: ashimizu{at}iwate-med.ac.jp Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract The use of medical databases, known as real-world data (RWD), enables accurate and efficient estimation of disease prevalence in population-based studies, making it a potential game-changer in epidemiology. However, the lack of standardized data formats across hospitals complicates integration across institutions. In Japan, health insurance claims data are standardized under a nationally unified format, providing a reliable source of structured RWD. We evaluated the utility of insurance claims data in epidemiological research. Incorporating both diagnosis and prescription information into case definitions resulted in four- and six-fold increases in the estimated prevalence of Alzheimer’s disease (AD) and Parkinson’s disease (PD), respectively, compared with conventional self-reported definitions. Subsequent genome-wide association studies (GWAS) for AD showed increased model log-likelihood and identified a characteristic APOE signal, findings observed only with extended case definitions. The APOE effect size was consistent with large case–control studies, while standard errors remained comparable to smaller studies. These results indicate that claims-based phenotyping improves case identification without loss of accuracy and supports scalable approaches for genomic epidemiology and public health surveillance. Patient consent statement All participants provided written informed consent and the study protocol was approved by the Institutional Review Board of Iwate Medical University (Approval number HG2021-009) Permission to reproduce material from other sources No materials requiring permission from other sources have been used in this manuscript. Clinical trial registration This study does not involve interventional components and therefore was not registered as a clinical trial. Introduction The accurate estimation of disease prevalence within a community is crucial for public health. However, challenges remain. Recent advances in medical databases—such as electronic health records and health insurance claims data, collectively referred to as real-world data (RWD)—provide opportunities to enhance the accuracy, efficiency, and frequency of prevalence estimation in population-based studies. These developments also enable the scale-up of surveys and inform public health decision-making ( Liu & Panagiotakos, 2022 ; Swift et al., 2018 ). Furthermore, the integration of RWD with large-scale genome cohorts increasingly contributes to the generation of epidemiological evidence that informs genomic research ( Briggs et al., 2022 ). Nonetheless, technical challenges remain. Although diagnoses recorded in electronic health records are generally reliable, the lack of standardized data formats across hospitals in many regions limit the integration of data across institutions ( Hiramatsu et al., 2021 ). Insurance claims data offer a standardized format that facilitates integration across institutions, particularly in countries with universal health coverage, such as Japan. In Japan, national health insurance claims are standardized across diverse healthcare providers and administered by local governments ( Shida et al., 2023 ). The Japanese health insurance system consists of two major schemes: the National Health Insurance (NHI), which covers individuals aged ≤74 years who are not enrolled in employee insurance, and the Late-stage Medical Care System for the Elderly (LsMCSE), which covers individuals aged ≥75 years ( Ministry of Health, Labour and Welfare., 2025 ). Individuals are typically insured through their local municipal governments. Both systems provide structured diagnostic and prescription claims data, consequently enabling the retrieval and integration of health data from multiple regions under appropriate legal and ethical frameworks ( Fujinaga & Fukuoka, 2022 ). The Tohoku Medical Megabank Community Cohort (TMM CommCohort) is a large-scale population-based cohort study established to monitor public health in areas affected by the 2011 Great East Japan Earthquake ( Kuriyama et al., 2016 ). This cohort obtained health insurance claims data through agreements with participating local governments. In this context, diagnosis and prescription data from claims can supplement medical histories based on self-administered questionnaires (SAQs) and serve as proxies for clinical diagnoses. In this study, we evaluated the incorporation of diagnosis and prescription data from insurance claims into conventional case–control diagnostic definitions. The prevalence of hypertension, dyslipidemia, diabetes, Parkinson’s disease (PD), and Alzheimer’s disease (AD) is expected to increase in aging societies. Therefore, we aimed to determine whether incorporating diagnosis and prescription data from insurance claims could improve case– control definitions for these common diseases. We further sought to evaluate the accuracy of phenotyping using an extended case definition incorporating claims-based diagnosis and prescription records in the context of genome-wide association studies (GWAS). These findings may facilitate the development of more scalable strategies in genomic epidemiology and public health research. In addition, they highlight the potential value of a feedback cycle in which claims data inform both public health action and evidence-based policy development for insured individuals. Materials and Methods Population The design of the TMM CommCohort study was described previously ( Kuriyama et al., 2016 ). In the present study, we included participants from the TMM CommCohort study who held insurance from the NHI or LsMCSE and resided in Iwate Prefecture. Individuals with missing genotypic, phenotypic, and claims data, or other data necessary for analysis, were excluded from the study. Participants were recruited during specific health checkups or through visits to cohort assessment centers between May 2013 and March 2016. Medical history (e.g., hypertension, dyslipidemia, diabetes, AD, and PD diagnoses) and treatment status for selected conditions (hypertension, dyslipidemia, and diabetes) were collected using SAQs. Systolic and diastolic blood pressure were measured twice at recruitment, and the average was used for analysis. Hemoglobin A1c (HbA1c), high-(HDL) and low-density lipoprotein (LDL) cholesterol, and triglyceride levels were measured in blood samples collected at recruitment. National health insurance claims data, including medical, pharmacy, and Diagnosis Procedure Combination receipts, were collected annually by electronically collating with participant information from the TMM CommCohort study in Iwate. This process enabled the construction of a claims-based dataset for TMM participants in Iwate. Personally identifiable information and non-research related elements were excluded to ensure strict data minimization and privacy protection. The claims data are structured in a standard format used across Japan’s public health insurance systems. Clinical endpoints were defined by a combination of disease and medication. This study adhered to the principles of the Declaration of Helsinki, and written informed consent was obtained from all participants. All individual data were anonymized before analysis. The study was approved by the Institutional Review Board of Iwate Medical University (Approval number HG H25-2). Analyses involving personal genotypic or phenotypic information were performed on a standalone computing system—the Tohoku University Tohoku Medical Megabank Organization supercomputer. Genotyping and GWAS Genotyping and genotype imputation were performed as previously described ( Minegishi et al., 2019 ; Sutoh et al., 2025 ; Yasuda et al., 2019 ). In the hard-call genotype data, variants that met any of the following criteria were excluded: low call rate (<0.99), low heterozygous allele count (<5), deviation from Hardy–Weinberg equilibrium ( P < 1 × 10-6), low minor allele frequency (<0.01), or location within regions of strong linkage disequilibrium ( Price et al., 2008 ). Variant filtering was conducted using PLINK v1.9 (version b5.1) ( Chang et al., 2015 ). Linkage disequilibrium pruning was performed with PLINK v2.0 (version a3LM) using the option “--indep-pairwise 1500 150 0.03”. Principal component analysis was conducted in PLINK v2.0 using the “--pca approx allele-wts” option. One individual from each kinship pair (kinship > 0.0884) was excluded based on the KING method to account for relatedness ( Manichaikul et al., 2010 ). The resulting principal components were then projected onto all individuals, including those excluded based on the kinship threshold. Case–control GWAS were conducted using SAIGE v0.44.6.5 ( Zhou et al., 2018 ). Individuals with a low call rate (<0.99) or extreme genetic ancestry, defined as values of PC1 or PC2 deviating by ≥4 standard deviations from the mean, were excluded. In addition, imputed variants with low imputation quality ( R 2 < 0.3) or allele frequency (<0.01) were excluded from the analysis. Statistical analysis Statistical analyses were performed using R versions 4.1.0 and 4.3.2. Meta-analysis was performed with the R package "meta" (version 8.0-2) ( Schwarzer et al., 2015 ). Use of artificial intelligence tools During the preparation of this work, the authors used ChatGPT 4o (OpenAI, version 4o, including Deep Research mode) to assist in reviewing and summarizing previous studies and improve the computer scripts for data analysis. The authors subsequently verified and edited all content generated by this tool and take full responsibility for the integrity and accuracy of the final manuscript. Results Retrieval of health insurance claims data and phenotyping A workflow of the study is shown in Figure 1 . Download figure Open in new tab Figure 1. The workflow in this study. The study participants had hard-called and imputed genotype data, as well as cohort information including health insurance claims, demographic characteristics (age and sex), laboratory test results, and responses to pre-specified items in a self-administered questionnaire (SAQ). The real-world data used in this study included disease diagnoses and prescription histories derived from medical, pharmacy, and Diagnosis Procedure Combination claims data. Phenotyping was performed using either conventional definitions based on SAQs and laboratory tests or extended definitions incorporating health insurance claims data. HWE, Hardy-Weinberg equilibrium; LD, linkage disequilibrium; MAF, minor allele frequency; PC, principal component; SD, standard deviation; V, variants. Health insurance claims data on diagnosis and drug prescriptions were obtained for 19,880 individuals (7,638 men and 12,242 women) ( Figure 1 ). Of these, 14,241 participants met the cohort information and genotype data requirements. Claims data for hypertension, diabetes, and dyslipidemia were retrieved from the NHI system and covered the period between June 2013 and December 2019. The claims data for AD and PD were collected between June 2013 and December 2020 and included data from both the NHI and LsMCSE. Participants were classified into case and control groups using two criteria: (1) the conventional case definition based on SAQs and laboratory tests used in the cohort study, and (2) an extended case definition incorporating both disease names recorded in claims and drug prescription data, in addition to (1) ( Table 1 ). View this table: View inline View popup Table 1. Case definition and prescription drugs for target outcomes. The incorporation of claims data into phenotyping increased the prevalence of all target outcomes by 1.0–6.3% ( Table 2 ). Common lifestyle-related diseases such as hypertension, diabetes, and dyslipidemia showed modest increases of 4.0%, 1.0%, and 6.3%, respectively. In contrast, the prevalence of AD and PD increased by more than six- and four-fold, respectively. View this table: View inline View popup Table 2. Characteristics and prevalence by phenotype. GWAS The effects of different phenotyping methods on GWAS results were confirmed ( Figure 2A ). Download figure Open in new tab Figure 2. Genome-wide association studies (GWAS) using two case definitions. GWAS were performed using two different case definitions: (1) the conventional method based on SAQs and laboratory tests (light-colored Manhattan plot; most of the plot is hidden), and (2) the extended method incorporating health insurance claims data (dark-colored Manhattan plot) (A). Gene names near genome-wide significant variants are shown in italics; those identified only in the GWAS using definition (2) are shown in bold italics. The red and blue dashed horizontal lines indicate the thresholds for genome-wide significance ( P = 5 × 10 -8 ) and suggestive significance ( P = 5 × 10 -5 ), respectively. λc/λp denote genomic inflation factors for the GWAS based on case definitions (1) and (2), respectively; 95% confidence intervals are shown in square brackets. Comparison of the sum of log-likelihoods (log L) from GWAS summary statistics between the two case definitions (B). The number of variants and sample sizes were identical between the two analyses. Effect sizes and directions of significant variants identified in GWAS using the two different case definitions were compared (C). Variants are color-coded by chromosome. Bars and points represent the standard errors and mean of the beta values, respectively. The dotted line represents the diagonal. Several genome-wide significant signals ( P < 5 × 10 -8 ) were detected for hypertension, dyslipidemia, diabetes, and AD ( Figure 2A and Supplementary Tables 1–7). ALDH2 and KCNQ1 were identified in the GWAS for hypertension and diabetes. Both genes have been reported for blood pressure and HbA1c, respectively, in previous GWAS ( Sakaue et al., 2021 ). Known lipid-associated loci such as APOB, LPL, TRIB1, APOA5, LDLR , and APOE were replicated in the GWAS for dyslipidemia, consistent with prior findings on LDL HDL, total cholesterol, and triglyceride levels. In contrast, association signals at 5p13.3 and the motilin gene locus were detected only in the GWAS based on the extended case definition ( Figure 2A ). These two genes have been detected in the LDL cholesterol or triglyceride GWAS of the TMM CommCohort study, which included 67K participants (Study ID: TGA000007, https://jmorp.megabank.tohoku.ac.jp/gwas-studies/ ) and Asian populations ( Chen et al., 2023 ; Hsu et al., 2025 ; Ismail Umlai et al., 2025 ). The lead variant rs4704208 in the 5p13.3 region identified in this study is an expression quantitative trait locus for the 3-hydroxy-3-methylglutaryl-CoA reductase gene ( Figure 2A ), as determined according to the GTEx database ( https://www.gtexportal.org/home/ ). This enzyme is the pharmacological target of statins, which are widely prescribed for the treatment of dyslipidemia. The GWAS for AD using the extended case definitions revealed significant signals in APOE , whereas these signals were not detected in the GWAS based on conventional phenotyping ( Figure 2A and Supplementary Table 7). The log-likelihood values, which indicate model fit, increased with the use of the extended definition for AD ( Figure 2B ). The effect sizes and directions of significant variants were consistent across the different phenotyping definitions ( Figure 2C ). For AD, using the extended case definition yielded smaller standard errors of effect size estimates than the conventional definition, indicating improved estimation precision. Comparison with previous studies The rs429358 C allele, a missense variant in APOE that results in a cysteine-to-arginine substitution, is classified as a risk and/or pathogenic allele for AD (ClinVar accession: VCV000017864.36). This variant was replicated in the present GWAS for AD ( Figure 2A and Supplementary Table 7). In total, 976 studies reported rs429358-associated hits in the GWAS catalog ( Cerezo et al., 2025 ). Of these, seven that reported associations with AD ( Moreno-Grau et al., 2019 ; Wang et al., 2021 ; Nazarian et al., 2019 ; Hong et al., 2020 ; Akgun et al., 2024 ; Harper et al., 2022 ; Li et al., 2021 ) (Supplementary Table 8), mostly based on clinically diagnosed cases using established guidelines, were comparable to the present study ( Figure 3 ). Download figure Open in new tab Figure 3. Comparison with previous results. Effect sizes of rs429358 were compared with those reported in previous studies registered in the GWAS Catalog ( Cerezo et al., 2025 ) (A). Studies associated with the trait “Alzheimer’s disease” were selected. The effect size for each cohort (or subgroup) was used when available for analysis. The study design is indicated by color: magenta for population-based studies, black for case–control studies, and cyan for other designs (including meta-analyses and family-based studies). A funnel plot (B) and Egger’s test (C) were used to assess potential publication bias. The linear regression test for funnel plot asymmetry yielded an intercept of -3.49 ± 1.52 ( P = 0.0484). HRS, University of Michigan Health and Retirement Study; CHS, Cardiovascular Health Study; LOADFS, Late-Onset Alzheimer’s Disease Family Study. The odds ratio estimated in our study was 3.30 (95% CI: 2.26–4.82). This is consistent with estimates from larger-scale studies (number of cases > 4000; 3.21 (3.02–3.41) reported by Moreno-Grau et al ( Moreno-Grau et al., 2019 ) and Wang et al ( Wang et al., 2021 ) (3.25 (2.94–3.58) and 3.66 (3.37–3.97)) ( Figure 3A , B). Moreover, the precision in our study was similar to that in studies with smaller cases (number of cases < 500), such as those by Nazarian et al ( Nazarian et al., 2019 ) and Hong et al ( Hong et al., 2020 ) ( Figure 3C ). Egger’s test revealed a significant asymmetry (-3.49 ± 1.52; P = 0.0484) in the effect size estimates of these GWAS. Discussion According to the NIA-AA research framework ( Jack et al., 2018 ), a biological definition of AD requires costly and time-consuming assessments, such as biomarker testing using positron emission tomography or cerebrospinal fluid analysis. However, increasing case numbers and frequency of follow-up limit the practical application of these approaches. The more than six-fold increase in the estimated prevalence of AD demonstrates the effectiveness of RWD-based phenotyping in community-based cohort studies. In the present study, the estimated prevalence of AD in the cohort was 1.3%. By 2025, the prevalence of dementia is estimated to be 1.10% (95% CI: 1.07–1.17) in men and 3.47% (95% CI: 3.41–3.52) among women aged 60–74 years in Japan ( Kasajima et al., 2022 ). As women comprised 61.5% of our dataset, the expected overall prevalence of dementia was approximately 2.56%, calculated as 0.615 × 0.0347 + 0.386 × 0.011. Assuming that AD accounts for approximately two-thirds of all dementia cases in Japan ( Ikeda et al., 2021 ; Montgomery et al., 2018 ), its estimated prevalence would be approximately 1.7%, resulting in a smaller gap with the estimate derived using RWD. However, our dataset included claims data from the LsMCSE, thereby covering individuals aged 75 years and older. Because the prevalence of AD increases substantially with age ( Kasajima et al., 2022 )— especially among individuals aged ≥75—the inclusion of older adults may increase the overall prevalence. Nevertheless, given that the participants were followed up for 7 years, and most of the cohort remained under 75 years old, the observed prevalence of 1.3% may still represent a reasonable estimate. Case–control classification based on RWD also affected the GWAS results for AD. The increased log-likelihood values suggest that this updated phenotyping approach may reduce the effect of unmeasured confounders that are not included in the regression models. The replicated detection of the APOE variant rs429358, which has been identified as an AD risk allele in several studies, further supports the validity of this approach. The odds ratio estimated in the present study was similar to those reported in large-scale clinical or meta-analytic GWAS with more than 4,000 cases. However, the standard error remained comparable to those from studies with similar case numbers. These findings suggest that the revised case definition improved case identification while maintaining accuracy. The prevalence of PD also increased when claims data were included. However, no significant improvement was observed in the GWAS results. Although a few studies have reported the prevalence of PD in Japan, regional estimates for East Asia suggested an all-age prevalence of 0.357% (95% CI: 0.287–0.428) in 2021, which is estimated to increase to 0.821% (95% CI: 0.674–0.995) by 2050 ( Su et al., 2025 ). The estimated prevalence of 0.7% in the present cohort, which is predominantly older, is consistent with these projections. A limitation of this study is the potential selection bias due to the characteristics of the insured population. This study included claims data from the NHI system, which covers approximately 30% of the Japanese population, and from the LsMCSE system for those aged 75 years and older. Therefore, it does not represent the entire population ( Shigeoka, 2014 ). Additionally, we could not directly compare the phenotypes estimated from prescription data with clinical diagnoses made in hospitals. This was due to ethical constraints and the substantial human effort required to collect diagnosis data from the hospitals where cohort participants were treated. However, we plan to address this limitation in future research. In conclusion, this study demonstrates that incorporating RWD into case definitions can improve the efficiency of epidemiological research in population-based studies while maintaining accuracy. These approaches may improve public health surveillance and contribute to the advancement of precision medicine. Data Availability The GWAS summaries have been published in jMorp ( https://jmorp.megabank.tohoku.ac.jp/gwas-studies/TGA000016 ). The datasets of individuals in the TMM CommCohort study analyzed here are not publicly available owing to ethical reasons, such as the protection of personal information and prevention of unintended personal identification. However, they are accessible upon request, subject to approval from the Ethics Committee of Iwate Medical University and the Materials and Information Distribution Review Committee of the TMM Project. Those interested in obtaining the datasets for individuals may contact the corresponding author. Data Availability Statement The GWAS summaries have been published in jMorp ( https://jmorp.megabank.tohoku.ac.jp/gwas-studies/TGA000016 ). The datasets of individuals in the TMM CommCohort study analyzed here are not publicly available for ethical reasons, such as protecting personal information and preventing unintended personal identification. However, they are available from the corresponding author upon request, subject to approval from the Ethics Committee of Iwate Medical University and the Materials and Information Distribution Review Committee of the TMM Project. Funding This study was supported by the Tohoku Medical Megabank (TMM) Project of the Ministry of Education, Culture, Sports, Science and Technology (MEXT), and by the Japan Agency for Medical Research and Development (AMED) under grant number JP23tm0124006, as well as by research funding from IQVIA. YO-Y and AS were supported by the Japan Society for the Promotion of Science (JSPS) KAKENHI under grant number 23K09696. Conflict of interest disclosure The authors declare no competing interests. Ethical approval This study was approved by the Institutional Review Board of Iwate Medical University (Approval number HG2021-009). Supplementary information Supplemental_Tables.xlsx. The Excel file includes Supplementary Tables 1–8. Author contributions Conceptualization: YO-Y, YS, AS; Methodology: YO-Y, YS, AS; Investigation: NN, RA, KT; Formal analysis: YO-Y, NN, YS; Data curation: YO-Y, NN, YS, KT, AS; Writing—original draft: YO-Y, NN, YS; Writing—review & editing: YO-Y, NN, YS, KT, MN, SK, SM, HO, KA, YI, MS, AS. Acknowledgements This research utilized the supercomputer system and dbTMM provided by the Tohoku Medical Megabank Project, supported by AMED under grant numbers JP21tm0424601 and JP21tm0124005. Footnotes ↵ * These authors should be considered joint first authors. The manuscript has been updated. Specifically, the description of the claims data structure in the Methods section has been clarified, supplementary tables have been added to facilitate interpretation of the results, and minor typographical errors have been corrected. The main results and conclusions remain unchanged. References ↵ Akgun , B. , Feliciano-Astacio , B. E. , Hamilton-Nelson , K. L. , Scott , K. , Rivero , J. , Adams , L. D. , et al. ( 2024 ). Genome-wide association analysis and admixture mapping in a Puerto Rican cohort supports an Alzheimer disease risk locus on chromosome 12 . Frontiers in Aging Neuroscience , 16 , 1459796 . doi: 10.3389/fnagi.2024.1459796 OpenUrl CrossRef PubMed ↵ Briggs , S. E. W. , Law , P. , East , J. E. , Wordsworth , S. , Dunlop , M. , Houlston , R. , et al. ( 2022 ). Integrating genome-wide polygenic risk scores and non-genetic risk to predict colorectal cancer diagnosis using UK Biobank data: population based cohort study . BMJ , 379 , e071707 . doi: 10.1136/bmj-2022-071707 OpenUrl Abstract / FREE Full Text ↵ Cerezo , M. , Sollis , E. , Ji , Y. , Lewis , E. , Abid , A. , Bircan , K. O. , et al. ( 2025 ). The NHGRI-EBI GWAS Catalog: standards for reusability, sustainability and diversity . Nucleic Acids Research , 53 , D998 – D1005 . doi: 10.1093/nar/gkae1070 OpenUrl CrossRef PubMed ↵ Chang , C. C. , Chow , C. C. , Tellier , L. C. , Vattikuti , S. , Purcell , S. M. , & Lee , J. J. ( 2015 ). Second-generation PLINK: rising to the challenge of larger and richer datasets . Gigascience , 4 , 7 . doi: 10.1186/s13742-015-0047-8 OpenUrl CrossRef PubMed ↵ Chen , C. Y. , Chen , T. T. , Feng , Y. A. , Yu , M. , Lin , S. C. , Longchamps , R. J. , et al. ( 2023 ). Analysis across Taiwan Biobank, Biobank Japan, and UK Biobank identifies hundreds of novel loci for 36 quantitative traits . Cell Genomics , 3 , 100436 . doi: 10.1016/j.xgen.2023.100436 OpenUrl CrossRef PubMed ↵ Fujinaga , J. , & Fukuoka , T . ( 2022 ). A review of research studies using data from the Administrative Claims Databases in Japan . Drugs - Real World Outcomes , 9 , 543 – 550 . doi: 10.1007/s40801-022-00331-5 OpenUrl CrossRef PubMed ↵ Harper , J. D. , Fan , K.-H. , Aslam , M. M. , Snitz , B. E. , DeKosky , S. T. , Lopez , O. L. , et al. ( 2022 ). Genome-wide association study of incident dementia in a community-based sample of older subjects . Journal of Alzheimer’s Disease , 88 , 787 – 798 . doi: 10.3233/JAD-220293 OpenUrl CrossRef PubMed ↵ Hiramatsu , K. , Barrett , A. , Miyata , Y. , & PhRMA Japan Medical Affairs Committee Working Group 1 . ( 2021 ). Current status, challenges, and future perspectives of real-world data and real-world evidence in Japan . Drugs - Real World Outcomes , 8 , 459 – 480 . doi: 10.1007/s40801-021-00266-3 OpenUrl CrossRef PubMed ↵ Hong , S. , Prokopenko , D. , Dobricic , V. , Kilpert , F. , Bos , I. , Vos , S. J. B. , et al. ( 2020 ). Genome-wide association study of Alzheimer’s disease CSF biomarkers in the EMIF-AD Multimodal Biomarker Discovery dataset . Translational Psychiatry , 10 , 403 . doi: 10.1038/s41398-020-01074-z OpenUrl CrossRef PubMed ↵ Hsu , L. A. , Teng , M. S. , Duan , D. M. , Yeh , K. H. , Wu , S. , & Ko , Y. L . ( 2025 ). 3-Hydroxy-3-methylglutaryl-CoA reductase variants strongly associated with low-density lipoprotein cholesterol levels and diabetes mellitus risk in a Taiwanese population: a Mendelian randomization study . Tzu Chi Medical Journal , 37 , 181 – 188 . doi: 10.4103/tcmj.tcmj_247_24 OpenUrl CrossRef PubMed ↵ Ikeda , S. , Mimura , M. , Ikeda , M. , Wada-Isoe , K. , Azuma , M. , Inoue , S. , et al. ( 2021 ). Economic burden of Alzheimer’s disease dementia in Japan . Journal of Alzheimer’s Disease , 81 , 309 – 319 . doi: 10.3233/JAD-210075 OpenUrl CrossRef PubMed ↵ Ismail Umlai , U. K. , Toor , S. M. , Al-Sarraj , Y. A. , Mohammed , S. , Al Hail , M. S. H. , Ullah , E. , et al. ( 2025 ). A multi-ancestry genome-wide association study and evaluation of polygenic scores of LDL-C levels . Journal of Lipid Research , 66 , 100752 . doi: 10.1016/j.jlr.2025.100752 OpenUrl CrossRef ↵ Jack , C. R. , Jr. , Bennett , D. A. , Blennow , K. , Carrillo , M. C. , Dunn , B. , Haeberlein , S. B. , et al. ( 2018 ). NIA-AA Research Framework: toward a biological definition of Alzheimer’s disease . Alzheimer’s & Dementia , 14 , 535 – 562 . doi: 10.1016/j.jalz.2018.02.018 OpenUrl CrossRef PubMed ↵ Kasajima , M. , Eggleston , K. , Kusaka , S. , Matsui , H. , Tanaka , T. , Son , B. K. , et al. ( 2022 ). Projecting prevalence of frailty and dementia and the economic cost of care in Japan from 2016 to 2043: a microsimulation modelling study . The Lancet Public Health , 7 , e458 – e468 . doi: 10.1016/S2468-2667(22)00044-5 OpenUrl CrossRef ↵ Kuriyama , S. , Yaegashi , N. , Nagami , F. , Arai , T. , Kawaguchi , Y. , Osumi , N. , et al. ( 2016 ). The Tohoku Medical Megabank Project: design and mission . Journal of Epidemiology , 26 , 493 – 511 . doi: 10.2188/jea.JE20150268 OpenUrl CrossRef PubMed ↵ Li , L. , Yang , Y. , Zhang , Q. , Wang , J. , Jiang , J. , & Alzheimer’s Disease Neuroimaging Initiative . ( 2021 ). Use of deep-learning genomics to discriminate healthy individuals from those with Alzheimer’s disease or mild cognitive impairment . Behavioural Neurology , 14 , 3359103 . doi: 10.1155/2021/3359103 OpenUrl CrossRef ↵ Liu , F. , & Panagiotakos , D . ( 2022 ). Real-world data: a brief review of the methods, applications, challenges and opportunities . BMC Medical Research Methodology , 22 , 287 . doi: 10.1186/s12874-022-01768-6 OpenUrl CrossRef PubMed ↵ Manichaikul , A. , Mychaleckyj , J. C. , Rich , S. S. , Daly , K. , Sale , M. , & Chen , W. M . ( 2010 ). Robust relationship inference in genome-wide association studies . Bioinformatics , 26 , 2867 – 2873 . doi: 10.1093/bioinformatics/btq559 OpenUrl CrossRef PubMed Web of Science ↵ Minegishi , N. , Nishijima , I. , Nobukuni , T. , Kudo , H. , Ishida , N. , Terakawa , T. , et al. ( 2019 ). Biobank establishment and sample management in the Tohoku Medical Megabank Project . The Tohoku Journal of Experimental Medicine , 248 , 45 – 55 . doi: 10.1620/tjem.248.45 OpenUrl CrossRef PubMed ↵ Ministry of Health, Labour and Welfare . ( 2025 ). Overview of medical service regime in Japan . Retrieved from https://www.mhlw.go.jp/bunya/iryouhoken/iryouhoken01/dl/01_eng.pdf ↵ Montgomery , W. , Ueda , K. , Jorgensen , M. , Stathis , S. , Cheng , Y. , & Nakamura , T . ( 2018 ). Epidemiology, associated burden, and current clinical practice for the diagnosis and management of Alzheimer’s disease in Japan . ClinicoEconomics and Outcomes Research , 10 , 13 – 28 . doi: 10.2147/CEOR.S146788 OpenUrl CrossRef PubMed ↵ Moreno-Grau , S. , de Rojas , I. , Hernandez , I. , Quintela , I. , Montrreal , L. , Alegret , M. , et al. ( 2019 ). Genome-wide association analysis of dementia and its clinical endophenotypes reveal novel loci associated with Alzheimer’s disease and three causality networks: The GR@ACE project . Alzheimer’s & Dementia , 15 , 1333 – 1347 . doi: 10.1016/j.jalz.2019.06.4950 OpenUrl CrossRef PubMed ↵ Nazarian , A. , Yashin , A. I. , & Kulminski , A. M . ( 2019 ). Genome-wide analysis of genetic predisposition to Alzheimer’s disease and related sex disparities . Alzheimer’s Research & Therapy , 11 , 5 . doi: 10.1186/s13195-018-0458-8 OpenUrl CrossRef PubMed ↵ Price , A. L. , Weale , M. E. , Patterson , N. , Myers , S. R. , Need , A. C. , Shianna , K. V. , et al. ( 2008 ). Long-range LD can confound genome scans in admixed populations . American Journal of Human Genetics , 83 , 132 – 135 . doi: 10.1016/j.ajhg.2008.06.005 OpenUrl CrossRef PubMed Web of Science ↵ Sakaue , S. , Kanai , M. , Tanigawa , Y. , Karjalainen , J. , Kurki , M. , Koshiba , S. , et al. ( 2021 ). A cross-population atlas of genetic associations for 220 human phenotypes . Nature Genetics , 53 , 1415 – 1424 . doi: 10.1038/s41588-021-00931-x OpenUrl CrossRef PubMed ↵ Schwarzer , G. , Carpenter , J. R. , & Rücker , G . ( 2015 ). Meta-analysis with R . Springer International Publishing . doi: 10.1007/978-3-319-21416-0 OpenUrl CrossRef ↵ Shida , H. , Kajiyama , K. , Sawada , S. , Ishiguro , C. , Kubo , M. , Kimura , R. , et al. ( 2023 ). Use of National Database of Health Insurance Claims and Specific Health Checkups for examining practical utilization and safety signal of a drug to support regulatory assessment on postmarketing drug safety in Japan . Frontiers in Medicine , 10 , 1096992 . doi: 10.3389/fmed.2023.1096992 OpenUrl CrossRef ↵ Shigeoka , H. ( 2014 ). The effect of patient cost sharing on utilization, health, and risk protection . American Economic Review , 104 , 2152 – 2184 . Retrieved from https://www.jstor.org/stable/42920882 OpenUrl ↵ Su , D. , Cui , Y. , He , C. , Yin , P. , Bai , R. , Zhu , J. , et al. ( 2025 ). Projections for prevalence of Parkinson’s disease and its driving factors in 195 countries and territories to 2050: modelling study of Global Burden of Disease Study 2021 . BMJ , 388 , e080952 . doi: 10.1136/bmj-2024-080952 OpenUrl Abstract / FREE Full Text ↵ Sutoh , Y. , Hachiya , T. , Otsuka-Yamasaki , Y. , Komaki , S. , Minabe , S. , Ohmomo , H. , et al. ( 2025 ). Healthy lifestyle practice correlates with decreased obesity prevalence in individuals with high polygenic risk: TMM CommCohort study . Journal of Human Genetics , 70 , 9 – 15 . doi: 10.1038/s10038-024-01280-3 OpenUrl CrossRef PubMed ↵ Swift , B. , Jain , L. , White , C. , Chandrasekaran , V. , Bhandari , A. , Hughes , D. A. , et al. ( 2018 ). Innovation at the intersection of clinical trials and real-world data science to advance patient care . Clinical and Translational Science , 11 , 450 – 460 . doi: 10.1111/cts.12559 . OpenUrl CrossRef PubMed ↵ Wang , H. , Lo , M. T. , Rosenthal , S. B. , Makowski , C. , Andreassen , O. A. , Salem , R. M. , et al. ( 2021 ). Similar genetic architecture of Alzheimer’s disease and differential APOE effect between sexes . Frontiers in Aging Neuroscience , 13 , 674318 . doi: 10.3389/fnagi.2021.674318 OpenUrl CrossRef ↵ Yasuda , J. , Kinoshita , K. , Katsuoka , F. , Danjoh , I. , Sakurai-Yageta , M. , Motoike , I. N. , et al. ( 2019 ). Genome analyses for the Tohoku Medical Megabank Project towards establishment of personalized healthcare . Journal of Biochemistry , 165 , 139 – 158 . doi: 10.1093/jb/mvy096 OpenUrl CrossRef PubMed ↵ Zhou , W. , Nielsen , J. B. , Fritsche , L. G. , Dey , R. , Gabrielsen , M. E. , Wolford , B. N. , et al. ( 2018 ). Efficiently controlling for case–control imbalance and sample relatedness in large-scale genetic association studies . Nature Genetics , 50 , 1335 – 1341 . doi: 10.1038/s41588-018-0184-y OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 28, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Real-world data from the Japanese National Health Insurance System enable fine phenotyping in a 14K-scale population-based study Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Real-world data from the Japanese National Health Insurance System enable fine phenotyping in a 14K-scale population-based study Yayoi Otsuka-Yamasaki , Naoyuki Nishiya , Yoichi Sutoh , Reiko Aoyama , Kozo Tanno , Motoki Nakao , Shohei Komaki , Shiori Minabe , Hideki Ohmomo , Koichi Asahi , Yasushi Ishigaki , Makoto Sasaki , Atsushi Shimizu medRxiv 2025.10.06.25337397; doi: https://doi.org/10.1101/2025.10.06.25337397 Share This Article: Copy Citation Tools Real-world data from the Japanese National Health Insurance System enable fine phenotyping in a 14K-scale population-based study Yayoi Otsuka-Yamasaki , Naoyuki Nishiya , Yoichi Sutoh , Reiko Aoyama , Kozo Tanno , Motoki Nakao , Shohei Komaki , Shiori Minabe , Hideki Ohmomo , Koichi Asahi , Yasushi Ishigaki , Makoto Sasaki , Atsushi Shimizu medRxiv 2025.10.06.25337397; doi: https://doi.org/10.1101/2025.10.06.25337397 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4411) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15205) Forensic Medicine (30) Gastroenterology (1119) Genetic and Genomic Medicine (6574) Geriatric Medicine (666) Health Economics (994) Health Informatics (4511) Health Policy (1365) Health Systems and Quality Improvement (1608) Hematology (537) HIV/AIDS (1263) Infectious Diseases (except HIV/AIDS) (15903) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6573) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1139) Occupational and Environmental Health (954) Oncology (3319) Ophthalmology (968) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (662) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5422) Public and Global Health (9205) Radiology and Imaging (2191) Rehabilitation Medicine and Physical Therapy (1367) Respiratory Medicine (1191) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9fe9e5c5aa4b41e2',t:'MTc3OTI2NDk2OA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.