Pangenome-based identification of cryptic pathogenic variants in undiagnosed rare disease patients

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 71,519 characters · extracted from preprint-html · click to expand
Pangenome-based identification of cryptic pathogenic variants in undiagnosed rare disease patients | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Pangenome-based identification of cryptic pathogenic variants in undiagnosed rare disease patients View ORCID Profile Se Song Jang , View ORCID Profile Seoyeon Kim , View ORCID Profile Seungbok Lee , View ORCID Profile Soo Yeon Kim , View ORCID Profile Jangsup Moon , View ORCID Profile Jun Kim , View ORCID Profile Jong-Hee Chae doi: https://doi.org/10.1101/2025.07.08.25330875 Se Song Jang 1 Department of Pediatrics, Seoul National University College of Medicine, Seoul National 
University Children’s Hospital, 103, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea 2 Department of Genomic Medicine, Seoul National University Hospital, 101, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea 3 Biomedical Research Institute, Seoul National University Hospital, 71, Daehak-ro, Jongno-gu , Seoul 03082, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Se Song Jang Seoyeon Kim 4 Graduate School of Life Sciences, College of Bioscience and Biotechnology, Chungnam National University, 99, Daehak-ro, Yuseong-gu , Daejeon 34134, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Seoyeon Kim Seungbok Lee 1 Department of Pediatrics, Seoul National University College of Medicine, Seoul National University Children’s Hospital, 103, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea 2 Department of Genomic Medicine, Seoul National University Hospital, 101, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Seungbok Lee Soo Yeon Kim 1 Department of Pediatrics, Seoul National University College of Medicine, Seoul National University Children’s Hospital, 103, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea 2 Department of Genomic Medicine, Seoul National University Hospital, 101, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Soo Yeon Kim Jangsup Moon 1 Department of Pediatrics, Seoul National University College of Medicine, Seoul National University Children’s Hospital, 103, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea 5 Department of Neurology, Seoul National University Hospital, 101, Daehak-ro, Jongno-gu , Seoul 03082, Republic of Korea 
Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jangsup Moon Jun Kim 4 Graduate School of Life Sciences, College of Bioscience and Biotechnology, Chungnam National University, 99, Daehak-ro, Yuseong-gu , Daejeon 34134, Republic of Korea 6 Department of Convergent Bioscience and Informatics, College of Bioscience and Biotechnology, Chungnam National University, 99, Daehak-ro, Yuseong-gu , Daejeon 34134, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jun Kim For correspondence: chaeped1{at}snu.ac.kr junkim{at}cnu.ac.kr Jong-Hee Chae 1 Department of Pediatrics, Seoul National University College of Medicine, Seoul National University Children’s Hospital, 103, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea 2 Department of Genomic Medicine, Seoul National University Hospital, 101, Daehak-ro, Jongno-gu , Seoul 03080, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jong-Hee Chae For correspondence: chaeped1{at}snu.ac.kr junkim{at}cnu.ac.kr Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Background Despite widespread implementation of exome and genome sequencing, a substantial proportion of rare disease patients remain undiagnosed due to inherent limitations in detecting structural, repetitive, and regulatory variants. Methods We applied long-read sequencing (LRS) to 40 individuals from 33 previously undiagnosed Korean families. De novo assemblies were integrated into a graph-based pangenome workflow, enabling sensitive detection of single-nucleotide, structural, and tandem-repeat variants and direct profiling of CpG methylation. Results Pathogenic or likely pathogenic variants were identified in 9 (27.3%) families that had remained unsolved despite prior short-read sequencing. 
The discoveries comprised deep intronic splice-altering SNVs, non-coding regulatory deletions, complex rearrangements, large deletions, tandem repeat expansions, and aberrant methylation profiles. We also implicated CXXC1 as a novel disease-associated gene, potentially contributing to global DNA methylation defects, and revealed novel pathogenic variants in established disease genes such as HEXB and NGLY1, providing insights into underrecognized genetic contributors to rare diseases. Conclusions LRS coupled with pangenome-based, graph-driven analysis closed a sizable diagnostic gap, broadened the mutational spectra of several Mendelian genes and brought epigenomic evidence into rare disease investigation. These findings support the adoption of long-read, graph-based workflows as a front-line strategy for comprehensive genomic and epigenomic diagnosis. Background Rare diseases affect an estimated 300 million people worldwide and impose physical, emotional, and economic burdens on patients, families, and society ( 1 ). Although genomic medicine has advanced rapidly, many individuals whose clinical features strongly suggest a genetic etiology still lack a molecular diagnosis ( 2 , 3 ). This persistent diagnostic gap hinders the applicability of personalized clinical care, accurate prognosis, and enrollment in targeted therapies or clinical trials. Continuous innovation in sequencing technologies and bioinformatic methods is therefore essential to improve diagnostic rates in rare genetic disease. Conventional short-read sequencing (SRS), including targeted-panel sequencing, whole-exome sequencing (WES), and whole-genome sequencing (WGS), has significantly improved the diagnostic rate of rare genetic disorders ( 4 – 9 ). WES alone achieves a diagnostic yield of roughly 36–41% of cases ( 10 – 12 ). 
In unsolved patients, adding SR WGS confers a further 7– 19% yield by detecting variant classes that exome approaches often miss, such as structural and deep intronic variants ( 13 , 14 ). Although short-read (SR) WGS captures a broader range of genomic regions than WES, its SR length limits its ability to resolve structurally complex or repetitive genomic regions. These challenges have led to increased interest in long-read sequencing (LRS) for rare disease diagnostics ( 15 – 28 ). LRS significantly enhances the detection and characterization of complex structural variants–including large insertions, deletions, inversions, substitutions, and tandem repeat (TR) expansions–as well as variants located within previously unmappable “dark” genomic regions ( 18 , 20 , 25 , 29 – 31 ). Beyond variant detection, LRS allows haplotype phasing without parental genotyping and supports epigenetic profiling, including DNA methylation or hydroxymethylation ( 32 – 34 ). Moreover, LRS technologies allow the generation of highly contiguous and accurate de novo genome assemblies, allowing direct comparison of large haplotype-resolved segments and thus detection of genetic variants of any type and size ( 35 ). The recent emergence of population-specific genome assemblies through global pangenome initiatives is providing valuable background variation data, improving filtering and prioritization of candidate variants. Projects such as the Human Pangenome Reference Consortium (HPRC), Korean Pangenome Project, Chinese Pangenome Project, Arab Pangenome Project, Emirates Pangenome Project and others are contributing to a more representative understanding of human genomic diversity ( 36 – 40 ). However, as rare disease diagnosis requires extensive control datasets, there is still a need for high-quality and population-specific datasets. In this study, we applied LRS and pangenome analysis to resolve undiagnosed Korean patients with rare diseases. 
These patients had previously undergone SR WES and WGS with available parental or other familial data but remained nonetheless undiagnosed. Genome assembly and variant discovery were performed on 40 patients from 33 unrelated families ( Fig. 1 ). We further provide genome-wide methylation profiles and Korean-specific LR (long-read) genome datasets as publicly accessible resources. These results highlight the diagnostic value of assembly-based LRS and its potential to elucidate the complex genetic architecture underlying rare diseases. Download figure Open in new tab Fig. 1: Schematic overview of pangenome-based variant discovery and interpretation workflow. Step 1. Overview of sequencing strategies and genome assembly approaches. For patient-only long-read sequencing (LRS), partially phased assemblies were generated. For selected trios or quartets, trio-binning was applied using parental short-read sequencing (SRS) and patient LRS data to obtain fully phased haplotypes. Step 2. Pangenome-level, assembly-based variant calling using multiple reference genomes (GRCh38 and T2T-CHM13) and graph-based alignment enabled comprehensive detection of structural variants (SVs), single nucleotide variants (SNVs), and tandem repeat expansions. HPRC genome assemblies were used as control cases. Epigenetic features were investigated using raw base-calling data. Step 3. Patient-specific variants were filtered based on their allelic and known disease association information. Filtered variants were manually assessed using raw read validation and known gene and variant databases. This workflow facilitated identification and interpretation of disease-associated variants in previously undiagnosed rare disease patients. 
Methods Patient recruitment and sample collection The patients enrolled in this study exhibited clinical features suggestive of rare genetic disorders and remained undiagnosed despite extensive prior genetic testing, including WES, WGS, and, in some cases, RNA sequencing of muscle tissue obtained via biopsy (Fig. S1). Participants were recruited through the Department of Genomic Medicine at Seoul National University Hospital, and phenotypic data were reviewed by experienced clinicians. LRS was pursued in cases with persistent diagnostic uncertainty and available biospecimens from the patient and, where possible, parents or affected siblings. Clinical characteristics and sample metadata are summarized in Table S1. Whole-genome sequencing and de novo genome assembly SR WGS data of trios were produced using the Illumina NovaSeq6000 platform. LR WGS data from patients and available siblings were generated using one of the following LRS platforms: Pacific Biosciences (PacBio) Sequel IIe, PacBio Revio, or Oxford Nanopore Technologies (ONT) PromethION 24. Detailed sequencing metadata of each sample is shown in Table S1. Read lengths and quality scores of LRS datasets were summarized using bioawk (version 20110810; bioawk -c fastx '{print length($seq), meanqual($seq)}' ) and SeqKit stats (version 2.8.0; seqkit stat -a ) ( https://github.com/lh3/bioawk ) ( 41 , 42 ). For samples without parental sequencing data, PacBio long reads were assembled into contigs using Hifiasm (version 0.19.8-r603; default parameters) ( 43 – 45 ). For samples with available parental SR WGS data, parental k-mers were first calculated for the SR WGS data using yak (version 0.1; yak count -b 37 ) ( https://github.com/lh3/yak ). Based on these parental k-mers and the patients’ HiFi reads, a pair of fully phased genome assemblies was produced for each sample using Hifiasm (version 0.19.8-r603; hifiasm -1 paternal.yak -2 maternal.yak HiFi.fastq ) ( 43 – 45 ). 
For samples sequenced using the ONT platform, raw reads were polished using Herro (version model_R10_v0.1; default parameters); polished reads were assembled using Hifiasm (version 0.19.8-r603; default parameters) ( 43 – 46 ). Then, the assembly quality value (QV) was calculated by first generating a k-mer database from each sample’s SR WGS data with Meryl (version 1.3; meryl k=21 count ) before evaluating it with Merqury (version 1.3; merqury.sh ) ( 47 ). For samples with available parental SR WGS data, Hap-mer was additionally used to construct paternal- and maternal-specific k-mer databases, enabling the assessment of haplotype-level completeness (version 1.3; hapmers.sh ) ( 47 ). Contig length metrics for each de novo assembly were obtained using SeqKit (version 2.8.0; seqkit stats ) and bioawk (version 20110810; bioawk -c fastx '{print length($seq)}' ) ( https://github.com/lh3/bioawk ). Graph-based pangenome construction for variant calling and discovery of candidate causal variants A pangenome graph was built using the Minigraph-Cactus pipeline by integrating GRCh38.p14, T2T-CHM13 v2.0, HPRC Year 1 assemblies, and our newly generated genome assemblies (version 2.9.2; cactus-pangenome --reference GRCh38 CHM13 --giraffe clip filter --vcf --viz --odgi --chrom-vg clip filter --chrom-og --gbz clip filter full --gfa clip full --vcf --giraffe --gfa --gbz --chrom-vg ) ( 36 , 48 , 49 ). During graph construction, 22 assemblies from the low-depth PacBio 10× group (11 individuals) did not integrate cleanly enough to support a high-confidence graph and were therefore excluded from the build. The pipeline’s final output—a multiallelic VCF—served as input for variant discovery in patients who remained genetically unsolved after SR WGS analysis. Assuming that conventional small variants were addressed during the preceding SR WGS analysis, this step concentrated on loci whose longest allele was ≥ 3 bp. 
Allele lengths at each locus were standardized using Scikit-learn’s StandardScaler and grouped by hierarchical clustering using AgglomerativeClustering (version 1.5.2; AgglomerativeClustering(distance_threshold=1, n_clusters=None) ), so that alleles of identical or similar length fell into the same cluster. Cluster groups containing any allele from the HPRC samples were filtered out. Among the remaining patient-specific clusters, we focused on two patterns: (i) clusters composed exclusively of siblings and (ii) clusters in which a single patient harbored a length-distinct allele absent from all other individuals. Then, each candidate allele was inspected in raw read alignments to confirm the length difference, and the gene disrupted by the variant was evaluated with respect to the patient’s clinical presentation. Variants whose predicted gene disruption plausibly accounted for the observed phenotype were retained as final, curated causal candidates. To compute the pangenome growth curve, we produced a base-pair-level cumulative growth curve without GRCh38 reference paths using Panacus (version 0.3.3; panacus ordered-histgrowth -c bp -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -S -O and panacus-visualize ) ( 50 ). Identification of 5-methylcytosine sites and gene-level DNA methylation analysis PacBio HiFi reads were aligned to the GRCh38 reference genome using pbmm2 (version 1.13.1; pbmm2 align --preset HIFI --sort ) ( 50 , 51 ). Methylation ratios at CpG sites were subsequently obtained using pb-CpG-tools (version 2.3.2; aligned_bam_to_cpg_scores --model pileup_calling_model.v1.tflite ) ( https://github.com/PacificBiosciences/pb-CpG-tools ). For ONT reads, alignment and sorting to GRCh38 were performed with Dorado (version 0.6.2; dorado basecaller hac with a hac model [email protected] ) and SAMtools (version 1.20; samtools sort ), respectively (Oxford Nanopore Technologies, 2023; https://github.com/nanoporetech/dorado ) ( 52 ). 
Methylation detection and analysis were performed using the ONT wf-human-variation pipeline (version 2.1.0; wf-human-variation --snv --sv --str --mod --cnv --sex --tr_bed --use_qdnaseq --qdnaseq_bin_size 50 --basecaller_cfg [email protected] ) ( https://github.com/epi2me-labs/wf-human-variation ). Methylation ratios were extracted from the resulting bedmethyl files using the following filtering criteria: coverage between 20–200× and modification score > 75%. The identified CpG sites as well as their methylation ratios were classified into defined genomic regions (intergenic region, upstream, promoter, 5′ UTR, CDS, intron, and 3′ UTR). The genomic coordinates of these regions were based on Ensembl’s evidence-based annotation of GRCh38.p14 (version 46; Ensembl 112), except for the upstream and promoter regions, which were defined separately due to lacking annotations: the upstream region was defined as 1–5 kb upstream of the gene start site and the promoter region as that within 1 kb immediately upstream of the gene start site. Finally, the number of CpG sites and the average methylation ratio for each gene region were extracted for each sample. Identification of tandem repeats and detection of pathogenic repeat expansions TR loci were delineated across the GRCh38 reference genome using Tandem Repeats Finder (version 4.09; trf 2 5 7 80 10 50 2000 -l 6 -h ) ( 53 ). Each TR was assigned to its precise genic context—intergenic, upstream, promoter, 5′ UTR, CDS, intron, or 3′ UTR—to produce the full list of repeat alleles for every gene present in GRCh38. Population-level TR variants were extracted from the intersection between these TR coordinates and the multiallelic VCF generated with the Minigraph-Cactus pangenome pipeline using BCFtools (version 1.17; bcftools view ) ( 52 ). 
The resulting TR variant calls were projected onto reference alleles with BCFtools (version 1.17; bcftools consensus ) to yield the complete set of repeat alleles for each TR locus present in the cohort population ( 52 ). To isolate candidate pathogenic expansions, alleles were filtered in four sequential steps. First, only alleles unique to our patient cohort were retained. Second, within each locus group, the allele with the maximum repeat count was selected. Third, an expansion index was calculated for every allele as [equation missing from the extracted text; see the publisher version], where an index of 1 denotes a two-fold expansion relative to controls. Fourth, alleles exhibiting an expansion index ≥ 1 for both the maximum repeat count and the total repeat length (bp) were considered candidate pathogenic repeat expansions (e.g., for the allele (CAG)₁₁(CAT)(CAG)₁₄, the maximum repeat count equals 14 due to (CAG)₁₄ and the total repeat length equals 78 bp by calculating the length sum of (CAG)₁₁, (CAT), and (CAG)₁₄, namely 3×11+3+3×14). We further checked if these filtered candidate TR loci were contained within OMIM-listed gene regions. Finally, these final candidate TR calls near disease-associated genes were subjected to manual curation: we evaluated concordance with each patient’s clinical phenotype and confirmed the repeat-length expansion by visual inspection of raw read alignments. Results Long-read data enable high-quality de novo genome assembly sufficient for accurate variant detection High-quality LRS data were successfully obtained for all enrolled individuals. We used PacBio and ONT LRS platforms to sequence 28 and 12 individuals, respectively. We obtained an average coverage of 11.95× and 31.29× for PacBio samples (hereafter, PacBio 10× and 30× samples, respectively) and 35.67× for ONT samples and mean read lengths of 14.85 kb, 15.53 kb, and 14.21 kb for PacBio 10×, PacBio 30×, and ONT samples, respectively (Table S2). 
For patients without or with parental SR WGS data, long reads were assembled into contigs using LRS-only or trio-binning approaches, respectively. Although LRS-only assemblies were only partially phased while trio-binned assemblies achieved full phasing, all assemblies exhibited sufficient contiguity (N50: PacBio 10× samples, 0.25–3.50 Mb; PacBio 30× samples, 10.78–59.97 Mb; ONT samples, 1.43–58.08 Mb) to enable phasing and comparison of large genomic segments ( Fig. 2A and Table S3). Download figure Open in new tab Fig. 2: Quality assessment of LRS-generated genome assemblies. A Cumulative contig length distributions for each assembly, stratified by sequencing platform and coverage depth. Assemblies generated at ≥ 30× coverage exhibit contiguity comparable to reference-grade genomes (HPRC samples). B Comparison of maternal and paternal assembly quality values (QV) calculated by k-mer distributions of assembly and raw SRS data. C Phasing completeness of maternal and paternal haplotypes, demonstrating near-complete assemblies for trio-binned samples and strong performance across high-coverage datasets. D Average methylated CpG ratios in promoter regions for cohort samples. The sample prefix “SNUH-” is not shown. As presented in Fig. S2, SNUH-17P, SNUH-18P, SNUH-18S, and SNUH-21P samples originated from fibroblasts instead of blood. E Pangenome growth curves using HPRC and our cohort samples. Coverage indicates the number of haplotypes sharing variants and quorum the fraction of haplotypes sharing variants. For example, coverage ≥ 1 and quorum ≥ 100% indicates that variants are shared by all sample assemblies, while coverage ≥ 1 and quorum ≥ 0% represent singleton variants. All genome assemblies were high-quality in terms of base- and phasing-level accuracy and our LRS data exhibited consistent methylation profiles ( Fig. 2 ). 
For samples with available LRS and SRS, we calculated base-level accuracy of the genome assembly by comparing k-mer distributions of matched genome assembly and SR WGS data. All genome assemblies exhibited quality values (QVs) of 37.72–57.59, specifically, QV37.40–45.69 for PacBio 10×, QV38.12–57.59 for PacBio 30×, and QV48.29–54.51 for ONT samples (Table S4). Of these, genome assemblies constructed using PacBio 30× and ONT samples were comparable to state-of-the-art HPRC genome assemblies (QV51.59–58.98 for HPRC samples; Fig. 2B ). For samples where both maternal and paternal SR WGS were available, we also assessed the phasing-level accuracy of the genome assembly by analyzing haplotype-specific k-mer distributions between each haplotype genome assembly and each parental SR WGS data. These samples were solely sequenced using PacBio HiFi platforms; they exhibited 92.38–99.81% haplotype-specific k-mer completeness, indicating that the haplotype genome assemblies of each sample were accurately phased ( Fig. 2C ). Specifically, PacBio 10× and 30× samples exhibited 92.38–98.60% and 97.24–99.81% k-mer completeness, respectively, while HPRC samples exhibited 98.53–99.80% k-mer completeness (Table S4). Further, the methylation profiles of these samples were highly consistent among samples, except for the patient with triple X syndrome (SNUH-04P), which showed outlier methylation patterns ( Fig. 2D for blood samples and Fig. S2 for fibroblast samples). In these patients, we analyzed variants by integrating genome assemblies into a single pangenome graph. For this pangenome graph, we merged 94 and 58 genome assemblies of 47 HPRC samples and our 29 cohort samples, respectively, excluding 22 genome assemblies of 11 PacBio 10× samples due to low depth ( Fig. 2E ). This pangenome graph contains a total of 20.5 M SNVs, 73 K SVs, and 573 K TR loci. Moreover, our genome assemblies revealed patient-specific variants, consisting of 1.9 M SNVs, 29 K SVs, and 159 TR expansion loci. 
Next, these patient-specific variants were investigated to identify potential causal variants in undiagnosed patient samples ( Fig. 1 and Table 1 ). LRS enabled detection of complex structural variants, noncoding deletions, repeat expansions associated with methylation changes, and accurate phasing of compound heterozygous variants, many unresolved by previous SRS methods. View this table: View inline View popup Table 1: Summary of identified variants and diagnostic contributions of long-read sequencing in undiagnosed rare disease patients. Our pangenome analysis outperforms previous short-read-based approaches To systematically assess the diagnostic contribution of LRS, we analyzed a cohort of 33 previously undiagnosed rare disease families. Sample IDs (SNUH-01 to SNUH-33) were assigned in order of diagnostic outcome: newly diagnosed by both SRS and LRS (SNUH-01 to SNUH-03), newly diagnosed by LRS (SNUH-04 to SNUH-12), and undiagnosed despite LRS (SNUH-13 to SNUH-33). Table 1 summarizes clinical features, causative genes, variant types, and phasing information for all diagnosed families. A molecular diagnosis was established in 12 of 33 previously undiagnosed families. Of these, 3 cases (SNUH-01 to 03) harbored pathogenic variants retrospectively detectable by SR WGS and were included to assess the confirmatory capacity of LRS. The remaining 9 diagnoses (SNUH-04 to 12) were achieved exclusively through LRS-based variant detection or phasing. Among the nine LRS-contributed cases, the identified diagnostic variants included deep intronic SNVs, large deletions, repeat expansions with associated methylation abnormalities, and complex rearrangements. In several cases, LRS enabled phasing of compound heterozygous variants even in the absence of parental samples. These findings underscore the broad utility of LRS for detecting complex and noncoding variants in unresolved rare disease cases. 
A triple X syndrome patient with a de novo CXXC1 SNV exhibits a global methylation defect signature A female patient with a prior diagnosis of triple X syndrome (SNUH-04P) presented with profound global developmental delay, microcephaly (<3rd percentile), and syndromic facial features. She was unable to walk independently, and her speech was limited to a few simple words such as ‘mom’ and ‘dad,’ with no ability to form sentences. This unusually severe phenotype could not be sufficiently explained by triple X syndrome alone. LRS-based genome-wide methylation profiling revealed a markedly altered epigenetic signature, including higher promoter methylation and an abnormal 5′ UTR methylation pattern across all chromosomes ( Fig. 2D and Figs. S2 and S3). Retrospective re-analysis of trio-based exome sequencing identified a de novo one-base-pair frameshift insertion in CXXC1 (c.1011dupT; p.Glu338Ter), a gene involved in H3K4 methylation and maintenance of DNA methylation homeostasis ( 54 – 57 ). This variant was absent from both the gnomAD (v4.1.0) and Bravo databases ( https://bravo.sph.umich.edu/ ). In addition, CXXC1 has a probability of loss-of-function intolerance (pLI) score of 1.0, indicating a high likelihood of haploinsufficiency. Together, these findings support the hypothesis that CXXC1 dysfunction plays a causal role in the patient’s atypical phenotype, highlighting the diagnostic potential of methylation profiling in rare diseases where sequence-level variants are absent or uninformative. A tandem repeat expansion in AFF3 is associated with promoter hypermethylation A patient with neurodevelopmental delay (SNUH-05P) harbored a (CGG)₁₄₅ TR expansion in the promoter region of AFF3, identified by LRS ( Fig. 3A ). The expansion was associated with promoter hypermethylation, consistent with previously reported cases of AFF3-related intellectual disability ( Figs. 3B and 3C ) ( 11 , 58 ). 
This repeat expansion was undetected in prior SRS analysis due to its size and location within a GC-rich region. This case demonstrates the value of LRS in detecting unstable TRs with epigenetic consequences. Download figure Open in new tab Fig. 3: Tandem repeat expansion and hypermethylation profile in AFF3. A Tandem repeat motif counts and allele frequency distributions in HPRC and our cohort. The exceptional TR expansion in SNUH-05P is denoted as large stacks of CGG motifs. B Hypermethylated patterns in the promoter (upper) and specific CpG sites where SNUH-05P exhibited higher methylation ratios (lower). Phased genome assemblies reveal compound heterozygous variants in NGLY1, HEXB, and ERCC8 A patient with features consistent with a congenital disorder of deglycosylation (SNUH-06P) remained undiagnosed despite trio-based WES and WGS. LRS identified compound heterozygous variants in NGLY1, a gene associated with an autosomal recessive congenital disorder of deglycosylation-1 ( Fig. 4A ) ( 59 ). We identified a missense variant ( NGLY1 : c.925T>C; p.Cys309Arg), classified as a variant of uncertain significance (VUS) in ClinVar (Variation ID: VCV000219542.5) but extremely rare (gnomAD exome allele frequency (AF) = 1.73×10⁻⁷; genome AF = 6.57×10⁻⁶), with a high Combined Annotation Dependent Depletion (CADD) score of 26.3 and strong evolutionary conservation (PhyloP100 = 9.021). The second variant was a previously unreported 16.6 kb deletion spanning exons 1–2. Download figure Open in new tab Fig. 4: Representative cases demonstrating the diagnostic contributions of LRS. A Complex structural variant including a large deletion and insertion in NGLY1 (SNUH-06P). B Compound heterozygous deep intronic SNV and exon deletion in HEXB (SNUH-07 Siblings). C Compound heterozygous complex rearrangement and complex substitution in ERCC8 (SNUH-08P). 
Two siblings presenting with a progressive motor disorder and features clinically resembling neuronal ceroid lipofuscinosis (SNUH-07P and SNUH-07S) were found to carry compound heterozygous variants in HEXB , which causes Sandhoff disease, a condition with partially overlapping clinical manifestations ( 60 , 61 ). LRS identified an approximately 6.9 kb deletion spanning exons 4–5 in trans with a deep intronic SNV ( HEXB : c.771+985G>A; Fig. 4B ). The intronic variant, missed by WES due to its noncoding location, is extremely rare (gnomAD AF = 1.31×10⁻⁵), predicted to alter splicing as an acceptor gain (SpliceAI score = 0.85), and has a high CADD score of 25.2. LRS allowed for phasing and structural resolution, highlighting its utility in detecting and characterizing compound heterozygous variants involving both coding and noncoding regions. Two siblings (SNUH-08P and SNUH-08S) presented with failure to thrive, short stature, deep-set eyes, nystagmus, episodic encephalopathy, spastic-ataxic gait, and dental crowding, and they were clinically suspected to have Cockayne syndrome. Brain MRI showed hypomyelination and vermian hypoplasia. LRS revealed a complex rearrangement involving inversion and insertion within ERCC8 , in a previously reported region associated with Cockayne syndrome type A ( 62 ). Additionally, we identified a novel complex substitution of 18.8 kb by 7 bp in ERCC8 ( Fig. 4C ). LRS also enabled phasing of the two variants, confirming their location on opposite alleles ( in trans ) without parental testing. PCR amplification and Sanger sequencing validated the result (Fig. S4). This case illustrates the utility of LRS for detecting and phasing both known and novel structural variants in autosomal recessive Mendelian disorders. Resolution of previously ambiguous pathogenic variants through LRS-based phasing, assembly, and repeat quantification Next, we reexamined previously reported pathogenic variants to resolve phasing and structural ambiguities using LRS. 
In SNUH-09 siblings, despite the absence of a parental sample, LRS confirmed that the SHMT2 variants were in trans (Supplementary Text Case 1). In the SNUH-10 siblings and SNUH-12P, LRS clarified the structure and breakpoints of unresolved deletions and insertions in SLC16A2 and FKTN , respectively (Supplementary Texts Case 2 and Case 3) ( 63 ). In SNUH-11P, we previously identified pathogenic NOTCH2NLC GGC repeat expansions, with repeat sizes estimated at 82–90 ( 64 ). With our current assembly-based LRS approach, we narrowed down the repeat size to 94, demonstrating the enhanced resolution of this de novo assembly. Together, these cases highlight the advantages of LRS in phasing, structural resolution, and repeat quantification—capabilities that enhance diagnostic precision beyond the limits of SRS. Potential repair mechanisms for large variants based on genomic signatures We further annotated the potential repair mechanisms underlying the formation of these newly found large SVs by analyzing DNA repair signatures in their flanking sequences. For the 16,588-bp deletion in NGLY1 , we found 1-bp microhomologous sequences at the breakpoints. This microhomology implies that a DNA double-strand break (DSB) was repaired by polymerase theta-mediated end-joining (TMEJ), which excises both DSB ends and anneals them to repair the DSB site ( Fig. 5A ). Similarly, the 6,875-bp and 2,876-bp deletions in HEXB and SLC16A2 , respectively, exhibited ∼300-bp long homologous sequences. This longer homology implies that the single-strand annealing (SSA) repair mechanism, which requires much longer homologous sequences than TMEJ to anneal both DSB ends, acted on the corresponding DSB sites ( Figs. 5B and 5C ). The complex substitution of 18,756 bp by 7 bp in ERCC8 exhibited several microhomologous sequences, supporting that TMEJ was involved in this complex substitution, as this type of templated insertion could be generated solely by polymerase theta. 
In this case, TMEJ may have repaired the DSB sites through several rounds of template switching and microhomology search followed by synthesis ( Fig. 5D ). Download figure Open in new tab Fig. 5: Annotating potential repair mechanisms underlying newly identified large SVs. A 16,588-bp deletion in NGLY1 . A DSB may have occurred in the gene as both DSB ends were resected to expose microhomologous sequences. These microhomologous sequences may have been annealed and repaired (a signature of TMEJ). B – C 6,875-bp and 2,876-bp deletions in HEXB and SLC16A2 , respectively. Although similar events may have occurred in these genes, their homologous sequences were much longer (∼300-bp), which is a signature of SSA rather than TMEJ. D Complex substitution of 18,756 bp by 7 bp in ERCC8 . Three rounds of template switching and microhomology search followed by synthesis are shown. Double-stranded DNA sequences are indicated; single-stranded DNA sequences represent resected DNA molecules exposing the microhomologous or homologous sequences to be annealed. Same colors represent a pair of microhomologous or homologous sequences. DSB, double-strand break; TMEJ, polymerase theta-mediated end-joining; SSA, single-strand annealing. Discussion LRS combined with assembly- and graph-based genome analysis reconstructs patient-specific haplotypes and directly compares them to a pangenome reference graph, providing a robust framework for uncovering variants often missed by SRS. Through this approach, our study delivered molecular diagnoses for approximately 30% of families who had exhausted conventional testing, demonstrating that sequence length remains a critical determinant of diagnostic success. By integrating de novo assembly with graph-based variant calling, we not only increased variant detection sensitivity but also achieved phasing of compound heterozygous variants without requiring parental data. 
This capability is particularly valuable for adult patients or isolated cases where trio sequencing is not feasible. In addition, LRS unlocked access to variant classes elusive to conventional SRS, while simultaneously enabling methylation profiling. Representative cases in our cohort illustrate the distinct diagnostic power of LRS: for example, the resolution of a novel complex substitution of 19 kb by 7 bp in ERCC8 , the discovery of a promoter repeat expansion in AFF3 with concurrent hypermethylation, and the identification of a CXXC1 variant associated with a genome-wide methylation outlier profile. For AFF3 , several studies have demonstrated that single nucleotide variants, TR expansions, and methylation changes in its promoter region are associated with intellectual disability ( 11 , 58 , 65 , 66 ). To date, only two studies have reported both a TR expansion and associated promoter methylation changes at the same AFF3 locus as in our patient–one using pyrosequencing in three families, and the other employing LRS in two trios ( 11 , 58 ). Our study represents the third report supporting these observations, and is the first in an East Asian population. Despite the lack of targeted therapies, the epigenetic nature of this mechanism suggests potential avenues for therapeutic development. This study provides new evidence implicating CXXC1 in disease through globally aberrant DNA methylation. To date, CXXC1 has not been directly associated with human disease. However, it forms a histone H3 lysine 4 (H3K4) methyltransferase complex together with SETD1A that can regulate both H3K4 and DNA methylation ( 67 – 69 ). SETD1A is a well-characterized OMIM gene involved in neurodevelopmental disorders (OMIM #618832 and #619056) ( 69 – 73 ). CXXC1 can directly bind unmethylated CpG sites and recruit a H3K4 methyltransferase, thereby blocking the access of DNA methyltransferases to these CpG sites and inhibiting DNA methylation ( 69 , 74 – 80 ). 
This function provides a plausible explanation for the global DNA methylation defects observed in the patient harboring said mutation. Although further validation is necessary, our findings provide strong evidence that CXXC1 may be a critical mediator of global DNA methylation dysregulation in neurodevelopmental disorders. Notably, we identified novel large variants in NGLY1 , HEXB , and ERCC8 and a noncoding regulatory deletion in SLC16A2 , expanding the known mutational spectrum of these genes. Identifying diverse variants is especially critical to maximize diagnostic yield, given the growing availability of targeted therapies, as such findings may point to clinically actionable or potentially treatable targets. For example, HEXB mutations cause Sandhoff disease, for which substrate reduction and gene therapies are currently in development ( 81 – 87 ). In the SLC16A2 -related Allan-Herndon-Dudley syndrome, early intervention with thyroid hormone analogs has clinical benefits, and recent preclinical studies further support the therapeutic potential of antisense oligonucleotides and AAV-based gene therapy to restore brain thyroid hormone transport ( 88 – 90 ). NGLY1 deficiency, though ultra-rare, has become a focus of emerging therapeutic strategies, including microbial metabolic therapy, pharmacological chaperones, and gene transfer approaches. Recent studies also suggest metabolic and proteostatic vulnerabilities that may offer additional intervention points ( 91 – 94 ). These findings accentuate how LRS and pangenome-based analysis not only improve diagnostic yield but also play an important role in guiding therapeutic decisions, particularly in the years to come. Moreover, precise breakpoint resolution afforded by LRS allowed us to infer the DNA damage repair mechanisms underlying these structural changes. Micro- or homologous sequences near these large deletions were genomic signatures of homology-directed repair mechanisms. 
Specifically, these signatures imply that SSA was the likely mechanism forming the deletions in HEXB and SLC16A2 , whereas TMEJ likely generated the deletion in NGLY1 and the complex substitution in ERCC8 . Interrogating the potential repair mechanisms of pathogenic variants provides valuable insight into how these variant alterations have emerged and persisted in the human population. By generating high-quality genome assemblies, methylation profiles, and variant catalogs from individuals of Korean ancestry, we also contribute population-specific resources that can support future variant filtering and discovery efforts. Although functional validation was beyond our scope, the convergence of genomic and phenotypic evidence supports the clinical relevance of our findings. Nonetheless, >50% of patients in our cohort remain undiagnosed, highlighting the need for broader application of LRS and continued advances in analytical methods. CONCLUSIONS Our integrative LRS and pangenome-based analysis demonstrates that direct interrogation of both the genome and epigenome can uncover pathogenic variations that elude detection by conventional SRS. The discovery of CXXC1 as a novel disease gene, along with expanded allelic spectra in established genes ( NGLY1 , HEXB , ERCC8 , and SLC16A2 ), underscores the diagnostic value of LRS technologies. This study also supports the integration of epigenomic insights into precision medicine workflows. Broader application of this framework may improve diagnostic yield and accelerate the translation of comprehensive genomic and epigenomic data into actionable clinical care. Data Availability The raw sequencing data generated during the current study are available in the NCBI BioProject database, https://www.ncbi.nlm.nih.gov/bioproject/, under the accession number PRJNA1284003. SUPPLEMENTARY INFORMATION Additional file1. Fig. S1: Overview of long-read sequencing datasets for individual patients. Additional file2. Table. 
S1: Clinical characteristics and sample metadata of individuals included in this study. Additional file3. Fig. S2: Global methylation profiles categorized by genic components. Additional file4. Table. S2: Summary statistics of long-read sequencing data from 40 patients. Additional file5. Table. S3: Summary statistics of genome assemblies from 40 patients. Additional file6. Table. S4: Quality value and completeness of haplotype-resolved genome assemblies for each patient. Additional file7. Fig. S3: Methylation profiles across chromosomes of the patient (SNUH-04P) harboring a CXXC1 mutation. Additional file8. Fig. S4: Validation of a novel substitution of SNUH-08P identified in ERCC8 . Additional file9. Supplementary Text Case1–3: Diagnosed cases using LRS in addition to previous SRS analyses. DECLARATIONS Ethics approval and consent to participate All participants (or their legal guardians) provided written informed consent for study participation. The study protocol was approved by the Institutional Review Board (IRB number: 2402-149-1518) of Seoul National University Hospital. Consent for publication Written informed consent for publication was obtained from all patients or their legal guardians. The consent permits the publication of relevant clinical information while explicitly prohibiting the disclosure of personally identifiable information. Availability of data and materials The raw sequencing data generated during the current study are available in the NCBI BioProject database, https://www.ncbi.nlm.nih.gov/bioproject/ , under the accession number PRJNA1284003. The 94 pangenome assemblies of HPRC used during the current study are available in the Zenodo repository at https://zenodo.org/record/5826274/files/HPRC-yr1.agc?download=1 (HPRC-yr1.agc for HPRC Year 1 genome assemblies). Competing interests The authors declare that they have no competing interests. Funding SNUH Lee Kun-hee Child Cancer & Rare Disease Project, Republic of Korea [22B-001-0100]. 
National Research Foundation of Korea (NRF) grant funded by the Korea government (MSIT) [2025-00519278]. Funding for open access charge: Not Determined. Authors’ contributions S.S.J.: Conceptualization, Methodology, Formal analysis, Investigation, Writing—original draft, Writing—review & editing. S.K.: Methodology, Formal analysis, Investigation, Writing—original draft, Writing—review & editing. S.L.: Conceptualization, Methodology, Investigation, Writing—original draft, Writing—review & editing, Supervision. S.Y.K.: Investigation, Writing—review & editing. J.M.: Investigation, Writing—review & editing. J.K.: Conceptualization, Methodology, Writing—original draft, Writing—review & editing, Funding acquisition, Supervision. J-H.C.: Conceptualization, Methodology, Funding acquisition, Supervision. All authors read and approved the final manuscript. Acknowledgements We acknowledge and thank all the participating individuals and their families. LIST OF ABBREVIATIONS LRS Long-read sequencing SRS Short-read sequencing WES Whole-exome sequencing WGS Whole-genome sequencing SR Short-read TR Tandem repeat HPRC Human Pangenome Reference Consortium LR Long-read SV Structural variant SNV Single nucleotide variant PacBio Pacific Biosciences ONT Oxford Nanopore Technologies QV Quality value pLI Probability of loss-of-function intolerance VUS Variant of uncertain significance AF Allele frequency DSB DNA double-strand break TMEJ Polymerase theta-mediated end-joining SSA Single-strand annealing REFERENCES 1. ↵ The Lancet Global H. The landscape for rare diseases in 2024 . Lancet Glob Health . 2024 ; 12 ( 3 ): e341 . OpenUrl 2. ↵ Ahn JH , Yoon JG , Cho J , Lee S , Kim S , Kim MJ , et al. Implementing genomic medicine in clinical practice for adults with undiagnosed rare diseases. npj Genomic Medicine . 2024 ; 9 ( 1 ): 1 – 12 . OpenUrl 3. ↵ Lam WKJ , Lau CS , Luk HM , Au LWC , Chan GCP , Chan WYH , et al. 
The implementation of genome sequencing in rare genetic diseases diagnosis: a pilot study from the Hong Kong genome project . Lancet Reg Health West Pac . 2025 ; 55 : 101473 . 4. ↵ Jobanputra V , Schroeder B , Rehm HL , Shen W , Spiteri E , Nakouzi G , et al. Advancing access to genome sequencing for rare genetic disorders: recent progress and call to action . NPJ Genom Med . 2024 ; 9 ( 1 ): 23 . OpenUrl PubMed 5. Sha Q-Q , Zhu Y-Z , Xiang Y , Yu J-L , Fan X-Y , Li Y-C , et al. Role of CxxC-finger protein 1 in establishing mouse oocyte epigenetic landscapes . Nucleic Acids Res . 2021 ; 49 ( 5 ): 2569 – 82 . OpenUrl PubMed 6. Hong J , Lee D , Hwang A , Kim T , Ryu H-Y , Choi J . Rare disease genomics and precision medicine . Genomics & Informatics . 2024 ; 22 ( 1 ): 1 – 11 . OpenUrl PubMed 7. Jang SH , Yoon K , Gee HY . Common genetic etiologies of sensorineural hearing loss in Koreans . Genomics & Informatics . 2024 ; 22 ( 1 ): 1 – 9 . OpenUrl PubMed 8. Cho A . Neuromuscular diseases: genomics-driven advances . Genomics & Informatics . 2024 ; 22 ( 1 ): 1 – 4 . OpenUrl PubMed 9. ↵ Lee JS . Molecular diagnostic approach to rare neurological diseases from a clinician viewpoint . Genomics & Informatics . 2024 ; 22 ( 1 ): 1 – 4 . OpenUrl PubMed 10. ↵ Wojcik MH , Lemire G , Berger E , Zaki MS , Wissmann M , Win W , et al. Genome Sequencing for Diagnosing Rare Diseases . N Engl J Med . 2024 ; 390 ( 21 ): 1985 – 97 . OpenUrl CrossRef PubMed 11. ↵ Jadhav B , Garg P , van Vugt JJFA , Ibanez K , Gagliardi D , Lee W , et al. A phenome-wide association study of methylated GC-rich repeats identifies a GCC repeat expansion in AFF3 associated with intellectual disability . Nat Genet . 2024 ; 56 ( 11 ): 2322 – 32 . OpenUrl PubMed 12. ↵ Clark MM , Hildreth A , Batalov S , Ding Y , Chowdhury S , Watkins K , et al. Diagnosis of genetic diseases in seriously ill children by rapid whole-genome sequencing and automated phenotyping and interpretation . Sci Transl Med . 2019 ; 11 ( 489 ). 
13. ↵ Nurchis MC , Altamura G , Riccardi MT , Radio FC , Chillemi G , Bertini ES , et al. Whole genome sequencing diagnostic yield for paediatric patients with suspected genetic disorders: systematic review, meta-analysis, and GRADE assessment . Arch Public Health . 2023 ; 81 ( 1 ): 93 . OpenUrl PubMed 14. ↵ Ewans LJ , Minoche AE , Schofield D , Shrestha R , Puttick C , Zhu Y , et al. Whole exome and genome sequencing in mendelian disorders: a diagnostic and health economic analysis . Eur J Hum Genet . 2022 ; 30 ( 10 ): 1121 – 31 . OpenUrl CrossRef PubMed 15. ↵ Porubsky D , Dashnow H , Sasani TA , Logsdon GA , Hallast P , Noyes MD , et al. A familial, telomere-to-telomere reference for human de novo mutation and recombination from a four-generation pedigree . bioRxiv . 2024 . 16. Collins RL , Talkowski ME . Diversity and consequences of structural variation in the human genome . Nature Reviews Genetics . 2025 ; 26 ( 7 ): 443 – 62 . OpenUrl PubMed 17. Steyaert W , Sagath L , Demidov G , Yépez VA , Esteve-Codina A , Gagneur J , et al. Unravelling undiagnosed rare disease cases by HiFi long-read genome sequencing . medRxiv . 2024 . 18. ↵ Mitsuhashi S , Matsumoto N . Long-read sequencing for rare human genetic diseases . Journal of Human Genetics . 2019 ; 65 ( 1 ): 11 – 9 . OpenUrl PubMed 19. Negi S , Stenton SL , Berger SI , Canigiula P , McNulty B , Violich I , et al. Advancing long-read nanopore genome assembly and accurate variant calling for rare disease detection . Am J Hum Genet . 2025 ; 112 ( 2 ): 428 – 49 . OpenUrl PubMed 20. ↵ Eisfeldt J , Ameur A , Lenner F , de Boer ETB , Ek M , Wincent J , et al. Towards routine long-read sequencing for rare disease: a national pilot study on chromosomal rearrangements . medRxiv . 2024 . 21. Yu SY , Xi YL , Xu FQ , Zhang J , Liu YS . Application of long read sequencing in rare diseases: The longer, the better? Eur J Med Genet . 2023 ; 66 ( 12 ): 104871 . OpenUrl PubMed 22. Xu R , Zhang M , Yang X , Tian W , Li C . 
Decoding complexity: The role of long-read sequencing in unraveling genetic disease etiologies . Mutat Res Rev Mutat Res . 2025 ; 795 : 108529 . 23. Su Y , Fan L , Shi C , Wang T , Zheng H , Luo H , et al. Deciphering Neurodegenerative Diseases Using Long-Read Sequencing . Neurology . 2021 ; 97 ( 9 ): 423 – 33 . OpenUrl CrossRef PubMed 24. Marwaha S , Knowles JW , Ashley EA . A guide for the diagnosis of rare and undiagnosed disease: beyond the exome . Genome Medicine . 2022 ; 14 ( 1 ): 1 – 22 . OpenUrl CrossRef PubMed 25. ↵ Del Gobbo GF , Boycott KM . The additional diagnostic yield of long-read sequencing in undiagnosed rare diseases . Genome Res . 2025 . 26. Groza C , Schwendinger-Schreck C , Cheung WA , Farrow EG , Thiffault I , Lake J , et al. Pangenome graphs improve the analysis of structural variants in rare genetic diseases . Nature Communications . 2024 ; 15 ( 1 ): 1 – 12 . OpenUrl PubMed 27. Hiatt SM , Lawlor JMJ , Handley LH , Latner DR , Bonnstetter ZT , Finnila CR , et al. Long-read genome sequencing and variant reanalysis increase diagnostic yield in neurodevelopmental disorders . Genome Res . 2024 ; 34 ( 11 ): 1747 – 62 . OpenUrl Abstract / FREE Full Text 28. ↵ Sen S , Handler HP , Victorsen A , Flaten Z , Ellison A , Knutson TP , et al. Validation of a comprehensive long-read sequencing platform for broad clinical genetic diagnosis . Front Genet . 2025 ; 16 : 1499456 . 29. ↵ Eisfeldt J , Ek M , Nordenskjöld M , Lindstrand A . Toward clinical long-read genome sequencing for rare diseases . Nature Genetics . 2025 : 1 – 10 . 30. Kim J , Park JL , Yang JO , Kim S , Joe S , Park G , et al. Highly accurate Korean draft genomes reveal structural variation highlighting human telomere evolution . Nucleic Acids Res . 2025 ; 53 ( 1 ). 31. ↵ Ryu H , Han H , Kim C , Kim J . GDBr: genomic signature interpretation tool for DNA double-strand break repair mechanisms . Nucleic Acids Res . 2025 ; 53 ( 2 ). 32. 
↵ Cheung WA , Johnson AF , Rowell WJ , Farrow E , Hall R , Cohen ASA , et al. Direct haplotype-resolved 5-base HiFi sequencing for genome-wide profiling of hypermethylation outliers in a rare disease cohort . Nature Communications . 2023 ; 14 ( 1 ): 1 – 13 . OpenUrl PubMed 33. Han H , Lee HH , Kim MG , Shin YS , Chung JS , Kim J . Genome assembly resources of genitourinary cancers for chromosomal aberration at the single nucleotide level . Scientific Data . 2025 ; 12 ( 1 ): 1 – 11 . OpenUrl PubMed 34. ↵ Kim J , Kim Y , Shin J , Kim Y-K , Lee DH , Park J-W , et al. Fully phased genome assemblies and graph-based genetic variants of the olive flounder, Paralichthys olivaceus . Scientific Data . 2024 ; 11 ( 1 ): 1 – 11 . OpenUrl CrossRef PubMed 35. ↵ Logsdon GA , Vollger MR , Eichler EE . Long-read human genome sequencing and its applications . Nat Rev Genet . 2020 ; 21 ( 10 ): 597 – 614 . OpenUrl PubMed 36. ↵ Liao W-W , Asri M , Ebler J , Doerr D , Haukness M , Hickey G , et al. A draft human pangenome reference . Nature . 2023 ; 617 (7960): 312 -24. OpenUrl CrossRef PubMed 37. Wang T , Antonacci-Fulton L , Howe K , Lawson HA , Lucas JK , Phillippy AM , et al. The Human Pangenome Project: a global resource to map genomic diversity . Nature . 2022 ; 604 (7906): 437 -46. OpenUrl CrossRef PubMed 38. Gao Y , Yang X , Chen H , Tan X , Yang Z , Deng L , et al. A pangenome reference of 36 Chinese populations . Nature . 2023 ; 619 (7968): 112 -21. OpenUrl CrossRef PubMed 39. Nassir N , Almarri MA , Kumail M , Mohamed N , Balan B , Hanif S , et al. A draft Arab pangenome reference . bioRxiv . 2024 . 40. ↵ Olbrich M , Mousa M , Wohlers I , Al Aamri A , Alnaqbi H , Alsuwaidi AH , et al. An Emirati pangenome incorporating a diploid telomere-to-telomere reference . bioRxiv . 2024 . 41. ↵ Shen W , Le S , Li Y , Hu F . SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation . PLOS ONE . 2016 ; 11 ( 10 ): e0163962 . OpenUrl CrossRef PubMed 42. 
↵ Shen W , Sipos B , Zhao L . SeqKit2: A Swiss army knife for sequence and alignment processing . iMeta . 2024 ; 3 ( 3 ): e191 . OpenUrl CrossRef 43. ↵ Cheng H , Concepcion GT , Feng X , Zhang H , Li H . Haplotype-resolved de novo assembly using phased assembly graphs with hifiasm . Nature Methods . 2021 ; 18 ( 2 ): 170 – 5 . OpenUrl CrossRef PubMed 44. Cheng H , Jarvis ED , Fedrigo O , Koepfli K-P , Urban L , Gemmell NJ , et al. Haplotype-resolved assembly of diploid genomes without parental data . Nature Biotechnology . 2022 ; 40 ( 9 ): 1332 – 5 . OpenUrl CrossRef PubMed 45. ↵ Cheng H , Asri M , Lucas J , Koren S , Li H . Scalable telomere-to-telomere assembly for diploid and polyploid genomes with double graph . Nature Methods . 2024 ; 21 ( 6 ): 967 – 70 . OpenUrl PubMed 46. ↵ Stanojević D , Lin D , Nurk S , de Sessions PF , Šikić M. Telomere-to-Telomere Phased Genome Assembly Using HERRO-Corrected Simplex Nanopore Reads . bioRxiv . 2024 . 47. ↵ Rhie A , Walenz BP , Koren S , Phillippy AM . Merqury: reference-free quality, completeness, and phasing assessment for genome assemblies . Genome Biology . 2020 ; 21 ( 1 ): 1 – 27 . OpenUrl CrossRef PubMed 48. ↵ Nurk S , Koren S , Rhie A , Rautiainen M , Bzikadze AV , Mikheenko A , et al. The complete sequence of a human genome . Science . 2022 . 49. ↵ Hickey G , Monlong J , Ebler J , Novak AM , Eizenga JM , Gao Y , et al. Pangenome graph construction from genome alignments with Minigraph-Cactus . Nature biotechnology . 2024 ; 42 ( 4 ). 50. ↵ Parmigiani L , Garrison E , Stoye J , Marschall T , Doerr D . Panacus: fast and exact pangenome growth and core size estimation . Bioinformatics . 2024 ; 40 ( 12 ). 51. ↵ Grüning B , Dale R , Sjödin A , Chapman BA , Rowe J , Tomkins-Tinch CH , et al. Bioconda: sustainable and comprehensive software distribution for the life sciences . Nat Methods . 2018 ; 15 ( 7 ): 475 – 6 . OpenUrl CrossRef PubMed 52. ↵ Danecek P , Bonfield JK , Liddle J , Marshall J , Ohan V , Pollard MO , et al. 
Twelve years of SAMtools and BCFtools . Gigascience . 2021 ; 10 ( 2 ):giab008. 53. ↵ Benson G . Tandem repeats finder: a program to analyze DNA sequences . Nucleic acids research . 1999 ; 27 ( 2 ). 54. ↵ Cao W , Guo J , Wen X , Miao L , Lin F , Xu G , et al. CXXC finger protein 1 is critical for T-cell intrathymic development through regulating H3K4 trimethylation . Nature Communications . 2016 ; 7 ( 1 ): 1 – 11 . OpenUrl 55. Lin F , Meng X , Guo Y , Cao W , Liu W , Xia Q , et al. Epigenetic initiation of the T17 differentiation program is promoted by Cxxc finger protein 1 . Sci Adv . 2019 ; 5 ( 10 ):eaax1608. 56. Sha Q-Q , Dai X-X , Jiang J-C , Yu C , Jiang Y , Liu J , et al. CFP1 coordinates histone H3 lysine-4 trimethylation and meiotic cell cycle progression in mouse oocytes . Nature Communications . 2018 ; 9 ( 1 ): 1 – 17 . OpenUrl PubMed 57. ↵ Meng X , Zhu Y , Liu K , Wang Y , Liu X , Liu C , et al. CXXC-finger protein 1 associates with FOXP3 to stabilize homeostasis and suppressive functions of regulatory T cells . eLife . 2025 ; 13 :RP103417. 58. ↵ Metsu S , Rooms L , Rainger J , Taylor MS , Bengani H , Wilson DI , et al. FRA2A Is a CGG Repeat Expansion Associated with Silencing of AFF3 . PLoS Genetics . 2014 ; 10 ( 4 ): e1004242 . OpenUrl 59. ↵ Freeze HH . Understanding human glycosylation disorders: biochemistry leads the charge . J Biol Chem . 2013 ; 288 ( 10 ): 6936 – 45 . OpenUrl Abstract / FREE Full Text 60. ↵ Bolhuis PA , Bikker H . Deletion of the 5’-region in one or two alleles of HEXB in 15 out of 30 patients with Sandhoff disease . Human genetics . 1992 ; 90 ( 3 ). 61. ↵ McInnes B , Potier M , Wakamatsu N , Melancon SB , Klavins MH , Tsuji S , et al. An unusual splicing mutation in the HEXB gene is associated with dramatically different phenotypes in patients from different racial backgrounds . The Journal of clinical investigation . 1992 ; 90 ( 2 ). 62. ↵ Wang X , Huang Y , Yan M , Li J , Ding C , Jin H , et al. 
Molecular spectrum of excision repair cross-complementation group 8 gene defects in Chinese patients with Cockayne syndrome type A . Sci Rep . 2017 ; 7 ( 1 ): 13686 . OpenUrl PubMed 63. ↵ Yoon JG , Lee S , Park S , Jang SS , Cho J , Kim MJ , et al. Identification of a novel non-coding deletion in Allan-Herndon-Dudley syndrome by long-read HiFi genome sequencing . BMC Med Genomics . 2025 ; 18 ( 1 ): 41 . OpenUrl PubMed 64. ↵ Lee S , Yoon JG , Hong J , Kim T , Kim N , Vandrovcova J , et al. Prevalence and Characterization of GGC Repeat Expansions in Koreans: From a Hospital Cohort Analysis to a Population-Wide Study . Neurol Genet . 2024 ; 10 ( 3 ): e200147 . OpenUrl PubMed 65. ↵ Voisin N , Schnur RE , Douzgou S , Hiatt SM , Rustad CF , Brown NJ , et al. Variants in the degron of AFF3 are associated with intellectual disability, mesomelic dysplasia, horseshoe kidney, and epileptic encephalopathy . American journal of human genetics . 2021 ; 108 ( 5 ). 66. ↵ Shimizu D , Sakamoto R , Yamoto K , Saitsu H , Fukami M , Nishimura G , et al. De novo AFF3 variant in a patient with mesomelic dysplasia with foot malformation . Journal of Human Genetics . 2019 ; 64 ( 10 ): 1041 – 4 . OpenUrl PubMed 67. ↵ Lee JH , Skalnik DG . CpG-binding protein (CXXC finger protein 1) is a component of the mammalian Set1 histone H3-Lys4 methyltransferase complex, the analogue of the yeast Set1/COMPASS complex . The Journal of biological chemistry . 2005 ; 280 ( 50 ). 68. Lee JH , Tate CM , You JS , Skalnik DG . Identification and characterization of the human Set1B histone H3-Lys4 methyltransferase complex . The Journal of biological chemistry . 2007 ; 282 ( 18 ). 69. ↵ Tate CM , Lee J-H , Skalnik DG . CXXC finger protein 1 restricts the Setd1A histone H3K4 methyltransferase complex to euchromatin . The FEBS Journal . 2010 ; 277 ( 1 ): 210 – 23 . OpenUrl CrossRef PubMed 70. Wang S , Bleeck A , Nadif Kasri N , Kleefstra T , van Rhijn J-R , Schubert D . 
SETD1A Mediated H3K4 Methylation and Its Role in Neurodevelopmental and Neuropsychiatric Disorders . Front Mol Neurosci . 2021 ; 14 : 772000 . 71. Kummeling J , Stremmelaar DE , Raun N , Reijnders MRF , Willemsen MH , Ruiterkamp-Versteeg M , et al. Characterization of SETD1A haploinsufficiency in humans and Drosophila defines a novel neurodevelopmental syndrome . Mol Psychiatry . 2021 ; 26 ( 6 ): 2013 – 24 . OpenUrl PubMed 72. Zhang J , Tao Q , Yang Z , Li Y , Gan J . De novo variant of SETD1A causes neurodevelopmental disorder with dysmorphic facies: A case report . Psychiatry Clin Neurosci . 2022 ; 76 ( 2 ): 58 – 9 . OpenUrl 73. ↵ Clifton NE , Policicchio S , Walker EM , Castanho I , Bosworth ML , Saravanaraj KS , et al. Setd1a Loss-of-function Disrupts Epigenetic Regulation of Ribosomal Genes via Altered DNA Methylation . Schizophr Bull . 2025 . 74. ↵ Xu C , Bian C , Lam R , Dong A , Min J . The structural basis for selective binding of non-methylated CpG islands by the CFP1 CXXC domain . Nature communications . 2011 ; 2 . 75. Long HK , Blackledge NP , Klose RJ. ZF-CxxC domain-containing proteins, CpG islands and the chromatin connection . Biochemical Society transactions . 2013 ; 41 ( 3 ). 76. Thomson JP , Skene PJ , Selfridge J , Clouaire T , Guy J , Webb S , et al. CpG islands influence chromatin structure via the CpG-binding protein Cfp1 . Nature . 2010 ; 464 (7291). 77. Blackledge NP , Thomson JP , Skene PJ . CpG Island Chromatin Is Shaped by Recruitment of ZF-CxxC Proteins . Cold Spring Harb Perspect Biol . 2013 ; 5 ( 11 ): a018648 . OpenUrl Abstract / FREE Full Text 78. Blattler A , Farnham PJ . Cross-talk between site-specific transcription factors and DNA methylation states . J Biol Chem . 2013 ; 288 ( 48 ): 34287 – 94 . OpenUrl Abstract / FREE Full Text 79. Liu K , Min J . Structural Basis for the Recognition of Non-methylated DNA by the CXXC Domain . J Mol Biol . 2020 ; 432 ( 6 ): 1674 – 86 . OpenUrl PubMed 80. 
↵ Yang L-Q , Hu H-Y , Han Y , Tang Z-Y , Gao J , Zhou Q-Y , et al. CpG-binding protein CFP1 promotes ovarian cancer cell proliferation by regulating BST2 transcription . Cancer Gene Therapy . 2022 ; 29 ( 12 ): 1895 – 907 . OpenUrl CrossRef PubMed 81. ↵ McCurdy VJ , Rockwell HE , Arthur JR , Bradbury AM , Johnson AK , Randle AN , et al. Widespread correction of central nervous system disease after intracranial gene therapy in a feline model of Sandhoff disease . Gene Therapy . 2014 ; 22 ( 2 ): 181 – 9 . OpenUrl PubMed 82. Gray-Edwards HL , Brunson BL , Holland M , Hespel A-M , Bradbury AM , McCurdy VJ , et al. Mucopolysaccharidosis-like phenotype in feline Sandhoff disease and partial correction after AAV gene therapy . Mol Genet Metab . 2015 ; 116 ( 1-2 ): 80 – 7 . OpenUrl CrossRef PubMed 83. McNulty MA , Prevatt PB , Nussbaum ER , Randle AN , Johnson AK , Hudson JA , et al. Abnormal epiphyseal development in a feline model of Sandhoff disease . Journal of Orthopaedic Research® . 2020 ; 38 ( 12 ): 2580 – 91 . OpenUrl 84. McCurdy VJ , Johnson AK , Gray-Edwards HL , Randle AN , Bradbury AM , Morrison NE , et al. Therapeutic benefit after intracranial gene therapy delivered during the symptomatic stage in a feline model of Sandhoff disease . Gene Therapy . 2020 ; 28 ( 3 ): 142 – 54 . OpenUrl PubMed 85. Shaimardanova AA , Solovyeva VV , Issa SS , Rizvanov AA . Gene Therapy of Sphingolipid Metabolic Disorders . International journal of molecular sciences . 2023 ; 24 ( 4 ). 86. Vyas M , Deschenes NM , Osmon KJL , Chen Z , Ahmad I , Kot S , et al. Efficacy of Adeno-Associated Virus Serotype 9-Mediated Gene Therapy for AB-Variant GM2 Gangliosidosis . Int J Mol Sci . 2023 ; 24 ( 19 ). 87. ↵ Ryckman AE , Deschenes NM , Quinville BM , Osmon KJL , Mitchell M , Chen Z , et al. Intrathecal delivery of a bicistronic AAV9 vector expressing β-hexosaminidase A corrects Sandhoff disease in a murine model: A dosage study . Molecular therapy Methods & clinical development . 
2023 ; 32 ( 1 ). 88. ↵ Wilpert N-M , Hewitt AL , Pons R , Henke M-T , Dell’Orco A , Bauer M , et al. Patients with Allan-Herndon-Dudley Syndrome (MCT8 Deficiency) Display Symptoms of Parkinsonism in Childhood and Respond to Levodopa/Carbidopa Treatment . Mov Disord . 2025 ; 40 ( 5 ): 938 – 49 . OpenUrl PubMed 89. Groeneweg S , van Geest FS , Martín M , Dias M , Frazer J , Medina-Gomez C , et al. Mapping variants in thyroid hormone transporter MCT8 to disease severity by genomic, phenotypic, functional, structural and deep learning integration . Nat Commun . 2025 ; 16 ( 1 ): 2479 . OpenUrl PubMed 90. ↵ Chen J , Salveridou E , Liebmann L , Sundaram SM , Doycheva D , Markova B , et al. Triac Treatment Prevents Neurodevelopmental and Locomotor Impairments in Thyroid Hormone Transporter Mct8/Oatp1c1 Deficient Mice . Int J Mol Sci . 2023 ; 24 ( 4 ). 91. ↵ Du A , Yang K , Zhou X , Ren L , Liu N , Zhou C , et al. Systemic gene therapy corrects the neurological phenotype in a mouse model of NGLY1 deficiency . JCI insight . 2024 ; 9 ( 19 ). 92. Zhu L , Tan B , Dwight SS , Beahm B , Wilsey M , Crawford BE , et al. AAV9-NGLY1 gene replacement therapy improves phenotypic and biomarker endpoints in a rat model of NGLY1 Deficiency . Molecular therapy Methods & clinical development . 2022 ; 27 . 93. Pandey A , Adams JM , Han SY , Jafar-Nejad H . NGLY1 Deficiency, a Congenital Disorder of Deglycosylation: From Disease Gene Function to Pathophysiology . Cells . 2022 ; 11 ( 7 ). 94. ↵ Owings KG , Lowry JB , Bi Y , Might M , Chow CY . Transcriptome and functional analysis in a Drosophila model of NGLY1 deficiency provides insight into therapeutic approaches . Hum Mol Genet . 2018 ; 27 ( 6 ): 1055 – 66 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted July 11, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. 
NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Pangenome-based identification of cryptic pathogenic variants in undiagnosed rare disease patients Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Pangenome-based identification of cryptic pathogenic variants in undiagnosed rare disease patients Se Song Jang , Seoyeon Kim , Seungbok Lee , Soo Yeon Kim , Jangsup Moon , Jun Kim , Jong-Hee Chae medRxiv 2025.07.08.25330875; doi: https://doi.org/10.1101/2025.07.08.25330875 Share This Article: Copy Citation Tools Pangenome-based identification of cryptic pathogenic variants in undiagnosed rare disease patients Se Song Jang , Seoyeon Kim , Seungbok Lee , Soo Yeon Kim , Jangsup Moon , Jun Kim , Jong-Hee Chae medRxiv 2025.07.08.25330875; doi: https://doi.org/10.1101/2025.07.08.25330875 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology 
(540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9219) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffba5915fa406cf',t:'MTc3OTQ1MTA4Mg=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper — AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00