Streamlining Large-Scale Genomic Data Management: Insights from the UK Biobank Whole-Genome Sequencing Data

doi:10.1101/2025.01.27.25321225

Streamlining Large-Scale Genomic Data Management: Insights from the UK Biobank Whole-Genome Sequencing Data

2025 · doi:10.1101/2025.01.27.25321225

preprint OA: gold CC-BY-NC-ND-4.0

📄 Open PDF Full text JSON View at publisher

Full text 25,358 characters · extracted from preprint-html · click to expand

Streamlining Large-Scale Genomic Data Management: Insights from the UK Biobank Whole-Genome Sequencing Data | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Streamlining Large-Scale Genomic Data Management: Insights from the UK Biobank Whole-Genome Sequencing Data View ORCID Profile Xihao Li , View ORCID Profile Andrew R. Wood , Yuxin Yuan , Manrui Zhang , Yushu Huang , Gareth Hawkes , Robin N. Beaumont , Michael N. 
Weedon , Wenyuan Li , Xiaoyu Li , View ORCID Profile Xihong Lin , View ORCID Profile Zilin Li doi: https://doi.org/10.1101/2025.01.27.25321225 Xihao Li 1 Department of Biostatistics, University of North Carolina at Chapel Hill , Chapel Hill, NC, USA 2 Department of Genetics, University of North Carolina at Chapel Hill , Chapel Hill, NC, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Xihao Li For correspondence: xihaoli{at}unc.edu A.R.Wood{at}exeter.ac.uk xlin{at}hsph.harvard.edu lizl{at}nenu.edu.cn Andrew R. Wood 3 Department of Biomedical and Clinical Sciences, Faculty of Health and Life Sciences, University of Exeter , Exeter, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Andrew R. Wood For correspondence: xihaoli{at}unc.edu A.R.Wood{at}exeter.ac.uk xlin{at}hsph.harvard.edu lizl{at}nenu.edu.cn Yuxin Yuan 4 School of Mathematics and Statistics and KLAS, Northeast Normal University , Changchun, Jilin, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Manrui Zhang 5 Department of Sociology, Tsinghua University , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yushu Huang 6 Department of Big Data in Health Science, School of Public Health and Center of Clinical Big Data and Analytics of The Second Affiliated Hospital, Zhejiang University School of Medicine , Hangzhou, Zhejiang, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gareth Hawkes 3 Department of Biomedical and Clinical Sciences, Faculty of Health and Life Sciences, University of Exeter , Exeter, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Robin N. 
Beaumont 3 Department of Biomedical and Clinical Sciences, Faculty of Health and Life Sciences, University of Exeter , Exeter, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Michael N. Weedon 3 Department of Biomedical and Clinical Sciences, Faculty of Health and Life Sciences, University of Exeter , Exeter, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Wenyuan Li 6 Department of Big Data in Health Science, School of Public Health and Center of Clinical Big Data and Analytics of The Second Affiliated Hospital, Zhejiang University School of Medicine , Hangzhou, Zhejiang, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Xiaoyu Li 5 Department of Sociology, Tsinghua University , Beijing, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Xihong Lin 7 Department of Biostatistics, Harvard T.H. Chan School of Public Health , Boston, MA, USA 8 Department of Statistics, Harvard University , Cambridge, MA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Xihong Lin For correspondence: xihaoli{at}unc.edu A.R.Wood{at}exeter.ac.uk xlin{at}hsph.harvard.edu lizl{at}nenu.edu.cn Zilin Li 4 School of Mathematics and Statistics and KLAS, Northeast Normal University , Changchun, Jilin, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zilin Li For correspondence: xihaoli{at}unc.edu A.R.Wood{at}exeter.ac.uk xlin{at}hsph.harvard.edu lizl{at}nenu.edu.cn Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Biobank-scale Whole-Genome Sequencing (WGS) studies are increasingly pivotal in unraveling the genetic bases of diverse health outcomes. 
However, managing and analyzing these datasets’ sheer volume and complexity presents significant challenges. We propose vcf2agds , an all-in-one toolkit that efficiently converts WGS data from Variant Call Format (VCF) format to the annotated Genomic Data Structure (aGDS) format, significantly reducing data size while supporting seamless genomic and functional data integration for comprehensive genetic analyses. The toolkit was applied to the UK Biobank 500k WGS data, resulting in twenty-three aGDS files, one for each chromosome, which collectively compressed 1,473.85 Tebibytes of pVCF data into 1.10 Tebibytes. Utilizing these aGDS files, we conducted a functionally informed rare variant association analysis of total cholesterol employing the STAARpipeline and detected 480 genome-wide significant coding and noncoding associations. Overall, vcf2agds offers a streamlined approach facilitating the efficient management and analysis of biobank-scale WGS data across hundreds of thousands of samples. Over the past decade, the rapid advancements in whole-genome sequencing (WGS) technologies have generated vast amounts of data, enabling comprehensive assessment of associations between complex traits and both common and rare variants across coding and noncoding regions of the genome. Notably, the National Heart, Lung, and Blood Institute (NHLBI) Trans-Omics for Precision Medicine Program (TOPMed) 1 , the National Institutes of Health (NIH) All of Us Research Program 2 , and the UK Biobank 3 have sequenced the whole genomes of approximately 200,000, 245,000, and 500,000 individuals, respectively. Together, these initiatives have identified over 1 billion genetic variants, presenting unprecedented opportunities and challenges for data management and analysis at the biobank scale. 
A widely used format for storing human genetic variation data is the Variant Call Format (VCF) 4 , which contains detailed information and genotype counts for all genomic variants across study participants. For instance, the UK Biobank released the 500k WGS jointly called variant data produced using GraphTyper (Data Field: 23374) in 151,561 blocks of project VCF (pVCF) files across the genome ( Supplementary Table 1 ) 5 , with a total size of 1,473.85 Tebibytes (TiB) and an average of 9.96 Gibibytes (GiB) per file. However, the large size and inherent complexity of VCF files often hinder efficient downstream analyses, particularly in biobank-scale studies such as those involving the UK Biobank 500k WGS data, for which VCF has remained the only available data format since the release in November 2023. To address the limitations of VCF files in large-scale genomic studies, several alternative data formats have been developed, including PLINK BED 6 , BGEN 7 , and Genomic Data Structure (GDS) 8 format. Among these, the SeqArray GDS 9 format is designed as a storage-efficient high-performance data format for WGS variant calls, utilizing a hierarchical structure to support efficient data access and seamless integration with several analytical pipelines. The annotated Genomic Data Structure (aGDS) 10 format further extends the capabilities of SeqArray GDS by incorporating multi-faceted variant annotations to facilitate functionally-informed downstream analysis within an all-in-one file 11 . Here we present vcf2agds , a streamlined and reproducible toolkit for processing the WGS data from VCF to aGDS files, with a case study using the UK Biobank WGS data ( Fig. 1 ). The toolkit provides a four-step workflow, including VCF trimming, VCF merging, format conversion, and variant annotation. The first step ( vcf_trimmer ) involves reducing the complexity of VCF files using bcftools 12 by removing potentially redundant fields and applying quality filters (e.g. 
excluding low quality variants with AAscore less than 0.5 assigned by GraphTyper 13 ), that will not be used by downstream analyses. This preprocessing step minimizes file sizes and improves compatibility for merging, and users can customize this step by selecting specific fields and thresholds to retain the most relevant information for analysis. After trimming, the second step ( vcf_merger ) is to merge multiple VCF files into a unified dataset for each chromosome. Efficient merging ensures that the subsequent conversion to other file formats is streamlined and manageable, which is particularly critical for large-scale projects like the UK Biobank. Download figure Open in new tab Fig. 1: Overview of the cloud-based WGS analysis workflow. (i) prepare the input data, including genotypes, phenotypes and covariates; (ii) process WGS genotype data from VCF format to GDS format, using vcf_trimmer, vcf_merger and vcf2gds tools; (iii) functionally annotate all variants in the genome and generate aGDS files using favorannotator ; (iv) calculate ancestry PCs and sparse GRM using FastSparseGRM ; (v) perform functionally informed association analysis using STAARpipeline ; (vi) provide result summarization, visualization and analytical follow-up using STAARpipelineSummary . The third step ( vcf2gds ) converts the merged VCF files into GDS files using SeqArray 9 , which allows for efficient access to specific sample and/or variant subsets without loading the entire dataset into memory. The last step ( favorannotator ) performs functional annotation of the SeqArray GDS files into aGDS files through the Functional Annotation of Variants Online Resource (FAVOR) database 10 . The resulting aGDS files are equipped with both genotypes and variant annotations in the same file, providing detailed biological insights, including predicted functional impact, regulatory elements, protein function and conservation scores. 
We applied vcf2agds to the 500k release of the UK Biobank WGS dataset, converting pVCF files (Data Field: 23374) to aGDS files ( Supplementary Table 1 ). The original pVCF files, provided by the UK Biobank in 151,561 blocks, totaled 1,473.85 TiB. The aGDS files were generated as one file per chromosome, achieving a total size of only 1.10 TiB, representing a 1336x reduction compared to the pVCF files ( Supplementary Table 1 ). As a downstream application of the utility of aGDS files, we conducted a comprehensive analysis of total cholesterol (TC) using variants that passed quality control in 500k WGS data ( Supplementary Table 2 ). Leveraging the STAARpipeline for variant set analysis across diverse coding and noncoding genomic units 14 ( Supplementary Note ), we identified 480 genome-wide significant associations. These associations were determined using Bonferroni-corrected significance thresholds of $\alpha = 0.05/(20{,}000 \times 7) = 3.57 \times 10^{-7}$, accounting for 7 different coding or noncoding masks across protein-coding genes, and $\alpha = 0.05/20{,}000 = 2.50 \times 10^{-6}$, accounting for noncoding RNA genes. Among these, 200 associations were uncovered through gene-centric coding analyses, while 280 were identified through gene-centric noncoding analyses ( Supplementary Fig. 1, Supplementary Tables 3-4 ). Further conditional analysis can be performed to detect putatively novel associations adjusting for the previously reported TC-associated variants and independent associations via aggregate conditioning 14 , 15 . We further benchmarked the storage requirements of aGDS files using the 200k release of the UK Biobank WGS dataset, compared with pVCF files (Data Field: 24304), PLINK BED files (Data Field: 24305) and BGEN files (Data Field: 24306). The original pVCF files, PLINK BED and BGEN files totaled 533.62 TiB, 26.86 TiB and 0.87 TiB in size, respectively. 
The aGDS files, generated using vcf2agds , had a total size of 0.49 TiB, representing a 1084x reduction in size compared to the original pVCF files, and a compression ratio of 54.61x relative to PLINK BED files and 1.77x relative to BGEN files. Notably, the aGDS format uniquely integrates variant functional annotation data within the same file, making it the most compact and information-rich data structure among the formats compared ( Supplementary Table 5 ). Although these benchmarks were conducted on data generated from joint variant calling using GraphTyper for both the 200k and 500k releases of the UK Biobank WGS data 5 , the proposed vcf2agds toolkit can seamlessly be applied to WGS data generated from joint variant calling with Illumina DRAGEN (Data Field: 24310) for the 500k release as well. These results underscore the flexibility and efficiency of the vcf2agds toolkit in managing large-scale genomic datasets, a conversion that only needs to be performed once for all downstream analyses. Overall, vcf2agds provides a streamlined solution for converting VCF files to aGDS format, enabling efficient management and analysis of biobank-scale WGS data of hundreds of thousands of samples. As WGS datasets continue to expand in size and complexity, we hope that our proposed tools will play an increasingly important role in unlocking the full potential of genomic research. Data Availability Access to the UK Biobank resource is available via application ( https://www.ukbiobank.ac.uk/ ). Development of software was undertaken under UK Biobank applications 9072 and 103356. Benchmarking results for UK Biobank WGS data were generated using the UK Biobank resource under applications 91486 and 52008. All analyses were conducted in the UK Biobank Research Analysis Platform (RAP, https://ukbiobank.dnanexus.com/ ). Code and data availability vcf2agds is a suite of open-source software freely available at https://github.com/drarwood/vcf2agds_overview . 
Access to the UK Biobank resource is available via application ( https://www.ukbiobank.ac.uk/ ). Development of software was undertaken under UK Biobank applications 9072 and 103356. Benchmarking results for UK Biobank WGS data were generated using the UK Biobank resource under applications 91486 and 52008. All analyses were conducted in the UK Biobank Research Analysis Platform (RAP, https://ukbiobank.dnanexus.com/ ), and the generated aGDS files will be made available as a ‘Returned Dataset’. STAARpipeline is implemented as an open-source R package available at https://github.com/xihaoli/STAARpipeline , and as an applet in UK Biobank RAP available at https://github.com/xihaoli/staarpipeline-rap . Competing interests X. Lin is a consultant of AbbVie Pharmaceuticals and Verily Life Sciences. The remaining authors declare no competing interests. Acknowledgements We would like to acknowledge Timothy M. Frayling at the University of Geneva for the contribution of funds for software development and testing on the DNAnexus Research Analysis Platform. This study is supported by the National Institute for Health and Care Research (NIHR) Exeter Biomedical Research Centre (BRC). The views expressed are those of the author(s) and not necessarily those of the NIHR or the Department of Health and Social Care. M.N.W. and R.N.B. are supported by MRC grant MR/Y003748/1. X. Lin is supported by the NIH grants R35-CA197449, R01-HL163560, U19-CA203654, U01-HG012064, and U01-HG009088. References 1. ↵ Taliun , D. et al. Sequencing of 53,831 diverse genomes from the NHLBI TOPMed Program . Nature 590 , 290 – 299 ( 2021 ). OpenUrl CrossRef PubMed 2. ↵ Bick , A.G. et al. Genomic data in the All of Us Research Program . Nature 627 , 340 – 346 ( 2024 ). OpenUrl CrossRef PubMed 3. ↵ Li , S. , Carss , K.J. , Halldorsson , B.V. & Cortes , A. Whole-genome sequencing of half-a-million UK Biobank participants . medRxiv , 2023.12.06.23299426 ( 2023 ). 4. ↵ Danecek , P. et al. 
The variant call format and VCFtools . Bioinformatics 27 , 2156 – 2158 ( 2011 ). OpenUrl CrossRef PubMed Web of Science 5. ↵ Eggertsson , H.P. et al. Graphtyper enables population-scale genotyping using pangenome graphs . Nature Genetics 49 , 1654 – 1660 ( 2017 ). OpenUrl CrossRef PubMed 6. ↵ Purcell , S. et al. PLINK: A Tool Set for Whole-Genome Association and Population-Based Linkage Analyses . The American Journal of Human Genetics 81 , 559 – 575 ( 2007 ). OpenUrl CrossRef PubMed 7. ↵ Band , G. & Marchini , J. BGEN: a binary file format for imputed genotype and haplotype data . bioRxiv, 308296 ( 2018 ). 8. ↵ Zheng , X. et al. A high-performance computing toolset for relatedness and principal component analysis of SNP data . Bioinformatics 28 , 3326 – 3328 ( 2012 ). OpenUrl CrossRef PubMed Web of Science 9. ↵ Zheng , X. et al. SeqArray—a storage-efficient high-performance data format for WGS variant calls . Bioinformatics 33 , 2251 – 2257 ( 2017 ). OpenUrl CrossRef PubMed 10. ↵ Zhou , H. et al. FAVOR: functional annotation of variants online resource and annotator for variation across the human genome . Nucleic Acids Research 51 , D1300 – D1311 ( 2023 ). OpenUrl PubMed 11. ↵ Li , X. et al. Dynamic incorporation of multiple in silico functional annotations empowers rare variant association analysis of large whole-genome sequencing studies at scale . Nature Genetics 52 , 969 – 983 ( 2020 ). OpenUrl CrossRef PubMed 12. ↵ Li , H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data . Bioinformatics 27 , 2987 – 2993 ( 2011 ). OpenUrl CrossRef PubMed Web of Science 13. ↵ Halldorsson , B.V. et al. The sequences of 150,119 genomes in the UK Biobank . Nature 607 , 732 – 740 ( 2022 ). OpenUrl CrossRef PubMed 14. ↵ Li , Z. et al. A framework for detecting noncoding rare-variant associations of large-scale whole-genome sequencing studies . Nature Methods 19 , 1599 – 1611 ( 2022 ). 
OpenUrl CrossRef PubMed 15. ↵ Hawkes , G. et al. Whole-genome sequencing in 333,100 individuals reveals rare non-coding single variant and aggregate associations with height . Nature Communications 15 , 8549 ( 2024 ). OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted January 28, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Streamlining Large-Scale Genomic Data Management: Insights from the UK Biobank Whole-Genome Sequencing Data Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Streamlining Large-Scale Genomic Data Management: Insights from the UK Biobank Whole-Genome Sequencing Data Xihao Li , Andrew R. Wood , Yuxin Yuan , Manrui Zhang , Yushu Huang , Gareth Hawkes , Robin N. Beaumont , Michael N. Weedon , Wenyuan Li , Xiaoyu Li , Xihong Lin , Zilin Li medRxiv 2025.01.27.25321225; doi: https://doi.org/10.1101/2025.01.27.25321225 Share This Article: Copy Citation Tools Streamlining Large-Scale Genomic Data Management: Insights from the UK Biobank Whole-Genome Sequencing Data Xihao Li , Andrew R. Wood , Yuxin Yuan , Manrui Zhang , Yushu Huang , Gareth Hawkes , Robin N. Beaumont , Michael N. 
Weedon , Wenyuan Li , Xiaoyu Li , Xihong Lin , Zilin Li medRxiv 2025.01.27.25321225; doi: https://doi.org/10.1101/2025.01.27.25321225 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetic and Genomic Medicine Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4436) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6600) Geriatric Medicine (668) Health Economics (997) Health Informatics (4538) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (542) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3333) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9232) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00f780acb9afbb0',t:'MTc3OTY1ODkzMg=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-21T05:10:58.409756+00:00

License: CC-BY-NC-ND-4.0