Interoperability of standardised electronic healthcare records facilitates transfer learning

doi:10.1101/2025.06.12.25329419

Interoperability of standardised electronic healthcare records facilitates transfer learning

2025 · doi:10.1101/2025.06.12.25329419

preprint OA: gold CC-BY-4.0

📄 Open PDF Full text JSON View at publisher

Full text 23,694 characters · extracted from preprint-html · click to expand

Interoperability of standardised electronic healthcare records facilitates transfer learning | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Interoperability of standardised electronic healthcare records facilitates transfer learning View ORCID Profile Elizabeth Remfry , View ORCID Profile Rafael Henkin doi: https://doi.org/10.1101/2025.06.12.25329419 Elizabeth Remfry 1 Queen Mary University of London , London, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Elizabeth Remfry Rafael Henkin 1 Queen Mary University of London , London, United 
Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Rafael Henkin For correspondence: r.henkin{at}qmul.ac.uk Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Electronic healthcare records (EHR) use codes from different vocabularies to describe medical occurrences, often varying by type of care and country. Common data models (CDM) such as the Observational Medical Outcomes Partnership (OMOP) have been developed to enable the combination and comparison of heterogeneous datasets. We use the OMOP Standard Vocabularies to standardise two English EHR datasets and assess their interoperability through a BERT-based transformer model via pretraining and fine-tuning. Our results show the potential for standardisation to empower transfer learning, with tradeoffs related to data loss. 1 Introduction Electronic healthcare records (EHR) use codes from different vocabularies, often tailored to different countries’ healthcare realities, to describe diseases, medications and other kinds of occurrences in clinical practice. Within the same country or even care system, there is considerable heterogeneity between coding systems used in primary and secondary care, for drug prescriptions and surgical procedures. To navigate these ontological challenges and standardise clinical data, common data models (CDM) such as the Observational Medical Outcomes Partnership (OMOP) have been developed [ 8 ]. CDMs include data structures and vocabularies to facilitate combining and comparing datasets to unlock health data analysis on a global level [ 7 ]. The standardisation of clinical ontologies has also opened doors for transfer learning, where knowledge gained from one dataset can be applied to another without requiring significant effort to align vocabularies or retrain models [ 4 ]. 
Previous work has successfully trained foundation models on OMOP CDM data from one hospital and applied these models to other out-of-domain datasets across different prediction tasks [ 2 , 9 , 6 ]. In this study, we map the medical codes from two UK datasets of routinely collected data from Clinical Practice Research Datalink (CPRD), GOLD and Aurum, to standard concepts in the OMOP Standardized Vocabularies (SV). To assess interoperability, we pretrain a transformer model on CPRD Aurum, and fine-tune on both CPRD Aurum and GOLD. We report on the quality of the mapping, model performance and impact of transfer learning. 2 Methodology 2.1 Data We used EHRs from two datasets, CPRD GOLD and Aurum, and include: individuals ≥ 16 years, registered with an English general practice between January 1, 2010, and December 31, 2020, eligible for linkage to secondary care data and diagnosed with at least 2 long-term conditions (LTC) defined by prevalidated codelists [ 1 ]. In Aurum, only general practices based in London were included. The primary care datasets were linked to the Hospital Episode Statistics (HES) for secondary care data, Office for National Statistics (ONS) mortality data and Index of Multiple Deprivation (IMD) data. We identified individuals within these larger datasets with complex multi-morbidity (CMM), defined as the diagnosis of at least three LTCs that impact three separate body systems [ 3 ] using the same code lists [ 1 ]. We extracted all data from the date of CMM, which formed our baseline, until deregistration, death or study end (31/12/2020). This resulted in 185,798 (13%) cases and 1,174,173 (87%) controls in CPRD GOLD and 57,328 (8%) cases and 659,691 (92%) controls in CPRD Aurum. This data was used for the pre-training and fine-tuning of the transformer model. See Appendix for more details. 2.2 Mapping pipeline We took a pragmatic approach to mapping codes. 
First, we downloaded the standard set of vocabularies from the Athena library 1 along with additional vocabularies required for the CPRD datasets. We then mapped internal CPRD codes medcodid to external coding systems: SNOMED CT, OPCS, ICD10, Read and BNF, which are the source vocabularies. Next we mapped source codes, which are referred to as concept codes in OMOP, to concept IDs in the downloaded vocabularies. If codes are found in the SVs these can be classified as non-standard or standard. Unique concept IDs can map non-standard concepts to standard concepts using the OMOP Concept Relationship table. For example, 271649006 is a concept code from SNOMED, which is mapped to the standard concept ID 4152194. The use of standard concepts also enables consolidation of codes. For example, multiple SNOMED codes to record blood pressure can be mapped to a single blood pressure standard concept. We assessed the total volume of data and calculated the total number of concept codes with no mapping to SVs, non-standard concepts and standard concepts. We report on mapping of the larger datasets, prior to the selection of our CMM cohort. 2.3 Model We pretrained and fine-tuned a BERT-based model from scratch on our clinical corpus consisting of standard concept IDs from our CMM cohort. We used concept IDs as they are unique. We built chronologically ordered sequences of concept IDs for each patient, e.g. 4152194 4334559 4086275 , keeping only patients with sequences longer than 3 concepts within the study period. We trained a domain-specific WordPiece tokenizer from scratch on the whole CMM cohort data from CPRD Aurum. The vocabulary size was set to 20,000 and only codes that appeared over 3 times in the dataset were kept, resulting in 57% coverage of the Aurum vocabulary. We pretrained a 107M parameter BERT model on a masked language modelling task, where 15% of tokens are randomly masked. The data was split 80/10/10 for train, test and validation datasets. 
All sequences were truncated from the left, with a maximum context length of 512. We fine-tuned the pretrained models separately on Aurum and GOLD, on a downstream clinical prediction task: 5-year all-cause mortality in our CMM cohort. For fine-tuning, we kept only data from the date of CMM and up to 5 years before a death date for cases. For controls, we kept all data until end of study or deregistration. We report area under the precision-recall curve (AUPRC) and area under the receiver operating characteristic curve (AUROC). All experiments were conducted on 4 A100 GPUs in the Queen Mary’s Apocrita HPC facility, supported by QMUL Research-IT [ 5 ]. 3 Results 3.1 Concept Mapping In both GOLD and Aurum, the majority of codes and rows in the data were successfully mapped to standard concepts. In GOLD the majority of concepts were mapped via other vocabularies, particularly as Read and ICD10 codes are not SVs in OMOP. In GOLD, the greatest loss of data was in prescription data, where 5.5% of the data could not be mapped to OMOP concept IDs (unmapped codes), and Read codes where 3.8% of the data was mapped to a non-standard concept ID which was not matched to a standard concept ID. In Aurum, the largest loss of data was again through unmapped SNOMED source codes (11.2%) and non-standard concepts (8.7%) in the primary care data. See Appendix for more details for the complete mapping results. 3.2 Model performance We evaluated our pretrained model by fine-tuning on both same-database scenarios AURUM → AURUM and cross-database transfer scenarios AURUM → GOLD. Model performance was higher within the same-database, but demonstrated good performance between different datasets with a 0.10 difference between models (see table 1 ). The results may reflect the contents of the tokenizer as we trained our custom tokenizer on concept IDs from CPRD Aurum only. The tokenizer covered 68% of the concept IDs in the Aurum fine-tuning dataset and only 37% in GOLD. 
We report on the AUROC and AUPRC plots in the Appendix . View this table: View inline View popup Download powerpoint Table 1. Performance metrics comparing Aurum and GOLD dataset using bootstrap resampling on the test set using 1000 iterations and a 95% confidence interval. 4 Conclusion and future work Standardisation enabled us to reuse models and lower the time and computational expense required to train multiple models from scratch. Mapping to OMOP also helped reduce the size of the datasets, as multiple source codes (from SNOMED, Read, etc.) can map to one OMOP concept ID . Standardisation is a step towards interoperability, but does not guarantee it as the SVs include coding systems from different countries. For example, the EHRSHOT model [ 9 ] was trained on SV-compatible data, but predominantly includes codes from the LOINC system for test measurements – these tend to be recorded as standard concepts in SNOMED in UK datasets. Among the limitations of our work, our pragmatic download strategy of standardised vocabularies led to some codes from Aurum and GOLD not being mapped because the correct vocabulary was not downloaded. We also did not calculate or explore the amount of data lost per patient; future work should understand where and for whom data is lost in the conversion to OMOP. Future research should also assess the capacity for zero or few-shot learning in both the same and across datasets. We present an initial exploration of the possibilities of interoperability between two EHR datasets that rely on different clinical ontologies. Whilst the mapping steps were largely successful, data loss was experienced across both datasets. Models performed better when fine-tuning on the same dataset, but there was evidence of some transfer learning across databases, demonstrating potential to enable the sharing of large healthcare models. 
Data Availability The data underlying this article is provided by the UK CPRD electronic health record database, which is only accessible to researchers with protocols approved by CPRD Research Data Governance. Disclosure of Interests The authors have no competing interests to declare that are relevant to the content of this article. Acknowledgments ER is funded by the Wellcome Trust Health Data in Practice (HDiP) Programme (218584/Z/19/Z). This work uses data provided by patients and collected by the NHS as part of their care and support. The data used in this study was under licence. Appendix Data Processing CPRD GOLD and Aurum contain four primary coding systems: primary care (Read or SNOMED), prescriptions (dm+d), hospitalisations (ICD10) and procedures (OPCS4). For the purpose of this study, we did not map the CPRD data structures to the OMOP CDM and limited our work to code standardisation. In CPRD GOLD, we combined the tables of clinical findings, referrals and test results to represent primary care data. Before mapping datasets to OMOP CDM, we removed all rows without valid dates, those that were empty, occurred before birth or after death of the patient, or occurred after the end of the study. BERT model Pretraining was conducted using the AdamW optimizer, with a learning rate of 5e-5 and warm-up ratio of 0.1. We trained on the entire training dataset of CPRD Aurum for 5 epochs with early stopping. The model was fine-tuned for 3 epochs with early stopping with a learning rate of 2e-5. We used weighted cross entropy due to class imbalance. To assess variations in model performance we employed a bootstrap resampling method on our test set using 1000 iterations and report a 95% confidence interval. Mapping results Tables 2 and 3 show the mapping results for the four source vocabularies in GOLD and the four source vocabularies in Aurum. 
Table 4 summarises the distributions of cases and controls between GOLD and Aurum, as well as characteristics of input sequences to the model. The Aurum dataset is slightly more imbalanced than the GOLD dataset. Tables 5 and 6 show the top 30 concepts that, after standardisation, are common between cases and controls in GOLD and Aurum, respectively. Although many concepts have similar distributions, Aurum controls have an overwhelming amount of administrative codes including SMS messages and general labels such as “History of event”. View this table: View inline View popup Download powerpoint Table 2. CPRD GOLD data mapping results. NS-MV: Non-standard concepts or missing vocabulary, S-D: standard concepts directly mapped via source vocabulary, S-V: standard concepts mapped via another vocabulary View this table: View inline View popup Download powerpoint Table 3. CPRD Aurum data mapping results. NS-MV: Non-standard concepts or missing vocabulary, S-D: standard concepts directly mapped via source vocabulary, S-V: standard concepts mapped via another vocabulary View this table: View inline View popup Download powerpoint Table 4. Characteristics of model input data View this table: View inline View popup Download powerpoint Table 5. Shared concepts between GOLD and Aurum cases datasets after mapping. Values show the number of concepts per patient in each dataset. View this table: View inline View popup Download powerpoint Table 6. Shared concepts between GOLD and Aurum controls datasets after mapping. Values show the number of concepts per patient in each dataset. AUROC and AUPRC model performance Download figure Open in new tab Fig. 1. AUROC for AURUM → AURUM Download figure Open in new tab Fig. 2. AUPRC for AURUM → AURUM Download figure Open in new tab Fig. 3. AUROC for AURUM → GOLD Download figure Open in new tab Fig. 4. AUPRC for AURUM → GOLD Footnotes e.a.remfry{at}qmul.ac.uk , r.henkin{at}qmul.ac.uk ↵ 1 https://athena.ohdsi.org/ References 1. ↵ Eto , F. 
, Samuel , M. , Henkin , R. , Mahesh , M. , Ahmad , T. , Angdembe , A. , Hamish McAllister-Williams , R. , Missier , P. , J. Reynolds , N. , R. Barnes , M. , Hull , S. , Finer , S. , Mathur , R. : Ethnic differences in early onset multimorbidity and associations with health service use, long-term prescribing, years of life lost, and mortality : A cross-sectional study using clustering in the UK clinical practice research datalink 20 ( 10 ), e1004300 . doi: 10.1371/journal.pmed.1004300 , https://dx.plos.org/10.1371/journal.pmed.1004300 OpenUrl CrossRef 2. ↵ Guo , L.L. , Fries , J. , Steinberg , E. , Fleming , S.L. , Morse , K. , Aftandilian , C. , Posada , J. , Shah , N. , Sung , L. : A multi-center study on the adaptability of a shared foundation model for electronic health records 7 ( 1 ), 1 – 9 . doi: 10.1038/s41746-024-01166-w , https://www.nature.com/articles/s41746-024-01166-w , publisher: Nature Publishing Group OpenUrl CrossRef PubMed 3. ↵ Harrison , C. , Britt , H. , Miller , G. , Henderson , J. : Examining different measures of multimorbidity, using a large prospective cross-sectional study in australian general practice 4 ( 7 ), e004694 . doi: 10.1136/bmjopen-2013-004694 , https://bmjopen.bmj.com/lookup/doi/10.1136/bmjopen-2013-004694 OpenUrl Abstract / FREE Full Text 4. ↵ Hur , K. , Oh , J. , Kim , J. , Kim , J. , Lee , M.J. , Cho , E. , Moon , S.E. , Kim , Y.H. , Choi , E. : UniHPF : Universal healthcare predictive framework with zero domain knowledge . doi: 10.48550/arXiv.2211.08082 , http://arxiv.org/abs/2211.08082 OpenUrl CrossRef 5. ↵ King , T. , Butcher , S. , Zalewski , L. : Apocrita - high performance computing cluster for queen mary university of london . doi: 10.5281/zenodo.438045 , https://zenodo.org/records/438045 , publisher: Zenodo OpenUrl CrossRef 6. ↵ Kirchler , M. , Ferro , M. , Lorenzini , V. , Lippert , C. , Ganna , A. 
: Large language models improve transferability of electronic health record-based predictions across countries and coding systems p. 2025.02.03.25321597 . doi: 10.1101/2025.02.03.25321597 , https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11838679/ OpenUrl Abstract / FREE Full Text 7. ↵ Observational Health Data Sciences and Informatics: The book of OHDSI , https://ohdsi.github.io/TheBookOfOhdsi/ 8. ↵ OHDSI: OMOP common data model , https://ohdsi.github.io/CommonDataModel 9. ↵ Wornow , M. , Thapa , R. , Steinberg , E. , Fries , J.A. , Shah , N.H. : EHRSHOT: An EHR benchmark for few-shot evaluation of foundation models . doi: 10.48550/arXiv.2307.02028 , http://arxiv.org/abs/2307.02028 OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted June 12, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Interoperability of standardised electronic healthcare records facilitates transfer learning Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Interoperability of standardised electronic healthcare records facilitates transfer learning Elizabeth Remfry , Rafael Henkin medRxiv 2025.06.12.25329419; doi: https://doi.org/10.1101/2025.06.12.25329419 Share This Article: Copy Citation Tools Interoperability of standardised electronic healthcare records facilitates transfer learning Elizabeth Remfry , Rafael Henkin medRxiv 2025.06.12.25329419; doi: https://doi.org/10.1101/2025.06.12.25329419 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15227) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6597) Geriatric Medicine (668) Health Economics (997) Health Informatics (4534) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3332) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9230) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery 
(712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00262fecb424193',t:'MTc3OTUyMTc1NQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-21T05:10:58.409756+00:00

License: CC-BY-4.0