A large dataset of brain imaging linked to health systems data: the curation and access to a whole system national cohort from NHS Scotland

doi:10.1101/2025.10.21.25338469

A large dataset of brain imaging linked to health systems data: the curation and access to a whole system national cohort from NHS Scotland

2025 · doi:10.1101/2025.10.21.25338469

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 52,210 characters · extracted from preprint-html · click to expand

A large dataset of brain imaging linked to health systems data: the curation and access to a whole system national cohort from NHS Scotland | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search A large dataset of brain imaging linked to health systems data: the curation and access to a whole system national cohort from NHS Scotland View ORCID Profile Michael P J Camilleri , View ORCID Profile Dorian Gouzou , Salim Al-Wasity , View ORCID Profile Muthu R K Mookiah , View ORCID Profile María Valdes Hernandez , View ORCID Profile Bea Alex , View ORCID Profile Sotirios A. Tsaftaris , View ORCID Profile Andrew Brooks , View ORCID Profile Ruairidh MacLeod , View ORCID Profile Honghan Wu , View ORCID Profile Brenda Bauer , View ORCID Profile Claire Grover , View ORCID Profile Parminder Reel , View ORCID Profile Susan Krueger , View ORCID Profile Richard Tobin , View ORCID Profile J. Douglas Steele , View ORCID Profile Grant Mair , View ORCID Profile Joanna Wardlaw , View ORCID Profile Alexander Doney , View ORCID Profile Emanuele Trucco , View ORCID Profile William Whiteley doi: https://doi.org/10.1101/2025.10.21.25338469 Michael P J Camilleri 1 Computing, School of Science and Engineering, University of Dundee , Dundee, UK 2 School of Engineering, University of Edinburgh , Edinburgh, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Michael P J Camilleri Dorian Gouzou 3 Institute for Neuroscience and Cardiovascular Research, School of Medicine, University of Edinburgh , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Dorian Gouzou Salim Al-Wasity 4 School of Medicine, Ninewells NHS and University Hospital , Dundee, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Muthu R K Mookiah 4 School of Medicine, Ninewells NHS and University Hospital , Dundee, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Muthu R K Mookiah María Valdes Hernandez 3 Institute for Neuroscience and Cardiovascular Research, School of Medicine, University of Edinburgh , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for María Valdes Hernandez Bea Alex 5 School of Informatics, University of Edinburgh , Edinburgh, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Bea Alex Sotirios A. Tsaftaris 2 School of Engineering, University of Edinburgh , Edinburgh, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sotirios A. Tsaftaris Andrew Brooks 6 Edinburgh Parallel Computing Centre, University of Edinburgh , Edinburgh, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Andrew Brooks Ruairidh MacLeod 6 Edinburgh Parallel Computing Centre, University of Edinburgh , Edinburgh, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ruairidh MacLeod Honghan Wu 7 Usher Institute, School of Medicine, University of Edinburgh , Edinburgh, UK 8 School of Health and Wellbeing, University of Glasgow , Glasgow, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Honghan Wu Brenda Bauer 3 Institute for Neuroscience and Cardiovascular Research, School of Medicine, University of Edinburgh , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Brenda Bauer Claire Grover 5 School of Informatics, University of Edinburgh , Edinburgh, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Claire Grover Parminder Reel 9 Health Informatics Centre, School of Medicine, University of Dundee , Dundee, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Parminder Reel Susan Krueger 9 Health Informatics Centre, School of Medicine, University of Dundee , Dundee, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Susan Krueger Richard Tobin 5 School of Informatics, University of Edinburgh , Edinburgh, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Richard Tobin J. Douglas Steele 4 School of Medicine, Ninewells NHS and University Hospital , Dundee, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for J. Douglas Steele Grant Mair 3 Institute for Neuroscience and Cardiovascular Research, School of Medicine, University of Edinburgh , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Grant Mair Joanna Wardlaw 3 Institute for Neuroscience and Cardiovascular Research, School of Medicine, University of Edinburgh , UK 10 UK Dementia Research Institute Centre at the University of Edinburgh Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joanna Wardlaw Alexander Doney 11 Cardiovascular Research, School of Medicine, University of Dundee , Dundee, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alexander Doney Emanuele Trucco 1 Computing, School of Science and Engineering, University of Dundee , Dundee, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Emanuele Trucco William Whiteley 3 Institute for Neuroscience and Cardiovascular Research, School of Medicine, University of Edinburgh , UK 7 Usher Institute, School of Medicine, University of Edinburgh , Edinburgh, UK 12 Health Data Research UK , London, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for William Whiteley For correspondence: william.whiteley{at}ed.ac.uk Abstract Full Text Info/History Metrics Data/Code Preview PDF ABSTRACT We present the design and implementation of a data curation framework to generate a large-scale clinical brain imaging dataset suitable for artificial intelligence (AI) enabled image analysis, accessible through the Brain Health Data (BHD) initiative. The raw data accessible through the BHD includes approximately 417K magnetic resonance imaging (MRI) and 846K computerized tomography (CT) head scans, linked electronic health records (EHRs), and associated free-text imaging reports from clinical practice between 2010 and 2018 in Scotland, totally exceeding 185 TB storage of brain imaging and associated data. We present the work curating the dataset and the strengths of the BHD, including clinical relevance thanks to its unprecedented scale, population-wide representativeness of a national free-at-the-point-of-service healthcare, long-term follow-up to neurodegenerative disease, and real-world variability. We discuss challenges and lessons learnt in developing the framework to curate the data initially available, including the time needed to obtain relevant permissions, the need for easily accessible, secure, responsive and affordable computational environments, the variability and inconsistencies of clinical data and records, and the challenge of extracting linked clinical data and images at scale, among others. This resource will be crucial for clinical research, fostering the development of personalized medicine approaches, and fast-tracking the implementation of AI models in clinical workflows. We encourage the use of the BHD data through a streamlined application to the Data Research and Innovation Service (eDRIS) of Public Health Scotland (PHS). INTRODUCTION Brain imaging plays a crucial role in the diagnosis of neurological disorders. However, clinical imaging services are under great demand, highlighting the need for new tools to improve radiology workflows. These tools should accelerate image assessment, reduce the workload for radiologists, and ultimately improve patient care. Artificial intelligence (AI) methods show promise for faster diagnosis (for example in acute ischaemic stroke)[ 1 ] and similar improvements are possible in head injury, neurodegeneration and brain cancers[ 2 , 3 ] To develop and test AI models that are clinically relevant, researchers need access to large datasets of clinically acquired images, with sufficient computing resources, and secure, ethical data provision. We present the Brain Health Data (BHD) initiative, a framework that unifies all clinical brain imaging data with related information collected in Scotland, to facilitate their access through the Electronic Data Research and Innovation Service (eDRIS) of Public Health Scotland (PHS) for other researchers. The BHD dataset includes magnetic resonance imaging (MRI) and computerized tomography (CT) head scans, linked electronic health records (EHRs), and free text radiology reports. We also present the design and implementation of the MRI data processing framework to curate this large-scale clinical brain imaging dataset in a format suitable for AI analysis, which we make available through the BHD. The work was undertaken during the SCANDAN project (SCottish AI in Neuroimaging to predict Dementia and Neurodegenerative Disease), which aimed to develop AI algorithms for reliable dementia risk estimation with routine brain imaging and clinical records. The SCANDAN work packages are illustrated in Figure 1 . SCANDAN used Scotland-wide clinical practice data from 2010 to 2018 from more than 830 thousand individuals, now made accessible through the BHD. The BHD dataset offers clinical relevance with its unprecedented scale, population-wide representativeness of healthcare, long-term follow-up to neurodegenerative disease, and real-world variability. Download figure Open in new tab Figure 1. Work packages (WP) in the SCANDAN project: Data labelling, cohort building, image cataloguing, and processing for classification into being indicative of having dementia or not using deep learning (DL), support vector machine (SVM) and from the analysis of extracted imaging phenotypes. A survey conducted between December 2024 and February 2025 across UK secure data environments found no comparable brain imaging resource with nationwide coverage. For instance, the Diagnostic Imaging Dataset curated by NHS England includes patient-level metadata on the 501 million diagnostic imaging procedures performed within NHS England facilities since April 2012, but it lacks actual imaging data and associated reports.[ 4 ] The preparation of large repositories of routinely collected imaging data is challenging, particularly within secure data environments that are designed to protect patient privacy. Despite adherence to Digital Imaging and Communications in Medicine (DICOM)[ 5 ] standards as best practice, real-world medical imaging datasets vary significantly in quality, format, and acquisition protocols, which makes standardization across different imaging sources necessary. Determining imaging sequences (e.g., T1-or T2-weighted MRI) is essential for analysis, but difficult in practice. DICOM meta-data tags provide rapid but sometimes unreliable classification, while image-based classification is more accurate but computationally demanding, and not free from ambiguities. Natural language processing (NLP) of radiology reports can aid in sequence identification if recorded in the free text. Filtering out unusable scans, such as those with artefacts or missing brain structures, is a critical step. Automated AI-based image quality assessment can help, but manual verification may be necessary. The automation of data pipelines, and the integration of structured clinical records with imaging data, can facilitate data retrieval and pre-processing. Compliance with governance frameworks is important to access large-scale unconsented clinical imaging datasets within safe havens, and needs ethical approval, data governance agreements, and compliance with privacy regulations, all with costs. These administrative barriers, although necessary, can significantly delay or completely deter research. To address these challenges, SCANDAN developed an automated framework that extracted, cleaned and prepared structured brain imaging and clinical data for AI analysis, creating a research-ready dataset. This paper presents SCANDAN’s methods, highlights key lessons learned from working within Scotland’s data governance frameworks, and describes how to access the data through the BHD. METHODS Data sources PHS’ eDRIS provided brain CT and MR head studies in adults performed in Scotland between 2010 and 2018 from the Scottish Medical Imaging (SMI) service [ 6 ]. S tudy refers here to a complete imaging session, encompassing all images obtained during a single scanning session. Each scan contains three hierarchical levels: study, series, and images. Within each study, there are one or more series that group together images acquired using the same imaging technique or settings. Each series is, in turn, composed of multiple single images or “slices”. We linked each study (i.e., one per person) deterministically with pseudonymized identifiers based on the Community Health Index (CHI) number which is the unique patient identifier used across NHS Scotland. We linked them with hospital admission records (SMR01), dementia records from mental health hospitalisations (SMR04), dispensed prescriptions from Prescribing Information System (PIS), death records (National Records of Scotland (NRS)) and demographics (birth year, sex, deprivation index) since the year 2000. All the data was processed and stored within the National Safe Haven (NSH) a secure data environment provisioned by the Edinburgh Parallel Computing Centre (EPCC) ( https://edinburgh-international-data-facility.ed.ac.uk/services/safe-haven-services/scottish-national-safe-haven ). Permissions and research governance For SCANDAN, multicentre research ethics permission from the NHS Human Research Authority was obtained from the North of Scotland Research Ethics Committee (23/NS/0017). Permission to access the data was provided by the NHS Scotland Public Benefit and Privacy Panel for Health and Social Care (HSC-PBPP), a patient advocacy panel which scrutinises applications for access to NHS Scotland health data for non-direct care (PBPP application 2223-0200 and 2223-0005). The SCANDAN team engaged with several Scottish public and patient groups during the application process and throughout the project. Computing environment The Scottish NSH is hosted in EPCC’s Trusted Research Environment (TRE), a secure infrastructure which currently hosts twelve Safe Havens. Each Safe Haven is operated under the “Five Safes” framework,[ 7 ] designed by the Office for National Statistics, and the Scottish Government Charter for Safe Havens.[ 8 ] Researchers access a secure data sharing and analysis environment with a virtual desktop, under the terms and conditions prescribed by the data providers. Standard software packages such as R and Python are available in the NSH; additional software packages can be installed from repositories such as the comprehensive R archive network (CRAN) and the Python package index (PyPI). Safe Havens have access to large shared-memory, high-performance computer clusters, including one with graphical processing unit (GPU) accelerators for large-scale analysis. All EPCC Safe Haven Services are operated at EPCC’s Advanced Computing Facility, located in Edinburgh, Scotland. The EPCC Trusted Research Environment (TRE) is accredited by ISO27001 [ 9 ] for information security practices and self-certified under Cyber Essentials and NHS Digital’s Data Security and Protection Toolkit (DSPT). In addition, the NSH is accredited under the Digital Economy Act 2017 by the UK Statistics Authority, and all Safe Havens in the TRE are operated to the same standard. Natural language processing of brain imaging reports We applied a clinical NLP tool, the Edinburgh Information Extraction for Radiology (EdIE-R), [ 10 , 11 ] which was originally developed and validated for radiology reports of brain imaging in the Edinburgh Stroke Study and NHS Tayside [ 12 ] EdIE-R processes radiology reports through a pipeline architecture that identifies entities, detects negation, extracts relationships and assigns document-level labels to identify phenotypes referring to brain abnormalities. The tool was later adapted and validated for use with data from other areas in Scotland provided by Generation Scotland.[ 13 ] EdIE-R was used to extract 24 distinct phenotypes, including different types of strokes (ischaemic, haemorrhagic and underspecified, with time and location details), brain tumours (meningiomas, gliomas, metastases or underspecified), small vessel disease, microbleeds, atrophy and other neurological abnormalities. Additionally, it marked up MRI sequence types (T1, T2, and FLAIR) to support further analysis. To improve data selection, EdIE-R was enhanced to identify scans of non-head and non-brain body parts, and flag them for exclusion. This ensured that only relevant imaging reports were included in the analysis. Enhancement of the detection of section boundaries allowed more precise separation of clinical history from the main report text so that phenotype mentions were only extracted from the latter. The refined EdIE-R pipeline was applied to all radiology reports in the SCANDAN project, producing structured outputs to guide data selection for image analysis and predictive modelling. By processing radiology reports within the Scottish NSH, the tool allows exclusion of scans (e.g., those showing tumours or non-brain regions) and served to validate outputs from imaging type classification and phenotype extraction. This process streamlined the subsequent image analysis and predictive modelling processes. Phenotyping dementia We defined dementia with ICD-10 codes or prescribed medicines that are generally used for Alzheimer’s disease (AD) [ 14 ]. Each patient interaction with the health system was used, including a single stay in hospital, multiple consecutive stays, a prescription, or a death record. We defined labels for ‘any dementia’ and five dementia subtypes: AD, vascular dementia, other rare dementias, unspecified dementia and possible dementia. The subtype was defined as the most frequently occurring dementia phenotype in each person’s electronic record. ( Error! Reference source not found .). All individuals with a dementia label were categorised as cases, and individuals with no mention of dementia in any record were considered controls. Cohort building To test the BHD data capability, we built a matched case-control study cohort with MR brain images. A matched case-control design was chosen for several reasons. First, most deep learning and other algorithms work best with balanced cases and controls. Second, we had limited computing capacity at the beginning of the project. Third, the rate of image delivery was limited by the need to copy data from a preparation area to a research area which had limited storage capacity. We first selected useable images from those that had gone through the data pipeline to define image sequence, ensure presence of whole brain, absence of other body parts and absence of intravenous contrast, and had no NLP label of tumour or haemorrhagic stroke in the radiology report. We selected people who were aged over 40 years at the time of scan or had an associated EHR record. To exclude people with a dementia diagnosis at time of scan, we defined controls as those without a diagnosis of dementia at any point in their record, and cases where a diagnosis of dementia was made one year or more after their first scan. Dementia cases were matched to controls based on age at the time of scan (within one year of the matched case) and recorded sex from the linked demographic information. Cases with no matched controls were discarded. Age and sex matching was verified by analysing the resulting distributions over the entire cohort. Identification of MR scan type and sequence DICOM tags were used to produce five labels for each image series: imaging sequence, presence of brain, presence of other body part, angiography, and imaging with contrast. We aimed to retain MRI series with sequences T1, T2 and Fluid-Attenuated Inversion Recovery (FLAIR), containing a brain and no other body part than the neck, without angiography or contrast, and with a 3D image volume of over 5 litres. These labels were subsequently combined with the results from the NLP tool, the MRI acquisition parameters, and the computed volume of the image series, to exclude those which did not meet SCANDAN criteria. To produce these labels, the DICOM tags were parsed with regular expressions. For example, the expressions /(?i)(?<!pa)t2/ (case insensitive and ignoring occurrences starting with “pa”) and /*se2d1/ were associated with the intermediate label “tmp-T2” (T2-weighted), while the expressions /TOF/ and /MRA/ were associated with the intermediate label “tmp-MRA” (MR angiography). Then, the final labels were created by grouping all the intermediate labels of a series. For example, for an image series to be labelled “T1”, it had to match the intermediate label “tmp-T1”, and could optionally match the intermediate labels “FLAIR”, “GRE” (gradient echo), and “FAT SAT” (fat saturation), which are not T1-weighted exclusive, but not any other intermediate label. For sequence identification, the DICOM tag ‘Series Description’ (0008,103E) was used. To identify the body part, we used the tags ‘Body Part Examined’ (0018,0015), ‘Protocol Name’ (0018,1030), ‘Performed Procedure Step Description’ (0040,0254) and ‘Study Description’ (0008,1030). Angiograms were identified with the tags ‘Angio Flag’ (0018, 00), ‘Study Description’ (0008,1030), ‘Protocol Name’ (0018,1030) and ‘Series Description’ (0008,103E). Contrast identification used ‘Study Description’ (0008,1030), ‘Contrast/Bolus Agent’ (0018,0010), ‘Contrast/Bolus Route’ (0018,1040), ‘Performed Procedure Step Description’ (0040,0254) and ‘Series Description’ (0008,103E). The results from the NLP tool provided additional information for the sequence identification and the presence of other body parts. Finally, the MRI sequence was defined with the tags ‘Echo Time’ (0018,0081), ‘Inversion Time’ (0018,0082), ‘Repetition Time’ (0018,0080), ‘Scanning Sequence’ (0018,0020), ‘Flip Angle’ (0018,1314), ‘Sequence Name’ (0018,0024) based on “optimal” value [ 7 - 8 ] adapted to the data through observation on manually annotated data, to complement the identification based on the series description. Data validation and quality control To generate ground truth labels for an initial evaluation of the automatic labelling process described above, we developed a custom python-based graphical user interface (GUI) optimized for MRI and CT DICOM files. The GUI allowed users to load DICOM images from a single or nested folder structure. It utilized DICOM header metadata to: stack slices according to the acquisition order using the DICOM tag ‘Instance Number’ (0020,0013), in ascending, descending, or interleave format, constructing and saving the 3D volumes for subsequent analysis; filter CT scans using the tag ‘Modality’ (0008,0060) and adjust their intensities, e.g. brain-windowing, using the tag ‘Rescale Intercept’ (0028,1052); determine the orientation of the imaging planes (i.e., axial, sagittal, or coronal) using the ta’Image Orientation (Patient)’ (0020,0037) to display mid-axial, mid-coronal, and mid-sagittal views for assessment; and 4c alculate the aspect ratio using the ‘Slice Thickness’ (0018,0050) and ‘Pixel Spacing’ (0028,0030) tags for accurate scaling and visualising of mid-view slices within the designated display area. Five experts (3 clinicians and 2 trained imaging scientists) annotated 713 randomly selected scans (388 MRI, 319 CT) with modality, sequence, presence of contrast, lesion and artefacts, presence of full brain, and presence of body parts ( Figure 2 ). The 713 scans were evenly divided among the five annotators, with a subset of 100 images overlapping for cross-validation. The Generative model of Labels, Abilities, and Difficulties (GLAD) [ 14 ] probabilistic framework was used to estimate the true label for each image while accounting for annotator expertise and image difficulty. Download figure Open in new tab Figure 2. Criteria used by the annotators to label the test imaging set using the GUI developed. The ground-truth labels were compared with those from the automatic labelling using GLAD. Scans were further reviewed by a neuroradiologist and an experienced imaging scientist independently where the ground truth and automatic labels disagreed regarding sequence type and contrast, or the ground-truth labels were “unknown” or “uncertain”. A third round of annotations resolved disagreements between the neuroradiologist and the imaging scientists regarding the sequence type. The presence of brain, of other body part and the fullness of the brain was re-annotated by a trained image scientist, due to the simpler amount of ‘unknown’ result. RESULTS Dataset description MR and CT brain images were available for 830,884 people. Exclusion criteria at subject levels and their effects are listed in Table 3 . After applying exclusion criteria, 10,709 MRI and 57,242 CT dementia cases were age and sex matched to the same number of healthy controls. The earliest 21,418 MRI and 114,484 CT studies associated with these subjects were requested, of which 21,197 and 94,740 were successfully retrieved. View this table: View inline View popup Download powerpoint Table 1. International classification of diseases 10 ( ICD - 10 ) and British national formulary (BNF) codes to define dementia subtypes. View this table: View inline View popup Download powerpoint Table 2. Labels from automated pipeline versus ground truth manual annotation for image characteristics View this table: View inline View popup Download powerpoint Table 3. Selection process for the MRI and CT cohorts, showing both the overall and dementia counts, with addition of characteristics The MRI cohort contained 8,145 cases (53.2% female) and 7,236 controls (54.1% female) ( Table 4 ). The mean age at scan was 74 years. The mean time from scan to first mention of dementia was 5 years for cases, and the mean follow-up time for controls was 6 years 9 months. Of the 8,145 dementia cases there was non-exclusive record of AD in 3,774, vascular dementia in 3,386, unspecified dementia in 3,784 and other dementia types in 508. The mean number of hospitalisations in the year prior to scan was 1.1 (standard deviation ([SD] 1.52) for cases and 1.0 (SD 1.50) for controls. During the same period, the mean number of prescriptions was 15.4 for cases and 14.2 for controls. View this table: View inline View popup Download powerpoint Table 4. Distribution of subjects with mention of each dementia type and, controls grouped by key characteristics View this table: View inline View popup Download powerpoint Table 5. Final population Data validation and quality control For simplicity, we refer to the results of the manual annotations as “annotations”, and the results of the automatic tools which are compared to the manual annotations as “labels”. During the first round, 707 annotations were obtained from 713 images. Four images could not be read due to acquisition errors. Two images were only partially annotated due to visual perception error and discarded. 29 (4.5%) images were wrongly annotated because several scans containing only one slice (localiser) or did not containing a brain. As the annotators were less familiar with CT, 24 (7.52%) of CT images were written as ‘Unknown’. In the labelling process we used the previously validated DICOM tag ‘Modality’ (0008,0060), to identify CT and MRI scans. The identification of the body parts was easier for the annotators than the modality or sequence type. “Unknown” was given for 23 (3.25%) series when annotators were questioned whether they contained a brain or not, 40 (5.66%) when questioned if the brain was acquired in full, and in 36 (5.09%) series the annotators could not assert whether there was another body part. Annotators could not identify the sequence type for 85 (12.02%) series and the presence of contrast in 104 (14.71%) series. In this first round of annotations, the main source of disagreements between annotations and labels was the presence of non-brain images or localisers. For example, non-brain scans had a higher rate of mistakes in sequence annotation. For further analyses, the scans labelled as ‘localiser’ in the first round of annotations, which contained under 15 slices, were ignored. If the annotation and label agreed on the absence of brain, the images were not re-annotated. In the second round of annotations, the images with unknown sequence type (16 scans), and those with disagreement between label and annotation in which either assigned ‘T1’, ‘T2’ or ‘FLAIR’ (48 scans) were re-annotated. Additionally, a subset of images was selected out of 66 which had partial agreement between at least one sequence label and the annotation, to validate commonly occurring combination of labels which were not similar but not outright disagreeing (e.g. ‘T1’ + ‘GRE’ instead of ‘T1’ +). Scans which contained at least one mention of ‘T1’, ‘T2’ or ‘FLAIR’ in either the annotation or the label were re-annotated for contrast presence when disagreement was found or when they were annotated as ‘Unknown’. Series with ‘Unknown’ annotation for questions regarding brain presence (13), other body part presence (8) and whole brain (11) were also re-annotated. Disagreement between the annotation and the labels were also re-annotated, respectively 7, 143 and 20 scans. In case of a whole brain, the disagreement was ignored if other body parts were present and if label and annotation agreed. In total, 143 series were re-annotated for the presence of brain and other body parts, and for full brain coverage. Additionally, there were 84 images re-annotated for sequence type and contrast. To resolve conflict between the two re-annotators, or between them and the labelling tools, 27 series were then annotated a third time. Some conflicts could not be resolved, such as 7 images having the same ‘Series Description’ (0008,103E) tag value, and thus the same label. Three of them were identified as T1 and four as T2* by the two annotators in agreement. Between each round of annotation, the regular expressions used by the labelling tools were updated to reflect previously unknown, and to solve conflicting information and errors. The results of the labelling tools compared to the annotation as ground truth were very good. The true positive rate ranged from 87% to 97% and the positive predictive value from 81% to 99% ( Table 2 ). The presence of other body parts and contrast was amended to their absence, the target which we considered “positive”. This value excludes localisers for sequence type, and series without presence of brain for ‘whole brains’. The lower precision for detection of other body parts is explained by the lack of mention of any parts in the different DICOM tags, sometimes due to missing data, as well as the detection of some other head parts, such as the jaw, without mention of the brain, which often, but not always, indicate non brain scans. The lower recall for the absence of contrast is caused by the low number of studies that used intravenous (IV) contrast. During a scanning session that used IV contrast, a first image will normally be captured free of contrast, prior to the injection, however, the ‘Study Description’ (0008,1030) will indicate the presence of IV contrast nonetheless for this first series, as was commonly found. Permissions and governance Our application to the PBPP, which included an industry partner and aimed to develop an AI algorithm, required 210 days spanning 4 iterations for approval from the initial submission and over 17,000 words across 33 pages. The data flow and linkage process for the BHD framework are schematically illustrated in Figure 3 . Researchers can either log into a workspace running in the NSH, over approved secure channels, where data will be provided directly alongside the tools to perform analysis. To run externally developed tools, they can build a container outside the NSH, and pull it from a public registry (such as the gihub container registry: ghcr.io). Alternatively, we plan to allow for the ADDI Workbench ( https://www.alzheimersdata.org/ad-workbench ) to run an analysis workflow externally, and receive the output once approved by eDRIS. It should be noted that no data leaves the EPCC TRE during this process. The TRE is divided in several zones ( Figure 3 ). The blue zones, where eDRIS store the data, are not accessible to the researchers. The different areas the researchers have access to carry their work comprise the green zones. They will have access to subsets of the data, as defined by their project group. The zones coloured in yellow are external to the NSH and represents cloud resources, such as the ADDI workbench, or container registry. Download figure Open in new tab Figure 3. Diagram to illustrate the data flow and data linkage process in the BHD. Download figure Open in new tab Figure 4. Flowchart summarizing the filtering of the MRI scans using DICOM metadata and structured report There can be different project groups working simultaneously through the BHD. Each group may only have access to a subset of the data as necessary for their research, whether that be a subset of tables, a subset of rows, or a subset of patients. To gain an additional level of confidentiality each group will only see a CHI replaced by a pseudonymised identifier which is specific to their group. eDRIS will maintain a mapping from the Encrypted Universal Patient Identifier (EUPI) to these group-specific pseudonymised identifiers. PHS eDRIS will prepare suitable subsets of the data for a particular research group and copy it to their working space. The eDRIS research coordinator will review any results of the researcher team to be released outside of the TRE, for example for publication. DISCUSSION To our best knowledge, BHD is the first large-scale, curated brain imaging clinical dataset relevant to dementia research that is available to researchers via moderated public access. The dataset offers several advantages in addition to its large sizer: clinical relevance, long-term follow-up, co-location with a GPU cluster in a safe haven, greater population representativeness compared to many research cohorts, and accessibility for clinical researchers. The resource continues to grow in data size and computing power. Working with health systems data presents challenges. One of them is the time elapsed for governance approvals. In our case, if governance had been applied for after the funding had been awarded, it would have represented 58% of a 1-year postdoctoral award. This would not only impact negatively on career development of the post holder but also delay the project goals. To address this barrier, BHD has co-developed a streamlined application process with PHS, which included simplified forms for pre-competitive research and faster approval timelines. Data provision was initially constrained by the limitations of the virtual machine environment, limited staff availability, and increased procedural complexity resulting in delayed access to imaging data and complicating project planning. The framework established by SCANDAN, now adopted by PHS, and the experience gained throughout the conduction of the project, are expected to accelerate data delivery for future projects. It is important however, to note that all research outputs generated within the NSH must undergo review by PHS staff prior to release. Most imaging research is based on uniformly acquired research data. In contrast, clinical scans acquired in a routine free-at-the-point-of-service healthcare are sometimes incomplete, may be obscured by movement or other artefacts, show signs of non-relevant pathologies, may have been obtained with non-standardised protocols, and on different machines. However, such real-world data with inherent variability is essential for the development of software tools suitable for robust application in clinical practice where such heterogeneity is the norm. Using electronic health records for dementia diagnosis has limitations. Currently, primary care data are unavailable through PHS. Hence, we relied on recorded diagnosis after an inpatient stay or death. Hospital and death records under-ascertain (false negatives) dementia in the short term and have modest reliability for dementia subtypes [ 15 ]. However, they have also previously shown high positive predictive value for all dementia diagnosis [ 14 ]. Referral reasons for scans acquisitions are not currently available, although further NLP work with reports could achieve this. The use of head scans does raise privacy concerns due to facial recognition risks. We have mitigated these by working only in a safe haven environment, visually examining only brain slices, prohibiting facial reconstruction, limiting access to approved researchers, and having PHS strictly checking all outputs from the secure environment. Future work aims to further mitigate privacy risks by limiting the need for direct human access to data, for example by implementing software via containers. However, this work needs training of the research community, better labelling of metadata (so the data is truly FAIR), and further development of technology within the NSH environment. There are many opportunities for further linkage to other datasets (for example community retinal imaging[ 16 ]). Such work will require further engagement with public contributors, and improvement in the security process to extract AI models trained within the NSH. Researchers can access the BHD data by applying to eDRIS. Proposals must demonstrate a clear public benefit, and researcher-generated outputs must be added back to the dataset so every project strengthens the next. We strongly encourage cross-group collaboration. The resources available through the BHD are growing in terms of data availability, storage capacity, and computing power that are provided to researchers. We hope that this, and similar global initiatives, will ultimately contribute to improve the brain health of people worldwide. Data Availability All data are available by application to eDRIS, https://publichealthscotland.scot/resources-and-tools/health-intelligence-and-data-management/electronic-data-research-and-innovation-service-edris/overview/what-is-edris/ https://publichealthscotland.scot/resources-and-tools/health-intelligence-and-data-management/electronic-data-research-and-innovation-service-edris/overview/what-is-edris/ Funding This work was supported by NEURii, a collaborative partnership involving the University of Edinburgh, Gates Ventures, Eisai, LifeArc and Health Data Research UK (HDR UK). We acknowledge the eDRIS team (Public Health Scotland) for their support in obtaining approvals, the provisioning and linking of data and facilitating access to the National Safe Haven. The Brain Health Data Pilot is supported by Alzheimer’s Disease Data Initiative (ADDI) and HDR UK with funding to the University of Edinburgh. Conflicts of Interest MVH and JMW are supported by Row Fogo Charitable Trust (Grant no. BRO-D.FID3668413). JMW was supported by the UK Dementia Research Institute (award no. UKDRI –4002 and 4205, DRIEdi17/18, and MRC MC_PC_17113) which receives its funding from DRI Ltd, funded by the UK Medical Research Council, Alzheimer’s Society and Alzheimer’s Research UK. ST acknowledges support of the UKRI AI programme, and the Engineering and Physical Sciences Research Council (EPSRC), for CHAI - Causality in Healthcare AI Hub [grant number EP/Y028856/1]. WW and HW are supported by HDRUK. Footnotes Correct some references in the final paragraph References 1. ↵ Westwood M , Ramaekers B , Grimm S , Armstrong N , Wijnen B , Ahmadu C , et al. Software with artificial intelligence-derived algorithms for analysing CT brain scans in people with a suspected acute stroke: a systematic review and cost-effectiveness analysis . Health Technol Assess (Rockv) . NIHR Journals Library ; 2024 ; doi: 10.3310/RDPA1487 . OpenUrl CrossRef 2. ↵ Ferber D , El Nahhas OSM , Wölflein G , Wiest IC , Clusmann J , Leßmann ME , et al. Development and validation of an autonomous artificial intelligence agent for clinical decision-making in oncology . Nat Cancer . Nature Research ; 2025 ; doi: 10.1038/S43018-025-00991-6;SUBJMETA . OpenUrl CrossRef 3. ↵ D’Adderio L , Bates DW . Transforming diagnosis through artificial intelligence . NPJ Digit Med . Nature Research ; 2025 ; doi: 10.1038/S41746-025-01460-1 . OpenUrl CrossRef 4. ↵ Classification: Official Diagnostic Imaging Dataset Annual Statistical Release 2023 /24. 5. ↵ Larobina M. Thirty Years of the DICOM Standard . Tomography . Multidisciplinary Digital Publishing Institute (MDPI ); 2023 ; doi: 10.3390/TOMOGRAPHY9050145 OpenUrl CrossRef 6. ↵ Baxter R , Nind T , Sutherland J , McAllister G , Hardy D , Hume A , et al. The Scottish Medical Imaging Archive: 57.3 Million Radiology Studies Linked to Their Medical Records . Radiol Artif Intell . Radiological Society of North America Inc .; 2024 ; doi: 10.1148/RYAI.220266/ASSET/IMAGES/LARGE/RYAI.220266.FIG2.JPEG . OpenUrl CrossRef 7. ↵ : The Five Safes Framework - GOV.UK . https://www.gov.uk/data-ethics-guidance/the-five-safes-framework Accessed 2025 May 28 . 8. ↵ : Charter for Safe Havens in Scotland: Handling Unconsented Data from National Health Service Patient Records to Support Research and Statistics. - gov.scot . https://www.gov.scot/publications/charter-safe-havens-scotland-handling-unconsented-data-national-health-service-patient-records-support-research-statistics/ Accessed 2025 May 28 . 9. ↵ : ISO/IEC 27001:2022 - Information security management systems . https://www.iso.org/standard/27001 Accessed 2025 May 28 . 10. ↵ Alex B , Grover C , Tobin R , Sudlow C , Mair G , Whiteley W. Text mining brain imaging reports . J Biomed Semantics . England ; 2019 ; doi: 10.1186/s13326-019-0211-7 . OpenUrl CrossRef PubMed 11. ↵ : Software – Language Technology Group . https://www.ltg.ed.ac.uk/software/ Accessed 2025 Aug 14 . 12. ↵ Wheater E , Mair G , Sudlow C , Alex B , Grover C , Whiteley W. A validated natural language processing algorithm for brain imaging phenotypes from radiology reports in UK electronic health records . BMC Med Inform Decis Mak . 2019 ; doi: 10.1186/s12911-019-0908-7 . OpenUrl CrossRef PubMed 13. ↵ Casey A , Davidson E , Grover C , Tobin R , Grivas A , Zhang H , et al. Understanding the performance and reliability of NLP tools: a comparison of four NLP tools predicting stroke phenotypes in radiology reports . Front Digit Health . Switzerland ; 2023 ; doi: 10.3389/fdgth.2023.1184919 . OpenUrl CrossRef 14. ↵ Whitehill J , Wu T , Bergsma J , Movellan J , Ruvolo P. Whose Vote Should Count More: Optimal Integration of Labels from Labelers of Unknown Expertise . Adv Neural Inf Process Syst . 222009 ; 15. ↵ McGuinness LA , Warren-Gash C , Moorhouse LR , Thomas SL . The validity of dementia diagnoses in routinely collected electronic health records in the United Kingdom: A systematic review . Pharmacoepidemiol Drug Saf . John Wiley and Sons Ltd ; 2019 ; doi: 10.1002/PDS.4669 . OpenUrl CrossRef 16. ↵ Tochel C , Bernabeu MO , McTrusty A , Tatham AJ , Pead E , Buckmaster F , et al. SCONe: a community-acquired retinal image repository enabling ocular, cardiovascular and neurodegenerative disease prediction . BMJ Health Care Inform . BMJ Publishing Group ; 2025 ; doi: 10.1136/BMJHCI-2024-101236 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted October 24, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A large dataset of brain imaging linked to health systems data: the curation and access to a whole system national cohort from NHS Scotland Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A large dataset of brain imaging linked to health systems data: the curation and access to a whole system national cohort from NHS Scotland Michael P J Camilleri , Dorian Gouzou , Salim Al-Wasity , Muthu R K Mookiah , María Valdes Hernandez , Bea Alex , Sotirios A. Tsaftaris , Andrew Brooks , Ruairidh MacLeod , Honghan Wu , Brenda Bauer , Claire Grover , Parminder Reel , Susan Krueger , Richard Tobin , J. Douglas Steele , Grant Mair , Joanna Wardlaw , Alexander Doney , Emanuele Trucco , William Whiteley medRxiv 2025.10.21.25338469; doi: https://doi.org/10.1101/2025.10.21.25338469 Share This Article: Copy Citation Tools A large dataset of brain imaging linked to health systems data: the curation and access to a whole system national cohort from NHS Scotland Michael P J Camilleri , Dorian Gouzou , Salim Al-Wasity , Muthu R K Mookiah , María Valdes Hernandez , Bea Alex , Sotirios A. Tsaftaris , Andrew Brooks , Ruairidh MacLeod , Honghan Wu , Brenda Bauer , Claire Grover , Parminder Reel , Susan Krueger , Richard Tobin , J. Douglas Steele , Grant Mair , Joanna Wardlaw , Alexander Doney , Emanuele Trucco , William Whiteley medRxiv 2025.10.21.25338469; doi: https://doi.org/10.1101/2025.10.21.25338469 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Radiology and Imaging Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9219) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffc11b7fcd81b23',t:'MTc3OTQ1NTUxMg=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Outcome instruments

NRS-pain

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00