Knowledge representation of a multi-centre adolescent and young adult (AYA) cancer infrastructure; development of the STRONG AYA Knowledge Graph

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 44,326 characters · extracted from preprint-html · click to expand
Knowledge representation of a multi-centre adolescent and young adult (AYA) cancer infrastructure; development of the STRONG AYA Knowledge Graph | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Knowledge representation of a multi-centre adolescent and young adult (AYA) cancer infrastructure; development of the STRONG AYA Knowledge Graph View ORCID Profile J. (Joshi) Hogenboom , View ORCID Profile V. (Varsha) Gouthamchand , View ORCID Profile C. (Charlotte) Cairns , View ORCID Profile S.H.M. (Silvie) Janssen , View ORCID Profile K. (Kirsty) Way , View ORCID Profile A.L.A.J. (Andre) Dekker , View ORCID Profile W.T.A. (Winette) Van Der Graaf , View ORCID Profile A. 
(Anne-Sophie) Darlington , View ORCID Profile O. (Olga) Husson , View ORCID Profile L.Y.L. (Leonard) Wee , View ORCID Profile J. (Johan) Van Soest , View ORCID Profile A. (Aiara) Lobo Gomes doi: https://doi.org/10.1101/2025.06.03.25328788 J. (Joshi) Hogenboom 1 Department of Radiation Oncology (Maastro), GROW School for Oncology and Reproduction, Maastricht University Medical Centre+ , Maastricht, the Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for J. (Joshi) Hogenboom For correspondence: joshi.hogenboom{at}maastrichtuniversity.nl V. (Varsha) Gouthamchand 1 Department of Radiation Oncology (Maastro), GROW School for Oncology and Reproduction, Maastricht University Medical Centre+ , Maastricht, the Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for V. (Varsha) Gouthamchand C. (Charlotte) Cairns 2 School of Health Sciences, University of Southampton , Southampton, United Kingdom 3 Clinical Standards Unit , British Association of Dermatologists, London, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for C. (Charlotte) Cairns S.H.M. (Silvie) Janssen 4 Department of Medical Oncology, Netherlands Cancer Institute , Amsterdam, the Netherlands 5 Department of Medical Oncology, Erasmus MC Cancer Institute, Erasmus University Medical Centre , Rotterdam, the Netherlands 6 Department of Public Health, Erasmus MC Cancer Institute, Erasmus University Medical Centre , Rotterdam, The Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for S.H.M. (Silvie) Janssen K. (Kirsty) Way 2 School of Health Sciences, University of Southampton , Southampton, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for K. 
(Kirsty) Way A.L.A.J. (Andre) Dekker 1 Department of Radiation Oncology (Maastro), GROW School for Oncology and Reproduction, Maastricht University Medical Centre+ , Maastricht, the Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for A.L.A.J. (Andre) Dekker W.T.A. (Winette) Van Der Graaf 4 Department of Medical Oncology, Netherlands Cancer Institute , Amsterdam, the Netherlands 5 Department of Medical Oncology, Erasmus MC Cancer Institute, Erasmus University Medical Centre , Rotterdam, the Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for W.T.A. (Winette) Van Der Graaf A. (Anne-Sophie) Darlington 2 School of Health Sciences, University of Southampton , Southampton, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for A. (Anne-Sophie) Darlington O. (Olga) Husson 4 Department of Medical Oncology, Netherlands Cancer Institute , Amsterdam, the Netherlands 7 Department of Public Health and Surgical Oncology, Erasmus Medical University Centre , Rotterdam, the Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for O. (Olga) Husson L.Y.L. (Leonard) Wee 1 Department of Radiation Oncology (Maastro), GROW School for Oncology and Reproduction, Maastricht University Medical Centre+ , Maastricht, the Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for L.Y.L. (Leonard) Wee J. 
(Johan) Van Soest 1 Department of Radiation Oncology (Maastro), GROW School for Oncology and Reproduction, Maastricht University Medical Centre+ , Maastricht, the Netherlands 8 Brightlands Institute for Smart Society (BISS), Faculty of Science and Engineering, Maastricht University , Maastricht, The Netherlands Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for J. (Johan) Van Soest A. (Aiara) Lobo Gomes 1 Department of Radiation Oncology (Maastro), GROW School for Oncology and Reproduction, Maastricht University Medical Centre+ , Maastricht, the Netherlands 9 Institute of Molecular Medicine, RWTH Aachen University , Aachen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for A. (Aiara) Lobo Gomes Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Purpose Rare diseases are difficult to fully capture, and regularly call for large, geographically dispersed initiatives. Such initiatives are often met with data harmonisation challenges. These challenges render data incompatible and impede successful realisation. The STRONG AYA project is such an initiative, specifically focusing on adolescents and young adults (AYAs) with cancer. STRONG AYA is setting up a federated data infrastructure containing data of varying format. Here, we elaborate on how we used healthcare-agnostic Semantic Web technologies to overcome such challenges. Methodology We structured the STRONG AYA case-mix and core outcome measures concepts and their properties as knowledge graphs. Having identified the corresponding standard terminologies, we developed a semantic map based on the knowledge graphs and the here introduced annotation helper plugin for Flyover. Flyover is a tool that converts structured data into Resource Description Framework (RDF) triples and enables semantic interoperability. 
As a demonstration, we mapped data that is to be included in the STRONG AYA infrastructure. Results The knowledge graphs provided a comprehensive overview of the large number of STRONG AYA concepts. The semantic terminology mapping and annotation helper allowed us to query data with incomprehensible terminologies, without changing them. Both the knowledge graphs and semantic map were made available on a Hugo webpage for increased transparency and understanding. Discussion The use of Semantic Web technologies such as RDF and knowledge graphs is a viable solution to overcome challenges regarding data interoperability and reusability for a federated AYA cancer data infrastructure without being bound to rigid standardised schemas. The linkage of semantically meaningful concepts to otherwise incomprehensible data elements demonstrates how by using these domain-agnostic technologies we made non-standardised healthcare data interoperable. Introduction Adolescent and young adult (AYA) cancer is rare and concerns an often-overlooked population, defined as people aged 15 to 39 years at primary cancer diagnosis. AYAs are in part characterised by significant differences in tumour type, psychosocial characteristics and care needs. As a result, AYA cancer calls for age-specific care that is traditionally unmet by paediatric and adult cancer care 1 . To improve healthcare services, research, and outcomes for AYAs, the STRONG AYA Initiative 2 is setting up a federated data infrastructure 3 , 4 that incorporates both retrospective and prospective AYA data – contributed by several medical centres across Europe. This regional variability allows us to highlight significant challenges in data harmonisation across healthcare systems. The definitions, format, and terminology of the data in these datasets vary across institutions; this is often for practical and operational purposes. 
For example, when recording an individual’s highest obtained educational level, it is most pragmatic to consult participants in a format that is sensible in their regional setting – as this is the information people will know. Such differences can render data incompatible with that collected in other regional settings, if not adequately harmonised. Therefore, one of the first crucial steps in such large and international collaborations is to adopt or establish certain standards within a data model. This involves two distinct components: standardising the data schema, and standardising the definitions and terminology. Establishing a standardised schema provides syntactic interoperability; but this still relies on semantic interoperability – standardising definitions and terminology. These two components constitute interoperability in the broader sense, enabling data integration from diverse sources 5 . For educational level, a straightforward solution would involve transcribing such regionally sensible levels to UNESCO’s International Standard Classification of Education (ISCED) 6 , but this can be costly and burdensome. Moreover, even for simple concepts, intrinsic differences in data semantics regularly occur. For example, biological sex can be recorded as ‘male, female’ , ‘male, female, intersex’ , and ‘0, 1, 2’ . More complex cases are abundant, and even a seemingly straightforward example like the time of diagnosis can quickly reveal operational differences. The definition of the time of diagnosis can range from the first illness-related hospital visit to the date of biopsy evaluation or formal diagnosis made by a clinician. A well-established approach to solve such challenges is the implementation of the F.A.I.R. – findable, accessible, interoperable, and reusable – data guiding principles 7 . This previous work defines how data can be made F.A.I.R. 
for both humans and machines, whilst allowing flexibility in terms of multiple co-existing semantic ontologies schema and semantics. In a federated data infrastructure, applying the F.A.I.R. principles has been effective in overcoming interoperability hurdles related to data semantics 8 , 9 , while bridging pitfalls concerning syntactic interoperability e.g. using an ‘on-read’ approach 9 . At the same time, the F.A.I.R. principles highlight reusability, thus increasing the understanding of data – and hence transforming it into information – which is a pivotal aspect of the process. To that end, the use of knowledge graphs and Semantic Web standards are established methods as they are semantically rich and reflect the structure of the data at hand 8 – 12 . These concepts represent complex information in a graphical format and, therewith, aim to enhance understanding of the data by illustrating relationships between data concepts. These methodologies are however domain agnostic and lack the specificity relevant for STRONG AYA. The aim of this work is to develop and implement a tailored data model for STRONG AYA that addresses the unique challenges of AYA cancer data harmonisation. To achieve this, we developed a data model for STRONG AYA that is aligned with its data collection procedures, simultaneously delivering on the implementation of the ’Interoperable’ and ’Reusable’ aspects of the F.A.I.R. data principles. In this work, we elaborate on this effort to illuminate the various processes involved with resolving data incompatibilities in a large healthcare consortium’s federated infrastructure. To reduce the complexity of concepts relevant for AYAs with cancer, we developed the STRONG AYA data model as a knowledge graph. 
Using this knowledge graph, we then transcribe its contents to the STRONG AYA semantic map which can be used for the necessary mapping that ensures semantic interoperability, whilst overcoming syntactic interoperability through an established ‘on-read’ approach. With the interplay of the knowledge graph and semantic map we aim to accelerate and facilitate STRONG AYA’s goals of improving healthcare services, research, and outcomes for AYAs – whilst also setting an example of F.A.I.R. data principles implementation in large scale consortia. Methodology Data elements In STRONG AYA, extensive research identified key information to enhance healthcare services, research, and outcomes for AYAs with cancer. This involved a literature review, qualitative interviews, and a three-round Delphi procedure with AYA cancer stakeholders (AYAs with cancer, caregivers, health professionals, researchers, and policymakers) to determine relevant outcome domains 13 , 14 . A Core Outcome Set (COS) was developed from these domains. Subsequently, a set of core measurement instruments and/or items were compiled which best measure the COS. The COS and measurement set were refined to minimise participant burden while retaining essential elements. A list of relevant case-mix variables, identified through a literature review 15 , supplemented the COS to form the final data elements for the STRONG AYA infrastructure, represented in table 1 , excluding all time elements except the initial timestamp. Details on these procedures can be found in their original publications 13 – 15 . View this table: View inline View popup Download powerpoint Table 1: Overview of AYA cancer relevant concepts compiled through a literature review, qualitative interviews, and a three-round Delphi procedure with AYA cancer stakeholders. Data conversion and annotation For multi-centre semantic mapping and knowledge graph generation we used the Resource Description Framework (RDF) data format 16 . 
RDF is a data representation standard for the basic building block of a graph: the representation of nodes and arcs. This triple format is made up of a subject – predicate – object statement representing node – arc – node, respectively. For instance, ‘AYA – has column – biological sex’ . As none of the centres collected data as RDF-triples, we used the Flyover tool 9 ; ( https://github.com/MaastrichtU-CDS/Flyover ) to harmonise the data format across centres. Flyover converts an arbitrary form of data such as comma separated values (CSV), into triples and then stores them in a graph database. For instance, a row – AYA 1 – with a value of ‘female’ for ‘biological_sex’ is converted into ‘AYA 1 – has column – biological_sex’ and ‘biological_sex – has value – female’ as is illustrated in figure 1 . Download figure Open in new tab Figure 1: Conversion of tabular biological sex data to RDF-triple format using Flyover. Using this triple format, Flyover allows us to impose semantics on top of this existing data through a metadata layer – or annotation graph. This means that the ‘AYA – has column – biological_sex’ triple can refer to variables whose names do not necessarily carry semantic significance, which is one of the challenges that was emphasised in the introduction. Flyover maintains the original data structure by providing semantic interoperability on-read through this annotation graph. To make the best use of Flyover ’s descriptive abilities, we developed a JSON semantic map plugin for Flyover ’s graphical user interface. Using this semantic map, we can directly map variable names as they appear in their original data source, to standardised terminologies. This semantic map could then be used to easily develop the queries that annotate the original data sources’ names in the metadata layer through Flyover ’s Annotation Helper . 
This helper parses the variable and value names along with the mapped standard terms into queries that insert triple statements into the annotation graph. Knowledge graphs and semantic map The RDF model enabled the addition of semantically rich graph structures to the data without modification. Using the list of AYA cancer-relevant concepts, we created a visual knowledge graph to illustrate these data elements in a structured way. We displayed this visual knowledge graph in three sub-structures, all part of a single graph model: data, data source, and instrument graph. The graph structure was then reviewed by those who defined the list of AYA cancer relevant concepts elements. After this review, we included the graph structures in the semantic map so that they could be incorporated in the metadata layer. For the STRONG AYA data elements, we identified relevant standardised terminologies, predominantly leveraging the National Cancer Institute Thesaurus (NCIt) 17 for object terms – or classes, and the Semanticscience Integrated Ontology (SIO) 18 for properties. Other vocabularies that were used include the Gender, Sex, and Sexual Orientation Ontology (GSSO) 19 and SNOMED CT (SCT) 20 . We used custom terms for STRONG AYA’s bespoke questions and concepts lacking standardised definitions. We used Flyover ’s Annotation Helper semantic map format as a basis and as an overview of all data elements and their standard terms. This global semantic map – without local terms – was published on GitHub to allow for transparent semantic map updates and traceability. The workflow that was used to achieve semantic interoperability is illustrated in figure 2 . Download figure Open in new tab Figure 2: Workflow used to achieve semantic interoperability for STRONG AYA data elements and data-contributing centres. 
To provide a comprehensive overview of our knowledge graph and semantic map we integrated them into a STRONG AYA Knowledge Representation, semi-static Hugo ( https://gohugo.io ) website. This resource enables continuous review by consortium members. The semantic map section displays variable names, vocabulary reference codes, and associated preferred names and definitions from BioPortal ( https://bioportal.bioontology.org ). Content is updated quarterly and upon GitHub repository update by extracting semantic map information and fetching vocabulary details via the BioPortal REST API. Unfound reference codes are automatically reported as GitHub issues to alert repository owners. Semantic mapping demonstration As part of STRONG AYA’s retrospective data retrieval and to demonstrate our semantic mapping method, we mapped the SURVAYA study (ClinicalTrials.gov identifier: NCT05379387 ), a population-based cross-sectional cohort study of long-term AYA cancer survivors from the Netherlands Cancer Registry. Details can be found in the original study publication 21 . SURVAYA will be integrated into STRONG AYA’s infrastructure, but for testing, we used a synthetic dataset 22 containing overlapping elements. The SURVAYA study was conducted in accordance with the Declaration of Helsinki and approved by the Netherlands Cancer Institute Institutional Review Board (IRBIRBd18122) on February 6, 2019. The synthetic dataset was used with permission from the study’s principal investigator and sponsor. Results Knowledge graphs The knowledge graph in figure 3 offers a visual and structured representation of AYA cancer-relevant concepts. Specifically, figure 3 presents the data graph, categorising data concepts into sociodemographic, clinical, and outcome characteristics – using SIO’s ‘has annotation’ – to reduce complexity and structure concepts. Data concepts are generally attributes of a given category, utilising SIO’s ‘has attribute’. 
Units for continuous concepts, such as age at diagnosis, overall progression time, and survival, were added in years and days to enhance interpretability, associated via SIO’s ‘has unit’. Intervariable relationships are limited to neoplasm-associated concepts, including cancer progression time, tumour staging, and localisation – which utilise SIO’s ‘is related to’ and ‘has property’. The AYA’s research identifier is directly associated with the AYA using SIO’s ‘has unique identifier’ and is not part of any sub-category. Data sources are colour-coded: orange for patient-reported outcome measures (PROMs), pink for healthcare professional reported outcome measures (HCPROMs), and purple for electronic health records (EHRs). Download figure Open in new tab Figure 3: Data graph showcasing the AYA cancer relevant concepts in a more comprehensive way. Information on collection time is not present here and is visible in the underlying instrument graphs. The measurement instrument type or data source per concept is identifiable by the coloured outline. Please note that while certain concepts are specifically categorised as ’Outcome,’ what constitutes an outcome is study-specific and may also include variables categorised here under ’Medical characteristics’ and ’Sociodemographic characteristics’. The data source graphs are displayed in supplementary figure 1 and describe the data elements’ sources using SIO’s ‘has property’, detailing the distinct properties of PROM, HCPROM, and EHR data elements. Supplementary figure 2 introduces an additional layer to the graph structure by clustering data related to a single concept, exemplified through the European Organisation for Research and Treatment of Cancer Quality of Life Questionnaire for AYAs and its specific questions 23 . Knowledge representation webpage Figure 4 shows an excerpt of the semantic map and the knowledge representation webpage of the concept biological sex . 
The semantic map describes the references to standard vocabularies (in bold orange font) and defines the graph structure (in bold blue font). Concretely, this semantic mapping annotates our triple statement of ‘AYA 1 – has column – biological_sex’ with ‘AYA 1 – sio:SIO_000235 – ncit:C18772’ and ‘ncit:C18772 – sio:SIO_000008 – ncit:C28421’ . Download figure Open in new tab Figure 4: An excerpt of the AYA semantic map and of the AYA cancer knowledge representation. Within the semantic map excerpt, the references to standardised terminologies are in bold orange font. The single triple statement is reconstructed to two statements through the schema reconstruction section, which reflects the structure of the previously described knowledge graphs. Our value triple statement of ‘biological_sex – has value – female’ is annotated with ‘ncit:C28421 – has value – ncit:C16576’. Whilst in the webpage the structure is displayed in a human-readable format by showing what the references in the semantic map – here being the triple statements classes – correspond to. The complete semantic map ( https://github.com/STRONGAYA/AYA-cancer-semantic-map ) and the knowledge representation pages ( https://strongaya.github.io/AYA-cancer-semantic-map ) are available on GitHub. Semantic mapping demonstration The interoperability of the annotated RDF-triple SURVAYA data is illustrated in figure 5 , showcasing data accessibility through both local and standard terminology – here exemplified using biological sex. Utilising Flyover and the semantic map, we tested our data harmonisation workflow with synthetic SURVAYA data. Initially, RDF-converted biological sex data of a SURVAYA dataset would solely be available as ‘AYA 1 – has column – alg_v1b’ , but through annotation this data becomes accessible through the standard semantic mapping of ‘AYA 1 – sio:SIO_000235 – ncit:C326200’ and ‘ncit:C326200 – sio:SIO_000008 – ncit:C28421’ . SPARQL 24 was used for data queries. 
The semantic mapping for SURVAYA is available on GitHub ( https://github.com/STRONGAYA/AYA-cancer-semantic-map/tree/dev/retrospective/SURVAYA ). Supplementary figure 3 presents the semantic mapping excerpt for biological sex data, emphasising the necessity of semantic meaning, as local terminology lacks clarity without it. The full output of our graph database containing synthetic SURVAYA data is available in supplementary figure 4. Download figure Open in new tab Figure 5: Demonstration of the interoperability of the SURVAYA dataset, which through a SPARQL-query is both accessible via its original – and incomprehensible – terminology ’alg_v1b’ and standardised terminology ’ncit:C28421’. Both the local terminology and the standard terminology are highlighted in bold font. Shown values are from synthetic SURVAYA data and do not contain information of real participants. Discussion The knowledge representation described in this work illustrates how, by making use of RDF-data structures, we can make a large mix of complex AYA cancer concepts adhere to the ’Interoperable’ and ’Reusable’ aspects of the F.A.I.R. data principles 7 . Our work demonstrates that adhering to these principles allows us to navigate through the difficulties of heterogeneous semantics and inherent differences in data schemas whilst simultaneously expanding the application of a domain-agnostic standard such as RDF. The STRONG AYA Knowledge Graph has provided a comprehensive overview of the large number of items collected for this consortium. This knowledge graph was then transcribed into a STRONG AYA data semantic map. In turn, the introduction of this semantic map enables us to map and annotate AYA-specific concepts with standardised terminologies, thereby circumventing the use of project-specific definitions. We showcase how we use the AYA knowledge representation on one of the AYA datasets to be included in STRONG AYA, laying a foundation for other datasets. 
In the process of making data more F.A.I.R., there are numerous approaches to address challenges related to data interoperability and enhance understandability. In this work, we used Semantic Web standards, such as RDF 16 and SPARQL 24 , due to the flexible and non-rigid schema design of the RDF format. This flexibility allows data-contributing partners to submit data in its original format, with most interoperability work managed by a single coordinating party. Partners can use the Flyover tool and Annotation Helper plugin with the STRONG AYA semantic map for most scenarios. The introduced Annotation Helper significantly simplifies this process, providing an easy-to-understand aid verifiable by those without RDF or Semantic Web knowledge. The annotation helper also enhances RDF’s flexibility, as the annotation layer can be reapplied as requirements evolve. For instance, the introduced instrument graph supports simultaneous use of cross-sectional and longitudinal data, but as prospective data collection procedures evolve, the instrument graph must adapt. Additionally, the RDF format allows for future inclusion of logical reasoning to identify erroneous combinations in the STRONG AYA Knowledge Graph, benefiting data quality assurance and efficient data use. In comparison to other data models, such as the Observational Medical Outcomes Partnership Common Data Model (OMOP-CDM) 25 , our approach benefits from Flyover ’s ‘on-read’ approach 9 , allowing the STRONG AYA knowledge graph and schema to adapt without modifying the data. Additionally, inherent to Semantic Web standards and the main mantra “Anyone can say Anything about Anything”, we are not limiting ourselves to given terminology standards and data schemas. By using uniform resource identifiers (URIs) of terms – which should resolve to their descriptions – these standards are more open to extension by anyone. 
However, this flexibility means forgoing the tools available for OMOP-CDM, which, despite its rigidity, offers a well-established ecosystem for data integration and analysis. In contrast, RDF provides the flexibility crucial for STRONG AYA’s diverse and evolving data structures. However, rather than comparing RDF to OMOP, they should be considered complementary. RDF is a data representation standard predominantly used by big-tech and industry 26 , 27 , while OMOP is a healthcare-specific data model. Future work should focus on integrating OMOP and RDF to create a sustainable hybrid solution. When developing any form of knowledge representation, it is vital that the included concepts are relevant to the overarching subject. In our work, we have based our AYA cancer knowledge representation on an extensive Delphi procedure and literature review 13 , 15 . This procedure significantly reduced difficulties in defining the relevant concepts – reiterating the importance of such preparatory work. Adherence to this protocol ensured that the true meaning of a term such as ‘date of diagnosis’ was already quite refined, and prevented extensive discussions during data model development. Because of this robustness and rigorous adherence to these predefined concepts, we were required to use numerous custom, and thus non-standard terminologies. This is because a substantial number of AYA cancer concepts are not present in any ontology, owing to the bespoke nature of various PROMs. Although this does not hinder interoperability, the lack of standard terms diminishes understandability, as our custom ontology codes have no established semantic significance. Moreover, while this work advances knowledge representation for AYA cancers, it also highlights areas lacking common definitions. It underscores both the need for extended AYA research and the establishment – and adoption – of standard PROM terminologies 28 . 
All in all, we have shown how developing an AYA knowledge representation can navigate the challenging topography of interoperability and understandability in a large AYA cancer consortium. By leveraging existing tools and terminologies, we can increase the adherence of our AYA data pool to the F.A.I.R. data principles whilst concurrently reducing the burden for our data-contributing partners by using Flyover’s integrated Annotation Helper. Whilst issues of incompatibility and understandability seem addressed, further implementation of F.A.I.R. data principles will enhance the societal benefits of the data collected in STRONG AYA. Future work should focus on transcribing our knowledge representation to a machine-findable source that, through appropriate agreements, licenses, and protocols, is accessible to individuals currently outside of the consortium – simultaneously addressing the ‘Findable’ and ‘Accessible’ attributes in F.A.I.R. To maximise the reusability for a wider audience it is, however, necessary that future work also focuses on the introduction of standardised terminology for AYA cancer – and other fields with notable reliance on PROMs. Data Availability The knowledge graph, semantic map, and knowledge representation website are available on GitHub: https://github.com/STRONGAYA/AYA-cancer-semantic-map. Acknowledgements All individuals who have contributed to this study are included in the author list. Footnotes Prior presentation: This study and its results are original and have not yet been presented elsewhere. Code availability: The knowledge graph, semantic map, and knowledge representation website are available on GitHub: https://github.com/STRONGAYA/AYA-cancer-semantic-map . Support: J. Hogenboom, V. Gouthamchand, C. Cairns, S.H.M. Janssen, K. Way, A.L.A.J. Dekker, A. Darlington, O. Husson, and L.Y.L. 
Wee were supported in this work by the European Union’s Horizon 2020 research and innovation programme through The STRONG AYA Initiative (Grant agreement ID: 101057482). J. Van Soest was supported in this work by the European Union’s Horizon 2020 research and innovation programme through the BETTER project (Grant agreement ID: 101136262). A. Lobo Gomes acknowledges support by NWO DACIL (KICH1.GZ03.21.023) and ERDF DigiONE-I3 (SEP-210898024). L.Y.L. Wee is currently in receipt of research funds from ZonMW, Velux Stiftung, and NWA-ORC. W.T.A. van der Graaf received a research grant (to the institution) from Eli Lilly, outside this work. References 1. ↵ Collaborators GBDAYAC: The global burden of adolescent and young adult cancer in 2019: a systematic analysis for the Global Burden of Disease Study 2019 . Lancet Oncol 23:27-52, 2022 2. ↵ The STRONG-AYA Initiative, 2025 3. ↵ Martin F , Beusekom B , Leurs R , et al: vantage6, 2025 4. ↵ Smits D , Van Beusekom B , Martin F , et al : An Improved Infrastructure for Privacy-Preserving Analysis of Patient Data . Stud Health Technol Inform 295 : 144 – 147 , 2022 OpenUrl PubMed 5. ↵ Almeida JR , Silva LB , Bos I , et al : A methodology for cohort harmonisation in multicentre clinical research . Informatics in Medicine Unlocked 27 : 100760 , 2021 6. ↵ UNESCO IoS: International Standard Classification of Education (ISCED), 2017 7. ↵ Wilkinson MD , Dumontier M , Aalbersberg IJ , et al : The FAIR Guiding Principles for scientific data management and stewardship . Sci Data 3 : 160018 , 2016 8. ↵ Gaudet-Blavignac C , Raisaro JL , Toure V , et al: A National, Semantic-Driven, Three-Pillar Strategy to Enable Health Data Secondary Usage Interoperability for Research Within the Swiss Personalized Health Network: Methodological Study . JMIR Med Inform 9 : e27591 , 2021 OpenUrl 9. 
↵ Gouthamchand V , Choudhury A , Hoebers FJP , et al : Making head and neck cancer clinical data Findable-Accessible-Interoperable-Reusable to support multi-institutional collaboration and federated learning. BJR|Artificial Intelligence 1 , 2024 10. Sloep M , Kalendralis P , Choudhury A , et al : A knowledge graph representation of baseline characteristics for the Dutch proton therapy research registry . Clin Transl Radiat Oncol 31 : 93 – 96 , 2021 OpenUrl PubMed 11. Sachdeva S , Bhalla S: Using Knowledge Graph Structures for Semantic Interoperability in Electronic Health Records Data Exchanges , Information , 2022 12. ↵ Scoarta S , Kucukosmanoglu A , Bindt F , et al: Review: A Roadmap to Use Nonstructured Data to Discover Multitarget Cancer Therapies . JCO Clin Cancer Inform 7 : e2200096 , 2023 OpenUrl 13. ↵ Husson O , Janssen SHM , Reeve BB , et al : Protocol for the development of a Core Outcome Set (COS) for Adolescents and Young Adults (AYAs) with cancer . BMC Cancer 24 : 126 , 2024 14. ↵ Darlington AS , Way K , Collaço N , et al : Development of a Core Outcome Set (COS) for Adolescents and Young Adults (AYAs) with cancer . Article in submission , 2025 15. ↵ Janssen SHM , van der Graaf WTA , Hurley-Wallace A , et al: Core Patient-Centered Outcomes for Adolescents and Young Adults with Cancer: A Comprehensive Review of the Literature from the STRONG-AYA Project, Cancers, 2025 16. ↵ Cyganiak R , Wood D , Lanthaler M : RDF 1.1 Concepts and Abstract Syntax, W3C, 2014 17. ↵ de Coronado S , Remennik L , Elkin PL : National Cancer Institute Thesaurus (NCIt) , in Elkin PL (ed): Terminology, Ontology and their Implementations. Health Informatics. Cham , Springer International Publishing , 2023 , pp 395 – 441 18. ↵ Dumontier M , Baker CJ , Baran J , et al : The Semanticscience Integrated Ontology (SIO) for biomedical research and knowledge discovery . J Biomed Semantics 5 : 14 , 2014 19. 
↵ Kronk CA , Dexheimer JW : Development of the Gender, Sex, and Sexual Orientation ontology: Evaluation and workflow. J Am Med Inform Assoc 27 : 1110 – 1115 , 2020 OpenUrl PubMed 20. ↵ Millar J : The Need for a Global Language - SNOMED CT Introduction. Nursing Informatics 2016: Ehealth for All: Every Level Collaboration - from Project to Realization 225 : 683 – 685 , 2016 OpenUrl 21. ↵ Vlooswijk C , Poll-Franse LVV , Janssen SHM , et al: Recruiting Adolescent and Young Adult Cancer Survivors for Patient-Reported Outcome Research: Experiences and Sample Characteristics of the SURVAYA Study . Curr Oncol 29 : 5407 – 5425 , 2022 OpenUrl PubMed 22. ↵ Hogenboom J , Lobo Gomes A , Dekker A , et al : Actionability of Synthetic Data in a Heterogeneous and Rare Health Care Demographic: Adolescents and Young Adults With Cancer . JCO Clin Cancer Inform 8 : e2400056 , 2024 OpenUrl PubMed 23. ↵ Sodergren S , Husson O , Rohde G , et al: Development of an EORTC quality of life questionnaire specific to adolescents and young adults with cancer: Measuring what matters most to this unique patient group, ISOQOL Cologne, Germany, 2024 , pp S31 – S32 24. ↵ Harris S , Seaborne A : SPARQL 1.1 Query Language, W3C, 2013 25. ↵ Hripcsak G , Duke JD , Shah NH , et al: Observational Health Data Sciences and Informatics (OHDSI): opportunities for observational researchers, MEDINFO 2015: eHealth-enabled Health , IOS Press , 2015 , pp 574 – 578 26. ↵ He Q : Building The LinkedIn Knowledge Graph , LinkedIn , 2025 27. ↵ Singhal A: Introducing the Knowledge Graph: things, not strings, Google, 2012 28. ↵ Cella D , Hays RD : A patient reported outcome ontology: Conceptual issues and challenges addressed by the Patient-Reported Outcomes Measurement Information System® (PROMIS®) . Patient Related Outcome Measures : 189 – 197 , 2022 View the discussion thread. Back to top Previous Next Posted June 03, 2025. 
Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Knowledge representation of a multi-centre adolescent and young adult (AYA) cancer infrastructure; development of the STRONG AYA Knowledge Graph Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Knowledge representation of a multi-centre adolescent and young adult (AYA) cancer infrastructure; development of the STRONG AYA Knowledge Graph J. (Joshi) Hogenboom , V. (Varsha) Gouthamchand , C. (Charlotte) Cairns , S.H.M. (Silvie) Janssen , K. (Kirsty) Way , A.L.A.J. (Andre) Dekker , W.T.A. (Winette) Van Der Graaf , A. (Anne-Sophie) Darlington , O. (Olga) Husson , L.Y.L. (Leonard) Wee , J. (Johan) Van Soest , A. (Aiara) Lobo Gomes medRxiv 2025.06.03.25328788; doi: https://doi.org/10.1101/2025.06.03.25328788 Share This Article: Copy Citation Tools Knowledge representation of a multi-centre adolescent and young adult (AYA) cancer infrastructure; development of the STRONG AYA Knowledge Graph J. (Joshi) Hogenboom , V. (Varsha) Gouthamchand , C. (Charlotte) Cairns , S.H.M. (Silvie) Janssen , K. (Kirsty) Way , A.L.A.J. (Andre) Dekker , W.T.A. (Winette) Van Der Graaf , A. (Anne-Sophie) Darlington , O. (Olga) Husson , L.Y.L. (Leonard) Wee , J. (Johan) Van Soest , A. 
(Aiara) Lobo Gomes medRxiv 2025.06.03.25328788; doi: https://doi.org/10.1101/2025.06.03.25328788 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4411) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15205) Forensic Medicine (30) Gastroenterology (1119) Genetic and Genomic Medicine (6574) Geriatric Medicine (666) Health Economics (994) Health Informatics (4511) Health Policy (1365) Health Systems and Quality Improvement (1608) Hematology (537) HIV/AIDS (1263) Infectious Diseases (except HIV/AIDS) (15903) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6573) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1139) Occupational and Environmental Health (954) Oncology (3319) Ophthalmology (968) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (662) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5422) Public and Global Health (9205) Radiology and Imaging (2191) Rehabilitation Medicine and Physical Therapy (1367) Respiratory Medicine (1191) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9feacaae394158f4',t:'MTc3OTI3NDM0NA=='};var 
a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00