Full text
59,206 characters
· extracted from
preprint-html
· click to expand
Semantic Encoding in Medical LLMs for Vocabulary Standardisation | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Semantic Encoding in Medical LLMs for Vocabulary Standardisation Samuel Mainwood , View ORCID Profile Aashish Bhandari , View ORCID Profile Sonika Tyagi doi: https://doi.org/10.1101/2025.06.16.25329716 Samuel Mainwood 1 School of Computing Technologies, RMIT University , Melbourne, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aashish Bhandari 1 School of Computing Technologies, RMIT University , Melbourne, Australia Find 
this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Aashish Bhandari Sonika Tyagi 1 School of Computing Technologies, RMIT University , Melbourne, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sonika Tyagi For correspondence: sonika.tyagi{at}rmit.edu.au Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract High-quality, standardised medical data availability remains a bottleneck for digital health and AI model development. A major hurdle is translating noisy free text into controlled clinical vocabularies, aiming for harmonisation and interoperability, especially when source datasets are inconsistent or incomplete. We benchmark domain-specific encoder models against general LLMs for semantic-embedding retrieval using minimal vocabulary building blocks and test several prompt techniques. We also try prompt augmentation with LLM-generated differential definitions. We tested these prompts on open-source Llama and medically fine-tuned Llama models to steer their alignment toward accurate concept assignment across multiple prompt formats. Domain-tuned models consistently outperform general models of the same size in retrieval and generative tasks. However, performance is sensitive to prompt design and model size, and the benefits of adding LLM-generated context are inconsistent. While newer, larger foundation models are closing the gap, today’s lightweight open-source generative LLMs lack the stability and embedded clinical knowledge needed for deterministic vocabulary standardisation. 1 Introduction For a system to be successfully integrated and approved for use within healthcare, it must demonstrate rigid efficacy, reliability and interpretability at a higher burden than other domains [ 16 ].
One of the significant barriers to developing these clinical models is the quantity, quality, and availability of data [ 39 , 48 ]. When developing general domain large language models (LLM), large companies can send public releases to be broken and iron out any unwanted quirks. Healthcare does not have that same luxury, as these faults can result in adverse patient outcomes, so the burden of consistency and performance at roll-out is much higher. Electronic Health Records (EHR) is a term that refers to clinical databases with a wealth of patient information. This information includes both structured and unstructured data. Structured data typically refers to organised tabular data in the form of numerical and categorical information. Unstructured data, as the name implies, can take any form and often consists of documents of text, images and many other forms that have not historically been easy to use in computational problems. Improving data quality and availability is one facet. Another challenge is harmonising diverse sources into usable and consistent datasets for machine learning applications [ 38 ]. “Medical concept mapping” refers to converting medical information into a controlled vocabulary. This study defines medical concepts as entities that represent medical terminology related to disease, diagnoses, symptoms, treatments, interventions, and all other information that could characterise the semantic meaning of a medical concept. This process aims to facilitate interoperability through consistency in data whilst maintaining the semantic essence of a term, agnostic of context. One of the ways that medical concept data is stored is in the form of an ontology [ 19 ]. An ontology is a knowledge graph that holds semantic information in the form of classes, including unique identifying features [ 19 , 41 ]. This study will explore ontology mapping from the OAEI BioML track as a proxy for standardising clinical terminology. 
The automation of a reliable medical ontology conversion system is an essential step in addressing medical data standardisation. 2 Background The BioML track of the Ontology Alignment Evaluation Initiative (OAEI) is notoriously challenging; methods effective in other ontology matching tasks often fail to generalise well here due to the distinct complexities inherent in biomedical ontologies [ 2 , 32 ]. For instance, while large-scale biomedical ontologies such as SNOMED CT have richly structured semantic relationships, simpler classification systems like ICD-10 [ 45 ], ICD-11 [ 46 ], and the UK Biobank dataset lack such extensive semantic structures [ 37 ]. These simpler ontologies typically present as tabular datasets without hierarchical relationships or entity types, limiting the applicability of more sophisticated ontology matching approaches [ 37 ]. Several tools and frameworks have addressed medical vocabulary standardisation. USAGI, an open-source tool, relies primarily on information retrieval and has successfully mapped UK Biobank medical data to SNOMED CT [ 34 , 37 , 38 ]. Although accurate within their validated domain, rule-based methods such as Ontoserver, lack dataset flexibility and generalisability [ 28 , 40 ]. MedCAT, another open-source toolkit, combines traditional natural language processing (NLP) techniques with manual expert involvement but relies on iterative reinforcement learning to maintain performance in evolving clinical contexts [ 20 ]. EHR-QC further extends traditional NLP through ensemble methods, integrating tools like MedCAT, fuzzy matching and reverse indexing, achieving good results validated against simpler datasets such as the UK Biobank [ 38 ]. Despite traditional NLP methods initially setting benchmarks, recent advancements using transformer-based LLMs have significantly surpassed earlier techniques in medical NLP tasks [ 49 ]. 
Medical and domain-specific models tend to outperform their generalist counterparts on medical tasks when adjusting for model size and training data volume [ 1 , 22 , 24 , 25 , 49 ]. Nonetheless, LLM-based approaches in ontology matching have faced difficulties despite competitive performance in generalist tracks [ 3 , 31 , 32 ]. Conversely, the retrieve-identify-prompt pipeline proposed by Taboada et al. demonstrated that high-quality embeddings alone could yield competitive results without complex decoder LLM frameworks, despite not exploring domain-specific embeddings or nuanced prompting methods [ 42 ]. Additionally, some models have achieved high benchmarks through ensemble models that include embedding, graph matching and ontology repair, but these methods are incompatible with simpler datasets [ 12 , 13 , 17 , 28 , 36 ]. The most comprehensive prompt evaluation study did not evaluate on the BioML track [ 15 ]. Common evaluation challenges include sparse ontology intersections and inconsistencies in curated “gold-standard” datasets, compounded by non-named entities lacking clinical relevance and complicating NLP-based methods [ 14 ]. Given these complexities, and considering real-world biomedical datasets tend to be simpler and less granular, focusing purely on label-based ontology matching emerges as a practically justified approach for this study. This more straightforward label-based strategy aligns with realistic biomedical use cases. 3 Methodology Tokenisation is the first step in processing clinical documents that converts a document into character or word representations called “tokens”. Tokens can then be processed into numerical representations through encoding and storing them as embedding vectors [ 43 ].
Tokenisation combined with the transformer architecture [ 10 ] enables a reinforcement-style training approach by feeding the model millions of tokens and having the model slowly settle on embeddings or vectors that would align with token prediction. This resulted in a compact, efficient model with the most robust semantic embeddings. The resulting direction from a tokenised and vectorised document holds calculable meaning. The resulting vectors can then be used to calculate their similarity, resulting in scalable semantic similarity search. 3.1 Concept Matching Overview The task involves identifying one-to-one mappings between items in a source vocabulary and concepts in a controlled vocabulary. For terminology clarity, this paper defines a confirmed match as a case where a model identifies two concepts from distinct vocabularies representing the same underlying medical concept. A non-match is a case in which the model identifies no appropriate concept within the controlled vocabulary corresponding to the source concept. 3.2 Semantic Embedding Matching Algorithm To match embeddings, this paper compared Euclidean distance and cosine similarity. High-quality embeddings should achieve similar retrieval results across both matching algorithms if the embeddings have good alignment. Cosine similarity was preferred for its ability to match with a relative metric instead of the absolute metric provided by the Euclidean distance. To define non-matches, cosine similarity enables thresholding strategies that are not possible using Euclidean distance. This is explored across the five OAEI matching tasks. The strength of simple embedding matchings is their robustness, reliability, and simplicity, which are much more explainable and controlled than generative LLM models. The challenge with standalone use is that a candidate match is always retrieved.
The ability to assign non-matches whilst leveraging the high performance of retrieval methods is paramount in defining the best system. The benchmark algorithm was to consider a simple threshold to assign non-matches based on the decay of F1 across lowered thresholds. Other studies have used a 0.9 threshold as a filter for Decoder LLM queries [ 2 , 3 , 42 ]. The quadratic complexity $O(n^2)$ of ontology matching, compounded by the computational demands of LLMs and the infeasibility of storing or re-calculating embeddings in memory, made early list-based approaches (e.g., requiring ∼4 hours per match) untenable. By adopting scalable vector databases like ChromaDB (disk-based) [ 8 ] and FAISS (RAM-based) [ 27 ], integrated via LangChain [ 21 ], the search complexity was reduced to $O(2n)$ through constant-time nearest-neighbour lookups. Customising feature embeddings for vocabulary standardisation does not increase model complexity but can impact embedding quality. While adding context to labels may seem beneficial, prior work [ 3 ] and early experiments found it degrades performance by overwhelming the embedding space. We used a variety of domain-specific models, including ClinicalT5’s encoder [ 24 ] and a series of domain-specific BERT models with varying training sizes [ 1 , 29 , 48 ]. We also compared self-aligned domain-specific models that have undergone targeted vocabulary pre-training [ 22 , 25 ]. 3.3 Identifying Non-Matches through LLM Retrieval-augmented generation (RAG) refers to a secondary component in an LLM model that is highly adept at returning relevant documents and information for the core model before using a decoder. The major component of this study explored the use of the domain-specific semantic embeddings as a RAG tool to generate higher-quality, precise prompts for generative LLMs.
Instead of retrieving the highest match, the highest K-matches are retrieved, and then the results can be given to an LLM to avoid using an arbitrary threshold. RAG allows the user to maintain a minimal problem size and significantly constrains the output space to prevent LLM hallucination. 3.4 Prompting Techniques Prompt content is among the most significant challenges in generative LLM processes. Prompting aims to align the model’s output with the intended task. Prompting is an infinite space, and whilst there is some theoretical basis for what a “good” prompt requires [ 23 , 30 ], knowledge of it is still growing. High-level prompt templates exist, but no prompt template in published studies has demonstrated conclusively better performance. The prompts used in this study, when using LLM models, are available in the appendix. Whilst encoding models have a deterministic output that absorbs natural language as text to compute a vector, generative models have increasingly complex hyperparameters that can make it challenging to control their output. 3.4.1 Chattiness and Common LLM Weaknesses Chattiness refers to generating verbose or overly detailed outputs in response to prompts. This behaviour poses a reliability issue in automated pipelines, especially those requiring structured or binary outputs. Decoder-only models, particularly with fewer parameters, often fail to constrain their output despite explicit, minimal instructions. Excess tokens or unexpected formatting reduce usability even when the generated content is semantically correct. Therefore, this pipeline penalises any deviation from the expected output format, such as added explanations or uncertainty phrases, by marking the response as a non-match. Prompt refinement was performed iteratively on a small sample dataset until the model reliably adhered to the task description and expected output format. 
Common issues observed included the model selecting multiple target codes for a single input concept, particularly when the model interpreted an input as equally similar to multiple target codes. Additionally, quantised LLaMA models tended to erroneously return the input code as the chosen match, necessitating further prompt adjustments. Consequently, due to these instabilities, the final retrieval step treated any output that deviated from the specified format or presented invalid candidate codes as a non-match, thereby preserving output consistency and integrity. In early experiments with Llama3.1, a model was asked to give a “confidence” score, but this did not appear to be a deterministic number and instead made the model “chattier” by vocalising the need to explain the score number despite explicit instructions otherwise. 3.4.2 LLM Hyperparameters Quantisation is a model compression technique that reduces the memory footprint and computational demands of LLMs by representing weights and activations using low-bit integers instead of full floating precision numbers, whilst preserving performance [ 18 ]. In this study, all quantised models were obtained via Ollama [ 35 ], which distributes pre-quantised models optimised for local inference. The relevant models will indicate quantisation intensity. This study used a temperature of 0.0, or another hyperparameter alternative that uses a deterministic decoder output. With something as rigid as standardisation, a higher temperature may allow freedom for niche codes to be mapped in “creative” ways. However, this compromises the already questionable reliability and consistency of generative LLMs, and by setting the temperature to 0, the model should return the same consistent and reliable outputs in response to the same inputs. 3.4.3 Multi-Choice vs Binary Questioning Most decoder models are released with evaluation on question-answering tasks (QA) [5– 7, 11, 26, 47, 49 ]. 
As such, it is logical that the next step is reframing label generation as a question. It is infeasible to list all codes from large ontologies within a prompt context window, but using a RAG tool enables a much smaller, targeted set of top-k candidates to be retrieved and presented. Multiple-choice questioning shows the input term with its top-k candidate codes and asks the model to return the best match or, if there are no appropriate matches, to return None. Early smoke tests with Q4_K_M Llama 3.1 showed that prompts had to be iteratively tightened to curb “chatty” responses and also prevent the model from echoing the input code. This technique allows more candidates to be provided to an LLM at lower computational cost through a single prompt, but increases prompt complexity, making it harder to adjust effectively. Binary questioning for ontology matching gives the model an input prompt with a potential match, asking true/false or another binary outcome. The simplicity of this prompt makes it easier to align the decoder output. There are some emerging challenges with this approach. Firstly, asking only for the best match excludes the demonstrated improvement for the retrieval models at k = 5 and k = 10 matches without adding further algorithmic elements. 3.4.4 Context Enhancement using LLM-Generated Differential Definitions Separation Generative LLMs improve their task alignment with additional context. Due to the lack of ontology homogeneity, the task is to build prompt context without additional information. One persistent limitation of current artificial intelligence models is their inability to generate or consider counterfactual scenarios or abstain from responding. This manifests as a failure to recognise alternative, potentially superior matches within the same ontology beyond the immediate best apparent match.
For instance, consider the Foundational Model of Anatomy (FMA) [ 4 ] code “44620” (Medial Meniscus), which may appear as the closest match to the SNOMED [ 44 ] input “74135004” (Meniscus structure of joint). While contextually plausible, other closer semantic matches within the ontology remain unconsidered by models that lack context. Without additional context, models cannot infer or recognise superior matching possibilities that a human might naturally consider. The proposed method first prompts the LLM to consider an input concept alongside its top-K semantically similar candidates, which are derived using the embedding matching retrieval method from within the same ontology. The decoder model is instructed to generate a definition that defines but also explicitly differentiates the input from these candidates. The underlying hypothesis is that instructing the LLM in this targeted manner enables it to produce distinct keywords or phrases that effectively adjust the semantic embedding vectors within the LLM model. Consequently, this definitional differentiation provides critical contextual cues that enhance the model’s ability to distinguish between similar medical concepts and infer the existence of similar concepts within the same ontology, thus improving match selection precision. By leveraging the generative capabilities of the LLM, the method removes subjective human biases that might influence manual definitions, aiming instead to generate precise, objective, and consistent definitions that may improve context in the subsequent LLM. A robust generative LLM approach can also avoid data bias in threshold matching. The resulting differential definitions produced by the LLM are then systematically stored so that they can be accessed later in the pipeline when building prompts for negative filtering. The rationale is that using the same model to generate differential definitions produces language that the same generative model should understand downstream.
A sample prompt for differential definition building, multi-choice questioning and true-false questioning can be found in the appendix. 3.4.5 Datasets The Ontology Datasets used in this study are from the OAEI 2024 Machine-Learning Friendly Datasets [ 14 ]. These included pruned and cleaned ontologies with eight ontologies (one ontology has three sub-ontologies), divided into five matching tasks [ 33 ]. 3.4.6 LLM Sampling Datasets This study used a smaller sample dataset to balance computational availability while maintaining a reliable and holistic model performance evaluation. 200 random positive and 200 random negative items were taken from each input dataset to create five datasets of 400 items across the 5 OAEI tracks. 4 Results 4.1 Self-Aligned Domain-Specific Models Outperform in Retrieval Table 1 demonstrates that self-aligned embedding retrievers topped every RAG benchmark at k = 5 and k = 10, and both self-aligned models also performed impressively at k = 1. SAPPMBert is not OAEI-competitive, yet a blanket 0.90 similarity cut-off still gave mean F1 = 0.674 and precision = 0.762. Figure 1 highlights this decay across a lowered matching threshold. Such fixed thresholds remain fragile: score distributions differ sharply across ontology pairs. For instance, as demonstrated in Table 3 , most SNOMED-Pharm to NCIT-Pharm alignments sit above 0.90 similarity, whereas many MONDO-OMIM to ORDO links do not. Even so, a strong, domain-tuned retriever can shoulder much of the alignment burden, and domain-specific pretraining usually outperforms general-domain retrieval methods. Table 2 demonstrates a very marginal increase in performance using Euclidean distance, but as highlighted previously, the marginal performance difference does not justify this inflexible method. View this table: View inline View popup Download powerpoint Table 1: Average Cosine-Distance Retrieval Performance Across Datasets. Performance is reported using Hits@K metrics ( K = 1, 5, 10).
It indicates the proportion of times the correct item appears in the top- K retrieved results. View this table: View inline View popup Download powerpoint Table 2: Comparison of Cosine Similarity vs Euclidean Distance in Label Matching. Performance is measured using Hits@K metrics ( K = 1, 5, 10) and Mean Reciprocal Rank (MRR). MRR gives a balanced measure of Hits@K, favouring returning the desired document at a higher rank [ 9 ]. These metrics show the effectiveness of each distance metric in retrieving the correct label. The higher value indicates better matching performance. View this table: View inline View popup Download powerpoint Table 3: Baseline Raw with 0.90 Threshold Cut-off. Precision, Recall, and F1 scores are reported for each dataset using a fixed confidence threshold of 0.90. These metrics evaluate label matching quality. The bottom row shows the macro-average across all datasets. Download figure Open in new tab Figure 1: Demonstrating the change in average F1 score across 5 OAEI tasks as the threshold value increases from 0.0 to 1.0 for the two best-performing models 4.2 Decoder LLMs Show Mixed Results in Prompting Experiments. Domain-Specific Trained Models Still Outperform Overall Generative LLMs show mixed results, as can be seen in Table 4 . ME-Llama 3 8B underperformed every other model, including a quantised Llama 3.1 8B, hinting at substantive gains between Llama 3.0 and 3.1 at some expense of recall. Both Q4_K_M Llama 3.1 and ME-Llama 3 8B rarely chose “non-match” in multi-choice prompts, showing that this prompt format is too taxing for older or smaller quantised models. Only Q4_K_M Llama 3.3 70B in the true/false no-definition scheme beat the 0.90 heuristic. A Llama 3.1 8B Instruct run failed under the prompting parameters used in this study so its results were not included. This evidence shows that effective prompting techniques vary significantly even within the same model family or company. 
The domain-finetuned m42-v2 outclassed all Llama 3.1 variants, reinforcing the advantage of medical-specific tuning. LLM-generated differential definitions lifted precision significantly for m42-v2 and Q4_K_M Llama 3.3 70B in both multichoice and true/false prompting. ME-Llama selected a code almost always in the multi-choice task, regardless of the differential definitions. Likewise, Q4_K_M Llama 3.1 ignored added context and seldom returned a non-match. Naturally, in this setting, recall will remain artificially high. Therefore, these models are unreliable for this task. Llama 3.1 8B even appeared to drop in performance with differential definitions, suggesting the task is either too specialised or complex without domain fine-tuning, and comparing this directly to the m42-v2 Llama 3.1 8B suggests that domain fine-tuning is still relevant for performance even for decoder LLMs. View this table: View inline View popup Download powerpoint Table 4: Evaluation of Decoder Models Across Prompting Conditions. Precision, Recall, and F1 scores are reported for each model across multiple-choice and true/false settings, with and without differential definitions. Results show how prompting strategies affect model performance, with benchmark averages included for reference. 5 Conclusion Domain-specific encoder models are still very relevant amongst available open-source AI tools. These models can be lightweight and maintain competitive performance in semantic encoding tasks. This study further demonstrates that domain-specific models outperform their general domain counterparts when adjusting for model size and training data volume. Even when paired with a high-quality RAG, decoder LLMs do not yet meet reliability or performance requirements for fully automated vocabulary standardisation. Progress will likely depend on a few factors. 
More capable base LLMs may overcome domain-specific gaps, as evidenced by Q4_K_M Llama 3.3 70B outperforming the medical 8B models at the trade-off of 9x the parameters. Massive (Llama 3.3 70B) general-domain releases show encouraging cross-domain gains and may eventually brute force performance. Without extensive contextual scaffolding, multi-choice prompting is still too demanding for today’s readily deployable 8B parameter LLMs and prompting needs to be adjusted to the individual model. Greater availability and volume of high-quality medical and clinical data may improve the current AI models and assist future development. This study adds to the ongoing challenge that medical domain models face when adopting state-of-the-art AI systems. Data Availability All data and models produced are available online, either as open source or free to use with registration or a usage agreement. A Sample Prompts A.1 Definition Builder Prompt You are a biomedical ontology expert with deep knowledge of ontologies such as UMLS, SNOMED, and MONDO. Context: You are working with the ontology: {ontology_name} . Term to define: “ {input_label} ” Similar labels to distinguish from: {Similar Labels} Instructions Internally reason about how “ {input_label} ” differs from the similar labels. Provide a concise, specific definition for “ {input_label} ” that clearly distinguishes it. Output only the definition text (one paragraph or less), with no additional commentary. Do not include disclaimers or any other text. Begin your definition now : A.2 MultiChoice with Differential Definitions You are an Ontology Matching Expert in UMLS, MONDO, and related ontologies. Below is an input code and several candidate codes with generated definitions from another LLM model. If none matches, respond with None. Otherwise, output only the best label—no explanations. Input Details . Code: {input_code} Label: {input_label} Definition: {input_definition} Candidate Matches .
{items} Instructions . Compare the input label/definition to each candidate. If one aligns best, output its label exactly; if not, output None. Output only that label or None. Example: A Suitable Match . Input: Code: 123, Label: “Corneal Disease”, Definition: “A condition affecting the cornea.” Candidates: “eye disease”, “corneal dystrophy” Output: corneal dystrophy Example: No Suitable Match . Input: Code: 321, Label: “Rare Genetic Disorder X”, Definition: “A condition with unique markers.” Candidates: “common cold”, “seasonal allergies” Output: None Answer (Only one label or None ) . A.3 TrueFalse You are an Ontology Matching Expert with extensive experience in UMLS, MONDO, and similar ontologies. A previous model generated definitions for each code, highlighting how they may differ within the same ontology. These definitions may be paraphrased or partial. The two codes below come from different ontologies, but they may represent the same underlying concept. Evaluate their labels and definitions, and respond with yes or no only if they truly match. Input Concept . Code: {input_code} Label: {input_label} Definition: {input_definition} Candidate Concept . Code: {matched_code} Label: {matched_label} Definition: {matched_definition} Do these two concepts refer to the same real-world entity, condition, or idea? Answer yes or no only: B Online Resources Machine Learning Friendly OAEI Ontology Dataset [ 14 ]: https://zenodo.org/records/13119437 LangChain [ 21 ]: https://www.langchain.com FAISS (Facebook AI Similarity Search) [ 27 ]: https://faiss.ai CHROMA [ 8 ]: https://www.trychroma.com Ollama [ 35 ]: https://ollama.com Acknowledgments The authors thank Yashpal Ramakrishnaiah for initial discussions on concept mapping and EHR-QC implementations. This work was supported by computational resources provided by RMIT University. AB acknowledges the STEM PhD scholarship from RMIT University. References [1]. ↵ Emily Alsentzer , John R. 
Murphy , Willie Boag , Wei-Hung Weng , Di Jin , Tristan Naumann , and Matthew B. A. McDermott . 2019 . Publicly Available Clinical BERT Embeddings . http://arxiv.org/abs/1904.03323 arxiv: 1904.03323 [cs]. [2]. ↵ Hamed Babaei Giglou , Jennifer D’Souza , and Sören Auer . 2023 . LLMs4OL: Large Language Models for Ontology Learning . In The Semantic Web – ISWC 2023 , Terry R. Payne , Valentina Presutti , Guilin Qi , María Poveda-Villalón , Giorgos Stoilos , Laura Hollink , Zoi Kaoudi , Gong Cheng , and Juanzi Li (Eds.). Vol. 14265 . Springer Nature Switzerland , Cham , 408 – 427 . doi: 10.1007/978-3-031-47240-4_22 Series Title: Lecture Notes in Computer Science. OpenUrl CrossRef [3]. ↵ Hamed Babaei Giglou , Jennifer D’Souza , Felix Engel , and Sören Auer . 2025 . LLMs4OM: Matching Ontologies with Large Language Models . In The Semantic Web: ESWC 2024 Satellite Events , Albert Meroño Peñuela , Oscar Corcho , Paul Groth , Elena Simperl , Valentina Tamma , Andrea Giovanni Nuzzolese , Maria Poveda-Villalón , Marta Sabou , Valentina Presutti , Irene Celino , Artem Revenko , Joe Raad , Bruno Sartini , and Pasquale Lisena (Eds.). Vol. 15344 . Springer Nature Switzerland , Cham , 25 – 35 . doi: 10.1007/978-3-031-78952-6_3 Series Title: Lecture Notes in Computer Science. OpenUrl CrossRef [4]. ↵ O. Bodenreider . 2004 . The Unified Medical Language System (UMLS): integrating biomedical terminology . Nucleic Acids Research 32 , 90001 (Jan. 2004), D267–D270. doi: 10.1093/nar/gkh061 OpenUrl CrossRef [5]. 
Elliot Bolton , Abhinav Venigalla , Michihiro Yasunaga , David Hall , Betty Xiong , Tony Lee , Roxana Daneshjou , Jonathan Frankle , Percy Liang , Michael Carbin , and Christopher D Manning . 2024 . BioMedLM: A 2.7B Parameter Language Model Trained On Biomedical Text. (March 2024) . doi: 10.48550/arXiv.2403.18421 arxiv: 2403.18421v1 [6]. Aakanksha Chowdhery , Sharan Narang , Jacob Devlin , Maarten Bosma , Gaurav Mishra , Adam Roberts , Paul Barham , Hyung Won Chung , Charles Sutton , Sebastian Gehrmann , Parker Schuh , Kensen Shi , Sasha Tsvyashchenko , Joshua Maynez , Abhishek Rao , Parker Barnes , Yi Tay , Noam Shazeer , Vinodkumar Prabhakaran , Emily Reif , Nan Du , Ben Hutchinson , Reiner Pope , James Bradbury , Jacob Austin , Michael Isard , Guy Gur-Ari , Pengcheng Yin , Toju Duke , Anselm Levskaya , Sanjay Ghemawat , Sunipa Dev , Henryk Michalewski , Xavier Garcia , Vedant Misra , Kevin Robinson , Liam Fedus , Denny Zhou , Daphne Ippolito , David Luan , Hyeontaek Lim , Barret Zoph , Alexander Spiridonov , Ryan Sepassi , David Dohan , Shivani Agrawal , Mark Omernick , Andrew M Dai , Thanumalayan Sankaranarayana Pillai , Marie Pellat , Aitor Lewkowycz , Erica Moreira , Rewon Child , Oleksandr Polozov , Katherine Lee , Zongwei Zhou , Xuezhi Wang , Brennan Saeta , Mark Diaz , Orhan Firat , Michele Catasta , Jason Wei , Kathy Meier-Hellstern , Douglas Eck , Jeff Dean , Slav Petrov , Noah Fiedel , and Ruslan Salakhutdinov . [n. d.]. PaLM: Scaling Language Modeling with Pathways. ([n. d.]) . [7]. Clément Christophe , Praveen K. Kanithi , Tathagata Raha , Shadab Khan , and Marco AF Pimentel . 2024 . Med42-v2: A Suite of Clinical LLMs . doi: 10.48550/arXiv.2408.06142 arxiv: 2408.06142 [cs]. OpenUrl CrossRef [8]. ↵ Chroma . 2025 . Chroma is the open-source AI application database. Batteries included . https://www.trychroma.com [9]. ↵ Nick Craswell . 2009 . Mean Reciprocal Rank . Encyclopedia of Database Systems (2009) . doi: 10.1007/978-0-387-39940-9_488 OpenUrl CrossRef [10]. 
↵ Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2019 . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding . http://arxiv.org/abs/1810.04805 arxiv: 1810.04805 [cs]. [11]. Aaron Grattafiori , Abhimanyu Dubey , Abhinav Jauhri , Abhinav Pandey , Abhishek Kadian , Ahmad Al-Dahle , Aiesha Letman , Akhil Mathur , Alan Schelten , Alex Vaughan , Amy Yang , Angela Fan , Anirudh Goyal , Anthony Hartshorn , Aobo Yang , Archi Mitra , Archie Sravankumar , Artem Korenev , Arthur Hinsvark , Arun Rao , Aston Zhang , Aurelien Rodriguez , Austen Gregerson , Ava Spataru , Baptiste Roziere , Bethany Biron , Binh Tang , Bobbie Chern , Charlotte Caucheteux , Chaya Nayak , Chloe Bi , Chris Marra , Chris McConnell , Christian Keller , Christophe Touret , Chunyang Wu , Corinne Wong , Cristian Canton Ferrer , Cyrus Nikolaidis , Damien Allonsius , Daniel Song , Danielle Pintz , Danny Livshits , Danny Wyatt , David Esiobu , Dhruv Choudhary , Dhruv Mahajan , Diego Garcia-Olano , Diego Perino , Dieuwke Hupkes , Egor Lakomkin , Ehab AlBadawy , Elina Lobanova , Emily Dinan , Eric Michael Smith , Filip Radenovic , Francisco Guzmán , Frank Zhang , Gabriel Synnaeve , Gabrielle Lee , Georgia Lewis Anderson , Govind Thattai , Graeme Nail , Gregoire Mialon , Guan Pang , Guillem Cucurell , Hailey Nguyen , Hannah Korevaar , Hu Xu , Hugo Touvron , Iliyan Zarov , Imanol Arrieta Ibarra , Isabel Kloumann , Ishan Misra , Ivan Evtimov , Jack Zhang , Jade Copet , Jaewon Lee , Jan Geffert , Jana Vranes , Jason Park , Jay Mahadeokar , Jeet Shah , Jelmer van der Linde , Jennifer Billock , Jenny Hong , Jenya Lee , Jeremy Fu , Jianfeng Chi , Jianyu Huang , Jiawen Liu , Jie Wang , Jiecao Yu , Joanna Bitton , Joe Spisak , Jongsoo Park , Joseph Rocca , Joshua Johnstun , Joshua Saxe , Junteng Jia , Kalyan Vasuden Alwala , Karthik Prasad , Kartikeya Upasani , Kate Plawiak , Ke Li , Kenneth Heafield , Kevin Stone , Khalid El-Arini , Krithika Iyer , Kshitiz Malik , 
Kuenley Chiu , Kunal Bhalla , Kushal Lakhotia , Lauren Rantala-Yeary , Laurens van der Maaten , Lawrence Chen , Liang Tan , Liz Jenkins , Louis Martin , Lovish Madaan , Lubo Malo , Lukas Blecher , Lukas Landzaat , Luke de Oliveira , Madeline Muzzi , Mahesh Pasupuleti , Mannat Singh , Manohar Paluri , Marcin Kardas , Maria Tsimpoukelli , Mathew Oldham , Mathieu Rita , Maya Pavlova , Melanie Kambadur , Mike Lewis , Min Si , Mitesh Kumar Singh , Mona Hassan , Naman Goyal , Narjes Torabi , Nikolay Bashlykov , Nikolay Bogoychev , Niladri Chatterji , Ning Zhang , Olivier Duchenne , Onur Çelebi , Patrick Alrassy , Pengchuan Zhang , Pengwei Li , Petar Vasic , Peter Weng , Prajjwal Bhargava , Pratik Dubal , Praveen Krishnan , Punit Singh Koura , Puxin Xu , Qing He , Qingxiao Dong , Ragavan Srinivasan , Raj Ganapathy , Ramon Calderer , Ricardo Silveira Cabral , Robert Stojnic , Roberta Raileanu , Rohan Maheswari , Rohit Girdhar , Rohit Patel , Romain Sauvestre , Ronnie Polidoro , Roshan Sumbaly , Ross Taylor , Ruan Silva , Rui Hou , Rui Wang , Saghar Hosseini , Sahana Chennabasappa , Sanjay Singh , Sean Bell , Seohyun Sonia Kim , Sergey Edunov , Shaoliang Nie , Sharan Narang , Sharath Raparthy , Sheng Shen , Shengye Wan , Shruti Bhosale , Shun Zhang , Simon Vandenhende , Soumya Batra , Spencer Whitman , Sten Sootla , Stephane Collot , Suchin Gururangan , Sydney Borodinsky , Tamar Herman , Tara Fowler , Tarek Sheasha , Thomas Georgiou , Thomas Scialom , Tobias Speckbacher , Todor Mihaylov , Tong Xiao , Ujjwal Karn , Vedanuj Goswami , Vibhor Gupta , Vignesh Ramanathan , Viktor Kerkez , Vincent Gonguet , Virginie Do , Vish Vogeti , Vítor Albiero , Vladan Petrovic , Weiwei Chu , Wenhan Xiong , Wenyin Fu , Whitney Meers , Xavier Martinet , Xiaodong Wang , Xiaofang Wang , Xiaoqing Ellen Tan , Xide Xia , Xinfeng Xie , Xuchao Jia , Xuewei Wang , Yaelle Goldschlag , Yashesh Gaur , Yasmine Babaei , Yi Wen , Yiwen Song , Yuchen Zhang , Yue Li , Yuning Mao , Zacharie Delpierre Coudert , 
Zheng Yan , Zhengxing Chen , Zoe Papakipos , Aaditya Singh , Aayushi Srivastava , Abha Jain , Adam Kelsey , Adam Shajnfeld , Adithya Gangidi , Adolfo Victoria , Ahuva Goldstand , Ajay Menon , Ajay Sharma , Alex Boesenberg , Alexei Baevski , Allie Feinstein , Amanda Kallet , Amit Sangani , Amos Teo , Anam Yunus , Andrei Lupu , Andres Alvarado , Andrew Caples , Andrew Gu , Andrew Ho , Andrew Poulton , Andrew Ryan , Ankit Ramchandani , Annie Dong , Annie Franco , Anuj Goyal , Aparajita Saraf , Arkabandhu Chowdhury , Ashley Gabriel , Ashwin Bharambe , Assaf Eisenman , Azadeh Yazdan , Beau James , Ben Maurer, Benjamin Leonhardi , Bernie Huang , Beth Loyd , Beto De Paola , Bhargavi Paranjape , Bing Liu , Bo Wu , Boyu Ni , Braden Hancock , Bram Wasti , Brandon Spence , Brani Stojkovic , Brian Gamido , Britt Montalvo , Carl Parker , Carly Burton , Catalina Mejia , Ce Liu , Changhan Wang , Changkyu Kim , Chao Zhou , Chester Hu , Ching-Hsiang Chu , Chris Cai , Chris Tindal , Christoph Feichtenhofer , Cynthia Gao , Damon Civin , Dana Beaty , Daniel Kreymer , Daniel Li , David Adkins , David Xu , Davide Testuggine , Delia David , Devi Parikh , Diana Liskovich , Didem Foss , Dingkang Wang , Duc Le , Dustin Holland , Edward Dowling , Eissa Jamil , Elaine Montgomery , Eleonora Presani , Emily Hahn , Emily Wood , Eric-Tuan Le , Erik Brinkman , Esteban Arcaute , Evan Dunbar , Evan Smothers , Fei Sun , Felix Kreuk , Feng Tian , Filippos Kokkinos , Firat Ozgenel , Francesco Caggioni , Frank Kanayet , Frank Seide , Gabriela Medina Florez , Gabriella Schwarz , Gada Badeer , Georgia Swee , Gil Halpern , Grant Herman , Grigory Sizov , Guangyi , Zhang , Guna Lakshminarayanan , Hakan Inan , Hamid Shojanazeri , Han Zou , Hannah Wang , Hanwen Zha , Haroun Habeeb , Harrison Rudolph , Helen Suk , Henry Aspegren , Hunter Goldman , Hongyuan Zhan , Ibrahim Damlaj , Igor Molybog , Igor Tufanov , Ilias Leontiadis , Irina-Elena Veliche , Itai Gat , Jake Weissman , James Geboski , James Kohli , 
Janice Lam , Japhet Asher , Jean-Baptiste Gaya , Jeff Marcus , Jeff Tang , Jennifer Chan , Jenny Zhen , Jeremy Reizenstein , Jeremy Teboul , Jessica Zhong , Jian Jin , Jingyi Yang , Joe Cummings , Jon Carvill , Jon Shepard , Jonathan McPhie , Jonathan Torres , Josh Ginsburg , Junjie Wang , Kai Wu , Kam Hou U , Karan Saxena , Kartikay Khandelwal , Katayoun Zand , Kathy Matosich , Kaushik Veeraraghavan , Kelly Michelena , Keqian Li , Kiran Jagadeesh , Kun Huang , Kunal Chawla , Kyle Huang , Lailin Chen , Lakshya Garg , Lavender A , Leandro Silva , Lee Bell , Lei Zhang , Liangpeng Guo , Licheng Yu , Liron Moshkovich , Luca Wehrstedt , Madian Khabsa , Manav Avalani , Manish Bhatt , Martynas Mankus , Matan Hasson , Matthew Lennie , Matthias Reso , Maxim Groshev , Maxim Naumov , Maya Lathi , Meghan Keneally , Miao Liu , Michael L. Seltzer , Michal Valko , Michelle Restrepo , Mihir Patel , Mik Vyatskov , Mikayel Samvelyan , Mike Clark , Mike Macey , Mike Wang , Miquel Jubert Hermoso , Mo Metanat , Mohammad Rastegari , Munish Bansal , Nandhini Santhanam , Natascha Parks , Natasha White , Navyata Bawa , Nayan Singhal , Nick Egebo , Nicolas Usunier , Nikhil Mehta , Nikolay Pavlovich Laptev , Ning Dong , Norman Cheng , Oleg Chernoguz , Olivia Hart , Omkar Salpekar , Ozlem Kalinli , Parkin Kent , Parth Parekh , Paul Saab , Pavan Balaji , Pedro Rittner , Philip Bontrager , Pierre Roux , Piotr Dollar , Polina Zvyagina , Prashant Ratanchandani , Pritish Yuvraj , Qian Liang , Rachad Alao , Rachel Rodriguez , Rafi Ayub , Raghotham Murthy , Raghu Nayani , Rahul Mitra , Rangaprabhu Parthasarathy , Raymond Li , Rebekkah Hogan , Robin Battey , Rocky Wang , Russ Howes , Ruty Rinott , Sachin Mehta , Sachin Siby , Sai Jayesh Bondu , Samyak Datta , Sara Chugh , Sara Hunt , Sargun Dhillon , Sasha Sidorov , Satadru Pan , Saurabh Mahajan , Saurabh Verma , Seiji Yamamoto , Sharadh Ramaswamy , Shaun Lindsay , Shaun Lindsay , Sheng Feng , Shenghao Lin , Shengxin Cindy Zha , Shishir Patil , Shiva 
Shankar , Shuqiang Zhang , Shuqiang Zhang , Sinong Wang , Sneha Agarwal , Soji Sajuyigbe , Soumith Chintala , Stephanie Max , Stephen Chen , Steve Kehoe , Steve Satterfield , Sudarshan Govindaprasad , Sumit Gupta , Summer Deng , Sungmin Cho , Sunny Virk , Suraj Subramanian , Sy Choudhury , Sydney Goldman , Tal Remez , Tamar Glaser , Tamara Best , Thilo Koehler , Thomas Robinson , Tianhe Li , Tianjun Zhang , Tim Matthews , Timothy Chou , Tzook Shaked , Varun Vontimitta , Victoria Ajayi , Victoria Montanez , Vijai Mohan , Vinay Satish Kumar , Vishal Mangla , Vlad Ionescu , Vlad Poenaru , Vlad Tiberiu Mihailescu , Vladimir Ivanov , Wei Li , Wenchen Wang , Wenwen Jiang , Wes Bouaziz , Will Constable , Xiaocheng Tang , Xiaojian Wu , Xiaolan Wang , Xilun Wu , Xinbo Gao , Yaniv Kleinman , Yanjun Chen , Ye Hu , Ye Jia , Ye Qi , Yenda Li , Yilin Zhang , Ying Zhang , Yossi Adi , Youngjin Nam , Yu , Wang , Yu Zhao , Yuchen Hao , Yundi Qian , Yunlu Li , Yuzi He , Zach Rait , Zachary DeVito , Zef Rosnbrick , Zhaoduo Wen , Zhenyu Yang , Zhiwei Zhao , and Zhiyu Ma . 2024 . The Llama 3 Herd of Models . doi: 10.48550/arXiv.2407.21783 arxiv: 2407.21783 [cs]. OpenUrl CrossRef [12]. ↵ Yuan He , Jiaoyan Chen , Denvar Antonyrajah , and Ian Horrocks . 2022 . BERTMap: A BERT-based Ontology Alignment System . http://arxiv.org/abs/2112.02682 arxiv: 2112.02682 [cs]. [13]. ↵ Yuan He , Jiaoyan Chen , Hang Dong , Ian Horrocks , Carlo Allocca , Taehun Kim , and Brahmananda Sapkota . 2024 . DeepOnto: A Python Package for Ontology Engineering with Deep Learning . doi: 10.48550/arXiv.2307.03067 arxiv: 2307.03067 [cs]. OpenUrl CrossRef [14]. ↵ Yuan He , Jiaoyan Chen , Hang Dong , Ernesto Jiménez-Ruiz , Ali Hadian , and Ian Horrocks . 2023 . Machine Learning-Friendly Biomedical Datasets for Equivalence and Subsumption Ontology Matching . http://arxiv.org/abs/2205.03447 arxiv: 2205.03447 [cs, q-bio]. [15]. ↵ Sven Hertling and Heiko Paulheim . 2023 . 
OLaLa: Ontology Matching with Large Language Models . In Proceedings of the 12th Knowledge Capture Conference 2023. 131–139 . doi: 10.1145/3587259.3627571 arxiv: 2311.03837 [cs]. OpenUrl CrossRef [16]. ↵ Andreas Holzinger , Chris Biemann , Constantinos S. Pattichis , and Douglas B. Kell . 2017 . What do we need to build explainable AI systems for the medical domain? doi: 10.48550/arXiv.1712.09923 arxiv: 1712.09923 [cs]. OpenUrl CrossRef [17]. ↵ Ernesto Jiménez-Ruiz and Bernardo Cuenca Grau . 2011 . LogMap: Logic-Based and Scalable Ontology Matching . In The Semantic Web – ISWC 2011 , Lora Aroyo , Chris Welty , Harith Alani , Jamie Taylor , Abraham Bernstein , Lalana Kagal , Natasha Noy , and Eva Blomqvist (Eds.). Vol. 7031 . Springer Berlin Heidelberg, Berlin, Heidelberg, 273–288 . doi: 10.1007/978-3-642-25073-6_18 Series Title: Lecture Notes in Computer Science. OpenUrl CrossRef [18]. ↵ Renren Jin , Jiangcun Du , Wuwei Huang , Wei Liu , Jian Luan , Bin Wang, and Deyi Xiong . 2024 . A Comprehensive Evaluation of Quantization Strategies for Large Language Models . doi: 10.48550/arXiv.2402.16775 arxiv: 2402.16775 [cs]. OpenUrl CrossRef [19]. ↵ Bogumil M. Konopka . 2015 . Biomedical ontologies—A review . Biocybernetics and Biomedical Engineering 35 , 2 (2015), 75 – 86 . doi: 10.1016/j.bbe.2014.06.002 OpenUrl CrossRef [20]. ↵ Zeljko Kraljevic , Thomas Searle , Anthony Shek , Lukasz Roguski , Kawsar Noor , Daniel Bean , Aurelie Mascio , Leilei Zhu , Amos A. Folarin , Angus Roberts , Rebecca Bendayan , Mark P. Richardson , Robert Stewart , Anoop D. Shah , Wai Keong Wong , Zina Ibrahim , James T. Teo , and Richard J.B. Dobson . 2021 . Multi-domain clinical natural language processing with MedCAT: The Medical Concept Annotation Toolkit . Artificial Intelligence in Medicine 117 (July 2021), 102083 . 
doi: 10.1016/j.artmed.2021.102083 OpenUrl CrossRef PubMed [21]. ↵ LangChain, Inc . 2025 . Introduction . https://python.langchain.com/docs/introduction/ [22]. ↵ Jinhyuk Lee , Wonjin Yoon , Sungdong Kim , Donghyeon Kim , Sunkyu Kim , Chan Ho So , and Jaewoo Kang . 2020 . BioBERT: a pre-trained biomedical language representation model for biomedical text mining . Bioinformatics 36 , 4 (Feb. 2020), 1234 – 1240 . doi: 10.1093/bioinformatics/btz682 OpenUrl CrossRef PubMed [23]. ↵ Lee Boonstra . 2025 . Prompt Engineering . https://cloud.google.com/discover/what-is-prompt-engineering [24]. ↵ Eric Lehman , Evan Hernandez , Diwakar Mahajan , Jonas Wulff , Micah J Smith , Zachary Ziegler , Daniel Nadler , Peter Szolovits , Alistair Johnson , and Emily Alsentzer . 2023 . Do We Still Need Clinical Language Models? PhysioNet (Feb. 2023). doi arxiv: 2302.08091v1 [25]. ↵ Fangyu Liu , Ehsan Shareghi , Zaiqiao Meng , Marco Basaldella , and Nigel Collier . 2021 . Self-Alignment Pretraining for Biomedical Entity Representations . http://arxiv.org/abs/2010.11784 arxiv: 2010.11784 [cs]. [26]. Qiuhao Lu , Dejing Dou , and Thien Huu Nguyen . [n. d.]. ClinicalT5: A Generative Language Model for Clinical Text. ([n. d.]) . [27]. ↵ Meta . 2025 . FAISS . https://ai.meta.com/tools/faiss/ [28]. ↵ Alejandro Metke-Jimenez , Jim Steel , David Hansen , and Michael Lawley . 2018 . Ontoserver: a syndicated terminology server . J Biomed Semant 9 , 1 (Dec. 2018), 24 . doi: 10.1186/s13326-018-0191-z OpenUrl CrossRef [29]. ↵ George Michalopoulos , Yuanxin Wang , Hussam Kaka , Helen Chen , and Alexander Wong . 2021 . UmlsBERT: Clinical Domain Knowledge Augmentation of Contextual Embeddings Using the Unified Medical Language System Metathesaurus . In Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Association for Computational Linguistics, Online, 1744–1753 . 
doi: 10.18653/v1/2021.naacl-main.139 OpenUrl CrossRef [30]. ↵ Noah MacCallum and Julian Lee . 2025 . GPT-4.1 Prompting Guide . https://cookbook.openai.com/examples/gpt4-1_prompting_guide [31]. ↵ OAEI . 2020 . Ontology Alignment Evaluation Initiative 2020 Campaign . https://oaei.ontologymatching.org/2020/ [32]. ↵ OAEI . 2023 . Ontology Alignment Evaluation Initiative . http://oaei.ontologymatching.org/2023/ [33]. ↵ OAEI . 2024 . Ontology Alignment Evaluation Initiative . http://oaei.ontologymatching.org/2024/ [34]. ↵ Observational Health Data Sciences and Informatics OHDSI . 2021 . USAGI . https://github.com/OHDSI/Usagi [35]. ↵ Ollama . 2025 . Ollama . https://ollama.com [36]. ↵ Samira Oulefki , Lamia Berkani , Ladjel Bellatreche , and Nassim Boudjenah . [n. d.]. Results for BioGITOM in OAEI 2024. R F ([n. d.]) . [37]. ↵ Vaclav Papez , Maxim Moinat , Erica A Voss , Sofia Bazakou , Anne Van Winzum , Alessia Peviani , Stefan Payralbe , Elena Garcia Lara , Michael Kallfelz , Folkert W Asselbergs , Daniel Prieto-Alhambra , Richard J B Dobson , and Spiros Denaxas . 2022 . Transforming and evaluating the UK Biobank to the OMOP Common Data Model for COVID-19 research and beyond . Journal of the American Medical Informatics Association 30 , 1 (Dec. 2022), 103 – 111 . doi: 10.1093/jamia/ocac203 OpenUrl CrossRef [38]. ↵ Yashpal Ramakrishnaiah , Nenad Macesic , Geoffrey I. Webb , Anton Y. Peleg , and Sonika Tyagi . 2023 . EHR-QC: A streamlined pipeline for automated electronic health records standardisation and preprocessing to predict clinical outcomes . Journal of Biomedical Informatics 147 (Nov. 2023), 104509 . doi: 10.1016/j.jbi.2023.104509 OpenUrl CrossRef PubMed [39]. ↵ Tabinda Sarwar , Sattar Seifollahi , Jeffrey Chan , Xiuzhen Zhang , Vural Aksakalli , Irene Hudson , Karin Verspoor , and Lawrence Cavedon . 2023 . The Secondary Use of Electronic Health Records for Data Mining: Data Characteristics and Challenges . ACM Comput. Surv . 55 , 2 (Feb. 2023), 1 – 40 . 
doi: 10.1145/3490234 OpenUrl CrossRef [40]. ↵ Merlijn Sevenster , Rob Van Ommering , and Yuechen Qian . 2012 . Algorithmic and user study of an autocompletion algorithm on a large medical vocabulary . Journal of Biomedical Informatics 45 , 1 (Feb. 2012), 107 – 119 . doi: 10.1016/j.jbi.2011.09.004 OpenUrl CrossRef PubMed [41]. ↵ Bram Steenwinckel , Mathias De Brouwer , Marija Stojchevska , Jeroen Van Der Donckt , Jelle Nelis , Joeri Ruyssinck , Joachim Van Der Herten , Koen Casier , Jan Van Ooteghem , Pieter Crombez , Filip De Turck , Sofie Van Hoecke , and Femke Ongenae . 2023 . Data Analytics for Health and Connected Care: Ontology, Knowledge Graph and Applications . In Pervasive Computing Technologies for Healthcare , Athanasios Tsanas and Andreas Triantafyllidis (Eds.). Vol. 488. Springer Nature Switzerland, Cham , 344 – 360 . doi: 10.1007/978-3-031-34586-9_23 Series Title: Lecture Notes of the Institute for Computer Sciences, Social Informatics and Telecommunications Engineering. OpenUrl CrossRef [42]. ↵ Maria Taboada , Diego Martinez , Mohammed Arideh , and Rosa Mosquera . 2025 . Ontology Matching with Large Language Models and Prioritized Depth-First Search . doi: 10.48550/arXiv.2501.11441 arxiv: 2501.11441 [cs]. OpenUrl CrossRef [43]. ↵ Navya Tyagi , Naima Vahab , and Sonika Tyagi . 2024 . Genome Language Modeling (Glm): A Beginner’s Cheat Sheet . doi: 10.20944/preprints202411.0285.v1 OpenUrl CrossRef [44]. ↵ Amy Y Wang , Jeremiah H Sable , and Kent A Spackman . 2002 . The SNOMED clinical terms development process: refinement and analysis of content . In Proceedings of the AMIA Symposium. American Medical Informatics Association, 845 . [45]. ↵ World Health Organisation . 2019 . International Statistical Classification of Diseases and Related Health Problems 10th Revision . https://icd.who.int/browse10/2019/en [46]. ↵ World Health Organisation . 2022 . 
International Statistical Classification of Diseases and Related Health Problems (ICD) . https://www.who.int/standards/classifications/classification-of-diseases [47]. Qianqian Xie , Qingyu Chen , Aokun Chen , Cheng Peng , Yan Hu , Fongci Lin , Xueqing Peng , Jimin Huang , Jeffrey Zhang , Vipina Keloth , Xinyu Zhou , Lingfei Qian , Huan He , Dennis Shung , Lucila Ohno-Machado , Yonghui Wu , Hua Xu , and Jiang Bian . [n. d.]. Me-LLaMA: Medical Foundation Large Language Models for Comprehensive Text Analysis and Beyond. ([n. d.]) . [48]. ↵ Xi Yang , Nima PourNejatian , Hoo Chang Shin , Kaleb E Smith , Christopher Parisien , Colin Compas , Cheryl Martin , Mona G Flores , Ying Zhang , Tanja Magoc , Christopher A Harle , Gloria Lipori , Duane A Mitchell , William R Hogan , Elizabeth A Shenkman , Jiang Bian , and Yonghui Wu . 2022 . GatorTron: A Large Language Model for Clinical Natural Language Processing . doi: 10.1101/2022.02.27.22271257 OpenUrl Abstract / FREE Full Text [49]. ↵ Hongjian Zhou , Fenglin Liu , Boyang Gu , Xinyu Zou , Jinfa Huang , Jinge Wu , Yiru Li , Sam S Chen , Peilin Zhou , Junling Liu , Yining Hua , Chengfeng Mao , Chenyu You , Xian Wu , Yefeng Zheng , Lei Clifton , Zheng Li , Jiebo Luo , and David A Clifton . 2024 . A Survey of Large Language Models in Medicine: Progress, Application, and Challenge . Preprint (2024) . doi:arXiv.2303.18223 View the discussion thread. Back to top Previous Next Posted June 17, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following Semantic Encoding in Medical LLMs for Vocabulary Standardisation Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Semantic Encoding in Medical LLMs for Vocabulary Standardisation Samuel Mainwood , Aashish Bhandari , Sonika Tyagi medRxiv 2025.06.16.25329716; doi: https://doi.org/10.1101/2025.06.16.25329716 Share This Article: Copy Citation Tools Semantic Encoding in Medical LLMs for Vocabulary Standardisation Samuel Mainwood , Aashish Bhandari , Sonika Tyagi medRxiv 2025.06.16.25329716; doi: https://doi.org/10.1101/2025.06.16.25329716 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (569) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1511) Epidemiology (15230) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6610) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15923) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6607) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3338) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative 
Medicine (130) Pathology (665) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary Care Research (712) Psychiatry and Clinical Psychology (5448) Public and Global Health (9239) Radiology and Imaging (2202) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a01fc7635ac21b23',t:'MTc3OTgyOTk1NQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.