Filling the gaps: leveraging large language models for temporal harmonization of clinical text across multiple medical visits for clinical prediction

doi:10.1101/2024.05.06.24306959

Filling the gaps: leveraging large language models for temporal harmonization of clinical text across multiple medical visits for clinical prediction

2024 · doi:10.1101/2024.05.06.24306959

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 35,686 characters · extracted from preprint-html · click to expand

Filling the gaps: leveraging large language models for temporal harmonization of clinical text across multiple medical visits for clinical prediction | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Filling the gaps: leveraging large language models for temporal harmonization of clinical text across multiple medical visits for clinical prediction View ORCID Profile Inyoung Choi , View ORCID Profile Qi Long , View ORCID Profile Emily Getzen doi: https://doi.org/10.1101/2024.05.06.24306959 Inyoung Choi 1 School of Engineering and Applied Sciences at the University of Pennsylvania , Philadelphia, PA B.S. 
Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Inyoung Choi For correspondence: inyoungc{at}seas.upenn.edu Qi Long 2 Perelman School of Medicine at the University of Pennsylvania , Philadelphia, PA Ph.D. Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Qi Long Emily Getzen 2 Perelman School of Medicine at the University of Pennsylvania , Philadelphia, PA M.S. Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Emily Getzen Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Electronic health records offer great promise for early disease detection, treatment evaluation, information discovery, and other important facets of precision health. Clinical notes, in particular, may contain nuanced information about a patient’s condition, treatment plans, and history that structured data may not capture. As a result, and with advancements in natural language processing, clinical notes have been increasingly used in supervised prediction models. To predict long-term outcomes such as chronic disease and mortality, it is often advantageous to leverage data occurring at multiple time points in a patient’s history. However, these data are often collected at irregular time intervals and varying frequencies, thus posing an analytical challenge. Here, we propose the use of large language models (LLMs) for robust temporal harmonization of clinical notes across multiple visits. We compare multiple state-of-the-art LLMs in their ability to generate useful information during time gaps, and evaluate performance in supervised deep learning models for clinical prediction. 
Introduction In 2009, the United States introduced the Health Information Technology for Economic and Clinical Health (HITECH) Act with the aim of promoting the implementation of health information technology 1 . Since then, there has been a significant increase in the utilization of Electronic Health Records systems (EHR), which are digital versions of a patient’s medical chart. These data, which include information about a patient’s medical history, diagnoses, medications, laboratory test results, and treatment plans, hold great potential for application in various healthcare settings. EHR data have been employed for various purposes such as disease prediction, risk assessment, and assisting clinical decisions. Clinical notes in particular, which may contain important contextual information regarding family history, subtleties of symptoms and treatments, early warning signs, lifestyle, and socioeconomic factors, have been increasingly used for predictive modeling of disease. 2 - 15 However, EHR presents several challenges as it contains data that is recorded at irregular intervals and with varying frequencies ( Figure 1 ) 16 . This can happen for a variety of reasons– in an emergency department (ED) or intensive care setting (ICU), patients with more frequent visits might be sicker than their counterparts. 17 - 18 In other settings such as primary care, patients from different groups may have differing levels of access to healthcare. For example, studies have found that underserved populations are more likely to visit multiple healthcare institutions to receive care, thus contributing to data fragmentation 19 . It has also been found that disproportionate missing data in patients within certain demographic groups deteriorates the prediction of disease, emphasizing the impact that missing data may have on underserved populations 20 . Download figure Open in new tab Figure 1: Medical visits occur at irregular time intervals and varying frequencies. 
Deep learning models that can handle time series data such as Long Short-Term Memory Neural Network (LSTM) or transformers generally expect a fixed number of time points from which to learn temporal relationships. Traditionally, to deal with the irregular time series nature of EHRs, many machine learning practitioners employ a method known as zero-padding. This involves appending zeros (or in some cases, prepending them) to the beginning and / or end of the time series data in order to create inputs of fixed size 21 . While zero padding is a standard technique for dealing with missing time series data, there are a variety of issues that can arise. The introduction of zeros can create artificial temporal relationships that do not actually exist. Other methods average the patient notes over time, then combine it with a time-series embedding to obtain the final representation 15 . Additionally, machine learning architectures such as convolutional neural networks (CNN) and CNN-RNN architecture with semantic and temporal blocks can be leveraged to address temporality. 5 , 10 More recently, transformers have been adapted for temporal data, such as the Flexible Time-aware LSTM Transformer, which incorporates a time aware mechanism. 14 This mechanism addresses the irregular timing of notes by learning a flexible time decay function, effectively handling the variability in data spacing. Large Language Models (LLMs) are artificial intelligence systems trained on extensive text data to understand and generate human-like language. These models have demonstrated remarkable capabilities in text generation, utilizing their comprehensive knowledge and understanding of language patterns. Recent works have leveraged LLMs for clinical applications, including named entity recognition, label generation, relation extraction, etc. 
21 , 22 We propose that LLMs, by harnessing knowledge from their extensive training data and the existing data in the EHR, will be capable of making meaningful observations that consider the temporal aspect of EHR data. LLMs have been found to be capable of providing information about a hypothetical datapoint given information about its embedding space 23 . Thus, by filling in the gap using text generated by LLMs, we hypothesize that we will be able to increase the accuracy of clinical predictions compared to existing machine learning and imputation methods for clinical text. Furthermore, through LLM generation, we can potentially mitigate disparities resulting from dataset irregularities and missing data, which can arise when certain groups have differing health status or limited access to healthcare. We leverage a variety of LLMs, including those trained specifically on biology and clinical data, to enhance the temporal structure of clinical notes from EHRs data. We feed the enhanced temporal structure into supervised deep learning models to predict 1-year mortality in ICU / ED patients. We compare to existing methods such as multimodal imputation, last observation carried forward (LOCF), and zero padding. Methods Data We use the Medical Information Mart for Intensive Care IV (MIMIC-IV) database, which contains de-identified health data for patients who were admitted to either the emergency department or stayed in critical care units of the Beth Israel Deaconess Medical Center in Boston, Massachusetts 24 . MIMIC-IV excludes patients under 18 years of age. Date of death is derived from hospital and state records collected two years after the last patient discharge. We restrict patients to those with at least two medical visits within 1-2 years apart. Visits occurring more than two years after the first visit are treated as a separate patient. A patient’s observation window is made up of medical visits that occur within two years after their first visit. 
We incorporate a 30-day buffer between the observation window and the prediction window so as not to use any visits during which the patient may have died (these patients were omitted). This process yields 33,123 patients with 12 percent of patients having experienced 1-year mortality. We then downsample randomly for a more balanced cohort (50/50). This yields 6070 patients ( Table 1 ). View this table: View inline View popup Download powerpoint Table 1: Characteristics of our MIMIC-IV patient cohort From each patient visit we extract the first 512 words from each discharge summary (deleting chunks of text related to physical exam and discharge instructions). We use BioClinicalBERT to create a 768-dimensional vector representation of each discharge summary from each visit. BioClinicalBERT was initialized with BioBERT (trained on PubMed articles) and trained on all the notes from MIMIC-III, a past version of our database containing electronic health records from ICU patients from BIDMC between 2001 and 2012 9 , 25 . Zero padding The use of zero-padding is a standard technique to deal with missing data in time series analysis, as it allows machine learning models to handle variable-length sequences while maintaining a consistent input size 26 . The maximum number of visits for a patient in our data is 79. Thus, for each patient, we fill in any missing doctor’s notes and ensure representation for up to 79 visits by using zero vectors of dimension 768 in accordance with the BioClinicalBERT embedding size ( Figure 2a ). Download figure Open in new tab Figure 2: Illustration of zero-padding and LLM-based imputation: (a) BioClinicalBERT embeddings are generated based on existing notes and zero-vectors are used for missing notes. (b) The LLM is given existing notes and asked to generate text for the missing notes. BioClinicalBERT embeddings are calculated for both existing and generated text. 
Last Observation Carried Forward (LOCF) For the rest of the temporal harmonization methods, we first apply temporal windows to the data. We consider the first visit as our index to align each patient, and split each patient record into four six-month intervals. If multiple visits occur in an interval, we keep the notes from the most recent visit. Naturally, if no visits occur in a particular interval, there is a missing gap. The following methods describe the different ways to fill that gap. For each missing doctor’s note embedding, if it is not associated with the first visit, we reference the last observation for that patient and carry that embedding forward. In the case that the first visit is missing, we impute with a zero vector. This method is based on the rationale that the most recent observation for the patient typically holds the most relevant information about their condition. Multimodal Imputation Supervised learning models may learn better by leveraging multimodal data to pull information from similar patients. MIMIC-IV data also provides codified data and structured data such as lab and vitals. Thus, we extract the diagnosis codes, prescription codes, procedure codes, and 30 most common labs and vitals in addition to the discharge summaries. We hypothesize that for two multimodal examples, if the majority of their modalities exhibit high similarities, then it is likely that their remaining modalities will also be similar. Thus, we propose that we leverage the most similar datapoint across the known modalities to impute the missing data in the remaining modality. Note that in this process, we mask out all note embeddings belonging to the same patient so that the algorithm is forced to carry over note embeddings from a different patient. We use cosine similarity for the similarity metric. 
Large Language Models Finally, LLMs trained on large amounts of text data with billions of parameters may provide additional knowledge to help fill in the gaps between patient visits. We leverage LLMs to generate text data for the missing doctor’s notes based on the available doctor’s notes for that patient ( Figure 2b ). We compare the existing state-of-the-art LLM Generative Pre-trained Transformer 4 (GPT-4) to LLMs that were trained specifically on biology data and clinical data. GPT-4 is a LLM developed by OpenAI, based on the transformer architecture. It demonstrates human-like language abilities and is trained on a large corpus of text data in an unsupervised manner 27 . BioMistral is an open-sourced LLM that uses Mistral as its foundation model and is further pre-trained on PubMed Central 28 . Asclepius is a specialized clinical large language model that is trained on synthetic clinical notes and further evaluated with real clinical notes from MIMIC-III discharge summaries. Asclepius has exhibited comparable performance metrics to GPT-3.5-turbo on several clinical benchmarks 29 . To evaluate the performance of these three LLMs, we begin by exploring their capabilities in zero-shot learning. Zero-shot learning refers to the ability of a model to perform a task without having seen any explicit examples of that task during training 30 . We provide the LLM with the patient’s four visits, labeling any missing visits as ‘MISSING.’ We inform the LLM that each visit occurs at six-month intervals. We prompt the LLM to generate a doctor’s note for the missing visit about the patient’s symptoms and treatment plans based on the information from the existing visits. We further explore few-shot learning, where the LLM is provided with a few examples of the task within the prompt itself, along with the original task instructions. This approach has been found to enhance the model’s performance by providing contextual examples 30 . 
However, few-shot learning may be limited by the LLM’s input size constraints, particularly when the original prompt is lengthy. Thus, we experimented with one-shot learning and modified the format of the example prompt. In this format, the example prompt includes notes from a single visit and the corresponding answer is the note from the subsequent visit occurring six months later. Following this, the original task instruction is provided. Supervised learning We compare various methods for temporal harmonization of clinical text across multiple medical visits. For unstructured text data, we use BioClinicalBERT to extract embeddings. Due to input size constraints of the BioClinicalBERT model, we truncate the text to the last 512 tokens and output a 768-dimensional vector representation of the text. All input text is lowercase. In order to capture the temporal nature of the data, we use a BiLSTM model and a transformer model with positional encoding for evaluation. After the missing doctor’s notes have been filled through various methods, we extract the BioClinicalBERT embeddings and then feed the vectors into the models to predict mortality. The BiLSTM is trained for 100 epochs with an Adam optimizer and a learning rate of 1e-4 and consists of two layers with a hidden dimension of 256. The transformer architecture is also trained for 100 epochs and consists of one layer with four attention heads with an Adam optimizer and a learning rate of 1e-4. Algorithmic Fairness We further assess our best method’s performance by segmenting the data based on the percentage of missing information. We compare the effectiveness of zero-padding and the best temporal harmonization methods across varying levels of data incompleteness. Results We evaluate the performance of various temporal harmonization methods by comparing their area under the curve (AUC) scores and F1 metrics for both BiLSTM and transformer architectures. 
In both architectures, GPT-4 with zero-shot prompting achieves the highest AUC score. For F1-score, GPT-4 with zero-shot prompting leads in the BiLSTM architecture, while GPT-4 with one-shot prompting has the best performance with the Transformer architecture. GPT-4 significantly surpasses the baseline zero-padding method ( Table 2 , Figure 3 ). For precision and recall, see Table S1 in the supplement. View this table: View inline View popup Download powerpoint Table 2: Performance metrics of supervised learning methods by temporal harmonization method Download figure Open in new tab Figure 3: ROC curves by temporal harmonization method for a) BiLSTM and b) Transformer We find that there is no significant difference in performance between zero-shot prompting and one-shot prompting across all three LLMs. When we evaluate the AUC scores and F1-metric, there is no clear trend that suggests that one-shot prompting increases the accuracy of the generated text ( Table 2 , Figure 3 ). For non-LLM methods, we see that the LOCF approach shows a slight improvement over the baseline method of zero-padding. We further note that Multimodal Imputation underperforms compared to the baseline method of zero-padding ( Table 2 , Figure 3 ). After applying temporal windows on the data, each patient has four time intervals that occur six months apart. After identifying the best performing temporal harmonization methods, we stratify our test data by patients with less than 50% missingness for those time intervals, and those with more than 50%. We observe huge improvements in model performance for both BiLSTM (AUC from 0.728 to 0.846, F1 score from 0.673 to 0.717) and transformer (AUC from 0.781 to 0.836, F1 score from 0.608 to 0.705) when using GPT-4 zero shot and one shot prompting to fill in the gaps ( Table 3 ). For precision / recall results, see Table S2 in the Supplement. 
View this table: View inline View popup Download powerpoint Table 3: Quantifying improvements in performance for patients with more or less missing data Discussion In this work, we demonstrated the effectiveness of GPT-4 for temporal harmonization in supervised learning models when modeling medical notes for clinical prediction. Since GPT-4 is not an open source model, it is unknown what data were used to train it. However, its performance in temporal harmonization indicates that it was likely trained on medical data along with other sources of data to provide insightful predictions about patient conditions when filling in the missing gaps. Particularly for patients with fewer medical visits, we observe huge improvements in model performance by filling those gaps with text generated by GPT-4. Previous studies have shown that missing data in EHRs can negatively impact model performance in disease prediction models 19 . In the emergency department / ICU case, patients with fewer visits are more likely to be healthier. However, in a primary care setting, underserved populations are more likely to have less data in their record. These results display the potential for GPT-4 to combat algorithmic unfairness by augmenting the EHRs for patients with less data. Interestingly, our results indicate that the use of GPT-4 for temporal harmonization in clinical prediction models surpasses the use of clinical and biological LLMs in performance, highlighting the adaptability of GPT-4 for healthcare applications. This suggests that even without targeted training in medical text, GPT-4 is able to generalize and provide relevant information in various clinical scenarios. This suggests exciting new directions for research in applying more generalist LLMs to healthcare, and identifying different scenarios where one may or may not need specific clinical LLMs. 
We observe that Multimodal Imputation performs worse in our supervised deep learning models than the baseline method of zero-padding. This is likely due to the fact that Multimodal Imputation masks out note embeddings from the same patient during imputation. Even if data from the other modalities is very similar, forcing the imputation to carry over note embeddings from a different patient could introduce noise into the note embeddings. We further note that one-shot prompting did not lead to a consistent increase in model performance compared to zero-shot prompting. This may be an indicator that the LLM models have strong generalization capabilities even without task-specific training. In that case, we could come to the conclusion that zero-shot prompting is sufficient for our setting. However, the lack of performance improvement could also be attributed to the length of the prompts and the potential limitations of the LLM in handling longer prompts. Additionally, the discrepancy in formatting between the example prompt and the actual prompt, motivated by constraints on input length, may have diminished the impact of one-shot prompting. There are a few limitations to this study. Future research could focus on enhancing explainability with LLMs and reducing instances of hallucinations. Currently, due to the black-box nature of LLMs, it is difficult to determine how they arrive at specific outputs, which is problematic in the healthcare sector where transparency is critical. We further anticipate that improving explainability in LLMs could also contribute to minimizing hallucinations in their outputs. Therefore, research into methods of identifying and correcting hallucinations in LLM outputs will be a significant step towards their safe and effective use in the domain of healthcare. 
Finally, it would be worth investigating the impact of LLMs on temporal harmonization of primary care data, or even simulating missingness in ICU clinical notes that reflect barriers in access to healthcare (similar to Getzen et al., 2023), and evaluating LLMs’ abilities to fill in the gaps and improve clinical prediction performance. Data Availability All data produced are available online at https://physionet.org/content/mimiciv/2.2/ https://physionet.org/content/mimiciv/2.2/ Supplementary material View this table: View inline View popup Table S1: Performance metrics of supervised learning methods by temporal harmonization method including PPV and Recall View this table: View inline View popup Download powerpoint Table S2: Quantifying improvements in performance for patients with more or less missing data including PPV and Recall Acknowledgements This work was supported in part by National Institutes of Health grants, U01CA274576 and RF1AG063481. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health. References 1. ↵ Hoerbst A , Ammenwerth E. Electronic health records . A systematic review on quality requirements. Methods Inf Med . 2010 ; 49 ( 4 ): 320 – 36 OpenUrl 2. ↵ Huang K , Altosaar J , Ranganath R. ClinicalBERT: Modeling Clinical Notes and Predicting Hospital Readmission . CHIL Workshop . 2020 . 3. Kim B. Medical Codes Predictions from Clinical Notes: From Human Coders to Machines . BayLearn . 2022 . 4. Saigaonkar S , Narawade V. Predicting chronic diseases using clinical notes and fine-tuned transformers . IEEE Bombay Section . 2022 . 5. ↵ Hashir M , Rupy R. Towards unstructured mortality prediction with free-text clinical notes . Journal of Biomedical Informatics . 2020 ; 108 ; 103489 OpenUrl 6. Zhang Y , Chen Q , Yang Z , Lin H , Lu Z. Biowordvec, improving biomedical word embeddings with subword information and mesh . Scientific data . 2019 ; 6 ( 1 ): 1 – 9 . 
OpenUrl 7. Ghorbani R , Ghousi R , Makui A , Atashi A. A new hybrid predictive model to predict the early mortality risk in intensive care units on a highly imbalanced dataset . IEEE Access . 2020 . 8. Lee J , Yoon W , Kim S , Kim D , Kim S , So C , Kang J. Biobert: a pre-trained biomedical language representation model for biomedical text mining . Bioinformatics , 36 ( 4 ): 1234 – 1240 . 9. ↵ Alsentzer E , Murphy J , Boag W , Weng W , Jindi D , Naumann T , McDermott M. Publicly available clinical-bert embeddings . Proceedings of the 2nd Clinical Natural Language Processing Workshop . 72 – 78 . 10. ↵ Yang H , Kuang L , Xia F. Multimodal temporal-clinical network for mortality prediction . Journal of Biomedical Semantics . 2021 ; 12 . 11. Liu J , Zhang Z , Razavian N. Deep EHR: Chronic Disease Prediction Using Medical Notes . Proceedings of the 3rd Machine Learning for Healthcare Conference . 2018 ; 85 : 440 – 464 . OpenUrl 12. Xu Z , So D , Dai A. MUFASA: Multimodal Fusion Architecture Search for Electronic Health Records . 35th AAAI Conference on Artificial Intelligence . 2021 . 13. Yang B , Wu L. How to Leverage Multimodal EHR Data for Better Medical Predictions? EMNLP . 2021 . 14. ↵ Zhang D , Thadajarassiri J , Sen C , Rundensteiner E. Time-Aware Transformer-based Network for Clinical Notes Series Prediction . Proceedings of Machine Learning Research . 2020 . 15. ↵ Deznabi , Iyyer M , Fiterau M. Predicting in-hospital mortality by combining clinical notes with time-series data . Findings . 2021 . 15. Wells B , Chagrin KM , Nowacki AS , Kattan MW . Strategies for handling missing data in electronic health record derived data . eGEMs . 2013 . 16. 
↵ Tan A , Getzen E , Hutch M , Strasser Z , Gutiérrez-Sacristán A , Le T , Dagliati A , Morris M , Hanauer D , Moal B , Bonzel C , Yuan W , Chiudinelli L , Das P , Zhang H , Aronow B , Avillach P , Brat G , Cai T , Hong C , La Cava W , Shriver E , Shakeri Hossein Abad Z , Tan B , Visweswaran S , Wang X , Weber G , Xia Z , Verdy B. COVID-19 by EHR 4CE Collaborative Group/Consortium, Long Q, Mowery D, Holmes J . Informative missingness: what can we learn from patterns in missing laboratory data in the electronic health record? J Biomed Inform 2023 ; 2 : 104306 . OpenUrl 17. ↵ Getzen E , Tan A , Brat G , Omenn G , Strasser Z , COVID-19 by EHR 4CE Collaborative Group/Consortium , Long Q , Holmes J , Mowery D. Leveraging informative missing data to learn about acute respiratory distress syndrome and mortality in long-term hospitalized COVID-19 patients throughout the years of the pandemic . AMIA Ann. Symp. Proc . 2024 ; 942 – 950 . 18. ↵ Gianfrancesco M , Tamang S , Yazdany J , Schmajuk G. Potential biases in machine learning algorithms using electronic health record data . Jama Intern Med . 2019 ; 178 ( 11 ): 1544 – 1547 . OpenUrl 19. ↵ Getzen E , Ungar L , Mowery D , Jiang X , Long Q. Mining for equitable health: Assessing the impact of missing data in electronic health records . Journal of Biomedical Informatics . 2023 ; 139 : 104269 . OpenUrl CrossRef 20. ↵ Hettige B , Wang W , Yuan-Fang L , Suong L , Wray B. MedGraph: structural and temporal representation learning of electronic medical records . ECAI . 2020 . 21. ↵ Goel A , Gueta A , Gilon O , Liu C , Erell S , Nguyen L , Hao X , Jaber B , Reddy S , Kartha R , Steiner J , Laish I , Feder A. LLMs accelerate annotation for medical information extraction . Machine Learning for Health (ML4H) . 2023 ; 82 – 100 . 22. ↵ Yang J , Jin H , Tang R , Han X , Feng Q , Jiang H , Yin B , Hu X Harnessing the Power of LLMs in Practice: A Survey on ChatGPT and Beyond . ACM Transactions on Knowledge Discovery From Data . 2023 . 23. 
↵ Tennenholtz G , Chow Y , Hsu C-W , Jeong J , Shani L , Tulepbergenov A , Ramachandran D , Mladenov M , Boutilier C. Demystifying Embedding Spaces Using Large Language Models . ICLR . 2024 . 24. ↵ Johnson A , Bulgarelli L , Shen L , Gayles A , Shammout A , Horng S , Pollard T , Hao S , Moody B , Gow B , Lehman L , Celi L , Mark R. MIMIC-IV, a freely accessible electronic health record dataset . Scientific Data . 2023 ; 10 . 25. ↵ Johnson A , Pollard T , Shen L , Lehman L , Feng M , Ghassemi M , Moody B , Szolovits P , Celi L , Mark R. MIMIC-III, a freely accessible electronic health record dataset . Scientific Data . 2016 ; 3 : 160035 . OpenUrl 26. ↵ Goodfellow I , Bengio Y , Courville A. Deep Learning . MIT Press . 2016 . 27. ↵ OpenAI . GPT-4 . 2023 . 28. ↵ Labrak Y , Bazoge A , Morin E , Gourraud P , Rouvier M , Dufour R. BioMistral: A Collection of Open-Source Pretrained Large Language Models for Medical Domains . arXiv preprint . 2024 ;2404.10373 29. ↵ Kweon S , Kim J , Kim J , Im S , Cho E , Bae S , Oh J , Lee G , Moon J , You S , Baek S , Han C , Jung Y , Jo Y , Choi E. Publicly Shareable Clinical Large Language Model Built on Synthetic Clinical Notes . arXiv preprint . 2023 ;2309.00237. 30. ↵ Radford A , Wu J , Child R , Luan D , Amodei D , Sutskever I. Language models are unsupervised multitask learners . OpenAI blog . 2019 ; 1 ( 8 ): 9 . OpenUrl View the discussion thread. Back to top Previous Next Posted May 07, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following Filling the gaps: leveraging large language models for temporal harmonization of clinical text across multiple medical visits for clinical prediction Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Filling the gaps: leveraging large language models for temporal harmonization of clinical text across multiple medical visits for clinical prediction Inyoung Choi , Qi Long , Emily Getzen medRxiv 2024.05.06.24306959; doi: https://doi.org/10.1101/2024.05.06.24306959 Share This Article: Copy Citation Tools Filling the gaps: leveraging large language models for temporal harmonization of clinical text across multiple medical visits for clinical prediction Inyoung Choi , Qi Long , Emily Getzen medRxiv 2024.05.06.24306959; doi: https://doi.org/10.1101/2024.05.06.24306959 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Intensive Care and Critical Care Medicine Subject Areas All Articles Addiction Medicine (574) Allergy and Immunology (865) Anesthesia (304) Cardiovascular Medicine (4460) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (611) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15251) Forensic Medicine (31) Gastroenterology (1132) Genetic and Genomic Medicine (6621) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4564) Health Policy (1372) Health Systems and Quality Improvement (1617) Hematology (544) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15938) Intensive Care and Critical Care Medicine (1107) Medical Education (624) Medical 
Ethics (147) Nephrology (670) Neurology (6642) Nursing (346) Nutrition (1001) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3350) Ophthalmology (981) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1698) Pharmacology and Therapeutics (694) Primary Care Research (714) Psychiatry and Clinical Psychology (5464) Public and Global Health (9259) Radiology and Imaging (2212) Rehabilitation Medicine and Physical Therapy (1372) Respiratory Medicine (1198) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (533) Surgery (715) Toxicology (100) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a0392e73aa5d7406',t:'MTc4MDA5NjMyMA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00