Automated abstraction of clinical parameters of multiple myeloma from real-world clinical notes using large language models

doi:10.1101/2024.12.17.24318605

Automated abstraction of clinical parameters of multiple myeloma from real-world clinical notes using large language models

2024 · doi:10.1101/2024.12.17.24318605

preprint OA: closed CC-BY-ND-4.0

📄 Open PDF Full text JSON View at publisher

Full text 47,504 characters · extracted from preprint-html · click to expand

Automated abstraction of clinical parameters of multiple myeloma from real-world clinical notes using large language models | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Automated abstraction of clinical parameters of multiple myeloma from real-world clinical notes using large language models View ORCID Profile Alana O’Brien Del Campo , View ORCID Profile Dmytro Lituiev , View ORCID Profile Gowtham Varma , View ORCID Profile Mithun Manoharan , View ORCID Profile Sunil Kumar Ravi , View ORCID Profile Avinash Aman , View ORCID Profile Ankit Kansagra , View ORCID Profile Joel Greshock , View ORCID Profile AJ Venkatakrishnan , View ORCID Profile Ashita 
Batavia doi: https://doi.org/10.1101/2024.12.17.24318605 Alana O’Brien Del Campo 1 Johnson & Johnson Innovative Medicine , 301 Binney Street, Cambridge, MA 02142 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alana O’Brien Del Campo Dmytro Lituiev 2 Johnson & Johnson Innovative Medicine, 1 Johnson & Johnson Plaza , New Brunswick, NJ, 08933 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Dmytro Lituiev Gowtham Varma 3 nference, 3rd, 4th & 5th Floor, Indiqube Golf View Homes, 3rd Cross, Rustam Bagh Layout, NAL Wind Tunnel Main Road , Murugeshpalya, Bengaluru – 560017, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Gowtham Varma Mithun Manoharan 3 nference, 3rd, 4th & 5th Floor, Indiqube Golf View Homes, 3rd Cross, Rustam Bagh Layout, NAL Wind Tunnel Main Road , Murugeshpalya, Bengaluru – 560017, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mithun Manoharan Sunil Kumar Ravi 3 nference, 3rd, 4th & 5th Floor, Indiqube Golf View Homes, 3rd Cross, Rustam Bagh Layout, NAL Wind Tunnel Main Road , Murugeshpalya, Bengaluru – 560017, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sunil Kumar Ravi Avinash Aman 3 nference, 3rd, 4th & 5th Floor, Indiqube Golf View Homes, 3rd Cross, Rustam Bagh Layout, NAL Wind Tunnel Main Road , Murugeshpalya, Bengaluru – 560017, India Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Avinash Aman Ankit Kansagra 4 Johnson & Johnson Innovative Medicine , 920 US Route 202 South, Raritan, NJ, 08869 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ankit Kansagra Joel 
Greshock 5 Johnson & Johnson Innovative Medicine , 965 Chesterbrook Boulevard, Wayne, PA, 19087 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joel Greshock AJ Venkatakrishnan 6 nference, One Main Street , Suite 400 East Arcade, 4th Floor Cambridge, MA 02142 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for AJ Venkatakrishnan For correspondence: aj{at}nference.net Ashita Batavia 2 Johnson & Johnson Innovative Medicine, 1 Johnson & Johnson Plaza , New Brunswick, NJ, 08933 Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ashita Batavia For correspondence: aj{at}nference.net Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Background Real-world evidence (RWE) is increasingly recognized as a valuable type of oncology research but extracting fit-for-purpose real-world data (RWD) from electronic health records (EHRs) remains challenging. Manual abstraction from free-text clinical documents, although the gold standard for information extraction, is resource-intensive. RWD generation using natural language processing (NLP) has been limited by performance ceilings and annotation requirements, which recent LLMs improve on. We evaluate new NLP workflows in abstracting multiple myeloma (MM) information from de-identified EHRs. Methods NLP workflows (BERT and Llama-based using various prompt types) were developed for 12 MM-specific data fields and evaluated with manually curated data from 125 clinical notes. The best Llama-based workflow for three data fields was applied to all recent notes in selected charts to generate patient journey timelines. Results Average F 1 for the best Llama and BERT workflows was 0.82 and 0.65 respectively. Best workflow performance ranged across the data fields (F 1 = 0.59–0.99). 
Statistical analysis of the results showed model size, inter-rater reliability (IRR), variable type, and prompt design significantly predicted workflow performance, in descending order of significance ( p < 0.05). Conclusion The overall performance improvements seen with larger LLMs and chain-of-thought prompting were greater in ambiguous data fields. IRR can be used to prioritize NLP resources and increase efficiency of RWD generation without sacrificing data quality. Introduction In the past decade, multiple myeloma (MM) patients and providers have seen a rapid expansion of therapeutic options, creating greater need for real world evidence (RWE) that can expand on findings from randomized controlled trials. 1 – 4 Electronic Health Records (EHRs) contain a wealth of real-world clinical data capturing diagnoses, treatments and outcomes. However, key concepts for creating trial-like cohorts and outcomes, such as transplant eligibility and certain International Myeloma Working Group (IMWG) criteria, are captured in unstructured clinical notes and must be extracted and structured to create fit-for-purpose real world data (RWD) usable for evidence generation. 5 , 6 Manual abstraction continues to be the gold standard for clinical information extraction (IE) from unstructured text. However, the time and resources required to manually abstract numerous clinical data fields may ultimately constrain RWE generation. In recent years, natural language processing (NLP) techniques based on transformer language models have shown great potential for automating this labor-intensive process. 7 Transformer models can be broadly categorized into encoders such as Bidirectional Encoder Representations from Transformers (BERT) typically used for discriminative tasks like classification and named entity recognition (NER), and decoders (such as GPT and Llama) and encoder-decoders typically used for tasks requiring text generation.
Recent decoder models typically have at least 1 billion parameters compared to earlier encoder models (e.g., BERT with 300 million parameters). Until recently, NER-based IE using BERT models has been a popular NLP approach due to their strong performance across many tasks. 8 – 10 The effectiveness of BERT models for clinical IE has previously been demonstrated in tasks like identifying drug-related adverse events, extracting clinical symptoms and real-world disease outcomes. 11 – 13 Their success notwithstanding, BERT models pose certain limitations to investigators. BERT models, while highly parameterized compared to traditional NLP techniques, still have fewer parameters than most of the newer LLMs and often require task-specific training on manually labeled datasets. 14 , 15 Also, the narrower context window of most BERT models limits their ability to infer concepts that are typically presented over multiple sentences or paragraphs, such as transplant eligibility. 16 More recent generative LLMs have more parameters and larger context windows. When pre-trained on large corpora, these newer LLMs promise high off-the-shelf performance in clinical IE tasks without task-specific training, particularly for complex concepts. 17 These advances in LLMs thus hold promise to accelerate development of high quality real-world datasets by mitigating the need for extensive manual data curation and fine-tuning. Early applications of generative LLMs for clinical IE predominantly use Flan and GPT family models, and demonstrate wide-ranging performance (accuracy and F1 statistics both ranging from low 40s to high 90s percentage), primarily due to three contributing factors. 18 – 23 First, complex data fields, such as adverse social determinants of health, challenge all NLP techniques including LLMs. 24 , 25 Second, model size, which impacts pre-training capacity, drives off-the-shelf performance of LLMs.
And finally, reasoning-based prompting techniques like chain-of-thought purport to help a generally pre-trained LLM handle specialized tasks better than zero-shot learning. 26 , 27 Investigators planning RWD generation from unstructured text can find newer LLMs to be a powerful tool, but the lack of comparability between published studies hinders generalization of results to inform IE strategy. Off-the-shelf accuracy of LLMs is unpredictable, obfuscating the time and effort required to produce a reliable data field. Although larger LLMs achieve better results, they require more time to run, can be costly, and require substantial up-front investment in computational infrastructure. Also, prompt engineering is an art more than a science, with few guardrails to guide users on improving performance. Instead of defaulting to a single NLP technique for all data fields, efficient data generation balances accuracy versus time and resource demands. 28 Our objective in this research is to quantify the impact of IE design choices and provide investigators with an approach to tailoring efficient LLM usage. Ultimately, we hope this will accelerate generation of high-quality MM RWD. In this research, we utilize several NLP models (Llama 3 8B, Llama 3 70B, and BERT) in workflows to extract clinical information from the EHR notes of patients diagnosed with MM. We evaluate NLP workflow performance against a manually abstracted reference dataset. We analyze the impact of text ambiguity, model size and chain-of-thought prompting on newer LLM performance. Finally, we use the best NLP workflow to extract three data fields for all recent notes in select patient charts, demonstrating enhanced information availability using the entirety of the observable patient journey. Methods Data source & selection This study analyzed de-identified EHR data from a network of tertiary clinical centers tied to an academic medical center in the United States through the nference nSights Analytics Platform.
29 nference, in collaboration with the academic medical center data partner (AMC) that provided the de-identified data for this study, has established a secure data environment, hosted by and within the AMC, that houses the AMC’s de-identified patient data. The provisioning of and access to this data are governed by an expert determination that satisfies the HIPAA Privacy Rule requirements for the de-identification of protected health information. Each AMC’s de-identified data environment is specifically designed and operated to enable access to and analysis of de-identified data without the need for Institutional Review Board (IRB) oversight, approval, or an exemption confirmation. Given these measures, informed consent and IRB review were not required for this study. Patient population and selected notes A cohort of MM patients was created and notes for IE were sampled from their charts. The study cohort (n=3,793) included patients with MM diagnosis and treatment between January 1, 2019, and March 31, 2024. Patients with MM were identified using two occurrences of diagnosis codes of 203.0* (ICD-9), C90, and C90.0* (ICD-10). The first occurrence of MM diagnosis code was considered the diagnosis date. Patients were required to have at least one encounter recorded within six months of the diagnosis date and another encounter six months after the diagnosis date. The notes database (n = 250 notes) was created by selecting one instance of provider-documented unstructured text associated with an encounter, pathology testing, or imaging study, from the charts of 250 patients randomly selected from the study cohort. Study cohort and note selection criteria are available in Supplementary Note 1. This notes database was randomly divided into development and test sets of 125 notes each. Clinical concepts and data fields Thirteen MM-related data fields were chosen (see Table 1 ).
All fields were text-based, provider-documented values (i.e., not derived from structured data fields such as ICD codes, medications, or timestamps). View this table: View inline View popup Table 1. Data fields of interest contained in HCP notes Manual data curation A reference dataset for the 13 data fields was created for prompt development and NLP workflow testing. The reference dataset was created by two independent abstractors with arbitration by a third abstractor. Inter-rater reliability (IRR) was calculated to approximate the ambiguity of the information available. IRR was assessed in two ways. First, Krippendorff’s α (K-α) was calculated. An overall average K-α of ≥0.8 for the test dataset was considered acceptable. 30 Secondly, the agreement between each abstractor and the final arbitrated label was evaluated using an F 1 score, which was then averaged across the two abstractors. This F 1 -based IRR (subsequently referred to as IRR-F 1 ) was used as a proxy for data field ambiguity in statistical analysis. Supplementary Note 2 contains abstraction protocol and metrics (K-α and F 1 score for each data field, class distribution) and low-count class combinations. NLP-based methods for information extraction Five LLM workflows for extracting information on the selected data fields were developed: four workflows utilizing Llama models and one workflow utilizing BERT. Llama workflows Meta’s Llama 3 family of models provided an open-source, privately deployable LLM with small and medium-sized model sizes. 31 The small model (8 billion parameters) has lower performance on all benchmarks assessed at release but is more computationally facile. 31 The medium model (70 billion parameters) offers advanced natural language capabilities, more promising for challenging clinical text, but required more computational infrastructure to run. 31 (A 405 billion parameter model was released after the start of research and thus not included.) 
The small (8B) and medium (70B) models were run in private compute clusters to protect deidentified clinical text. Performance evaluations were done using a temperature of 0.1 and a top-K (the number of highest probability token options used for sampling) value of 1 for reproducible results. Sensitivity analysis was performed for additional top-K and temperature values. Zero-shot-learning (ZSL) was selected for default performance and chain-of-thought (CoT) prompting was compared as a common reasoning-method technique. Bespoke prompts for each data field were systematically designed using the development set. Prompts were refined based on errors generated until achieving average abstractor F 1 for the respective field or when subsequent modifications did not yield performance improvements. Final prompts are available in Supplementary Note 3. The model was prompted to return a JSON-structured response. The syntax of JSON output was parsed and standardized to pre-specified labels using post-processing logic. These four Llama workflows (i.e., model-prompt combination) are referenced throughout using respective model size and prompting technique (e.g., 70B-CoT for Llama 3 70B with chain-of-thought prompting). BERT workflow A BERT workflow, earlier developed for a broad range of biomedical tasks on nference data, was deployed as the baseline technique. A pipeline of six proprietary BERT-based classification models includes: (1) named entity recognition model trained to detect 27 entity types; (2-4) qualifier models: subject, temporality and certainty models; (5) concept association model for “problem-location”, “problem-severity”, “lab data-value”; and (6) date association model for “variable-date” entity pairs. These proprietary models are fine-tuned versions of SciBERT cased 32 (basis for models 1-5 specified above) and ClinicalBERT 33 (basis for model 6).
The base models underwent further supervised fine-tuning for IE tasks on annotated sentences from clinical document texts of the nference nSights database, but not specifically on MM patient note database. For each data field, relevant synonyms derived from tokenization of the development set were curated and incorporated. The BERT pipeline output was refined using regular expression models and business rules developed using the development set. BERT workflow development approach, performance metrics, and rules are available in Supplementary Note 4. Statistical analysis Macro-F 1 scores were used to evaluate the performance of the NLP workflows. For multi-label fields such as dates, macro-F 1 score was substituted with a weighted F 1 , calculated as defined in Supplementary Note 1. Spearman’s rho was calculated for numeric fields. Visualization and exploratory data analysis were conducted in Python (3.10.6) using pandas (2.2.2), numpy (1.26.4), scipy, matplotlib (3.9.1), seaborn (0.13.2), and plotly (5.23.0) ( https://plot.ly/ ). 34 – 39 Statistical analysis of LLM workflow performance for each data field included pairwise comparisons and, for the four Llama workflows, ANOVA. Independent variables included in the ANOVA were: model size, prompt design style, data type (numeric, binary, or categorical), and IRR-F 1 (the latter two capturing data ambiguity). Data ambiguity metrics reflecting development and test set were used. Statistical analysis was performed in RStudio (2023.06.1), R (4.3.1) and visualizations were performed in ggplot2 (3.5.1). 40 , 41 Evaluation of LLM-extracted events against structured diagnosis dates The best-performing Llama workflow for three data fields ( MM type, transplant status , and extramedullary disease ) was deployed on routine clinical notes within 120 days of MM diagnosis date as determined by ICD codes.
Extracted labels for 200 randomly selected patients with routine clinical documents were plotted on the timelines to evaluate frequency, distribution, and timing of the label occurrences around the structured diagnosis date. Results Reference datasets for performance evaluation The test dataset of 125 notes (median length of 977 words, IQR 576-289), representing 125 unique patients with MM, was annotated for the selected data fields (100 notes annotated for all data fields, 25 notes for first-line therapy related data fields: FLT regimen, FLT response, FLT response date ). Very few notes (<10%) contained information for FLT response date ; this data field was excluded from further analysis. For the 12 data fields analyzed, the average Krippendorff’s α was α=0.77 for test and development set (0.83 for test set only). Average IRR-F 1 was F 1 =0.74 for test and development set (0.79 for test set only). Values by data field are shown in Supplementary Table 2. Comparison of workflow performance In total, five LLM workflows (four Llama-based, and one BERT-based) were deployed. Performance of each workflow is shown in Figure 1a . Spearman’s rank correlation coefficients were calculated for continuous data fields ( ECOG score, Plasmacytosis percentage, MM diagnosis date , and FLR start date ) (see Figure 1b ). Sensitivity analysis for the best-performing Llama workflow by data field is available in Supplementary Note 5. Download figure Open in new tab Figure 1: (a) F 1 score by workflow and data field; (b) Spearman’s rho for numeric data fields. F 1 statistics ranged widely by workflow and data field (F 1 = 0.32-0.99). Llama 70B-CoT had the highest performance on seven out of 12 data fields overall. Notably, the BERT workflow outperformed Llama 8B with both prompt types for three fields, and outperformed 8B-ZSL in four additional fields.
For data fields with continuous values, Spearman rank correlations of predictions with the reference data showed mostly near-perfect prediction by 70B workflows. Figure 2 demonstrates the F 1 statistics distribution for each Llama workflow compared to BERT workflow. Larger model size and CoT prompting improve performance compared to BERT workflow more consistently. Download figure Open in new tab Figure 2. Llama vs BERT workflow F 1 by data field. R: Pearson correlation coefficient, p: p-value. Drivers of Llama workflow performance ANOVA and pairwise analysis (see Table 2 and Table 3 ) were used to quantify the impact of model size, prompting technique, and data field characteristics (type and IRR-F 1 ) on Llama workflow F 1 statistics. View this table: View inline View popup Download powerpoint Table 2. ANOVA test for predictors of Llama workflow F 1 View this table: View inline View popup Download powerpoint Table 3a. Univariate testing of predictors of Llama workflow F 1 View this table: View inline View popup Download powerpoint Table 3b. Pairwise comparison of data field type on Llama workflow F 1 All variables significantly impact Llama workflow performance (ANOVA p < 0.05). IRR-F 1 , capturing data field ambiguity, has the largest impact on workflow F 1. Increasing model size (from Llama-8B to Llama-70B) has twice the impact on improving F 1 as using CoT prompting instead of ZSL. Comparing NLP workflow performance by inter-rater reliability score As shown in Figure 3 , IRR-F 1 varied across data fields from 0.61 to 0.88. For all data fields, a larger model size consistently improved LLM workflow performance, although the improvement magnitude was generally higher for more ambiguous data fields (lower IRR-F 1 ). Response to CoT prompting was heterogeneous: three data fields ( Bone lesion presence, MM diagnosis, and MM status ) experienced worse performance with CoT than with ZSL; Plasmacytosis percentage was essentially equivalent between the two methods.
Worse performance with CoT prompting occurred more often in data fields demonstrating low ambiguity (high IRR-F 1 , greater than 0.75). More ambiguous data fields (low IRR-F 1 , under 0.75) were substantially helped by CoT prompting. Download figure Open in new tab Figure 3. LLM workflow F 1 in comparison to inter-rater agreement measured by IRR-F 1 . Green-highlighted cells indicate F 1 statistic over 0.75. Exploratory analysis of LLM-extracted events timing vs. structured diagnosis date Labels for MM status, Transplant eligibility & status , and EMD presence extracted by the top-performing LLM in a 200-patient subcohort were used to create patient timelines. Figure 4a shows patient-level timelines for 20 illustrative patients (to protect patient-level data), demonstrating the recurrence of labels in a ±120-day window around the MM diagnosis date. Figure 4b shows the aggregated distribution of first label occurrence in each data field class for the entire 200-patient cohort. The most frequent labels occur within a narrow window around the diagnosis date. “Newly diagnosed” and “EMD present” labels cluster most closely to the structured diagnosis date, versus other labels which show more dispersed incidence. Review by authors with medical training (ASB, AOD, AK, MM, SKR, GV) concluded that these distributions were clinically reasonable based on timing of diagnostic testing and documentation. Download figure Open in new tab Download figure Open in new tab Figure 4. (a) Illustrative synthetic patient-level timelines, synthesized using the event rates observed in the 200-patient sample, show extracted labels categorized by type of MM (diamonds), transplant eligibility (squares), and EMD presence (circles), plotted by days relative to the index date (vertical dashed line). First occurrences are colored, and subsequent duplicates are grey. 
(b) Histograms summarize the timing of first label occurrence, highlighting median extraction times and clustering patterns around the index date. Discussion To accelerate RWD generation, time and resource efficiency in information extraction from unstructured text is paramount. LLMs are increasingly used to improve clinical IE efficiency but must be deployed judiciously. We tested five NLP workflows for clinical IE to understand performance variability and infer appropriate selection of NLP technique, model size, and prompt design. Our workflows show variability across 12 data fields (F 1 = 0.59—0.99 for the best NLP workflow). Subsequent statistical analysis of Llama workflow indicated that inter-rater reliability, model size, and prompt design were all significantly associated with performance (ANOVA p < 0.05). All abstraction techniques, both human and machine, are challenged by data ambiguity. Even rigorously trained human abstractors err at a rate of 1-2%; to correct for this, double abstraction with expert arbitration is the gold standard. 42 We use this human error rate, quantified as IRR-F 1 , as a surrogate for data ambiguity in analyzing Llama workflow performance. IRR-F 1 was significantly associated with Llama workflow performance (ANOVA p = 3.20 × 10 −5 ): greater human accuracy predicted higher performance with a point-for-point improvement in F 1 statistic (OLS p = 9.87 x 10 -4 ). Since reference dataset creation is a standard NLP workflow development step, IRR-F 1 can be calculated readily. We suggest IRR-F 1 as a novel guide for selecting IE approach. Our data shows that IRR-F 1 of 0.75 is a reasonable threshold for flagging a challenging data field. Challenging fields, with IRR-F 1 under 0.75, tend to see a greater performance gain in F 1 when a larger LLM and CoT prompting is used, compared to the impact seen in simple data fields. 
In simple fields, with IRR-F 1 over 0.75, NLP workflow performance is typically higher and there is less improvement with more powerful IE approaches. By using this rule-of-thumb, researchers can increase efficiency in RWD generation without sacrificing data quality. As expected, larger LLM size is significantly associated with higher F 1 statistics (ANOVA p = 1.36 × 10 −3 ). Llama 3-70B performs better and more consistently across data fields than Llama 3-8B. The impact is smaller for simple fields, especially numeric or binary ones, which were satisfactorily extracted by Llama 3-8B and BERT workflows. Larger LLMs, with higher resource requirements, can be reserved for challenging fields where their value will be most realized. Data field type indicated categorical variables were significantly more challenging for NLP models (ANOVA p = 1.52 x 10 -4 for type variable; p = 0.01 for pairwise comparison of categorical versus binary or numeric), thus categorical simple fields merit second priority for resource intensive approaches. Researchers without access to a larger model may want to consider model fine-tuning or manual abstraction for challenging fields. While CoT prompt design was expected to improve LLM performance, study results indicate more nuanced application of CoT prompting for best results. Challenging fields experience significant improvement with CoT prompting (ANOVA p = 3.12 × 10 −2 ), which should be the preferred approach. Simple fields generally show minimal improvement with CoT prompting and occasional worse results which we hypothesize is due to LLM “over-thinking”. We recommend using ZSL prompting as a default in simple fields, with selective exploration of where CoT might improve results. Figure 5 summarizes our recommendation for prioritizing resources for data field extraction. Download figure Open in new tab Figure 5. Evaluation criteria for designing IE workflow by data field. 
Finally, we constructed patient journey timelines for clinical concepts of interest. We demonstrate that patient journey timelines can provide a sense check for IE performance. Furthermore, the timelines reveal label redundancy in patient charts. Clinical documentation is inherently incomplete, and clinical characteristics evolve along the clinical journey, e.g., a radiology report may not mention transplant status; disease status may change from newly diagnosed to relapsed. NLP tool optimization will not surmount information missingness from a single note or changes over time. Patient journey timelines with repetition of data labels could help solve the uncertainty or incompleteness of any single note when generating labels for a real-world data set. Deploying NLP models over a patient record, rather than a single note, and deriving a consensus label within a timeframe could produce more accurate labelling. Development of a logical method for using information repetition demonstrated along the patient journey to create higher-accuracy real-world datasets would extend this research. This analysis has two limitations inherent to LLM utilization. LLM performance is highly sensitive to prompt design, which is difficult to quantify, and to model pre-training, which here is limited to the corpora used for Llama models. We could not exhaustively test all models or potential prompts; demonstrated applications of this proposed approach using different NLP models and prompts will support generalization. Conclusion This study analyzes the performance of various NLP models (BERT-based and foundation open-source LLMs of different sizes) in extracting data fields relevant to MM for RWD generation. We show that inter-rater reliability is the largest driver of NLP performance.
We propose using an inter-rater reliability metric as a novel guide to efficiently approach information extraction for real-world dataset creation for time and resource allocation without sacrificing data quality. Authors ASB, AOD, DL, MM, GV, AJV, and JG were instrumental in the conception and design of the research. SKR and GV developed the abstraction guidelines and trained the abstractors, with ASB and AK reviewing the guidelines. SKR served as the lead abstractor and played a key role in ground truth dataset generation. MM designed and conducted the BERT-based information extraction, while AA and GV designed and conducted the LLM-based information extraction. DL and AOD performed the formal statistical analysis of the results. ASB and AJV provided high-level guidance, supervision, and resources. ASB, AOD, AK, MM, GV, and SKR contributed to the exploratory analysis and interpretation of patient record-level LLM-based extractions. DL and GV created the figures. AOD, GV, and MM drafted the manuscript, with ASB, AJV, DL, AA, JG, and AK critically revising it. All authors read and approved the final manuscript and supplementary materials. AOD, DL, and GV are co-first authors, and ASB and AJV are co-corresponding authors. Conflict of Interest Statement The work was sponsored by Johnson & Johnson. ASB, AOD, JG, DSL, and AK are current employees of Johnson & Johnson. ASB, JG, DSL, and AK are minor stockholders of Johnson & Johnson. AOD is a former employee of nference and a minor stockholder of an nference subsidiary. AA, MM, SKR, GV, and AJV are current employees and minor stockholders of nference. Data availability This study involves the analysis of de-identified Electronic Health Record (EHR) data via the nference Analytics Platform. The data shown and reported in this manuscript was extracted from this environment using an established protocol for data extraction, aimed at preserving patient privacy. 
The data has been de-identified pursuant to an expert determination in accordance with the HIPAA Privacy Rule. Any data beyond what is reported in the manuscript, including but not limited to the raw EHR data, cannot be shared or released due to the parameters of the expert determination to maintain data de-identification. For additional details regarding the nference Analytics Platform, please contact the corresponding authors. Code availability Annotation protocol, Llama workflow prompts, and BERT workflow rules and model performance metrics are available in Supplementary Materials. Supplementary materials Supplementary Note 1: Data selection and evaluation metrics Supplementary Note 2: Abstraction protocol and metrics Supplementary Note 3: Llama workflow prompts Supplementary Note 4: BERT workflow rules and model performance metrics Supplementary Note 5: Llama workflow sensitivity analysis Acknowledgements The authors thank Akash Anand (nference) for contributing to the study design and development of BERT and LLM workflows; Praveen Kumar M (nference) for contributing to development of the abstraction guidelines and interpretation of patient record-level LLM-based extractions and manuscript review; Purushotham Sinha (nference) for contributing to setting up the LLM private inference infrastructure and supplementing the analysis; Sai Hanitha and Poorvika Babu (both of nference) for contributing as independent abstractors for dataset creation; Ajit V Rajasekharan (nference); Tommaso Mansi (Johnson & Johnson) for reviewing the manuscript; and Venky Soundarajan (nference) for providing high-level guidance, supervision, and resources. References 1. ↵ Richardson , P. G. et al. Interpreting clinical trial data in multiple myeloma: translating findings to the real-world setting . Blood Cancer J 8 , ( 2018 ). 2. Bertamini , L. , Bertuglia , G. & Oliva , S. Beyond Clinical Trials in Patients With Multiple Myeloma: A Critical Review of Real-World Results . 
Front Oncol 12 , ( 2022 ). 3. Eichler , H. et al. Randomised controlled trials versus real world evidence: neither magic nor myth . Clin Pharmacol Ther ( 2020 ). doi: 10.1002/cpt.2083 OpenUrl CrossRef PubMed 4. ↵ Sherman , R. E. et al. Real-World Evidence — What Is It and What Can It Tell Us? N Engl J Med 2293 – 2297 ( 2016 ). doi: 10.1056/NEJMsb1609216 OpenUrl CrossRef 5. ↵ Kumar et al. International Myeloma Working Group consensus criteria for response and minimal residual disease assessment in multiple myeloma . Lancet Oncol 17 , E328 – E346 ( 2016 ). OpenUrl PubMed 6. ↵ Rajkumar et al. International Myeloma Working Group updated criteria for the diagnosis of multiple myeloma . Lancet Oncol 15 , E538 – E548 ( 2014 ). OpenUrl PubMed 7. ↵ Vaswani , A. Attention is all you need . Advances in Neural Information Processing Systems ( 2017 ). 8. ↵ Wu , S. et al. Deep learning in clinical natural language processing: a methodical review . J Am Med Inform Assoc 27 , 457 – 470 ( 2020 ). OpenUrl CrossRef PubMed 9. Li , I. et al. Neural Natural Language Processing for unstructured data in electronic health records: A review . Computer Science Review 46 , ( 2022 ). 10. ↵ Fu et al. Clinical concept extraction: A methodology review . Journal of Biomedical Informatics 109 , ( 2020 ). 11. ↵ Varma , G. et al. A deep learning–enabled workflow to estimate real world progression-free survival in patients with metastatic breast cancer . J Clin Oncol 42 , 11176 ( 2024 ). OpenUrl 12. Wagner , T. et al. Augmented curation of clinical notes from a massive EHR system reveals symptoms of impending COVID-19 diagnosis . Elife 9 , ( 2020 ). 13. ↵ Barman , H. et al. Identification and Characterization of Immune Checkpoint Inhibitor– Induced Toxicities From Electronic Health Records Using Natural Language Processing . JCO Clin Cancer Inform 8 , E2300151 ( 2024 ). OpenUrl 14. ↵ Devlin , J. Bert: Pre-training of deep bidirectional transformers for language understanding . 
arXiv preprint arXiv 1810 , ( 2018 ). 15. ↵ Liu , Y. Roberta: A robustly optimized bert pretraining approach . arXiv preprint arXiv 1907 , 364 ( 2019 ). OpenUrl 16. ↵ Jaiswal , A. & Evangelos Milios . Breaking the Token Barrier: Chunking and Convolution for Efficient Long Text Classification with BERT . arXiv preprint arXiv 2310 , ( 2023 ). 17. ↵ Li , L. & et al. A scoping review of using large language models (llms) to investigate electronic health records (ehrs) . arXiv preprint arXiv 2405 , ( 2024 ). 18. ↵ Chiang , C. et al. A large language model–based generative natural language processing framework fine-tuned on clinical notes accurately extracts headache frequency from electronic health records . Headache 64 , 400 – 409 ( 2024 ). OpenUrl CrossRef PubMed 19. Choi , H. S. , Song , J. Y. , Shin , K. H. , Chang , J. H. & Jang , B.-S. Developing prompts from large language model for extracting clinical information from pathology and ultrasound reports in breast cancer . Radiat Oncol J 41 , 209 – 216 ( 2023 ). OpenUrl CrossRef PubMed 20. Lee , D. T. & et al. Development of a privacy preserving large language model for automated data extraction from thyroid cancer pathology reports . medRxiv 11 – 2023 ( 2023 ). 21. Fink , M. A. et al. Potential of ChatGPT and GPT-4 for Data Mining of Free-Text CT Reports on Lung Cancer . Radiology 308 , ( 2023 ). 22. Hu , D. , Liu , B. , Zhu , X. , Lu , X. & Wu , N. Zero-shot information extraction from radiological reports using ChatGPT . Int J Med Inform 183 , ( 2024 ). 23. ↵ Alsentzer , E. et al. Zero-shot interpretable phenotyping of postpartum hemorrhage using large language models . NPJ Digit Med 6 , ( 2023 ). 24. ↵ Mahbub , M. & et al. Leveraging Large Language Models to Extract Information on Substance Use Disorder Severity from Clinical Notes: A Zero-shot Learning Approach . arXiv preprint arXiv 2403 , ( 2024 ). 25. ↵ Guevara , M. et al. 
Large language models to identify social determinants of health in electronic health records . NPJ Digit Med 7 , ( 2024 ). 26. ↵ Sivarajkumar , S. , Kelley , M. , Samolyk-Mazzanti , A. , Visweswaran , S. & Wang , Y. An Empirical Evaluation of Prompting Strategies for Large Language Models in Zero-Shot Clinical Natural Language Processing: Algorithm Development and Validation Study . JMIR Med Inform 12 , E55318 ( 2024 ). OpenUrl 27. ↵ Li , M. , Zhou , H. , Yang , H. & Zhang , R. RT: a Retrieving and Chain-of-Thought framework for few-shot medical named entity recognition . J Am Med Inform Assoc 31 , 1929 – 1938 ( 2024 ). OpenUrl PubMed 28. ↵ Tavabi , N. , Singh , M. , Pruneski , J. & Kiapour , A. M. Systematic evaluation of common natural language processing techniques to codify clinical notes . PLoS One 19 , E298892 ( 2024 ). OpenUrl 29. ↵ Murugadoss et al. Building a best-in-class automated de-identification tool for electronic health records through ensemble learning . Patterns 2 , ( 2021 ). 30. ↵ Beckler , D. T. , Thumser , Z. C. , Schofield , J. S. & Marasco , P. D. Reliability in evaluator-based tests: using simulation-constructed models to determine contextually relevant agreement thresholds . BMC Med Res Methodol 18 , ( 2018 ). 31. ↵ Meta . Introducing Meta Llama 3: The most capable openly available LLM to date . Introducing Meta Llama 3: The most capable openly available LLM to date ( 2024 ). 32. ↵ Beltagy , I. , Lo , K. & Cohan , A. SciBERT: A pretrained language model for scientific text . arXiv [cs ( 2019 ). 33. ↵ Huang , K. , Altosaar , J. & Ranganath , R. ClinicalBERT: Modeling clinical notes and predicting hospital readmission . arXiv [cs ( 2019 ). 34. ↵ Pandas: Data structures for statistical computing in python, McKinney, Proceedings of the 9th Python in Science Conference . 445 , 35. Python Reference Manual. Python : Van Rossum , G. & Drake , F L ( 1995 ). 36. Harris , C. R. et al. Array programming with NumPy . Nature 585 , 357 – 362 ( 2020 ). 
OpenUrl CrossRef PubMed 37. Virtanen , P. et al. SciPy 1.0: fundamental algorithms for scientific computing in Python . Nature Methods 17 , 261 – 272 ( 2020 ). OpenUrl CrossRef PubMed 38. Hunter , J. D. Matplotlib: A 2D Graphics Environment . Comput Sci Eng 9 , 90 – 95 ( 2007 ). OpenUrl CrossRef PubMed 39. ↵ Waskom , M. seaborn: statistical data visualization . J Open Source Softw 6 , 3021 ( 2021 ). OpenUrl CrossRef 40. ↵ Irizarry , R. A. Ggplot2 . Introduction to Data Science 107 – 125 ( 2024 ). 41. ↵ R: A language and environment for statistical computing . Foundation for Statistical Computing , Vienna, Austria ( 2013 ). 42. ↵ Garza MY et al. Error Rates of Data Processing Methods in Clinical Research: A Systematic Review and Meta-Analysis of Manuscripts Identified Through PubMed . Res Sq [Preprint] 3 ( 2023 ). doi: 10.21203/rs.3.rs-2386986/v2 OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted December 20, 2024. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Automated abstraction of clinical parameters of multiple myeloma from real-world clinical notes using large language models Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Automated abstraction of clinical parameters of multiple myeloma from real-world clinical notes using large language models Alana O’Brien Del Campo , Dmytro Lituiev , Gowtham Varma , Mithun Manoharan , Sunil Kumar Ravi , Avinash Aman , Ankit Kansagra , Joel Greshock , AJ Venkatakrishnan , Ashita Batavia medRxiv 2024.12.17.24318605; doi: https://doi.org/10.1101/2024.12.17.24318605 Share This Article: Copy Citation Tools Automated abstraction of clinical parameters of multiple myeloma from real-world clinical notes using large language models Alana O’Brien Del Campo , Dmytro Lituiev , Gowtham Varma , Mithun Manoharan , Sunil Kumar Ravi , Avinash Aman , Ankit Kansagra , Joel Greshock , AJ Venkatakrishnan , Ashita Batavia medRxiv 2024.12.17.24318605; doi: https://doi.org/10.1101/2024.12.17.24318605 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Hematology Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (303) Cardiovascular Medicine (4457) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (610) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15244) Forensic Medicine (30) Gastroenterology (1132) Genetic and Genomic Medicine (6620) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4557) Health Policy (1372) Health Systems and Quality Improvement (1615) Hematology (543) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15936) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6634) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3348) Ophthalmology (980) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) 
Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5463) Public and Global Health (9257) Radiology and Imaging (2210) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1198) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (532) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a03414c2b925e2c5',t:'MTc4MDA0MjgzOQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-23T02:00:01.238055+00:00

License: CC-BY-ND-4.0