A Standard Framework for Converting Coronary Angiography Reports into Machine-Readable Format Using Large Language Models

doi:10.1101/2025.05.03.25326846

A Standard Framework for Converting Coronary Angiography Reports into Machine-Readable Format Using Large Language Models

2025 · doi:10.1101/2025.05.03.25326846

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 43,172 characters · extracted from preprint-html · click to expand

A Standard Framework for Converting Coronary Angiography Reports into Machine-Readable Format Using Large Language Models | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search A Standard Framework for Converting Coronary Angiography Reports into Machine-Readable Format Using Large Language Models View ORCID Profile Ji Woo Song , View ORCID Profile Ji Yong Jang , View ORCID Profile Hyeongsoo Kim , View ORCID Profile Young-Guk Ko , View ORCID Profile Seng Chan You doi: https://doi.org/10.1101/2025.05.03.25326846 Ji Woo Song 1 Yonsei University College of Medicine , Seoul, Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ji Woo Song Ji Yong Jang 2 Division of Cardiology, National Health Insurance Service Ilsan Hospital , Goyang, Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ji Yong Jang Hyeongsoo Kim 2 Division of Cardiology, National Health Insurance Service Ilsan Hospital , Goyang, Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hyeongsoo Kim Young-Guk Ko 3 Department of Cardiology, Severance Hospital, Yonsei University College of Medicine , Seoul, Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Young-Guk Ko Seng Chan You 4 Institute for Innovation in Digital Health, Yonsei University , Seoul, South Korea 5 Department of Biomedical Systems Informatics, Yonsei University College of Medicine , Seoul, Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Seng Chan You For correspondence: chandryou{at}yuhs.ac Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF ABSTRACT Background and Objectives Coronary angiography (CAG) reports contain many details about coronary anatomy, lesion characteristics, and interventional procedures. However, their free-text format limits their research utility. Therefore, we sought to develop and validate a framework leveraging large language models (LLMs) to convert CAG reports automatically into a standardized structured format. Methods Using 50 CAG reports from a tertiary hospital, we developed a multi-step framework to standardize and extract key information from CAG reports. First, a standard annotation schema was developed by cardiologists. Thereafter, an LLM (GPT-4o) converted the free-text CAG reports into the hierarchical annotation schema in a standardized format. Finally, clinically relevant information was extracted from the standardized schema. One hundred CAG reports from each of two hospitals were used for internal and external test, respectively. The 12 key information points included four CAG-related (previous stent information, lesion characteristics, and anatomical diagnosis) and eight percutaneous coronary intervention (PCI)-related key points (complex PCI criteria and current stent information). For internal test, two interventional cardiologists independently extracted information, with discrepancies resolved through consensus, as reference standard. Results Based on the reference standard, the proposed framework demonstrated superior accuracy for CAG-related (99.5% vs. 91.8%; p < 0.001) and comparable accuracy for PCI-related key points (98.3% vs. 97.4%; p = 0.512) in the internal test. External test confirmed high accuracy for both CAG-(96.2%) and PCI-related key points (99.4%). Conclusions This framework demonstrated excellent accuracy in standardizing free-text CAG reports, potentially enabling more efficient utilization of detailed clinical data for cardiovascular research. Author’s Summary The novel framework that standardizes CAG report is a practical solution to a significant challenge in cardiovascular research — the efficient utilization of detailed procedural and anatomical information untapped in free-text CAG reports. Our framework could enable systematic analysis of large-scale coronary intervention outcomes, reduce the burden of cardologists’ clinical trial recruitment, and support evidence-based clinical decision-making. Introduction Coronary angiography (CAG) reports contain information about coronary artery disease and percutaneous coronary intervention (PCI), providing invaluable data for clinical research. With continued growth in the secondary use of clinical data, observational studies on coronary artery diseases using electronic health records (EHR) continue to increase. However, many of these studies are conducted without the use of detailed information on complex coronary anatomy or procedures. 1 – 3 CAG reports are limited by their unstructured, free-text format, which makes it difficult to search, analyze, and process data consistently. 4 To standardize complex medical information in free-text medical records, we have previously proposed Staged Optimization of Curation, Regularization, and Annotation of clinical text (SOCRATex). This framework initially requires domain experts to define a standardized schema that specifies how clinical information should be organized. Thereafter, experts manually review clinical notes and annotate relevant information according to this predefined schema to create standardized, machine-readable data. 5 Although this systematic approach effectively converts unstructured clinical notes into analyzable data, it involves a time-consuming manual annotation process and requires significant expert involvement, hampering its large-scale implementation. 6 Recent advancements in large language models (LLMs), such as ChatGPT, a type of artificial intelligence (AI) used for understanding and processing free-text, can structure vast amounts of free-text within EHRs with minimal programming effort. 7 Various studies have reported using LLMs to convert free-text radiology and pathology reports into structured formats, highly accurately. 8 – 10 However, LLM application in transforming cardiology-related procedure reports into machine-readable formats, particularly CAG reports with their complex procedural details, remains unexplored. Therefore, building upon our previous work on hierarchical annotation, this study sought to develop a standard framework for the automated transformation of CAG reports into machine-readable formats using an LLM, based on hierarchical annotation. We then compared the accuracy of the extracted information from this machine-readable format with the accuracy of manual review by cardiologists. METHODS DATASET CAG studies of adults (>18-years-old), performed from January 1, 2009, to December 31, 2023 (15 years), at Severance Hospital, a tertiary hospital, were retrieved. From these, 50 and 100 CAG reports were randomly selected for the training and internal test sets, respectively. Following the Society for Cardiovascular Angiography and Interventions’ expert consensus, 11 these CAG reports included the coronary anatomy and described lesions with their location, severity, and morphological characteristics. PCI-related information, including the types and sizes of catheters, balloons, stents, and adjuvant devices, was thoroughly detailed in the CAG report. All reports were written in English, with rare exceptions of Korean texts describing emergency situations during procedures. Furthermore, 100 random CAG reports from the National Health Insurance Service Ilsan Hospital, a secondary hospital, obtained between January 1, 2023, and December 31, 2023, were retrieved as an external test set. This study was approved by the Institutional Review Board (IRB) of Severance Hospital (IRB No. 2024-1928-002), Seoul, Korea, and the IRB of the National Health Insurance Service Ilsan Hospital (IRB No. 2024-09-006). The requirement for informed patient consent was waived by the IRBs due to the retrospective nature of the study. FRAMEWORK OVERVIEW Figure 1 illustrates our CAG report standardization framework. After development of a hierarchical annotation schema by experience cardiologists (detailed below), the framework consisted of standardization and extraction steps. In the standardization step, an LLM converted free-text CAG reports into a standardized format based on the expert-defined schema. Thereafter, an automated extraction algorithm extracted the required clinical information. Figure 2 details the development and validation of the framework, including iterative refinement with interventional cardiologists to ensure clinical accuracy and reliability. Download figure Open in new tab Figure 1. Overview of the two-step framework for free-text CAG report standardization and data extraction The framework consists of two primary steps: (1) standardization and (2) extraction. In the standardization step, free-text coronary angiography (CAG) reports are converted into a machine-readable hierarchical annotation schema using large language models (LLMs), guided by instruction prompts and optional few-shot prompts. This schema defines the structure and content of the machine-readable format generated in this manner. In the extraction step, 12 key data points are extracted from these machine-readable reports to enable automated and precise data analysis, providing insights into clinical variables, such as lesion characteristics, stent details, and complex percutaneous coronary intervention information. Download figure Open in new tab Figure 2. Overview of the training, internal test, and external test processes of the framework. Fifty and 100 reports from electronic health records of Center 1 were used for training the framework and for evaluating the accuracy of the framework and that of the manual extraction of two cardiologists in an internal validation test, respectively. One hundred reports from electronic health records of Center 2 were used as an external test dataset to evaluate the generalizability of the framework. Center 1, Severance Hospital; Center 2, Ilsan Hospital. LLM, large language model In accordance with the Minimum Reporting Items for Clear Evaluation of Accuracy Reports of Large Language Models in Healthcare (MINI-CLEAR-LLM) checklist, we disclose the details of this study to ensure transparency in utilizing LLMs for healthcare applications in Supplementary Method 1. 12 DEVELOPMENT OF THE HIERARCHICAL ANNOTATION SCHEMA A domain-specific hierarchical annotation schema was collaboratively developed by interventional cardiologists (JYJ) and clinical informatics experts (JWS and SCY). This schema defined how to organize key clinical information from CAG reports systematically, including coronary anatomy, lesion characteristics, and procedural details. Through iterative testing using the training dataset, we refined the schema to ensure comprehensive and accurate capture of complex clinical information. The schema was implemented in JavaScript Object Notation (JSON) format, which enabled efficient organization of multiple clinical events (e.g., multiple lesions or procedures) and flexible representation of clinical details at various levels of granularity. 8 – 10 The full annotation schema is shown in Supplementary Figure 1. DOCUMENT-LEVEL HIERARCHICAL ANNOTATION We developed a prompt 13 to encode domain-specific knowledge, including a Synergy Between Percutaneous Coronary Intervention with Taxus and Cardiac Surgery (SYNTAX) score segmentation system 14 for precise documentation of coronary anatomy; lesion morphology and characteristics based on American College of Cardiology/American Heart Association (ACC/AHA) classification 15 – 17 ; and details of the intervention, procedural complications, and outcomes. We utilized Generative Pretrained Transformer 4 Omni (GPT-4o) as the LLM. 18 The free-text conversion process was initially validated using representative cases from the training dataset that encompassed various clinical scenarios, from simple single-vessel disease to complex multivessel interventions. Among them, eight particularly instructive cases were selected to serve as reference examples, ensuring consistency in the interpretation of complex anatomical and procedural details. While these reference cases were utilized for an internal test, they were not applied in the external test, to avoid institutional bias in reporting styles. The full instruction prompt and few-shot prompts are shown in Supplementary Figure 1 and Supplementary Figure 2, respectively. EXTRACTION AND VALIDATION The extraction process was designed to capture clinically relevant information from the standardized text, with a particular focus on two main categories: CAG-related and PCI-related key information points. For validation test, we pre-defined the following 12 key information points in the extraction process: four CAG-related key points and eight PCI-related key points. The four CAG-related key points included the location of previous stents, previous stent information, the location and type of lesion, and anatomical diagnosis (e.g., two-vessel disease). Previous stent information included the device name, diameter, and length of previous stents. Type of lesion (A, B1, B2, and C) was determined for each lesion according to the ACC/AHA classification. The eight PCI-related key points focused on the six criteria of complex PCI 19 : Multivessel PCI, implantation of ≥ 3 stents; treatment of ≥ 3 lesions; bifurcation PCI using ≥ 2 stents; total stent length > 60 mm; chronic total occlusion (CTO) as the target lesion 20 – 24 ; complex PCI; and current stent information. Complex PCI was defined when any one or more criteria were met in the index PCI. Current stent information included the device name, diameter, and length of current stents. The framework’s accuracy was assessed through both internal and external test. For internal test, two experienced cardiologists (JYJ and HSK) independently extracted the 12 key information points manually. Any discrepancies between the framework’s output and manual review were resolved through discussion to consensus, to establish a reference standard. The framework’s performance was compared with that of the cardiologists using Fisher’s exact test (p-value threshold of 0.05). For external test, cardiologists directly evaluated the framework’s extraction results through thorough inspection. The complete extraction algorithm is explained in Supplementary Method 2 and is available at our public repository: https://github.com/jiuisdisciple/CAGtoJSON . RESULTS COMPOSITION OF TRAINING AND TEST DATASETS Table 1 summarizes the number of reports included in the training, internal test, and external test datasets, based on the year of report writing and whether they included CAG and PCI procedures. In each CAG report, CAG-related and PCI-related information were intermingled. Therefore, the training set of 50 reports included 47 CAG and 27 PCI cases. Among the 100 reports included in the internal test set were 96 CAG and 44 PCI cases. The 100 reports in the external test dataset included 98 CAG and 58 PCI cases. View this table: View inline View popup Download powerpoint Table 1. Characteristics of the training, internal test, and external test datasets. INTERNAL TEST Table 2 and Figure 3 show the accuracy of the framework and the mean accuracy of the two cardiologists in extracting clinical key points from CAG reports in the internal test dataset. Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Figure 3. Bar plot showing the accuracy percentages with 95% confidence intervals for the analysis of 12 key information points by the framework and by two cardiologists in the internal validation process. This figure compares the accuracy of the framework and of the two cardiologists in analyzing 12 key information points during the internal validation process. (A) shows the overall accuracy for four coronary angiography (CAG)-related and eight percutaneous coronary intervention (PCI)-related key points. (B) presents individual accuracy for the four CAG-related key points. (C) displays the accuracy for the eight individual PCI-related key points. Statistical significance is indicated as follows: * for p < 0.05, ** for p < 0.01, and *** for p < 0.001. View this table: View inline View popup Table 2. Accuracy percentages with 95% confidence intervals for the analysis of 12 key information points by the framework and by two cardiologists in the internal test. The framework demonstrated a significantly higher mean accuracy than that of the cardiologists in extracting the four CAG-related key points (99.5% vs. 91.8%, p < 0.001). For individual CAG key points, the framework showed superior accuracy in location of previous stents (100.0% vs. 91.1%, p = 0.001), previous stent information (100.0% vs. 95.3%, p = 0.032), and location and type of lesion (97.9% vs. 80.7%, p < 0.001). Both the framework and cardiologists achieved 100% accuracy for anatomical diagnosis . For the eight PCI-related key points, the framework showed comparable accuracy to the cardiologists across all items, with mean accuracy scores of 98.3% vs. 97.4% (p = 0.512). The framework and the cardiologists performed similarly for each specific point: multivessel PCI (100.0% vs. 100.0%), ≥ 3 lesions treated (97.7% vs. 97.7%), bifurcation PCI with ≥ 2 stents (100.0% vs. 97.7%), ≥ 3 stents implanted (97.7% vs. 97.7%), CTO PCI (100.0% vs. 100.0%), total stent length > 60 mm (97.7% vs. 97.7%), complex PCI (97.7% vs. 98.9%), and current stent information (95.5% vs. 89.8%). All p-values were above 0.05, indicating no significant differences between the framework and the cardiologists. More detailed results are shown in Supplementary Table 1. We performed a qualitative analysis to understand the significant discrepancies in accuracy between the framework and the cardiologists. As shown in Supplementary Table 2, cardiologists often generated errors by mislabeling the location of previous stents, current stents, and lesions. They also frequently misclassified the type of lesions due to missed key descriptors, such as “eccentric” and “ostium”. In contrast, the framework consistently performed precise extraction of information from very complex cases, as shown in Supplementary Figure 3A and 3B EXTERNAL TEST Table 3 shows the accuracy of the framework in extracting clinical key points from CAG reports in the external test dataset. For the four CAG-related key points, the framework achieved a mean accuracy of 96.2%, showing consistently high performance across each item. For the eight PCI-related key points, the framework achieved a mean accuracy of 99.4%, also showing consistently high performance across each item. These results indicated the framework’s robust accuracy in extracting key details from both CAG and PCI data. Supplementary Table 3 shows more detailed results. View this table: View inline View popup Download powerpoint Table 3. Accuracy percentages for the analysis of framework in the external validation test. 12 key information points by the framework in the external validation test. A qualitative analysis was conducted to understand the reasons for errors made by the framework. As shown in Supplementary Table 4, some inevitable errors arose from the inherent ambiguity and inconsistencies within the CAG section of the reports themselves. However, the PCI sections of the reports were structured using seven key items for each treated lesion—“Location,” “Guiding catheter,” “Guidewire,” “Preballoon,” “Adjuvant balloon,” “DEB,” and “Stent”—which provided a well-organized foundation for data extraction. This pre-existing structure in the free-text report significantly facilitated the conversion into a machine-readable format by the LLM, resulting in high accuracy for extracting PCI-related key points. Supplementary Figure 4 depicts two representative cases from the external test set, showcasing the excellent capability of the framework in converting all the details of a highly complex CAG and PCI case into a machine-readable format. DISCUSSION This study demonstrated that our novel framework, which combines hierarchical annotation with LLMs, can successfully automate the standardization of CAG reports with accuracy comparable to or exceeding that of experienced cardiologists. In both internal and external tests, the framework showed excellent performance in extracting and structuring complex information about coronary anatomy and PCI procedures, achieving up to 99.5% accuracy for CAG-related key points and 98.3% for PCI-related key points. These results suggest that automated standardization of complex medical documents is not only feasible, but can be implemented with high reliability, potentially transforming how we utilize clinical information embedded in unstructured medical reports. To our knowledge, no previous study has attempted to convert detailed information about coronary anatomy and catheterization procedures from free-text reports into a machine-readable format. A recent systematic review indicated that natural language processing research in the field of cardiology remains underexplored as compared to that in the field of oncology. 25 Although the availability of coded information in EHRs and claims data has increased substantially, most large-scale observational studies have not utilized detailed information about coronary anatomy or procedural characteristics. 26 While studies leveraging data from dedicated registries have incorporated such information, this approach requires marked manual effort. Our framework addresses this limitation by automatically converting detailed procedural and anatomical information into a structured, machine-readable format, which promotes several critical capabilities: (1) systematic analysis of large-scale coronary intervention outcomes across different patient populations and institutions, (2) efficient quality assessment and performance monitoring of interventional procedures, (3) facilitation of clinical research by enabling rapid identification of eligible patients for trials based on specific anatomical or procedural criteria, 27 and (4) providing support for clinical decision-making by making historical procedural data more accessible and analyzable. The framework developed in this study uses a two-step approach to ensure accuracy and interpretability. In a previous study, while a one-step approach using LLMs was able to capture basic information accurately, it demonstrated difficulty with complex medical reasoning. 9 For example, in pathology reports, LLMs could accurately identify tumor measurements but had difficulty determining accurate cancer staging. To overcome this limitation, we separated our process into two distinct steps: first, using LLMs to organize the basic information from CAG reports into an expert-defined standardized format, and second, applying specific rules developed by cardiologists to build clinically relevant data. This approach, similar to methods successfully used in pathology research, 10 combines the LLM’s strength in understanding medical text with cardiologist-designed rules for interpreting nuance and complexity in coronary anatomy and catheterization. Furthermore, an instruction prompt was given to LLM to encode domain-specific knowledge and detailed rules, as guided by cardiologists, to ensure accurate and consistent mapping of the intricate information contained in CAG reports. The significance of our work extends beyond its immediate performance metrics. By developing and publicly releasing a comprehensive hierarchical annotation schema for CAG reports, we have provided a standardized framework that other institutions can readily adopt or modify for their specific needs. This flexibility is particularly important as healthcare institutions increasingly seek to process sensitive clinical data by using internal systems rather than external services. With the rapid advance in the capabilities of open-source LLMs, our framework offers a practical blueprint for standardization of automated medical documents that could be implemented using various LLM options, while maintaining high accuracy and reliability. The primary advantage is that it saves time and human effort. LIMITATIONS The study had some limitations. First, only CAG reports from hospitals in South Korea were used to validate the robustness of the framework. To ensure thorough validation, the framework should be implemented on CAG reports from more diverse countries. Second, this study utilized only GPT-4o as the LLM for standardization. Nevertheless, our framework was intentionally designed to be model-agnostic and to be able to work with any sufficiently capable LLM, although further testing is needed to determine whether its high accuracy and reliability will be maintained even with less-competent open-source models, such as Llama-3. 28 Third, unlike the internal validation, the external validation process only evaluated the framework’s accuracy as judged by cardiologists, without direct comparison to cardiologists’ manual reviews. Although the framework achieved excellent accuracy in the external test, a direct comparison with manual reviews would have more clearly demonstrated its capabilities. Fourth, in both the internal and external validation processes, accuracy was evaluated based on only a single output from the LLM. Although we set the LLM’s “temperature”—a parameter that controls the randomness and variability of the model’s responses—to zero, in order to maximize predictability, slight variability remained due to the model’s inherent characteristics. Addressing these limitations in future will help to ensure broader applicability, consistent performance across various LLMs, and even more robust validation methods. CONCLUSION In this study, we developed a novel framework for standardizing free-text CAG reports and extracting complex clinical data. This framework demonstrated excellent accuracy in standardizing CAG reports, indicating its potential for more efficient utilization of detailed clinical data in these reports for cardiovascular research. Future research may explore adaptation of this framework to other types of free-text data. Data Availability All data produced in the present study are available upon reasonable request to the authors. Funding Support This research was supported by a grant of the MD-Phd/Medical Scientist Training Program through the Korea Health Industry Development Institute (KHIDI), funded by the Ministry of Health & Welfare, Republic of Korea, and also by a grant of the Korea Health Technology R&D Project through the Korea Health Industry Development Institute (KHIDI), funded by the Ministry of Health & Welfare, Republic of Korea (grant number: HI22C0452). Disclosure of conflicts of interest SCY reports grants from Daiichi Sankyo. He is a coinventor of granted Korea Patent DP-2023-1223 and DP-2023-0920, and pending Patent Applications DP-2024-0909, DP-2024-0908, DP-2022-1658, DP-2022-1478, and DP-2022-1365 unrelated to current work. SCY is a chief executive officer of PHI Digital Healthcare. Other authors have no potential conflicts of interest to disclose. Acknowledgements This research was supported by a grant of the MD-Phd/Medical Scientist Training Program through the Korea Health Industry Development Institute (KHIDI), funded by the Ministry of Health & Welfare, Republic of Korea. Footnotes Authors’ e-mail addresses: Ji Woo Song: jiusong3232{at}gmail.com , Ji Yong Jang: dogkkoma{at}gmail.com , Hyeongsoo Kim: hyeongsoo{at}nhimc.or.kr , Young-Guk Ko: ygko{at}yuhs.ac Funding information : This research was supported by a grant of the MD-Phd/Medical Scientist Training Program through the Korea Health Industry Development Institute (KHIDI), funded by the Ministry of Health & Welfare, Republic of Korea. Disclosure : SCY reports grants from Daiichi Sankyo. He is a coinventor of granted Korea Patent DP-2023-1223 and DP-2023-0920, and pending Patent Applications DP-2024-0909, DP-2024-0908, DP-2022-1658, DP-2022-1478, and DP-2022-1365 unrelated to current work. SCY is a chief executive officer of PHI Digital Healthcare. Other authors have no potential conflicts of interest to disclose. Abbreviations ACC/AHA American College of Cardiology / American Heart Association CAG coronary angiography CTO chronic total occlusion EHR electronic health records JSON JavaScript Object Notation LLM large language model PCI percutaneous coronary interventions SYNTAX Synergy Between Percutaneous Coronary Intervention with Taxus and Cardiac Surgery References 1. ↵ Hansen KN , Bendix K , Antonsen L , Veien K , Maeng M , Junker A , et al. One-year rehospitalisation after percutaneous coronary intervention: a retrospective analysis . EuroIntervention . 2018 ; 14 ( 8 ): 926 – 34 . OpenUrl PubMed 2. Lee C-w , Lee JK , Choi YJ , Kim H , Han K , Jung J-h , et al. Blood pressure and mortality after percutaneous coronary intervention: a population-based cohort study . Scientific Reports . 2022 ; 12 ( 1 ): 2768 . OpenUrl PubMed 3. ↵ Waldo SW , Gokhale M , O’Donnell CI , Plomondon ME , Valle JA , Armstrong EJ , et al. Temporal Trends in Coronary Angiography and Percutaneous Coronary Intervention . JACC: Cardiovascular Interventions . 2018 ; 11 ( 9 ): 879 – 88 . OpenUrl 4. ↵ Rush EN , Danciu I , Ostrouchov G , Cho K , Mayer BW , Ho YL , et al. JSONize: A Scalable Machine Learning Pipeline to Model Medical Notes as Semi-structured Documents . AMIA Jt Summits Transl Sci Proc . 2020 ; 2020 : 533 – 41 . OpenUrl PubMed 5. ↵ Park J , You SC , Jeong E , Weng C , Park D , Roh J , et al. A Framework (SOCRATex) for Hierarchical Annotation of Unstructured Electronic Health Records and Integration Into a Standardized Medical Database: Development and Usability Study . JMIR Med Inform . 2021 ; 9 ( 3 ): e23983 . OpenUrl 6. ↵ Unlu O , Shin J , Mailly CJ , Oates MF , Tucci MR , Varugheese M , et al. Retrieval-Augmented Generation–Enabled GPT-4 for Clinical Trial Screening . NEJM AI . 2024 ; 1 ( 7 ): AIoa2400181 . OpenUrl 7. ↵ OpenAI , Achiam J , Adler S , Agarwal S , Ahmad L , Akkaya I , et al. GPT-4 Technical Report2023 March 01, 2023 :[arXiv:2303.08774 p.]. Available from: https://ui.adsabs.harvard.edu/abs/2023arXiv230308774O . 8. ↵ Adams LC , Truhn D , Busch F , Kader A , Niehues SM , Makowski MR , Bressem KK . Leveraging GPT-4 for Post Hoc Transformation of Free-text Radiology Reports into Structured Reporting: A Multilingual Feasibility Study . Radiology . 2023 ; 307 ( 4 ): e230725 . OpenUrl CrossRef PubMed 9. ↵ Huang J , Yang DM , Rong R , Nezafati K , Treager C , Chi Z , et al. A critical assessment of using ChatGPT for extracting structured data from clinical notes . npj Digital Medicine . 2024 ; 7 ( 1 ): 106 . OpenUrl PubMed 10. ↵ Cho H , Yoo S , Kim B , Jang S , Sunwoo L , Kim S , et al. Extracting lung cancer staging descriptors from pathology reports: A generative language model approach . Journal of Biomedical Informatics . 2024 ; 157 : 104720 . OpenUrl PubMed 11. ↵ Naidu SS , Abbott JD , Bagai J , Blankenship J , Garcia S , Iqbal SN , et al. SCAI expert consensus update on best practices in the cardiac catheterization laboratory: this statement was endorsed by the American College of Cardiology (ACC), the American Heart Association (AHA), and the Heart Rhythm Society (HRS) in April 2021 . Catheterization and Cardiovascular Interventions . 2021 ; 98 ( 2 ): 255 – 76 . OpenUrl PubMed 12. ↵ Park SH , Suh CH , Lee JH , Kahn CE , Moy L . Minimum Reporting Items for Clear Evaluation of Accuracy Reports of Large Language Models in Healthcare (MI-CLEAR-LLM) . Korean J Radiol . 2024 ; 25 ( 10 ): 865 – 8 . OpenUrl PubMed 13. ↵ Chen B , Zhang Z , Langrené N , Zhu S. Unleashing the potential of prompt engineering in Large Language Models: a comprehensive review2023 October 01, 2023 :[arXiv:2310.14735 p.]. Available from: https://ui.adsabs.harvard.edu/abs/2023arXiv231014735C . 14. ↵ Sianos G , Morel MA , Kappetein AP , Morice MC , Colombo A , Dawkins K , et al. The SYNTAX Score: an angiographic tool grading the complexity of coronary artery disease . EuroIntervention . 2005 ; 1 ( 2 ): 219 – 27 . OpenUrl PubMed 15. ↵ Ryan TJ . Guidelines for percutaneous transluminal coronary angioplasty: A report of the American College of Cardiology/American Heart Association Task Force on Assessment of Diagnostic and Therapeutic Cardiovascular Procedures (Subcommittee on Percutaneous Transluminal Coronary Angioplasty) . Journal of the American College of Cardiology . 1988 ; 12 ( 2 ): 529 – 45 . OpenUrl FREE Full Text 16. Konigstein M , Redfors B , Zhang Z , Kotinkaduwa LN , Mintz GS , Smits PC , et al. Utility of the ACC/AHA Lesion Classification to Predict Outcomes After Contemporary DES Treatment: Individual Patient Data Pooled Analysis From 7 Randomized Trials . Journal of the American Heart Association . 2022 ; 11 ( 24 ): e025275 . OpenUrl PubMed 17. ↵ Theuerle J , Yudi MB , Farouque O , Andrianopoulos N , Scott P , Ajani AE , et al. Utility of the ACC/AHA lesion classification as a predictor of procedural, 30-day and 12-month outcomes in the contemporary percutaneous coronary intervention era . Catheter Cardiovasc Interv . 2018 ; 92 ( 3 ): E227 – e34 . OpenUrl PubMed 18. ↵ OpenAI . Hello GPT-4o: OpenAI ; 2024 [Available from: https://openai.com/index/hello-gpt-4o/ . 19. ↵ Giustino G , Chieffo A , Palmerini T , Valgimigli M , Feres F , Abizaid A , et al. Efficacy and Safety of Dual Antiplatelet Therapy After Complex PCI . Journal of the American College of Cardiology . 2016 ; 68 ( 17 ): 1851 – 64 . OpenUrl FREE Full Text 20. ↵ Suh J , Park D-W , Lee J-Y , Jung IH , Lee S-W , Kim Y-H , et al. The relationship and threshold of stent length with regard to risk of stent thrombosis after drug-eluting stent implantation . JACC: Cardiovascular Interventions . 2010 ; 3 ( 4 ): 383 – 9 . OpenUrl CrossRef 21. Mauri L , O’Malley AJ , Cutlip DE , Ho KK , Popma JJ , Chauhan MS , et al. Effects of stent length and lesion length on coronary restenosis . The American journal of cardiology . 2004 ; 93 ( 11 ): 1340 – 6 . OpenUrl CrossRef PubMed Web of Science 22. Van Werkum JW , Heestermans AA , Zomer AC , Kelder JC , Suttorp M-J , Rensing BJ , et al. Predictors of coronary stent thrombosis: the Dutch Stent Thrombosis Registry . Journal of the American college of cardiology . 2009 ; 53 ( 16 ): 1399 – 409 . OpenUrl FREE Full Text 23. Brilakis ES , Banerjee S , Karmpaliotis D , Lombardi WL , Tsai TT , Shunk KA , et al. Procedural outcomes of chronic total occlusion percutaneous coronary intervention: a report from the NCDR (National Cardiovascular Data Registry) . JACC: Cardiovascular Interventions . 2015 ; 8 ( 2 ): 245 – 53 . OpenUrl 24. ↵ Holmes DR , Kereiakes DJ , Garg S , Serruys PW , Dehmer GJ , Ellis SG , et al. Stent thrombosis . Journal of the American College of Cardiology . 2010 ; 56 ( 17 ): 1357 – 65 . OpenUrl FREE Full Text 25. ↵ Turchioe MR , Volodarskiy A , Pathak J , Wright DN , Tcheng JE , Slotwiner D . Systematic review of current natural language processing methods and applications in cardiology . Heart . 2022 ; 108 ( 12 ): 909 – 16 . OpenUrl Abstract / FREE Full Text 26. ↵ You SC , Rho Y , Bikdeli B , Kim J , Siapos A , Weaver J , et al. Association of ticagrelor vs clopidogrel with net adverse clinical events in patients with acute coronary syndrome undergoing percutaneous coronary intervention . Jama . 2020 ; 324 ( 16 ): 1640 – 50 . OpenUrl CrossRef PubMed 27. ↵ Khan MS , Usman MS , Talha KM , Van Spall HG , Greene SJ , Vaduganathan M , et al. Leveraging electronic health records to streamline the conduct of cardiovascular clinical trials . European heart journal . 2023 ; 44 ( 21 ): 1890 – 909 . OpenUrl PubMed 28. ↵ Meta . Introducing Meta Llama 3: The most capable openly available LLM to date 2024 [Available from: https://ai.meta.com/blog/meta-llama-3/ . View the discussion thread. Back to top Previous Next Posted May 06, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A Standard Framework for Converting Coronary Angiography Reports into Machine-Readable Format Using Large Language Models Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A Standard Framework for Converting Coronary Angiography Reports into Machine-Readable Format Using Large Language Models Ji Woo Song , Ji Yong Jang , Hyeongsoo Kim , Young-Guk Ko , Seng Chan You medRxiv 2025.05.03.25326846; doi: https://doi.org/10.1101/2025.05.03.25326846 Share This Article: Copy Citation Tools A Standard Framework for Converting Coronary Angiography Reports into Machine-Readable Format Using Large Language Models Ji Woo Song , Ji Yong Jang , Hyeongsoo Kim , Young-Guk Ko , Seng Chan You medRxiv 2025.05.03.25326846; doi: https://doi.org/10.1101/2025.05.03.25326846 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Cardiovascular Medicine Subject Areas All Articles Addiction Medicine (572) Allergy and Immunology (864) Anesthesia (302) Cardiovascular Medicine (4448) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15240) Forensic Medicine (30) Gastroenterology (1130) Genetic and Genomic Medicine (6614) Geriatric Medicine (669) Health Economics (1000) Health Informatics (4550) Health Policy (1372) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1268) Infectious Diseases (except HIV/AIDS) (15928) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (668) Neurology (6621) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1147) Occupational and Environmental Health (957) Oncology (3343) Ophthalmology (977) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1695) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5459) Public and Global Health (9248) Radiology and Imaging (2205) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1197) Rheumatology (597) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a0289f72887573a6',t:'MTc3OTkyMjY5MA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00