HeartBERT : A language model pre-trained on an open source dataset for cardiac text mining | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Article HeartBERT : A language model pre-trained on an open source dataset for cardiac text mining Hansle Gwon, Hyeram Seo, Seohyun Park, Young-Hak Kim, Tae Joon Jun This is a preprint; it has not been peer reviewed by a journal. https://doi.org/10.21203/rs.3.rs-4137702/v1 This work is licensed under a CC BY 4.0 License Status: Published Journal Publication published 23 Nov, 2024 Read the published version in Scientific Reports → Version 1 posted 11 You are reading this latest preprint version Abstract The advent of the Transformer has significantly altered the course of research in Natural Language Processing (NLP) within the domain of deep learning, making Transformer-based studies the mainstream in subsequent NLP research. There has also been considerable advancement in domain-specific NLP research, including the development of specialized language models for medical. 
These medical-specific language models were trained on medical data and demonstrated high performance. While these studies have treated the medical field as a single domain, in reality, medical is divided into multiple departments, each requiring a high level of expertise and treated as a unique domain. Recognizing this, our research focuses on constructing a model specialized for cardiology within the medical sector. Our study encompasses the creation of open-source datasets, training, and model evaluation in this nuanced domain. Full Text Additional Declarations No competing interests reported. Cite Share Download PDF Status: Published Journal Publication published 23 Nov, 2024 Read the published version in Scientific Reports → Version 1 posted Editorial decision: Revision requested 08 Jul, 2024 Reviews received at journal 04 Jul, 2024 Reviewers agreed at journal 26 Jun, 2024 Reviewers agreed at journal 17 Jun, 2024 Reviews received at journal 30 Apr, 2024 Reviewers agreed at journal 22 Apr, 2024 Reviewers invited by journal 19 Apr, 2024 Editor assigned by journal 11 Apr, 2024 Editor invited by journal 08 Apr, 2024 Submission checks completed at journal 08 Apr, 2024 First submitted to journal 20 Mar, 2024 You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-4137702","acceptedTermsAndConditions":true,"allowDirectSubmit":false,"archivedVersions":[],"articleType":"Article","associatedPublications":[],"authors":[{"id":289321062,"identity":"4777d98f-a3e7-4cbc-8033-da62051ea593","order_by":0,"name":"Hansle Gwon","email":"","orcid":"","institution":"INMED DATA","correspondingAuthor":false,"prefix":"","firstName":"Hansle","middleName":"","lastName":"Gwon","suffix":""},{"id":289321063,"identity":"fce1330f-60a3-4ea3-95ee-8936c6b1f318","order_by":1,"name":"Hyeram Seo","email":"","orcid":"","institution":"Asan Medical Center","correspondingAuthor":false,"prefix":"","firstName":"Hyeram","middleName":"","lastName":"Seo","suffix":""},{"id":289321064,"identity":"b001d97c-ace2-4d2e-a503-e67dfc2f851b","order_by":2,"name":"Seohyun Park","email":"","orcid":"","institution":"University of Ulsan","correspondingAuthor":false,"prefix":"","firstName":"Seohyun","middleName":"","lastName":"Park","suffix":""},{"id":289321065,"identity":"c9333573-cce7-45c1-83d6-f7014ab39135","order_by":3,"name":"Young-Hak Kim","email":"","orcid":"","institution":"INMED DATA","correspondingAuthor":false,"prefix":"","firstName":"Young-Hak","middleName":"","lastName":"Kim","suffix":""},{"id":289321066,"identity":"06edb3e4-1e66-483d-bd1c-e199e418931a","order_by":4,"name":"Tae Joon 
Jun","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAA6klEQVRIiWNgGAWjYJACAyCWYwOxeBiYCSvngWoxhmvhIUYLCCQ2EK3Fnv3wgYIPFXfS+6S7Ex+8YbCWt2c/wPjhYw4eW3jSEgxnnHmW2yZzdrPhHIZ0wx6eBGbJmdvwOSzHwJi37XBum0TuNmnef4cZexgS2Jh58Wnhf/8BpCWdTSJ3+28ehsP2PfwPCGiRyGEAaUkAatnGDNSS2CNByJYbzwyAfjlsCHTYZkmgX5J7bjxsxusX9v7kZwYfKg7Ly8/I3fgBGGK27f3JBz98xKMFCNgM0AQYG/CqBwLmB4RUjIJRMApGwQgHAFVESrbONxKVAAAAAElFTkSuQmCC","orcid":"","institution":"INMED DATA","correspondingAuthor":true,"prefix":"","firstName":"Tae","middleName":"Joon","lastName":"Jun","suffix":""}],"badges":[],"createdAt":"2024-03-20 14:00:18","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-4137702/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-4137702/v1","draftVersion":[],"editorialEvents":[{"content":"https://doi.org/10.1038/s41598-024-80165-z","type":"published","date":"2024-11-23T15:57:49+00:00"}],"editorialNote":"","failedWorkflow":false,"files":[{"id":69834936,"identity":"c6581799-ffa8-442f-9b82-a7b740313e72","added_by":"auto","created_at":"2024-11-25 16:10:33","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":393203,"visible":true,"origin":"","legend":"","description":"","filename":"heartbertscientificreportengrevised.pdf","url":"https://assets-eu.researchsquare.com/files/rs-4137702/v1_covered_67c6b994-b232-4b46-84ba-7d500e6762da.pdf"}],"financialInterests":"No competing interests reported.","formattedTitle":"\u003cp\u003eHeartBERT : A language model pre-trained on anopen source dataset for cardiac text 
mining\u003c/p\u003e","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":false,"hideJournal":false,"highlight":"","institution":"","isAcceptedByJournal":true,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"scientific-reports","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":false,"externalIdentity":"scirep","sideBox":"Learn more about [Scientific Reports](http://www.nature.com/srep/)","snPcode":"","submissionUrl":"","title":"Scientific Reports","twitterHandle":"","acdcEnabled":true,"dfaEnabled":true,"editorialSystem":"stoa","reportingPortfolio":"Scientific Reports","inReviewEnabled":true,"inReviewRevisionsEnabled":true},"keywords":"","lastPublishedDoi":"10.21203/rs.3.rs-4137702/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-4137702/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"The advent of the Transformer has significantly altered the course of research in Natural Language Processing (NLP) within thedomain of deep learning, making Transformer-based studies the mainstream in subsequent NLP research. There has alsobeen considerable advancement in domain-specific NLP research, including the development of specialized language modelsfor medical. These medical-specific language models were trained on medical data and demonstrated high performance. Whilethese studies have treated the medical field as a single domain, in reality, medical is divided into multiple departments, eachrequiring a high level of expertise and treated as a unique domain. Recognizing this, our research focuses on constructinga model specialized for cardiology within the medical sector. 
Our study encompasses the creation of open-source datasets,training, and model evaluation in this nuanced domain.","manuscriptTitle":"HeartBERT : A language model pre-trained on anopen source dataset for cardiac text mining","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2024-04-11 18:38:02","doi":"10.21203/rs.3.rs-4137702/v1","editorialEvents":[{"type":"communityComments","content":0},{"type":"decision","content":"Revision requested","date":"2024-07-08T11:13:45+00:00","index":"","fulltext":""},{"type":"editorInvitedReview","content":"","date":"2024-07-04T04:18:11+00:00","index":"hide","fulltext":""},{"type":"reviewerAgreed","content":"170898720889312707354448040779089397054","date":"2024-06-26T22:13:51+00:00","index":"hide","fulltext":""},{"type":"reviewerAgreed","content":"241489236268787001150086596173725185178","date":"2024-06-17T12:55:37+00:00","index":"hide","fulltext":""},{"type":"editorInvitedReview","content":"","date":"2024-05-01T02:07:48+00:00","index":"hide","fulltext":""},{"type":"reviewerAgreed","content":"8abcd6db-00eb-42f2-8634-2694f536aca7","date":"2024-04-22T21:07:03+00:00","index":"hide","fulltext":""},{"type":"reviewersInvited","content":"","date":"2024-04-20T03:57:40+00:00","index":"","fulltext":""},{"type":"editorAssigned","content":"","date":"2024-04-11T16:44:05+00:00","index":"","fulltext":""},{"type":"editorInvited","content":"","date":"2024-04-08T15:43:06+00:00","index":"","fulltext":""},{"type":"checksComplete","content":"","date":"2024-04-08T15:39:34+00:00","index":"","fulltext":""},{"type":"submitted","content":"Scientific Reports","date":"2024-03-20T13:59:09+00:00","index":"","fulltext":""}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"scientific-reports","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":false,"externalIdentity":"scirep","sideBox":"Learn more about [Scientific Reports](http://www.nature.com/srep/)","snPcode":"","submissionUrl":"","title":"Scientific Reports","twitterHandle":"","acdcEnabled":true,"dfaEnabled":true,"editorialSystem":"stoa","reportingPortfolio":"Scientific Reports","inReviewEnabled":true,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"6a39c303-55a9-46ad-9609-e547fc421293","owner":[],"postedDate":"April 11th, 2024","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"published-in-journal","subjectAreas":[],"tags":[],"updatedAt":"2024-11-25T16:03:04+00:00","versionOfRecord":{"articleIdentity":"rs-4137702","link":"https://doi.org/10.1038/s41598-024-80165-z","journal":{"identity":"scientific-reports","isVorOnly":false,"title":"Scientific Reports"},"publishedOn":"2024-11-23 15:57:49","publishedOnDateReadable":"November 23rd, 2024"},"versionCreatedAt":"2024-04-11 18:38:02","video":"","vorDoi":"10.1038/s41598-024-80165-z","vorDoiUrl":"https://doi.org/10.1038/s41598-024-80165-z","workflowStages":[]},"version":"v1","identity":"rs-4137702","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-4137702","identity":"rs-4137702","version":["v1"]},"buildId":"qtupq5eGEP_6zYnWcrvyt","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.