MammoVQA: A Benchmark for Breast Cancer Screening and Diagnosis in Mammogram Visual Question Answering | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Article MammoVQA: A Benchmark for Breast Cancer Screening and Diagnosis in Mammogram Visual Question Answering Hao Chen, Jiayi Zhu, Fuxiang Huang, Qiong Luo This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-6543886/v1 This work is licensed under a CC BY 4.0 License Status: Published Journal Publication published 27 Nov, 2025 Read the published version in Nature Communications → Version 1 posted You are reading this latest preprint version Abstract Breast cancer remains the most prevalent malignancy in women worldwide. Mammography-based early detection plays a pivotal role in improving patient survival outcomes. While large vision-language models (LVLMs) offer transformative potential for mammogram visual question answering (VQA), the absence of standardized evaluation benchmarks currently limits their reliable clinical deployment. 
In this study, we address this critical gap through three key contributions: (1) We introduce MammoVQA, the first mammogram VQA dataset, unifying 11 public datasets into 104,914 images (337K QA pairs) for image-level cases and 72,518 exams (476K images, 144K QA pairs) for exam-level analysis. (2) Systematic evaluation of 9 LVLMs (4 general, 5 medical) reveals diagnostic performance statistically equivalent to random guessing, highlighting their unreliability for clinical breast cancer screening. (3) Our domain-optimized LLaVA-Mammo achieves average +21.00% weighted accuracy gains over SOTA in internal validation, with average +22.99% weighted accuracy improvements in external validation across 4 datasets. Health sciences/Health care/Medical imaging/Radiography Health sciences/Health care/Public health/Population screening Mammogram Visual Question Answering Large Vision Language Model Full Text Additional Declarations There is NO Competing Interest. Supplementary Files Supplementary.docx Experiment result table Cite Share Download PDF Status: Published Journal Publication published 27 Nov, 2025 Read the published version in Nature Communications → Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-6543886","acceptedTermsAndConditions":true,"allowDirectSubmit":false,"archivedVersions":[],"articleType":"Article","associatedPublications":[],"authors":[{"id":450230533,"identity":"84add1d1-727f-4f3f-b813-cf58bb4084dd","order_by":0,"name":"Hao Chen","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAsUlEQVRIiWNgGAWjYBACxhkMjA9gHGaGA8RpYTaAaSBOC4MEA5sEaVqYZzc/q/jxx4bB4Pz5A8wFZ4hx2JxjZjd729IYDG4kMzDPuEGMlhk5bLcZGw4DtQAdxvOBSC3FDH/+Ax12mAQtzAxsBxgMDgAdxkOcw9KMJXvbknkkbyQbHOYhxvuGM5Iffvjxx06O7/zBh495jhGjpQFC84CIA0RoYGCQJ0rVKBgFo2AUjGwAAITwNJZ5nBIOAAAAAElFTkSuQmCC","orcid":"https://orcid.org/0000-0002-8400-3780","institution":"The Hong Kong University of Science and Technology","correspondingAuthor":true,"prefix":"","firstName":"Hao","middleName":"","lastName":"Chen","suffix":""},{"id":450230534,"identity":"2c42f6b6-93fe-495a-8fac-311112be7abb","order_by":1,"name":"Jiayi Zhu","email":"","orcid":"https://orcid.org/0000-0001-8154-7582","institution":"Hong Kong University of Science and Technology (Guangzhou)","correspondingAuthor":false,"prefix":"","firstName":"Jiayi","middleName":"","lastName":"Zhu","suffix":""},{"id":450230535,"identity":"8e994c89-8acb-4ff0-9daa-ce71bf342ee9","order_by":2,"name":"Fuxiang Huang","email":"","orcid":"","institution":"Hong Kong University of Science and Technology","correspondingAuthor":false,"prefix":"","firstName":"Fuxiang","middleName":"","lastName":"Huang","suffix":""},{"id":450230536,"identity":"4b9e25d8-c8c6-4343-a78f-aac24cb19151","order_by":3,"name":"Qiong Luo","email":"","orcid":"","institution":"Hong 
Kong University of Science and Technology","correspondingAuthor":false,"prefix":"","firstName":"Qiong","middleName":"","lastName":"Luo","suffix":""}],"badges":[],"createdAt":"2025-04-28 04:55:13","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-6543886/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-6543886/v1","draftVersion":[],"editorialEvents":[{"content":"https://doi.org/10.1038/s41467-025-66507-z","type":"published","date":"2025-11-27T05:00:00+00:00"}],"editorialNote":"","failedWorkflow":false,"files":[{"id":99212973,"identity":"43167719-41f8-4c52-ad38-626e70a26b25","added_by":"auto","created_at":"2025-12-30 08:34:51","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":5197388,"visible":true,"origin":"","legend":"Article File","description":"","filename":"MammoVQA.pdf","url":"https://assets-eu.researchsquare.com/files/rs-6543886/v1_covered_de9f61fa-57d8-4816-9d18-ffe423aec834.pdf"},{"id":81793927,"identity":"d1068968-b5db-4e77-bc78-80be28ce34c0","added_by":"auto","created_at":"2025-05-02 02:44:22","extension":"docx","order_by":1,"title":"","display":"","copyAsset":false,"role":"supplement","size":459832,"visible":true,"origin":"","legend":"Experiment result table","description":"","filename":"Supplementary.docx","url":"https://assets-eu.researchsquare.com/files/rs-6543886/v1/8dfa88b29294394ebfce730c.docx"}],"financialInterests":"There is \u003cb\u003eNO\u003c/b\u003e Competing Interest.","formattedTitle":"MammoVQA: A Benchmark for Breast Cancer Screening and Diagnosis in Mammogram Visual Question 
Answering","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":true,"hideJournal":false,"highlight":"","institution":"","isAcceptedByJournal":true,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"nature-portfolio","isNatureJournal":true,"hasQc":false,"allowDirectSubmit":false,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"","title":"Nature Portfolio","twitterHandle":"","acdcEnabled":false,"dfaEnabled":false,"editorialSystem":"ejp","reportingPortfolio":"","inReviewEnabled":true,"inReviewRevisionsEnabled":false},"keywords":"Mammogram, Visual Question Answering, Large Vision Language Model","lastPublishedDoi":"10.21203/rs.3.rs-6543886/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-6543886/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"Breast cancer remains the most prevalent malignancy in women worldwide. Mammography-based early detection plays a pivotal role in improving patient survival outcomes. While large vision-language models (LVLMs) offer transformative potential for mammogram visual question answering (VQA), the absence of standardized evaluation benchmarks currently limits their reliable clinical deployment. 
In this study, we address this critical gap through three key contributions: (1) We introduce MammoVQA, the first mammogram VQA dataset, unifying 11 public datasets into 104,914 images (337K QA pairs) for image-level cases and 72,518 exams (476K images, 144K QA pairs) for exam-level analysis.\r\n(2) Systematic evaluation of 9 LVLMs (4 general, 5 medical) reveals diagnostic performance statistically equivalent to random guessing, highlighting their unreliability for clinical breast cancer screening.\r\n(3) Our domain-optimized LLaVA-Mammo achieves average +21.00% weighted accuracy gains over SOTA in internal validation, with average +22.99% weighted accuracy improvements in external validation across 4 datasets.","manuscriptTitle":"MammoVQA: A Benchmark for Breast Cancer Screening and Diagnosis in Mammogram Visual Question Answering","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2025-05-02 02:44:17","doi":"10.21203/rs.3.rs-6543886/v1","editorialEvents":[],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"nature-communications","isNatureJournal":true,"hasQc":false,"allowDirectSubmit":false,"externalIdentity":"NCOMMS","sideBox":"Learn more about [Nature Communications](http://www.nature.com/ncomms/)","snPcode":"","submissionUrl":"https://mts-ncomms.nature.com/","title":"Nature Communications","twitterHandle":"","acdcEnabled":true,"dfaEnabled":true,"editorialSystem":"ejp","reportingPortfolio":"Nature Communications","inReviewEnabled":true,"inReviewRevisionsEnabled":false}}],"origin":"","ownerIdentity":"0990d2af-72ef-4eec-92f5-2e9c408f0b90","owner":[],"postedDate":"May 2nd, 2025","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"published-in-journal","subjectAreas":[{"id":47910656,"name":"Health sciences/Health care/Medical imaging/Radiography"},{"id":47910657,"name":"Health sciences/Health care/Public health/Population screening"}],"tags":[],"updatedAt":"2025-12-30T08:34:40+00:00","versionOfRecord":{"articleIdentity":"rs-6543886","link":"https://doi.org/10.1038/s41467-025-66507-z","journal":{"identity":"nature-communications","isVorOnly":false,"title":"Nature Communications"},"publishedOn":"2025-11-27 05:00:00","publishedOnDateReadable":"November 27th, 2025"},"versionCreatedAt":"2025-05-02 02:44:17","video":"","vorDoi":"10.1038/s41467-025-66507-z","vorDoiUrl":"https://doi.org/10.1038/s41467-025-66507-z","workflowStages":[]},"version":"v1","identity":"rs-6543886","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-6543886","identity":"rs-6543886","version":["v1"]},"buildId":"8U1c8b4HqxoKbykW_rLl7","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
The text above is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.