Refinement Feature Fusion for Multi-model Depression Detection | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Research Article Refinement Feature Fusion for Multi-model Depression Detection Yongwei Liao, Yinbin Zhang, Weiwei Chen, Zhenjun Li, Bing Zhou This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-9000115/v1 This work is licensed under a CC BY 4.0 License Status: Under Review Version 1 posted 4 You are reading this latest preprint version Abstract This paper proposes UniMDD (Unified Multi-modal Depression Detection), a novel framework designed for multimodal depression detection based on video interviews. It employs an efficient multi-model data fusion strategy, comprising four sequential modules: feature extraction, feature fusion, feature refinement, and depression detection. Specifically, the feature extraction module leverages OpenFace for facial features, VGGish for audio features, and a Large Language Model for textual features. The core innovation lies in the fusion and refinement stages. 
The feature fusion module designs a Mixed Attention Network (MAN) to capture intricate cross-modal interactions and effectively integrate complementary information from different data streams. Subsequently, the feature refinement module constructs an EHA-Attention mechanism within decoder blocks. This serves a dual purpose: it generates higher-level semantic and contextual representations while mitigating potential inter-modal conflicts and enhancing the comprehension of each individual modality. Finally, the refined features are fed into the depression detection module for prediction. Experimental validation on the LMVD dataset demonstrates that UniMDD achieves an accuracy of 76.68%, surpassing the current state-of-the-art method, DepMamba, by a margin of 4.55 percentage points. These results underscore the advancement and robustness of the proposed UniMDD framework. Depression Detection Multimodal Data Fusion Large Model Artificial Intelligence Full Text Additional Declarations No competing interests reported. Cite Share Download PDF Status: Under Review Version 1 posted Reviewers invited by journal 08 May, 2026 Editor assigned by journal 10 Mar, 2026 Submission checks completed at journal 10 Mar, 2026 First submitted to journal 10 Mar, 2026 You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high-quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-9000115","acceptedTermsAndConditions":true,"allowDirectSubmit":false,"archivedVersions":[],"articleType":"Research Article","associatedPublications":[],"authors":[{"id":636762230,"identity":"5fa84226-b6c8-44a3-9136-8851c33212f0","order_by":0,"name":"Yongwei Liao","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAA2UlEQVRIiWNgGAWjYFACxgaDBAYJHjb25gMHPvwgXouNHB/PscSDM3uItyrNWE4ix/gwBxsRag2ONzcUPKg4nNjGc+bDYQYeBnl+sQMEtJw5CHTYGaAW9t4NhwssGAxnzk7Ar8XsRmKDQWIbyJazGw7P4GFIMLhNSMv9h0At/4BaJHIeHOZhI0bLDWCIJTakGbNJ5DAQp8X+DNBhCcds5Nh4jhkAA1mCsF8k248/M/xRI8Ej3978+MOHHzby/NIEtAABmwESR4KgchBgfkCUslEwCkbBKBi5AABKXUu14RGLRgAAAABJRU5ErkJggg==","orcid":"","institution":"Shenzhen City Polytechnic","correspondingAuthor":true,"prefix":"","firstName":"Yongwei","middleName":"","lastName":"Liao","suffix":""},{"id":636762231,"identity":"ee12f7f7-ec4d-45ef-b44c-307378e7c293","order_by":1,"name":"Yinbin Zhang","email":"","orcid":"","institution":"Shenzhen City Polytechnic","correspondingAuthor":false,"prefix":"","firstName":"Yinbin","middleName":"","lastName":"Zhang","suffix":""},{"id":636762232,"identity":"991f4f69-baa2-48db-b232-d94aa86d723f","order_by":2,"name":"Weiwei Chen","email":"","orcid":"","institution":"Shenzhen City Polytechnic","correspondingAuthor":false,"prefix":"","firstName":"Weiwei","middleName":"","lastName":"Chen","suffix":""},{"id":636762233,"identity":"3bd7c1f3-1fdc-4e5a-a694-6aa7ce180de8","order_by":3,"name":"Zhenjun Li","email":"","orcid":"","institution":"Shenzhen City 
Polytechnic","correspondingAuthor":false,"prefix":"","firstName":"Zhenjun","middleName":"","lastName":"Li","suffix":""},{"id":636762236,"identity":"b80bf4a9-e99f-4c9e-b0d6-d22627e4d0aa","order_by":4,"name":"Bing Zhou","email":"","orcid":"","institution":"Shenzhen City Polytechnic","correspondingAuthor":false,"prefix":"","firstName":"Bing","middleName":"","lastName":"Zhou","suffix":""}],"badges":[],"createdAt":"2026-03-01 08:24:11","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-9000115/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-9000115/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":109759960,"identity":"f3e335dc-70c2-42f7-89c6-50cae72aada8","added_by":"auto","created_at":"2026-05-22 07:27:59","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":1269299,"visible":true,"origin":"","legend":"","description":"","filename":"manuscript.pdf","url":"https://assets-eu.researchsquare.com/files/rs-9000115/v1_covered_49b5f84e-9742-490e-a67f-aa05328d0386.pdf"}],"financialInterests":"No competing interests reported.","formattedTitle":"Refinement Feature Fusion for Multi-model Depression Detection","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":false,"hideJournal":false,"highlight":"","institution":"","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"discover-artificial-intelligence","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":false,"externalIdentity":"diai","sideBox":"Learn more about [Discover Artificial Intelligence](https://www.springer.com/44163)","snPcode":"","submissionUrl":"","title":"Discover Artificial Intelligence","twitterHandle":"","acdcEnabled":true,"dfaEnabled":true,"editorialSystem":"stoa","reportingPortfolio":"Discover Series","inReviewEnabled":true,"inReviewRevisionsEnabled":true},"keywords":"Depression Detection, Multimodal, Data Fusion, Large Model, Artificial Intelligence","lastPublishedDoi":"10.21203/rs.3.rs-9000115/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-9000115/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"\u003cp\u003eThis paper proposes UniMDD (Unified Multi-modal Depression Detection), a novel framework designed for multimodal depression detection based on video interviews. It employs an efficient multi-model data fusion strategy, comprising four sequential modules: feature extraction, feature fusion, feature refinement, and depression detection. Specifically, the feature extraction module leverages OpenFace for facial features, VGGish for audio features, and a Large Language Model for textual features. The core innovation lies in the fusion and refinement stages. The feature fusion module designs a Mixed Attention Network(MAN) to capture intricate cross-modal interactions and effectively integrate complementary information from different data streams. Subsequently, the feature refinement module constructs an EHA-Attention mechanism within decoder blocks. This serves a dual purpose: it generates higher-level semantic and contextual representations while mitigating potential inter-modal conflicts and enhancing the comprehension of each individual modality. Finally, the refined features are fed into the depression detection module for prediction. 
Experimental validation on the LMVD dataset demonstrates that UniMDD achieves an accuracy of 76.68%, surpassing the current state-of-the-art method, DepMamba, by a margin of 4.55 percentage points. These results underscore the advancement and robustness of the proposed UniMDD framework.\u003c/p\u003e","manuscriptTitle":"Refinement Feature Fusion for Multi-model Depression Detection","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2026-05-18 03:32:09","doi":"10.21203/rs.3.rs-9000115/v1","editorialEvents":[{"type":"communityComments","content":0},{"type":"reviewersInvited","content":"","date":"2026-05-08T08:29:16+00:00","index":"","fulltext":""},{"type":"editorAssigned","content":"","date":"2026-03-10T10:25:57+00:00","index":"","fulltext":""},{"type":"checksComplete","content":"","date":"2026-03-10T07:25:27+00:00","index":"","fulltext":""},{"type":"submitted","content":"Discover Artificial Intelligence","date":"2026-03-10T06:40:20+00:00","index":"","fulltext":""}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"discover-artificial-intelligence","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":false,"externalIdentity":"diai","sideBox":"Learn more about [Discover Artificial Intelligence](https://www.springer.com/44163)","snPcode":"","submissionUrl":"","title":"Discover Artificial Intelligence","twitterHandle":"","acdcEnabled":true,"dfaEnabled":true,"editorialSystem":"stoa","reportingPortfolio":"Discover Series","inReviewEnabled":true,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"e1293a32-0a92-4596-b4dd-10522f05ee36","owner":[],"postedDate":"May 18th, 2026","published":true,"recentEditorialEvents":[{"type":"reviewersInvited","content":"25","date":"2026-05-08T08:29:16+00:00","index":"","fulltext":""}],"rejectedJournal":[],"revision":"","amendment":"","status":"under-review","subjectAreas":[],"tags":[],"updatedAt":"2026-05-18T03:32:09+00:00","versionOfRecord":[],"versionCreatedAt":"2026-05-18 03:32:09","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-9000115","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-9000115","identity":"rs-9000115","version":["v1"]},"buildId":"8U1c8b4HqxoKbykW_rLl7","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.