Adaptive Sparse Multimodal Transformer for Efficient Action Recognition on Resource-Constrained Edge Devices | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Research Article Adaptive Sparse Multimodal Transformer for Efficient Action Recognition on Resource-Constrained Edge Devices Nnaemeka Kingsley Ugwumba This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-8168079/v1 This work is licensed under a CC BY 4.0 License Status: Posted Version 1 posted You are reading this latest preprint version Abstract Multimodal sensing platforms such as wearable devices, mobile robots, and smart environments increasingly require real-time interpretation of visual, acoustic, and inertial data under stringent computational and energy constraints. Although Transformer-based architectures provide strong representational capacity, their quadratic attention complexity limits practical deployment on resource-constrained systems. 
This paper presents the Adaptive Sparse Multimodal Transformer (ASMT), a content-adaptive sparse attention framework designed for efficient multimodal action recognition. ASMT introduces a lightweight token-importance gating module that selects a compact subset of informative tokens across modalities, enabling attention computation on a reduced sequence while preserving cross-modal dependencies. Unlike fixed-pattern sparsity methods, ASMT dynamically adapts token selection to input characteristics, improving efficiency without degrading accuracy. Experiments on two widely used multimodal benchmarks, MMAct and UTD-MHAD, demonstrate that ASMT achieves accuracy comparable to state-of-the-art multimodal Transformers while reducing attention FLOPs by up to 63 percent and lowering total inference latency by 41 percent on edge-oriented hardware. These results indicate that ASMT provides a practical and scalable architecture for real-time multimodal inference in embedded and mobile applications. Artificial Intelligence and Machine Learning NeuroSparse Transformers sparse attention multimodal learning real-time processing edge AI biologically-inspired AI spiking neural networks efficient transformers sensor fusion computational efficiency stream processing action recognition resource-constrained deployment attention mechanisms neuromorphic computing Full Text Additional Declarations The authors declare no competing interests. Cite Share Download PDF Status: Posted Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. 
Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-8168079","acceptedTermsAndConditions":true,"allowDirectSubmit":true,"archivedVersions":[],"articleType":"Research Article","associatedPublications":[],"authors":[{"id":550535475,"identity":"4ff169a5-7ffb-4742-afba-9fb54ae57568","order_by":0,"name":"Nnaemeka Kingsley Ugwumba","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAABMklEQVRIiWNgGAWjYPCCBBjjAAM/mIQAA+K0SDZAtEgQr8UAagVOLebSzQ8f3ahIk2eQSL8mzZtzR974Ru7BwwUV9+oY2Ju3STDuqEXXYjnnmLFxzpkcwwaJnDJp3m3PDLfdyEs4PONMsQQDz7EyCcYzx9G1GNxIMJPObatgBGpJu8277TDjths5Bod52xIkGCRyzCQY245hakn//jv3X4U9TIv95hkwLfJvcGjJMWPObchJbJBIPwbSkrhBAm4LD0hLDRYtxdI5x9KSG3jesP+cu+1Z8owzbwyAfkmQbONJK7ZIbDuAxWEbP+fUJNs2sKc/Nni77Y5tf3uO8eeCigR+fvbDG298bKvDFtBgYH+ABxELzCCCDUQkMBzGqYWBgf0BqhYowG3LKBgFo2AUjBQAAIY0dhyXTjBlAAAAAElFTkSuQmCC","orcid":"https://orcid.org/0009-0000-2493-9846","institution":"University of Port Harcourt, Port Harcourt, Nigeria","correspondingAuthor":true,"prefix":"","firstName":"Nnaemeka","middleName":"Kingsley","lastName":"Ugwumba","suffix":""}],"badges":[],"createdAt":"2025-11-20 
22:39:36","currentVersionCode":1,"declarations":{"humanSubjects":false,"vertebrateSubjects":true,"conflictsOfInterestStatement":false,"humanSubjectEthicalGuidelines":false,"humanSubjectConsent":false,"humanSubjectClinicalTrial":false,"humanSubjectCaseReport":false,"vertebrateSubjectEthicalGuidelines":true},"doi":"10.21203/rs.3.rs-8168079/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-8168079/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":96794541,"identity":"c16429ed-cbf9-4dd2-aaa9-7d51d57c0f07","added_by":"auto","created_at":"2025-11-26 07:18:01","extension":"docx","order_by":0,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":165607,"visible":true,"origin":"","legend":"","description":"","filename":"ASMT.docx","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/dc71a16723f924eed7e16e79.docx"},{"id":96915935,"identity":"4201bd15-89f6-440a-820f-bd12e4b817cf","added_by":"auto","created_at":"2025-11-27 14:07:47","extension":"json","order_by":1,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":342,"visible":true,"origin":"","legend":"","description":"","filename":"rs8168079.json","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/82f2e910776b819a59407548.json"},{"id":96916403,"identity":"a95f8f91-c1c0-483c-adaf-a343c4b41abc","added_by":"auto","created_at":"2025-11-27 14:08:33","extension":"xml","order_by":2,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":61123,"visible":true,"origin":"","legend":"","description":"","filename":"rs81680790enriched.xml","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/953b00548e62ec6df4785899.xml"},{"id":96915536,"identity":"49757f9d-4f7c-4cb9-9175-e1fb32bc1133","added_by":"auto","created_at":"2025-11-27 
14:07:21","extension":"png","order_by":3,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":52186,"visible":true,"origin":"","legend":"","description":"","filename":"floatimage1.png","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/5de7da7f5dc6e8876629a757.png"},{"id":96794540,"identity":"05198959-350c-47f5-aefd-c860c4d14ab0","added_by":"auto","created_at":"2025-11-26 07:18:01","extension":"png","order_by":4,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":2620,"visible":true,"origin":"","legend":"","description":"","filename":"floatimage2.png","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/724f65d99995c910fec4f13b.png"},{"id":96794546,"identity":"9fd7c339-87fa-4f04-92a7-618d1db56d0c","added_by":"auto","created_at":"2025-11-26 07:18:01","extension":"png","order_by":5,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":18049,"visible":true,"origin":"","legend":"","description":"","filename":"floatimage3.png","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/9c27958fa0ffc28b45e0f8e6.png"},{"id":96794549,"identity":"087e6deb-b100-4cf8-80fe-43ad8043bc7a","added_by":"auto","created_at":"2025-11-26 07:18:01","extension":"png","order_by":6,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":60252,"visible":true,"origin":"","legend":"","description":"","filename":"floatimage4.png","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/f351df75fd878f230ac38638.png"},{"id":96916978,"identity":"66422db6-7c8f-4c6b-abd9-179262fb86d6","added_by":"auto","created_at":"2025-11-27 
14:09:06","extension":"png","order_by":7,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":11854,"visible":true,"origin":"","legend":"","description":"","filename":"Onlinefloatimage1.png","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/c8d9746d4cf4c254ddfa7b0e.png"},{"id":96916745,"identity":"0422dbe0-9723-46bd-92f0-5a1855752b3e","added_by":"auto","created_at":"2025-11-27 14:08:52","extension":"png","order_by":8,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":1719,"visible":true,"origin":"","legend":"","description":"","filename":"Onlinefloatimage2.png","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/46c7ee2fa6d6cc0df7e2d448.png"},{"id":96794547,"identity":"a72a5d18-3725-4a57-8421-e677f9c00486","added_by":"auto","created_at":"2025-11-26 07:18:01","extension":"png","order_by":9,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":9029,"visible":true,"origin":"","legend":"","description":"","filename":"Onlinefloatimage3.png","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/eab4c89f5d12a4bc8ee66e08.png"},{"id":96794551,"identity":"692a57a9-87d8-4378-824a-a4216c8bee84","added_by":"auto","created_at":"2025-11-26 07:18:02","extension":"png","order_by":10,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":14530,"visible":true,"origin":"","legend":"","description":"","filename":"Onlinefloatimage4.png","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/01c21cbfe80d945ee1edeb31.png"},{"id":96915647,"identity":"9d23f7cd-8407-4ab2-96f7-6de9bb970c64","added_by":"auto","created_at":"2025-11-27 
14:07:27","extension":"xml","order_by":11,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":60285,"visible":true,"origin":"","legend":"","description":"","filename":"rs81680790structuring.xml","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/d6717c9ff35dbfcb6cab5f74.xml"},{"id":96794552,"identity":"fae4f24b-c9f1-47a4-b8f3-ae81e53ce934","added_by":"auto","created_at":"2025-11-26 07:18:02","extension":"html","order_by":12,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":71520,"visible":true,"origin":"","legend":"","description":"","filename":"earlyproof.html","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1/4563af80d50b0dc518cf7294.html"},{"id":96922857,"identity":"931f2d0e-2db0-4da4-bc57-05e69353cde9","added_by":"auto","created_at":"2025-11-27 14:20:03","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":382328,"visible":true,"origin":"","legend":"","description":"","filename":"ASMT.pdf","url":"https://assets-eu.researchsquare.com/files/rs-8168079/v1_covered_6c205d23-adef-4a9e-baa1-b214ebf23172.pdf"}],"financialInterests":"The authors declare no competing interests.","formattedTitle":"\u003cp\u003e\u003cstrong\u003eAdaptive Sparse Multimodal Transformer for Efficient Action Recognition on Resource-Constrained Edge Devices\u003c/strong\u003e\u003c/p\u003e","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":true,"hideJournal":true,"highlight":"","institution":"Laskenta Technologies Limited","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true},"keywords":"NeuroSparse Transformers, sparse attention, multimodal learning, real-time processing, edge AI, biologically-inspired AI, spiking neural networks, efficient transformers, sensor fusion, computational efficiency, stream processing, action recognition, resource-constrained deployment, attention mechanisms, neuromorphic computing","lastPublishedDoi":"10.21203/rs.3.rs-8168079/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-8168079/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"\u003cp\u003eMultimodal sensing platforms such as wearable devices, mobile robots, and smart environments increasingly require real-time interpretation of visual, acoustic, and inertial data under stringent computational and energy constraints. Although Transformer-based architectures provide strong representational capacity, their quadratic attention complexity limits practical deployment on resource-constrained systems. This paper presents the Adaptive Sparse Multimodal Transformer (ASMT), a content-adaptive sparse attention framework designed for efficient multimodal action recognition. ASMT introduces a lightweight token-importance gating module that selects a compact subset of informative tokens across modalities, enabling attention computation on a reduced sequence while preserving cross-modal dependencies. Unlike fixed-pattern sparsity methods, ASMT dynamically adapts token selection to input characteristics, improving efficiency without degrading accuracy. 
Experiments on two widely used multimodal benchmarks, MMAct and UTD-MHAD, demonstrate that ASMT achieves accuracy comparable to state-of-the-art multimodal Transformers while reducing attention FLOPs by up to 63 percent and lowering total inference latency by 41 percent on edge-oriented hardware. These results indicate that ASMT provides a practical and scalable architecture for real-time multimodal inference in embedded and mobile applications.\u003c/p\u003e","manuscriptTitle":"Adaptive Sparse Multimodal Transformer for Efficient Action Recognition on Resource-Constrained Edge Devices","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2025-11-26 07:17:57","doi":"10.21203/rs.3.rs-8168079/v1","editorialEvents":[{"type":"communityComments","content":0}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"2ebaabe4-3c99-4688-8619-ac5d173b08b4","owner":[],"postedDate":"November 26th, 2025","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"posted","subjectAreas":[{"id":58577205,"name":"Artificial Intelligence and Machine Learning"}],"tags":[],"updatedAt":"2025-11-26T07:17:57+00:00","versionOfRecord":[],"versionCreatedAt":"2025-11-26 07:17:57","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-8168079","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-8168079","identity":"rs-8168079","version":["v1"]},"buildId":"8U1c8b4HqxoKbykW_rLl7","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.