Multimodal Human Behavior Recognition Based on Contextual Semantics and Skeleton | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Research Article Multimodal Human Behavior Recognition Based on Contextual Semantics and Skeleton Cheng Liu, Kaile Ni, Xirui Wang, Jiaqing Fei, Kecheng Song, Jingqian Gu, and 2 more This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-4722459/v1 This work is licensed under a CC BY 4.0 License Status: Posted Version 1 posted You are reading this latest preprint version Abstract Skeleton-based human behavior recognition has been widely studied due to its efficiency and robustness to complex backgrounds. While skeleton data accurately captures the dynamic changes in human posture, it overly relies on the quality of skeleton data and lacks interaction with the environment. In cases where skeleton keypoints are missing, using only skeleton data for behavior recognition results in significantly reduced performance. 
To address these issues, this paper proposes a method for behavior recognition that combines contextual semantics with skeleton detection. It fully considers the correlation between human skeletons, objects, and the interaction between humans and objects. When recognizing behaviors from human skeletons, this method simultaneously identifies objects near human skeletons and performs multimodal fusion after forming semantic information. It utilizes transformer-based semantic similarity calculation to determine the possible correlation between behaviors and targets and finally combines the scores from two stages to obtain the final prediction results. Experimental results show that on the UCF101 dataset, which is closer to real-world scenarios, the proposed method achieves an 8.4% improvement in accuracy compared to PoseConv3D. Behavior recognition Contextual semantics Semantic similarity Multimodal Full Text Additional Declarations No competing interests reported. Cite Share Download PDF Status: Posted Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-4722459","acceptedTermsAndConditions":true,"allowDirectSubmit":true,"archivedVersions":[],"articleType":"Research Article","associatedPublications":[],"authors":[{"id":329649764,"identity":"a44953e1-8c9f-4af4-ba91-5351cbd47eb9","order_by":0,"name":"Cheng Liu","email":"","orcid":"","institution":"Northwest University","correspondingAuthor":false,"prefix":"","firstName":"Cheng","middleName":"","lastName":"Liu","suffix":""},{"id":329649765,"identity":"378dd80c-0318-4a0d-bf96-e0b70d43e194","order_by":1,"name":"Kaile Ni","email":"","orcid":"","institution":"Northwest University","correspondingAuthor":false,"prefix":"","firstName":"Kaile","middleName":"","lastName":"Ni","suffix":""},{"id":329649766,"identity":"84736efb-dbe0-4cab-b7fe-112398943c7e","order_by":2,"name":"Xirui Wang","email":"","orcid":"","institution":"Northwest University","correspondingAuthor":false,"prefix":"","firstName":"Xirui","middleName":"","lastName":"Wang","suffix":""},{"id":329649767,"identity":"6257239f-9221-4f2e-900e-f24e0992cd05","order_by":3,"name":"Jiaqing Fei","email":"","orcid":"","institution":"Northwest University","correspondingAuthor":false,"prefix":"","firstName":"Jiaqing","middleName":"","lastName":"Fei","suffix":""},{"id":329649768,"identity":"5e14ea5d-df07-4448-baa2-c7e92834e10c","order_by":4,"name":"Kecheng Song","email":"","orcid":"","institution":"Northwest University","correspondingAuthor":false,"prefix":"","firstName":"Kecheng","middleName":"","lastName":"Song","suffix":""},{"id":329649769,"identity":"5d7a4d20-d244-418b-af9e-2a6844919722","order_by":5,"name":"Jingqian Gu","email":"","orcid":"","institution":"Northwest 
University","correspondingAuthor":false,"prefix":"","firstName":"Jingqian","middleName":"","lastName":"Gu","suffix":""},{"id":329649770,"identity":"74abedcc-11e2-4667-8282-95b5e65e7e98","order_by":6,"name":"Zehao Hu","email":"","orcid":"","institution":"Northwest University","correspondingAuthor":false,"prefix":"","firstName":"Zehao","middleName":"","lastName":"Hu","suffix":""},{"id":329649771,"identity":"4dfd0cf4-9b9c-4728-b0cb-f6b9e8eec6da","order_by":7,"name":"Lin Wang","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAzUlEQVRIiWNgGAWjYLCCBwYMchAWG7FaEgwYjEnVwsCQ2EC0FoPjZw+/SCi4k75dIseA4UPZYQb+2Q0EtJzJS7NIMHiWu3NGjgHjjHOHGSTuHMCvxexAjplBgsHh3A23cwyYedsOMxhIJBDQcv4NWEu6AUjLX6K03MgxfgDUkgDWwkiMFvsbb8yAgXzYcMP9ZwUHe86l80jcIKBFsj/H+MOHP4flDc4c3vjgR5m1HP8MAlqAgE0CxjoAxDwE1QMB8wdiVI2CUTAKRsEIBgDGYEbwSb79TwAAAABJRU5ErkJggg==","orcid":"","institution":"Northwest University","correspondingAuthor":true,"prefix":"","firstName":"Lin","middleName":"","lastName":"Wang","suffix":""}],"badges":[],"createdAt":"2024-07-11 07:24:36","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-4722459/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-4722459/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":65119207,"identity":"b0efe569-3d95-4fdb-b27e-650ed212995e","added_by":"auto","created_at":"2024-09-23 21:01:41","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":3029416,"visible":true,"origin":"","legend":"","description":"","filename":"ArticleTitle1.pdf","url":"https://assets-eu.researchsquare.com/files/rs-4722459/v1_covered_1382a456-491a-4dfd-b2a7-64549ec9adf9.pdf"}],"financialInterests":"No competing interests reported.","formattedTitle":"Multimodal Human Behavior Recognition Based on Contextual Semantics and 
Skeleton","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":false,"hideJournal":true,"highlight":"","institution":"","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true},"keywords":"Behavior recognition, Contextual semantics, Semantic similarity, Multimodal","lastPublishedDoi":"10.21203/rs.3.rs-4722459/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-4722459/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"Skeleton-based human behavior recognition has been widely studied due to its efficiency and robustness to complex backgrounds. While skeleton data accurately captures the dynamic changes in human posture, it overly relies on the quality of skeleton data and lacks interaction with the environment. In cases where skeleton keypoints are missing, using only skeleton data for behavior recognition results in significantly reduced performance. To address these issues, this paper proposes a method for behavior recognition that combines contextual semantics with skeleton detection. It fully considers the correlation between human skeletons, objects, and the interaction between humans and objects. When recognizing behaviors from human skeletons, this method simultaneously identifies objects near human skeletons and performs multimodal fusion after forming semantic information. It utilizes transformer-based semantic similarity calculation to determine the possible correlation between behaviors and targets and finally combines the scores from two stages to obtain the final prediction results. 
Experimental results show that on the UCF101 dataset, which is closer to real-world scenarios, the proposed method achieves an 8.4% improvement in accuracy compared to PoseConv3D.","manuscriptTitle":"Multimodal Human Behavior Recognition Based on Contextual Semantics and Skeleton","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2024-08-06 18:12:44","doi":"10.21203/rs.3.rs-4722459/v1","editorialEvents":[{"type":"communityComments","content":0}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"8f459067-d3f4-4aed-822d-e7d50826266a","owner":[],"postedDate":"August 6th, 2024","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"posted","subjectAreas":[],"tags":[],"updatedAt":"2024-09-23T20:53:32+00:00","versionOfRecord":[],"versionCreatedAt":"2024-08-06 18:12:44","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-4722459","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-4722459","identity":"rs-4722459","version":["v1"]},"buildId":"qtupq5eGEP_6zYnWcrvyt","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.