Text-Guided Dynamic Structure Prediction for Category-Agnostic Pose Estimation | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Research Article Text-Guided Dynamic Structure Prediction for Category-Agnostic Pose Estimation Khanh-Duy Nguyen, Guan-Bin Lin, Quoc-Viet Nguyen, Huy-Hoang Vo, and 3 more This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-9101076/v1 This work is licensed under a CC BY 4.0 License Status: Under Review Version 1 posted 4 You are reading this latest preprint version Abstract In the field of computer vision, keypoint detection is a crucial area of research for understanding the structure of an object. This has led to the emergence of Category-Agnostic Pose Estimation (CAPE), a novel task that utilizes a single model to localize keypoints across diverse object categories. To broaden its application, recent research has incorporated text descriptions into the CAPE task. 
However, when extending text descriptions to the CAPE task, shortcomings in skeleton information learning and graph information propagation persist, making it difficult for the model to utilize comprehensive structural information, which in turn affects the precision of keypoint localization. In this paper, we propose a model that integrates a Semantic Skeleton Refiner and optimizes the Graph Transformer Decoder architecture, using features from text descriptions as a guidance for dynamic structure prediction. We conducted experiments on the MP-100 dataset and compared our model with existing state-of-the-art models in the CAPE domain. The experimental results indicate that our method outperforms current leading models in the field of CAPE. CAPE Pose Estimation Text Guidance Graph Transformer Skeleton Modeling Keypoints Full Text Additional Declarations No competing interests reported. Cite Share Download PDF Status: Under Review Version 1 posted Reviewers invited by journal 11 May, 2026 Editor assigned by journal 12 Mar, 2026 Submission checks completed at journal 12 Mar, 2026 First submitted to journal 12 Mar, 2026 You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-9101076","acceptedTermsAndConditions":true,"allowDirectSubmit":false,"archivedVersions":[],"articleType":"Research Article","associatedPublications":[],"authors":[{"id":638661582,"identity":"e9904c1e-74c7-4dcd-9e51-2539b3d72fd6","order_by":0,"name":"Khanh-Duy Nguyen","email":"","orcid":"","institution":"National Central University","correspondingAuthor":false,"prefix":"","firstName":"Khanh-Duy","middleName":"","lastName":"Nguyen","suffix":""},{"id":638661583,"identity":"b332b05e-4e62-4321-9666-05386fbbfa03","order_by":1,"name":"Guan-Bin Lin","email":"","orcid":"","institution":"National Central University","correspondingAuthor":false,"prefix":"","firstName":"Guan-Bin","middleName":"","lastName":"Lin","suffix":""},{"id":638661584,"identity":"884f6ad7-6562-4af6-84a2-f1ed6cbaa6d8","order_by":2,"name":"Quoc-Viet Nguyen","email":"","orcid":"","institution":"National Central University","correspondingAuthor":false,"prefix":"","firstName":"Quoc-Viet","middleName":"","lastName":"Nguyen","suffix":""},{"id":638661585,"identity":"c468a819-57c3-4f80-971c-f1bf81e00cef","order_by":3,"name":"Huy-Hoang Vo","email":"","orcid":"","institution":"National Central University","correspondingAuthor":false,"prefix":"","firstName":"Huy-Hoang","middleName":"","lastName":"Vo","suffix":""},{"id":638661586,"identity":"6a85a710-07d0-4343-a578-7b45e628078b","order_by":4,"name":"Min-Te 
Sun","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAz0lEQVRIiWNgGAWjYLCCDwwMPDA2YwMxOhhnoGphJqyFmQeJQ1iLbv8Zs8c2NXUy/Ay8Bz/zMNjIbjjAf0wCnxazGznmxjnHDvNINvAlS/MwpBlvOMDMRkALj5l0bsMBHoMDPGZAFx5OBGm5gVfL+TNm0pYNdTz2EC3/idByIMdMmrGBmceAAazlABFabqSVSfYA/SJxmMdYco5BsvHMw8zmP/A77PA2iR81dfb87T2GH95U2Mn2HW98bIBPCwKAY8MAxhgFo2AUjIJRQBEAAKKNPqTxs6q9AAAAAElFTkSuQmCC","orcid":"","institution":"National Central University","correspondingAuthor":true,"prefix":"","firstName":"Min-Te","middleName":"","lastName":"Sun","suffix":""},{"id":638661587,"identity":"c482aec0-1440-4056-8280-5ca9e18089e5","order_by":5,"name":"Kazuya Sakai","email":"","orcid":"","institution":"Tokyo Metropolitan University","correspondingAuthor":false,"prefix":"","firstName":"Kazuya","middleName":"","lastName":"Sakai","suffix":""},{"id":638661588,"identity":"95edea71-d95a-4b26-b070-1f62919d3b59","order_by":6,"name":"Wei-Shinn Ku","email":"","orcid":"","institution":"Auburn University","correspondingAuthor":false,"prefix":"","firstName":"Wei-Shinn","middleName":"","lastName":"Ku","suffix":""}],"badges":[],"createdAt":"2026-03-12 06:53:39","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-9101076/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-9101076/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":109222265,"identity":"9576f33e-72ca-4a31-a842-89f2df84f780","added_by":"auto","created_at":"2026-05-13 21:06:42","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":598590,"visible":true,"origin":"","legend":"","description":"","filename":"DudleyMVATextGuidedDynamicStructurePredictionforCategoryAgnosticPoseEstimation.pdf","url":"https://assets-eu.researchsquare.com/files/rs-9101076/v1_covered_2e1db8d7-fd51-4f4e-ac10-b3caef3504a3.pdf"}],"financialInterests":"No competing interests reported.","formattedTitle":"Text-Guided 
Dynamic Structure Prediction for Category-Agnostic Pose Estimation","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":false,"hideJournal":false,"highlight":"","institution":"","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"machine-vision-and-applications","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":false,"externalIdentity":"mvap","sideBox":"Learn more about [Machine Vision and Applications](https://www.springer.com/journal/138)","snPcode":"138","submissionUrl":"https://submission.springernature.com/new-submission/138/3","title":"Machine Vision and Applications","twitterHandle":"","acdcEnabled":true,"dfaEnabled":true,"editorialSystem":"stoa","reportingPortfolio":"Springer Hybrid","inReviewEnabled":true,"inReviewRevisionsEnabled":false},"keywords":"CAPE, Pose Estimation, Text Guidance, Graph Transformer, Skeleton Modeling, Keypoints","lastPublishedDoi":"10.21203/rs.3.rs-9101076/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-9101076/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"\nIn the field of computer vision, keypoint detection is a crucial area of research for understanding the structure of an object. This has led to the emergence of Category-Agnostic Pose Estimation (CAPE), a novel task that utilizes a single model to localize keypoints across diverse object categories.\nTo broaden its application, recent research has incorporated text descriptions into the CAPE task. However, when extending text descriptions to the CAPE task, shortcomings in skeleton information learning and graph information propagation persist, making it difficult for the model to utilize comprehensive structural information, which in turn affects the precision of keypoint localization. In this paper, we propose a model that integrates a Semantic Skeleton Refiner and optimizes the Graph Transformer Decoder architecture, using features from text descriptions as a guidance for dynamic structure prediction. We conducted experiments on the MP-100 dataset and compared our model with existing state-of-the-art models in the CAPE domain. 
The experimental results indicate that our method outperforms current leading models in the field of CAPE.\n","manuscriptTitle":"Text-Guided Dynamic Structure Prediction for Category-Agnostic Pose Estimation","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2026-05-13 17:37:54","doi":"10.21203/rs.3.rs-9101076/v1","editorialEvents":[{"type":"communityComments","content":0},{"type":"reviewersInvited","content":"","date":"2026-05-12T03:20:44+00:00","index":"","fulltext":""},{"type":"editorAssigned","content":"","date":"2026-03-12T12:44:18+00:00","index":"","fulltext":""},{"type":"checksComplete","content":"","date":"2026-03-12T12:43:53+00:00","index":"","fulltext":""},{"type":"submitted","content":"Machine Vision and Applications","date":"2026-03-12T06:47:54+00:00","index":"","fulltext":""}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"machine-vision-and-applications","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":false,"externalIdentity":"mvap","sideBox":"Learn more about [Machine Vision and Applications](https://www.springer.com/journal/138)","snPcode":"138","submissionUrl":"https://submission.springernature.com/new-submission/138/3","title":"Machine Vision and Applications","twitterHandle":"","acdcEnabled":true,"dfaEnabled":true,"editorialSystem":"stoa","reportingPortfolio":"Springer Hybrid","inReviewEnabled":true,"inReviewRevisionsEnabled":false}}],"origin":"","ownerIdentity":"15611320-e1d0-4759-bb33-f7d9f7c6f146","owner":[],"postedDate":"May 13th, 2026","published":true,"recentEditorialEvents":[{"type":"reviewersInvited","content":"9","date":"2026-05-12T03:20:44+00:00","index":"","fulltext":""}],"rejectedJournal":[],"revision":"","amendment":"","status":"under-review","subjectAreas":[],"tags":[],"updatedAt":"2026-05-13T17:37:54+00:00","versionOfRecord":[],"versionCreatedAt":"2026-05-13 17:37:54","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-9101076","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-9101076","identity":"rs-9101076","version":["v1"]},"buildId":"XKTyCvWXoU3ODBz1xrDgd","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.