Enhanced Techniques for Pedestrian Localization and Movement Prediction with Single-View Angle Analysis | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Research Article Enhanced Techniques for Pedestrian Localization and Movement Prediction with Single-View Angle Analysis Lin Gao, Handong Li, Yanyun Li, Fan Liu This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-4855405/v1 This work is licensed under a CC BY 4.0 License Status: Posted Version 1 posted You are reading this latest preprint version Abstract Based on a single-view scenario, our approach employs a unified Transformer framework featuring an enhanced multi-scale sparse attention mechanism to simultaneously perform three tasks: multi-pedestrian 3D pose estimation, tracking, and prediction. Initially, video data is processed to extract information, followed by training a video transformer to encode spatio-temporal features from multiple frames. This transformer decodes significant pose features from multi-person pose queries. 
These pose queries are then used for regression to predict multi-person pose trajectories and future movements in a single shot. To mitigate the challenges of occlusion and the complexity of pedestrian motion, we utilize a backbone network to extract features and implement an improved multi-scale spatio-temporal attention mechanism. This mechanism aggregates spatio-temporal information from multiple frames at various scales and captures long-term interactions. The backbone network excels at extracting detailed features from video data, while the multi-scale spatio-temporal attention mechanism, with its compact parameters, ensures a balance between efficiency and accuracy. Consequently, the integration of these components enhances prediction accuracy without excessively increasing model parameters. Transformer framework multi-frame clips multi-scale sparse attention mechanism multi-scale spatio-temporal attention mechanism Full Text Additional Declarations No competing interests reported. Cite Share Download PDF Status: Posted Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-4855405","acceptedTermsAndConditions":true,"allowDirectSubmit":true,"archivedVersions":[],"articleType":"Research Article","associatedPublications":[],"authors":[{"id":339440575,"identity":"1b675bf5-0c3a-49d6-a81a-8943cea66a46","order_by":0,"name":"Lin Gao","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAA9UlEQVRIiWNgGAWjYHACxgNAQo6fvQHCayBGD0iLsWTPARK1JG64kUCkFnn/AwwHv9TYMDbcfH75Mw+DjeyGA8zPHuDTYgg0/LDMsTRmxtk5BcY8DGnGGw6wmRvg1TKDgeGwBNthNmbpnIRkHobDiRsO8LBJ4NXSfwCo5d9hHjbJMwmHeRj+E9Yiz5DAcPBj22EJHgn2g808DAcIazGQSGw4zNiXZiDBk8PMOMcg2XjmYTYz/Lb0Hz748Mc3m/r9x48//vCmwk6273jzM/y2HGBsYOYBM3mA4QQKKmZ86kG2NABj7weYyf6AgNpRMApGwSgYqQAAWG9MYvvXamMAAAAASUVORK5CYII=","orcid":"","institution":"Guizhou University","correspondingAuthor":true,"prefix":"","firstName":"Lin","middleName":"","lastName":"Gao","suffix":""},{"id":339440576,"identity":"477f2118-fba8-447e-86be-10d496fcbc6a","order_by":1,"name":"Handong Li","email":"","orcid":"","institution":"Guizhou University","correspondingAuthor":false,"prefix":"","firstName":"Handong","middleName":"","lastName":"Li","suffix":""},{"id":339440577,"identity":"fac40402-87c8-4c1a-beaf-b1f8bb18e5a1","order_by":2,"name":"Yanyun Li","email":"","orcid":"","institution":"Guizhou University","correspondingAuthor":false,"prefix":"","firstName":"Yanyun","middleName":"","lastName":"Li","suffix":""},{"id":339440578,"identity":"6721ebdd-429f-4ab9-bebc-e83bf6e4380a","order_by":3,"name":"Fan Liu","email":"","orcid":"","institution":"Guizhou 
University","correspondingAuthor":false,"prefix":"","firstName":"Fan","middleName":"","lastName":"Liu","suffix":""}],"badges":[],"createdAt":"2024-08-04 05:08:20","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-4855405/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-4855405/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":64002957,"identity":"6bfb0838-6f8f-498f-b652-cd0bcbd864f4","added_by":"auto","created_at":"2024-09-04 20:42:15","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":506283,"visible":true,"origin":"","legend":"","description":"","filename":"EnhancedTechniquesforPedestrianLocalizationandMovementPredictionwithSingleViewAngleAnalysis.pdf","url":"https://assets-eu.researchsquare.com/files/rs-4855405/v1_covered_06f341c7-49cb-4bcd-9d90-925fba7208fa.pdf"}],"financialInterests":"No competing interests reported.","formattedTitle":"Enhanced Techniques for Pedestrian Localization and Movement Prediction with Single-View Angle Analysis","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":false,"hideJournal":true,"highlight":"","institution":"","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true},"keywords":"Transformer framework, multi-frame clips, multi-scale sparse attention mechanism, multi-scale spatio-temporal attention mechanism","lastPublishedDoi":"10.21203/rs.3.rs-4855405/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-4855405/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"\u003cp\u003eBased on a single-view scenario, our approach employs a unified Transformer framework featuring an enhanced multi-scale sparse attention mechanism to simultaneously perform three tasks: multi-pedestrian 3D pose estimation, tracking, and prediction. Initially, video data is processed to extract information, followed by training a video transformer to encode spatio-temporal features from multiple frames. This transformer decodes significant pose features from multi-person pose queries. These pose queries are then used for regression to predict multi-person pose trajectories and future movements in a single shot. To mitigate the challenges of occlusion and the complexity of pedestrian motion, we utilize a backbone network to extract features and implement an improved multi-scale spatio-temporal attention mechanism. This mechanism aggregates spatio-temporal information from multiple frames at various scales and captures long-term interactions. The backbone network excels at extracting detailed features from video data, while the multi-scale spatio-temporal attention mechanism, with its compact parameters, ensures a balance between efficiency and accuracy. 
Consequently, the integration of these components enhances prediction accuracy without excessively increasing model parameters.\u003c/p\u003e","manuscriptTitle":"Enhanced Techniques for Pedestrian Localization and Movement Prediction with Single-View Angle Analysis","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2024-08-30 06:12:07","doi":"10.21203/rs.3.rs-4855405/v1","editorialEvents":[{"type":"communityComments","content":0}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"9bddd390-4ae2-46ca-8283-82019bc09a5d","owner":[],"postedDate":"August 30th, 2024","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"posted","subjectAreas":[],"tags":[],"updatedAt":"2024-09-04T20:34:07+00:00","versionOfRecord":[],"versionCreatedAt":"2024-08-30 06:12:07","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-4855405","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-4855405","identity":"rs-4855405","version":["v1"]},"buildId":"qtupq5eGEP_6zYnWcrvyt","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.