A Uyghur–Chinese parallel dataset of proverbs | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF data-descriptor A Uyghur–Chinese parallel dataset of proverbs xiang yi, Maitiyasen Duolaitiniyazi, Sulaiman Maitusun, Feiruzai Abulikemu, and 2 more This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-8773278/v1 This work is licensed under a CC BY 4.0 License Status: Posted Version 1 posted You are reading this latest preprint version Abstract Uyghur-Chinese paired resources that couple brief, culturally situated expressions with matched speech remain scarce, yet they are valuable for low-resource machine translation and speech-enabled NLP. We introduce UyZh-FolkSpeech, a manually curated Uyghur-Chinese dataset covering proverbs, frequently used daily phrases, and common words or short phrases, collected entirely in-house by the author team through elicitation, transcription, and bilingual alignment. 
[1, 2] The release provides 1,984 text items in total, including 953 short-sentence pairs and 1,031 word/short-phrase entries, each assigned an immutable identifier (UYZH-S-* for sentences; UYZH-W-* for words/phrases). For every item, we release four native-speaker recordings (S01–S04; two male and two female), yielding 7,936 linked audio clips. Across the full audio collection, the total duration is 08:28:17 (30,497.196 seconds, approximately 8.47 hours), and the mean clip duration is 3.84 seconds. All audio is distributed as M4A files encoded with AAC-LC at 48 kHz, mono, with a target bitrate of approximately 64 kbps, and is linked to text records via a manifest that includes required technical metadata. The package further includes recommended train/validation/test splits (including a fixed eval50 list), and optional scripts/configs to reproduce the provided fine-tuning example. The dataset (text, metadata, and audio) is released under CC BY 4.0 and is version-pinned to the GitHub release tag release-2026-01-25 alongside the corresponding Hugging Face dataset page. Full Text Additional Declarations No competing interests reported. Cite Share Download PDF Status: Posted Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high-quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-8773278","acceptedTermsAndConditions":true,"allowDirectSubmit":true,"archivedVersions":[],"articleType":"data-descriptor","associatedPublications":[],"authors":[{"id":621890826,"identity":"25deb3ea-8725-4125-b5ef-cc141dcddae8","order_by":0,"name":"xiang yi","email":"","orcid":"","institution":"Northwest Minzu University","correspondingAuthor":false,"prefix":"","firstName":"xiang","middleName":"","lastName":"yi","suffix":""},{"id":621890827,"identity":"74411046-b760-4133-9efb-10775201e777","order_by":1,"name":"Maitiyasen Duolaitiniyazi","email":"","orcid":"","institution":"Northwest Minzu University","correspondingAuthor":false,"prefix":"","firstName":"Maitiyasen","middleName":"","lastName":"Duolaitiniyazi","suffix":""},{"id":621890828,"identity":"af7a1782-38cc-45d1-8d76-04bfcf5e29bb","order_by":2,"name":"Sulaiman Maitusun","email":"","orcid":"","institution":"Northwest Minzu University","correspondingAuthor":false,"prefix":"","firstName":"Sulaiman","middleName":"","lastName":"Maitusun","suffix":""},{"id":621890829,"identity":"1aa06eaa-b5a4-4a21-af61-f4a4e731a005","order_by":3,"name":"Feiruzai Abulikemu","email":"","orcid":"","institution":"Northwest Minzu University","correspondingAuthor":false,"prefix":"","firstName":"Feiruzai","middleName":"","lastName":"Abulikemu","suffix":""},{"id":621890830,"identity":"117ec217-5c73-4b9d-86d1-86273ab48609","order_by":4,"name":"Najiye Aimilajiang","email":"","orcid":"","institution":"Northwest Minzu 
University","correspondingAuthor":false,"prefix":"","firstName":"Najiye","middleName":"","lastName":"Aimilajiang","suffix":""},{"id":621890831,"identity":"ffdb6699-d482-4a17-a520-9814441cddde","order_by":5,"name":"Abuduwaili Keremu","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAzklEQVRIiWNgGAWjYBAC+/bmgw8+8NjU2x9vIFKLAc+xZMMZMmkJDGcOEKtFIsdMmMfmcALDjQQitZhLJJgx8OQw5zHOfLzxBkONTTRBLZY9D9IeSJxhK2aWTiu2YDiWlttAUM/xhOMGhj08jG3SOWYSjA2HidByILFNIvGfBGOP5BkitRicSGaTOMBjkDhDgodILZI9x5gNG3gSjA14gH5JIMYv/Oz9Hx//4fkvZ8B+eOONDzU2RPgF2ZESCaQoh2ghVccoGAWjYBSMDAAALtpAsfFlILIAAAAASUVORK5CYII=","orcid":"","institution":"Northwest Minzu University","correspondingAuthor":true,"prefix":"","firstName":"Abuduwaili","middleName":"","lastName":"Keremu","suffix":""}],"badges":[],"createdAt":"2026-02-03 08:53:35","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-8773278/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-8773278/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":107483603,"identity":"eeaa71ef-0037-4459-a0e8-a2b3f3c147bb","added_by":"auto","created_at":"2026-04-22 02:28:21","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":491008,"visible":true,"origin":"","legend":"","description":"","filename":"AUyghurChineseparalleldatasetofproverbs.pdf","url":"https://assets-eu.researchsquare.com/files/rs-8773278/v1_covered_40f2e175-0c3b-4e00-925e-655c32e29204.pdf"}],"financialInterests":"No competing interests reported.","formattedTitle":"A Uyghur–Chinese parallel dataset of 
proverbs","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":false,"hideJournal":true,"highlight":"","institution":"","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true},"keywords":"","lastPublishedDoi":"10.21203/rs.3.rs-8773278/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-8773278/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"\u003cp\u003eUyghur-Chinese paired resources that couple brief, culturally situated expressions with matched speech remain scarce, yet they are valuable for low-resource machine translation and speech-enabled NLP. We introduce UyZh-FolkSpeech, a manually curated Uyghur-Chinese dataset covering proverbs, frequently used daily phrases, and common words or short phrases, collected entirely in-house by the author team through elicitation, transcription, and bilingual alignment.\u003csup\u003e\u003cspan citationid=\"CR1\" class=\"CitationRef\"\u003e1\u003c/span\u003e,\u003cspan citationid=\"CR2\" class=\"CitationRef\"\u003e2\u003c/span\u003e\u003c/sup\u003e\u003c/p\u003e \u003cp\u003eThe release provides 1,984 text items in total, including 953 short-sentence pairs and 1,031 word/short-phrase entries, each assigned an immutable identifier (UYZH-S-* for sentences; UYZH-W-* for words/phrases). For every item, we release four native-speaker recordings (S01\u0026ndash;S04; two male and two female), yielding 7,936 linked audio clips. Across the full audio collection, the total duration is 08:28:17 (30,497.196 seconds, approximately 8.47 hours), and the mean clip duration is 3.84 seconds. 
All audio is distributed as M4A files encoded with AAC-LC at 48 kHz, mono, with a target bitrate of approximately 64 kbps, and is linked to text records via a manifest that includes required technical metadata. The package further includes recommended train/validation/test splits (including a fixed eval50 list), and optional scripts/configs to reproduce the provided fine-tuning example. The dataset (text, metadata, and audio) is released under CC BY 4.0 and is version-pinned to the GitHub release tag release-2026-01-25 alongside the corresponding Hugging Face dataset page.\u003c/p\u003e","manuscriptTitle":"A Uyghur–Chinese parallel dataset of proverbs","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2026-04-17 19:59:49","doi":"10.21203/rs.3.rs-8773278/v1","editorialEvents":[{"type":"communityComments","content":0}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"8adf88ac-d1a0-41cc-8233-3eba51fdc231","owner":[],"postedDate":"April 17th, 2026","published":true,"recentEditorialEvents":[{"type":"editorInvitedReview","content":"","date":"2026-04-29T03:06:05+00:00","index":21,"fulltext":""}],"rejectedJournal":[],"revision":"","amendment":"","status":"posted","subjectAreas":[],"tags":[],"updatedAt":"2026-04-17T19:59:50+00:00","versionOfRecord":[],"versionCreatedAt":"2026-04-17 19:59:49","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-8773278","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-8773278","identity":"rs-8773278","version":["v1"]},"buildId":"XKTyCvWXoU3ODBz1xrDgd","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.