Large-Scale Hybrid Dialogue Data Processing for Transformer-Based Generative Chatbots Using Pretrained DeBERTa Embeddings | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Research Article Large-Scale Hybrid Dialogue Data Processing for Transformer-Based Generative Chatbots Using Pretrained DeBERTa Embeddings Tarek Barhoum, Mina Ibrahim, Karam Alghazi This is a preprint; it has not been peer reviewed by a journal. https://doi.org/10.21203/rs.3.rs-7530129/v1 This work is licensed under a CC BY 4.0 License Status: Posted Version 1 posted You are reading this latest preprint version Abstract This paper presents a scalable generative chatbot built on a Transformer-based encoder–decoder architecture with pretrained DeBERTa embeddings. The model is trained on a hybrid large-scale dialogue corpus comprising over 120K question–answer pairs from real-world datasets (QuAC, DailyDialog) and curated synthetic dialogues generated by large language models. 
The architecture incorporates multi-head self-attention, positional encoding, residual connections, and a pre-norm strategy to enhance contextual understanding and generalization. Experimental results demonstrate a training accuracy of 99% and a BLEU score of 90.1%, highlighting the model’s effectiveness in processing and generating coherent responses from massive heterogeneous conversational datasets. This work contributes to big data analytics in NLP by integrating large-scale dataset curation with advanced Transformer-based modeling for conversational AI. Artificial Intelligence and Machine Learning Big Data Analytics Large-Scale Natural Language Processing Transformer Architecture Pretrained DeBERTa Embeddings Hybrid Dialogue Corpora Generative Chatbot Large Language Models (LLMs) Scalable Conversational AI Multi-Head Self-Attention Contextual Response Generation. Full Text Additional Declarations The authors declare no competing interests. Cite Share Download PDF Status: Posted Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-7530129","acceptedTermsAndConditions":true,"allowDirectSubmit":true,"archivedVersions":[],"articleType":"Research Article","associatedPublications":[],"authors":[{"id":509883237,"identity":"4e2256aa-28ce-4981-98ba-c53f7cf7748d","order_by":0,"name":"Tarek Barhoum","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAA00lEQVRIiWNgGAWjYFACHgYGxgYGBn4wycBMjAaoFskGkrUYHADzidBiz8B78HPljsPRxscPt0kwVFgnNvCfMSBgC1+y5Nkzh3O3nUkEajmTntggkUNAi/wbA8nGNqCWA0AtjG2HgVp4CNnCY/wTpGVz/0Ogln+HiXEYjxnYlg0SIFsagFoYCDnsAF+aZWNbeu6MGw+bLRKOpRu3SaQV4NXC3sB7+GZjm3Vuf3/6wxsfaqxl+/kPb8CrBRUkADEbAwd+h2G1+QHJWkbBKBgFo2BYAwCWcEX6HQh8dgAAAABJRU5ErkJggg==","orcid":"https://orcid.org/0000-0002-9132-6940","institution":"Arab International University","correspondingAuthor":true,"prefix":"","firstName":"Tarek","middleName":"","lastName":"Barhoum","suffix":""},{"id":509883238,"identity":"00b13fc9-4410-4da3-8693-2f9222e8697e","order_by":1,"name":"Mina Ibrahim","email":"","orcid":"","institution":"Arab Inernational University","correspondingAuthor":false,"prefix":"","firstName":"Mina","middleName":"","lastName":"Ibrahim","suffix":""},{"id":509883240,"identity":"43e179bf-10c4-491d-9a21-cc84609430f6","order_by":2,"name":"Karam Alghazi","email":"","orcid":"","institution":"Arab International University","correspondingAuthor":false,"prefix":"","firstName":"Karam","middleName":"","lastName":"Alghazi","suffix":""}],"badges":[],"createdAt":"2025-09-03 
20:35:12","currentVersionCode":1,"declarations":{"humanSubjects":false,"vertebrateSubjects":true,"conflictsOfInterestStatement":false,"humanSubjectEthicalGuidelines":false,"humanSubjectConsent":false,"humanSubjectClinicalTrial":false,"humanSubjectCaseReport":false,"vertebrateSubjectEthicalGuidelines":true},"doi":"10.21203/rs.3.rs-7530129/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-7530129/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":92061651,"identity":"7df15c3a-6474-48e2-86a9-626cdf0f07df","added_by":"auto","created_at":"2025-09-24 08:23:17","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":609962,"visible":true,"origin":"","legend":"","description":"","filename":"LargeScaleHybrid.pdf","url":"https://assets-eu.researchsquare.com/files/rs-7530129/v1_covered_1e74c20b-30c8-4eb7-ada3-5d4e9954687a.pdf"}],"financialInterests":"The authors declare no competing interests.","formattedTitle":"\u003cp\u003e\u003cstrong\u003eLarge-Scale Hybrid Dialogue Data Processing for Transformer-Based Generative Chatbots Using Pretrained DeBERTa Embeddings\u003c/strong\u003e\u003c/p\u003e","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":true,"hideJournal":true,"highlight":"","institution":"Arab International University","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true},"keywords":"Big Data Analytics, Large-Scale Natural Language Processing, Transformer Architecture, Pretrained DeBERTa Embeddings, Hybrid Dialogue Corpora, Generative Chatbot, Large Language Models (LLMs), Scalable Conversational AI, Multi-Head Self-Attention, Contextual Response Generation.","lastPublishedDoi":"10.21203/rs.3.rs-7530129/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-7530129/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"\u003cp\u003eThis paper presents a scalable generative chatbot built on a Transformer-based encoder–decoder architecture with pretrained DeBERTa embeddings. The model is trained on a hybrid large-scale dialogue corpus comprising over 120K question–answer pairs from real-world datasets (QuAC, DailyDialog) and curated synthetic dialogues generated by large language models. The architecture incorporates multi-head self-attention, positional encoding, residual connections, and a pre-norm strategy to enhance contextual understanding and generalization. Experimental results demonstrate a training accuracy of 99% and a BLEU score of 90.1%, highlighting the model’s effectiveness in processing and generating coherent responses from massive heterogeneous conversational datasets. 
This work contributes to big data analytics in NLP by integrating large-scale dataset curation with advanced Transformer-based modeling for conversational AI.\u003c/p\u003e","manuscriptTitle":"Large-Scale Hybrid Dialogue Data Processing for Transformer-Based Generative Chatbots Using Pretrained DeBERTa Embeddings","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2025-09-24 08:15:09","doi":"10.21203/rs.3.rs-7530129/v1","editorialEvents":[{"type":"communityComments","content":0}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"549d6b96-4626-421c-9b17-a5e18ef2167d","owner":[],"postedDate":"September 24th, 2025","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"posted","subjectAreas":[{"id":54157413,"name":"Artificial Intelligence and Machine Learning"}],"tags":[],"updatedAt":"2025-09-24T08:15:09+00:00","versionOfRecord":[],"versionCreatedAt":"2025-09-24 08:15:09","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-7530129","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-7530129","identity":"rs-7530129","version":["v1"]},"buildId":"8U1c8b4HqxoKbykW_rLl7","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.