Parallel Architectures for Large - Scale Document Processing:Integrating OCR and RAG Pipelines | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Research Article Parallel Architectures for Large - Scale Document Processing:Integrating OCR and RAG Pipelines Alejandro Jaime, Veronica Gil-Costa, Marcelo Errecalde, Leticia Cagnina This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-8602947/v1 This work is licensed under a CC BY 4.0 License Status: Posted Version 1 posted You are reading this latest preprint version Abstract This paper shows that enterprise-scale OCR processing is achievable using consumer-grade hardware and open-source software, eliminating dependence on expensive cloud services. 
We present three parallel architectures for massive PDF document processing: (1) a Ray-based distributed pipeline with integrated RAG capabilities achieving 24.3x speedup with fault tolerance, (2) a local multi-process architecture using ProcessPoolExecutor that achieves 69.9x speedup—reducing processing time from 5 hours to 4.3 minutes for 11,368 pages, and (3) a hybrid design combining Ray orchestration with optimized local workers, projecting 199x speedup (1.5 minutes) with three GPUs. Experiments on banking documents using an Intel Core i9 with dual RTX 4090 GPUs ($5,000-7,000 USD) demonstrate super-linear scaling efficiency up to 1,531% in CPU+GPU configurations. Quality evaluation against Azure Document Intelligence establishes a 24.78% Character Error Rate for the open-source pipeline (PaddleOCR + fuzzy reconstruction), quantifying the fundamental speed-quality trade-off between 100 and 300 DPI processing. These results democratize capabilities previously exclusive to commercial cloud services, enabling organizations to process large document corpora at enterprise throughput without per-page API costs or vendor lock-in. OCR PDF Processing Ray orchestration Full Text Additional Declarations No competing interests reported. Cite Share Download PDF Status: Posted Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-8602947","acceptedTermsAndConditions":true,"allowDirectSubmit":true,"archivedVersions":[],"articleType":"Research Article","associatedPublications":[],"authors":[{"id":575419649,"identity":"ead63798-814c-4b5a-985d-2a866b638f8f","order_by":0,"name":"Alejandro Jaime","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAABFUlEQVRIie2PMUvEMBTHHxTapdL15LD9BMILBVFO+jGcUwLN0uNudCwUOukHkA5+BQ8hOAYyuPgFxOW6uAkdvUE0bcRBTF0F8wvkveH98s8DcDj+IgEAAnhDS/WBeOjkdkrxvinpqNBflK97IK+MbSeqPbne3S1WEUDXd+Upvz5TnU7J4sPqZ2WmfEouH/jJVQVslotiuXkqUCssPZKWGBUi7jUKUQLVilpuWjooMhcWJVFRT96Mwl5z8c5Jy/tJBVUI6WdKoVMkTebldApRPqYHDcf9GorjXDByMy/XkqJ9l/i+fiYvzQKj4II97kSWJC2/7fvzLLauD+DjWLyQmq+Ok2gdH2e3pgbm1aSanHY4HI5/yAc0NGCPFlvEwgAAAABJRU5ErkJggg==","orcid":"","institution":"National University of La Plata","correspondingAuthor":true,"prefix":"","firstName":"Alejandro","middleName":"","lastName":"Jaime","suffix":""},{"id":575419651,"identity":"8b7613b8-bf83-40fb-8983-5d38725872cb","order_by":1,"name":"Veronica Gil-Costa","email":"","orcid":"","institution":"National University of San Luis","correspondingAuthor":false,"prefix":"","firstName":"Veronica","middleName":"","lastName":"Gil-Costa","suffix":""},{"id":575419654,"identity":"b1119a5f-fcb6-40ed-ab55-c1265ae9431e","order_by":2,"name":"Marcelo Errecalde","email":"","orcid":"","institution":"National University of San 
Luis","correspondingAuthor":false,"prefix":"","firstName":"Marcelo","middleName":"","lastName":"Errecalde","suffix":""},{"id":575419656,"identity":"4a73900b-fd95-475a-a100-b7fc162025df","order_by":3,"name":"Leticia Cagnina","email":"","orcid":"","institution":"National University of San Luis","correspondingAuthor":false,"prefix":"","firstName":"Leticia","middleName":"","lastName":"Cagnina","suffix":""}],"badges":[],"createdAt":"2026-01-14 14:53:02","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-8602947/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-8602947/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":100561372,"identity":"b8b3412b-de73-4e53-a149-b98d5dbf8dbe","added_by":"auto","created_at":"2026-01-19 08:44:00","extension":"json","order_by":0,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":5322,"visible":true,"origin":"","legend":"","description":"","filename":"ecfceac0f4614b93bd24fa953bda7119.json","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/123375c2a602bea1a9b641b7.json"},{"id":100560963,"identity":"c66ffc84-7ca3-4511-b713-3b6ffae58068","added_by":"auto","created_at":"2026-01-19 08:43:54","extension":"xml","order_by":1,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":149172,"visible":true,"origin":"","legend":"","description":"","filename":"ecfceac0f4614b93bd24fa953bda71191enriched.xml","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/928bdd5763761ca14e3f4adb.xml"},{"id":100561250,"identity":"91760bca-aad3-4934-9ab2-295c29721e49","added_by":"auto","created_at":"2026-01-19 
08:43:58","extension":"pdf","order_by":2,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":481756,"visible":true,"origin":"","legend":"","description":"","filename":"AlejandroJaimeArquitecturaParalelaOCR.pdf","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/178d6a2c0c9c5668527f99de.pdf"},{"id":100561187,"identity":"f51da928-5d88-4cd2-9e62-9c7c63af9d64","added_by":"auto","created_at":"2026-01-19 08:43:58","extension":"pdf","order_by":3,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":40821,"visible":true,"origin":"","legend":"","description":"","filename":"CoverLetters.pdf","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/9296562bb211aa3237945174.pdf"},{"id":100560996,"identity":"0fdfa47e-46eb-403c-9ec8-f4212c6ffcec","added_by":"auto","created_at":"2026-01-19 08:43:54","extension":"eps","order_by":4,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":2890,"visible":true,"origin":"","legend":"","description":"","filename":"empty.eps","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/854c67baab5ba821cfe58a30.eps"},{"id":100561168,"identity":"98a48a6d-0a82-4246-a5a5-7d037e7d7b30","added_by":"auto","created_at":"2026-01-19 08:43:57","extension":"bst","order_by":5,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":146013,"visible":true,"origin":"","legend":"","description":"","filename":"snapacite.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/713b64ee075e2a57e4f4fb05.bst"},{"id":100561044,"identity":"fb0825aa-e335-449a-a254-01402486b65f","added_by":"auto","created_at":"2026-01-19 
08:43:55","extension":"bst","order_by":6,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":29828,"visible":true,"origin":"","legend":"","description":"","filename":"snaps.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/c08b28e2c063096e1f6a18a2.bst"},{"id":100594733,"identity":"adea788c-e89a-4a32-892f-fc9bb6accced","added_by":"auto","created_at":"2026-01-19 13:44:35","extension":"pdf","order_by":7,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":421391,"visible":true,"origin":"","legend":"","description":"","filename":"snarticle.pdf","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/5458b590c9c918f4f13be7d5.pdf"},{"id":100561103,"identity":"1a8b0de7-7e89-467b-91df-2b0a2a2d6cb1","added_by":"auto","created_at":"2026-01-19 08:43:56","extension":"bst","order_by":8,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":35515,"visible":true,"origin":"","legend":"","description":"","filename":"snbasic.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/a8e78f03b1b402823a4fc27f.bst"},{"id":100561088,"identity":"3b96e736-185e-4999-8209-9e683906bbc2","added_by":"auto","created_at":"2026-01-19 08:43:56","extension":"bst","order_by":9,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":33968,"visible":true,"origin":"","legend":"","description":"","filename":"snchicago.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/0d4992ce0c9d72477c333888.bst"},{"id":100561356,"identity":"1d12bd56-624a-470c-9551-6c6c01441846","added_by":"auto","created_at":"2026-01-19 
08:43:59","extension":"cls","order_by":10,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":55857,"visible":true,"origin":"","legend":"","description":"","filename":"snjnl.cls","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/bb3f8df97360a2241eeac054.cls"},{"id":100561366,"identity":"68fa58f6-2eef-4013-afdc-43a1a9a104c8","added_by":"auto","created_at":"2026-01-19 08:44:00","extension":"bst","order_by":11,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":64023,"visible":true,"origin":"","legend":"","description":"","filename":"snmathphysay.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/0cc6c8c78fe9aa8734f12452.bst"},{"id":100561173,"identity":"564ac667-2410-4fab-8924-d28d7e00f20d","added_by":"auto","created_at":"2026-01-19 08:43:57","extension":"bst","order_by":12,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":64166,"visible":true,"origin":"","legend":"","description":"","filename":"snmathphysnum.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/c916354fa219647aae5356f2.bst"},{"id":100561093,"identity":"46d979b3-1d31-4ac9-8966-593338d28390","added_by":"auto","created_at":"2026-01-19 08:43:56","extension":"bst","order_by":13,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":37333,"visible":true,"origin":"","legend":"","description":"","filename":"snnature.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/54d29f3948499661942f0a02.bst"},{"id":100560973,"identity":"72b3ab8d-63c3-4c80-951b-d766d46a5f4a","added_by":"auto","created_at":"2026-01-19 
08:43:54","extension":"bst","order_by":14,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":39951,"visible":true,"origin":"","legend":"","description":"","filename":"snvancouveray.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/e5846fd40cab822c008b4b23.bst"},{"id":100560907,"identity":"5a1067ee-407a-40ef-b54f-792ec9b79d36","added_by":"auto","created_at":"2026-01-19 08:43:52","extension":"bst","order_by":15,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":40758,"visible":true,"origin":"","legend":"","description":"","filename":"snvancouvernum.bst","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/86cf9dc12122f20f14d8182e.bst"},{"id":100595822,"identity":"3b835279-c66a-49e3-8160-a268a686e62b","added_by":"auto","created_at":"2026-01-19 13:49:26","extension":"xml","order_by":16,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":156515,"visible":true,"origin":"","legend":"","description":"","filename":"ecfceac0f4614b93bd24fa953bda71191structuring.xml","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/ad9351add48a1009746d2d75.xml"},{"id":100561076,"identity":"57837351-f1e9-4e77-83aa-1006748a2545","added_by":"auto","created_at":"2026-01-19 08:43:56","extension":"html","order_by":17,"title":"","display":"","copyAsset":false,"role":"acdc-reference","size":168293,"visible":true,"origin":"","legend":"","description":"","filename":"earlyproof.html","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1/d76b971394ec813f10204568.html"},{"id":105135971,"identity":"3de05d4c-0717-4478-918f-c22ceef98db5","added_by":"auto","created_at":"2026-03-22 
07:39:48","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":471740,"visible":true,"origin":"","legend":"","description":"","filename":"AlejandroJaimeArquitecturaParalelaOCR.pdf","url":"https://assets-eu.researchsquare.com/files/rs-8602947/v1_covered_c212607b-623b-4607-a43d-849df2b9cb24.pdf"}],"financialInterests":"No competing interests reported.","formattedTitle":"Parallel Architectures for Large - Scale Document Processing:Integrating OCR and RAG Pipelines","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":true,"hideJournal":true,"highlight":"","institution":"","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true},"keywords":"OCR, PDF Processing, Ray orchestratio","lastPublishedDoi":"10.21203/rs.3.rs-8602947/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-8602947/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"\u003cp\u003eThis paper shows that enterprise-scale OCR processing is achievable using consumer-grade hardware and open-source software, eliminating dependence on expensive cloud services. We present three parallel architectures for massive PDF document processing: (1) a Ray-based distributed pipeline with integrated RAG capabilities achieving 24.3x speedup with fault tolerance, (2) a local multi-process architecture using ProcessPoolExecutor that achieves 69.9x speedup---reducing processing time from 5 hours to 4.3 minutes for 11,368 pages, and (3) a hybrid design combining Ray orchestration with optimized local workers, projecting 199x speedup ( 1.5 minutes) with three GPUs. Experiments on banking documents using an Intel Core i9 with dual RTX 4090 GPUs (\\\u003cspan\u003e$\u003c/span\u003e5,000-7,000 USD) demonstrate super-linear scaling efficiency up to 1,531% in CPU+GPU configurations. Quality evaluation against Azure Document Intelligence establishes a 24.78% Character Error Rate for the open-source pipeline (PaddleOCR + fuzzy reconstruction), quantifying the fundamental speed-quality trade-off between 100 and 300 DPI processing. 
These results democratize capabilities previously exclusive to commercial cloud services, enabling organizations to process large document corpora at enterprise throughput without per-page API costs or vendor lock-in.\u003c/p\u003e","manuscriptTitle":"Parallel Architectures for Large - Scale Document Processing:Integrating OCR and RAG Pipelines","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2026-01-19 08:29:17","doi":"10.21203/rs.3.rs-8602947/v1","editorialEvents":[{"type":"communityComments","content":0}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"475ff465-9384-46c5-948e-312a081e4678","owner":[],"postedDate":"January 19th, 2026","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"posted","subjectAreas":[],"tags":[],"updatedAt":"2026-03-22T07:39:14+00:00","versionOfRecord":[],"versionCreatedAt":"2026-01-19 08:29:17","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-8602947","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-8602947","identity":"rs-8602947","version":["v1"]},"buildId":"XKTyCvWXoU3ODBz1xrDgd","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the
canonical version.