BioMatics 1.0: A Wasserstein Distance Approach for Next-Generation Multiple Sequence Alignment | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Biological Sciences - Article BioMatics 1.0: A Wasserstein Distance Approach for Next-Generation Multiple Sequence Alignment Orkid Coskuner-Weber, Yusuf Ari, Yildiray Berberoglu, Vladimir Uversky This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-7032532/v1 This work is licensed under a CC BY 4.0 License Status: Posted Version 1 posted You are reading this latest preprint version Abstract Accurate multiple sequence alignment (MSA) is central to understanding protein evolution, structure, and function. We present BioMatics 1.0, a novel MSA algorithm that applies optimal transport principles through the Wasserstein distance to align amino acid distributions across positions, enabling refined detection of structural and evolutionary patterns. 
Unlike conventional score-based methods, BioMatics 1.0 constructs profile-to-profile alignments using Earth Mover’s Distance over per-position frequency vectors, guided by BLOSUM62 log-odds similarity. This is complemented by entropy-adaptive gap penalties that dynamically modulate alignment behavior in variable or weakly conserved regions. Benchmark evaluations across curated datasets spanning conserved domains, structural motifs, and heterogeneous families demonstrate that BioMatics 1.0 outperforms widely used tools in Column Score (CS) accuracy and achieves competitive or comparable Sum-of-Pairs Score (SPS) results. Its architecture prioritizes residue-level alignment precision, yielding results that are particularly informative for downstream tasks such as phylogenetic reconstruction and structure-informed modeling. Biological sciences/Computational biology and bioinformatics/Computational models Biological sciences/Biological techniques/Bioinformatics Multiple Sequence Alignment (MSA) Wasserstein Distance Optimal Transport Entropy-Weighted Gap Penalties Evolutionary Signal Advancement Full Text Additional Declarations There is NO Competing Interest. Supplementary Files SupportingMaterialsSectionBiomatics1.0OCW.docx BioMatics 1.0: A Wasserstein Distance Approach for Next-Generation Multiple Sequence Alignment Cite Share Download PDF Status: Posted Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-7032532","acceptedTermsAndConditions":true,"allowDirectSubmit":true,"archivedVersions":[],"articleType":"Biological Sciences - Article","associatedPublications":[],"authors":[{"id":480226489,"identity":"f88fab0c-97ac-4f51-93d3-bf488a0303ca","order_by":0,"name":"Orkid Coskuner-Weber","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAuElEQVRIiWNgGAWjYFACHiCugDAlSNByxgCmxYBILYxtpGgxON578HHlvD/yBgeYD97mYfiTT1jLmXPJhme3GRhuOMCWbM3DYGDZQEiL2Y0cM8nGbQaMGw7wmEkDtRB2mdn9N+Y/G+cY2G84wP+NSC03eMwYGxsMEoG2sBGnxf5MXrJkwzHj5JmH2Ywt5xgYE9Yi2X724MeGGjnbvuPND2+8qZAjKmKggBlEkKJhFIyCUTAKRgFuAAD3AzZsqX1AxgAAAABJRU5ErkJggg==","orcid":"https://orcid.org/0000-0002-0772-9350","institution":"Turkish-German University","correspondingAuthor":true,"prefix":"","firstName":"Orkid","middleName":"","lastName":"Coskuner-Weber","suffix":""},{"id":480226490,"identity":"a7686365-8ddd-4488-bc3a-b4774b8dddbb","order_by":1,"name":"Yusuf Ari","email":"","orcid":"","institution":"Turkish-German University","correspondingAuthor":false,"prefix":"","firstName":"Yusuf","middleName":"","lastName":"Ari","suffix":""},{"id":480226491,"identity":"15c6493d-ff9a-42e9-a65d-9e2eff5f2e4e","order_by":2,"name":"Yildiray Berberoglu","email":"","orcid":"","institution":"Turkish-German University","correspondingAuthor":false,"prefix":"","firstName":"Yildiray","middleName":"","lastName":"Berberoglu","suffix":""},{"id":480226492,"identity":"7691aed8-7ae6-48fc-b585-2dd18f340c68","order_by":3,"name":"Vladimir 
Uversky","email":"","orcid":"https://orcid.org/0000-0002-4037-5857","institution":"University of South Florida","correspondingAuthor":false,"prefix":"","firstName":"Vladimir","middleName":"","lastName":"Uversky","suffix":""}],"badges":[],"createdAt":"2025-07-02 21:00:26","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-7032532/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-7032532/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":86691775,"identity":"2e85f5e4-813a-40c5-8fd7-10c31c6bb9ff","added_by":"auto","created_at":"2025-07-14 14:45:42","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":1109683,"visible":true,"origin":"","legend":"","description":"","filename":"Biomatics1.0OCWVU.pdf","url":"https://assets-eu.researchsquare.com/files/rs-7032532/v1_covered_ef4d6f93-282b-4019-b397-cd7061f0fd1a.pdf"},{"id":85992958,"identity":"a239c6d4-3890-46cc-b514-5e3b08e8fa84","added_by":"auto","created_at":"2025-07-04 05:36:35","extension":"docx","order_by":1,"title":"","display":"","copyAsset":false,"role":"supplement","size":26627,"visible":true,"origin":"","legend":"BioMatics 1.0: A Wasserstein Distance Approach for Next-Generation Multiple Sequence Alignment","description":"","filename":"SupportingMaterialsSectionBiomatics1.0OCW.docx","url":"https://assets-eu.researchsquare.com/files/rs-7032532/v1/bc972ddd4b00898a67e24e09.docx"}],"financialInterests":"There is \u003cb\u003eNO\u003c/b\u003e Competing Interest.","formattedTitle":"BioMatics 1.0: A Wasserstein Distance Approach for Next-Generation Multiple Sequence 
Alignment","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":true,"hideJournal":true,"highlight":"","institution":"","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true},"keywords":"Multiple Sequence Alignment (MSA), Wasserstein Distance, Optimal Transport, Entropy-Weighted Gap Penalties, Evolutionary Signal Advancement","lastPublishedDoi":"10.21203/rs.3.rs-7032532/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-7032532/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"Accurate multiple sequence alignment (MSA) is central to understanding protein evolution, structure, and function. We present BioMatics 1.0, a novel MSA algorithm that applies optimal transport principles through the Wasserstein distance to align amino acid distributions across positions, enabling refined detection of structural and evolutionary patterns. Unlike conventional score-based methods, BioMatics 1.0 constructs profile-to-profile alignments using Earth Mover’s Distance over per-position frequency vectors, guided by BLOSUM62 log-odds similarity. This is complemented by entropy-adaptive gap penalties that dynamically modulate alignment behavior in variable or weakly conserved regions. Benchmark evaluations across curated datasets spanning conserved domains, structural motifs, and heterogeneous families demonstrate that BioMatics 1.0 outperforms widely used tools in Column Score (CS) accuracy and achieves competitive or comparable Sum-of-Pairs Score (SPS) results. 
Its architecture prioritizes residue-level alignment precision, yielding results that are particularly informative for downstream tasks such as phylogenetic reconstruction and structure-informed modeling.","manuscriptTitle":"BioMatics 1.0: A Wasserstein Distance Approach for Next-Generation Multiple Sequence Alignment","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2025-07-04 05:36:31","doi":"10.21203/rs.3.rs-7032532/v1","editorialEvents":[{"type":"communityComments","content":0}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"3372d601-49ed-4261-8a9c-42a8f20c9827","owner":[],"postedDate":"July 4th, 2025","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"posted","subjectAreas":[{"id":50998853,"name":"Biological sciences/Computational biology and bioinformatics/Computational models"},{"id":50998854,"name":"Biological sciences/Biological techniques/Bioinformatics"}],"tags":[],"updatedAt":"2025-07-14T14:37:34+00:00","versionOfRecord":[],"versionCreatedAt":"2025-07-04 05:36:31","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-7032532","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-7032532","identity":"rs-7032532","version":["v1"]},"buildId":"8U1c8b4HqxoKbykW_rLl7","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.