Machine learning inference of natural product chemistry across biosynthetic gene cluster types | Research Square window.SnipcartSettings = { analytics: { enabled: false } }; (function() { var accessVector = localStorage.getItem('access_vector') || ''; window.dataLayer = window.dataLayer || []; if (accessVector) { window.dataLayer.push({ user: { profile: { profileInfo: { snid: accessVector } } } }); } })(); (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-K279D39R'); Browse Preprints In Review Journals COVID-19 Preprints AJE Video Bytes Research Tools Research Promotion AJE Professional Editing AJE Rubriq About Preprint Platform In Review Editorial Policies Our Team Advisory Board Help Center Sign In Submit a Preprint Cite Share Download PDF Article Machine learning inference of natural product chemistry across biosynthetic gene cluster types Martin Larralde, Georg Zeller This is a preprint; it has not been peer reviewed by a journal. https://doi.org/ 10.21203/rs.3.rs-6220876/v1 This work is licensed under a CC BY 4.0 License Status: Posted Version 1 posted You are reading this latest preprint version Abstract With ever-increasing volumes of sequencing data for biosynthetic gene clusters (BGCs), computational methods to accurately predict which secondary metabolites result from these are critically lacking. Here, we present CHAMOIS, a machine learning-based tool for predicting chemical properties of secondary metabolites from protein domains annotated in the input BGCs. CHAMOIS infers 485 chemical properties from the ChemOnt ontology using logistic regression. It accurately predicts 111 such properties (AUPRC > 0.5) in cross-validation against known instances. 
Although CHAMOIS is not explicitly trained on biosynthetic knowledge, many of the inferred links between protein domains and metabolite properties are consistent with scientific literature, others suggest new biochemical functions of uncharacterized biosynthetic domains. Finally, CHAMOIS can pinpoint which BGC within a given genome produces a pre-specified metabolite (correct BGC in 69% of cases ranked among the top 5), which holds great potential for prioritising experimental BGC characterisation and discovery of novel biosynthetic enzymes. Biological sciences/Computational biology and bioinformatics/Machine learning Biological sciences/Chemical biology/Natural products Biological sciences/Chemical biology/Biosynthesis Full Text Additional Declarations There is NO Competing Interest. Supplementary Files SupplementaryTables14.xls Supplementary Tables 360460supp493285st2d2x.docx Supplementary Figures Cite Share Download PDF Status: Posted Version 1 posted You are reading this latest preprint version Research Square lets you share your work early, gain feedback from the community, and start making changes to your manuscript prior to peer review in a journal. As a division of Research Square Company, we’re committed to making research communication faster, fairer, and more useful. We do this by developing innovative software and high quality services for the global research community. Our growing team is made up of researchers and industry professionals working together to solve the most critical problems facing scientific publishing. 
Also discoverable on Platform About Our Team In Review Editorial Policies Advisory Board Help Center Resources Author Services Accessibility API Access RSS feed Manage Cookie Preferences © Research Square 2026 | ISSN 2693-5015 (online) Privacy Policy Terms of Service Do Not Sell My Personal Information {"props":{"pageProps":{"initialData":{"identity":"rs-6220876","acceptedTermsAndConditions":true,"allowDirectSubmit":true,"archivedVersions":[],"articleType":"Article","associatedPublications":[],"authors":[{"id":535828234,"identity":"cad58da9-f575-4839-871a-03dc82735261","order_by":0,"name":"Martin Larralde","email":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZAAAAAyAQMAAABI0h/eAAAABlBMVEX///8AAABVwtN+AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAA3ElEQVRIiWNgGAWjYFAC5gYQKcNwgPkAsVoYwVp4GA6wJQBpA5K08BgQp0W3gbHxc+UeOx6+4z3fHldU/JEzZ+BOfIBPi9kBxmbJM8+SeSTPnN1ueOaMgbFlA+9mvHYBtTRINhxg5jG4kbtNsrHNIHHDAd5tEoRs+dlwoJ7H4P6bZzAt238Q0NIGtOUw0BYeNrgt+HQwmB1mbLNsOHAc6Jc0c8OGM8bGBod5N+N32PHmwzcbDlTL8R0//OxhQ4WcnMHx3o0f8FrDjGCyYYgQBGwkqB0Fo2AUjIKRBACDxk15TpPqMAAAAABJRU5ErkJggg==","orcid":"","institution":"Leiden University Medical Center","correspondingAuthor":true,"prefix":"","firstName":"Martin","middleName":"","lastName":"Larralde","suffix":""},{"id":535828235,"identity":"567e9a9e-6736-45ff-aa20-a6949317bab9","order_by":1,"name":"Georg Zeller","email":"","orcid":"","institution":"Leiden University Medical Center","correspondingAuthor":false,"prefix":"","firstName":"Georg","middleName":"","lastName":"Zeller","suffix":""}],"badges":[],"createdAt":"2025-03-13 14:11:09","currentVersionCode":1,"declarations":"","doi":"10.21203/rs.3.rs-6220876/v1","doiUrl":"https://doi.org/10.21203/rs.3.rs-6220876/v1","draftVersion":[],"editorialEvents":[],"editorialNote":"","failedWorkflow":false,"files":[{"id":95657429,"identity":"955f2fbc-139b-4386-96bf-f4b5bf03dae2","added_by":"auto","created_at":"2025-11-11 
16:20:49","extension":"pdf","order_by":1,"title":"","display":"","copyAsset":false,"role":"manuscript-pdf","size":2370256,"visible":true,"origin":"","legend":"","description":"","filename":"MainText.pdf","url":"https://assets-eu.researchsquare.com/files/rs-6220876/v1_covered_3aa95ff7-332a-4704-8635-1b6032ac6dc0.pdf"},{"id":94584925,"identity":"4898ad5e-6e70-40ce-aca7-949513112dbe","added_by":"auto","created_at":"2025-10-28 18:15:47","extension":"xls","order_by":1,"title":"","display":"","copyAsset":false,"role":"supplement","size":3382272,"visible":true,"origin":"","legend":"Supplementary Tables","description":"","filename":"SupplementaryTables14.xls","url":"https://assets-eu.researchsquare.com/files/rs-6220876/v1/955b49f64af5228a1bf98ab3.xls"},{"id":94584793,"identity":"60e5655a-859a-4f7f-9f3f-1e257b3c9455","added_by":"auto","created_at":"2025-10-28 18:15:38","extension":"docx","order_by":2,"title":"","display":"","copyAsset":false,"role":"supplement","size":83402,"visible":true,"origin":"","legend":"Supplementary Figures","description":"","filename":"360460supp493285st2d2x.docx","url":"https://assets-eu.researchsquare.com/files/rs-6220876/v1/004643d8faead4051f285e1a.docx"}],"financialInterests":"There is \u003cb\u003eNO\u003c/b\u003e Competing Interest.","formattedTitle":"Machine learning inference of natural product chemistry across biosynthetic gene cluster types","fulltext":[],"fulltextSource":"","fullText":"","funders":[],"hasAdminPriorityOnWorkflow":false,"hasManuscriptDocX":false,"hasOptedInToPreprint":true,"hasPassedJournalQc":"","hasAnyPriority":true,"hideJournal":true,"highlight":"","institution":"","isAcceptedByJournal":false,"isAuthorSuppliedPdf":true,"isDeskRejected":"","isHiddenFromSearch":false,"isInQc":false,"isInWorkflow":false,"isPdf":true,"isPdfUpToDate":true,"isWithdrawnOrRetracted":false,"journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true},"keywords":"","lastPublishedDoi":"10.21203/rs.3.rs-6220876/v1","lastPublishedDoiUrl":"https://doi.org/10.21203/rs.3.rs-6220876/v1","license":{"name":"CC BY 4.0","url":"https://creativecommons.org/licenses/by/4.0/"},"manuscriptAbstract":"With ever-increasing volumes of sequencing data for biosynthetic gene clusters (BGCs), computational methods to accurately predict which secondary metabolites result from these are critically lacking. Here, we present CHAMOIS, a machine learning-based tool for predicting chemical properties of secondary metabolites from protein domains annotated in the input BGCs. CHAMOIS infers 485 chemical properties from the ChemOnt ontology using logistic regression. It accurately predicts 111 such properties (AUPRC \u003e 0.5) in cross-validation against known instances. Although CHAMOIS is not explicitly trained on biosynthetic knowledge, many of the inferred links between protein domains and metabolite properties are consistent with scientific literature, others suggest new biochemical functions of uncharacterized biosynthetic domains. 
Finally, CHAMOIS can pinpoint which BGC within a given genome produces a pre-specified metabolite (correct BGC in 69% of cases ranked among the top 5), which holds great potential for prioritising experimental BGC characterisation and discovery of novel biosynthetic enzymes.","manuscriptTitle":"Machine learning inference of natural product chemistry across biosynthetic gene cluster types","msid":"","msnumber":"","nonDraftVersions":[{"code":1,"date":"2025-10-28 16:31:01","doi":"10.21203/rs.3.rs-6220876/v1","editorialEvents":[{"type":"communityComments","content":0}],"status":"published","journal":{"display":true,"email":"
[email protected]","identity":"researchsquare","isNatureJournal":false,"hasQc":true,"allowDirectSubmit":true,"externalIdentity":"","sideBox":"","snPcode":"","submissionUrl":"/submission","title":"Research Square","twitterHandle":"researchsquare","acdcEnabled":true,"dfaEnabled":false,"editorialSystem":"","reportingPortfolio":"","inReviewEnabled":false,"inReviewRevisionsEnabled":true}}],"origin":"","ownerIdentity":"cf2a128c-c415-480e-9926-d6ad7c0211dd","owner":[],"postedDate":"October 28th, 2025","published":true,"recentEditorialEvents":[],"rejectedJournal":[],"revision":"","amendment":"","status":"posted","subjectAreas":[{"id":56971278,"name":"Biological sciences/Computational biology and bioinformatics/Machine learning"},{"id":56971279,"name":"Biological sciences/Chemical biology/Natural products"},{"id":56971280,"name":"Biological sciences/Chemical biology/Biosynthesis"}],"tags":[],"updatedAt":"2025-11-11T12:40:23+00:00","versionOfRecord":[],"versionCreatedAt":"2025-10-28 16:31:01","video":"","vorDoi":"","vorDoiUrl":"","workflowStages":[]},"version":"v1","identity":"rs-6220876","journalConfig":"researchsquare"},"__N_SSP":true},"page":"/article/[identity]/[[...version]]","query":{"redirect":"/article/rs-6220876","identity":"rs-6220876","version":["v1"]},"buildId":"8U1c8b4HqxoKbykW_rLl7","isFallback":false,"isExperimentalCompile":false,"dynamicIds":[84888],"gssp":true,"scriptLoader":[]}
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.