metaboprep v2: Broadening the application of the metaboprep beyond metabolomics

doi:10.1101/2025.11.11.687831

metaboprep v2: Broadening the application of the metaboprep beyond metabolomics

2025 · doi:10.1101/2025.11.11.687831

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 23,877 characters · extracted from preprint-html · click to expand

metaboprep v2: Broadening the application of the metaboprep beyond metabolomics | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Confirmatory Results metaboprep v2: Broadening the application of the metaboprep beyond metabolomics View ORCID Profile Nicholas Sunderland , David A Hughes , Mathew A Lee , Alec McKinlay , Nicholas J Timpson , Laura J Corbin doi: https://doi.org/10.1101/2025.11.11.687831 Nicholas Sunderland 1 Bristol Heart Institute, University Hospitals Bristol and Weston NHS Foundation Trust , Bristol, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author 
on this site ORCID record for Nicholas Sunderland David A Hughes 2 Population and Public Health Sciences, Pennington Biomedical Research Center , Baton Rouge, LA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mathew A Lee 3 International Agency for Research on Cancer, World Health Organization , Lyon, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alec McKinlay 4 Population Health Sciences, University of Bristol , Oakfield House, Oakfield Grove, Bristol, UK 5 MRC Integrative Epidemiology Unit at the University of Bristol , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nicholas J Timpson 4 Population Health Sciences, University of Bristol , Oakfield House, Oakfield Grove, Bristol, UK 5 MRC Integrative Epidemiology Unit at the University of Bristol , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Laura J Corbin 4 Population Health Sciences, University of Bristol , Oakfield House, Oakfield Grove, Bristol, UK 5 MRC Integrative Epidemiology Unit at the University of Bristol , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: laura.corbin{at}bristol.ac.uk Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract High-throughput multiplex assays for metabolomics and proteomics offer opportunities for biomarker discovery and disease stratification in epidemiological research. The complexity of these datasets requires robust, standardized and transparent preprocessing workflows to ensure reproducibility and comparability across studies. We present an updated and enhanced version of the metaboprep R package. Originally designed for metabolomics data, it has now been extended to support proteomics datasets from platforms such as Olink® and SomaScan®. 
This release introduces a user-friendly, modular, object-oriented architecture using R’s S7 system, enabling improved input format flexibility, streamlined report generation and increased compatibility with other third-party tools. The updated pipeline is structured in three parts: data import, filtering and summary, and output generation. This structure provides a reproducible yet customizable framework for pre-analysis data preparation with utility across multiple omics platforms and particular value in supporting multi-cohort epidemiological research. Availability and implementation The metaboprep package is implemented in R and freely available at: https://github.com/MRCIEU/metaboprep . Introduction In epidemiology, the high-throughput measurement of large numbers of variables capturing substantial portions of variation in specific domains of biological variation (“omics”) has opened a range of analytical applications including prediction of disease risk, subtyping and classification of disease, biomarker discovery and mechanism dissection ( Babu and Snyder, 2023 ). In recognition of the demand for increased consistency and transparency in the preanalytical processing of metabolomics data across cohorts and studies, we previously published the R package metaboprep ( Hughes et al ., 2022 ). This package was designed to enable the extraction of metabolomics data from different platforms and for the processing, summary and preparation of those data for subsequent statistical analysis within a standardized and reproducible workflow. As technologies enabling the simultaneous measurement of multiple circulating proteins become increasingly used in epidemiological studies, a similar demand for standardized workflows has arisen in this field ( Bowser and Robinson, 2023 ; Babu and Snyder, 2023 ). 
Proteomics data generated by high-throughput assays share many characteristics in common with similarly produced metabolomics data – a high number of variables or features, a range of data distributions, variable detection limits and the possible influence of technical covariates (e.g. batch effects). In this context, we have seen metaboprep being useful in pre-processing proteomics data derived by both Olink, using their Olink Target® platform (Goulding et al., 2022; Bull et al., 2024), and by SomaLogic, using their SomaScan® assay (Goudswaard et al., 2023). Here we describe an update to our established metabolomics pre-analysis platform (metaboprep), including an extension to its omics data scope. While retaining the original purpose of the package, this update to metaboprep (v2.0) allows a single transparent pipeline for preprocessing metabolomics data with flexibility for the user to define their own criteria for sample and feature filtering, but with updates designed to fulfil two principal objectives: (1) improved flexibility regarding input data types and formats; and (2) improved user experience and reporting. Toolbox Pipeline design metaboprep is an R package designed to standardize the steps involved in preparing large metabolomics and proteomics datasets for statistical analysis. The updated package (v2.0) retains the functionality of the previous pipeline with functions now accessible within discrete customizable modules, along with flexible data import and export workflows. This includes compatibility with community standards (COMETS Analytics (Temprosa et al., 2022)) and use with existing analytical software (MetaboAnalyst (Pang et al., 2024)). Modules can be run in isolation or combined into an end-to-end pipeline resulting in filtered datasets and data summary reporting. Data transformations, sample and feature exclusions, and pre-processing metrics are stored within a central Metaboprep R S7 class object (Vaughan et al., 2024).
This object can be used in various workflows in a way that feels familiar to R users, particularly those accustomed to dplyr (Wickham et al., 2023) or tidyr (Wickham et al., 2024) syntax. An overview of the overall pre-processing pipeline and underlying modules is presented in Figure 1. Download figure Open in new tab Figure 1. An overview ‘cheat sheet’ of the metaboprep v2.0 R package functions and pre-processing pipeline. Implementation metaboprep v2.0 is implemented using R (version 4.5.1) (R Core Team, 2025) and is dependent upon R version 4.1.0 or greater. Code is available from https://github.com/MRCIEU/metaboprep along with an associated GitHub Pages website (https://mrcieu.github.io/metaboprep/), providing a walkthrough of the package. Example datasets are available within the package for users to test and explore the utility of metaboprep v2.0. To enhance structure, extensibility, and user experience, metaboprep v2.0 leverages the modern S7 object-oriented system in R. Central to this design is the ‘Metaboprep’ class – an S7 object serving as a structured container for omics data, including arrays and data frames of feature measurements, sample and feature metadata, summary statistics, and exclusion flags. The S7 framework enables robust class property validation with rule-based checks, such as ensuring alignment between sample and feature IDs and associated metadata, thereby maintaining data integrity throughout the pipeline (Vaughan et al., 2024). A key feature of the ‘Metaboprep’ class is its layered data architecture, which stores multiple versions of the dataset (e.g., raw input, filtered data, and transformed data) as separate layers within a three-dimensional numerical array. This approach preserves the full processing history while allowing easy access and comparison across stages, promoting reproducibility and auditability of analyses. Moreover, all processing functions in metaboprep v2 .
0 accept and return ‘Metaboprep’ objects, enforcing standardized workflows, consistent handling of data, and propagation of metadata. This design facilitates modular, extensible, and transparent data processing aligned with tidy software principles and reproducible research standards. To ensure backward compatibility with metaboprep v1.0, we provide the function run_metaboprep1(), which reproduces the original routine and accepts the legacy parameter file format. Modularity Metaboprep v2.0 is designed with modularity and flexibility at its core to accommodate diverse data sources and user needs. The pipeline is organized into three distinct parts: (1) data import; (2) summary and filtering; and (3) output generation (including report creation). This segmentation improves usability and makes debugging easier. The modular structure allows users to run individual components independently or as part of an integrated workflow. The pipeline begins with a data preparation phase that supports reading from commercial provider formats such as Nightingale Health (CoreMetabolomics), Metabolon, Inc. (Global Discovery Panel), Olink® (Olink Explore) and SomaLogic (SomaScan). The format of the standard deliverables from these companies is known to change over time, and whilst the import functions can handle the common current and legacy formats, the ability for users to input their own custom datasets through use of simple base R data structures (data frames and matrices) provides some degree of future-proofing. The read_[provider]() functions handle data extraction from often complex Microsoft Excel and custom data formats into a list of base R data types: a numeric matrix of abundance data and data frames of sample and feature metadata. The only core requirement of the sample and feature data frames is that they contain a column named “sample_id” and “feature_id”, respectively.
The modularization of the import architecture, an improvement on metaboprep v1.0, supports rapid adaptation to emerging platforms and evolving file specifications, future-proofing metaboprep v2.0 against evolving data sources. Example (simulated) datasets emulating the structure of common provider outputs and associated ‘read in’ functions are included (https://github.com/MRCIEU/metaboprep/tree/master/inst/extdata). This allows users to trial the package without proprietary data. Example (simulated) datasets currently included as part of the package and based on data releases received by us from providers include: Metabolon, Inc. (Global Discovery Panel) v1 (pre-October 2020) and v2 (October 2020 onwards); Nightingale Health (CoreMetabolomics) v1 (pre-April 2020) and v2 (April 2020 onwards). In addition, example datasets have been generated based on code and datasets provided in GitHub repositories of Olink (https://github.com/Olink-Proteomics/OlinkRPackage) and SomaLogic (https://github.com/SomaLogic). The ‘Metaboprep’ class is then created, with its constructor taking the data matrix and sample and feature metadata data frames. By default, the input data is entered into the object’s layered 3D data matrix (array) at index “input” along the third dimension. Validation checks are conducted at instantiation to ensure matching feature and sample identifiers in the data and to enforce correct order. Pre-analysis processing is often necessary, for example batch normalization procedures. Calling functions such as batch_normalise() on the ‘Metaboprep’ object inserts another layer along the object’s data array third dimension, which can then be specified as the source data for subsequent operations such as the pre-processing pipeline. At the center of metaboprep functionality is the pre-processing routine, called with the quality_control() function.
This is a wrapper around underlying summarization, outlier analysis and filtering modules, which can be called individually if needed to explore the data and assist in debugging. The “sample_ids” and “feature_ids” arguments give the user flexibility in selecting which samples or features to include. There is also the possibility to exclude certain features from the outlier and filtering steps whilst retaining them in the resulting data, through use of the “features_exclude_but_keep” argument – this can be useful in datasets that contain derived ratio measures (Nightingale Health) or xenobiotics with high degrees of missingness (Metabolon, Inc.). The quality_control() function adds a further data layer along the ‘Metaboprep’ object’s data array third dimension called “qc”, representing the cleaned data ready for analysis, as well as populating the object’s “exclusion” field with a list of excluded samples and features and the reason for their exclusion. Report generation Metaboprep v2.0 includes optional report generation capabilities that provide detailed summaries of the pre-processing and filtering steps that have been applied to the data. These reports facilitate transparency and reproducibility by documenting key metrics such as missingness rates, outlier detection, sample exclusions and data transformations at each processing stage. The output formats are flexible, with options to generate comprehensive HTML reports for review as well as PDF files suitable for sharing with collaborators and including in publications. An example report is included in Supplementary Data S1. Documentation and examples Comprehensive documentation for metaboprep v2.0 is provided through multiple channels to support users of all levels. A detailed README on the GitHub repository offers step-by-step instructions for running the pipeline. To further enhance user experience, metaboprep v2 .
0 leverages pkgdown to generate a user-friendly, searchable website that organizes the package documentation, reference manuals and vignettes in one accessible location (https://mrcieu.github.io/metaboprep/). This online resource offers guided aids, including tutorials and examples, helping users to quickly understand and apply the package functions. The vignettes included with the package provide practical walkthroughs and use cases. Conclusion Metaboprep v2.0 provides a flexible, standardized, and reproducible framework for preprocessing multi-omics data, including metabolomics and proteomics. Leveraging modern S7 object-oriented design, it ensures data integrity and transparent tracking of processing steps through layered data storage. The package also generates fully documented, platform-agnostic summary reports for supplementary materials. With improved input flexibility, modular pipeline components, and compatibility with diverse platforms, this update to metaboprep enhances usability and supports consistent workflows for epidemiological studies. Code availability The metaboprep R package, along with installation instructions and source code, is available on GitHub (https://github.com/MRCIEU/metaboprep). Precompiled binaries for common platforms are also available via the MRC-IEU R Universe, enabling easy installation using install.packages("metaboprep", repos = c("https://mrcieu.r-universe.dev", "https://cloud.r-project.org")). Funding NS is funded by the GW4-CAT Wellcome PhD programme. LC, NJT and NS work in a Medical Research Council (UKRI) funded unit (MC_UU_00032/1 and MC_UU_00032/3). AM is supported by a CRUK PhD studentship [grant number C18281/A30905]. MAL is supported by funding from IIG_FULL_2021_008, which was obtained from Wereld Kanker Onderzoek Fonds, as part of the World Cancer Research Fund International grant programme, and from INCA_15849, which was obtained from Institut National du Cancer.
Where authors are identified as personnel of the International Agency for Research on Cancer/World Health Organization, the authors alone are responsible for the views expressed in this article and they do not necessarily represent the decisions, policy, or views of the International Agency for Research on Cancer/World Health Organization. Figure legends Supplementary Data S1 . An example PDF report generated using metaboprep v2 . 0 . Footnotes https://github.com/MRCIEU/metaboprep https://mrcieu.github.io/metaboprep/ References ↵ Babu , M. and Snyder , M. ( 2023 ) Multi-Omics Profiling for Health . Mol Cell Proteomics , 22 , 100561 . OpenUrl CrossRef PubMed ↵ Bowser , B.L. and Robinson , R.A.S. ( 2023 ) Enhanced Multiplexing Technology for Proteomics . Annual Review of Analytical Chemistry , 16 , 379 – 400 . OpenUrl PubMed ↵ Bull , C.J. et al. ( 2024 ) Impact of weight loss on cancer-related proteins in serum: results from a cluster randomised controlled trial of individuals with type 2 diabetes . eBioMedicine , 100 . ↵ Goudswaard , L.J. et al. ( 2023 ) Using trials of caloric restriction and bariatric surgery to explore the effects of body mass index on the circulating proteome . Sci Rep , 13 , 21077 . OpenUrl PubMed Goulding , N. et al. ( 2022 ) Inflammation proteomics datasets in the ALSPAC cohort . Wellcome Open Res , 7 , 277 . OpenUrl PubMed ↵ Hughes , D.A. et al. ( 2022 ) metaboprep: an R package for preanalysis data description and processing . Bioinformatics , 38 , 1980 – 1987 . OpenUrl PubMed ↵ Pang , Z. et al. ( 2024 ) MetaboAnalyst 6.0: towards a unified platform for metabolomics data processing, analysis and interpretation . Nucleic Acids Research , 52 , W398 – W406 . OpenUrl CrossRef PubMed ↵ R Core Team ( 2025 ) R: A Language and Environment for Statistical Computing . ↵ Temprosa , M. et al. ( 2022 ) COMETS Analytics: An Online Tool for Analyzing and Meta-Analyzing Metabolomics Data in Large Research Consortia . 
American Journal of Epidemiology , 191 , 147 – 158 . OpenUrl PubMed ↵ Vaughan , D. et al. ( 2024 ) S7: An object oriented system meant to become a successor to S3 and S4 . ↵ Wickham , Hadley et al. ( 2023 ) dplyr: A Grammar of Data Manipulation . ↵ Wickham , Hadley et al. ( 2024 ) tidyr: Tidy Messy Data . View the discussion thread. Back to top Previous Next Posted November 12, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following metaboprep v2: Broadening the application of the metaboprep beyond metabolomics Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share metaboprep v2: Broadening the application of the metaboprep beyond metabolomics Nicholas Sunderland , David A Hughes , Mathew A Lee , Alec McKinlay , Nicholas J Timpson , Laura J Corbin bioRxiv 2025.11.11.687831; doi: https://doi.org/10.1101/2025.11.11.687831 Share This Article: Copy Citation Tools metaboprep v2: Broadening the application of the metaboprep beyond metabolomics Nicholas Sunderland , David A Hughes , Mathew A Lee , Alec McKinlay , Nicholas J Timpson , Laura J Corbin bioRxiv 2025.11.11.687831; doi: https://doi.org/10.1101/2025.11.11.687831 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17635) Bioengineering (13859) Bioinformatics (41846) Biophysics (21401) Cancer Biology (18534) Cell Biology (25422) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24285) Genetics (15582) Genomics (22463) Immunology (17700) Microbiology (40298) Molecular Biology (17141) Neuroscience (88424) Paleontology (666) Pathology (2825) Pharmacology and Toxicology (4813) Physiology (7633) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00