GIMMEcpg: Global Imputation of Mean CpG MEthylation in Real-time

doi:10.1101/2025.03.17.643746

GIMMEcpg: Global Imputation of Mean CpG MEthylation in Real-time

2025 · doi:10.1101/2025.03.17.643746

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 55,342 characters · extracted from preprint-html · click to expand

GIMMEcpg: Global Imputation of Mean CpG MEthylation in Real-time | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results GIMMEcpg: Global Imputation of Mean CpG MEthylation in Real-time View ORCID Profile Ismail Moghul , View ORCID Profile Niuzheng Chai , View ORCID Profile Nikolas Pontikos , View ORCID Profile Alison Hardcastle , View ORCID Profile Javier Herrero , View ORCID Profile Stephan Beck doi: https://doi.org/10.1101/2025.03.17.643746 Ismail Moghul 1 University College London, UCL Cancer Institute , London, UK 2 University College London, Institute of Ophthalmology , London, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ismail Moghul For correspondence: ucbtmog{at}ucl.ac.uk Niuzheng Chai 2 University College London, Institute of Ophthalmology , London, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Niuzheng Chai Nikolas Pontikos 2 University College London, Institute of Ophthalmology , London, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nikolas Pontikos Alison Hardcastle 2 University College London, Institute of Ophthalmology , London, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alison Hardcastle Javier Herrero 1 University College London, UCL Cancer Institute , London, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Javier Herrero Stephan Beck 1 University College London, UCL Cancer Institute , London, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Stephan Beck Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Whole-genome DNA methylation (methylome) analysis is of broad interest to biomedical research due to its central role in human development and disease. However, generating high-quality methylomes at scale remains challenging due to inherent technical limitations. While imputation has the potential to help overcome this problem, no existing approach adequately addresses the scaling issue. Here, we present GIMMEcpg (Global Imputation of Mean cpg MEthylation), a novel imputation tool that scales efficiently from single samples to large cohort studies. GIMMEcpg uses a custom feature dataset built from known CpG sites within the same dataset to impute missing values by calculating the distance-weighted mean of the methylation value of the two immediately neighbouring CpG sites. We benchmarked GIMMEcpg for speed and accuracy against multiple imputation methods using downsampled datasets produced from high-quality (∼100x) Whole Genome Bisulfite Sequencing (WGBS) data. With a 10x downsampled dataset, GIMMEcpg was able to process the dataset and impute 9.14 Million CpG sites within 7 seconds (R: 0.78, MAE: +5.6%, RMSE: +10.9%). Our results demonstrate that GIMMEcpg is 39-2,562 times faster than three existing methylation imputation tools (BoostMe, DeepCpG, and MethImpute) while maintaining comparable accuracy. To quantify GIMMEcpg’s scalability, we applied it to the most extensive single collection of WGBS data (N=645 at variable coverage) from the EpiATLAS generated by the International Human Epigenome Consortium (IHEC). Using a single, standard CPU server, GIMMEcpg processed and imputed an additional 2.4 billion CpG methylation values across the 645 datasets in less than a day, enriching the EpiATLAS methylome resource by 20%. This demonstrates that GIMMEcpg scales to large cohort studies with only a subtle impact on accuracy, as illustrated by our benchmark. We also developed a machine learning variant, GIMMEcpg.ml, which delivers a higher accuracy compared to existing methodologies. Using the same 10x downsampled benchmarking dataset, GIMMEcpg.ml achieved a Person Correlation of 0.87 compared to the ground truth, representing an improvement of 0.11 over the best performing alternative method. Additionally, GIMMEcpg.ml has a Mean Absolute Error (MAE) of 8.67%, which is 2.63% lower than the most accurate performing alternative. While this enhanced accuracy comes at the cost of increased computation requirements, GIMMEcpg.ml is a useful tool where higher accuracy is preferred over scalability. For increased accessibility, GIMMEcpg is freely available under an MIT license as R and Python packages at https://github.com/ucl-medical-genomics/gimmecpg-r and https://github.com/ucl-medical-genomics/gimmecpg-python , respectively. Introduction Over the past decades, plummeting sequencing costs have made genomic and epigenomic analysis more accessible to researchers (Kris A. Wetterstran n.d.; Heidi Chial 2008 ). As a result, whole-genome DNA methylation (methylome) analysis has become possible ( Beck 2010 ) and has been applied to many studies in health and disease ( Day & Sweatt 2010 ; Jin & Liu 2018 ; Feinberg 2007 ). However, the vast majority of existing methylome data is still based on array technologies that only cover 1-3% of the human methylome. In fact, over 99% of studies on the EWAS atlas were carried out using arrays ( Li et al. 2019 ). Sequencing-based methylome analysis first gained popularity with the introduction of Reduced Representation Bisulfite Sequencing (RRBS) ( Meissner et al. 2005 ) and targeted bisulfite sequencing technologies, but these typically still only cover 14-15% of the entire methylome (Tanić et al. 2022). In comparison, only a few studies to date involved large-scale Whole Genome Bisulfite Sequencing (WGBS), most notably the DNA Methylation Atlas ( Loyfer et al. 2023 ) and the IHEC EpiATLAS ( EpiATLAS Consortium 2025 ). Despite its limitations, bisulfite treatment has been the gold standard for the analysis of 5-methylcytosine (5mC) for over 30 years ( Frommer et al. 1992 ). However, it is important to note that this method cannot distinguish 5mC from its oxidised forms 5-hydroxymethylcytosine (5hmC), 5-formylcytosine (5fC) and 5-carboxycytosine (5caC) ( Wu & Zhang 2017 ). Therefore, it can only distinguish between modified and unmodified cytosines. For the purpose of this study, we designate all modified cytosines as ‘methylated’, as the vast majority of modifications constitute 5mC. Although bisulfite sequencing remains popular, many alternative and potentially superior methods have been developed for mapping DNA methylation and other epigenetic modifications by sequencing technologies ( Chen et al. 2023 ). Bisulfite treatment also introduces multiple technical challenges that complicate the analysis. One significant issue is incomplete conversion, where some unmethylated cytosines are not converted into uracil, making it impossible to differentiate between a methylated cytosine and an unconverted, unmethylated cytosine. The bisulfite conversion rate typically ranges from 95-98% ( Warnecke et al. 2002 ). In human WGBS datasets, this means that approximately 1.4 million cytosines are unconverted (assuming a total of 29 million haploid CpGs and a 95% conversion rate), which will, as a result, be incorrectly interpreted as methylated. Another challenge is DNA fragmentation, as bisulfite treatment causes extensive DNA fragmentation ( Simpson et al. 2017 ), negatively affecting downstream PCR amplification. Furthermore, these issues are particularly relevant in the context of bulk-cell sequencing. Bulk cell samples typically contain multiple cell types of varied developmental stages, which affects the precision and accuracy of calculating average CpG methylation values. It is, therefore, crucial to ensure sufficient sequence coverage over each CpG site to accurately call methylation values and, where possible, deconvolve them into their constituent cell types using a cell type deconvolution algorithm (reviewed in ( Teschendorff & Relton 2018 ). Imputation can overcome some of these limitations and has been used in genetic studies for several decades with high accuracy. It is now a standard part of many Genome-Wide Association Studies (GWAS) and other genomic analyses ( Marchini & Howie 2010 ). Over the last few years, imputation has also been explored in the context of DNA methylation. Several tools have been developed for this purpose, each with its own approach and scope. DeepCpG, initially designed for single-cell WGBS data, has also shown applicability to bulk-cell WGBS data. It utilises deep learning techniques to determine associations between DNA sequence patterns (1kb) and between neighbouring CpG sites (50 CpG sites) ( Angermueller et al. 2017 ). Melissa, another tool developed for single-cell data, utilises a Bayesian hierarchical method to cluster local methylation patterns and leverages this information to impute missing data effectively ( Kapourani & Sanguinetti 2019 ). For bulk-cell WGBS data, tools like BoostMe and METHimpute have been developed. BoostMe uses a gradient boosting algorithm to extract useful information from the two immediate, neighbouring CpG sites as well as the sample average ( Zou et al. 2018 ). On the other hand, METHimpute leverages a Hidden Markov Model-based binomial test that learns from the methylation status of neighbouring cytosines ( Taudt et al. 2018 ). ChromImpute is the first imputation method that was designed for large epigenomic studies with multi-dimensional data types, including histone modifications and RNA-seq ( Ernst & Kellis 2015 ). It can predict the methylation signal in a target sample based on two classes of features: signal from other data types in the same sample and the corresponding methylation signal in other samples. The ability of ChromImpute to integrate information across multiple dataset types (including histone modifications, RNA-Seq, and WGBS) and across multiple samples made it the method of choice for imputation of major epigenomic resources such as EpiMap ( Boix et al. 2021 ) and the IHEC EpiATLAS ( EpiATLAS Consortium 2025 ). Even though ChromImpute can impute WGBS if other histone and/or RNA-seq data are present, it is unable to do so with just WGBS data which is the focus of this study. For this reason, ChromImpute was excluded from the benchmarking presented here. While these imputation tools are valuable for extracting maximum value from a dataset, many come with high computational demands that limit their scalability to large datasets. Addressing this gap, we introduce GIMMEcpg (Global Imputation of Mean CpG Methylation), a novel imputation tool specifically designed to efficiently scale from analysing a single sample to handling extensive cohort studies. Results GIMMEcpg: Global Imputation of Mean CpG Methylation GIMMEcpg is built upon the established principle that the methylation value of CpG sites correlates highly with neighbouring CpG sites, with this correlation increasing as the distance between the neighbouring CpG sites decreases ( Eckhardt et al. 2006 ; Moghul 2021 ; Li et al. 2010 ). This key feature has also been leveraged by other imputation tools, such as DeepCpG and BoostMe, for imputing missing values. Eckhardt et al. (2006) first demonstrated that a significant correlation was observed between neighbouring CpG sites up to distances of 1Kb, which then rapidly deteriorated for distances larger than 2kb ( Eckhardt et al. 2006 ). However, as this study was carried out using old Amplicon-based technology over 15 years ago, we decided to re-evaluate this relationship using high-quality (100x coverage) WGBS datasets. Our analysis, as illustrated in Supplementary Figure 1, reveals that the Pearson correlation between neighbouring CpG sites decreases more rapidly with increasing genomic distance than previously reported. At very short distances, the Pearson correlation approaches 1, but it rapidly declines linearly to a correlation of 0.3 at 250bp. Beyond this distance, although some correlation persists between neighbouring CpG sites, it is considerably weaker, reaching random levels of correlation past 1.5kb. GIMMEcpg builds on this information by employing a simple yet effective imputation strategy, where the distance-weighted mean of the two neighbouring CpG sites is calculated, as described in the equation below. This simplified mathematical approach significantly reduces computational requirements compared to more complex Machine Learning or Deep Learning models while maintaining high accuracy within biologically relevant distances (i.e. below 1kb). Equation 1 : The methylation value (β 0 ) of a CpG site with an unknown methylation value can be defined as the weighted average of the methylation value of the neighbouring CpG sites. The methylation value of the upstream and downstream neighbouring CpG sites are denoted by β 1 and β -1 , respectively. The linear distance in base pairs between the focal CpG site (with the unknown methylation value) and each downstream and upstream neighbouring CpG site are denoted by D 1 and D -1 , respectively . This formula is based on the assumption that there is a correlation between neighbouring CpG sites. Therefore, it is important to avoid imputing CpG sites using this formula where this assumption no longer holds. By default, GIMMEcpg does not impute CpG sites if both neighbouring CpG sites are further than 1KB away from the focal CpG site. GIMMEcpg provides the distance to the closest neighbouring CpG as a proxy for a confidence value to aid in interpretation and allow for customised downstream analysis. For example, end-users are able to choose a stricter distance cut-off if desired, balancing accuracy with the number of CpG sites imputed based on their specific research needs. Baseline Accuracy Establishing a reliable ground truth dataset is crucial to assessing the performance of imputation methods like GIMMEcpg. Therefore, we utilised high-quality (100x coverage) bulk-cell WGBS data from the IHEC EpiATLAS for this purpose ( EpiATLAS Consortium 2025 ). Our approach involved downsampling two high-coverage WGBS datasets to simulate WGBS datasets with lower CpG coverage. This process resulted in 48 downsampled datasets, comprising six datasets at eight different coverage levels. In this manner, the actual methylation value is known for each missing data point in the downsampled dataset, allowing for precise accuracy assessment of any subsequent imputation. It is essential to note that the imputation accuracy calculated using a lower coverage downsampled dataset compared to a higher coverage original dataset is likely to be overestimated. This overestimation occurs due to an expected decrease in accuracy from having data with a lower CpG coverage. To account for this, it is important to determine the baseline error (or residual noise) by comparing methylation values in each downsampled dataset to the original dataset for CpG sites present before imputation (i.e., the baseline accuracy). Our analysis revealed that at a 10x coverage, a very high Pearson correlation of 0.96 exists between the downsampled dataset and the corresponding original dataset. However, when examining the Mean Absolute Error (MAE) and the Root Mean Squared Error (RMSE), it becomes clear that some additional variance (or error) was introduced as a result of downsampling. These effects appear relatively small, with an MAE of 6% and RMSE of 8.8%. The baseline accuracy present in the datasets downsampled to a 10x coverage is shown in Figure 1A,D,E , while the baseline accuracy for all the downsampled datasets with coverages ranging from 5x to 60x are shown in Supplementary Figure 2. Download figure Open in new tab Figure 1: [A]: Baseline error in downsampled datasets with a 10x coverage (i.e. residual noise in the dataset due to being at lower coverage). A distribution, averaged over six downsampled datasets with a 10x coverage, comparing the methylation value in the downsampled dataset and the original 100x methylation values used here as ground truth. The plot only includes information from CpG sites with a greater than a 10x read depth. Any deviation from the y = x-axis corresponds to an error in the methylation value. The baseline accuracy for all the downsampled datasets with coverages ranging from 5x to 60x are shown in Supplementary Figure 2. [B] GIMMEcpg Imputation Accuracy in downsampled datasets with 10x coverage. A distribution of imputed values, comparing the imputed methylation value and the corresponding actual methylation value in the original dataset. Any deviation from the y = x-axis corresponds to an error in the methylation value. However, it is important to note that these plots do not take into account the baseline error and, consequently, are expected to be overestimated. The GIMMEcpg Imputation accuracy for all the downsampled datasets with coverages ranging from 5x to 60x are shown in Supplementary Figure 3. [C] GIMMEcpg.ml Imputation Accuracy in downsampled datasets with 10x coverage . A distribution of imputed values, comparing the imputed methylation value and the corresponding actual methylation value in the original dataset. Any deviation from the y = x-axis corresponds to an error in the methylation value. However, it is important to note that these plots do not take into account the baseline error and, consequently, are expected to be overestimated. The GIMMEcpg.ml Imputation accuracy for all the downsampled datasets with coverages ranging from 5x to 60x are shown in Supplementary Figure 4. [D-E] Box plots showing the distribution of the baseline absolute error (green and the error associated with GIMMEcpg (blue) and GIMMEcpg.ml (pink), grouped according to the CpG density and the genomic annotation of the CpG site. Outliers are not shown on the plot due to the number of outliers; rather, the box-plot whiskers are extended to the maxima and minima with a dashed line. The box plots show summarised information from six downsampled datasets with a 10x coverage. Interestingly, when grouping the CpG sites based on genomic features, the MAE is not equivalent across different genomic features despite the coverage downsampling being carried out randomly. Furthermore, the MAE is minimal in certain genomic features such as CpG islands, 5’ UTRs, promoters and first exons, while being elevated in other genomic features like intergenic regions, introns, 3’ UTRs and LNC RNA. These results also highlight that even at 30x, there is an MAE of 3.3% and an RMSE of 5%. This finding highlights that each methylation value is only accurate to ±3%, compared to when sequenced at 100x. The implication of this is significant in that 30x WGBS should not be used to infer DMPs with a differential methylation value smaller than 6%. GIMMEcpg Performance GIMMEcpg was able to impute missing data in a WGBS dataset with 10x coverage in 7 seconds while using under 40GB of RAM. In these six downsampled datasets with a 10x coverage, an additional 9.1 million CpG sites were imputed in each dataset, with an average Pearson correlation of 0.78, an MAE of 11.6% and an RMSE of 19.7%. These numbers are slightly higher than the baseline error in the same dataset (i.e., an increased MAE of +5.6% and an increased RMSE of +10.9%). These results suggest that GIMMEcpg is able to process and impute a complete WGBS dataset extremely efficiently while ensuring a relatively high level of accuracy. The performance of GIMMEcpg when imputing the downsampled dataset with a 10x coverage is shown in Figure 1B,D,E , while the performance of GIMMEcpg for all the downsampled datasets with coverages ranging from 5x to 60x are shown in Supplementary Figure 3. Furthermore, the accuracy of the imputed values can be further explored by separating the imputed CpG sites by the genomic annotation. The imputed methylation value has a fairly low MAE in CpG Islands (0.4% at 10x), 5’ UTRs (0.5% at 10x), and first exons (1.4%). These results can be explained by the fact that these regions are more CpG-dense regions, where GIMMEcpg works best (i.e., regions where the neighbouring CpG site is close by). GIMMEcpg.ml An alternative machine learning approach using model stacking was also explored, where existing data from within the same dataset were utilised to train multiple models which are then merged to produce a single ensemble model. As part of this approach, multiple machine learning techniques, including XgBoost ( Chen & Guestrin 2016 ), Gradient Boosting Machines (GBM) ( Friedman 2001 ), Extremely Randomised Trees (XRT) ( Geurts et al. 2006 ), Distributed Random Forests (DRF) ( Cevid et al. 2022 ), Generalised Linear Models (GLM) ( Nelder & Wedderburn 1972 ) and a fully-connected, multi-layer Artificial Neural Networks (ANN) ( Hopfield 1988 ) were employed. Furthermore, an automated grid search was utilised to improve the model’s accuracy through 5-fold cross-validation, before being merged into a single ensemble model. The feature dataset is generated from known data (i.e. CpG sites with a corresponding methylation value) within the dataset. The feature dataset is inspired from information utilised within the GIMMEcpg approach. Here, the methylation value of known CpG sites is used as a feature, together with the methylation value and distance to the two immediately neighbouring CpG sites. In this manner, a dataset with five features is produced ( Figure 2B ) containing the focal methylation value, together with the genomic distance and the methylation value of the two immediately neighbouring CpG sites. After training, the best-performing model from cross-validation is selected and used to impute the DNA methylation value of missing CpG sites using a similar dataset generated for the missing CpG sites. Download figure Open in new tab Figure 2: Imputation using feature dataset produced from information on immediate CpG neighbours. [A] A schematic showing a CpG site (β 0 ) with an unknown methylation value and its two immediate neighbouring CpG sites. The methylation value of the upstream and downstream neighbouring CpG sites are denoted by β 1 and β -1 , respectively. The linear distance in base pairs between the focal CpG site (with the unknown methylation value) and each downstream and upstream neighbouring CpG site are denoted by D 1 and D -1 , respectively. The linear distance in base pairs between the focal CpG site (with the unknown methylation value) and each downstream and upstream neighbouring CpG site are denoted by C 1 and C -1 , respectively. [B] The feature table shows cases where the focal CpG site has a known methylation value. This would be used to train the model, where the algorithm will attempt to calculate a value close to the actual methylation value using the rest of the data in the feature dataset. [C] For each CpG site with an unknown methylation value, a similar feature dataset is generated with the β 0 value left blank. A trained model can then be used to calculate the value of β 0 using the information in the remainder of the feature dataset. When applied to the six downsampled datasets at a coverage of 10x and given 30 minutes of training time, GIMMEcpg.ml imputed an additional 13.8 million CpG sites at an MAE of 8.67% and a Pearson correlation of 0.87 (See Figure 1C,D,E and Supplementary Figure 4). These results demonstrate that GIMMEcpg.ml is able to successfully train a highly accurate model from data within the same sample. Furthermore, by exploring the methylation value through grouping by genomic annotation, it becomes apparent that GIMMEcpg.ml is significantly more accurate than GIMMEcpg and works best in CpG-dense regions such as CpG Islands, 5 UTRs, first exons, and promoters. These results suggest that the machine learning approach of GIMMEcpg.ml offers a significant improvement in accuracy over the standard GIMMEcpg method. However, this comes at the cost of increased computational time and complexity. The choice between GIMMEcpg and GIMMEcpg.ml may depend on the specific requirements of a research project, balancing the trade-off between imputation accuracy, computational efficiency and scalability. Benchmarking against alternative methods We systematically evaluated and benchmarked purpose-built DNA methylation imputation algorithms (such as DeepCpG, BoostME and MethImpute) together with mean imputation as a baseline imputation methodology. A total of ten different methodologies, also summarised in Table 1 , were applied to the 48 downsampled datasets (described above), and the imputation accuracy (MAE, RMSE, R) and compute performance (time taken and memory utilised) were calculated. The accuracy and performance results are shown in Table 1 and Figure 3 . View this table: View inline View popup Download powerpoint Table 1. Accuracy metrics for GIMMEcpg and existing imputation methodologies at eight different coverage levels ranging from 5x to 60x. R: Pearson Correlation; MAE: Mean Absolute Error (%), RMSE: Root Mean Squared Error (%), RAM - Memory used (GB), Time (seconds). The colour scale is relative to each coverage block and is described in the key, where green is shown for better-performing values (i.e. higher number of CpGs or higher R, or lower error rates), and red used for worse values. View this table: View inline View popup Download powerpoint Table 2. Summary of downsampled datasets produced from S1_W1 and S2_W1. The statistics have been produced by calculating a mean across six datasets at each coverage level. At the coverage level, three datasets were downsampled from S1_W1 and a further three datasets from S2_W1. Most statistics are displayed for the entire dataset (i.e. CpG sites with a coverage higher than 1) as well as for filtered subsets where the CpG site has a coverage higher or equal to 10x. Download figure Open in new tab Figure 3: GIMMEcpg.ml Imputation Accuracy at 10x coverage: Accuracy metrics for GIMMEcpg and existing imputation methodologies at 10x coverage level. R: Pearson Correlation; MAE: Mean Absolute Error (%), Time (seconds). A simplistic imputation approach would be to use the mean to replace missing values. This is particularly appropriate for DNA methylation data because the vast majority of the methylome is highly methylated (and thus close to the mean methylation). In a 10x dataset, it was possible to impute a total of 17.68 million CpG sites with an MAE of 21.9% and an RMSE of 28.2%. As a simple algorithm, the computational resources of such an approach are minimal, with a 2-second runtime using 7GB RAM. METHimpute, a purpose-built imputation tool designed for DNA methylation data, was more accurate than mean imputation, but at the expense of increased computation complexity. METHimpute took 11 minutes (684 seconds) to impute a total of 17.68 million CpG sites in this 10x downsampled dataset with a Pearson correlation of 0.71 and a MAE of 16.4% and a RMSE of 22.5%. DeepCpG and BoostMe are purpose-built imputation tools with the best-performing imputation accuracy. For this 10x downsampled dataset, DeepCpg achieved a Pearson Correlation of 0.7 and an MAE of 11.6. However, DeepCpG is extremely slow, taking approximately 5 hours per dataset, despite using a GPU. On the other hand, BoostMe takes roughly 5 minutes to run per dataset with a Pearson Correlation of 0.76 and an MAE of 11.3%. Interestingly, even though METHimpute had a slightly higher Pearson Correlation than DeepCpG, MethImpute’s MAE was significantly higher (i.e. a difference of 4.8%) than that produced from DeepCpG. In comparison, GIMMEcpg is roughly 38 times and 2560 times faster than BoostMe and DeepCpG, respectively. Despite being many times faster, GIMMEcpg is roughly just as accurate as BoostMe and DeepCpG. GIMMEcpg has a higher Pearson correlation than BoostMe or DeepCpG while having a similar MAE (i.e. GIMMEcpg at 11.6% vs BoostME at 11.3% and DeepCpG at 11.6%). In contrast, GIMMEcpg.ml is significantly more accurate than DeepCpG and BoostMe. GIMMEcpg.ml produced a significantly higher Pearson correlation of 0.87 (compared to DeepCpG’s 0.7 and BoostMe’s 0.76) and a significantly lower MAE of 8.67% (compared to DeepCpG’s 11.6% and BoostMe’s 11.3%). This MAE is only 2.67% higher than the baseline MAE, suggesting that GIMMEcpg.ml is able to impute values that are only off by 2.67% compared to the background noise. As a model learning approach, GIMMEcpg.ml is slower than BoostMe but much faster than DeepCpG. During our internal tests, we were able to reproduce similar accuracies by restricting the analysis time (i.e. to 5 minutes), however, in order to ensure GIMMEcpg.ml was robust, the analysis time was increased to 30 minutes with an early stopping approach used, such that the model stops training once the MAE cannot be further decreased during cross validation. These benchmarking results demonstrate that GIMMEcpg and GIMMEcpg.ml offer a compelling balance of accuracy and computational efficiency, making them particularly suitable for large-scale methylome studies. To our knowledge, GIMMEcpg is the fastest DNA methylation imputation tool currently available, while GIMMEcpg.ml is the best-performing DNA methylation imputation tool currently available. Application to PGP-UK & EpiATLAS datasets To demonstrate GIMMEcpg’s capability, we applied it to the WGBS datasets from the Personal Genome Project UK (PGP-UK) (Chervova et al. 2019) and the EpiATLAS ( EpiATLAS Consortium 2025 ). The PGP-UK WGBS data consists of ten WGBS datasets with a coverage of 15.9x. These datasets included 15.9 million CpG sites with a read depth higher than 10x. Running GIMMEcpg took roughly a minute to process all 10 WGBS datasets and imputed an additional 12.2 million CpG sites. As these WGBS datasets are matched with an Illumina 450k array panel from the same individual, it is possible to assess GIMMEcpg’s accuracy using CpG sites that are common between the array panel and the WGBS dataset. However, it is important to note that differences would be expected due to significantly different processing and analysis technologies between WGBS and array panels. When considering the entire dataset, the average increase in MAE and RMSE after imputation was 1.4% and 2.7%, respectively. The Pearson correlation decreased by 0.03 on average. When focusing on just the imputed CpG sites, the average increase in MAE and RMSE (compared to baseline error across CpG sites known in both datasets) was 4.1% and 7.2%, respectively. To demonstrate GIMMEcpg scalability, we applied GIMMEcpg to the EpiATLAS WGBS datasets produced by IHEC ( EpiATLAS Consortium 2025 ). This dataset comprises 645 WGBS datasets of varying coverage. GIMMEcpg took 4 hours and 15 minutes to run on 645 WGBS datasets on a single HPC node (80 CPU threads and 512GB RAM) by sequentially processing each dataset after the next. This included the time to read and process the file and write the output to disk. Before imputation, the 645 methylomes of the EpiATLAS DNA methylation matrix (minimum 10x read depth for each CpG site) had approximately 6.8 billion missing data points out of a total 18.7 billion data points (29 million haploid CpG sites x 645 datasets). Every single CpG site had a missing data point from at least one sample. Row-wise deletion, where CpG sites with missing data in more than 25% of samples are removed, would cause the number of CpG sites to decrease from roughly 29 million to 9 million. GIMMEcpg was able to impute a total of 2.4 billion data points (using the default 1kb filter), resulting in an additional 20% of data points to the EpiATLAS methylation dataset. Even though the accuracy of the imputation cannot be determined for these imputed sites within the EpiATLAS dataset, the benchmark within this study helps provide an idea of accuracy based on the dataset’s individual coverage and the corresponding distance to the neighbouring CpG sites. These applications to real-world datasets demonstrate GIMMEcpg’s efficiency in processing large-scale methylome data and its ability to significantly reduce missing data points, potentially enabling more comprehensive analyses of methylation patterns across samples. Discussion Recent improvements to experimental, instrumental, and computational approaches have made it possible to investigate whole-genome methylomes on an unprecedented scale. Imputation algorithms have greatly helped to improve and harmonise such large-scale datasets but lacked the scalability for routine analysis. To address this limitation and enable all researchers to carry out such large-scale methylome analyses, we have developed GIMMEcpg. In situations where higher accuracy is required, we developed GIMMEcpg.ml. The evaluation of GIMMEcpg against existing methodologies such as BoostMe and DeepCpG reveals that GIMMEcpg stands out for its exceptional speed while maintaining accuracy that is on par with the more established methods. This speed becomes particularly advantageous when handling the whole methylome datasets that are becoming increasingly common and suggests that efficiency does not necessarily have to come at the cost of precision. Moreover, during our benchmark, existing methodologies (BoostME and DeepCpG) failed to work as effectively at lower coverage datasets (i.e. 5-7x), which is arguably where they would provide the most benefit. In comparison, GIMMEcpg was able to work as effectively at lower coverages. GIMMEcpg demonstrates that more sophisticated and complex methodologies built using AI or larger training datasets do not always translate to increased accuracy. Interestingly, DeepCpG learns from significantly more information than BoostMe and GIMMEcpg but did not show a proportional increase in accuracy over simpler methods. Similarly, BoostME’s use of a complex gradient boosting algorithm did not improve accuracy compared to GIMMEcpg. In fact, the increased input dataset size and complex methodologies resulted in drastically increasing the computational requirements as well as the runtime. Therefore, this highlights the importance of benchmarking against more traditional and less complex methodologies. Furthermore, GIMMEcpg.ml significantly surpasses the accuracy of both BoostMe and DeepCpG and is expected to be ideal for cases where higher accuracy is required at the expense of slower runtime - for example, when identifying DMPs with small differential methylation, the imputation accuracy becomes more critical. GIMMEcpg.ml is therefore a favourable alternative to DeepCpG and BoostMe for most applications. However, when additional data types (such as histone modifications or RNA-seq) are available, tools like ChromImpute should be explored. Nonetheless, most routine DNA methylation studies do not include multiple epigenomic modalities and thus cannot be used with ChromImpute. Additionally, there isn’t always enough biosample available for generating this multi-dimensional data. The increasing production of DNA methylation datasets, fueled by advances in sequencing technologies and national biobanks, points to a future where the efficiency and scalability of computational tools will be even more critical. National Biobanks, such as in Estonia and Japan have started to routinely produce whole methylome datasets at scale and the UK Biobank recently announced to generate 50,000 methylomes using Nanopore-seq (Anon n.d.). Furthermore, technologies like Nanopore-, SMRT, TAB- and Biomodal-seq are able to produce methylome datasets at no or little additional cost, making them even more accessible to the scientific community. Given these advances, it can be expected that the number of methylome datasets will grow exponentially over the coming years, requiring algorithms like GIMMEcpg to enable swift analysis at scale without compromising accuracy. It’s worth noting that GIMMEcpg is expected to become even faster as the underlying technology ( https://pola.rs/ ) is under active development with a large community. The technology has recently added support for GPUs and can automatically be used (if available) for relevant parts of the underlying algorithm where appropriate, further improving its performance. GIMMEcpg and GIMMEcpg.ml are available as docker images and can thus be quickly deployed to cloud computing environments. Furthermore, the drastically reduced computational requirements of GIMMEcpg compared to alternative tools and faster run times would also equate to drastically lower cloud computing costs. The reduced computational load would also likely equate to a decreased carbon footprint (Grealey et al. 2022). Despite its advantages, GIMMEcpg does have some limitations. Due to its reliance on neighbouring information to impute missing sites, its application is mainly limited to data where neighbouring CpG sites are close enough to the missing site to have highly correlated methylation values. In general, this means that the neighbouring CpG and missing sites should be within 1kb of each other. Since the feature dataset for GIMMEcpg is built using only the information within each sample, GIMMEcpg cannot integrate information from other data types, such as RNA-Seq or histone data. Similarly, the imputation model for GIMMEcpg is built on a per-sample basis and does not translate across samples within a cohort or to different cell types. Furthermore, GIMMEcpg was developed for bulk WGBS data, and its utility on single-cell methylation data has not been confirmed. In conclusion, GIMMEcpg and GIMMEcpg.ml represents a significant advancement in methylome imputation, offering a balance of speed, accuracy, and scalability that is well-suited to large-scale methylome studies. GIMMEcpg and GIMMEcpg.ml are easily accessible through Python and R libraries, and its performance and efficiency make it a valuable tool for researchers working with whole-genome methylation data. Methods Datasets The datasets used for the benchmark carried out in this work have been produced by IHEC and are part of the EpiATLAS ( EpiATLAS Consortium 2025 ). Two deeply sequenced WGBS (IHECRE00000101.3 and IHECRE00000155.3) were produced from CD14-positive, CD16-negative classical monocytes. IHECRE00000101.3 was produced from venous blood from a healthy, 60-65 years old female donor (C004SQ), while IHECRE00000155.3 was produced from cord blood from a healthy, 0-5 years old female donor (C005PS). These WGBS datasets were sequenced using purified cells on an Illumina HiSeq 2000 instrument. The datasets were prepared using 101 base pair, pair-ended reads. To achieve deep coverage, a total of 11 (IHECRE00000101.3) and 10 (IHECRE00000155.3) sequencing runs were carried out for each sample, and the sequenced reads were then merged before further processing. The WGBS summary CpG count files for IHECRE00000101.3 and IHECRE00000155.3 are available through IHEC, while the raw data is available on EGA (EGAX00001097774 and EGAX00001097775 respectively). Downsampling These two deeply sequenced datasets have a CpG coverage of approximately 100x. This is more than three times the standard coverage (30x) that is recommended by IHEC for WGBS. As such, this makes these two datasets ideal for producing multiple downsampled datasets, each with a lower CpG coverage. To downsample these two datasets, a random read (together with its paired mate) is selected from all mapped reads (i.e., from the BAM file) and then merged to produce another dataset. A new mapped BAM file with a lower sequencing coverage can be produced in silico by selecting a specific percentage of read pairs. The samtools view command from the Samtools package (version: 1.11) was used with the ‘-s’ parameter to randomly select random read pairs from the input BAM file, producing a smaller, downsampled BAM file. Moreover, to increase the randomness of which read pairs are selected, a different seed number was used each time the input file was downsampled. The downsampled BAM file can then be processed using the standard pipeline to extract the downsampled dataset’s methylation information. A total of 24 datasets were downsampled for each dataset, with three datasets produced at each of the following eight different coverage levels: 5%, 7%, 10%, 15%, 20%, 25%, 30% and 60%. To ensure a high level of diversity between the 24 datasets, a different seed number was used each time data was downsampled. In conclusion, a total of 48 downsampled datasets were produced for both samples (i.e. 24 datasets for each sample) by using three different seed numbers. A summary of the downsampled datasets is shown in Supplementary Table 1. GIMMEcpg R and Python Packages GIMMEcpg is available as both R and Python packages. Both implementations use the Polars library and its lazy application programming interface (API) (Vink et al. 2024). These features allow GIMMEcpg to process data in parallel and carry out automatic query optimisation. In addition, GIMMEcpg has been built with an option to handle large out-of-RAM datasets by utilising the lazy API’s streaming mode. Before imputing, GIMMEcpg first collapses CpG sites on complementary strands into one row. Following this, GIMMEcpg filters out CpG sites with low read coverage; any sites not meeting the minimum coverage threshold will be considered missing and subsequently imputed. The default coverage threshold is ten reads. GIMMEcpg then cross-references the sample methylation data with a reference containing all possible CG sites in the human genome to identify missing CpG sites for imputation. The package includes two reference files, one each for hg38 and hg19. Users may also choose to supply their own reference files or generate them with the provided R script. Only autosomal CpG sites are considered for constructing the feature dataset and imputing missing methylation values. In addition, any CpG sites falling within the ENCODE blacklisted regions are excluded from both the feature dataset and imputation (Amemiya et al., 2019). Exclusion of ENCODE blacklisted CpG sites is the default behaviour; however, it is possible to override it or to provide a custom blacklist. GIMMEcpg depends almost entirely on Polars to perform the required calculations. On the other hand, GIMMEcpg.ml relies on H2OAutoML’s supervised machine learning algorithm to train several models and stacked ensembles based on the known CpG sites in the dataset ( LeDell & Poirier 2020 ). The models are then ranked, and the best model based on mean residual deviance is used to impute the missing methylation values. If a maximum neighbour distance has been specified, GIMMEcpg will only use known CpG sites within the neighbouring distance threshold for the feature set and impute only the missing sites that are less than the specified distance from its immediate neighbours. The default threshold for maximum neighbour distance in both GIMMEcpg and GIMMEcpg.ml modes is 1kb. References ↵ Angermueller , C. et al. , 2017 . DeepCpG: accurate prediction of single-cell DNA methylation states using deep learning . Genome biology , 18 ( 1 ), p. 67 . OpenUrl CrossRef PubMed Anon, Landmark genetics partnership to probe the causes of cancer and dementia . Available at: https://www.ukbiobank.ac.uk/learn-more-about-uk-biobank/news/landmark-genetics-part nership-to-probe-the-causes-of-cancer-and-dementia [Accessed March 17, 2025 ]. ↵ Beck , S. , 2010 . Taking the measure of the methylome . Nature biotechnology , 28 ( 10 ), pp. 1026 – 1028 . OpenUrl CrossRef PubMed Web of Science ↵ Boix , C.A. et al. , 2021 . Regulatory genomic circuitry of human disease loci by integrative epigenomics . Nature , 590 ( 7845 ), pp. 300 – 307 . OpenUrl CrossRef PubMed ↵ Cevid , D. et al. , 2022 . Distributional random forests: Heterogeneity adjustment and multivariate distributional regression . Journal of machine learning research: JMLR , 23 ( 333 ), pp. 1 – 79 . OpenUrl ↵ Chen , T. & Guestrin , C. , 2016 . XGBoost: A Scalable Tree Boosting System . in Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. KDD ‘16: The 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining . New York, NY, USA : ACM . Available at: https://dl.acm.org/doi/abs/10.1145/2939672.2939785 . ↵ Chen , X. et al. , 2023 . Mapping epigenetic modifications by sequencing technologies . Cell death and differentiation . Available at : doi: 10.1038/s41418-023-01213-1 . OpenUrl CrossRef Chervova , O. et al. , 2019 . The Personal Genome Project-UK, an open access resource of human multi-omics data . Scientific data , 6 ( 1 ), p. 257 . OpenUrl PubMed ↵ Day , J.J. & Sweatt , J.D. , 2010 . DNA methylation and memory formation . Nature neuroscience , 13 ( 11 ), pp. 1319 – 1323 . OpenUrl CrossRef PubMed Web of Science ↵ Eckhardt , F. et al. , 2006 . DNA methylation profiling of human chromosomes 6, 20 and 22 . Nature genetics , 38 ( 12 ), pp. 1378 – 1385 . OpenUrl CrossRef PubMed Web of Science ↵ EpiATLAS Consortium , 2025 . IHEC EpiATLAS Data . Available at: https://ihec-epigenomes.org/epiatlas/data/ [Accessed March 17, 2025 ]. ↵ Ernst , J. & Kellis , M. , 2015 . Large-scale imputation of epigenomic datasets for systematic annotation of diverse human tissues . Nature biotechnology , 33 ( 4 ), pp. 364 – 376 . OpenUrl CrossRef PubMed ↵ Feinberg , A.P. , 2007 . Phenotypic plasticity and the epigenetics of human disease . Nature , 447 ( 7143 ), pp. 433 – 440 . OpenUrl CrossRef PubMed Web of Science ↵ Friedman , J.H. , 2001 . Greedy function approximation: A gradient boosting machine . Annals of statistics , 29 ( 5 ), pp. 1189 – 1232 . OpenUrl CrossRef Web of Science ↵ Frommer , M. et al. , 1992 . A genomic sequencing protocol that yields a positive display of 5-methylcytosine residues in individual DNA strands . Proceedings of the National Academy of Sciences of the United States of America , 89 ( 5 ), pp. 1827 – 1831 . OpenUrl Abstract / FREE Full Text ↵ Geurts , P. , Ernst , D. & Wehenkel , L. , 2006 . Extremely randomized trees . Machine learning , 63 ( 1 ), pp. 3 – 42 . OpenUrl CrossRef Grealey , J. et al. , 2022 . The carbon footprint of bioinformatics . Molecular biology and evolution , 39 ( 3 ), p. msac034 . OpenUrl CrossRef PubMed ↵ Heidi Chial , 2008 . DNA Sequencing Technologies Key to the Human Genome Project . Nature Education , 1 ( 1 ), p. 219 . OpenUrl ↵ Hopfield , J.J. , 1988 . Artificial neural networks . IEEE circuits and devices magazine , 4 ( 5 ), pp. 3 – 10 . OpenUrl ↵ Jin , Z. & Liu , Y. , 2018 . DNA methylation in human diseases . Genes & Diseases , 5 ( 1 ), pp. 1 – 8 . OpenUrl PubMed ↵ Kapourani , C.-A. & Sanguinetti , G. , 2019 . Melissa: Bayesian clustering and imputation of single-cell methylomes . Genome biology , 20 ( 1 ), p. 61 . OpenUrl PubMed Kris A. Wetterstran, Sequencing Costs: Data from the NHGRI Genome Sequencing Program . Genome.gov . Available at: https://www.genome.gov/sequencingcostsdata [Accessed February 3, 2021 ]. ↵ LeDell , E. & Poirier , S. , 2020 . H2O AutoML: Scalable Automatic Machine Learning . Available at: https://www.semanticscholar.org/paper/H2O-AutoML%3A-Scalable-Automatic-Machine-Learning-LeDell-Poirier/22cba8f244258e0bba7ff4bb70c4e5b5ac3e2382 [Accessed October 18, 2024 ]. ↵ Li , M. et al. , 2019 . EWAS Atlas: a curated knowledgebase of epigenome-wide association studies . Nucleic acids research , 47 ( D1 ), pp. D983 – D988 . OpenUrl CrossRef PubMed ↵ Li , Y. et al. , 2010 . The DNA methylome of human peripheral blood mononuclear cells . PLoS biology , 8 ( 11 ), p. e1000533 . OpenUrl CrossRef PubMed ↵ Loyfer , N. et al. , 2023 . A DNA methylation atlas of normal human cell types . Nature , 613 ( 7943 ), pp. 355 – 364 . OpenUrl CrossRef PubMed ↵ Marchini , J. & Howie , B. , 2010 . Genotype imputation for genome-wide association studies . Nature reviews. Genetics , 11 ( 7 ), pp. 499 – 511 . OpenUrl CrossRef PubMed Web of Science ↵ Meissner , A. et al. , 2005 . Reduced representation bisulfite sequencing for comparative high-resolution DNA methylation analysis . Nucleic acids research , 33 ( 18 ), pp. 5868 – 5877 . OpenUrl CrossRef PubMed Web of Science ↵ Moghul , M.I. , 2021 . Imputation Aided Methylation Analysis. Doctoral. UCL (University College London) . Available at: https://discovery.ucl.ac.uk/id/eprint/10132964/ [Accessed July 27, 2023 ]. ↵ Nelder , J.A. & Wedderburn , R.W.M. , 1972 . Generalized Linear Models . Journal of the Royal Statistical Society. Series A (General) , 135 ( 3 ), p. 370 . OpenUrl CrossRef Web of Science ↵ Simpson , J.T. et al. , 2017 . Detecting DNA cytosine methylation using nanopore sequencing . Nature methods , 14 ( 4 ), pp. 407 – 410 . OpenUrl PubMed Tanić , M. et al. , 2022 . Comparison and imputation-aided integration of five commercial platforms for targeted DNA methylome analysis . Nature biotechnology , 40 ( 10 ), pp. 1478 – 1487 . OpenUrl PubMed ↵ Taudt , A. et al. , 2018 . METHimpute: imputation-guided construction of complete methylomes from WGBS data . BMC genomics , 19 ( 1 ), p. 444 . OpenUrl CrossRef PubMed ↵ Teschendorff , A.E. & Relton , C.L. , 2018 . Statistical and integrative system-level analysis of DNA methylation data . Nature reviews. Genetics , 19 ( 3 ), pp. 129 – 147 . OpenUrl CrossRef PubMed Vink , R. et al. , 2024 . pola-rs/polars: Python Polars 0.20.7, Zenodo . Available at: https://zenodo.org/records/10616464 [Accessed October 18, 2024 ]. ↵ Warnecke , P.M. et al. , 2002 . Identification and resolution of artifacts in bisulfite sequencing . Methods , 27 ( 2 ), pp. 101 – 107 . OpenUrl CrossRef PubMed Web of Science ↵ Wu , X. & Zhang , Y. , 2017 . TET-mediated active DNA demethylation: mechanism, function and beyond . Nature reviews. Genetics , 18 ( 9 ), pp. 517 – 534 . OpenUrl CrossRef PubMed ↵ Zou , L.S. et al. , 2018 . BoostMe accurately predicts DNA methylation values in whole-genome bisulfite sequencing of multiple human tissues . BMC genomics , 19 ( 1 ), p. 390 . OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted March 18, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following GIMMEcpg: Global Imputation of Mean CpG MEthylation in Real-time Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share GIMMEcpg: Global Imputation of Mean CpG MEthylation in Real-time Ismail Moghul , Niuzheng Chai , Nikolas Pontikos , Alison Hardcastle , Javier Herrero , Stephan Beck bioRxiv 2025.03.17.643746; doi: https://doi.org/10.1101/2025.03.17.643746 Share This Article: Copy Citation Tools GIMMEcpg: Global Imputation of Mean CpG MEthylation in Real-time Ismail Moghul , Niuzheng Chai , Nikolas Pontikos , Alison Hardcastle , Javier Herrero , Stephan Beck bioRxiv 2025.03.17.643746; doi: https://doi.org/10.1101/2025.03.17.643746 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41937) Biophysics (21452) Cancer Biology (18589) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15156) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00