Non-invasive multi-cancer detection using DNA hypomethylation of LINE-1 retrotransposons

doi:10.1101/2024.01.20.23288905

Non-invasive multi-cancer detection using DNA hypomethylation of LINE-1 retrotransposons

2024 · doi:10.1101/2024.01.20.23288905

preprint OA: gold CC-BY-NC-ND-4.0

📄 Open PDF Full text JSON View at publisher

Full text 125,106 characters · extracted from preprint-html · click to expand

Non-invasive multi-cancer detection using DNA hypomethylation of LINE-1 retrotransposons | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Non-invasive multi-cancer detection using DNA hypomethylation of LINE-1 retrotransposons Marc Michel , Maryam Heidary , Anissa Mechri , Kévin Da Silva , Marine Gorse , Victoria Dixon , Klaus von Grafenstein , Charline Bianchi , Caroline Hego , Aurore Rampanou , Constance Lamy , Maud Kamal , Christophe Le Tourneau , Mathieu Séné , Ivan Bièche , Cecile Reyes , David Gentien , Marc-Henri Stern , Olivier Lantz , Luc Cabel , Jean-Yves Pierga , François-Clément Bidard , View ORCID Profile Chloé-Agathe Azencott , View ORCID Profile Charlotte Proudhon doi: https://doi.org/10.1101/2024.01.20.23288905 Marc Michel 1 Inserm U934, CNRS UMR3215, Institut Curie, PSL Research University ; Paris, France 2 CBIO-Center for computational biology, Mines Paris, PSL Research University ; Paris, France 3 INSERM U900, Institut Curie, PSL Research University ; Paris, France 4 Circulating Tumor Biomarkers laboratory, INSERM CIC BT-1428, Institut Curie ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Maryam Heidary 4 Circulating Tumor Biomarkers laboratory, INSERM CIC BT-1428, Institut Curie ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Anissa Mechri 1 Inserm U934, CNRS UMR3215, Institut Curie, PSL Research University ; Paris, France 5 Univ Rennes, Inserm, EHESP, Irset (Institut de recherche en santé, environnement et travail) - UMR_S 1085 ; Rennes, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kévin Da Silva 5 Univ Rennes, Inserm, EHESP, Irset (Institut de recherche en santé, environnement et travail) - UMR_S 1085 ; Rennes, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Marine Gorse 5 Univ Rennes, Inserm, EHESP, Irset (Institut de recherche en santé, environnement et travail) - UMR_S 1085 ; Rennes, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Victoria Dixon 5 Univ Rennes, Inserm, EHESP, Irset (Institut de recherche en santé, environnement et travail) - UMR_S 1085 ; Rennes, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Klaus von Grafenstein 5 Univ Rennes, Inserm, EHESP, Irset (Institut de recherche en santé, environnement et travail) - UMR_S 1085 ; Rennes, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Charline Bianchi 5 Univ Rennes, Inserm, EHESP, Irset (Institut de recherche en santé, environnement et travail) - UMR_S 1085 ; Rennes, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Caroline Hego 4 Circulating Tumor Biomarkers laboratory, INSERM CIC BT-1428, Institut Curie ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aurore Rampanou 4 Circulating Tumor Biomarkers laboratory, INSERM CIC BT-1428, Institut Curie ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Constance Lamy 6 Department of Drug Development and Innovation (D3i) , Institut Curie; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Maud Kamal 6 Department of Drug Development and Innovation (D3i) , Institut Curie; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Christophe Le Tourneau 6 Department of Drug Development and Innovation (D3i) , Institut Curie; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mathieu Séné 7 Pharmacogenomics Unit, Genetics Department, Institut Curie ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ivan Bièche 7 Pharmacogenomics Unit, Genetics Department, Institut Curie ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Cecile Reyes 8 Genomics Platform, Translational Research Department, Research Center, Institut Curie, PSL Research University ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site David Gentien 8 Genomics Platform, Translational Research Department, Research Center, Institut Curie, PSL Research University ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Marc-Henri Stern 9 Inserm U830, Institut Curie, PSL Research University ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Olivier Lantz 10 Inserm U932, Institut Curie, PSL Research University ; Paris, France 11 Laboratory of clinical immunology, INSERM CIC BT-1428 , Institut Curie; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Luc Cabel 12 Department of Medical Oncology, Institut Curie ; Paris and Saint Cloud, France 13 CNRS UMR144, Institut Curie, PSL Research University ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jean-Yves Pierga 4 Circulating Tumor Biomarkers laboratory, INSERM CIC BT-1428, Institut Curie ; Paris, France 12 Department of Medical Oncology, Institut Curie ; Paris and Saint Cloud, France 14 Université Paris Cité ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site François-Clément Bidard 4 Circulating Tumor Biomarkers laboratory, INSERM CIC BT-1428, Institut Curie ; Paris, France 12 Department of Medical Oncology, Institut Curie ; Paris and Saint Cloud, France 15 UVSQ, Université Paris-Saclay ; Saint Cloud, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Chloé-Agathe Azencott 2 CBIO-Center for computational biology, Mines Paris, PSL Research University ; Paris, France 3 INSERM U900, Institut Curie, PSL Research University ; Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Chloé-Agathe Azencott Charlotte Proudhon 1 Inserm U934, CNRS UMR3215, Institut Curie, PSL Research University ; Paris, France 4 Circulating Tumor Biomarkers laboratory, INSERM CIC BT-1428, Institut Curie ; Paris, France 5 Univ Rennes, Inserm, EHESP, Irset (Institut de recherche en santé, environnement et travail) - UMR_S 1085 ; Rennes, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Charlotte Proudhon For correspondence: charlotte.proudhon{at}inserm.fr Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Purpose The detection of circulating tumor DNA, which allows non-invasive tumor molecular profiling and disease follow-up, promises optimal and individualized management of patients with cancer. However, detecting small fractions of tumor DNA released when the tumor burden is reduced remains a challenge. Experimental Design We implemented a new highly sensitive strategy to detect base-pair resolution methylation patterns from plasma DNA and assessed the potential of hypomethylation of LINE-1 retrotransposons as a non-invasive multi-cancer detection biomarker. The DIAMOND (Detection of Long Interspersed Nuclear Element Altered Methylation ON plasma DNA) method targets 30-40,000 young L1 scattered throughout the genome, covering about 100,000 CpG sites and is based on a reference-free analysis pipeline. Results Resulting machine learning-based classifiers showed powerful correct classification rates discriminating healthy and tumor plasmas from 6 types of cancers (colorectal, breast, lung, ovarian, gastric cancers and uveal melanoma including localized stages) in two independent cohorts (AUC = 88% to 100%, N = 747). DIAMOND can also be used to perform copy number alterations (CNA) analysis which improves cancer detection. Conclusions This should lead to the development of more efficient non-invasive diagnostic tests adapted to all cancer patients, based on the universality of these factors. Statement of significance The DIAMOND assay is a new highly sensitive strategy to detect base-pair resolution methylation patterns of LINE-1 retrotransposons (L1) from plasma DNA. It targets 30-40,000 young L1 scattered throughout the genome, covering about 100,000 CpG sites and is based on a reference-free analysis pipeline. This provided high coverage data using affordable sequencing depth, which is instrumental to achieve high sensitivity and work with minute amounts of cell-free DNA. Resulting machine learning-based classifiers showed powerful discrimination between healthy and tumor plasmas from 6 types of cancers (colorectal, breast, lung, ovarian, gastric cancers and uveal melanoma including localized stages) in two independent cohorts (AUC = 88% to 100%, N = 747). DIAMOND data can also be used to perform copy number alterations (CNA) analysis which improves cancer detection. Introduction Extensive research has shown that tumor genetic alterations can be detected from plasma DNA of patients with cancer 1 – 3 . This paved the way for the use of molecular analyses performed from liquid biopsies to genotype tumors non-invasively 4 , 5 and demonstrated the potential of circulating tumor DNA (ctDNA) as a marker of cancer progression 6 , 7 . It is also a powerful prognostic factor 8 enabling detection of tumor masses not perceptible clinically, after surgery or during treatment. These approaches promise optimal management of cancer patients and are currently playing an important role in oncology 9 , 10 . However, several technological obstacles still limit their widespread application. Samples collected at early stages of tumor progression, or during and after treatment, may contain less than one mutant copy per milliliter of plasma 1 , 11 . This is below the detection limit of most used technologies, even when testing multiple genetic alterations simultaneously. Moreover, most methods are biased towards preselected recurrent mutations, which do not cover all tumors. We observed in our previous studies 12 – 15 that approximately 25% of patients affected with breast cancer do not display common mutations trackable in plasma DNA, even at advanced stages. Therefore, it is necessary to develop more sensitive and more informative detection tools. Multiple studies have demonstrated the central role of epigenetic processes in the onset, progression, and treatment of cancer. Epigenetic alterations (i.e., changes in the pattern of chromatin modifications such as DNA methylation and histone modifications) are promising candidates for cancer detection, diagnosis and prognosis 16 , 17 . These extended markers provide an additional level of information, overlooked by methods that only question genetic alterations 18 . Aberrant DNA methylation is a hallmark of neoplastic cells 16 , which combine hypermethylation of a wide range of tumor suppressor genes along with a global hypomethylation of the genome 19 . DNA methylation is a stable modification, which affects a large number of CpG sites per region and per genome and will be key to achieve increased detection sensitivity 20 . Moreover, the concordance of the methylation status between multiple CpGs of the same region can help detect low frequency anomalies among a heterogeneous population of molecules 21 , 22 . Finally, combining several genomic regions allows to capture a wide range of tumor alleles and cover the heterogeneous profiles of cancer patients 23 . Previous studies have shown that cellular DNA methylation patterns are conserved in cell-free DNA (cfDNA) and that detection of cancer-specific profiles at the genome-wide scale is feasible 24 – 27 . Until now, most studies investigating plasma DNA methylation patterns have targeted a limited number of regions at high depth, using PCR-based methods 28 – 30 , or explored genome-wide at low depth with high-throughput sequencing 24 – 26 , 31 . Both approaches have limited sensitivity, as focusing on a few regions does not cover cancer-type and patient variability and low depth cannot detect small fractions of ctDNA. More recent studies, relying on the capture of regions of interest coupled with deep sequencing have investigated the performance of larger numbers of regions at high depth 21 , 32 – 40 . These methods enabled sensitive detection and classification of cancer from plasma DNA. However, since they largely focus on cancer hypermethylation and unique sequences, it involves targeting specific regions for each cancer type. As a result, developing a cost-effective universal pan-cancer test remains a challenge. Remarkably, cancer-related hypomethylation has been reported in almost all classes of repeated sequences 41 , from dispersed retrotransposons to clustered satellite repeated DNA, and within multiple forms of cancers 42 . In particular, this leads to the reactivation of retrotransposons, resulting in the acquisition of genomic instability, chromosomal rearrangements and the production of chimeric transcripts between the transposable element and its adjacent locus. Hypomethylation of the internal promoter of Long-Interspersed Element-1 (L1) has been described as a hallmark of many human cancers 42 , 43 , which can result in the reactivation of intact L1 elements 44 and the abnormal production of their transcripts and proteins. Transposition of these competent elements induces DNA double-strand breaks and damages the genome. A recent study identified 4 types of cancer (esophagus, head and neck, lung and colorectal) with a large amount of damage linked to retrotranspositions involving mostly L1s 45 . Another study identified the transposition event responsible for initiating colorectal cancer by mutating the APC gene 46 . To obtain a global representation of the hypomethylation occurring during carcinogenesis and to increase sensitivity, we chose to target primate-specific copies of L1 retrotransposons (L1PA). These elements have tens of thousands of copies per cell and are hypomethylated in multiple cancers 42 . Two studies have explored L1 global methylation profiles from plasma 47 , 48 of lung and colorectal cancers, using qPCR-based methods, but reported a low detection sensitivity, below 70%. Indeed, repeats being inherently difficult to map, detecting their methylation profiles at the single base-pair resolution requires sophisticated downstream analysis. To overcome this, we have developed a method to detect methylation patterns of primate specific L1 elements (L1PA) from cfDNA, which we named DIAMOND (for D etection of Long I nterspersed Nuclear Element A ltered M ethylation ON plasma D NA). We implemented computational tools to accurately align sequencing data without a reference genome and applied prediction models, trained by machine learning algorithms, integrating patterns of methylation, overall and at the single molecule level. The aim of this study was to assess the potential of circulating DNA methylation changes at L1s as a universal tumor biomarker, and to develop new highly sensitive strategies to detect cancer-specific signatures in blood. Results Targeting primate-specific LINE-1 elements reveals plasma DNA-methylation patterns genome-wide We developed a PCR-based targeted bisulfite method coupled to deep sequencing to detect methylation patterns of L1PA elements. We used sodium bisulfite chemical conversion to achieve base-pair resolution analysis and designed a multiplexed PCR based on 8 amplicons covering L1PAs ( Fig. 1A , Table S1 , Fig. S1A ). We detected thousands of L1PA elements scattered throughout the genome as shown by the genomic hits obtained from a healthy plasma, an ovarian tumor, and a uveal melanoma tumor sequenced at high depth ( Fig. 1B , Table S2 ). We observed similar profiles for the three samples, as well as for healthy and cancer plasmas with standard coverage ( Fig. S1B-E ). This demonstrated the robustness of the approach. Overall, the estimated number of L1PA targets is about 30-40,000 elements per genome including half of the human specific copies (L1HS) and many copies of the other L1PA subfamilies ( Fig. 1C , Table S2 ). This represents an estimate of 87-120,000 CpG sites. View this table: View inline View popup Download powerpoint Table S1. Targeted bisulfite sequencing primers View this table: View inline View popup Download powerpoint Table S2. Target copies and CpG sites relative to the number of sequencing reads Download figure Open in new tab Fig 1. Targeting primate-specific LINE-1 elements reveals plasma DNA-methylation patterns genome-wide A. CpG density along the structure of a human specific LINE-1 (L1HS) element, which contains 95 CpG. The DIAMOND assay targets 30 CpG. Each target amplicon is highlighted by a black bar below the structure. The number of CpG sites detected per amplicon is displayed in blue. B. L1PA copy number hit by uniquely and/or randomly mapped reads, obtained from a healthy plasma versus ovarian (OVC, top track) or uveal melanoma (UVM, middle track) tumor tissue samples ‘deep sequenced’ (54M, 44M or 46M reads respectively) over the distribution of L1PA elements annotated in the genome (RepeatMasker on hg38, grey bottom track). C. Histogram summarizing the most represented sub-families of L1 targeted by the DIAMOND assay in the 3 ‘deep sequenced’ samples, in descending order (sum of copies across the 3 samples). The colors highlight the relative contribution of L1PA copies hit by reads uniquely mapped, randomly mapped or both. D. Methylation pattern observed across the 8 regions targeted along the L1 element in the healthy plasma sample ‘deep-sequenced’. Metaplot showing the average methylation levels at each CpG position. Amplicon limits are delineated with grey dotted lines. The dark line marks the end of the 5’UTR. Average levels per amplicon are indicated. Following deep sequencing, reads are traditionally mapped back to the genome. However, the majority of sequencing reads from repetitive sequences are assigned randomly during mapping steps and are subsequently lost for classical differentially methylated region (DMR) calling 49 . We, thus, developed a new computational pipeline to accurately align repetitive sequencing data without using a reference genome ( Fig. S1F ). To perform this, we clustered all good quality reads based on their similarity, extracted representative sequences from the largest clusters and used them for multiple sequence alignment. We then aligned all the reads back onto this custom database. Using such reference-free method, we preserved the majority of our data and could extract the informative CpG sites agnostically. We selected sites with a CG/TG content ≥ 20% including at least 5% of CG to ensure that the position of interest carries some DNA methylation marks. This selection was done on healthy samples to avoid biases related to cancer hypomethylation. We retrieved 35 CpG positions covered by our panel including two additional CpGs with respect to the L1HS consensus annotations, located within amplicon 2 ( Fig. S2A-B ). As expected, the 5’ end of the L1 copies targeted is heavily methylated 42 , 50 , particularly within the 2 nd amplicon. We also observed quite high levels in both the 5 th amplicon (69% in average, Fig. 1D ), which covers part of the ORFI, and the last two CpGs of amplicon 8, which is located immediately upstream of the 3’UTR. Amplicon 3, which has the lowest methylation levels within the 5’ end, displayed sequencing data with atypical distributions and showed less robust performances (not shown). Hence, we further eliminated it from the rest of the study, resulting in a total of 30 CpG positions analyzed. Overall, this reference-free method retrieved methylated sites contained by the youngest LINE-1 elements present in the human genome allowing us to study their DNA-methylation levels and motifs from minute amount of DNA such as plasma cfDNA. L1PA hypomethylation is detectable from plasma DNA in multiple forms of cancer We first tested the DIAMOND approach on methylation controls, cancer cell lines and tissue samples. The overall methylation levels demonstrated an extensive L1PA hypomethylation specifically in cancer samples, including colorectal (CRC), ovarian (OVC), breast (BRC) and uveal melanoma (UVM) cancer cell lines as well as OVC, BRC and UVM tumors compared to healthy white blood cells and healthy tissues collected adjacent to ovarian tumors ( Fig. 2A , Table S3 ). Next, we tested a cohort of 473 plasma samples including 123 healthy plasma controls and plasma samples from patients with 6 different types of cancer, covering metastatic (M+) and localized (M0) stages ( Table S4 . This includes colorectal and ovarian cancers in which a substantial rate of L1 hypomethylation has previously been reported 51 , 52 . We detected a statistically significant L1PA hypomethylation in cfDNA of metastatic colorectal cancer (CRC M+), breast cancer (BRC M+) and uveal melanoma (UVM M+) samples as well as in locally advanced ovarian cancers (OVC M0, stages III) and localized gastric cancers (GAC M0) ( Fig. 2B , Table S3 ). The global methylation was not significantly different in metastatic non-small cell lung cancers (LC M+) nor in localized stages of breast cancer (BRC M0). However, focusing strictly on global methylation levels provides only part of the information. View this table: View inline View popup Download powerpoint Table S3. Statistical results of global methylation differences between healthy samples and cancer subgroups using Mann–Whitney U test View this table: View inline View popup Table S4. List of plasma samples. Sample IDs were not known to anyone outside the research group Download figure Open in new tab Fig 2. L1PA hypomethylation is detectable from plasma DNA in multiple forms of cancer A. Global DNA methylation of fully methylated (healthy white blood cells (WBC) DNA treated with SssI) and unmethylated (whole-genome amplified healthy WBC DNA) controls, cancer cell lines or tissues. Ovarian healthy tissues were collected next to ovarian tumors. The global methylation levels for each sample correspond to the percentage of CG dinucleotides at each CpG site averaged by the number of CpG sites. Statistical differences between controls, cell lines or tissues and healthy white blood cells (WBC) were computed using Mann–Whitney U test (p Fully_meth. = 9.97e-07, p Unmeth. = 1.86e-06, p CRC_Cells = 0.266, p OVC_Cells = 1.20e-02, p BRC_Cells = 2.47e-06, p UVM_Cells = 6.77e-05, p Healthy_OVC_Tissues = 0.063, p OVC_Tissues = 8e-03, p BRC_Tissues = 4.10e-04, p UVM_Tissues = 0.88, Table S3 ). B. Global DNA methylation in cancer plasma including metastatic stages (M+) and non-metastatic stages (M0) as well as healthy donor plasmas. Statistical differences between each cancer subgroup and healthy samples were computed using Mann–Whitney U test (p CRC_M+ = 1.27e-29, p BRC_M+ = 3.79e-19, p UVM_M+ = 8.29e-06, p LC_M+ = 0.655, p OVC_M0 = 1.94e-05, p GAC_M0 = 4.28e-08, p BRC_M0 = 9.10e-01, Table S3 ). Black dotted lines represent the median. C. Methylation level at each targeted CpG sites (x-axis), for each healthy sample (y-axis) depicted as a heatmap. CpG numbers are indicated. The metaplot represents the average methylation levels of the population. Amplicon numbers are indicated. D. Differential methylation levels between healthy samples and patients for each type of cancer represented as metaplots. E. Proportion of methylation motifs, called haplotypes, for each amplicon (mean centered per amplicon). Only the most important features are represented (see Fig. 3F and methods). Blue arrows highlight the most abundant haplotype in each amplicon. F. Mean centered abundance of the most important haplotypes with the highest co-methylation patterns (mostly fully methylated or fully unmethylated molecules) in cancer subgroups compared to healthy donors. Statistical significances were computed using Mann–Whitney U test on raw haplotype proportions ( Table S5 ). View this table: View inline View popup Table S5. Statistical results of proportion differences between healthy samples and cancer subgroups for the 372 haplotypes using Mann–Whitney U test We further computed the levels of methylation at each CpG target ( n =30) for these plasma samples and observed specific patterns of methylation along the L1 structure, which are robustly conserved among the 123 healthy donors ( Fig. 2C ). When considering all cancer samples together, we observed a clear difference with the methylation of healthy samples. We detected a steady hypomethylation through all CpG targets except for the two sites within amplicon 8 ( Fig. 2D ). This is also true for metastatic colorectal cancers (CRC M+), breast cancers (BRC M+) and uveal melanoma (UVM M+). Clear hypomethylation is also observable for localized gastric (GAC M0) and ovarian (OVC M0) cancers, in particular at amplicon #1, #4 and #6, while the differences are less striking for localized breast cancers (BRC M0) and metastatic non-small cell lung cancers (LC M+). The distinction between most cancers and healthy samples were dependent on multiple CpG positions belonging to different amplicons along L1s, as shown by PCA analysis ( Fig. S2C ). The least discriminating positions were located within amplicon 8, which is consistent with the metaplots shown in Fig. 2D . Next, we analyzed the motifs of methylation at the molecule level, which provide a more detailed signal. These haplotypes correspond to true patterns of methylation of adjacent CpGs, detected for each amplified DNA molecule. This was achieved by the incorporation of unique molecular identifiers (UMIs) into the library ( Fig. S1A ). Based on the combination of the 30 CpG targets divided into their 7 amplicons, we extracted a total of 372 unique features ( Fig. S2D ). We observed highly robust representation profiles of haplotypes among the 123 healthy samples ( Fig. 2E ). For most amplicons, the fully methylated molecules were the most represented, as expected for healthy controls. However, we observed a high proportion of totally unmethylated haplotypes in amplicon #6 and #7. This can be explained by the fact that older L1 copies are often truncated in 5’ and less regulated by DNA methylation, leading to the capture of molecules with lower DNA methylation in 3’. Nevertheless, several intermediate patterns were also among the most important features and were found to be differentially represented in healthy and cancer samples ( Table S5 ). Fully methylated haplotypes were significantly under-represented in most cancer subgroups and in most amplicons ( Fig. 2F ). On the contrary, fully unmethylated haplotypes were over-represented in most cancer subgroups and in most amplicons. This is also well illustrated by the PCA analysis shown in Fig. S2E , underlining the contribution of highly methylated haplotypes towards the healthy group versus the lowly methylated haplotypes separating cancer samples (middle panel). This separation involves haplotypes from all amplicons (right panel). Next, we compared the methylation profiles of tumor and plasma paired samples (OVC = 10, Fig. S3A-F ; BRC = 16, Fig. S3G-K ) by calculating the correlation between their methylation differences relative to the mean methylation of healthy donor plasmas. We observed a better correlation with methylation haplotype portions than with single CG methylation features ( Fig. S3B-C and Fig. S3H-I , Table S6 ). These results demonstrate that L1 hypomethylation can robustly be observed from cancer plasma DNA at the level of single CpG sites and more importantly at the level of methylation haplotypes. View this table: View inline View popup Download powerpoint Table S6. List of paired plasma and tissues samples and their spearman correlation for various features L1PA hypomethylation-based classifiers recognize samples from multiple forms of cancer We then trained classification models using random forests, with the 30 features corresponding to the levels of methylation at each CpG target or the 372 features corresponding to the proportions of haplotypes, or both, and assessed their performances to automatically separate healthy from tumor plasmas. By testing all cancer samples without cancer-type specification, the methylation of L1PA elements showed an extremely good ability to discriminate between healthy and tumor plasmas, with an overall area under the curve (AUC) of 94-95% for the 3 types of features ( Fig. 3A, 3C , S4A-B ). Next, we trained distinct models to estimate the performances for each cancer type and/or dissemination stage (M0 vs M+). These models were extremely performant in metastatic colorectal and breast cancers but also stage III ovarian cancers and localized gastric cancers, with nearly perfect classifications and AUCs between 98-100% ( Fig. 3B-C , S4A-B ). Additionally, we observed excellent performances for metastatic lung cancers and uveal melanoma and more importantly for localized stages of breast cancer (AUC BRC_M0 = 92-95%). These models provide very good sensitivities at 99% specificity ( Fig. 3D ), in particular for CRC M+, BRC M+, OVC M0, GAC M0 and BRC M0. The latter is one of the most difficult cancer to detect non invasively, as reported in previous liquid biopsy multi-cancer tests 11 , 39 , 40 . Download figure Open in new tab Fig 3. L1PA hypomethylation-based classifiers recognize samples from multiple forms of cancers A-B. Receiver Operating Characteristic (ROC) curves obtained for plasma samples classification using single-CpG methylation levels (n=30) or haplotype proportions (n=372) with the ‘all cancers’ model ( A ) or the ‘cancer-types’ models ( B ). All classifications include 5000 stratified random repetitions of learning on 60% of the samples and testing on the 40% left, with undersampling for classes equilibrium (results with and without undersampling are presented in Fig. S3A-B ). N CRC_M+ = 75, N BRC_M+ = 97, N LC_M+ = 50, N UVM_M+ = 55, N OVC_M+ = 4 (included only in ‘all cancers’ testing), N OVC_M0 = 18, N GAC_M0 = 27, N BRC_M0 = 23 tested versus 123 healthy donors. ROC curves shown are obtained by averaging the sensitivity and specificity of each repetition of learning. C-D. Performances for classifiers using single CpG methylation levels (grey), haplotype proportions (black) or both (white) presented as AUCs ( C ) or sensitivities at 99% specificity ( D ). Average AUCs are computed from the 5000 AUCs generated by each repetition of learning. Bars indicate 95% CI. E-F. Importance (mean decrease in impurity) of the features used by the classifiers depicted as clustered heatmaps. The features correspond to the CpG targets ( E ) or the haplotypes ( F ). Only the most important haplotypes (feature importance level >1%) are shown. G. ROC curves obtained for plasma samples classification with the 3-stage model, using haplotype features. H-I. Performances for the 3-stage classifiers using single CpG methylation levels (grey) or haplotype proportions (black) presented as AUCs ( H ) or sensitivities at 99% specificity ( I ). Early stages (I/II, N=31), locally advanced stages (III, N=30) and metastatic stages (IV, N=281) J. Cancer detection rates with the methylation-based DIAMOND assay (haplotypes and CG methylation) vs common recurrent mutations for samples assessed in previous studies ( 13 , 14 , 55 , 56 ) or with NGS ( Table S6 ). Overall, we observed similar results using single-CpG methylation levels, haplotype proportions or both features. This can be explained by the high correlation observed between the 2 types of features ( Fig. S4C ). We compared these performances for each subgroup with the classification rates extracted from the model ‘all’, testing all cancer samples together. We observed similar AUCs for most cancers, but overall, they were better classified with their ‘expert’ cancer specific models ( Fig. S4D-E ). This shows that certain specificities of cancer type may confuse the current ‘all’ model and affect the sensitivities at 99% specificities. Subsequently, we evaluated the importance of the features used by our classifiers ( Fig. 3E-F ). CpG positions displayed different patterns in the various cancer subgroups that can be informative for distinct cancer types or stages ( Fig. 3E ) . Nonetheless, we identified features which are common to many types of cancer such as most CpGs of amplicon 1 and the first CpG of amplicon 6. Other features seemed to be characteristic of specific subgroups, such as CG7-14 which are the most important features for sorting localized stages of breast cancer (BRC M0) or CG15-18, in particular CG17, which are part of the top features for metastatic breast cancers (BRC M+). Besides their dissemination status, the different important features detected in the BRC M0 and M+ subgroups may result from the fact that they were composed of different breast cancer subtypes, with 100% of hormone dependent (HR+ HER2-) cancers in the BRC M+ subgroup and a mixture of subtypes, including also triple negative (TNBC) and HER2+ cancers, in the BRC M0 subgroup ( Table S7 ) . Indeed, we observed that slightly different hypomethylation patterns in HR+ HER2-BRC versus TNBC ( Fig. S4F-H ). We observed that haplotype features showed consistent patterns with important CG features ( Fig. 3F ). However, haplotypes provide a more detailed view of the methylation patterns with a strong importance of the most methylated or non-methylated molecules. We still observed that some methylation intermediates are important for cancer detection (ex: in amplicon #1 in CRC M+ and GAC M0, #2 in BRC M0, #4 in BRC M+ and other subgroups, #5 in LC M+ and UVM M+, #7 in OVC, #8 in BRC M0 and LC M+). BRC M+ and M0 subgroups cluster together, showing that even if they are not highly similar, they are closer to each other than to other cancer types. These breast cancer specificities are worth exploring further in the future. Overall, this suggests that L1PA methylation alterations detected from cfDNA vary in different types and stages of cancer. View this table: View inline View popup Download powerpoint Table S7. Breast cancer subtype composition of M0 and M+ subgroups in C1 and C2 To estimate the ability of DIAMOND to detect cancer at early stages of the disease, we build classifiers for 3 stage classes gathering all cancer types: early stages (I/II, N=31), locally advanced stages (III, N=30) and metastatic stages (IV, N=281). Classifications were highly performant for all 3 stage categories (AUC Early = 95%, AUC Adv. = 97%, AUC Meta = 95%; Fig. 3G-H , S4I, S4D ) with a mean sensitivity of 70% for early stages, (Sen Early = 70%, Sen Adv. = 90%, Sen Meta = 69%; Fig. 3I , S4E ). Next, we analyzed whole genome bisulfite sequencing (WGBS) from healthy and cancer plasma from two recently published studies 53 , 54 to evaluate if we can retrieve L1PA hypomethylation signal with non-targeted methods ( Fig. S5A , see methods). Consistent with our findings, we observed statistically significant hypomethylation in cancer samples ( Fig. S5B-H ). These whole genome approaches were nevertheless not as efficient as the DIAMOND targeted approach to automatically detect cancer from L1PA hypomethylation as they cover less well the regions of interest ( Fig. S5A , S5F, S5I ). Indeed, they involve mapping on a reference genome leading to loss of data. Moreover, L1PA methylation measured with DIAMOND largely outperforms methods based on the detection of mutations. In comparison, the identification of the same tumor samples via the detection of frequent recurrent mutations, which is commonly used in the clinic, does not exceed 57% for ovarian cancer ( Table S8 ), 38% for colon cancer 55 and 52% for metastatic breast cancer 13 , 14 ( Fig. 3J ). We particularly achieved remarkable performance on the cohort of 27 localized gastric cancers with a detection rate of 95% of true positive as compared to 12% for mutation screening 56 . This is mostly due to the fact that methylation changes occur in virtually all cancer patients, unlike recurrent mutations. View this table: View inline View popup Download powerpoint Table S8. Mutation screening of ovarian cancer samples from the discovery cohort Multi-cancer classification performances are reproducible on an independent cohort To validate the DIAMOND approach, we tested a second independent cohort consisting of 214 patients affected with the same types of cancers as in the first cohort, excluding uveal melanoma and non-metastatic gastric cancers, along with 60 healthy donors ( Fig. 4A ). First, we confirmed that the methylation patterns along the L1 structure were highly reproducible between healthy donors from cohorts 1 and 2, at the level of single-CpG targets ( Fig. 4B ) but also for haplotype proportions ( Fig. 4C , Table S9 ). While methylation at single-CpG within cancer subgroups showed slightly more variability ( Fig. S6A ), global methylation levels were quite reproducible between the two cohorts, showing similar distributions and no statistical differences ( Fig. 4D , Table S10 ), except for non-metastatic ovarian cancers. There was an important heterogeneity among the OVC M0 samples of cohort 2, which clustered into two distinct groups, while cohort 1 was more homogeneous ( Fig. S6B ). Notably, no correlation was found with available clinico-histopathological parameters (age, staging, CA125 level, mutational status, treatment or response to therapy). Differential haplotype proportions between healthy and cancer subgroups were also mostly conserved ( Fig. S6C , Table S11 ). Overall, the method showed good reliability with the 7-amplicon panel used and good robustness in detecting L1 methylation levels and changes. View this table: View inline View popup Download powerpoint Table S9. Statistical results of proportion differences between healthy samples from C1 and C2 for 14 haplotypes selected using Mann–Whitney U test View this table: View inline View popup Download powerpoint Table S10. Statistical results of global methylation differences between healthy samples and cancer subgroups from C1 and C2 using Mann–Whitney U test View this table: View inline View popup Download powerpoint Table S11. Statistical results of proportion differences between healthy samples and cancer subgroups from C1 and C2 for 14 haplotypes selected using Mann–Whitney U test Download figure Open in new tab Fig 4. Multi-cancer classification performances are reproducible on an independent cohort A. Number of patients and healthy donors (HD) in the discovery cohort (C1) and in the validation cohort (C2) for each cancer type and dissemination stage (non-metastatic: M0 vs metastatic: M+, NA: stage not available). Generated using Servier Medical Art. B. Methylation level at each targeted CpG sites (x-axis), for each healthy sample (y-axis) from C1 vs C2, depicted as a heatmap. No clustering is done on the data, which comes ordered by targeted CpG site on the x-axis (amplicon numbers are indicated). The metaplots represent the average levels for donors of C1 versus C2 at each CpG site. C. Mean centered abundance of the most important haplotypes, with the highest co-methylation patterns, in healthy donors from C1 vs C2 (Statistical differences computed using Mann–Whitney U test are available in Table S9 ). D. Comparison of the global levels of methylation in C1 vs C2. Methylation levels are calculated as explained previously in Fig. 2 . The p-values are computed using Mann–Whitney U test (p CRC_M+ = 0.680, p OVC_M+ = 0.816, p BRC_M+ = 0.783, p LC_M+ = 0.596, p Healthy = 0.316, p OVC_M0 = 4.74e-05, p BRC_M0 = 0.132, Table S10 ). Black doted lines represent the median. E. Performances for validation classifiers using haplotype features presented as ROC curves, AUCs and sensitivities at 99% specificity obtained with the ‘all’ cancers model or the ‘expert’ models for cancer subgroups. All classifications include 5000 stratified random repetitions of learning on the whole discovery cohort and testing on the whole validation cohort without undersampling. ROC curves shown are obtained by averaging the sensitivity and specificity of each repetition of learning. Average AUCs are computed from the 5000 AUCs generated by each repetition of learning. Bars indicate 95% CI. F. Performances for 3-stage ‘expert’ classifiers: early stages (I/II, N C1 =31, N C2 =38), locally advanced stages (III, N C1 =30, N C2 =54) and metastatic stages (IV, N C1 =281, N C2 =113) presented as mean ROC curves, AUCs, or sensitivities at 99% specificity. G . Performances for integrated models (‘all’ or ‘stack’) when training for the specific group tested (seen) versus when not training for this subgroup (blind). These classifications have been performed on the whole sample set including C1 and C2. Since age-related changes in DNA methylation have been described 57 , 58 and that the healthy donors included in the study are younger overall than the cancer patients ( Fig. S7A ), we have investigated whether there was an effect on the methylation patterns we studied. We found a significant but very small effect which appeared much smaller that the effect of disease status ( Fig. S7B-C ). This small effect was tending towards an increase in methylation with age ( Fig. S7D ), and we observed similar patterns and differences between healthy and cancer samples when adjusting for the age ( Fig. S7E-G , Table S12 ). Furthermore, we observed similar performances in age-matched and non-age-matched cohorts extracted from C2 ( Fig. S7H-J ), demonstrating that age is not a confounding factor. View this table: View inline View popup Download powerpoint Table S12. Statistical results of global methylation differences between original data and age-adjusted data using Mann–Whitney U test To validate our classifiers, we trained models on the entire first cohort and evaluated them on the second set of independent samples. The results showed excellent classification performances with an overall AUC of 88% when testing all cancers together with no annotations of their histological types and AUC between 88%-100% for the cancer subgroup ‘expert’ models with sensitivities at 99% specificity between 49%-100% ( Fig. 4E ) including 55% for localized breast cancers. It was, however, lower for metastatic lung cancer with a mean sensitivity of 49%. We observed that haplotype models were more robust compared to single-CpG methylation rates ( Fig. S6D-G ). This could be explained by the fact that haplotypes capture true methylation patterns at the molecule level, enabling to discard noise caused by experimental variability for example. Next, we applied the same validation method, training on C1 and testing on C2, for the 3-stage ‘expert’ classifiers and observed great classification performances with a mean AUC of 99% and a mean sensitivity of 79% for early stages ( Fig. 4F ). When comparing the performances for each subgroup with the classification rates extracted from the model ‘all’, we observed similar AUCs for most cancers, but lower rates for the BRC M0 subgroup and the early-stage group ( Fig. S8A ), which is mostly composed of BRC M0 ( Fig. S8B ). To further investigate the generalization of our marker, we have also tested the ability of DIAMOND to detect cancers of a type or subgroup for which the model is not trained. To do so, we have combined the data from cohorts C1 and C2 and trained on the whole data set, systematically removing one cancer subgroup or one cancer type and testing specifically for this subgroup or type (see Methods, Fig. 4G , S8C ). We observed performances similar to when the subgroup is included in the train set, which demonstrates the universality of L1PA hypomethylation. Classification performance is noticeably lower for the BRC M0 subgroup when the model still performs well for OVC M0 and GAC M0. This may be because the BRC M0 subgroup contains the earliest stages of both cohorts ( Fig. S8B ) and the model is less well trained for these stages when removing it. To improve this, we have developed another integrated model that is trained for all cancer types and stages together but also incorporates the probabilities from the ‘expert’ models, defining profiles for various stages and dissemination status. This ‘stack’ model provides a unique prediction for cancer versus healthy status for each sample at once, independently of their cancer type / dissemination status, and allows to balance for the cancer characteristics used to train the model by including them all. We have then extracted the performances per cancer subgroups out of this integrated model, which reached better performances to classify BRC M0 and early-stages samples (see Methods, Fig. 4G , S8A ). This demonstrates the robustness of cancer detection by probing L1PA hypomethylation from plasma DNA with the DIAMOND assay and its ability to detect early stages. Finally, we evaluated if the extent of L1PA hypomethylation was associated with survival. In the validation cohort, we compared patients with high methylation to those with low methylation levels (above or below the median threshold - Fig. S8D ) and observed that more pronounced hypomethylation is clearly associated with shorter survival ( Fig. S8E ). This effect is thought to mainly reflect the tumor burden, which directly impacts the fraction of tumor DNA circulating in the blood, but it remains an interesting non-invasive tumor marker. DIAMOND data contain signal to infer the tumor burden, which improves cancer detection When looking at the overall methylation rates, we detected significantly more hypomethylation for more advanced stages of the disease, in particular in metastatic stages compared to localized stages ( Fig. 5A , Table S13 ). However, there was no significant differences between metastatic tumor tissues and primary tissues ( Fig. 5B , Table S14 ), which confirmed that L1PA methylation alteration is an early event in carcinogenesis 59 , 60 and also affects early-stage cancers. The differences observed in the blood ( Fig. 5A ) reflect the ctDNA fraction, which is known to be directly influenced by the tumor burden and the stage of the disease 1 , 9 . This demonstrates the quantitative potential of DIAMOND, which could help quantify the tumor burden and monitor the disease. View this table: View inline View popup Download powerpoint Table S13. Statistical results of global methylation differences between cancer stages using Mann–Whitney U test View this table: View inline View popup Download powerpoint Table S14. Statistical results of global methylation differences between primary and metastatic tissues using Mann–Whitney U test Download figure Open in new tab Fig 5. DIAMOND data contain signal to infer the tumor burden, which improves cancer detection A-B. Comparison of the average levels of methylation observed in localized vs metastatic plasma samples ( A , p Early/Adv. = 0.327, p Adv./Meta. = 3.14e-11, p Early/Meta. = 2.82e-14; p BRC_M0/M+ = 1.3e-18, p OVC_M0/M+ = 0.006, p GAC_M0/M+ = 0.005, Table S13 ) or in primary vs metastatic tissues ( B , p OVC = 0.257, p UVM = 0.820, Table S14 ). C . L1PA unique hits obtained for 15 breast cancer cell lines compared to the distribution of Cytoscan probes distributed throughout the human genome. D. Genome-wide z-score for all cancer (N = 564) vs healthy plasma samples (N = 120, 63 of the total 183 HDs are used as references to compute the z-score and are not displayed here, p = 1.21e-20). E. Genome-wide z-score by cancer subgroups vs healthy samples. The p-values are computed using Mann–Whitney U test (p CRC_M+ = 2.05e-18, p BRC_M+ = 1.01e-18, p UVM_M+_ = 0.169, p LC_M+ = 0.769, p OVC_M+_ = 1.84e-11, p GAC_M+ = 0.003, p BRC_M0 = 5.12e-17, p OVC_M0 = 1.09e-12, p GAC_M0 = 8.40e-06, Table S15 ) F. Correlation analysis for genome-wide z-score versus global methylation (r overall = −0.62, p = 1.25e-69). G. Performances of the 2-step model incorporating CNA with DNA methylation analysis (Classification is done as follow: Proba Cancer ≤ Threshold C1 AND GZ-score ≤ 121: prediction = Healthy; Proba Cancer > Threshold C1 OR GZ-score > 121: prediction = Cancer, see methods). View this table: View inline View popup Download powerpoint Table S15. Statistical results of Genome-wide z-score differences between healthy samples and cancer subgroups using Mann–Whitney U test View this table: View inline View popup Table S16. Cancer prediction and sample labelling with the 2-step classification integrating CNA analysis - healthy vs all cancers model View this table: View inline View popup Table S17. Cancer prediction and sample labelling with the 2-step classification integrating CNA analysis - healthy vs BRC M+ model View this table: View inline View popup Table S18. Cancer prediction and sample labelling with the 2-step classification integrating CNA analysis - healthy vs BRC M0 model View this table: View inline View popup Download powerpoint Table S19. Cancer prediction and sample labelling with the 2-step classification integrating CNA analysis - healthy vs CRC M+ model View this table: View inline View popup Download powerpoint Table S20. Cancer prediction and sample labelling with the 2-step classification integrating CNA analysis - healthy vs LC M+ model View this table: View inline View popup Download powerpoint Table S21. Cancer prediction and sample labelling with the 2-step classification integrating CNA analysis - healthy vs OVC M+ model View this table: View inline View popup Download powerpoint Table S22. Cancer prediction and sample labelling with the 2-step classification integrating CNA analysis - healthy vs OVC M0 model A recognized marker to non-invasively estimate the tumor burden and the fraction of ctDNA is the aneuploidy or copy number alterations (CNA) 61 , 62 , a hallmark of cancer genomes 63 . Given that DIAMOND hits are dispersed throughout the genome ( Fig. 1B ), we investigated the possibility to use our data to perform CNA analysis. The mFast-SeqS approach had previously used a PCR-based L1PA targeting as a prescreening tool to estimate the fraction of ctDNA 64 , 65 . This was done on native DNA whereas our data resulted from bisulfite-treated DNA. We first tested this approach on 15 breast cancer cell lines that were also assessed by CytoScan HD microarrays for aneuploidy. DIAMOND provided an average of 78 000 uniquely mappable reads per cell line, corresponding to around 10 000 L1PA copies precisely located in the genome. These L1PA hits homogeneously overlapped with regions covered by CytoScan probes along the genome ( Fig. 5C ). We computed z-scores, quantifying copy number alterations, at the level of chromosome arms as previously described ( 65 , 66 , see methods) and obtained similar results to those found with CytoScan arrays ( Fig. S9A ). We observed low alteration scores for the normal-like breast cell line HTERT-HME1 and good correlations between the 2 methods for the majority of the cell lines ( Fig. S9B ). Next, we computed genome-wide z-scores in healthy and cancer plasma samples and observed high alteration scores specifically in cancer samples ( Fig. 5D ). Cancer subgroups z-scores mirrored global hypomethylation profiles ( Fig. 5E , 2B and 4D, Table S15 ), both reflecting tumor burden and ctDNA fractions available. However, global methylation rates and z-scores were only moderately anti-correlated ( Fig. 5F ), demonstrating that these are partially independent markers that can provide distinct signals ( Fig. S9C ). To obtain a final classification labelling each sample as healthy or cancer, we used a 2-step categorization incorporating CNA analysis, which improved cancer detection. We used the probability of the cancer prediction provided by the methylation-based validation models (‘expert ’ and ‘all ’ models), applying a threshold identified on the discovery cohort, followed by a reclassification of samples which presented a z-score > 121, as cancer. This cut-off value was deduced from a cross-validation applied on C1 (see methods and Fig. S9C ). This classifier achieved high sensitivities with specificities between 97-100% for the ‘expert’ models and 93% for the integrated ‘all’ model in 4 distinct cancer types (BRC, CRC, LC, OVC, Fig. 5G ) and could be applied as is in the clinic. This was particularly promising for localized breast cancer with a sensitivity of 100% with both model types. Discussion In this study, we established a robust proof of concept that targeting hypomethylation of retrotransposons from cell-free DNA is a sensitive and specific biomarker to detect multiple forms of cancer non-invasively. Repetitive regions provide genome-wide information as they hold half of the CpG sites present in the human genome 67 . Hypomethylation of L1 elements, which is a common feature of multiple forms of cancer, is a universal marker and helps cover the heterogeneous profiles of cancer patients in a single test. Previous methylation studies have left these regions aside as they are inherently difficult to map, and DMR analysis is commonly performed on mapped data. However, repeats were previously used to profile aneuploidy and one of the first reported liquid biopsy cancer detection test was based on serum DNA integrity calculated with the ratio of short over long Alu fragments 68 . Here, we developed a new pipeline to detect methylation profiles at repeats with a single base-pair resolution, without resorting to mapping on a reference genome. This allowed us to rescue unmappable sequences (such as polymorphic copies that are not annotated in the reference genomes) or sequences that are difficult to map (such as multi-mappers) and retain most of our data. This is instrumental in achieving high sensitivity. The DIAMOND assay demonstrated high performance in detecting cancer samples and we established its feasibility in six different cancer types, including three at localized stages. It outperforms mutation screening, as it covers virtually all patients and presents a promising marker for pan-cancer detection. Previous cfDNA multi-cancer tests enabled sensitive detection and classification of multiple forms of cancers such as the CancerSeek test, based on the combination of mutations and proteins detection 11 , 69 and more recently combined with aneuploidy profiling using representation of Alu sequences 70 or the cfDNA methylation test Galleri from GRAIL, which targets 1,100,000 CpGs within unique regions 39 , 40 . In comparison, DIAMOND targets about 100,000 CpG sites within 30-40,000 copies of LINE-1 using a unique set of probes for all these copies and the various cancer type tested. Very recently, several multi-cancer detection methods based on retrotransposons were published. This includes the detection of the circulating L1 protein ORF1p 60 as well as the probing of the representation of all type of repeats from Annapragada and colleagues 71 . Another study demonstrated that cfDNA cleavage profiles at Alu sequences reflect their methylation status and can be used to detect liver cancer and nasopharyngeal carcinoma 72 . The DIAMOND assay interrogates specifically the methylation status of LINE-1 retrotransposons to obtain a global representation of the hypomethylation occurring during carcinogenesis and assess the potential of circulating DNA methylation changes at L1PA elements as a universal tumor biomarker. Our aim was to develop a new highly sensitive strategy to detect cancer-specific signatures in blood. Overall, DIAMOND reached great performances for multi-cancer detection using ‘expert’ models for specific cancer subgroups or integrated models including all types of cancers. Lower detection rate in metastatic lung cancer may be related to the fact that these samples seem to have a low tumor burden as indicated by their genome-wide z-scores ( Fig. 5E ). However, integrating other regions with cancer-specific methylation changes could help improve detecting this type of cancer. The DIAMOND assay provides methylation profiles from minute amount of cfDNA, down to a few nanograms, with high precision and high coverage using an affordable sequencing depth. Stronger L1PA hypomethylation was associated with shorter overall survival demonstrating its prognostic value. We therefore anticipate that our method has the potential to be applied for the development of routine clinical tests. While our integrated models provide proof of concept for cancer detection with blood screening in asymptomatic patients, the ‘expert’ models could also be useful to help diagnosis when we suspect the cancer location or to perform disease follow up. To push the DIAMOND assay towards a clinically applicable test, we also demonstrated that DIAMOND data can be used to perform copy number alterations analysis which improves cancer detection. We integrated this analysis in a classifier providing ‘healthy’ or ‘cancer’ labels for each sample and reached a detection of 91% of true positives for all cancers together and in particular a 100% sensitivity with 100% specificity for localized breast cancer with the BRC M0 expert model. Further testing with a larger number of samples covering earlier stages, including an independent cohort of gastric non-metastatic cancers, more subtypes and different types of cancer will enable to consolidate and expand these findings. Moreover, this will strengthen the classification models, which will perform better with more samples for training and testing. In the current study, we did not control for ethnic origin of samples tested. The potential impact of L1 polymorphism linked to ancestry should be estimated in future studies. It will also be important to study the impact of other conditions, such as auto-immune diseases, which may lead to the detection of L1 hypomethylation in blood. The recent study on the detection of circulating L1 ORF1p in cancer by Taylor and colleagues 60 demonstrated a high specificity and no sign of L1 reactivation in blood of patients with auto-immune disease, indicating that it might be a cancer-specific phenomenon. DIAMOND analysis could further be used to infer the tumor burden and monitor the disease to better detect minimal residual disease and the relapse early. However, the impact of treatments on methylation status should be investigated first. Overall, we developed a turnkey analysis method that identifies tumor plasmas across multiple types of cancer with the same marker. This approach offers an optimized balance between the number of targeted regions and sequencing depth, which could extensively improve the sensitivity of ctDNA detection in a cost-effective manner and improve management of patients with cancer. Methods Materials Cell lines Cell lines screened in Fig. 2A are the following: CRC (HCT116); OVC (SKOV, Caov3, ES-2); BRC (MDA-MB453, SKBR3, MDA-MB361, HCC202, ZR75.1, HCC70, BT474, MDA-MB231, Cal51, MDA-MB157, BT20, MCF7, HCC1954, HCC1569, HCC38); UVM (MP38, MP41, MP46, MP65, MM28, Mel285, Mel270, 92.1, Mel202, omm2.5, Mel290, mm66, omm1). Tissue and plasma samples Archived tissue samples (ovarian adjacent tumor tissues, ovarian primary and metastatic tumors, breast tumors and uveal melanoma tissues) were retrieved from the Pathology department of Institut Curie. Healthy white blood cells and healthy plasma were collected from blood of healthy donors through the French blood establishment (agreement #16/EFS/031) under French and European ethical practices. Blood samples from patients treated at the Institut Curie (Paris, France) were collected, after written informed consent, as part of the following studies: resectable metastatic colorectal cancers from the Prodige14 trial (approved by a French Personal Protection Committee – “CPP-Comité de Protection des Personnes Sud Méditerranée IV” and registered in ClinicalTrials.gov under NCT01442935 ); non-small cell lung cancer and metastatic HR+ HER2-breast cancer from the ALCINA study (approved by a French Personal Protection Committee and registered in ClinicalTrials.gov under NCT02866149 ; treatment-naïve ovarian cancer or triple-negative breast cancer patients eligible for surgery or neoadjuvant chemotherapy from the SCANDARE study (approved by the French National Agency for the Safety of Medicines and Health Products “ANSM - Agence National de Sécurité du Médicament”, a French Personal Protection Committee and registered in ClinicalTrials.gov under NCT03017573 ); multiple-types of metastatic cancers from the SHIVA02 study (approved by the French National Agency for the Safety of Medicines and Health Products “ANSM - Agence National de Sécurité du Médicament”, a French Personal Protection Committee and registered in ClinicalTrials.gov under NCT03084757 ), non-metastatic operable gastric cancers and advanced uveal melanoma from CTC-CEC-ADN study (approved by a French Personal Protection Committee and registered in ClinicalTrials.gov under NCT02220556 ). Additional archived samples were also retrieved from the biobank of the Institut Curie, patients having provided informed consent for research use. All samples were obtained in accordance with the ethical guidelines, with the principles of Good Clinical Practice and the Declaration of Helsinki. This study was approved by the Internal Review Board and Clinical Research Committee of the Institut Curie. Blood samples were collected at the time of inclusion, before the start of the treatment, in EDTA tubes. Plasma was isolated within 4 h, to ensure a good quality of cfDNA, by centrifugation at 820 g for 10 min, followed by a second centrifugation of the supernatant at 16,000 g for 10 min and stored at −80°C until use. Methods Preparation of DNA from cell lines and tissues and cfDNA Isolation of DNA from cell lines and healthy white blood cells (buffy coats) was performed using the QIAamp DNA Mini Kit or QIAamp DNA Blood Mini Kit (Qiagen) according to the manufacturer’s instructions. DNA from cryopreserved and formalin-fixed paraffin embedded (FFPE) tumor tissues was extracted using a classical phenol chloroform protocol and the NucleoSpin® FFPE DNA kit (Macherey Nagel), respectively. cfDNA was extracted from 2 ml of plasma using the automated QIAsymphony Circulating DNA kit (Qiagen), the Maxwell RSC ccfDNA LV plasma kit (Promega) or manual QIAamp circulating nucleic acid kit (Qiagen), according to the manufacturer’s instructions, and eluted in 60 μl, 75 μl or 36 μl, respectively. We verified that the extraction method did not impact our results ( Fig. S10A-B ). Isolated DNA was quantified by Qubit® 2.0 Fluorometer using dsDNA HS Assay Kit (Thermo Fisher Scientific) according to the manufacturer’s instructions and stored at −20°C until use. Bisulfite conversion We used sodium bisulfite-based chemical conversion to achieve base-pair resolution analysis, which is crucial to address methylation levels at single CpG dinucleotides and the co-methylation of multiple CpG sites to determine methylation haplotypes (methylation state of successive CpG sites). Bisulfite treatment of the isolated genomic DNA (up to 200 ng) from the cancer tissues, cancer cell lines and buffy coats was performed using an EZ DNA Methylation-Gold Kit™ (Zymo Research, CA, USA), following the manufacturer’s instructions. Bisulfite treatment of cfDNA (isolated from 2 ml of plasma) was performed using the Zymo EZ DNA Methylation-Lightning Kit™ (Zymo Research, CA, USA), according to the manufacturer’s instructions. Bisulfite-treated DNA was stored at −80°C and further used to build a sequencing library. We have compared the methylation profiles obtained with bisulfite conversion or enzymatic conversion (NEBNext® Enzymatic Methyl-seq Conversion Module, 73 ) followed by amplification with the DIAMOND targets and deep sequencing. We observed similar methylation profiles ( Fig. S11 ). We have tested various starting quantities of DNA (100ng, 10ng, 5ng) of white blood cells extracted from healthy donors (buffy coats). We have also compared cfDNA (10ng) and DNA from MCF7 breast cancer cell lines (100ng) know to display hypomethylation at L1PA elements. We observed that the methylation profiles along the 30 CpG targets are similar when starting with 100ng ( Fig. S11A ) but, in our hands, the bisulfite conversion seems more robust with decreasing stating DNA quantities ( Fig. S11B-C ). We thus observe some differences with the cfDNA samples at 10ng ( Fig. S11D ). We could still detect very similar hypomethylation profiles in MCF7 breast cancer cell lines ( Fig. S11E ). This shows that EM-seq could be used to profile L1PA methylation changes but the same conversion should be used throughout the study. Primer design Eight primer pairs were designed using the LINE-1 Human Specific (L1HS) consensus sequence from Repbase ( Fig. 1A ). Although 5’UTR (promoter region) is CpG-rich and common target for methylation quantitation, L1PA copies are frequently 5’-truncated. Therefore, primers were also designed for ORFI and ORFII to target more L1PA elements and improve the sensitivity of our assay. All primers were designed for plus strand of bisulfite converted DNA, using the MethPrimer or PyroMark Softwares. Targeted regions contained 2-7 CpG targets and ranged from 101bp to 150bp, to better capture cfDNA fragments, which have a mean size of 167bp 74 , ( Table S1 ). Primers were methylation-independent, encompassing 0 to 2 CpGs (none toward the 5′ end), and were degenerated to target both the methylated and unmethylated states. They contained Fluidigm universal CS (common sequence) tags at their 5′ ends. We incorporated a 16 N (random nucleotides) as unique molecular identifiers (UMI) between the target-specific sequence and the CS2 in the reverse primers for signal deconvolution to detect true low frequency alterations and for reducing errors. As LINE-1 hold thousands of copies per genome, a high number of distinct UMIs is essential for unique barcoding of each target molecule. The 16 N stretch between the target-specific sequence and the CS1 in forward primers was used to increase diversity of sequencing libraries and improve sequencing quality. All primers were obtained from Eurogentec (RP-cartridge purification method). Preparation of targeted bisulfite sequencing libraries To limit batch effects, sequencing and library preparation batches contained both cancer samples and healthy donors. We also specifically processed the validation cohort C2 with equilibrated number of healthy and cancer samples distributed in 5 experimental batches. Sequencing libraries were prepared using three PCR steps ( Fig. S1A ): 1) target-specific linear amplification for UMI assignment, 2) target-specific exponential amplification and 3) barcoding PCR for sample identification. Each library was prepared in two individual reactions (due to the overlap of amplicon 2 with other primers), including: I) Multiplex PCR amplification of 7 probes (Amplicon 1, 3, 4, 5, 6, 7, 8), and II) Single PCR amplification of amplicon 2. UMI assignment for multiplex reaction was performed using Platinum™ Multiplex PCR kit Master Mix (Thermofisher, Life Technologies SAS) in a 25 μL reaction containing 1x Platinum™ Multiplex PCR Master Mix, 0.01-0.06 µM mix of reverse primers and up to 5 ng bisulfite-converted DNA at the following thermocycling conditions: 95°C for 5 min followed by 1 cycle at 95°C for 30 s, 58°C for 90 s, 72°C for 40 s. UMI assignment for single reaction was performed using Hot Star Taq Plus DNA Polymerase (Qiagen) in a 25 μL reaction containing 1x Taq PCR Buffer, 0.65 U Hot Star Taq (5U/μL), 0.2 μM dNTPs, 1.5 mM MgCl2, 0.1 μM amplicon 2 reverse primer, up to 4 ng of bisulfite-converted DNA at the following thermocycling conditions: 95°C for 10 min followed by 1 cycle at 94°C for 60 s, 58°C for 30 s, 72°C for 40 s. To ensure complete removal of the reverse primers and dNTPs, each 25 μL reaction was treated with 50U of Exonuclease I and 10U of FastAP Thermosensitive Alkaline Phosphatase (Thermo Fisher Scientific) at 37°C for 1 h and heat-inactivated at 80°C for 15 min. Target-specific exponential amplification for multiple reaction was performed using Platinum™ Multiplex PCR kit Master Mix in a 50 μL reaction containing 1x Platinum™ Multiplex PCR Master Mix, 0.01-0.06 µM mix of forward primers, 0.2 μM CS2 reverse primer and 20 μL of purified PCR product at the following thermocycling conditions: 95°C for 5 min followed by 28 cycles at 95°C for 30 s, 58°C for 90 s, 72°C for 30 s followed by a 10 min incubation at 72°C. Target-specific exponential amplification for single reaction was performed using Hot Star Taq Plus DNA Polymerase in a 25 μL reaction containing 1x Taq PCR Buffer, 0.65 U Hot Star Taq (5U/μL), 0.2 μM dNTPs, 1.5 mM MgCl2, 0.2 μM amplicon 2 forward primer, 0.2 μM CS2 reverse primer and 8 ul of purified PCR product at the following thermocycling conditions: 95°C for 10 min, 25 cycles at 94°C for 60 s, 58°C for 30 s, 72°C for 30 s and 10 min at 72°C. PCR products of multiplex and single reaction were pooled together after quantification by qPCR and purified using Agencourt AMPure XP (Beckman Coulter) at 1.2x ratio according to the manufacturer’s protocol. Purified DNA was eluted in 30 ul of water. Barcoding PCR was performed using universal fluidigm primers. 25 μL of purified pooled PCR product, 1x Phusion HF Buffer, 1 U Phusion Hot Start II DNA Polymerase (Thermo Fisher Scientific), 0.2 μM fluidigm primer, and 0.2 mM dNTPs were mixed in the final volume of 50 μL and amplified with the following conditions: 98 °C for 2 min, followed by 20-25 cycles of 98 °C for 10 s, 62°C for 30 s, and 72 °C for 30 s followed by a 5 min incubation at 72°C. The amplified product was purified by two consecutive AMPure XP steps using 1) a low concentration of AMPure XP beads (0.6x – 0.7x ratio) where the beads containing the larger fragments are discarded and supernatant collected (reverse purification) and 2) higher beads concentration (1.1x – 1.2x ratio) where the beads containing fragments of interest were collected and purified according to the manufacturer’s protocol. Size-selected libraries were eluted in 15 μL of low-EDTA TE buffer. The libraries were quantified with Qubit HS DNA kit (Thermo Fisher Scientific), qualified with nano-electrophoresis (TapeStation, Agilent), and pooled equimolarly for sequencing. Sequencing was performed on Illumina HiSeq rapid run mode or NovaSeq (PE 30bp, 170bp). Preprocessing of the reads For each sample, FASTQ files containing raw sequences, composed by the following parts: CS1, forward UMI, forward primer, insert, reverse primer, reverse UMI, and CS2 ( Fig. S1A ) were first filtered for reads quality (average >Q20 per read) and then demultiplexed (i.e., cut using atropos v1.1.31) using forward and reverse primer sequences. FASTA files were created per primer-set, containing inserts and reverse UMIs for deduplication, as they are unique for each input DNA molecule. Inserts and reverse UMI were then filtered on expected sizes (with a tolerance of ± 5 bases for the inserts). Filtered inserts and UMIs sequences were concatenated and deduplicated using vsearch v2.15.2. Reverse UMIs were then trimmed and resulting inserts from all samples were aggregated into a single FASTA file per primer-set. Clustering, extraction of representative sequences and global alignment Using vsearch (with the following parameters: --cluster_fast --notrunclabels -- fasta_width 0 --iddef 4 --id 0 --qmask none --clusterout_sort --consout ), a clustering based on sequence identity was applied to each FASTA file, or a subset of 20 million reads randomly chosen if a given file comprised more. The 10 largest clusters’ representative sequences were isolated in separate files. Using mafft v7.508 (with the following parameters: -- globalpair --maxiterate 1000), the 10 representative sequences were aligned pairwise resulting in a reference database for each primer-set. Lastly, using mothur v 1.48.0 (with the following parameters: #align.seqs(candidate=, template=, align=needleman, match=1, mismatch=-1, gapopen=-1, gapextend=0)) on each primer-set FASTA file, all sequences from all samples were aligned to the corresponding reference. CG calling, methylation levels and haplotypes extraction To call CpG dinucleotides of interest, a sliding window of 2 bp was used on all aligned sequences to determine the distribution of dinucleotides along each amplicon target. A first threshold of ≥ 20% of CG/TG dinucleotides was used to select potential CpG sites. A second threshold was applied to eliminate dinucleotide with ≥95% TG and select position with at least 5% methylation rate. From the aligned sequences, the patterns of methylation were extracted and compiled into either average levels of methylation at each previously identified CpG sites, or proportions of methylation haplotypes for each sample. Machine learning-based classification models The resulting data (represented as average levels of methylation per CpG site or proportions of methylation haplotypes or both) were used to do supervised learning of statistical models using the random forest classifier algorithm 75 from Python package scikit-learn 76 , with the following hyperparameters: n_estimators=300, criterion=’gini’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’sqrt’, max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None. The rational for choosing random forest over other learning methods was driven by three main factors: 1) it is less prone to overfitting 75 ; 2) it shows excellent performance even when the quantitative relationship between features and observations is biased in favor of the former, such as when using methylation haplotypes data representation 77 ; 3) random forests also inherently return measures of variable importance 75 , such as mean decrease in impurity, which greatly facilitate the interpretability of model decisions. The features used to train the models were the average levels of methylation per CG site (n=30), the proportions of methylation haplotypes (i.e., the combinatorial of all the possible methylation status of CG sites within a given amplicon, n=372) or both. No additional transformation nor feature selection was performed on the data. Expert and All cancer models Model classifications were run 5000 times in order to estimate variance and confidence intervals. For the discovery step, in each run, as many samples from each class were randomly drawn to construct a balanced subset of the data 78 . The samples from these draws were stratified by class and split into 60% for training, 40% for evaluation. For the validation step, we trained the model on the entire cohort 1 and evaluated it on cohort 2. The true and false positive rates for all possible classification threshold were evaluated at each run, with interpolation to generate an average ROC curve with 95% confidence interval for the 5000 runs. 95% confidence interval has been calculated with the following formula: Μ±z*s/√n with M the average of the variable, z the confidence level (z∼1.96 for 95%CI), s the standard deviation, n the number of samples in the variable. Blind models We trained a random forest on haplotypes features, removing one cancer type or subgroup from the training set. The specific cancer type or subgroup is then assessed in the test set. We pooled together the discovery and validation cohorts, training on 2/3 of all the samples-- excluding the cancer type or subgroup to test for-- and testing on the remaining one-third of the samples. The only exception was metastatic gastric cancer: as they are made up of only 3 samples, they were systematically moved to the test set, consequently blinding the model towards GAC M+ samples. We also trained a stacked version with this setup (see below). Stacked machine learning model We developed a model referred to in the paper as “Stack”. This model uses one Random Forest model for each combination of cancer type and metastatic status, known as the “expert sub-model.” Each expert sub-model was trained on one-third of the healthy plasma samples and one-third of the samples matching the cancer subgroup of interest (cancer type and dissemination status). These expert sub-models were then combined into a Random Forest Stack model, which uses both the haplotype features and the probabilities output by each expert sub-model. The final Random Forest Stack model was trained on an additional one-third of the healthy and cancer plasma samples and tested on the remaining samples (which represent one third of the healthy samples and one third of each subgroup). Mutation screening for ovarian cancer samples Ovarian tumor genotyping was performed using the TIGER panel previously developed by Institut Curie 79 , which targets 78 genes or using ‘custom NGS’ with amplicons targeting Tp53 and TSC2 , which are the 2 most frequently mutated genes in this group of patients, with TruSeq library constructions for low input material (dual strand technology). After mutations identification, ovarian cancer plasmas were genotyped using custom NGS or ddPCR as previously done 14 . Sequencing was performed on a MiSeq V3-150 (25M) with Paired End 75bd protocol. Results are presented in Table S8 . Whole genome bisulfite sequencing analysis To see if we could retrieve cancer-associated L1PA hypomethylation in others plasma studies, we analyzed data sets from 2 recent studies profiling cell-free DNA methylation with Whole Genome Bisulfite Sequencing (WGBS) in healthy and cancer patients (Liu et al. 2024 53 ; Gao et al. 2022 54 ). Liu et al. analyzed methylation at 75 617 CpGs in the whole genome (estimated with Bis-SNP after mapping with Bismark) of 17 healthy donors and 31 cancer patients. We extracted the methylation levels at CpG residing within L1PA families hit by DIAMOND (mostly L1HS-L1PA10, see Fig. 1C ). To do so, we identified CpG dinucleotides covered by L1HS-L1PA10 elements, and their position within L1 based on the L1HS consensus sequence, and we selected the DIAMOND copies using Bedtools 80 . Gao et al. , provided FASTQ files data for 123 breast cancer and 40 healthy patients. In order to avoid breast-cancer subtype effect and age effect, we subsampled the data to generate an age matched cohort with 16 healthy and 15 HR+HER2-M+ patients. We first merged the paired-end reads using Fastp 81 and mapped them on the hg38 reference genome using bismark . At this step, we followed two different approaches to obtain the methylation level at the 30 L1PA CpGs that we studied with DIAMOND. 1) Using bismark extractor , we retrieved the methylation levels of all CpGs covered and intersected it with the dinucleotides covered by DIAMOND using Bedtools. Results produced with this approach are subsequently called “Gao et al (2022) Bismark”. 2) Using mothur 82 , we aligned the reads that mapped on L1HS-L1PA10 elements on an aggregate of native and converted (CG converted to TG) reference sequences of L1HS. All read that mapped with a score lower than 51% of similarity (score computed by mothur ) were rejected (this threshold was established from the percentile at 99% of similarity score distribution from 1000 random sequences 167 bp long). Results generated by this approach are subsequently called “Gao et al (2022) Mothur ”. Finally, for “Gao et al (2022) Bismark” and “Gao et al (2022) Mothur “, we refined the results to only take in account the copy of L1HS-L1PA10 targeted by DIAMOND. Further refinements were done to only consider L1HS-L1PA3 copies with the second approach and is referred as “L1HS-L1PA3”. Survival analysis Survival analysis has been performed with survival and survminer R packages - https://cran.r-project.org/web/packages/survival/citation.html , doi:10.32614/CRAN.package.survminer. Copy number alterations analysis Cytoscan HD microarrays: 250 ng of gDNA from 15 breast cell lines (1 normal-like: HTERT-HME1 and 14 cancer cell lines: MDA-MB231, MDA-MB453, HCC1569, BT20, HCC1954, HCC38, MDA-MB361, ZR 75.1, MDA-MB157, MCF7, SKBR3, HCC202, HCC70, BT474) were characterized using Affimetrix/Thermo Cytoscan HD microarrays at the Genomics facility of Institut Curie to profile aneuploidy. To compare with the z-score by chromosome arm, we calculated the mean of Weighted Log2 combining probes by chromosome arms. DIAMOND CNA: 1) Z-score calculation: prepossessed reads were uniquely mapped on hg38 genome using Bismark (version 0.23.1). As in Belic et al. 2015 66 , only the reads with an alignment score > 15 were kept. Resulting reads from all amplicons (excluding #2 and #3) were merged and normalized number of reads per chromosome arm (excluding sexual chromosomes X and Y) per sample were calculated with R. Next, the amplifications/deletions score was computed using the following formula: with i = a given chromosome arm, n = a given sample and controls = a set of reference samples (white blood cells from 10 healthy reference samples for the cell lines, 63 healthy plasma from C1 as a reference for cancer and healthy plasma samples). Genome-wide z-scores were computed by summing the squared z-scores of all chromosome arms. 2) Z-score threshold identification: to identify altered versus normal z-scores, we performed 5-fold cross validation of simple cutoff classification model on the discovery cohort (N Healthy = 60, N Cancer = 350) using the genome-wide z-score and calculated the threshold that maximize the sensitivity at 100% specificity. 2-step classification for sample labelling First, we selected the threshold for the probability of the cancer prediction (Proba Cancer ) on the discovery cohort maximizing the sensitivity for a 99% specificity, per ‘cancer-type’ model. We applied this threshold on the Proba Cancer computed with the validation models and reclassified samples which presented a z-score > 121, as cancer (Proba Cancer ≤ Threshold C1 AND GZ-score ≤ 121: prediction = Healthy; Proba Cancer > Threshold C1 OR GZ-score > 121: prediction = Cancer) see Tables S16-22. Data Availability Data have been deposited as methylation matrices (CG% or haplotypes%) on the Zenodo database with the following accession code: https://zenodo.org/uploads/12206227 and as compressed FASTQ files at the European Genome-Phenome Archive at https://ega-archive.org/ under the accession code EGAD50000000646 . WGBS sequencing data were downloaded from publicly available database at https://zenodo.org/records/7779198 and from the National Center for Biotechnology Information ( https://www.ncbi.nlm.nih.gov ) under the accession number PRJNA494975. The code used to analyze the data is available on github: https://github.com/ProudhonLab . Funding The NGS facility was supported by ANR-10-EQPX-03 (Equipex) and ANR-10-INBS-09-08 (France Génomique Consortium) grants and by the Cancéropôle Île-de-France. This research was supported by grants, of which C.P. was recipient, from: The European Research Council (ERC-StG EpiDetect), The Ligue contre le cancer (RS17-75-75), The prematuration program of the Centre National pour la Recherche Scientifique (CNRS), The SiRIC 2 Curie program (INCa-DGOS-Inserm_12554), The DEEP Strive funding (LABEX DEEP 11-LBX0044). CAA research was supported in part by the French government under management of Agence Nationale de la Recherche as part of the “Investissements d’avenir” program, reference ANR-19-P3IA-0001 (PRAIRIE 3IA Institute). Author Contributions MM, MH and CP designed the study MM, MH, AM, CH, MG, VD, CB, MS, CR and DG performed the experiments MM, MH, AM, KDS, MG, KVG, CAA and CP analyzed the data. MM, AM, KDS, MG and KVG performed the statistical analyses. CH, AR, FCB, JYP, CL, MK, CLT, IB, MHS, OL, LC contributed with identification of clinical samples. MM, MH, AM, KDS, MG, CAA and CP wrote the manuscript. All authors participated in revising the manuscript and approved this final version. Competing Interests statement CP, MM, MH, and CAA have an ongoing patent application relating to circulating tumor DNA analysis. Supplementary materials Download figure Open in new tab Fig S1 A. Scheme of the targeted bisulfite sequencing strategy used to build the DIAMOND assay libraries. The protocol starts by the incorporation of unique molecular identifiers (UMI) via 1 cycle of linear PCR to identify each initial molecule present in the sample. We also incorporated a 2 nd set of molecular identifiers (UID) during the 2 nd PCR in order to generate libraries with enough nucleotide diversity which is crucial for a successful downstream sequencing (See method section for more details). B-E. Hits obtained across the genome using the DIAMOND assay with standard coverage (indicated in million reads – M) in healthy plasma (HP) over the L1PA annotations (B) , in HP versus colorectal cancer (CRC) plasmas (C) , in HP versus breast cancer (BRC) plasmas (D) , and in HP versus ovarian cancer (OVC) plasmas (E) , (see also Table S2 ). F. Summary flow chart illustrating the pipeline developed for reference-free alignment of sequencing data (see also Methods). Download figure Open in new tab Fig S2 A. Proportion of CG and TG dinucleotides within the 8 regions targeted, after reference-free alignment. Number of CpG sites detected vs expected are indicated for each amplicon. The dashed red line represents the first threshold applied to select dinucleotide positions with >20% (GC+TG). A second threshold of >5% CG is applied. B. Alignment of 10 representative sequences of the largest clusters obtained for amplicon 2 relative to the L1HS consensus sequence, highlighting 2 additional CpG positions identified (dark green). C. Principal component analysis, based on the average methylation level at each CpG position (n=30), showing the distribution of healthy and cancer samples annotated for their cancer subgroups in the two first dimensions (left panel), and the contribution of CpG positions used as components (right panel). D. Number of possible haplotype features extracted from the 30 CpGs within the 7-amplicon panel. E. Principal component analysis, based on haplotypes proportions (n=372), showing the distribution of healthy controls and cancer samples annotated for their cancer subgroups in the two first dimensions (left panel), and the contribution of haplotypes used as components (middle and right panels). Middle panel highlights 4 groups of methylation levels relative to the haplotype components. The right panel highlights the contribution of the various amplicons. Download figure Open in new tab Fig S3. A-F. Analysis of ovarian tumor and plasma paired samples. A. Global DNA methylation in ovarian paired samples (ovarian healthy tissues, ovarian tumors: 7 primary and 3 metastatic tumors, ovarian cancer plasmas) from 10 patients versus healthy donor plasmas, N=183. B-C. Correlation of the methylation differences, relative to the mean methylation of healthy donor plasmas, between tumor and plasma paired samples. Ranges of correlation scores ( ρ ) are indicated. B. Correlation for single CpG methylation levels (30 features per sample). C. Correlation for methylation haplotype proportions (372 features per sample). D. Mean methylation observed at each CpG position across L1PA elements in healthy ovarian tissues, ovarian tumors (primary or metastatic), ovarian cancer plasmas and healthy donor plasmas. E. Methylation pattern observed at each CpG position in paired samples from patient with healthy ovarian tissue, primary tumor, metastatic tumor and cancer plasma. F. Proportion of fully methylated or fully unmethylated haplotypes, for each amplicon (mean centered per amplicon) in paired ovarian healthy tissues, tumor tissues, cancer plasmas and healthy donor plasmas. G-K. Analysis of breast tumor and plasma paired samples. G. Global DNA methylation in breast primary tumors and cancer plasmas from 16 patients versus healthy donor plasmas, N=183. H-I. Correlation of the methylation differences, relative to the mean methylation of healthy donor plasmas, between tumor and plasma paired samples. Ranges of correlation scores ( ρ ) are indicated. H. Correlation for single CpG methylation levels (30 features per sample). I. Correlation for methylation haplotype proportions (372 features per sample). J. Mean methylation observed at each CpG position across L1PA elements in breast tumors, breast cancer plasmas and healthy donor plasmas. K. Proportion of fully methylated or fully unmethylated haplotypes, for each amplicon (mean centered per amplicon) in paired breast tumor tissues, cancer plasmas and healthy donor plasmas. Paired samples and their correlation details are listed in Table S6 . Download figure Open in new tab Fig S4. A-B. Performances for classifiers using single CpG methylation features ( A ) or haplotype features ( B ) with undersampling (plain bars) or not (hatched bars) presented as AUCs or sensitivities at 99% specificity. C. High correlation between haplotype representation and CpG positions shown by the proportion of haplotypes with at least one correlation to CpG sites along spearman rho correlation thresholds. Amplicons 1, 4-8 show high correlation all along. Amplicon 2 shows lower correlations due to its very high number of haplotypes (n = 264). n: number of haplotypes per amplicon. D-E. Performances for each subgroup tested separated (black bars) or from the model testing all cancer samples together (orange) using haplotype features presented as AUCs ( D ) or sensitivities at 99% specificity ( E ). F-H. Comparison of methylation patterns in hormone dependent (HR+HER2-, N=125) vs triple negative (TNBC N=16) metastatic breast cancers at the overall level ( F ), single CpG level ( G ) or the haplotype level ( H ). BRC M+ from C1 and C2 are analyzed (see Table S7 ). I . ROC curves obtained for plasma samples classification with the 3-stage model, using single-CpG methylation rates. Download figure Open in new tab Fig S5. A-I. Analyses performed with whole genome bisulfite sequencing (WGBS) data (see methods section for more details). A. Mean coverage observed at each 30 CpG position belonging to the L1HS-L1PA10 copies in the various analyses performed from WGBS data and DIAMOND. B. Methylation pattern observed across the 30 CpGs targeted with DIAMOND in 2 healthy plasma samples and 1 prostate plasma sample extracted from the WGBS data published in Liu et al. 53 . Methylation data were available as CG% at 75 617 CpGs genome wide. Only these 3 samples presented high enough coverage to profile reliable methylation patterns along L1PA elements (covering 26 135 CpGs). C-I. Analysis of WGBS data from Gao et al. 54 , which profiled metastatic hormone dependent breast cancer (BRC M+) and healthy plasma samples. C. Age distribution of the age-matched cohort selected and analyzed, composed of 16 healthy and 15 BRC M+ plasmas. D-F. Methylation data analyzed with bismark extractor (‘Bismark analysis’). This includes positions specifically annotated as CpGs. D. Average DNA methylation levels demonstrating significant hypomethylation in BRC M+ (p=2.53e-04). E. Methylation patterns observed with WGBS analyzed with bismark extractor demonstrating lower methylation in BRC M+ across the 30 CpG of L1PA elements studied with DIAMOND. F. ROC curve obtained for healthy versus cancer plasmas classification with single-CpG methylation rates (AUC = 78%). G-I. Methylation data analyzed by extracting the CpG positions covered by L1HS-L1PA10 DIAMOND hits independently of the nature of the dinucleotide observed in the WGBS (‘Mothur analysis’). G. Average DNA methylation levels demonstrating significant hypomethylation in BRC M+ (p=0.011). H. Methylation patterns observed with WGBS analyzed with mothur demonstrating lower methylation in BRC M+ across the 30 CpG of L1PA elements studied with DIAMOND. I. ROC curves obtained for healthy versus cancer plasmas classification with single-CpG methylation rates, using positions covered by L1PA1-10 elements hit with DIAMOND (AUC = 71%) or by only L1HS, L1PA2 and L1PA3 elements hit with DIAMOND (AUC = 78%), which represent more than half of the data obtained with our targeted method. Download figure Open in new tab Fig S6. A. Comparison of the methylation level at individual CpG sites along the L1 for each cancer subgroup in cohort 1 (C1) versus cohort 2 (C2). Amplicon numbers are indicated. B. Clustering analysis on OVC M0 stages of cohort 1 (C1) and cohort 2 (C2) showing a split of C2 into cluster 1 and 2 and separation of the whole cohort 1 in a single cluster (cluster 3) including 2 samples of C2. C. Comparison of the haplotype proportions for each cancer subgroup and the healthy controls in cohort 1 (C1) versus cohort 2 (C2). Light grey = healthy donors C1, dark grey = healthy donors C2. Statistical analyses are reported in Table S11 . D-E. ROC curves obtained for plasma samples classification in the validation cohort with the ‘all cancers’ model ( D ) or the ‘cancer-types’ models ( E ) using single CpG methylation features. All classifications include 5000 stratified random repetitions of learning on the whole discovery cohort and testing on the whole validation cohort without undersampling. F-G . Performances for validation classifiers using CpG methylation (grey) or haplotypes (black) features presented as AUCs ( F ) or sensitivity at 99% specificity ( G) . Average AUCs are computed from the 5000 AUCs generated by each repetition of learning. Bars indicate 95% CI. Download figure Open in new tab Fig S7. A. Age distribution of all cancer vs healthy samples (C1 + C2). B. Linear models demonstrating that the age did not predict the methylation patterns alone (Model 1) and had a significant but small effect (see also ( C )) in combination with the ‘biological class’ representing the cancer subgroups (Model 2). C. Impact extent of the parameters tested represented by the absolute values of their Estimate (red: significant, grey: not significant). D. Global methylation levels for all samples versus their age. Correlation curves illustrate small impact of age on the methylation patterns in our study, with an increase in methylation with age. E. Global methylation levels of original data next to age-adjusted data, organized by subgroups, demonstrating that cancer subgroups versus healthy differences remained similar when adjusting for the age. Statistical differences between each cancer subgroup and healthy samples were computed using Mann– Whitney U test (see Table S12 for detailed p-values and CI). F-G. PCA analysis on original data (left panel) or age-adjusted using AC-PCA to adjust for confounding factors conjointly (right panel) for single CpG methylation levels ( F ) or haplotypes ( G ). Adjusting for the age seemed to have limited impact on the data indicating that age is not a confounding factor here. H. Age distribution in the age-matched vs non-age-matched cohorts extracted from C2. I. Both cohorts are composed of 76 samples including half healthy and the same proportions of M+ cancer types. J. Classification performances for ‘all cancer’ model with age-matched vs non-age-matched cohorts presented as AUCs. Download figure Open in new tab Fig S8. A. Comparison of the performances of the ‘expert’ models testing each subgroup separated (black bars), the ‘all’ model testing all cancer samples together (orange) or the ‘stack’ model integrating both models (blue bars) using haplotype features presented as AUCs. B. Composition of cancer types per stage-groups (early (I/II), locally advanced (III) or metastatic (IV)). C. Performances for integrated models (‘all’ or ‘stack’) when training for the specific cancer type tested (seen) versus when not training for this cancer type overall (blind). These classifications have been performed on the whole sample set including C1 and C2. BRC M0/M+, OVC M0/M+ or GAC M0/M+ have been removed all together. D. Distribution of the global methylation levels in patients of the validation cohort. Median = 56%. E. Survival analysis comparing patients with high and low methylation levels (N=79 vs N=95 respectively). Download figure Open in new tab Fig S9. A. Heatmap comparing copy number alterations (CNA) profiled by DIAMOND Z-score versus Cytoscan HD microarrays for 1 normal-like breast cell line (HTERT-HME1) and 14 breast cancer cell lines (in order: MDA-MB231, MDA-MB453, HCC1569, BT20, HCC1954, HCC38, MDA-MB361, ZR 75.1, MDA-MB157, MCF7, SKBR3, HCC202, HCC70, BT474) organized by increasing aneuploidy measured by the chromosome arms z-scores. Correlation scores are indicated below the heatmap. B. Correlation between CNA measured by CytoScan HD microarrays and DIAMOND, r overall = 0.68, p = 8.73e-80. C . CNA as a function of the probability of a sample to be classified as cancer (Proba Cancer) in the validation models. Graphs for cancer subgroups and all cancers together are shown. Download figure Open in new tab Fig S10. A. Principal component analysis (PCA), based on the average methylation level at each CpG position (n=30), showing the distribution of 30 pools of healthy plasma which were each divided into 3 replicates extracted in parallel with the Qiasymphony (Q), Maxwell (Max) and the Manual (Man) kits. The table shows the results of Wilcoxon tests demonstrating no significant differences between the 3 extraction methods. B. PCA showing the distribution of the various batches of healthy donors compared to cancer samples of C1 and C2 highlighting that the experimental effect that can be observed between various library construction batches are distributed along dimension 2 when the disease status effect appears in dimension 1 (see also Fig. S2C , E ). Download figure Open in new tab Fig S11. A-E. Comparison of the methylation level at individual CpG sites across L1 for samples treated with enzymatic (EM) versus bisulfite (BS) conversion followed by amplification with the DIAMOND targets and deep sequencing or with various DNA starting quantities. A. 100ng DNA from healthy white blood cells (WBC) treated with EM versus BS conversion. B. 10ng healthy cfDNA vs various DNA input quantities from WBC converted with BS. C. 10ng healthy cfDNA vs various DNA input quantities from WBC converted with EM. D. 10ng cfDNA or WBC DNA treated with BS versus EM conversion. E. DNA from healthy WBC versus MCF7 breast cancer cell lines treated with EM versus BS conversion. Acknowledgments We thank the members of the C.P.’s laboratory for critical reading of the manuscript. We are grateful to D. Bourc’his and her team for hosting us during part of this study. We thank the members of the ICGex NGS platform of the Institut Curie, especially S. Lameiras, V. Raynal and S. Baulande for advice and the non-profit organization “La Vannetaise” for financial support. Footnotes In this new version, we have now included: – Analysis of plasma and tumor paired samples demonstrating very good correlation using methylation haplotype proportions (new Fig.S3) – Comparison of methylation profiles of hormone dependent and triple negative breast cancers to explain the differences of the M0 and M+ subgroups (new Fig.S4F-H) – The performances per cancer-type and stage from the all cancer model (new Fig.S4D-E and Fig.S8A) – Analysis from publicly available plasma WGBS data which shows that we can retrieve L1PA hypomethylation in cancer compared to healthy from genome wide profiles (new Fig.S5) – Analysis comparing an age-matched cohort and a non-aged-matched cohort extracted from C2 demonstrating identical classification performances in both cohorts (new Fig.S7H-J). – Survival analysis on the validation cohort demonstrating that higher hypomethylation is associated with shorter survival (new Fig.S8D-E) – Controls demonstrating no effect of the cfDNA extraction method employed (new Fig.S10) – Bisulfite and enzymatic conversion comparison (new Fig.S11) – Data for the screening of ovarian cancer mutations (new Table S6) – New data presenting a new integrated model and tests where we exclude cancer subgroups or types from the train set and which are well recognized as cancer by our models to highlight the universality of our approach (new Fig.4G and Fig.S8A-C). References 1. ↵ Bettegowda , C. et al. Detection of circulating tumor DNA in early- and late-stage human malignancies . Science Translational Medicine 6 , 224ra24 – 224ra24 ( 2014 ). OpenUrl Abstract / FREE Full Text 2. Newman , A. M. et al. An ultrasensitive method for quantitating circulating tumor DNA with broad patient coverage . Nature Medicine 20 , 548 – 554 ( 2014 ). OpenUrl CrossRef PubMed 3. ↵ Garcia-Murillas , I. et al. Mutation tracking in circulating tumor DNA predicts relapse in early breast cancer . Science Translational Medicine 7 , 302ra133 ( 2015 ). OpenUrl Abstract / FREE Full Text 4. ↵ Bidard , F.-C. , Weigelt , B. & Reis-Filho , J. S . Going with the flow: from circulating tumor cells to DNA . Science Translational Medicine 5 , 207ps14 – 207ps14 ( 2013 ). OpenUrl FREE Full Text 5. ↵ Diaz , L. A. & Bardelli , A . Liquid Biopsies: Genotyping Circulating Tumor DNA . Journal of Clinical Oncology 32 , 579 – 586 ( 2014 ). OpenUrl Abstract / FREE Full Text 6. ↵ Ignatiadis , M. , Sledge , G. W. & Jeffrey , S. S . Liquid biopsy enters the clinic — implementation issues and future challenges . Nature Reviews Clinical Oncology 1 – 16 ( 2021 ) doi: 10.1038/s41571-020-00457-x . OpenUrl CrossRef PubMed 7. ↵ Heitzer , E. , Haque , I. S. , Roberts , C. E. S. & Speicher , M. R . Current and future perspectives of liquid biopsies in genomics-driven oncology . Nature Reviews Genetics 20 , 1 – 18 ( 2019 ). OpenUrl PubMed 8. ↵ Stover , D. G. et al. Association of Cell-Free DNA Tumor Fraction and Somatic Copy Number Alterations With Survival in Metastatic Triple-Negative Breast Cancer . J. Clin. Oncol . 36 , 543 – 553 ( 2018 ). OpenUrl CrossRef PubMed 9. ↵ Wan , J. C. M. et al. Liquid biopsies come of age: towards implementation of circulating tumour DNA . Nature Reviews Cancer 17 , 223 – 238 ( 2017 ). OpenUrl CrossRef PubMed 10. ↵ Mattox , A. K. et al. Applications of liquid biopsies for cancer . Science Translational Medicine 11 , eaay1984 ( 2019 ). OpenUrl FREE Full Text 11. ↵ Cohen , J. D. et al. Detection and localization of surgically resectable cancers with a multi-analyte blood test . Science 1 , eaar3247 – 10 ( 2018 ). OpenUrl 12. ↵ Riva , F. et al. Patient-Specific Circulating Tumor DNA Detection during Neoadjuvant Chemotherapy in Triple-Negative Breast Cancer . Clin Chem 63 , 691 – 699 ( 2017 ). OpenUrl Abstract / FREE Full Text 13. ↵ Jeannot , E. et al. A single droplet digital PCR for ESR1 activating mutations detection in plasma . Oncogene 73 , 1 – 9 ( 2020 ). OpenUrl 14. ↵ Darrigues , L. et al. Circulating tumor DNA as a dynamic biomarker of response to palbociclib and fulvestrant in metastatic breast cancer patients . Breast Cancer Res . 1 – 10 ( 2021 ) doi: 10.1186/s13058-021-01411-0 . OpenUrl CrossRef PubMed 15. ↵ Silveira , A. B. et al. Multimodal liquid biopsy for early monitoring and outcome prediction of chemotherapy in metastatic breast cancer . npj Breast Cancer 1 – 9 ( 2021 ) doi: 10.1038/s41523-021-00319-4 . OpenUrl CrossRef 16. ↵ Baylin , S. B. & Jones , P. A . A decade of exploring the cancer epigenome - biological and translational implications . Nature Reviews Cancer 11 , 726 – 734 ( 2011 ). OpenUrl CrossRef PubMed Web of Science 17. ↵ Flavahan , W. A. , Gaskell , E. & Bernstein , B. E . Epigenetic plasticity and the hallmarks of cancer . Science 357 , eaal2380 – 10 ( 2017 ). OpenUrl Abstract / FREE Full Text 18. ↵ van der Pol , Y. & Mouliere , F. Toward the Early Detection of Cancer by Decoding the Epigenetic and Environmental Fingerprints of Cell-Free DNA . Cancer Cell 36 , 350 – 368 ( 2019 ). OpenUrl CrossRef PubMed 19. ↵ Esteller , M . Cancer epigenomics: DNA methylomes and histone-modification maps . Nature Reviews Genetics 8 , 286 – 298 ( 2007 ). OpenUrl CrossRef PubMed Web of Science 20. ↵ Feinberg , A. P . The Key Role of Epigenetics in Human Disease Prevention and Mitigation . N Engl J Med 378 , 1323 – 1334 ( 2018 ). OpenUrl CrossRef PubMed 21. ↵ Guo , S. et al. Identification of methylation haplotype blocks aids in deconvolution of heterogeneous tissue samples and tumor tissue-of-origin mapping from plasma DNA . Nature Genetics 49 , 635 – 642 ( 2017 ). OpenUrl CrossRef PubMed 22. ↵ Li , W. et al. CancerDetector: ultrasensitive and non-invasive cancer detection at the resolution of individual reads using cell-free DNA methylation sequencing data . Nucleic Acids Research 46 , e89 – e89 ( 2018 ). OpenUrl CrossRef PubMed 23. ↵ Moarii , M. , Reyal , F. & Vert , J.-P . Integrative DNA methylation and gene expression analysis to assess the universality of the CpG island methylator phenotype . Hum Genomics 9 , 26 ( 2015 ). 24. ↵ Chan , K. C. A. et al. Noninvasive detection of cancer-associated genome-wide hypomethylation and copy number aberrations by plasma DNA bisulfite sequencing . Proceedings of the National Academy of Sciences of the United States of America 110 , 18761 – 18768 ( 2013 ). OpenUrl Abstract / FREE Full Text 25. Legendre , C. et al. Whole-genome bisulfite sequencing of cell-free DNA identifies signature associated with metastatic breast cancer . Clinical Epigenetics 7 , 1 – 10 ( 2015 ). OpenUrl PubMed 26. ↵ Sun , K. et al. Plasma DNA tissue mapping by genome-wide methylation sequencing for noninvasive prenatal, cancer, and transplantation assessments . Proceedings of the National Academy of Sciences of the United States of America 112 , E5503 – E5512 ( 2015 ). OpenUrl Abstract / FREE Full Text 27. ↵ Lehmann-Werman , R. et al. Identification of tissue-specific cell death using methylation patterns of circulating DNA . Proceedings of the National Academy of Sciences 201519286 – 34 ( 2016 ) doi: 10.1073/pnas.1519286113 . OpenUrl Abstract / FREE Full Text 28. ↵ Garrigou , S. et al. A Study of Hypermethylated Circulating Tumor DNA as a Universal Colorectal Cancer Biomarker . Clinical Chemistry 62 , 1 – 11 ( 2016 ). OpenUrl FREE Full Text 29. Barault , L. et al. Discovery of methylated circulating DNA biomarkers for comprehensive non-invasive monitoring of treatment response in metastatic colorectal cancer . Gut 67 , 1995 – 2005 ( 2018 ). OpenUrl Abstract / FREE Full Text 30. ↵ Jin , S. et al. Efficient detection and post-surgical monitoring of colon cancer with a multi-marker DNA methylation liquid biopsy . Proc. Natl. Acad. Sci. U.S.A . 118 , e2017421118 – 8 ( 2021 ). OpenUrl Abstract / FREE Full Text 31. ↵ Lun , F. M. F. et al. Noninvasive prenatal methylomic analysis by genomewide bisulfite sequencing of maternal plasma DNA . Clinical Chemistry 59 , 1583 – 1594 ( 2013 ). OpenUrl Abstract / FREE Full Text 32. ↵ Shen , S. Y. et al. Sensitive tumour detection and classification using plasma cell-free DNA methylomes . Nature 563 , 579 – 583 ( 2018 ). OpenUrl CrossRef PubMed 33. Nassiri , F. et al. Detection and discrimination of intracranial tumors using plasma cell-free DNA methylomes . Nature Medicine 26 , 1 – 12 ( 2020 ). OpenUrl CrossRef PubMed 34. Nuzzo , P. V. et al. Detection of renal cell carcinoma using plasma and urine cell-free DNA methylomes . Nature Medicine 26 , 1 – 11 ( 2020 ). OpenUrl CrossRef PubMed 35. Liu , X. et al. Comprehensive DNA methylation analysis of tissue of origin of plasma cell-free DNA by methylated CpG tandem amplification and sequencing (MCTA-Seq) . Clinical Epigenetics 1 – 13 ( 2019 ) doi: 10.1186/s13148-019-0689-y . OpenUrl CrossRef PubMed 36. Luo , H. et al. Circulating tumor DNA methylation profiles enable early diagnosis, prognosis prediction, and screening for colorectal cancer . Science Translational Medicine 12 , eaax7533 ( 2020 ). OpenUrl Abstract / FREE Full Text 37. Chen , X. et al. Non-invasive early detection of cancer four years before conventional diagnosis using a blood test . Nature Communications 11 , 1 – 10 ( 2020 ). OpenUrl CrossRef PubMed 38. Cao , F. et al. Integrated epigenetic biomarkers in circulating cell-free DNA as a robust classifier for pancreatic cancer . Clinical Epigenetics 1 – 14 ( 2020 ) doi: 10.1186/s13148-020-00898-2 . OpenUrl CrossRef 39. ↵ Liu , M. C. et al. Sensitive and specific multi-cancer detection and localization using methylation signatures in cell-free DNA . Ann. Oncol . 31 , 745 – 759 ( 2020 ). OpenUrl PubMed 40. ↵ Klein , E. A. et al. Clinical validation of a targeted methylation-based multi-cancer early detection test using an independent validation set . Annals of Oncology 32 , 1167 – 1177 ( 2021 ). OpenUrl CrossRef PubMed 41. ↵ Ross , J. P. , Rand , K. N. & Molloy , P. L . Hypomethylation of repeated DNA sequences in cancer . Epigenomics 2 , 245 – 269 ( 2010 ). OpenUrl CrossRef PubMed Web of Science 42. ↵ Burns , K. H . Transposable elements in cancer . Nature Reviews Cancer 17 , 415 – 424 ( 2017 ). OpenUrl CrossRef PubMed 43. ↵ Rodić , N. et al. Long Interspersed Element-1 Protein Expression Is a Hallmark of Many Human Cancers . Am J Pathology 184 , 1280 – 1286 ( 2014 ). OpenUrl CrossRef PubMed 44. ↵ Lanciano , S. et al. Locus-level L1 DNA methylation profiling reveals the epigenetic and transcriptional interplay between L1s and their integration sites . Cell Genom . 100498 ( 2024 ) doi: 10.1016/j.xgen.2024.100498 . OpenUrl CrossRef 45. ↵ Rodriguez-Martin , B. et al. Pan-cancer analysis of whole genomes identifies driver rearrangements promoted by LINE-1 retrotransposition . Nature Genetics 52 , 306 – 319 ( 2020 ). OpenUrl CrossRef PubMed 46. ↵ Scott , E. C. et al. A hot L1 retrotransposon evades somatic repression and initiates human colorectal cancer . Genome Res . 26 , 745 – 755 ( 2016 ). OpenUrl Abstract / FREE Full Text 47. ↵ Gainetdinov , I. V. et al. Hypomethylation of human-specific family of LINE-1 retrotransposons in circulating DNA of lung cancer patients . Lung Cancer 99 , 127 – 130 ( 2016 ). OpenUrl PubMed 48. ↵ Nagai , Y. et al. LINE-1 hypomethylation status of circulating cell-free DNA in plasma as a biomarker for colorectal cancer . Oncotarget 8 , 11906 – 11916 ( 2017 ). OpenUrl PubMed 49. ↵ Robinson , M. D. et al. Statistical methods for detecting differentially methylated loci and regions . Front Genet 5 , 324 ( 2014 ). 50. ↵ Yoder , J. A. , Walsh , C. P. & Bestor , T. H . Cytosine methylation and the ecology of intragenomic parasites . Trends Genet . 13 , 335 – 340 ( 1997 ). OpenUrl CrossRef PubMed Web of Science 51. ↵ Woloszynska-Read , A. et al. Coordinated Cancer Germline Antigen Promoter and Global DNA Hypomethylation in Ovarian Cancer: Association with the BORIS/CTCF Expression Ratio and Advanced Stage . Clinical Cancer Research 17 , 2170 – 2180 ( 2011 ). OpenUrl Abstract / FREE Full Text 52. ↵ Ogino , S. et al. A Cohort Study of Tumoral LINE-1 Hypomethylation and Prognosis in Colon Cancer . JNCI: Journal of the National Cancer Institute 100 , 1734 – 1738 ( 2008 ). OpenUrl CrossRef PubMed Web of Science 53. ↵ Liu , Y. et al. FinaleMe: Predicting DNA methylation by the fragmentation patterns of plasma cell-free DNA . Nat. Commun . 15 , 2790 ( 2024 ). OpenUrl PubMed 54. ↵ Gao , Y. et al. Whole-genome bisulfite sequencing analysis of circulating tumour DNA for the detection and molecular classification of cancer . Clin. Transl. Med . 12 , e1014 ( 2022 ). OpenUrl 55. ↵ Bidard , F.-C. et al. Circulating Tumor Cells and Circulating Tumor DNA Detection in Potentially Resectable Metastatic Colorectal Cancer: A Prospective Ancillary Study to the Unicancer Prodige-14 Trial . Cells 8 , 516 – 13 ( 2019 ). OpenUrl CrossRef PubMed 56. ↵ Cabel , L. et al. Limited Sensitivity of Circulating Tumor DNA Detection by Droplet Digital PCR in Non-Metastatic Operable Gastric Cancer Patients . Cancers 11 , 396 – 10 ( 2019 ). OpenUrl PubMed 57. ↵ Field , A. E. et al. DNA Methylation Clocks in Aging: Categories, Causes, and Consequences . Molecular Cell 71 , 882 – 895 ( 2018 ). OpenUrl CrossRef PubMed 58. ↵ Fraga , M. F. & Esteller , M . Epigenetics and aging: the targets and the marks . Trends Genet . 23 , 413 – 418 ( 2007 ). OpenUrl CrossRef PubMed Web of Science 59. ↵ Pisanic , T. R. et al. Long Interspersed Nuclear Element 1 Retrotransposons Become Deregulated during the Development of Ovarian Cancer Precursor Lesions . Am J Pathology 189 , 513 – 520 ( 2019 ). OpenUrl CrossRef PubMed 60. ↵ Taylor , M. S. et al. Ultrasensitive detection of circulating LINE-1 ORF1p as a specific multi-cancer biomarker . Cancer Discov . ( 2023 ) doi: 10.1158/2159-8290.cd-23-0313 . OpenUrl CrossRef 61. ↵ Adalsteinsson , V. A. et al. Scalable whole-exome sequencing of cell-free DNA reveals high concordance with metastatic tumors . Nature Communications 8 , 1 – 12 ( 2017 ). OpenUrl CrossRef PubMed 62. ↵ Douville , C. et al. Assessing aneuploidy with repetitive element sequencing . Proc. Natl. Acad. Sci. U.S.A . 47 , 201910041 ( 2020 ). 63. ↵ Beroukhim , R. et al. The landscape of somatic copy-number alteration across human cancers . Nature 463 , 899 – 905 ( 2010 ). OpenUrl CrossRef PubMed Web of Science 64. ↵ Kinde , I. , Papadopoulos , N. , Kinzler , K. W. & Vogelstein , B . FAST-SeqS: A Simple and Efficient Method for the Detection of Aneuploidy by Massively Parallel Sequencing . PLoS ONE 7 , e41162 – 8 ( 2012 ). OpenUrl CrossRef PubMed 65. ↵ Belic , J. et al. mFast-SeqS as a Monitoring and Pre-screening Tool for Tumor-Specific Aneuploidy in Plasma DNA . in Circulating Nucleic Acids in Serum and Plasma – CNAPS IX vol. 924 147 – 155 (Circulating Nucleic Acids in Serum and Plasma – CNAPS IX, 2016 ). OpenUrl 66. ↵ Belic , J. et al. Rapid Identification of Plasma DNA Samples with Increased ctDNA Levels by a Modified FAST-SeqS Approach . Clin. Chem . 61 , 838 – 849 ( 2015 ). OpenUrl Abstract / FREE Full Text 67. ↵ Rollins , R. A. et al. Large-scale structure of genomic methylation patterns . Genome Research 16 , 157 – 163 ( 2006 ). OpenUrl Abstract / FREE Full Text 68. ↵ Umetani , N. et al. Prediction of Breast Tumor Progression by Integrity of Free Circulating DNA in Serum . J. Clin. Oncol . 24 , 4270 – 4276 ( 2006 ). OpenUrl Abstract / FREE Full Text 69. ↵ Lennon , A. M. et al. Feasibility of blood testing combined with PET-CT to screen for cancer and guide intervention . Science 369 , ( 2020 ). 70. ↵ Douville , C. & Tomasetti , C . Machine learning to detect the SINEs of cancer . Science translational medicine ( 2024 ) doi: 10.1126/scitranslmed.adi3883 . OpenUrl CrossRef 71. ↵ Annapragada , A. V. et al. Genome-wide repeat landscapes in cancer and cell-free DNA . Sci. Transl. Med . 16 , eadj9283 ( 2024 ). OpenUrl PubMed 72. ↵ Zhou , Q. , et al. Epigenetic analysis of cell-free DNA by fragmentomic profiling . Proc National Acad Sci 119 , e2209852119 ( 2022 ). OpenUrl CrossRef PubMed 73. ↵ Vaisvila , R. et al. Enzymatic methyl sequencing detects DNA methylation at single-base resolution from picograms of DNA . Genome Research ( 2021 ) doi: 10.1101/gr.266551.120 . OpenUrl Abstract / FREE Full Text 74. ↵ Snyder , M. W. , Kircher , M. , Hill , A. J. , Daza , R. M. & Shendure , J . Cell-free DNA Comprises an In Vivo Nucleosome Footprint that Informs Its Tissues-Of-Origin . Cell 164 , 57 – 68 ( 2016 ). OpenUrl CrossRef PubMed 75. ↵ Breiman , L. Random Forests . Mach Learn 45 , 5 – 32 ( 2001 ). OpenUrl CrossRef PubMed Web of Science 76. ↵ Pedregosa , F. et al. Scikit-learn: Machine Learning in Python . Arxiv ( 2012 ). 77. ↵ Boulesteix , A. , Janitza , S. , Kruppa , J. & König , I. R . Overview of random forest methodology and practical guidance with emphasis on computational biology and bioinformatics . Wiley Interdiscip Rev Data Min Knowl Discov 2 , 493 – 507 ( 2012 ). OpenUrl 78. ↵ Barandela , R. , Sánchez , J. S. , García , V. & Rangel , E. Strategies for learning in class imbalance problems . Pattern Recogn 36 , 849 – 851 ( 2003 ). OpenUrl 79. ↵ Basse , C. , et al. Relevance of a molecular tumour board (MTB) for patients’ enrolment in clinical trials: experience of the Institut Curie . ESMO Open 3 , e000339 ( 2018 ). OpenUrl Abstract / FREE Full Text 80. ↵ Quinlan , A. R. & Hall , I. M . BEDTools: a flexible suite of utilities for comparing genomic features . Bioinformatics 26 , 841 – 842 ( 2010 ). OpenUrl CrossRef PubMed Web of Science 81. ↵ Chen , S. , Zhou , Y. , Chen , Y. & Gu , J. fastp: an ultra-fast all-in-one FASTQ preprocessor . Bioinformatics 34 , i884 – i890 ( 2018 ). OpenUrl CrossRef PubMed 82. ↵ Schloss , P. D. et al. Introducing mothur: Open-Source, Platform-Independent, Community-Supported Software for Describing and Comparing Microbial Communities . Appl. Environ. Microbiol . 75 , 7537 – 7541 ( 2009 ). OpenUrl Abstract / FREE Full Text View the discussion thread. Back to top Previous Next Posted October 30, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Non-invasive multi-cancer detection using DNA hypomethylation of LINE-1 retrotransposons Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Non-invasive multi-cancer detection using DNA hypomethylation of LINE-1 retrotransposons Marc Michel , Maryam Heidary , Anissa Mechri , Kévin Da Silva , Marine Gorse , Victoria Dixon , Klaus von Grafenstein , Charline Bianchi , Caroline Hego , Aurore Rampanou , Constance Lamy , Maud Kamal , Christophe Le Tourneau , Mathieu Séné , Ivan Bièche , Cecile Reyes , David Gentien , Marc-Henri Stern , Olivier Lantz , Luc Cabel , Jean-Yves Pierga , François-Clément Bidard , Chloé-Agathe Azencott , Charlotte Proudhon medRxiv 2024.01.20.23288905; doi: https://doi.org/10.1101/2024.01.20.23288905 Share This Article: Copy Citation Tools Non-invasive multi-cancer detection using DNA hypomethylation of LINE-1 retrotransposons Marc Michel , Maryam Heidary , Anissa Mechri , Kévin Da Silva , Marine Gorse , Victoria Dixon , Klaus von Grafenstein , Charline Bianchi , Caroline Hego , Aurore Rampanou , Constance Lamy , Maud Kamal , Christophe Le Tourneau , Mathieu Séné , Ivan Bièche , Cecile Reyes , David Gentien , Marc-Henri Stern , Olivier Lantz , Luc Cabel , Jean-Yves Pierga , François-Clément Bidard , Chloé-Agathe Azencott , Charlotte Proudhon medRxiv 2024.01.20.23288905; doi: https://doi.org/10.1101/2024.01.20.23288905 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Oncology Subject Areas All Articles Addiction Medicine (574) Allergy and Immunology (865) Anesthesia (304) Cardiovascular Medicine (4460) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (611) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15251) Forensic Medicine (31) Gastroenterology (1132) Genetic and Genomic Medicine (6621) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4564) Health Policy (1372) Health Systems and Quality Improvement (1617) Hematology (544) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15938) Intensive Care and Critical Care Medicine (1107) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6642) Nursing (346) Nutrition (1001) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3350) Ophthalmology (981) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1698) Pharmacology and Therapeutics (694) Primary Care Research (714) Psychiatry and Clinical Psychology (5464) Public and Global Health (9259) Radiology and Imaging (2212) Rehabilitation Medicine and Physical Therapy (1372) Respiratory Medicine (1198) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (533) Surgery (715) Toxicology (100) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a039c4c1f8bc593a',t:'MTc4MDEwMjQ3Nw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-21T05:10:58.409756+00:00

License: CC-BY-NC-ND-4.0