DLBCLone: A unified framework for neighbourhood-based genetic subtyping of lymphomas

doi:10.1101/2025.09.18.25335809

DLBCLone: A unified framework for neighbourhood-based genetic subtyping of lymphomas

2025 · doi:10.1101/2025.09.18.25335809

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 60,360 characters · extracted from preprint-html · click to expand

DLBCLone: A unified framework for neighbourhood-based genetic subtyping of lymphomas | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search DLBCLone: A unified framework for neighbourhood-based genetic subtyping of lymphomas Luke Klossok , Kostiantyn Dreval , Manuela Cruz , Jasper C.H. Wong , Sierra Gillis , Brett Collinge , Christian Steidl , David W. Scott , Laura K. Hilton , View ORCID Profile Ryan D. Morin doi: https://doi.org/10.1101/2025.09.18.25335809 Luke Klossok 1 Department of Molecular Biology and Biochemistry, Simon Fraser University , Burnaby, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kostiantyn Dreval 1 Department of Molecular Biology and Biochemistry, Simon Fraser University , Burnaby, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Manuela Cruz 1 Department of Molecular Biology and Biochemistry, Simon Fraser University , Burnaby, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jasper C.H. Wong 2 Centre for Lymphoid Cancer, BC Cancer Research Institute , Vancouver, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sierra Gillis 2 Centre for Lymphoid Cancer, BC Cancer Research Institute , Vancouver, BC, Canada 3 Canada’s Michael Smith Genome Sciences Centre, BC Cancer Research Institute , Vancouver, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Brett Collinge 2 Centre for Lymphoid Cancer, BC Cancer Research Institute , Vancouver, BC, Canada 4 Department of Pathology and Laboratory Medicine, University of British Columbia , Vancouver, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Christian Steidl 2 Centre for Lymphoid Cancer, BC Cancer Research Institute , Vancouver, BC, Canada 4 Department of Pathology and Laboratory Medicine, University of British Columbia , Vancouver, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site David W. Scott 2 Centre for Lymphoid Cancer, BC Cancer Research Institute , Vancouver, BC, Canada 5 Division of Medical Oncology, Department of Medicine, University of British Columbia , Vancouver, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: rdmorin{at}sfu.ca Laura K. Hilton 2 Centre for Lymphoid Cancer, BC Cancer Research Institute , Vancouver, BC, Canada 1 Department of Molecular Biology and Biochemistry, Simon Fraser University , Burnaby, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ryan D. Morin 1 Department of Molecular Biology and Biochemistry, Simon Fraser University , Burnaby, BC, Canada 2 Centre for Lymphoid Cancer, BC Cancer Research Institute , Vancouver, BC, Canada 3 Canada’s Michael Smith Genome Sciences Centre, BC Cancer Research Institute , Vancouver, BC, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ryan D. Morin Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Genetic subtyping of diffuse large B-cell lymphoma (DLBCL) has been slow to gain clinical adoption. Available classifiers either leave many tumours unclassified or depend on exome-wide features and copy-number profiles, which are not always available in routine practice. We introduce DLBCLone, a neighbourhood-based framework that enables panel-aware genetic subtyping compatible with existing taxonomies. DLBCLone learns a 2-D reference map of mutation profiles (UMAP) from a labeled training cohort, freezes this map, and deterministically projects new cases into the same latent space. Class labels are then inferred by weighted K-nearest neighbours, limiting over-assignment by considering the local density of unclassified neighbours. By default, classification thresholds optimize per-class balanced accuracy, but can be adjusted to suit study needs. The framework is intended to emulate (or “clone”) existing schemas such as LymphGen or DLBClass. Trained on a harmonized cohort of 2,130 DLBCLs, DLBCLone classifiers for different gene panels achieved consistently improve classification rates relative to fixed-threshold baselines while maintaining a reasonable per-class performance. On an in-house cohort of 323 patients, it assigned an additional 98 samples without compromising accuracy relative to LymphGen. On an external exome-sequenced subset from a 1,001-patient cohort, DLBCLone achieved a 51% classification rate (vs 36% for LymphGen) at an overall accuracy of 0.70. Compared with another LymphGen approximator (LymphPlex), DLBCLone reached a 74% classification rate (vs 55%). In general, the DLBCLone-reclassified tumours had molecular features consistent with their new labels. DLBCLone provides a deterministic, reproducible, and extensible approach to genetic subtyping under real-world constraints, facilitating prospective studies that rely on either targeted panels or more comprehensive sequencing strategies. DLBCLone is open source and available in the GAMBLR.predict package ( https://github.com/morinlab/gamblr.predict ). Introduction Diffuse large B-cell lymphoma (DLBCL) comprises biologically diverse cancers unified by morphology. Early genomic work divided DLBCL into two gene expression-based molecular subgroups with prognostic implications, with ABC-DLBCL generally showing worse outcomes than GCB-DLBCL 1 . More recently, a “dark-zone” gene expression signature (DZsig) was proposed to further sub-divide GCB-DLBCL, with DZsig-positive cases having outcomes comparable to ABC-DLBCL 2 . Comprehensive sequencing studies have identified >125 recurrently mutated genes in DLBCL, revealing patterns of co-occurrence and mutual exclusivity that motivated genetically defined subgroups 3 , 4 . This work culminated in LymphGen, a probabilistic classifier that assigns tumors to EZB, MCD, BN2, ST2, N1, and A53 5 . EZB can be further sub-divided based on presence of the DZsig gene expression signature or, in lieu of this, MYC translocation status. A separate group proposed a five-class system with each group (clusters C1–C5) broadly aligning with a LymphGen counterpart, with the exception of N1. Results from a third study indicated that at least one of these classes (ST2) could be further sub-divided. Differences across studies reflect both methodological choices and feature scope: for example, the panel-based analysis by Lacy et al could not recover A53/C2 (largely driven by copy-number variants (CNVs)) and did not reproduce N1. A later update from this group reassigned truncating NOTCH1 cases accordingly, bringing the “HMRN” system closer to LymphGen 6 – 8 . As genetic subgroups gained traction, their clinical relevance—and potential therapeutic vulnerabilities—have been explored, including some encouraging post hoc analyses supporting rational combinations in specific subgroups 9 . Despite this, adoption has been slow while the community evaluates technical and analytical barriers to bringing genetic classification to real-world settings. The deliberately conservative thresholds used by LymphGen leave ∼39–47% of cases unclassified (“Other”), which complicates trial accrual 5 . While the original description of the C1-C5 system suggested a nearly complete classification rate, these groupings were originally assigned by clustering, which cannot be used to assign new samples due to the inter-dependence of all samples on cluster membership. A reproducible tool for this (DLBClass) became available only recently, leaving no options other than LymphGen in the interim 10 . In this setting, a demand for alternatives led to the emergence of simplified proxies for genetic classification 11 . LymphPlex, for example, uses a 37-gene panel to produce LymphGen-like labels (e.g., N1-like, ST2-like) and reports higher classification rates with modest panels 12 . To date, LymphPlex is the only framework that has been formally incorporated into a subtype-guided prospective trial 13 , 14 . Despite the appeal of comprehensive sequencing, gene panels persist in clinical workflows, and yet no commercial panels have been designed around LymphGen/DLBClass features. Moreover, because current classifiers were optimized using the full feature universe, their performance can degrade unpredictably when specific genes or CNVs are missing. Collectively, these realities motivate the existence of a tool with flexibility for features and tuneable classification rates. We present DLBCLone, which lets users examine how feature combinations map onto DL-BCL subtypes and build reproducible, panel-aware classifiers à la carte. Unlike fixed-schema methods, DLBCLone is system-agnostic. Given a sufficiently sized labeled (training) cohort, previously analyzed with an available classifier (e.g. LymphGen), DLBCLone can emulate genetic taxonomies and classify new cases deterministically. Rather than requiring the experimental data to cater to the requirements of the classifier, DLBCLone is designed to adapt to the available data while revealing their impact on overall and per-class accuracy. We demonstrate the flexibility of this approach across a variety of gene panels by training on over 2000 DLBCLs from our harmonized cohort. We validate generalizability on independent, previously unseen, real panel-based datasets. Across these external cohorts, DLBCLone yields consistently higher classification rates than fixed-threshold baselines (e.g., LymphGen) while maintaining comparable or improved per-class balanced accuracy, supporting its suitability for prospective studies that rely on panels or exomes. Results Genetic features associated with LymphGen classes Our training data comprises samples from numerous publications that were sequenced with either exome or whole genome sequencing (WGS) and accompanying BCL2 and BCL6 rearrangement status ( Methods ). Noting the range of sequencing depth and a strong association between low coverage and LymphGen classification rate, we began by eliminating samples with exceedingly poor coverage ( Figure 1 ) 15 . The remaining 2130 samples with unambiguous LymphGen assignments were used for all subsequent analyses. Download figure Open in new tab Figure 1: Association between sequencing depth and study on LymphGen class representation. A) The distribution of coverage values across all exome capture (left) and WGS (right) samples. The variability across the exomes was particularly striking, with more than 300 samples having less than 15x average corrected coverage. For the purposes of this analysis, we considered those as QC-Fail. B) There is a clear difference in the success of LymphGen on the samples that passed QC (top) compared to the remaining samples. The proportion of cases assigned to a non-Other class was 45% in the QC-Pass samples. Although the rate of most LymphGen classes was similar, N1 samples were not found in three of the largest cohorts. C) Considering the six largest studies with samples in this analysis, the Reddy cohort was the major contributor to a higher rate of LymphGen-Other. D) The distribution of LymphGen classes in the same cohorts after removing QC-Fail samples. To aid in feature selection for DLBCL classification, we compared the frequency of genetic features between samples based on their assigned LymphGen class ( Methods ). Aiming to develop a flexible method suitable for either gene panels or comprehensive sequencing data, we focused on simple somatic mutations and translocations that are detected in routine clinical practice ( BCL2 and BCL6 ). For select genes, features were derived only from specific mutation classes. For example, we tabulated NFKBIZ 3 ′ UTR mutations and separately annotated MYD88 hotspot and non-hotspot mutations. This analysis recovered nearly all the features used by LymphGen ( Supplemental Table S1 ). Interestingly, 28 additional associations between mutations and at least one LymphGen class were identified ( Table 1 ). While some of these associations were reported in the original description of these classes (e.g. STAT6 and TMEM30A ) 4 , their mutations are not utilized within LymphGen. Some of these genes are, however, utilized by the DLBClass tool, such as TMSB4X , KLHL6 , BRAF , STAT6 , GNA13 and TOX 10 . For some of the remaining genes, the low incidence could explain why these associations were not previously detected. For example, FCGR2B mutations were in less than 4% of EZB samples and CDKN2A mutations occur in less than 7% of MCDs. Though mutations in these genes are not common, CNVs affecting thes loci have been described in GCB and ABC DLBCL, respectively 16 – 18 . When CNV features are given to LymphGen, the presence of CDKN2A deletions (but not mutations) contributes to MCD classification. Two novel features that were strongly enriched in BN2 are non-hotspot mutations in MYD88 and NFKBIZ 3 UTR mutations. Although detectable by exome sequencing, coverage of these regions varies across lymphoma gene panels. As LymphGen does not use these features, we evaluated whether adding them to training improves classification accuracy and robustness. View this table: View inline View popup Table 1: Novel features enriched per LymphGen class. Separation of DLBCL with different feature sets To visualize patterns across many genetic features, we use the Uniform Manifold Approximation and Projection (UMAP), which reduces high-dimensional data into a two-dimensional map where nearby points have more similar mutation profiles. We began by exploring the relationship between selected gene sets and tumours assigned to different LymphGen classes. Across different gene sets, samples of the same class tended to occupy similar regions in UMAP space ( Figure 2 ). The main exception was N1, defined solely by NOTCH1 mutations. Our inhouse panel (LySeqST) showed separation comparable to the LymphGen feature set( Figure 2C ) 19 . This was encouraging because this panel was developed, in part, to apply LymphGen to clinical samples. We noted varying degrees of overlap between ST2 and BN2, most pronounced with the LymphPlex panel and also seen with the Lacy panel ( Figure 2D,E ). The samples labeled Other by LymphGen were diffusely distributed with many lying in the proximity of bona fide class neighborhoods. Download figure Open in new tab Figure 2: UMAP transformation of DLBCLs using different feature sets. Each plot shows the result of UMAP using the a subset of genes as described. BCL2 and BCL6 translocation status was used for A, B and D to be consistent with the methods used in the corresponding studies. All DLBCL samples assigned to a single class by LymphGen were included. Points are coloured based on their class as defined by LymphGen. A) All features that were significantly associated with a LymphGen class in our post-hoc analysis. B) The full set of features used by the LymphGen algorithm. C) The LySeqST 125-gene panel. D) A 93-gene panel used in the validation of LymphGen. E) 78 genes from Lacy et al. This includes only the genes that were used in their final clustering, further restricted to the genes considered Tier 1 DLBCL genes. 3 F) The 37 genes used in the LymphPlex algorithm. DLBCLone performance without tuning We reasoned that if a DLBCL sample could be mapped to this latent space, the class of previously analyzed (i.e. training) samples in the same local neighborhood could allow inference of its LymphGen class by proximity. Considering that some Lymphgen-Other samples have mutations in genes associated with one of the LymphGen classes, we speculated that a datadriven neighbor-based approach could achieve a classification rate exceeding LymphGen by tuning the threshold between assigning a sample to a class rather than Other. To support different use cases, our approach assigns up to two classifications to each sample. The first represents a more generous (“greedy”) strategy that attempts to maximize classification rate. The second results from an optimization between classification rate and accuracy, effectively increasing the barrier for samples to be assigned to any class. To illustrate the general approach, we implemented a DLBCLone model to approximate LymphGen utilizing all genetic features we determined to be significantly associated with a class. The greedy assignments exceeded ∼74% accuracy for all but one class. Each of MCD and EZB were classified with ∼95% accuracy. Unfortunately, a minority of the N1 samples were correctly assigned ( Figure 3A ), which was not surprising given the lack of any clear cohesion of these samples in the UMAP. The outgroup-optimized classifications reduced the classification rate from 96% to 78.3% with only modest reductions in accuracy ( Figure 3B ). Download figure Open in new tab Figure 3: Accuracy of DLBCLone greedy and optimized labels. A basic DLBCLone classifier was trained using all features significantly associated with a LymphGen class (Supplemental Table S1). Alluvial summaries of the relationship between LymphGen assignments and the LymphGen class assigned by DLBCLone. Each ribbon is coloured according to the original (“true”) class of the training sample. The total number and percentage of training samples assigned to the same class by DLBCLone is indicated in boxes in the centre. The coloured boxes on either side indicate the number of samples that changed to another class. Outer boxes are repeated on both sides for clarity and coloured based on the opposite class. A) DLBCLone without optimization (greedy) B) DLBCLone optimized for balance between classification and Other. Influence of feature repertoire and weighting While this performance was encouraged, we explored strategies to further improve accuracy, particularly for under-performing classes such as N1. To bolster the technique, we next investigated approaches to enhance this separation, using the vizualizations of UMAP transformations as a guide. We were particularly motivated to improve separation of classes with more diffuse patterns in the UMAP such as N1 and to reduce the overlap between BN2 and ST2. The weighting strategy we adopted combines sets of related features that support a given class into meta-features, a technique also used by DLBClass. To maintain flexibility, we do not perscribe which genes should contribute to the meta-features. In some scenarios, a user may prefer to optimize recovery of only certain classes, in which case they may choose to provide meta-features only for those. For the following analyses, we used a set of meta-features for each of the LymphGen classes ( Table 2 ). View this table: View inline View popup Download powerpoint Table 2: Mutations used to construct meta-features We sought to quantify the influence of different feature combinations on the ability to infer genetic subroup of a sample based solely on the LymphGen class of samples in its neighborhood. Importantly, the LySeqST panel was designed to cover all LymphGen features whereas the Lacy and LymphPlex panels are each incomplete. We created UMAP transformations for the three panels shown in Figure 3 using the features present in each panel and the metafeatures described above. Figure 4 shows the original UMAPs for the three feature sets using meta-features ( A–C ). There is a more clear separation of most N1 samples from the remaining class neighborhoods. The ST2 and BN2 neighborhoods also show a greater degree of separation, particularly in the UMAP based on our LySeqST panel. The effect of freezing the reference map and projecting the same samples with the locked transform can be seen as well ( Figure 4D–F ). Although the points shift slightly, they preserve their local neighbourhood structures. This deterministic projection is essential for prospective use: new cases can be added and classified in the same coordinate system at any time without refitting or moving the reference map, ensuring consistent classification even when some data are unavailable—or intentionally held out—at training time. Finally, Figure 4G–I colours the same projected points by their new class assignment, illustrating how the local neighbourhoods correspond to common DLBCLone predictions. Download figure Open in new tab Figure 4: Latent representations with feature weighting. A-C) The initial UMAP transformation of the training samples based on the LySeqST genes (A), Lacy genes (B), or LymphPlex genes (C) with weighting applied to a core set of features for each class. Points are coloured by the LymphGen class of the sample. D-F) show the corresponding result after each sample is transformed using the stable UMAP model with umap_transform. G-I) The same points from D-F coloured by their predicted LymphGen class from the DLBCLone optimized model derived from this UMAP. Overall, the classifiers derived from the three panels showed encouraging results ( Figure 5 ). The DLBCLone-LySeqST classifier yielded the highest agreement across LymphGen groups, ranging from 61.6% (ST2) to 95.8% (N1), using the optimized predictions. With a classification rate of 65%, DLBCLone-LySeqST assigned 173 of the LymphGen-Other samples to a new class. The two smaller panels each achieved higher classification rates along with a slight reduction in accuracy across most classes ( Figure 5,B-C ). For DLBCLone-LymphPlex, based on the smallest panel, the accuracy per class ranged from 50% (for ST2) to 100% (for N1). One might conclude from this result that a smaller panel is preferrable for applications that require a higher classfication rate. However, we caution against this interpretation because the quality of classifications for the newly classified samples is difficult to assess, since they (by definition) conflict with what has been considered ground truth. Download figure Open in new tab Figure 5: The training accuracy and classification results are summarized for DLBCLone models trained with four different feature sets A) Genes significantly associated with LymphGen from our post hoc analysis (n=85). B) LySeqST genes (n=125) C) Lacy genes (n=78) D) LymphPlex genes (n=37). For a qualitative assessment, we visualized the classifications from our two best models alongside each sample’s genetic features( Figure 6 ). Samples are arranged to group those with discrepant assignments between LymphGen and DLBCLone. The samples reassigned from a LymphGen class to Other by DLBCLone represent a minority. Encouragingly, while officially unclassified, the greedy DLBCLone assignment was correct for the majority of these. Their assignment to Other by our optimized approach likely reflects different factors considered by LymphGen for in/out membership. In contrast, we were intrigued by the substantial number of samples that transitioned from LymphGen-Other to a class. Considering the number of samples reassigned to each class, the we saw the most growth of BN2, followed by ST2, MCD and EZB. In theory, these result from class-informative features in the full gene set that affect DLBCLone decisions but are considered less important (or ignored) by LymphGen. Indeed, LymphGen considers mutations in only 64 genes of the 125 covered by the LySeqST panel and the 85 features identified from our post hoc analysis. Download figure Open in new tab Figure 6: Mutation patterns of DLBCLs classified by the model trained with LySeqST. This oncoplot includes all samples from the training cohort except those classified as Other by both LymphGen and DLBCLone. The genes are arranged from bottom to top based on the association between their mutations and BN2, EZB, MCD, N1 or ST2 (respectively). The remaining genes (top) were not significantly associated with any LymphGen class. Each sample is annotated at the bottom with its original LymphGen class along with the DLBCLone greedy and optimized predictions using the classifier derived from the LymphGen-enriched features identified through our post hoc analysis of LymphGen ( Figure 5A ). DLBCLone assignments from the classifier trained using the LySeqST panel are also shown ( Figure 5B ). For comparison, the which_max label is a naive class inferred by taking the maximal count of classenriched mutations. Validation of classifiers on external data We assessed generalizability using two external datasets. The first was a 1,001-patient cohort, of whom 337 underwent exome sequencing that the authors used to assign LymphGen labels (the Ruijin cohort) 12 . We applied the DLBCLone model trained on LymphPlex features to infer LymphGen labels for all cases. On the exome subset, our classification rate was 51.2%, exceeding LymphGen’s 36.4% ( Figure 7A ), with an overall accuracy of 0.70. Two caveats apply. First, mutations in genes not on the LymphPlex panel were available to LymphGen but hidden from DLBCLone; notably, 123/1,001 samples had zero LymphPlex genes mutated, forcing an “Other” assignment. Second, the LymphGen results include the A53 class, which DLBCLone was not trained to predict, so the nine A53 samples are counted as errors. Excluding A53, accuracies were similar for DLBCLone and LymphGen (0.728 vs 0.751), and DLBCLone achieved a slightly higher classification rate (52.4% vs 49.5%). Download figure Open in new tab Figure 7: Performance of DLBCLone models trained on external data. A) Concordance between LymphGen class labels from the Ruijin exome cohort and DLBCLone approximations of LymphGen using the same feature set. B) Concordance between LymphPlex and DLBCLone approximations of LymphGen using the LymphPlex panel data from the Ruijin cohort. C) Correspondence between LymphGen class labels from the Lacy study and DLBCLone approximations of LymphGen using the same feature set. D) Correspondence between HMRN class labels from the Lacy cohort and DLBCLone approximations of LymphGen using the same feature set. ST2 was conidered equivalent to both SOCS1/SGK1 and TET2/SGK1. The remainder of the Ruijin cohort was only subjected to panel-based sequencing along with FISH for BCL2 and BCL6 translocations and, together, these were used by the authors to infer LymphGen-like classes using their LymphPlex algorithm 12 . Comparing LymphPlex labels to DLBCLone’s optimized labels showed our ability to achieve a higher classification rate (55% vs 74.3%)( Figure 7B ). If we consider LymphPlex as truth, the accuracy of DLBCLone assignments was 0.648. Again, all samples assigned A53 by LymphPlex were, by definition, considered mis-classified by DLBCLone. This metric also considers the 114 LymphPlex-Other samples given a different class by DLBClone. At least some of the discrepancies represent situations in which DLBCLone outperforms LymphPlex. We trained another classifier for another in-house panel, which was used to sequence a cohort of 323 patients (the BC cohort) 16 . Remarkably, on the validation cohort, DLBCLone (optimized) achieved a classification rate of 88% with an accuracy of 0.85 (excluding LymphGen-Other and A53). An additional 98 samples that were either Other or A53 were assigned to one of the available classes (26 BN2, 36 EZB, 21 MCD, 2 N1 and 13 ST2). The mutation profiles of these re-classified samples, in general, show patterns of translocations, mutations and gene expression (COO) subgroup consistent with their DLBCLone-assigned class ( Figure 8 ; Supplemental Table S2 ). For example, a substantial number of BCL6-translocated samples were reclassified from Other to BN2 by DLBCLone and several ABC samples with the MYD88 hot spot mutation were added to MCD. Similarly, the majority of the new EZB and ST2 samples were GCB. Importantly, these molecular features were intentionally hidden during the training of this classifer to be consistent with the data used for LymphGen assignment in Wright et al 20 . Download figure Open in new tab Figure 8: Mutation patterns of DLBCLs from BC validation cohort. This oncoplot includes all samples that were subjected to targeted sequencing in our previous study 16 . These samples were also used to validate the LymphGen algorithm 5 . The genes are arranged from bottom to top based on the association between their mutations and BN2, EZB, MCD, N1 or ST2 (respectively). The remaining genes (bottom) were not significantly associated with any LymphGen class. Each sample is annotated at the bottom with its LymphGen class from Wright et al . along with the DLBCLone greedy and optimized predictions using a classifier trained on the same features in Figure 2D , which included mutation status of the NFKBIZ 3 UTR. BCL2 and BCL6 translocation status is shown but this information was hidden during DLB-CLone training. Download figure Open in new tab Figure 9: Overview of the DLBCLone procedure for inferring the class for a given sample after projection onto the UMAP alongside the training data for a given K (K = 9). A) The score for each class is first calculated using the Euclidean distances to the nearest K non-Other samples. The greedy label is the class with the maximal value. B) Next, we calculate the ratio between the maximal class score and the local Other score ( ρ ), which is similarly derived from the distances to the Other samples in the same radius. The optimized class assignment retains the greedy label if the magnitude of the original class score exceeds the threshold τ ⋆ or, failing that, ρ exceeds a second threshold ρ ⋆ . Discussion Real-world applications of genetic subtyping for DLBCL are only beginning to emerge, commonly making use of targeted sequencing rather than exomes or genomes 14 , 21 – 23 . As expected, performance of classifiers trained on comprehensive data degrades as the available feature space shrinks. In silico, a ∼400-gene panel classified 58% of the original cohort from Schmitz et al. with ∼92% overall accuracy when CNVs were included 4 , while a real-world application reached ∼55%. Another study using a ∼300-gene custom panel performed similarly (∼53%) 8 , 21 . In practice, nearly half of DLBCL tumours sequenced with these panels are left without a subtype assignment. We found a substantial number of genes whose mutation status is strongly associated with a LymphGen class ( Table 1 ), highlighting a broader feature space that could enhance classification rates and inform custom panel design for DLBCL. Notably, some of the genetic features we identified are already leveraged by DLBClass, possibly contributing to its higher classification rate. Nonetheless, with exome data, DLBClass assigns ∼75% with high confidence, leaving the remainder effectively unclassified. Limiting to mutations from the Lacy panel further reduced this to 61% high-confidence assignments 6 , 8 , 10 . This underscores the need for an approach that can achieve high classification rates with complete genetic data and maintain them within the constraints of a given panel. By training directly on the features queried by a panel, we consistently achieved higher classification rates on the same inputs while maintaining competitive per-class accuracy. We focused on LymphGen in this study because DLBClass requires high-quality copy-number profiles that are harder to resolve from many clinical panels 10 . In principle, however, DLB-CLone can be used to emulate DLBClass because the framework is feature-agnostic and can incorporate copy-number states and other events (e.g., SV/FISH) as additional features. Orthogonal molecular descriptors can also be included as categorical inputs. We did not explore CNV-augmented models here, but this is a natural extension that should improve recovery of CNV-driven classes (e.g., A53/C2) and help disambiguate overlap when mutation-only panels are sparse. Without robust CNV input, we anticipate reduced performance specifically for C2, mirroring DLBClass’s dependence on copy number. While the performance of DLBCLone is encouraging, our results leave some questions unresolved. Some disagreements with LymphGen likely reflect true biological signal—features that are informative but under-weighted or unused by fixed classifiers—rather than error; the same logic extends to LymphGen-Other samples that join a true class. Since discrepancies exist between LymphGen and DLBClass, the true class boundaries remain unsettled across taxonomies. Meta-features improved recovery of N1, whereas other classes emerged more naturally from individual features. Based on our post hoc analysis of LymphGen-classified training samples, we identified TBL1XR1 as enriched for mutations in N1 ( Table 1 ). TBL1XR1 mutations are also strongly associated with the MCD genetic subgroup and can promote the establishment of extranodal lymphomas 24 , motivating further work to determine whether NOTCH1-mutated cases form a distinct class or extend MCD. In this evolving landscape, DLBCLone offers immediate, practical value. Without any new data, it can be used to evaluate the theoretical accuracy of a panel design. It is provided as an open-source R package and can be integrated into pipelines or wrapped in a Shiny user interface 25 . Our training workflow prioritizes flexibility, allowing users to tune the balance between classification rate and accuracy. Each trained model can be frozen and exported for local reuse or shared for broader application, with deterministic projection ensuring reproducible prospective assignment of samples. In short, DLBCLone provides a taxonomy-agnostic approach to DLBCL subtyping that may help bring subtype labeling within reach of more scientists, clinicians, and patients. Methods Patient samples We assembled a cohort of 2542 patients diagnosed with diffuse large B-cell lymphoma (DLBCL). The raw sequencing data used in this study is available from the European Genome-phenome Archive at the European Bioinformatics Institute (EGAS00001007053, EGAS00001001709, EGAS00001004285, EGAS00001002936, EGAS50000000328, EGAS00001004469, EGAS00001006327, EGAS00001001600, EGAS00001001692, EGAS00001002606) 16 , 26 – 34 and can be accessed through a request as detailed on the EGA website. In addition, the analysis was supplemented with the DLBCL whole-genome (WGS) and whole-exome (WES) data available from dbGAP (accessions phs000235, phs000328, phs000527, phs000450, and phs003023) 35 – 42 . Mutation analysis All WGS and WES raw sequencing data was analyzed for sequencing quality according to the previously published guidelines 43 . Simple somatic mutations (SSM) were detected as previously described in detail 37 . Briefly, SSM were called using SLMS-3 pipeline that amalgamates Strelka 44 , SAGE, Lofreq 45 , and mutect 46 to report somatic variants only supported by at least three algorithms. When the raw data was aligned to a different genome build, the identified SSM were converted to the GRCh37 genome build coordinates using Crossmap 47 . The detected variants were further annotated using command line vcf2maf (version 1.6.18) and Variant Effect Predictor (cache version 86). When available, the fluorescence in situ hybridization (FISH) was performed for BCL2 , MYC , and BCL6 , and the FISH results were used for structural variant (SV) annotation. In addition, all WGS tumours were also analyzed for SV breakpoints using both Manta (version 1.6.0) 48 and GRIDSS2.0 (version 2.9.4) 49 . SV calls from both tools were combined using the BioConductor StructuralVariantAnnotation package. Samples with SV connecting MYC , BCL2 , or BCL6 associated with a known recurrent partner locus were also considered positive. Post hoc inference of genetic features associated with LymphGen classes We determined which individual genetic features were enriched in a LymphGen class using Fisher’s exact tests. The analysis included only training-cohort samples with unambiguous LymphGen assignments (i.e., excluding Other and composite labels). For every feature–class pair, we formed a 2×2 table comparing the number of mutated and non-mutated tumors inside the class versus outside the class, and performed a two-sided Fisher’s exact test. Unless noted otherwise, features were considered positively associated with a class when they met a nominal threshold (e.g., 1). The resulting per-class feature lists were used to interpret neighborhood structure, guide meta-feature design, and visualize discrepant assignments in downstream analyses. Encoding and weighting genetic features LymphGen uses a basic binary encoding to designate the presence and absence of any mutation in a gene or a copy number event (when provided). A notable restriction of this is that a synonymous mutation in a gene is given equal weight as a missense mutation. DLBClass employs a more complicated base-4 encoding, using 1 for silent mutations and low-level CNVs, 2 for non-silent mutations and high-level CNVs and 3 for SVs and 0 for the absence of a feature 10 . Using this encoding, we found that UMAP tended to force samples with SVs away from the remaining samples. For our analyses used base-3 encoding except where this was not possible due to limitations of the data sets. Specifically, we reserved 1 for silent mutations in certain genes and 2 for either non-silent mutations or SV. We did not find reasonable justification to include silent mutations in every gene. Instead, we limit this to the genes known to be affected by aberrant somatic hypermutation (aSHM). As some data sets have already removed/hidden silent mutations, we also examined the effect of binary encoding. There was no obvious global differences in the layout of samples for the UMAP derived from these two alternative encodings ( not shown ). Latent representation for neighbor identification We first build a reference embedding from the training cohort. Starting with the encoded feature matrix X (base-3 by default; binary where silent calls are unavailable), we optionally append meta-features and apply per-feature weights to obtain X ′ . We then fit UMAP on X ′ with 2D output, using cosine distance, a fixed random seed, and stored transform. This yields a stable coordinate system (the “reference map”) that is retained for subsequent use in training and prediction. For deployment and for cross-dataset comparability, we freeze the reference map and project samples with UMAP’s out-of-sample transform (e.g., uwot::umap_transform) using the same preprocessing and seed. This step does not refit the layout or move the reference points; each sample is placed near its training-set neighbors in the existing space. As a consequence, projected coordinates for training samples are not identical their original fit coordinates—the original layout reflects a global optimization over all points, whereas the transform places points relative to a fixed neighbor graph. Crucially, local neighborhoods are preserved, and the placement of new samples is deterministic and reproducible across runs and machines. Training DLBCLone uses a K-nearest neighbors (KNN) approach to assign DLBCLs to any genetic classification system ( Figure 8 ). LymphGen was used for this analysis because we were able to run their classifier on our samples. The algorithm relies on the local neighborhood of samples in the UMAP embedding. Importantly, because the UMAP algorithm is influenced by its input data, any subsequent addition of data would normally change the latent representation. We mitigate this by generating a model-based UMAP for a given set of features using a fixed seed and stable reference cohort. Samples are then projected using its out-of-sample transform, which positions each sample near its training-set neighbors while holding the original embedding fixed 50 . This step is accomplished by the make_and_annotate_umap function in our R package (GAMBLR.predict). At this stage, there is no risk of overfitting this model because only the mutations, not the class labels, are considered by UMAP. The next phase in training is to assign every sample to a class (not Other) with the exception of those with zero features, which are assigned Other. The Euclidean distances d between a sample and all K-nearest neighbors in the training set is calculated from the 2-dimensional UMAP coordinates of all training samples. At this stage, the samples in the outgroup (Other) are not considered neigbors. For a test sample i , we retrieve neighbours from the embedded training set, exclude Other neighbours when forming the non-Other neighbourhood, and add a small constant ε = 10 −1 to distances (to avoid dividing by zero). Neighbour weights are inverse-distance: Let be the first K nearest non-Other neighbours, and those among them labeled as class c in the ground truth. For each of the main classes c ≠ Other, we define the score for class c , S c ( i ) as Using this score, we obtain the greedy DLBCLone label for sample i , ŷ greedy ( i ) with ties resolved deterministically by a fixed class order. These computations are implemented in weighted_knn_predict_with_conf and process_votes . We separately quantify the relative density of Other samples in the neighborhood. Let be the radius to the K th non-Other neighbour. We define the Other score for sample i S Other ( i ), as as the sum of the inverse of distances to Other neighbours inside this radius, and the purity ratio To limit over-assignment, we gate the greedy label using (i) the ratio ρ ( i ) and (ii) an absolute top score S max ( i ) = max c S c ( i ). Thresholds ( ρ ⋆ , t ⋆ ) are selected by grid search to maximize macro (per-class) balanced accuracy (optionally with a cap on classification rate). The final prediction is Hyperparameter selection We select K over an odd integer grid (here K ∈ {7, 9, …, 19}) and ( ρ ⋆ , t ⋆ ) by grid search on the training set to maximize a user-chosen objective. By default, the objective is macro balanced accuracy excluding ground-truth Other cases. Alternatives include overall accuracy or a weighted harmonic mean of balanced accuracy, classification rate, and concordance. Ties in arg max are broken by a fixed class order (user-configurable). This procedure is implemented in DLBCLone_optimize_params . Classification For a given feature set, each sample i is represented by a base-3 vector x i ∈ {0, 2} G (gene- or event-level; e.g., MYD88 hotspot, BCL2 / BCL6 translocation). Missing features are treated as absent (0). If synonymous mutations are unavailable, a binary encoding. If feature weights or meta-features were generated during training, the same treatment is applied to the new (previously unseen) samples. The frozen UMAP originally fit from the training data is restored (e.g. from disk). New samples are projected with UMAP transform using a fixed seed, and then assigned a greedy and optimized label via the KNN scores and the learned ( K , ρ ⋆ , t ⋆ ) thresholds ( DLBCLone_predict ). Data Availability No new data was produced in this study. All data used in this study is available through the repositories and study accessions detailed in the text. Acknowledgements The authors gratefully acknowledge funding support from the Canadian Institutes of Health Research. This study was also supported by a Program Project Grant from the Terry Fox Research Institute (#1061,#1108). References 1. ↵ Alizadeh , A. A. et al. Distinct types of diffuse large B-cell lymphoma identified by gene expression profiling . Nature 403 , 503 – 511 ( 2000 ). OpenUrl CrossRef PubMed Web of Science 2. ↵ Ennishi , D. et al. Double-Hit Gene Expression Signature Defines a Distinct Subgroup of Germinal Center B-Cell-Like Diffuse Large B-Cell Lymphoma . Journal of Clinical Oncology: Official Journal of the American Society of Clinical Oncology 37 , 190 – 201 ( 2019 ). OpenUrl PubMed 3. ↵ Coyle , K. M. , Dreval , K. , Hodson , D. J. & Morin , R. D . Audit of B-cell cancer genes . Blood Advances 9 , 2019 – 2031 ( 2025 ). OpenUrl PubMed 4. ↵ Schmitz , R. et al. Genetics and Pathogenesis of Diffuse Large B-Cell Lymphoma . The New England Journal of Medicine 378 , 1396 – 1407 ( 2018 ). OpenUrl CrossRef PubMed 5. ↵ Wright , G. W. et al. A Probabilistic Classification Tool for Genetic Subtypes of Diffuse Large B Cell Lymphoma with Therapeutic Implications . Cancer Cell 37 , 551 – 568.e14 ( 2020 ). OpenUrl CrossRef PubMed 6. ↵ Lacy , S. E. et al. Targeted sequencing in DLBCL, molecular subtypes, and outcomes: A Haematological Malignancy Research Network report . Blood 135 , 1759 – 1771 ( 2020 ). OpenUrl CrossRef PubMed 7. Morin , R. D. & Scott , D. W . DLBCL subclassification: Divide and conquer? Blood 135 , 1722 – 1724 ( 2020 ). OpenUrl PubMed 8. ↵ Runge , H. F. P. et al. Application of the LymphGen classification tool to 928 clinically and genetically-characterised cases of diffuse large B cell lymphoma (DLBCL) . British Journal of Haematology 192 , 216 – 220 ( 2021 ). OpenUrl CrossRef PubMed 9. ↵ Wilson , W. H. et al. Effect of ibrutinib with R-CHOP chemotherapy in genetic subtypes of DLBCL . Cancer Cell 39 , 1643 – 1653.e3 ( 2021 ). OpenUrl CrossRef PubMed 10. ↵ Chapuy , B. et al. DLBclass: A probabilistic molecular classifier to guide clinical investigation and practice in diffuse large B-cell lymphoma . Blood 145 , 2041 – 2055 ( 2025 ). OpenUrl PubMed 11. ↵ Mishina , T. et al. Genetic subtype classification using a simplified algorithm and mutational characteristics of diffuse large B-cell lymphoma in a Japanese cohort . British Journal of Haematology 195 , 731 – 742 ( 2021 ). OpenUrl PubMed 12. ↵ Shen , R. et al. Simplified algorithm for genetic subtyping in diffuse large B-cell lymphoma . Signal Transduction and Targeted Therapy 8 , 145 ( 2023 ). OpenUrl 13. ↵ Shen , Y.-G. et al. Genetic subtype-guided immunochemotherapy in relapsed and refractory diffuse large B cell lymphoma: A phase 2 investigator-initiated nonrandomized clinical trial (GUIDANCE-06) . Signal Transduction and Targeted Therapy 10 , 232 ( 2025 ). OpenUrl 14. ↵ Zhang , M. et al. Genetic subtype-guided immunochemotherapy in diffuse large B cell lymphoma: The randomized GUIDANCE-01 trial . Cancer Cell ( 2023 ) doi: 10.1016/j.ccell.2023.09.004 . OpenUrl CrossRef 15. ↵ Dreval , K. , Boutros , P. C. & Morin , R. D . Minimum Information for Reporting a Genomics Experiment . Blood blood . 2022016095 ( 2022 ) doi: 10.1182/blood.2022016095 . OpenUrl CrossRef 16. ↵ Arthur , S. E. et al. Genome-wide discovery of somatic regulatory variants in diffuse large b-cell lymphoma . Nature communications 9 , 1 – 14 ( 2018 ). OpenUrl PubMed 17. Nowicka , M. et al. Prognostic significance of FCGR2B expression for the response of DLBCL patients to rituximab or obinutuzumab treatment . Blood Advances 5 , 2945 – 2957 ( 2021 ). OpenUrl CrossRef PubMed 18. ↵ Lenz , G. et al. Molecular subtypes of diffuse large B-cell lymphoma arise by distinct genetic pathways . Proceedings of the National Academy of Sciences 105 , 13520 – 13525 ( 2008 ). OpenUrl Abstract / FREE Full Text 19. ↵ Hilton , L. K. et al. Relapse Timing Is Associated With Distinct Evolutionary Dynamics in Diffuse Large B-Cell Lymphoma . Journal of Clinical Oncology 41 , 4164 – 4177 ( 2023 ). OpenUrl CrossRef PubMed 20. ↵ Wright , G. W. et al. A probabilistic classification tool for genetic subtypes of diffuse large b cell lymphoma with therapeutic implications . Cancer cell 37 , 551 – 568 ( 2020 ). OpenUrl CrossRef PubMed 21. ↵ Zhu , M.-L. et al. Validation of LymphGen classification on a 400-gene clinical nextgeneration sequencing panel in diffuse large B-cell lymphoma: Real-world experience from a cancer center . Haematologica 109 , 2326 – 2330 ( 2024 ). OpenUrl PubMed 22. Melani , C. et al. Combination Targeted Therapy in Relapsed Diffuse Large B-Cell Lymphoma . The New England Journal of Medicine 390 , 2143 – 2155 ( 2024 ). OpenUrl CrossRef PubMed 23. ↵ Krupka , J. A. et al. The DIRECT study: A roadmap for ctDNA-based risk prediction, molecular profiling and MRD detection in Diffuse Large B Cell Lymphoma . 2025.04.14.25325806 ( 2025 ) doi: 10.1101/2025.04.14.25325806 . OpenUrl Abstract / FREE Full Text 24. ↵ Venturutti , L. et al. TBL1XR1 Mutations Drive Extranodal Lymphoma by Inducing a Pro-tumorigenic Memory Fate . Cell 182 , 297 – 316.e27 ( 2020 ). OpenUrl CrossRef PubMed 25. ↵ Chang , W. et al. Shiny: Web Application Framework for r . ( 2025 ). 26. ↵ Hilton , L. K. et al. Relapse timing is associated with distinct evolutionary dynamics in diffuse large b-cell lymphoma . Journal of Clinical Oncology 41 , 4164 – 4177 ( 2023 ). OpenUrl CrossRef PubMed 27. Kridel , R. et al. Histological transformation and progression in follicular lymphoma: A clonal evolution study . PLoS medicine 13 , e1002197 ( 2016 ). OpenUrl 28. Hilton , L. K. et al. The double-hit signature identifies double-hit diffuse large b-cell lymphoma with genetic events cryptic to FISH. Blood , The Journal of the American Society of Hematology 134 , 1528 – 1532 ( 2019 ). OpenUrl 29. Chong , L. C. et al. High-resolution architecture and partner genes of MYC rearrangements in lymphoma with DLBCL morphology . Blood advances 2 , 2755 – 2765 ( 2018 ). OpenUrl Abstract / FREE Full Text 30. Rushton , C. K. et al. Genetic and evolutionary patterns of treatment resistance in relapsed b-cell lymphoma . Blood advances 4 , 2886 – 2898 ( 2020 ). OpenUrl CrossRef PubMed 31. Nadeu , F. et al. Detection of early seeding of richter transformation in chronic lymphocytic leukemia . Nature medicine 28 , 1662 – 1671 ( 2022 ). OpenUrl CrossRef PubMed 32. Hubschmann , D. et al. Mutational mechanisms shaping the coding and noncoding genome of germinal center derived b-cell lymphomas . Leukemia 35 , 2002 – 2016 ( 2021 ). OpenUrl PubMed 33. Pan-cancer analysis of whole genomes . Nature 578 , 82 – 93 ( 2020 ). OpenUrl CrossRef PubMed 34. ↵ Reddy , A. et al. Genetic and functional drivers of diffuse large b cell lymphoma . Cell 171 , 481 – 494 ( 2017 ). OpenUrl CrossRef PubMed 35. ↵ Morin , R. D. et al. Mutational and structural analysis of diffuse large b-cell lymphoma using whole-genome sequencing. Blood , The Journal of the American Society of Hematology 122 , 1256 – 1265 ( 2013 ). OpenUrl 36. Morin , R. D. et al. Frequent mutation of histone-modifying genes in non-hodgkin lymphoma . Nature 476 , 298 – 303 ( 2011 ). OpenUrl CrossRef PubMed Web of Science 37. ↵ Thomas , N. et al. Genetic subgroups inform on pathobiology in adult and pediatric burkitt lymphoma . Blood 141 , 904 – 916 ( 2023 ). OpenUrl CrossRef PubMed 38. Pasqualucci , L. et al. Inactivating mutations of acetyltransferase genes in b-cell lymphoma . Nature 471 , 189 – 195 ( 2011 ). OpenUrl CrossRef PubMed Web of Science 39. Jain , M. D. et al. Whole-genome sequencing reveals complex genomic features underlying anti-CD19 CAR t-cell treatment failures in lymphoma. Blood , The Journal of the American Society of Hematology 140 , 491 – 503 ( 2022 ). OpenUrl 40. Chapuy , B. et al. Molecular subtypes of diffuse large b cell lymphoma are associated with distinct pathogenic mechanisms and outcomes . Nature medicine 24 , 679 – 690 ( 2018 ). OpenUrl CrossRef PubMed 41. Lohr , J. G. et al. Discovery and prioritization of somatic mutations in diffuse large b-cell lymphoma (DLBCL) by whole-exome sequencing . Proceedings of the National Academy of Sciences 109 , 3879 – 3884 ( 2012 ). OpenUrl Abstract / FREE Full Text 42. ↵ Schmitz , R. et al. Genetics and pathogenesis of diffuse large b-cell lymphoma . New England journal of medicine 378 , 1396 – 1407 ( 2018 ). OpenUrl CrossRef PubMed 43. ↵ Dreval , K. , Boutros , P. C. & Morin , R. D . Minimal information for reporting a genomics experiment. Blood , The Journal of the American Society of Hematology 140 , 2549 – 2555 ( 2022 ). OpenUrl 44. ↵ Kim , S. et al. Strelka2: Fast and accurate calling of germline and somatic variants . Nature methods 15 , 591 – 594 ( 2018 ). OpenUrl PubMed 45. ↵ Wilm , A. et al. LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets . Nucleic acids research 40 , 11189 – 11201 ( 2012 ). OpenUrl CrossRef PubMed Web of Science 46. ↵ Benjamin , D. et al. Calling somatic SNVs and indels with Mutect2 . BioRxiv 861054 ( 2019 ). 47. ↵ Zhao , H. et al. CrossMap: A versatile tool for coordinate conversion between genome assemblies . Bioinformatics 30 , 1006 – 1007 ( 2014 ). OpenUrl CrossRef PubMed Web of Science 48. ↵ Chen , X. , et al. Manta: Rapid detection of structural variants and indels for germline and cancer sequencing applications . Bioinformatics 32 , 1220 – 1222 ( 2016 ). OpenUrl CrossRef PubMed 49. ↵ Cameron , D. L. et al. GRIDSS: Sensitive and specific genomic rearrangement detection using positional de bruijn graph assembly . Genome research 27 , 2050 – 2060 ( 2017 ). OpenUrl Abstract / FREE Full Text 50. ↵ Melville , J. Uwot: The Uniform Manifold Approximation and Projection (UMAP) Method for Dimensionality Reduction . ( 2025 ). View the discussion thread. Back to top Previous Next Posted September 19, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following DLBCLone: A unified framework for neighbourhood-based genetic subtyping of lymphomas Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share DLBCLone: A unified framework for neighbourhood-based genetic subtyping of lymphomas Luke Klossok , Kostiantyn Dreval , Manuela Cruz , Jasper C.H. Wong , Sierra Gillis , Brett Collinge , Christian Steidl , David W. Scott , Laura K. Hilton , Ryan D. Morin medRxiv 2025.09.18.25335809; doi: https://doi.org/10.1101/2025.09.18.25335809 Share This Article: Copy Citation Tools DLBCLone: A unified framework for neighbourhood-based genetic subtyping of lymphomas Luke Klossok , Kostiantyn Dreval , Manuela Cruz , Jasper C.H. Wong , Sierra Gillis , Brett Collinge , Christian Steidl , David W. Scott , Laura K. Hilton , Ryan D. Morin medRxiv 2025.09.18.25335809; doi: https://doi.org/10.1101/2025.09.18.25335809 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Hematology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6600) Geriatric Medicine (668) Health Economics (997) Health Informatics (4538) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (541) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3333) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9232) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00d41278fef593a',t:'MTc3OTYzNTcxMw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00