Full text
70,107 characters
· extracted from
preprint-html
· click to expand
Anticlustering for Sample Allocation To Minimize Batch Effects | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Anticlustering for Sample Allocation To Minimize Batch Effects View ORCID Profile Martin Papenberg , Cheng Wang , View ORCID Profile Maïgane Diop , Syed Hassan Bukhari , View ORCID Profile Boris Oskotsky , View ORCID Profile Brittany R. Davidson , Kim Chi Vo , Binya Liu , Juan C. Irwin , View ORCID Profile Alexis Combes , View ORCID Profile Brice Gaudilliere , Jingjing Li , David K. Stevenson , View ORCID Profile Gunnar W. Klau , View ORCID Profile Linda C. Giudice , View ORCID Profile Marina Sirota , View ORCID Profile Tomiko T. Oskotsky doi: https://doi.org/10.1101/2025.03.03.641320 Martin Papenberg 1 Department of Experimental Psychology, Heinrich Heine University Düsseldorf , Düsseldorf, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Martin Papenberg Cheng Wang 2 Eli and Edythe Broad Center of Regeneration Medicine and Stem Cell Research, University of California , San Francisco, CA, USA 3 Bakar Computational Health Sciences Institute, University of California , San Francisco, CA, USA 4 Parker Institute for Cancer Immunotherapy, University of California , San Francisco, CA, USA 5 Department of Neurology, School of Medicine, University of California , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Maïgane Diop 6 Department of Anesthesiology, Perioperative and Pain Medicine, Stanford University , Stanford, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Maïgane Diop Syed Hassan Bukhari 3 Bakar Computational Health Sciences Institute, University of California , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Boris Oskotsky 3 Bakar Computational Health Sciences Institute, University of California , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Boris Oskotsky Brittany R. Davidson 7 UCSF CoLabs, University of California , San Francisco, San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Brittany R. Davidson Kim Chi Vo 8 Center for Reproductive Sciences, Department of Obstetrics, Gynecology and Reproductive Sciences, University of California , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Binya Liu 8 Center for Reproductive Sciences, Department of Obstetrics, Gynecology and Reproductive Sciences, University of California , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Juan C. Irwin 8 Center for Reproductive Sciences, Department of Obstetrics, Gynecology and Reproductive Sciences, University of California , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alexis Combes 7 UCSF CoLabs, University of California , San Francisco, San Francisco, CA, USA 9 Department of Medicine, University of California , San Francisco, San Francisco, CA, USA 10 Department of Pathology, University of California , San Francisco, San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alexis Combes Brice Gaudilliere 6 Department of Anesthesiology, Perioperative and Pain Medicine, Stanford University , Stanford, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Brice Gaudilliere Jingjing Li 2 Eli and Edythe Broad Center of Regeneration Medicine and Stem Cell Research, University of California , San Francisco, CA, USA 3 Bakar Computational Health Sciences Institute, University of California , San Francisco, CA, USA 4 Parker Institute for Cancer Immunotherapy, University of California , San Francisco, CA, USA 5 Department of Neurology, School of Medicine, University of California , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site David K. Stevenson 11 Department of Pediatrics, Stanford University , Stanford, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gunnar W. Klau 12 Algorithmic Bioinformatics, Institute for Computer Science, Heinrich Heine University Düsseldorf , Düsseldorf, Germany 13 Center for Digital Medicine, Heinrich Heine University Düsseldorf , Düsseldorf, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Gunnar W. Klau Linda C. Giudice 8 Center for Reproductive Sciences, Department of Obstetrics, Gynecology and Reproductive Sciences, University of California , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Linda C. Giudice Marina Sirota 3 Bakar Computational Health Sciences Institute, University of California , San Francisco, CA, USA 14 Department of Pediatrics, University of California San Francisco , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Marina Sirota Tomiko T. Oskotsky 3 Bakar Computational Health Sciences Institute, University of California , San Francisco, CA, USA 15 Division of Clinical Informatics and Digital Transformation, Department of Medicine, University of California San Francisco , San Francisco, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tomiko T. Oskotsky For correspondence: tomiko.oskotsky{at}ucsf.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract High throughput sequencing is a powerful tool for processing large amounts of DNA and RNA samples in batches. Proper experimental design and statistical methods are required to mitigate systematic technical factors due to differences in batches (“batch effects”), as data variation due to these non-biological factors can mask actual biological differences. We propose using anticlustering as an automated method to assign samples to balanced batches. Anticlustering effectively negates differences in (numeric and/or categorical) covariates among batches, and implements user-defined restrictions on the number of batches, the number of samples per batch, and whether to assign certain samples to the same batch (“must-link constraints”). A simulation study shows that anticlustering is better at achieving balance among batches than previous approaches. An application from the UCSF-Stanford Endometriosis Center for Discovery, Innovation, Training and Community Engagement (“ENACT”, https://enactcenter.org/ ) is presented as a real-life example. In the application, multiple samples provided by an individual had to be processed on the same batch, so that comparisons among different samples of the same patient were not diluted by batch effects. The novel Two Phase Must Link (2PML) anticlustering algorithm realized the must-link restrictions while simultaneously obtaining balance among batches regarding disease stage, menstrual cycle phase, case versus control sample, and clinical site. All methods presented here are accessible via the free and open source R package anticlust ( https://cran.r-project.org/package=anticlust ). An interactive visualization and web-based batch assignment tool are made available in the Rshiny app “anticlust” ( https://anticlust.org/ ). Background/Introduction With advances in high throughput technologies in recent decades, increasing amounts of data have become available for research, including genomics and epigenomics, bulk, single-cell, and single-nucleus transcriptomics, proteomics, and metabolomics. High throughput sequencing (i.e., “next generation sequencing”), for example, is a powerful method for sequencing large amounts of DNA and RNA that is faster and more cost-efficient than traditional sequencing techniques (e.g., Sanger method), and has contributed to an acceleration of the rate of scientific discovery. Unless the number of samples is sufficiently small to fit in a single batch, samples that undergo high throughput analyses are generally processed in multiple batches. However, systematic technical factors (“batch effects”) can arise when samples are processed in different batches. 1 – 3 Proper experimental design and statistical methods are required to mitigate batch effects as data variation due to these non-biological factors can mask actual biological differences. 1 – 4 Unless batch effects are appropriately addressed, conclusions from downstream analyses can be misleading or erroneous. 1 – 3 Computational packages including sva 5 and ComBat 6 can perform batch correction; however, these programs cannot correct all issues with batches. For example, if all samples of patients with a particular condition were assigned to one batch and another batch contained only samples from patients without this condition, then even with batch correction algorithms, it would remain difficult to discern whether differences between the patients’ samples were due to the condition or technical reasons. Hence, it is critical to account for sample variation at the experimental design stage, e.g., avoiding uneven batch sizes, and having balanced classes across batches. 1 – 3 , 7 Resources such as OSAT 8 and, more recently, a propensity score (PS)-based method 9 have been proposed to facilitate the allocation of samples to batches so that the impact of batch effects can be minimized. While these approaches have been applied to facilitate real-life batch allocation problems, they come with limitations that reduce their applicability. For example, OSAT is limited to handling categorical variables such as biological sex or race, while numeric variables such as age are not supported (or have to be categorized artificially). The PS-based approach has so far only been implemented to handle up to 4 batches, while real-life applications in high throughput sequencing often have to deal with more batches. Moreover, neither of these approaches implements additional constraints that can be crucial for obtaining valid inferences in experiments. Similar to how teachers want to assign a class of students to equitable groups while keeping some students together in the same group, researchers want to assign samples to balanced batches while keeping some samples (e.g., those from one individual) together in the same batch. There are occasions when researchers want to distribute samples across batches so that the batches have similar representations for features of interest and, at the same time, keep certain samples together on the same batch so that they can be analyzed without concerns of batch effect. For example, if researchers have more than one sample from each patient in their study and they want to compare an individual patient’s samples as well as compare all patients’ samples, then ideally all samples for a particular patient would be on just one batch (not multiple batches) while having balance across the batches for the study. In the context of cluster analysis, these kinds of requirements have been coined must-link constraints 10 , and implementing them can be beneficial in the context of batch assignment. In this paper, we propose using anticlustering as an automated and improved process to assign samples to balanced batches. Anticlustering is a method that divides a set of elements into disjunct groups, with the aim of maximizing similarity among the different groups. 11 – 13 Papenberg and Klau 11 presented the free and open source R package anticlust that implements a variety of algorithms for anticlustering. Since its introduction, it has been used for diverse tasks in a wide range of research fields. 14 – 20 To the best of our knowledge, however, anticlustering has not previously been applied for batch assignment in high throughput sequencing or other high throughput molecular profiling technologies. To close this gap, we provide readers with an introduction to anticlustering, putting a practical focus on batch assignment in high throughput sequencing. Moreover, we introduce a novel method and implementation to include must-link constraints with anticlustering. To evaluate the usefulness of anticlustering for batch assignment, we conducted an extensive simulation study that compared the anticlust package 11 to the existing OSAT package and the PS-based method. We demonstrate that anticlust’s ability to assign samples to balanced batches is better than that of these existing methods and anticlust’s must-link constraints did not decrease batch balance considerably. Anticlust batch-assignment and must-link features have already been implemented by the UCSF-Stanford Endometriosis Center for Discovery, Innovation, Training and Community Engagement (“ENACT”, https://enactcenter.org/ ) for the center’s projects leveraging single-nuclei transcriptomics and other molecular profiling technologies to elucidate mechanisms involved in endometriosis. 21 Readers who wish to apply anticlustering in their research are provided with additional documentation and code examples in the online supplementary materials ( https://osf.io/eu5gd/ ). All code and data required to reproduce the analyses presented in this paper are openly available from this repository. All anticlustering methods presented here are also openly available and have been implemented in anticlust ( https://cran.r-project.org/package=anticlust ). Results Anticlustering algorithm The standard algorithm anticlust uses to optimize balance among batches is an exchange method that consists of two steps. 11 , 22 First, it randomly assigns samples to batches, while adhering to cardinality constraints imposed by the researcher. Our algorithm allows any predefined number of samples per batch, whereas the most common requirement is that each batch must have the same number of samples. The initialization step is followed by an optimization process that systematically exchanges samples between batches. Each exchange is chosen in such a way that it leads to the largest possible improvement in balance (or similarity) among batches. Similarity among batches is quantified through a mathematical objective function that is known from cluster analysis; in cluster analysis, however, samples would be assigned in such a way that the objective function is minimal, while in anticlustering it has to be maximal. We used the popular diversity objective, which is based on a mathematical notion of pairwise dissimilarity among individual samples. 12 In particular, the diversity is computed as the sum of all distances among samples belonging to the same batch (see Figure 1 ). As the basic distance measure for computing the diversity, we use the Euclidean or squared Euclidean distance based on features, such as age or body mass index (BMI), of the individuals from whom the samples were obtained. Via binary encoding, the diversity can also incorporate categorical variables such as race or disease stage, which are oftentimes considered as parameters when striving for balance among batches. 8 , 9 To ensure comparable weight of all features when computing the diversity, it is recommended to standardize the input variables before computing the pairwise distances. Standardization is particularly useful if the variables differ strongly in their ranges. 23 Download figure Open in new tab Figure 1. Representation of the conversion from numeric features to Euclidean distance, and (anti)clustering assignments based on minimum and maximum diversity using the Euclidean distance. Panel A illustrates 12 hypothetical values of BMI and age in a scatter plot. Panel B represents the Euclidean distances between features as a straight line in the two-dimensional space. The Euclidean distance is proportional to the length of the connecting lines in panel B. Panel C illustrates a clustering assignment of the 12 data points to K = 3 equal-sized groups via minimum diversity. Panel D illustrates an anticlustering assignment of the 12 data points to K = 3 equal-sized groups via maximum diversity. The diversity is computed as the sum of within-cluster distances, which are highlighted in Panel C and Panel D through connecting lines. Maximizing the Euclidean diversity simultaneously leads to similar distribution of the input features among batches (for equal-sized, balanced batches). To add must-link constraints to anticlustering, we propose a new algorithm called Two Phase Must Link (2PML). Phase 1 is a straightforward adaptation of the standard algorithm for unconstrained anticlustering. To initialize Phase 1, we enforce the must-link constraints through a randomized bin packing algorithm. During the following optimization step, we use a modified representation of the data set in which all samples that must be linked together are treated as a single unit: a must-link clique . Using the modified data set, the optimization step applies the same exchange process as the unconstrained algorithm, with the exception that exchanges are restricted to cliques of the same size, thus ensuring that batches remain filled according to the cardinality requirements (e.g., equal-sized batches). Phase 2 drops the restriction of performing exchanges between cliques of the same size and systematically improves over the assignment found in Phase 1. The Methods section describes both phases of 2PML in detail. Simulation Study To thoroughly evaluate the usefulness of anticlustering for batch assignment, we compared it to two alternative approaches in a large-scale simulation study: (1) the OSAT method by Yan et al. 8 , which already has been used frequently for the purpose of batch assignment 24 – 26 , and (2) propensity score batch assignment (PSBA) by Carry et al. 9 , which has been proposed more recently and has not yet been cited by other researchers. As a secondary contribution of the simulation study, we investigated whether the quality of batch assignment is reduced when must-link constraints are included with anticlustering, as compared to an unconstrained anticlustering assignment. The simulation was implemented using the R programming language (version 4.4.2 27 on an Intel i7-10700 computer (4.800GHz x 8) with 16 GB RAM, running Ubuntu 20.04.6 LTS. To obtain a fair comparison to the alternative approaches, anticlustering was implemented using the default settings of the anticlust package (Version 0.8.9-1) that—despite numerous updates—have remained unchanged since its original presentation. 11 That is, we optimized the diversity objective via the exchange method, using the Euclidean distance as the measure of pairwise dissimilarity. Anticlustering subject to must-link constraints used the novel 2PML algorithm, applying one iteration of Phase 1 and one iteration of Phase 2, respectively. For OSAT, we used the implementation that is freely available as an R package from Bioconductor (Version 1.52.0). 28 We used the default OSAT algorithm optimal shuffle with 5,000 repetitions. We also evaluated the alternative algorithm optimal block , but decided not to include it in the final large-scale simulation, because it provided comparable results while running at a significantly slower pace. Carry et al. 9 provided an R implementation of PSBA that is available from an accompanying online repository ( https://github.com/carryp/PS-Batch-Effect ). In the original publication, the authors presented a version of PSBA that investigates all possible batch assignments and selects the one that minimizes discrepancy in average propensity scores among batches. However, investigating all possible assignments is only feasible for small data sets and therefore not generally applicable. In the online repository, the authors therefore included an implementation that randomly generates a user-defined number of batch assignments, and selects the best one among them. For comparability with OSAT, we set the number of random assignments to 5,000. For the simulation, we generated 10,000 data sets. Data sets were processed via (a) OSAT, (b) PSBA, (c) unconstrained anticlustering and (d) anticlustering subject to must-link constraints. Because the OSAT method is only applicable to categorical variables, we generated categorical variables in our simulation. For anticlustering and PSBA, the categorical variables were binary-coded before the methods were applied. OSAT directly used the categorical variables as input because its objective function is computed using category frequencies. For each data set, we randomly determined the number of categorical variables M (2-5), the number of classes per variable P (2-5), the total sample size N (between 50 and 500), and the number of batches K (2, 4, or 10 equal-sized batches). PSBA was only applied for K = 2 and K = 4 batches because the authors’ implementation only allows the assignment to a maximum of four batches. For constrained anticlustering, must-link constraints were generated by creating a random integer (between 1 and N ) for each of the N samples, which served as an ID variable; that is, if two samples were randomly assigned the same ID, they were required to be assigned to the same batch. This procedure resulted in a distribution of constraints that was similar to the distribution of constraints in our motivating application: 58% of all samples had no must-link partner; 29% had 1 must-link partner; 10% had 2 must-link partners, and 3% had 3 or more must-link partners. Across the 10,000 simulation runs, the average run time for the competing assignment methods was 0.10s for unconstrained anticlustering, 0.12s for anticlustering with the must-link feature, 3.68s for OSAT, and 31.81s for PSBA, making anticlustering about 29 and 311 times faster than OSAT and PSBA, respectively. For each simulation run for each of the four methods, we computed a χ 2 test to assess the balance among batches for each of the 2-5 variables, as performed by Yan et al. 8 In this context, a higher p -value indicates that there is more balance among batches, i.e., that the batches are more similar with regard to the covariates. For 84% of all variables that were generated during the simulation, balance among batches was better when using anticlustering as compared to OSAT. Balance was equal in 16% of all comparisons, and only in 0.3% (= 105 of all 34890 pairwise comparisons), OSAT outperformed anticlust. For 78% of all variables that were generated during the simulation, balance was better when using anticlustering as compared to PSBA. Balance was equal in 22% of all comparisons, and only in 0.13% (= 31 of all 23090 pairwise comparisons), PSBA outperformed anticlustering. Figure 2 illustrates average p -values in dependence on the number of variables and the number of batches. When increasing the number of variables from 2 to 5, the average p -value for OSAT declined from 0.99 to 0.72 whereas the average p -value for anticlustering remained greater than 0.99. PSBA also demonstrated a decrease in p -value when increasing the number of variables, but less so than OSAT (from 0.99 to 0.91). In the online supplementary materials, we provide Figures illustrating the simulation results while aggregating across the other variables that varied in the simulation (Figure 3: number of samples; Figure 4: number of categories per variable). Download figure Open in new tab Figure 2. Average p -values from χ 2 tests, based on the number of batches and the number of variables. Higher p -values indicate higher balance. Anticlustering maintained a comparable level of balance across all conditions. OSAT’s performance decreased the most with an increasing number of variables. Remarkably, the constrained anticlustering assignment led to better balance than the OSAT and PSBA methods that did not employ any constraints (see Figure 2 ). In general, using must-link constraints hardly affected the balance among batches: On average, the anticlustering assignment that was subjected to must-link constraints achieved 99.9% of the objective value of the unconstrained assignment. In an astonishing 75% of all cases, balance was not at all reduced by the must-link constraints. Hence, must-link constraints are not only desirable from a researcher’s point of view, but they also do not contribute to a considerable decrease in batch balance. Application The ENACT Center’s work is concerned with researching endometriosis disease. One of the ENACT projects required assigning 320 tissue samples of women who either have the disease (i.e., case) or not (i.e., controls) to 20 batches (16 samples per batch). To illustrate the application, we prepared a synthetic data set that resembles the actual data set, which is not disclosed for reasons of medical confidentiality. In the synthetic dataset, all women who did not have the disease ( n = 50) provided exactly one sample, and all women who have the disease ( n = 89) provided multiple samples: 40 patients provided 2 samples, 25 patients provided 3 samples, 11 patients provided 4 samples, 10 patients provided 5 samples, and 6, 7, or 8 samples were provided by one patient, respectively. In total, the synthetic data set consisted of 320 samples from 139 unique individuals. For the assignment, we sought balance with regard to four categorical variables: is the disease present (“yes” = case sample; “no” = control sample); stage of disease (“none”, “Stage I or II”, “Stage III or IV”); clinical site (University of California San Francisco (“UCSF”) or other); menstrual cycle phase (proliferative (“PE”) or secretory (“SE”)). Other demographic variables such as age and BMI were also assessed, but not deemed to be critical covariates for the assignment. However, post-hoc checks revealed that no significant discrepancies occurred in these other variables, even though they were not explicitly considered via anticlustering. In our actual experiment, we only used the results of the must-link restricted assignment. However, for the purpose of illustration, we also applied unconstrained anticlustering and OSAT to show what level of balance can be achieved when no constraints are imposed. Tables 1 - 3 illustrate the distribution of the categorical variables across batches for unconstrained anticlustering, OSAT, and constrained anticlustering, respectively. The tables were generated using the R package tableone 29 . Table cells contain both information about frequencies and percentages (in parentheses), whether a batch represents only individuals whose samples are unique to that batch (“True” or “False”), and p-values from χ 2 tests for the comparison of the four relevant covariates across the batches. The unconstrained methods successfully balanced all four variables among batches (all p -values > .999 for anticlustering; all p -values > .99 for OSAT). While all assignments were conducted using the 320 individual samples as input—as opposed to the 139 individual patients—we also verified the satisfactory balance on the level of individuals (see the upper rows in Tables 1 - 3 ). Table 3 illustrates the level of balance achieved by constrained anticlustering, which ensured that samples belonging to the same patient were assigned to the same batch. Note that in this application, the must-link constraints put rather severe restrictions on the assignments that were possible; for example, up to 50% of a batch had to be occupied with the samples of a single patient. Still, batches turned out to be highly balanced when including them via the 2PML algorithm (all p-value s > .99). Remarkably, our method filled each of the 20 batches with at least one of the 21 individuals with disease Phase I or II (see section Individuals in Table 3 ). View this table: View inline View popup Download powerpoint Table 1. Summary statistics on an individual- and sample level for the batches assigned using the unrestricted anticlustering method, with p -values from χ 2 tests. View this table: View inline View popup Download powerpoint Table 2. Summary statistics on an individual- and sample level for the batches assigned using the OSAT method, with p -values from χ 2 tests. View this table: View inline View popup Download powerpoint Table 3. Summary statistics on an individual- and sample level for the batches assigned using the anticlustering method with the must-link feature, with p -values from χ 2 tests. An interactive visualization of batch assignment for our synthetic dataset with OSAT and anticlust and the balance on a sample level achieved by these methods, as reflected in Tables 1 - 3 , as well as a web-based batch assignment tool, are made available in the Rshiny app “anticlust”, https://anticlust.org/ . Discussion High throughput molecular profiling technologies commonly require that samples are processed in batches, while confounding effects among batches should be minimized as much as possible. 8 , 9 Though post-hoc corrections during statistical analysis are possible to compensate for imbalances in covariates, exercising control on an a priori level—i.e., creating batches that are as similar as possible on relevant covariates—is crucial as well. 7 Previous methods for batch assignment include OSAT, which can be used to minimize discrepancy on categorical covariates, and PSBA, which minimizes discrepancy among batches with regard to average propensity scores. We introduced anticlustering as a robust and improved alternative for assigning samples to batches, enabling similarity across categorical and numerical covariates. While anticlustering has already been used in a variety of settings, 14 – 20 it has not yet been put to use in the context of batch assignment, which poses additional challenges as the integration of must-link constraints. We presented a motivating example to illustrate how anticlustering can be used to effectively assign samples to batches. The example reproduced a real-life application at the UCSF-Stanford Endometriosis Center for Discovery, Innovation, Training and Community Engagement. In our application, we strove for similarity among batches regarding (1) case vs control sample, (2) clinical site, (3) menstrual cycle phase, and (4) disease stage. Anticlustering proved to be an effective tool in order to meet these criteria. We also required that all samples provided by the same person had to be processed in the same batch, to facilitate the comparison of all of an individual’s samples and avoid confounding the analysis with batch effects. For our work, having the ability to assign each individual’s samples intentionally to the same batch would allow greater confidence in findings from our comparison of the individual’s samples as concerns about batch effects would be mitigated. A similar situation is conceivable in the context of assigning students to classes: Groups of friends have to be assigned to the same class, while striving for overall similarity among—and heterogeneity within—classes. These must-link requirements could not be fulfilled using existing approaches such as OSAT and PSBA, and we therefore developed a novel method (2PML) that for the first time combined the theory of constrained cluster analysis 10 with the domain of anticlustering. Our evaluation revealed that anticlustering provides several critical advantages over alternative approaches. First, the simulation study indicates that in the context of achieving balance in batch assignment tasks, anticlustering outperforms OSAT and PSBA quite considerably ( Figure 2 ). Note that these results were not due to a different tradeoff of speed and quality: The improved anticlustering results were obtained much faster than the results of OSAT and PSBA. Moreover, as compared to OSAT and PSBA, anticlustering can be applied more flexibly in a broader range of settings. For example, OSAT’s usage is restricted to using categorical variables as input. Anticlustering can deal with numeric as well as categorical variables, or even using a mixture of these types of variables. Categorical variables can (a) be included as numeric variables via binary coding, as we implemented in the current simulation study and application, or (b) as a “hard constraint” that strictly prioritizes balance according to one categorical variable over the other variables. 11 PSBA—at least given the current implementation provided by the authors—is restricted to creating a maximum of four batches. Anticlustering has been implemented to handle an arbitrary number of batches, as well as user-defined batch sizes. The anticlust package also provides additional functionality not discussed here, including the possibility to choose among a variety of anticlustering objective functions, different optimization algorithms (including optimal methods), and a specialized algorithm for very large data sets. The clear superiority of anticlustering over the alternatives OSAT and PSBA, as indicated by our simulation study, might be met with skepticism. 30 In a strict sense, the results only pertain to the conditions realized in the simulation (e.g., regarding the number of samples or the number and type of variables). Still, we argue that the results reflect a true difference in potency among approaches. In particular, anticlust uses a better optimization algorithm than used by either OSAT or PSBA. The anticlust package by default relies on a variant of the LCW method. 22 LCW employs a systematic improvement search by exchanging each sample with the most promising candidate that is currently part of a different batch. Anticlust even provides more sophisticated optimization algorithms that have been developed recently, such as the bicriterion iterated local search heuristic by Brusco et al. 12 , and the three-phase search approach by Yang et al. 31 However, to obtain the most fair comparison in the simulation study, we applied the default method without additional tweaks. Before explaining differences in methodological performance among anticlustering, OSAT, and PSBA in more detail, we would like to clarify an important distinction regarding batch assignment methods. Such methods are generally characterized by two components: (1) an objective function that quantifies balance among batches; (2) an optimization algorithm that assigns samples to batches in such a way that balance is maximized. Among anticlustering, OSAT, and PSBA, both components differ. Hence, observed differences in the simulation study cannot unambiguously be attributed to either component. In particular, decreased performance of OSAT and PSBA does not imply that both components are inferior. Importantly, we believe that the objective functions employed by OSAT and PSBA have merit, but that both methods leave room for improvement regarding their optimization algorithms. First, the PSBA implementation provided by its developers uses a random sampling algorithm. That is, PSBA generates a fixed number of batch assignments at random—possibly while implementing a stratification on one of the input variables—and then selects the one assignment that minimizes discrepancy in propensity scores among samples. While we believe that discrepancy in propensity scores is a useful criterion, mere random sampling is generally considered to be a poor optimization algorithm. 32 OSAT uses a more advanced optimization algorithm than PSBA, applying pairwise exchanges between samples as does LCW. However, the exchange method used by OSAT is less sophisticated. In particular, it does not conduct a systematic search that selects the best possible exchange for each sample, but instead randomly attempts a (fixed) number of exchanges. Limitations While our evaluation highlights the power of anticlustering for batch assignment, some limitations of the approach should be considered. Anticlustering effectively negates differences in observed covariates among batches, but it is only as potent as the data it receives as input. Whether the covariates anticlustering uses for assignment are actually important to address a research question, is up to the researcher to decide. If important covariates have not been measured, they cannot be balanced. Moreover, anticlustering cannot overcome problems associated with the data themselves. Imprecise measurement and missing data are problems that affect anticlustering just as with any other method of data analysis. The degree to which anticlustering ensures balance also depends on the data. Increasing the number of samples facilitates finding acceptable balance; using additional covariates or smaller group sizes reduces balance among batches. 23 Given that these parameters usually cannot be chosen freely, researchers should always carefully inspect the observed balance after applying anticlustering. Another limitation of the anticlustering approach is that it can only be used if all samples are available before conducting batch allocation; anticlustering requires that all covariates have been measured at the start. It cannot be used to sequentially fill batches as additional samples become available. 33 Conclusion In this paper, we introduced anticlustering as an effective method to assign samples to batches to mitigate batch effects and ensure the robustness of analyses of data from next-generation sequencing and other high throughput molecular profiling technologies. Additionally, we implemented a novel feature for anticlustering, which allows researchers to specify pairs or groups of samples that must be assigned to the same batch. Anticlustering including its must-link feature is freely available via the open source R package anticlust ( https://CRAN.R-project.org/package=anticlust ) and can be interactively visualized and utilized through the R Shiny app, https://anticlust.org . Via the accompanying package website ( https://github.com/m-Py/anticlust ), additional documentation and community support are available. Methods Anticlustering is an optimization method that is characterized by (a) an objective function that quantifies the balance among batches, and (b) an algorithm that conducts the batch assignment in such a way that balance among batches is maximized. Anticlustering owes its name to the fact that the objective functions it uses are the reversal of criteria used in classical cluster analysis. For example, Späth 13 already recognized that by maximizing instead of minimizing the k-means criterion (the “variance”), he was able to create groups that are similar to each other, and presented it as an improvement over the more intuitive random assignment. 34 Brusco et al. 12 recognized that other objective functions known from cluster analysis can also be implemented in the context of anticlustering. In our application, we optimized the diversity objective to maximize similarity among batches. While diversity is technically a measure of within-batch heterogeneity, its maximization simultaneously leads to minimal difference between the distribution of the input variables among batches (at least when batches are equal-sized). 23 , 35 Papenberg and Klau 11 referred to the maximization of the diversity as “anticluster editing” because the minimization of the diversity is also well-known from the area of cluster analysis—under the term “cluster editing”. 36 , 37 The diversity is computed on the basis of a measure of pairwise dissimilarity among samples. In particular, it is defined as the overall sum of all dissimilarities among samples that are assigned to the same batch. 12 Hence, the diversity is not directly computed on the basis of sample features, but instead it relies on a distance measure that is computed on the basis of these features, for each pair of samples. In the context of anticlustering, the Euclidean distance is the most common measure that translates features to pairwise dissimilarities. 11 , 38 The Euclidean distance is defined as where M is the number of numeric features describing two samples x = ( x 1 , …, x M ) and y = ( y 1 , …, y M ). When samples are described by two features, the Euclidean distance corresponds to the geometric, “straight line” distance between two points in a two-dimensional space; more similar data points are closer to each other (see Figure 1 ). For categorical variables, we use one-hot binary encoding 39 before including them in the computation; in our example application, we used the squared Euclidean distance on binary encoded features as input for anticlustering. An anticlustering algorithm assigns samples to batches in such a way that the objective function—here, the diversity—is maximized. Anticlustering usually employs heuristic optimization algorithms. 31 While heuristics generally provide satisfying results in the context of anticlustering 11 , they do not guarantee finding the globally best assignment among all possibilities. In principle, enumerating all possible assignments is a valid strategy to obtain an optimal assignment. However, this approach quickly becomes impossible due to an exponential growth of the way in which assignments can be conducted. 11 Moreover, because anticlustering problems (with the exception of some special cases) are also NP-hard 35 , there is likely no algorithm that identifies the globally best assignment without considering all possibilities (at least in the worst case). In practice, heuristics are therefore indispensable. In the anticlust package, a heuristic exchange algorithm is by default used to maximize the diversity. 11 , 13 , 22 It consists of two steps: an initialization step and an optimization step. As initialization, it randomly assigns samples to batches of equal size or user-defined sizes. After initialization, the algorithm selects a sample and checks how the diversity would change if the sample were swapped with each sample that is currently assigned to a different batch. After simulating each exchange, it realizes the one exchange that increases the diversity the most. It does not conduct an exchange if no improvement in diversity is possible. This procedure is repeated for each sample and it terminates after the last sample is processed. The procedure might also restart at the first element and reiterate through all samples until no pairwise exchange can lead to any further improvement, i.e., until a local maximum is found. In anticlust, we also implemented this local maximum search, which corresponds to the algorithm LCW. 22 For better results, it is also possible to restart the search algorithm multiple times using different (random) initializations. 13 Two Phase Must Link Algorithm In our application, samples belonging to the same patient were required to be assigned to the same batch. We refer to a set of samples that must be linked together within the same batch as a must-link clique . We use the term singleton to refer to samples that are not part of a clique. Basically, anticlustering with the must-link feature uses a modified representation of the original data set in which singletons and cliques are used as input elements—as opposed to unconstrained anticlustering, where each individual sample constitutes an input element. To implement the must-link requirements while striving for similarity among batches, we developed the Two Phase Must Link (2PML) algorithm for anticlustering. Phase 1 The first phase of 2PML is an adaptation of the local maximum search algorithm LCW, which however ensures that must-link constraints are respected. Some adjustments of the algorithm are required to ensure (a) we obtain a valid partitioning regarding the cardinality constraints (e.g., equal-sized batches) and (b) the diversity is computed correctly during optimization. We therefore had to adjust both the initialization phase as well as the optimization phase of the algorithm. During initialization, we first assign all cliques to batches. Each clique must be assigned completely to one of the batches and samples within a clique must not be split apart. At the same time, the maximum capacity of each batch must not be exceeded. Using this conceptualization, the initialization step corresponds to a bin packing problem, which is one of the classical NP-complete problems in computer science. 40 That is, we assign a weight to each clique, corresponding to the number of samples it contains. When filling batches, the sum of the weights of the cliques in each batch must not exceed its capacity. Many exact and heuristic bin packing algorithms have been developed. As the default method, we use a randomized first fit heuristic to fill batches: For each clique, we iterate through all batches in random order and assign it to the first batch where it fits. The process is expected to evenly distribute the must-link cliques among batches. This random component is particularly useful if we apply multiple restarts of the optimization algorithm. After assigning cliques to batches, the singleton samples can be assigned arbitrarily to fill the remaining space. Note that our randomized first fit algorithm is a heuristic that may not find an assignment of must-link groups to batches even if one is theoretically available. If the heuristic indicates that the batches cannot hold the must-link cliques, we therefore use an exact algorithm based on integer linear programming (ILP) as a fallback option, which allows us to verify if the constraints really cannot be satisfied. To this end, we implemented an adaptation of the standard bin packing ILP model by Martello and Toth 41 : The number of cliques is given by n . The model has binary decision variables x ij to encode whether clique j ( j = 1, …, n ) is assigned to batch i ( i = 1, …, K ). The model uses K values c i to represent the capacity of each batch. The model has n values c j to encode the weight of each clique, i.e., the number of samples it represents. Constraint (2) ensures that the weight of each batch is not exceeded; constraint (3) ensures that each clique is assigned to exactly one batch. Note that during the initialization step, we only need to test if the constraints (2) and (3) can be satisfied; in the context of must-link anticlustering, any feasible assignment is equally valid. For this reason, the objective function (1) is chosen to be constant for each assignment that satisfies the constraints. It does not actually contribute to solving the problem, and the model therefore only tests if the must-link constraints can be satisfied. In anticlust, we call one of the programs Symphony 42 , 43 , GLPK 44 , or lpSolve 45 to yield an optimal assignment for the bin packing problem as initialization for the must-linked samples. For the optimization phase of the exchange algorithm, we generate a modified representation of the original distance matrix that preserves all relevant information. To this end, we compute new pairwise distances between (a) each clique and the other cliques and between (b) each clique and the singleton samples. The new distances are given as the sum of all pairwise distances between (a) the samples in two different cliques, or (b) between all samples of a clique and a singleton. In the context of maximizing the diversity, this transformation sufficiently preserves the relevant information in the original distance matrix. 37 Using the initial assignment and the transformed distance matrix, we apply the same exchange algorithm as we described for the unrestricted anticlustering problem. However, during the exchange process, we only permit exchanges between cliques of the same cardinality (e.g., patients providing the same number of samples) to ensure that the cardinality constraints are respected throughout. As opposed to unconstrained anticlustering, this restriction limits the number of exchanges that are evaluated. Hence, we developed Phase 2 to overcome the limitations of Phase 1. Phase 2 The second phase of 2PML is an improvement phase that drops the restriction of performing exchanges among must-link cliques of the same cardinality. It is initialized using the assignment obtained in Phase 1. It iterates through all must-link cliques and for each clique, selects a combination of samples for exchange. The combination of samples is chosen in such a way that (a) must-link constraints are preserved, (b) all samples are currently part of the same batch, and (c) the total number of samples is equal to the size of the current clique. Among all combinations of samples that fit these requirements, one combination is randomly chosen. If no feasible combination is found, the next clique is evaluated. If a combination is found that fits the requirements (a)-(c), the effect of exchanging it with the current clique is evaluated. Exchanges that do not lead to an improvement in diversity are discarded; exchanges that lead to an improvement in diversity are retained. For example, for a clique of size 4, we might select four singletons for exchange; a combination of two cliques of size 2; a clique of size 3 and a singleton; or two singletons and a clique of size 2. Note that generating all possible combinations for exchange corresponds to the subset sum problem, which is another classical NP-hard problem in computer science. 40 Due to its computational complexity, we cannot exhaustively generate all exchange combinations for large clique sizes. We exhaustively generate all combinations for cliques of up to 10 samples; for larger cliques, we use a heuristic to generate a reduced set of feasible exchange combinations: We generate a set of equal-sized cliques, possibly filled with singletons to ensure the number of samples is sufficient. For example, if this rule were applied to cliques of size 7 (which it is not, because we generate all possible combinations for cliques of size 7), we would obtain the combinations (1, 1, 1, 1, 1, 1, 1), (2, 2, 2, 1) and (3, 3, 1) for exchange. After iterating through all cliques, the improved batch assignment is once again used as input for the optimization step of Phase 1. That is because—maybe counterintuitively—while the new assignment is likely improved over the initial assignment that was obtained in Phase 1, it may no longer be locally optimal. It can therefore be restored to local optimality for additional improvement. The idea of restoring local optimality of a perturbed assignment is inspired by the iterated local search by Brusco et al. 12 Multiple Restarts Both phases of 2PML can be called repeatedly as part of a multiple restart algorithm. In anticlust, when users request multiple restarts, half of the restarts will perform Phase 1 and the other half will perform Phase 2. The best assignment across all restarts in Phase 1 is used as input for the first iteration of Phase 2. The second iteration of Phase 2 will again perform an improvement phase, this time using the output of the first iteration of Phase 2 as input; the third iteration then uses the output of the second iteration, and so forth. The process is repeated until all user-defined restarts are conducted. Optimal anticlustering using must-link constraints Due to the computational complexity of anticlustering, batch assignment problems are usually tackled using heuristic algorithms. Still, for some problem constellations—in particular when N is not large—it is possible to employ exact algorithms that find the globally best batch assignment. Papenberg and Klau 11 presented an ILP model to find globally optimal batch assignments for the diversity objective. It can be used to solve problem instances of up to about N = 30 in acceptable running time; Schulz 46 used a time limit of 1800 seconds and showed that up to 30 samples can be assigned optimally in this time. In the current paper, we extend the model by Papenberg and Klau 11 to allow it to include must-link constraints. The extension is actually quite straightforward: To induce must-link constraints in the context of an exact algorithm, it is sufficient to adjust the distance matrix used as input. Whenever the pairwise distance between two samples is set to ∞—and the set of must-link constraints can be satisfied—any globally optimal assignment will place these samples in the same batch, because the objective value associated with such an assignment is necessarily better than that of an assignment that places them in different batches. This “trick” of adjusting the data input to induce must-link constraints has long been used in the context of exact algorithms for cluster editing. 37 Including must-link constraints in anticlustering increases the number of samples that can be processed, because it reduces the effective number of samples. In our online supplementary materials, we show that up to about 40 samples can be processed using a time limit of 1800 seconds on a desktop computer; beyond that, the running time is expected to increase exponentially with an increasing number of samples. Acknowledgements This study was funded, in part, by the National Institutes of Health, the Eunice Kennedy Shriver National Institute for Child Health and Human Development, P01 HD106414. The funder played no role in study design, data collection, analysis, and interpretation of data, or the writing of this manuscript. The authors sincerely thank Martin Breuer, Jessica Gelfand, Hannah Hengelbrock, Edna Rodas, and Vivian Siu for their valuable contributions to this work. Footnotes https://osf.io/eu5gd/ References 1. ↵ Leek , J.T. , Scharpf , R.B. , Bravo , H.C. , Simcha , D. , Langmead , B. , Johnson , W.E. , Geman , D. , Baggerly , K. , and Irizarry , R.A . ( 2010 ). Tackling the widespread and critical impact of batch effects in high-throughput data . Nat. Rev. Genet . 11 , 733 – 739 . doi: 10.1038/nrg2825 . OpenUrl CrossRef PubMed Web of Science 2. ↵ Goh , W.W.B. , Wang , W. , and Wong , L . ( 2017 ). Why Batch Effects Matter in Omics Data, and How to Avoid Them . Trends Biotechnol . 35 , 498 – 507 . doi: 10.1016/j.tibtech.2017.02.012 . OpenUrl CrossRef PubMed 3. ↵ Goh , W.W.B. , Yong , C.H. , and Wong , L . ( 2022 ). Are batch effects still relevant in the age of big data? Trends Biotechnol . 40 , 1029 – 1040 . doi: 10.1016/j.tibtech.2022.02.005 . OpenUrl CrossRef PubMed 4. ↵ Akey , J.M. , Biswas , S. , Leek , J.T. , and Storey , J.D . ( 2007 ). On the design and analysis of gene expression studies in human populations . Nat. Genet . 39 , 807 – 808 ; author reply 808-809. doi: 10.1038/ng0707-807 . OpenUrl CrossRef PubMed Web of Science 5. ↵ Leek , J.T. , Johnson , W.E. , Parker , H.S. , Jaffe , A.E. , and Storey , J.D . ( 2012 ). The sva package for removing batch effects and other unwanted variation in high-throughput experiments . Bioinformatics 28 , 882 – 883 . doi: 10.1093/bioinformatics/bts034 . OpenUrl CrossRef PubMed Web of Science 6. ↵ Zhang , Y. , Parmigiani , G. , and Johnson , W.E . ( 2020 ). ComBat-seq: batch effect adjustment for RNA-seq count data . NAR Genomics Bioinforma . 2 , lqaa078. doi: 10.1093/nargab/lqaa078 . OpenUrl CrossRef PubMed 7. ↵ Mirzayi , C. , Renson , A. , Zohra , F. , Elsafoury , S. , Geistlinger , L. , Kasselman , L.J. , Eckenrode , K. , van de Wijgert , J. , Loughman , A. , Marques , F.Z. , et al. ( 2021 ). Reporting guidelines for human microbiome research: the STORMS checklist . Nat. Med . 27 , 1885 – 1892 . doi: 10.1038/s41591-021-01552-x . OpenUrl CrossRef PubMed 8. ↵ Yan , L. , Ma , C. , Wang , D. , Hu , Q. , Qin , M. , Conroy , J.M. , Sucheston , L.E. , Ambrosone , C.B. , Johnson , C.S. , Wang , J. , et al. ( 2012 ). OSAT: a tool for sample-to-batch allocations in genomics experiments . BMC Genomics 13 , 689 . doi: 10.1186/1471-2164-13-689 . OpenUrl CrossRef PubMed 9. ↵ Carry , P.M. , Vigers , T. , Vanderlinden , L.A. , Keeter , C. , Dong , F. , Buckner , T. , Litkowski , E. , Yang , I. , Norris , J.M. , and Kechris , K . ( 2023 ). Propensity scores as a novel method to guide sample allocation and minimize batch effects during the design of high throughput experiments . BMC Bioinformatics 24 , 86 . doi: 10.1186/s12859-023-05202-6 . OpenUrl CrossRef PubMed 10. ↵ Davidson , I . ( 2009 ). Clustering with Constraints . In Encyclopedia of Database Systems , L. Liu and M. T. Özsu , eds. ( Springer US ), pp. 393 – 396 . doi: 10.1007/978-0-387-39940-9_610 . OpenUrl CrossRef 11. ↵ Papenberg , M. , and Klau , G.W . ( 2021 ). Using anticlustering to partition data sets into equivalent parts . Psychol. Methods 26 . doi: 10.1037/met0000301 . OpenUrl CrossRef 12. ↵ Brusco , M.J. , Cradit , J.D. , and Steinley , D . ( 2020 ). Combining diversity and dispersion criteria for anticlustering: A bicriterion approach . Br. J. Math. Stat. Psychol . 73 , 375 – 396 . doi: 10.1111/bmsp.12186 . OpenUrl CrossRef PubMed 13. ↵ Späth , H . ( 1986 ). Anticlustering: Maximizing the variance criterion . Control Cybern . 15 , 213 – 218 . OpenUrl 14. ↵ Nagel , J. , Morgan , D.P. , Gürsoy , N.Ç. , Sander , S. , Kern , S. , and Feld , G.B . ( 2024 ). Memory for rewards guides retrieval . Commun. Psychol . 2 , 1 – 17 . doi: 10.1038/s44271-024-00074-9 . OpenUrl CrossRef PubMed 15. Schaper , M.L. , Kuhlmann , B.G. , and Bayen , U.J . ( 2023 ). Metacognitive differentiation of item memory and source memory in schema-based source monitoring . J. Exp. Psychol. Learn. Mem. Cogn . 49 , 743 – 765 . doi: 10.1037/xlm0001207 . OpenUrl CrossRef PubMed 16. Tuti , T. , Aluvaala , J. , Malla , L. , Irimu , G. , Mbevi , G. , Wainaina , J. , Mumelo , L. , Wairoto , K. , Mochache , D. , Hagel , C. , et al. ( 2022 ). Evaluation of an audit and feedback intervention to reduce gentamicin prescription errors in newborn treatment (ReGENT) in neonatal inpatient care in Kenya: a controlled interrupted time series study protocol . Implement. Sci . 17 , 32 . doi: 10.1186/s13012-022-01203-w . OpenUrl CrossRef PubMed 17. Fan , S. , Ponisio , M.R. , Xiao , P. , Ha , S.M. , Chakrabarty , S. , Lee , J.J. , Flores , S. , LaMontagne , P. , Gordon , B. , Raji , C.A. , et al. ( 2024 ). AmyloidPETNet: Classification of Amyloid Positivity in Brain PET Imaging Using End-to-End Deep Learning . Radiology 311 , e231442 . doi: 10.1148/radiol.231442 . OpenUrl CrossRef PubMed 18. Rahu , I. , Kull , M. , and Kruve , A . ( 2024 ). Predicting the Activity of Unidentified Chemicals in Complementary Bioassays from the HRMS Data to Pinpoint Potential Endocrine Disruptors . J. Chem. Inf. Model . 64 , 3093 – 3104 . doi: 10.1021/acs.jcim.3c02050 . OpenUrl CrossRef PubMed 19. Stadelmann , V.A. , Gerossier , E. , Kettenberger , U. , and Pioletti , D.P . ( 2025 ). Combining systemic and local osteoporosis treatments: A longitudinal in vivo microCT study in ovariectomized rats . Bone 192 , 117373 . doi: 10.1016/j.bone.2024.117373 . OpenUrl CrossRef PubMed 20. ↵ Bruns , E. , Scholz , I. , Koppe , G. , Kirsch , P. , and Gerchen , M.F . ( 2025 ). Self-referential belief shares common neural correlates with general belief . Sci. Rep . 15 , 2137 . doi: 10.1038/s41598-024-84445-6 . OpenUrl CrossRef PubMed 21. ↵ UCSF / Stanford Endometriosis Center for Discovery, Innovation, Training and Community Engagement . ENACT . https://www.enactcenter.org . 22. ↵ Weitz , R.R. , and Lakshminarayanan , S . ( 1998 ). An empirical comparison of heuristic methods for creating maximally diverse groups . J. Oper. Res. Soc . 49 , 635 – 646 . doi: 10.1057/palgrave.jors.2600510 . OpenUrl CrossRef 23. ↵ Papenberg , M . ( 2024 ). K-Plus anticlustering: An improved k-means criterion for maximizing between-group similarity . Br. J. Math. Stat. Psychol . 77 , 80 – 102 . doi: 10.1111/bmsp.12315 . OpenUrl CrossRef PubMed 24. ↵ DNA methylation networks underlying mammalian traits | Science https://www.science.org/doi/10.1126/science.abq5693 . 25. Obesity is associated with altered gene expression in human tastebuds | International Journal of Obesity https://www.nature.com/articles/s41366-018-0303-y . 26. ↵ Transcriptomic analysis to identify genes associated with selective hippocampal vulnerability in Alzheimer’s disease | Nature Communications https://www.nature.com/articles/s41467-021-22399-3 . 27. ↵ R: The R Project for Statistical Computing https://www.r-project.org/ . 28. ↵ OSAT Bioconductor . http://bioconductor.org/packages/OSAT/ . 29. ↵ Yoshida , K. , Bartel , A. , Chipman , J.J. , Bohn , J. , McGowan , L.Da ., Barrett , M. , Christensen , R.H.B. , and gbouzill ( 2022 ). tableone: Create “Table 1” to Describe Baseline Characteristics with or without Propensity Score Weights . Version 0 . 13 .2. OpenUrl 30. ↵ Boulesteix , A.-L. , Lauer , S. , and Eugster , M.J.A . ( 2013 ). A Plea for Neutral Comparison Studies in Computational Sciences . PLOS ONE 8 , e61562 . doi: 10.1371/journal.pone.0061562 . OpenUrl CrossRef PubMed 31. ↵ Yang , X. , Cai , Z. , Jin , T. , Tang , Z. , and Gao , S . ( 2022 ). A three-phase search approach with dynamic population size for solving the maximally diverse grouping problem . Eur. J. Oper. Res . 302 , 925 – 953 . doi: 10.1016/j.ejor.2022.02.003 . OpenUrl CrossRef 32. ↵ Skiena , S.S . ( 2008 ). The Algorithm Design Manual (Springer London) doi: 10.1007/978-1-84800-070-4 . OpenUrl CrossRef 33. ↵ Treasure , T. , and MacRae , K.D . ( 1998 ). Minimisation: the platinum standard for trials?: Randomisation doesn’t guarantee similarity of groups; minimisation does . BMJ 317 , 362 – 363 . doi: 10.1136/bmj.317.7155.362 . OpenUrl FREE Full Text 34. ↵ Steinley , D . ( 2006 ). K-means clustering: a half-century synthesis . Br. J. Math. Stat. Psychol . 59 , 1 – 34 . doi: 10.1348/000711005X48266 . OpenUrl CrossRef PubMed Web of Science 35. ↵ Feo , T.A. , and Khellaf , M . ( 1990 ). A class of bounded approximation algorithms for graph partitioning . Networks 20 , 181 – 195 . doi: 10.1002/net.3230200205 . OpenUrl CrossRef 36. ↵ Shamir , R. , Sharan , R. , and Tsur , D . ( 2004 ). Cluster graph modification problems . Discrete Appl. Math . 144 , 173 – 182 . doi: 10.1016/j.dam.2004.01.007 . OpenUrl CrossRef Web of Science 37. ↵ Böcker , S. , Briesemeister , S. , and Klau , G.W . ( 2011 ). Exact Algorithms for Cluster Editing: Evaluation and Experiments . Algorithmica 60 , 316 – 334 . doi: 10.1007/s00453-009-9339-7 . OpenUrl CrossRef 38. ↵ Gallego , M. , Laguna , M. , Martí , R. , and Duarte , A . ( 2013 ). Tabu search with strategic oscillation for the maximally diverse grouping problem . J. Oper. Res. Soc . 64 , 724 – 734 . doi: 10.1057/jors.2012.66 . OpenUrl CrossRef 39. ↵ Cerda , P. , Varoquaux , G. , and Kégl , B . ( 2018 ). Similarity encoding for learning with dirty categorical variables . Mach. Learn . 107 , 1477 – 1494 . doi: 10.1007/s10994-018-5724-2 . OpenUrl CrossRef 40. ↵ Garey , M.R. , and Johnson , D.S . ( 1979 ). Computers and Intractability: A Guide to the Theory of NP-completeness . 41. ↵ Martello , S. , and Toth , P . ( 1990 ). Bin-packing problem . Knapsack Probl. Algorithms Comput. Implement ., 221 – 245 . 42. ↵ Harter , R. , Hornik [aut, K., cre, Theussl , S. , Szymanski , C. , and Schwendinger , F . ( 2021 ). Rsymphony: SYMPHONY in R . Version 0 . 1 – 33 . OpenUrl 43. ↵ Ralphs , T.K. , and Güzelsoy , M. ( 2005 ). The Symphony Callable Library for Mixed Integer Programming . In The Next Wave in Computing, Optimization, and Decision Technologies , B. Golden , S. Raghavan , and E. Wasil , eds. ( Springer US ), pp. 61 – 76 . doi: 10.1007/0-387-23529-9_5 . OpenUrl CrossRef 44. ↵ Theussl , S. , Hornik , K. , Buchta , C. , Schwendinger , F. , and Schuchardt , H . ( 2024 ). Rglpk: R/GNU Linear Programming Kit Interface . Version 0 . 6 – 5 .1. OpenUrl 45. ↵ Csárdi , G. , and Berkelaar , M . ( 2024 ). lpSolve: Interface to “Lp_solve” v. 5.5 to Solve Linear/Integer Programs . Version 5.6.23 . 46. ↵ Schulz , A . ( 2022 ). A new mixed-integer programming formulation for the maximally diverse grouping problem with attribute values . Ann. Oper. Res . 318 , 501 – 530 . doi: 10.1007/s10479-022-04707-2 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted March 11, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Anticlustering for Sample Allocation To Minimize Batch Effects Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Anticlustering for Sample Allocation To Minimize Batch Effects Martin Papenberg , Cheng Wang , Maïgane Diop , Syed Hassan Bukhari , Boris Oskotsky , Brittany R. Davidson , Kim Chi Vo , Binya Liu , Juan C. Irwin , Alexis Combes , Brice Gaudilliere , Jingjing Li , David K. Stevenson , Gunnar W. Klau , Linda C. Giudice , Marina Sirota , Tomiko T. Oskotsky bioRxiv 2025.03.03.641320; doi: https://doi.org/10.1101/2025.03.03.641320 Share This Article: Copy Citation Tools Anticlustering for Sample Allocation To Minimize Batch Effects Martin Papenberg , Cheng Wang , Maïgane Diop , Syed Hassan Bukhari , Boris Oskotsky , Brittany R. Davidson , Kim Chi Vo , Binya Liu , Juan C. Irwin , Alexis Combes , Brice Gaudilliere , Jingjing Li , David K. Stevenson , Gunnar W. Klau , Linda C. Giudice , Marina Sirota , Tomiko T. Oskotsky bioRxiv 2025.03.03.641320; doi: https://doi.org/10.1101/2025.03.03.641320 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7622) Biochemistry (17648) Bioengineering (13868) Bioinformatics (41876) Biophysics (21422) Cancer Biology (18552) Cell Biology (25458) Clinical Trials (138) Developmental Biology (13364) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15589) Genomics (22475) Immunology (17711) Microbiology (40325) Molecular Biology (17144) Neuroscience (88469) Paleontology (666) Pathology (2826) Pharmacology and Toxicology (4815) Physiology (7635) Plant Biology (15113) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9814) Zoology (2268)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.