A tailored variant filtering procedure for multi-breed and multi-species unbalanced animal SNP collections

doi:10.1101/2025.10.14.682050

A tailored variant filtering procedure for multi-breed and multi-species unbalanced animal SNP collections

2025 · doi:10.1101/2025.10.14.682050

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 45,798 characters · extracted from preprint-html · click to expand

A tailored variant filtering procedure for multi-breed and multi-species unbalanced animal SNP collections | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results A tailored variant filtering procedure for multi-breed and multi-species unbalanced animal SNP collections View ORCID Profile Barbara Lazzari , Marco Milanesi , Andrea Talenti , Arianna Bionda , Yefang Li , Lin Jiang , Philippe Bardou , Gwenola Tosser Klopp , Paola Crepaldi , Licia Colli doi: https://doi.org/10.1101/2025.10.14.682050 Barbara Lazzari 1 Istituto di Biologia e Biotecnologia Agraria, National Research Council , Milan, Italy 2 Dipartimento di Scienze Animali, della Nutrizione e degli Alimenti, Università Cattolica del S. Cuore , Piacenza, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Barbara Lazzari Marco Milanesi 2 Dipartimento di Scienze Animali, della Nutrizione e degli Alimenti, Università Cattolica del S. Cuore , Piacenza, Italy 3 Department for Innovation in Biological, Agro-food and Forest systems (DIBAF), University of Tuscia , Via S. Camillo de Lellis snc, 01100 Viterbo, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Andrea Talenti 4 The Roslin Institute, Royal (Dick) School of Veterinary Studies, University of Edinburgh , Midlothian, EH25 9RG, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: andrea.talenti{at}ed.ac.uk Arianna Bionda 5 Department of Agricultural and Environmental Sciences - Production, Landscape, Agroenergy (DISAA), Università degli Studi di Milano , Via Celoria 2, 20133 Milano, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yefang Li 6 State Key Laboratory of Animal Biotech Breeding, Institute of Animal Science, Chinese Academy of Agricultural Sciences (CAAS) , Beijing, P. R. China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Lin Jiang 6 State Key Laboratory of Animal Biotech Breeding, Institute of Animal Science, Chinese Academy of Agricultural Sciences (CAAS) , Beijing, P. R. China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Philippe Bardou 7 GenPhySE, Université de Toulouse , INRA, ENVT, Castanet Tolosan, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Gwenola Tosser Klopp 7 GenPhySE, Université de Toulouse , INRA, ENVT, Castanet Tolosan, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Paola Crepaldi 5 Department of Agricultural and Environmental Sciences - Production, Landscape, Agroenergy (DISAA), Università degli Studi di Milano , Via Celoria 2, 20133 Milano, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Licia Colli 2 Dipartimento di Scienze Animali, della Nutrizione e degli Alimenti, Università Cattolica del S. Cuore , Piacenza, Italy 8 BioDNA Centro di Ricerca sulla Biodiversità e sul DNA Antico, Università Cattolica del S. Cuore , Piacenza, Italy Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Preview PDF Abstract Technological advancements and decrease of costs of whole-genome sequencing approaches has made available a huge and ever increasing amount of resequencing data for many species. It is thus now possible to assemble large sized datasets encompassing the molecular variation of several species and/or populations or breeds. Nonetheless, these datasets can be extremely variable in terms of geographical provenance and sample sizes, with taxonomic groups varying from hundreds to just a few or even one single entry. In such circumstances, the application of standard filtering approaches may lead to the introduction of biases and to the under/over representation of some groups or gene pools. Commonly adopted variant filtering approaches relying on Minor Allele Frequency (MAF) and Linkage Disequilibrium (LD) may not be suitable to treat datasets representing broadscale diversity of multiple species, due to remarkable differences in LD structure and in the frequency of variants at the local vs. global scale. Thus, by exploiting the VarGoats 1000 goat genome project data as an optimal case study, we devised a novel approach based on within-population subsampling, Minor Allele Count (MAC) and marker spacing (bp-space), specifically designed to avoid biases introduced by standard filtering procedures and to adequately represent continental and species-specific variation. Starting from a quality-filtered dataset of >28M SNPs from 1372 animals, we obtained a dataset of <14M markers and 750 individuals, complying with the initial requirements and more handy for further computational steps. The dataset was validated by PCA, Neighbor Joining and Admixture analyses. Introduction After their introduction about 20 years ago, high-throughput sequencing technologies have been increasingly exploited in genetic studies and have now become a routine tool to characterize genome-wide variation in wild and domestic species. The increasing availability of reference genome sequences, together with the decreased costs and the highly standardized data production, has led to the accumulation of large amounts of resequencing data in public databases, so that it is now possible to assemble large-scale genomic datasets for several domestic and wild species. Remarkable examples of large-sized assembled datasets exist for cattle ( Hayes & Daetwyler, 2019 ;), sheep ( Li et al., 2020 ), pigs (Groenen, 2017); buffaloes ( Pineda et al., 2024 ), donkeys ( Todd et al, 2022 ), canids ( Plassais et al, 2019 ), and more. As an example of the extensive availability of resequencing data, the NCBI SRA database contains 98,591 sequence data entries for Bos taurus representing breeds from all over the world, 2,100 sequencing data entries for the grey wolf Canis lupus , a species with a cosmopolitan distribution, while more than 4,200 genome data are currently available for six species of Darwin’s finches (NCBI SRA database accessed on Aug 27 th , 2025). Previous literature showed that populations from different geographical areas or with different evolutionary histories (e.g. domesticated vs. wild relative species) or demographic (e.g. small vs. large effective population size, bottlenecks etc.) usually differ in terms of variant occurrence and frequency, and in Linkage Disequilibrium (LD) structure ( Dunning et al., 2000 ; Rossi et al., 2009 ; Lucek and Willi, 2021 ). This behavior becomes particularly pronounced in domestic species, where human management often prevents random mating and reduces between-breeds gene flow ( Jasielczuk et al., 2020 ; Qanbari, 2020 ). Evidence from SNP data highlighted that the number of markers that are found in linkage differs for goat populations from different continental areas, i.e. Africa, Europe and Southwestern Asia ( Colli et al., 2018 ). Population structure analysis also displayed remarkable differences in the distribution of molecular variation both within and between continents: Central and Northern European breeds tend to form well defined and separated clusters, while in Southern Europe and within the African continent the goat populations are less differentiated, with the occurrence of regional gene pools that are shared by several populations of neighboring countries. Ascertainment bias in SNP panels is a renowned phenomenon which derives from the over-representation of specific gene pools in the panel of populations/breeds used to identify and select the polymorphisms ( Lachance & Tishkoff, 2013 ; Dokan et al., 2021 ). Thus, individuals from populations/breeds either included in the discovery panel or closely related to them will consistently display higher levels of variation compared to representatives of other gene pools that have not been accounted for during the marker ascertainment procedure. A similar occurrence may arise when in large-scale WGS datasets assembled from public sources or by collating data from different projects, specific populations/breeds or gene pools become overrepresented. During the filtering steps that normally follow raw sequence data Quality Control (QC) and precede data analyses, the predominance of one or a few gene pools can increase the frequency of population-specific variants at the expenses of less represented but highly informative variation characterizing minor gene pools, and can artefactually drive the detection of LD patterns towards the overrepresented structure. When carrying out the filtering steps, users often apply default program settings for filtering, which can lead to problems, e.g. removing Hardy-Weinberg equilibrium deviations that are important to understand population structure ( Andrews & Luikart, 2014 ). Filtering is an issue of fundamental importance in dataset preparation, and evidence has shown that the same dataset filtered in different ways can return completely different results ( Shafer et al., 2017 ; Larson t al., 2021). The filtering approaches usually adopted rely on the calculation of the frequency of the less represented allele (i.e., Minor Allele Frequency) and on the removal of variants that are found in linkage beyond a specific threshold value overall (LD-based pruning) ( Hemstrom et al., 2024 ). Altogether these approaches can contribute to the removal of low-frequency variants typical of minor gene pools, with the ultimate effect of introducing biases at multiple levels during the preparation of a working dataset. Besides penalizing the variation of the less represented populations/species in favor of the more represented ones, this also affects the estimation of LD which depends on the allelic frequency of SNP pairs ( Qanbari, 2020 ). As a consequence, popularly adopted analysis methods that rely on allelic frequency calculations can be negatively affected ( Dokan et al, 2021 ), thus leading to distortions in the detection of population structure, the estimation of allele sharing etc., particularly when the minor gene pools are involved. The VarGoats project represents a remarkable example of a global-scale assemblage of whole-genome sequences deriving from the aggregation of project-specific data from multiple providers (projects VarGoats and NextGen, https://projects.ensembl.org/nextgen/ ) and publicly available sources (NCBI BioSample, https://www.ncbi.nlm.nih.gov/biosample/ ). The current VarGoats dataset is an extended version of the one described in Denoyelle et al . (2021) which encompasses 1,372 animals from 132 local and transboundary domestic goat populations, and nine wild goat species collected in Europe, Africa, Asia, and Oceania ( https://www.goatgenome.org/vargoats.html ; Table 1 ). VarGoats WGS data are accompanied by metadata as individual sex, provenance, geographical coordinates of sampling locations (latitude and longitude) and other accessory information. The availability of verified metadata makes the VarGoats set particularly suitable to mine and validate SNP filtering results according to what is expected considering animals’ breed of origin, geographical localization at sampling, and providers’ indications on single animals’ provenance. Nonetheless, containing data from multiple unharmonized sources, VarGoats dataset is not normalized in terms of number of animals per breed and species, which ranges from 1 to 163. This is particularly striking for the nine wild Capra species that are accounted for by only a few subjects, i.e. 30 animals overall, which risks to further penalize variants from these underrepresented, overly different individuals in a dataset strongly unbalanced towards Capra hircus . All these factors, together with the need to create an easy-to-manage representative dataset, called for the implementation of an ad hoc SNP filtering procedure, also applicable to other datasets sharing the same issues. View this table: View inline View popup Download powerpoint Table 1. List of Capra hircus (blue), Capra aegagrus (green) and wild goat (red) breeds represented in the VarGoats dataset with the number of animals for each breed. Results and Discussion After mapping to the ARS1 C. hircus genome (GCA_001704415.1), quality filtering and phasing, 28,321,956 autosomal biallelic SNPs were retained out of a starting set of 77,280,295 variants. This dataset (referred to as the 28M dataset) represented the starting point for the following procedures. Within-breed subsampling for samples size harmonization The first step towards the selection of a representative subset of markers dealt with the uneven distribution of animals across breeds. After removal of samples which did not reach the minimum quality standards required to be included in further analyses, 13 populations were represented by transboundary breeds sampled in different geographical areas. Considering that subjects from these breeds may be reared under different management conditions in different countries, to adequately account for these occurrences we decided to split animals based on both breed and country of provenance, thus creating 163 country_breed groups (149 from C. hircus , 2 from Capra aegagrus and 12 from wild goats). Considering the extremely large range of variation of country_breed sample sizes, i.e. 1-163, we first built a distribution of country_breed sample sizes, which guided our choice to set the maximum number of individuals per country_breed at eight, to avoid the overrepresentation of breed-specific gene pools. All the animals belonging to country_breed groups with eight individuals or less were retained. To reduce the size of the 35 country_breed groups containing more than eight animals without losing within-group variability, we used the subsampling approach implemented by BITE v2.0 ( https://www.biorxiv.org/content/10.1101/181610v1 ) which maximizes the genetic variability of the original larger dataset, removing highly related animals. Following the subsampling, 750 out of 1,372 individuals were retained (domestic, DOM, C. hircus , 720 individuals in 149 country_breed groups; bezoars, BEZ, C. aegagrus , 16 individuals in 2 country_breed groups; wild goats, WIL, 14 individuals in 12 country_breed groups), representing all the validated breeds and species of the VarGoats dataset. The 28,321,956 phased SNPs of the selected 750 samples (referred to as the 28M_BALANCED dataset) were used as the starting point for the subsequent thinning procedure, carried out to devise a handy dataset that can be managed with lower computational requirements by removing redundancy while maintaining full information in terms of breed diversity. Selection of Minor Allele threshold value To guarantee an adequate representation of the variation of domestic, bezoars and wild goats, three sets of variants were extracted from the 28M_BALANCED dataset, including the subsets of SNPs present in the DOM, BEZ and WIL groups of animals, respectively. As expected, SNPs belonging to each subset partially overlapped. Due to the largely different sample sizes of the DOM, BEZ and WIL groups, the resulting total number of alleles scored at each position in the full dataset was also highly dissimilar (1,440 for DOM, 32 for BEZ and 28 for WIL). Considering this unbalancing, we evaluated the effects of applying either a Minimum Allele Frequency (MAF) or a Minimum Allele Count (MAC) threshold to the datasets. Due to the disparity in the number of alleles, the frequency of the minor allele varied largely in the three groups and selecting a common threshold for MAF was, therefore, not possible. Applying a threshold value based on the Minor Allele Count, instead, was independent from the number of alleles representing each group, and granted the selection of well documented SNP positions. Considering that the maximum sample size had been set to eight, we chose a cutoff value of MAC>=4 corresponding to a frequency of the minor allele of 0.0028 in the DOM group,, 0.125 in BEZ and 0.143 in WIL ( Figure 1 ). The MAC>=4 threshold value was then applied on the three subsets of SNPs and the resulting datasets were merged, obtaining a total of 20,695,419 SNPs (20M_BALANCED_MAC dataset) that had MAC>=4 in at least one of the three groups of animals and represented 73.07% of the original dataset . Percentages of SNPs with MAC>=4 in the three clusters and in the merged 20M_BALANCED_MAC dataset indicate that most of the BEZ variants are shared with DOM ones, while the WIL cluster contains a higher proportion of private variants ( Figure 2A )). Download figure Open in new tab Figure 1. SNP distribution in the domestic (DOM, blue), bezoar (BEZ, green), and wild (WIL, red) datasets. The orange horizontal line (MAC=4) represents the threshold at Minor Allele Count equal to 4. The light blue dotted lines represent the corresponding thresholds for Minor Allele Frequency (MAF) for DOM (MAF=0.0028), BEZ (MAF=0.125) and WIL (MAF=0.143). SNP count was organized in bins. Bars height represents the number of SNPs (mln SNP) for each bin. 40 equally spaced MAF bins were created between MAF=0 and MAF=0.5, while MAC bins correspond to integer counts from 0 to 60. Panel A ; SNPs with MAC values from 0 to 60. Panel B : SNPs with MAC values from 0 to 10. SNPs with MAC>60 are not shown. Download figure Open in new tab Figure 2. A : Percentage of SNPs of the 28M_BALANCED dataset retrieved by filtering the three clusters (DOM, BEZ and WIL) with mac>=4. Percentages are given for single clusters and merged clusters. B : Percentage of SNPs from the 28M_BALANCED dataset surviving filtering with MAC>=4, MAC>=4 + bp-space>=30, and MAC>=4 + LD>0.9 on the domestic (DOM) + bezoar (BEZ) + wild (WIL) clusters. C : Percentage of cluster-specific SNPs represented in the MAC>=4 + bp-space (orange) and MAC>=4 + LD-filtered (green) datasets, with respect to the MAC>=4 filtered dataset (blue). Additional filtering strategy: LD vs. marker spacing To further thin the dataset, we evaluated the effects of applying additional filtering cutoffs based either on linkage disequilibrium (LD) or on marker spacing to the 20M_BALANCED_MAC dataset. The plink v1.9 ( https://www.cog-genomics.org/plink/1.9/credits ) bp-space filter excludes one variant from each pair closer than the given bp count. The indep-pairwise parameter produces a pruned subset of markers that are in approximate linkage equilibrium with each other. We did not want to carry out a strong pruning based on LD, not to bias the results in favor of the most represented gene pools, so we adopted a shallow cutoff of 0.9 for the LD-based filtering, compared to a bp-space>=30 bps cutoff. LD was a little more efficient in thinning the SNP number with respect to the initial dataset, the percentage of residual SNPs after filtering with LD>0.9 being 47.06%. The bp-space>=30 filter, instead, retained 48.96% of the SNPs of the original dataset ( Figure 2B ). We then calculated the proportion of SNPs from the DOM, BEZ and WIL groups surviving the filtering steps with LD vs. bp-space, and observed that, as expected, LD-based pruning removed a higher number of variants specific to the less represented gene pools. WIL and, to a lesser extent, BEZ, in fact, exhibited a reduction higher than DOM in the percentage of group-specific SNPs surviving the LD-based filtering ( Figure 2C ). To ensure an even representation of the different gene pools, we adopted the bp-space filtering which returned a dataset of 13,987,126 SNPs (14M_BALANCED_MAC_BPS dataset) from 750 goats representing 136 breeds and 163 country_breed groups. In Figure 3 , overlaps among the DOM, BEZ and WIL datasets are displayed for the 28M dataset (1372 individuals), as well as for the 14M_BALANCED_MAC_BPS dataset (750 individuals), showing the proportions of common variants among datasets. Results confirm a higher intersection of variants from domestic goats with those from bezoars, rather than with those from wild goats, and show that the subsampling and filtering procedures did not cause substantial differences in the proportions of variants among species in favour of the most represented. Download figure Open in new tab Figure 3. Upset plots of the 28M dataset (Panel A) and of the 14M_BALANCED_MAC_BPS dataset (Panel B). Numeric intersections of variants across datasets are displayed. Over the bars, the percentage of SNPs with respect to the full dataset is given. A second dataset focused on domestic goats was prepared using the same parameters and cutoffs starting from the 28M_BALANCED dataset after removing the 30 bezoars and wild goats. This dataset, referred to as the 12M_DOM_BALANCED_MAC_BPS dataset, encompassed 11,939,620 SNPs and 720 individuals (128 domestic goat breeds and 149 country_breed groups; Figure 4 ). Download figure Open in new tab Figure 4. Schematic representation of the filtering procedure applied to the VarGoats dataset. To verify the effectiveness of the two balanced datasets (750 individuals and 720 individuals) in classifying animals according to species, breed, and geographical provenance, we performed a Principal Component Analysis (PCA), built Neighbor-Joining trees (NJ) and carried out a population structure analysis with ADMIXTURE software. The results were visually inspected very carefully to evaluate the datasets suitability to work as a reliable input for subsequent biodiversity analyses, taking advantage of the availability of metadata that allow tracing single animals in terms of assigned breed, provenance and geographical localization. The Admixture analysis was run on the 720 individuals 12M_DOM_BALANCED_MAC_BPS dataset, resulting in a clustering of samples in line with expectations based on previous results ( Colli et al., 2018 ) and on the known evolutionary history and relationships of goat breeds. To further validate the population structure robustness, additional Admixture runs were performed on 10 subsets, each consisting of 200K randomly extracted SNPs. The plot of cross validation (CV) errors from these runs showed that CV values were highly consistent across runs and indicated the value of K=12 as the best K ( Supplementary Figure 1 ). Download figure Open in new tab Supplementary Figure 1. Admixture CV error plot of 10 subsets, each consisting of 200K SNPs randomly extracted from the 720 individuals balanced dataset (only domestic goats). The NJ tree of the 750 individuals 14M_BALANCED_MAC_BPS dataset ( Figure 5A ) displayed a good resolution of wild goat species, and also resolved domestic goats according to their continental provenance. Populations of transboundary or cosmopolitan breeds were correctly assigned to their ancestral gene pool of origin ( Figure 5B ). The PCA carried out on the of the 720 individuals 12M_DOM_BALANCED_MAC_BPS dataset ( Figure 5C ) confirmed Admixture and NJ tree results obtained before. Download figure Open in new tab Figure 5. A : Neighbor Joining (NJ) tree of the 14M_BALANCED_MAC_BPS dataset. B : An example of Admixture results (K2 to K20) of a region displaying country_breeds with sampling location different than Europe within the European cluster (SAA_TZ = Saanen_Tanzania, ORB_RU = Orenburg Cachemire_Russia, SAA_CN = Saanen_China). C : Principal Component analysis of the 12M_DOM_BALANCED_MAC_BPS dataset. Conclusions Due to the decreasing cost of resequencing data, the last few years have seen an increase in the production of genomic data for many species, including domestic ones. In most cases, the datasets that can be assembled starting from these data are often highly unbalanced in terms of representativeness of local gene pools’ variation, with some populations or breeds being overrepresented. Also, these genomic collections are usually large-sized and require big computational facilities to be analyzed, thus preventing research groups with limited computation resources from being autonomous in driving data analyses. Under the light of these premises, it is necessary to proceed with variant filtering before processing data in diversity studies, but strategies that allow to conserve the representative fraction of the total variation have to be applied. The VarGoats dataset is an example of these occurrences since it represents a collection of whole-genome sequences that was carried out based on requirements of different projects, and is, as a consequence, highly variable in terms of number of animals per breed, geographical provenance, farming systems, year of sampling etc. The subsampling and filtering procedures adopted to thin the VarGoats 28M dataset to a more manageable 14M/12M set proved to be adequate to retain the information content mirroring the gene pools of wild goat species, of both transboundary and local breeds and of local gene pools characterizing different geographical areas. These sets of markers are suitable for further goat diversity analyses aiming at the reconstruction of evolutionary history of goat populations across the world, the identification of signatures of selection, the study of the genetic bases of environmental adaptation etc.. The strategies and parameters adopted in this study can be applied to datasets with features similar to VarGoats one, when problems like unequal distribution of individuals across breeds and species are encountered and a reduction in marker numbers is desirable. Thresholds can be adjusted according to the target dataset size, provided that a careful inspection of the reduced dataset is carried out to verify that its original information content has been retained and no bias has been introduced during the filtering steps. Materials and Methods Variant quality filtering The entire dataset comprises animals from the VarGoats collection, which weresequenced, at 15X coverage on an Illumina HiSeq X sequencer (Denoyelle et al, 2021), and animals from public databases. In Table 1 , the list of breeds included in the dataset is given for each species, together with the number of animals per breed. The 1,372 animals were mapped to the ARS1 C. hircus genome (GCA_001704415.1) with the BWA-MEM software (v0.7.X). Variants were called with GATK -HaplotypeCaller ( McKenna et al ., 2010 ), then, preliminary quality filters were applied using VQSR (keeping the 99.9 to 100.00 tranche), selecting variants with GATK quality >100 and retaining only biallelic SNPs, resulting in a dataset of 77,280,295 SNPs (Denoyelle et al, 2021). Outlier animals were identified through the analysis of a subset of SNPs included in the Goat_IGGC_65K_v2 array ( https://www.goatgenome.org/projects.html#50K_snp_chip2 ) extracted for the VCF file, after quality control on the markers. Observed heterozygosity (H O ), PCA and Admixture were used to detect outlier individuals. For the H O for each breed the individuals with a level of H O above Q3 + 1.5xIQR or below Q1 - 1.5xIQR were considered outliers, and removed from the dataset before performing further quality filtering steps on VCF files. Subsequent simulations with different cutoffs were run to evaluate filtering effects on the number of surviving variants ( Suppl. Figure 2 ) and on breed representation in the dataset. The mean depth (DP) >6, mean genotype quality (GQ) >20, and call rate (CR) >0.9 filters were applied with BCFtools ( Danecek et al ., 2021 ), and 28,645,747 SNPs were retrieved. Outliers that had previously been removed for filtering cutoffs definition were newly added to the dataset. Most of them were considered mislabelled and taken into consideration for relabeling. Phasing was then performed with Beagle 5.3 with standard parameters, and unphased SNPs were removed. The resulting dataset encompasses 28,321,956 quality filtered, biallelic, phased SNPs (28M dataset). Download figure Open in new tab Supplementary Figure 2. Plot of the effects of filtering according to genotype quality (GQ) and call rate on the number of surviving variants. Relabeling and subsampling The identification of misbehaving samples and their eventual relabeling were based on the evaluation of genotype data at a subset of autosomal SNPs from the Illumina GoatSNP60 BeadChip ( Tosser-Klopp et al., 2014 ). A subset of 59929 SNPs present in chromosomes 1-29 and MT in the Illumina 60K was extracted from the VCF files and compared with the same markers typed with the 60K array over the same individuals. The resulting raw datasets were both filtered by MAF 0.10. Animals with concordance rate between extracted SNPs and SNP chip data below 70% were marked and considered mislabeled in one of the two datasets. The first 5 principal components (PCs, about 15% of explained variance in total) were used in PCA analysis, and animals were marked as outliers in each PC if exceeding the 1 st or 3 rd quartile of more than IQR*1.5 (Inter Quartile Range, IQR). Animals classified as outliers in more than 60% of the cases were marked. An exploratory population structure analysis was carried out with ADMIXTURE software ( Alexander et al, 2009 ) with a five fold Cross-Validation error calculation, based on which K=8 was identified as the best fitting solution. In each of the eight clusters, animals were identified as outliers if exceeding the 1 st or 3 rd quartile of more than IQR*1.5. Animals classified as outliers in more than 60% of the cases were marked. Observed Heterozygosity H O was also calculated, finding a mean value of H O = 0.3477. Animals were marked if their H O was higher than 0.5392 or lower than 0.1561. Other clustering approaches as Neighbor-joining trees based on Allele Sharing Distance (ASD) were further used to check animals’ classification and mark as outliers individuals not clustering within their declared breed. When possible, animals were relabeled with the correct breed assignment according to the results obtained with the described evaluations, otherwise they were marked as unknown breed (UNK). A subsampling procedure was then performed to exclude subjects with problems related either to the quality of the genomic data or to the behavior with respect to the assigned breed in any of the control analyses, resulting in the removal of 15 animals from 6 underrepresented C. hircus country_breeds. After the subsampling step based on individual quality and breed assignment, BITE v2.0 ( https://doi.org/10.1101/181610 ) was used to select animals to be retained within those belonging to overrepresented country_breeds, setting at 8 the maximum number of animals per country_breed. Thinning procedures and filtered datasets validation The analyses required to select and test parameters to be applied for variant filtering were performed with plink v1.9 ( https://www.cog-genomics.org/plink/1.9/credits ) (--maf, –mac, –bp-space and –indep-pairwise parameters), VCFtools ( Danecek et al ., 2011 ), BCFtools and custom scripts. The GCTA software (Yang et al, 2011) was used to estimate the genetic relationship matrix (GRM) between pairs of individuals from a set of SNPs and perform the PCA analysis. A distant matrix was computed with plink v1.9 and used to prepare the Neighbor Joining tree with MEGA ( Kumar et al, 2008 ). The software ADMIXTURE was run with default parameters and five fold cross-validation for K from 2 to 8. Data Availability VarGoats sequence datasets are available at the NCBI BioProject db, accession PRJEB74076, PRJEB74075, PRJEB50463, PRJEB37507, PRJEB37841, PRJEB37276, PRJEB37122, PRJEB37208, PRJEB31857, PRJNA529091, PRJEB3134, PRJEB3135, PRJEB3136, PRJEB4371, PRJEB5166, and PRJEB5900. Use of these data is regulated by a data sharing agreement which is available here: http://www.goatgenome.org/vargoats_agreement.html . Acknowledgements We acknowledge Prof. J.A. (Hans) Lenstra from the Utrecht University for his contribution in the relabeling and subsampling procedures, and Dr. Benjamin D. Rosen and Dr. Mahesh Neupane from the Animal Genomics and Improvement Laboratory, USDA-ARS University, for phasing the 28M SNP dataset. Footnotes ↵ * shared first ↵ £ shared last Bibliography ↵ Alexander DH , Novembre J , Lange K ( 2009 ). Fast model-based estimation of ancestry in unrelated individuals . Genome Research , 19 : 1655 – 1664 . OpenUrl Abstract / FREE Full Text ↵ Andrews KR , Luikart G. Recent novel approaches for population genomics data analysis . Mol Ecol . 2014 Apr ; 23 ( 7 ): 1661 – 7 . doi: 10.1111/mec.12686 . OpenUrl CrossRef Web of Science Browning BL , Tian X , Zhou Y , and Browning SR ( 2021 ). Fast two-stage phasing of large-scale sequence data . Am J Hum Genet 108 ( 10 ): 1880 – 1890 . doi: 10.1016/j.ajhg.2021.08.005 . OpenUrl CrossRef PubMed ↵ Colli , L. , Milanesi , M. , Talenti , A. et al. Genome-wide SNP profiling of worldwide goat populations reveals strong partitioning of diversity and highlights post-domestication migration routes . Genet Sel Evol 50 , 58 ( 2018 ). doi: 10.1186/s12711-018-0422-x . OpenUrl CrossRef ↵ Danecek P , Auton A , Abecasis G , Albers CA , Banks E , DePristo MA , Handsaker R , Lunter G , Marth G , Sherry ST , McVean G , Durbin R and 1000 Genomes Project Analysis Group . 2011 . TheVariant Call Format and VCFtools . Bioinformatics . ↵ Danecek P , Bonfield JK , Liddle J , Marshall J , Ohan V , Pollard MO , Whitwham A , Keane T , McCarthy SA , Davies RM , Li H. 2021 . Twelve years of SAMtools and BCFtools . GigaScience , Volume 10 , Issue 2 , February 2021, giab008 , doi: 10.1093/gigascience/giab008 . OpenUrl CrossRef PubMed Denoyelle et al. , Genet Sel Evol . 2021 Nov 8; 53 ( 1 ): 86 . doi: 10.1186/s12711-021-00659-6 . OpenUrl CrossRef PubMed ↵ Dokan K , Kawamura S , Teshima KM . Effects of single nucleotide polymorphism ascertainment on population structure inferences . G3 (Bethesda) . 2021 Sep 6; 11 ( 9 ): jkab128 . doi: 10.1093/g3journal/jkab128 . OpenUrl CrossRef ↵ Dunning AM , Durocher F , Healey CS , Teare MD , McBride SE , Carlomagno F , Xu CF , Dawson E , Rhodes S , Ueda S , Lai E , Luben RN , Van Rensburg EJ , Mannermaa A , Kataja V , Rennart G , Dunham I , Purvis I , Easton D , Ponder BA . The extent of linkage disequilibrium in four populations with distinct demographic histories . Am J Hum Genet . 2000 Dec ; 67 ( 6 ): 1544 – 54 . doi: 10.1086/316906 . OpenUrl CrossRef PubMed Web of Science Groenen , M.A.M. A decade of pig genome sequencing: a window on pig domestication and evolution . Genet Sel Evol 48 , 23 ( 2016 ). doi: 10.1186/s12711-016-0204-2 OpenUrl CrossRef PubMed ↵ Hayes BJ , Daetwyler HD . 1000 Bull Genomes Project to Map Simple and Complex Genetic Traits in Cattle: Applications and Outcomes . Annu Rev Anim Biosci . 2019 Feb 15; 7 : 89 – 102 . doi: 10.1146/annurev-animal-020518-115024 . Epub 2019 Dec 3. PMID: 30508490 . OpenUrl CrossRef PubMed ↵ Hemstrom W , Grummer JA , Luikart G , Christie MR . Next-generation data filtering in the genomics era . Nat Rev Genet . 2024 Nov ; 25 ( 11 ): 750 – 767 . doi: 10.1038/s41576-024-00738-6 . OpenUrl CrossRef PubMed ↵ Jasielczuk I , Gurgul A , Szmatoła T , Semik-Gurgul E , Pawlina-Tyszko K , Szyndler-Nędza M , Blicharski T , Szulc K , Skrzypczak E , Bugno-Poniewierska M. Comparison of linkage disequilibrium, effective population size and haplotype blocks in Polish Landrace and Polish native pig populations . Livestock Science , Volume 231 , 2020 , 103887 , ISSN 1871-1413 , doi: 10.1016/j.livsci.2019.103887 . OpenUrl CrossRef ↵ Lachance J , Tishkoff SA . SNP ascertainment bias in population genetic analyses: why it is important, and how to correct it . Bioessays . 2013 Sep ; 35 ( 9 ): 780 – 6 . doi: 10.1002/bies.201300014 . OpenUrl CrossRef PubMed Web of Science ↵ Li , X. , Yang , J. , Shen , M. et al. Whole-genome resequencing of wild and domestic sheep identifies genes associated with morphological and agronomic traits . Nat Commun 11 , 2815 ( 2020 ). doi: 10.1038/s41467-020-16485-1 . OpenUrl CrossRef PubMed ↵ Lucek K , Willi Y ( 2021 ) Drivers of linkage disequilibrium across a species’ geographic range . PLoS Genet 17 ( 3 ): e1009477 . doi: 10.1371/journal.pgen.1009477 . OpenUrl CrossRef ↵ Kumar S , Nei M , Dudley J , Tamura K ( 2008 ). MEGA: A Biologist-Centric Software for Evolutionary Analysis of DNA and Protein Sequences . Briefings in Bioinformatics 9 : 299 – 306 . OpenUrl CrossRef PubMed Web of Science Larson WA , Isermann DA , Feiner ZS . Incomplete bioinformatic filtering and inadequate age and growth analysis lead to an incorrect inference of harvested-induced changes . Evol Appl . 2020 Sep 12; 14 ( 2 ): 278 – 289 . doi: 10.1111/eva.13122 . OpenUrl CrossRef PubMed ↵ McKenna A , Hanna M , Banks E , Sivachenko A , Cibulskis K , Kernytsky A , Garimella K , Altshuler D , Gabriel S , Daly M , DePristo MA . The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data . 2010 . Genome Research 20 : 1297 – 303 . OpenUrl Abstract / FREE Full Text ↵ Paulene S Pineda , Ester B Flores , Lilian P Villamor , Connie Joyce M Parac , Mehar S Khatkar , Hien To Thu , Timothy P L Smith , Benjamin D Rosen , Paolo Ajmone-Marsan , Licia Colli , John L Williams , Wai Yee Low , 1000 Buffalo Genomes Consortium, Disentangling river and swamp buffalo genetic diversity: initial insights from the 1000 Buffalo Genomes Project , GigaScience , Volume 13 , 2024 , giae053 , doi: 10.1093/gigascience/giae053 . OpenUrl CrossRef ↵ Qanbari S. On the Extent of Linkage Disequilibrium in the Genome of Farm Animals . Front Genet . 2020 Jan 17; 10 : 1304 . doi: 10.3389/fgene.2019.01304 . OpenUrl CrossRef PubMed ↵ Rossi M , Bitocchi E , Bellucci E , Nanni L , Rau D , Attene G , Papa R. Linkage disequilibrium and population structure in wild and domesticated populations of Phaseolus vulgaris L . Evol Appl . 2009 Nov ; 2 ( 4 ): 504 – 22 . doi: 10.1111/j.1752-4571.2009.00082.x OpenUrl CrossRef PubMed ↵ Todd ET , Tonasso-Calvière L , Chauvey L , Schiavinato S , Fages A , Seguin-Orlando A , Clavel P , Khan N , Pérez Pardal L , Patterson Rosa L , Librado P , Ringbauer H , Verdugo M , Southon J , Aury JM , Perdereau A , Vila E , Marzullo M , Prato O , Tecchiati U , Bagnasco Gianni G , Tagliacozzo A , Tinè V , Alhaique F , Cardoso JL , Valente MJ , Telles Antunes M , Frantz L , Shapiro B , Bradley DG , Boulbes N , Gardeisen A , Horwitz LK , Ö ztan A , Arbuckle BS , Onar V , Clavel B , Lepetz S , Vahdati AA , Davoudi H , Mohaseb A , Mashkour M , Bouchez O , Donnadieu C , Wincker P , Brooks SA , Beja-Pereira A , Wu DD , Orlando L. The genomic history and global expansion of domestic donkeys . Science . 2022 Sep 9; 377 ( 6611 ): 1172 – 1180 . doi: 10.1126/science.abo3503 . Epub 2022 Sep 8. PMID: 36074859 . OpenUrl CrossRef PubMed ↵ Plassais , J. , Kim , J. , Davis , B.W. et al. Whole genome sequencing of canids reveals genomic regions under selection and variants influencing morphology . Nat Commun 10 , 1489 ( 2019 ). doi: 10.1038/s41467-019-09373-w . OpenUrl CrossRef PubMed ↵ Shafer ABA , Peart CR , Tusso S , Maayan I , Brelsford A , Wheat CW , Wolf JBW . Bioinformatic processing of RAD-seq data dramatically impacts downstream population genetic inference . Methods in Ecology and Evolution Volume 8 , Issue 8 , August 2017 , Pages 907 – 917 . OpenUrl ↵ Tosser-Klopp G , Bardou P , Bouchez O , Cabau C , Crooijmans R , Dong Y , Donnadieu-Tonon C , Eggen A , Heuven HC , Jamli S , Jiken AJ , Klopp C , Lawley CT , McEwan J , Martin P , Moreno CR , Mulsant P , Nabihoudine I , Pailhoux E , Palhière I , Rupp R , Sarry J , Sayre BL , Tircazes A , Jun Wang , Wang W , Zhang W ; International Goat Genome Consortium. Design and characterization of a 52K SNP chip for goats . PLoS One . 2014 Jan 22; 9 ( 1 ): e86227 . doi: 10.1371/journal.pone.0086227 . Erratum in: PLoS One. 2016 Mar 24;11(3):e0152632. doi: 10.1371/journal.pone.0152632 OpenUrl CrossRef PubMed Yang et al. ( 2011 ). GCTA: a tool for Genome-wide Complex Trait Analysis . Am J Hum Genet . 88 ( 1 ): 76 – 82 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 15, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A tailored variant filtering procedure for multi-breed and multi-species unbalanced animal SNP collections Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A tailored variant filtering procedure for multi-breed and multi-species unbalanced animal SNP collections Barbara Lazzari , Marco Milanesi , Andrea Talenti , Arianna Bionda , Yefang Li , Lin Jiang , Philippe Bardou , Gwenola Tosser Klopp , Paola Crepaldi , Licia Colli bioRxiv 2025.10.14.682050; doi: https://doi.org/10.1101/2025.10.14.682050 Share This Article: Copy Citation Tools A tailored variant filtering procedure for multi-breed and multi-species unbalanced animal SNP collections Barbara Lazzari , Marco Milanesi , Andrea Talenti , Arianna Bionda , Yefang Li , Lin Jiang , Philippe Bardou , Gwenola Tosser Klopp , Paola Crepaldi , Licia Colli bioRxiv 2025.10.14.682050; doi: https://doi.org/10.1101/2025.10.14.682050 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41910) Biophysics (21436) Cancer Biology (18576) Cell Biology (25480) Clinical Trials (138) Developmental Biology (13368) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40360) Molecular Biology (17163) Neuroscience (88534) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00