Patterns of population structure and genetic variation within the Saudi Arabian population

doi:10.1101/2025.01.10.632500

Patterns of population structure and genetic variation within the Saudi Arabian population

2025 · doi:10.1101/2025.01.10.632500

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 111,772 characters · extracted from preprint-html · click to expand

Patterns of population structure and genetic variation within the Saudi Arabian population | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Patterns of population structure and genetic variation within the Saudi Arabian population View ORCID Profile D.K. Malomane , M.P. Williams , Leqi Tian , Ji Tang , C.D. Huber , S. Mangul , M. Abedalthagafi , View ORCID Profile C. W. K. Chiang doi: https://doi.org/10.1101/2025.01.10.632500 D.K. 
Malomane 1 Center for Genetic Epidemiology, Department of Population and Public Health Sciences, Keck School of Medicine, University of Southern California , Los Angeles, CA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for D.K. Malomane For correspondence: kholo642{at}gmail.com matthew.williams{at}wallawalla.edu malak.althgafi{at}tuftsmedicine.org charleston.chiang{at}med.usc.edu M.P. Williams 2 Department of Biology, Pennsylvania State University, University Park , PA 3 Department of Biological Sciences, Walla Walla University, College Place , WA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: kholo642{at}gmail.com matthew.williams{at}wallawalla.edu malak.althgafi{at}tuftsmedicine.org charleston.chiang{at}med.usc.edu Leqi Tian 1 Center for Genetic Epidemiology, Department of Population and Public Health Sciences, Keck School of Medicine, University of Southern California , Los Angeles, CA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ji Tang 1 Center for Genetic Epidemiology, Department of Population and Public Health Sciences, Keck School of Medicine, University of Southern California , Los Angeles, CA Find this author on Google Scholar Find this author on PubMed Search for this author on this site C.D. Huber 2 Department of Biology, Pennsylvania State University, University Park , PA Find this author on Google Scholar Find this author on PubMed Search for this author on this site S. Mangul 4 Małopolska Centre of Biotechnology, Jagiellonian University , Kraków, Poland 5 Department of Computers, Informatics, and Microelectronics, Technical University of Moldova , Chisinau, 2045, Moldova 6 Department of Computational Biology, School of Digital Public Health, Mohamed bin Zayed University of Artificial Intelligence , Abu Dhabi, UAE 7 Department of Clinical Pharmacy, Alfred E. 
Mann School of Pharmacy and Pharmaceutical Sciences, University of Southern California , Los Angeles, CA 90089, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site M. Abedalthagafi 8 Department of Pathology and Laboratory Medicine, Tufts University School of Medicine , Boston, MA, United States Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: kholo642{at}gmail.com matthew.williams{at}wallawalla.edu malak.althgafi{at}tuftsmedicine.org charleston.chiang{at}med.usc.edu C. W. K. Chiang 1 Center for Genetic Epidemiology, Department of Population and Public Health Sciences, Keck School of Medicine, University of Southern California , Los Angeles, CA 9 Department of Quantitative and Computational Biology, University of Southern California , Los Angeles, CA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for C. W. K. Chiang For correspondence: kholo642{at}gmail.com matthew.williams{at}wallawalla.edu malak.althgafi{at}tuftsmedicine.org charleston.chiang{at}med.usc.edu Abstract Full Text Info/History Metrics Supplementary material Preview PDF ABSTRACT Both the demographic history and cultural practices influence the pattern of genetic variation, sometimes in opposing manners. For Saudi Arabians, being situated at the hub connecting multiple continents is expected to increase heterogeneity and diversity due to cross-continental migrations. On the other hand, cultural practices such as endogamy to promote social stability may have also promoted regional isolations and reduced heterogeneity. To better understand the genomic impact of these potentially opposing forces, we genotyped and sequenced 3,352 and 302 individuals, respectively, from Saudi Arabians to study their population structure and admixture history, and patterns of genetic variation. 
We identified twelve genetic sub-clusters that correlated with geographical regions, differentiated by distinct components of ancestry based on comparisons to modern and ancient DNA references. These sub-clusters also showed variation across ranges of the genome covered in runs of homozygosity, reflecting potential differences in kinship or marital practices, as well as differences in population size changes over time. Using 25,488,981 variants found in whole genome sequencing, we found that the Saudi do not show the depletion of rare alleles typically observed in isolated populations, though they do show the expected pattern of enrichment of alleles bioinformatically annotated as deleterious when compared to Africans/African Americans and Non-Finnish Europeans from gnomAD. Saudi sub-clusters with greater inbreeding and lower effective population sizes showed greater enrichment of deleterious alleles as well. Taken together, our results suggest that Saudi Arabia’s history and culture impact its pattern of genetic variation and, potentially, its population health. We also made available the allele frequency estimates of alleles discovered in our samples so as to lay a foundation on which to interpret medical- and pharmaco-genomic findings from these populations. INTRODUCTION Saudi Arabia is the largest country in the Arabian Peninsula (AP), the major hub that connects Africa, Asia and Europe. Despite their central location, Saudi Arabians have been relatively under-represented in large-scale genomic studies that aim to understand the genetic architecture of complex traits and diseases. The largest publicly available databases for genetic variation in this region comprise the 2,884 and 147 individuals with summarized exome and whole genome sequencing data, respectively, of “Middle Eastern” origin from gnomAD ( Karczewski et al. 2020 ; Chen et al. 2024) and 161 whole genome sequenced individuals across four countries from the Human Genome Diversity Panel ( Bergström et al. 
2020 )(though also see references ( Project Team 2015 ; Almarri et al. 2020 ; Mbarek et al. 2022 ) for other sequencing efforts). Generating the genomic data and characterizing its pattern is the first step towards understanding the genotype-phenotype relationships for these populations. Both demographic history and cultural practices / human customs can shape the pattern of genetic variation genome-wide. The genetic diversity of today’s Arabians is shaped by a complexity of ancestries from historic split and admixture events. The AP is considered one of the initial sites of historic human migration out of Africa (OOA), with presence of human footprints reported at least since 50 – 60 thousand years ago (kya) and as early as 85 – 120 kya ( Armitage et al. 2011 ; Fernandes et al. 2012 ; Henn et al. 2012 ; Rodriguez-Flores et al. 2016 ; Almarri et al. 2020 ). The AP is also hypothesized ( Lazaridis et al. 2014 ; Lazaridis et al. 2016 ; Ferreira et al. 2021 ) to be one of the most likely homelands of a hypothesized deeply diverged “ghost” ancestry, the Basal Eurasians, who are thought to have diverged from the non-Africans shortly after the OOA. This lineage is then thought to have remained isolated until experiencing a later admixture in the Middle East around 38kya ( Vallini et al. 2024 ) whereby Arabian and other present-day Middle Eastern populations are believed to have descended in-part from this population and carry a higher proportion of this ancestry ( Lazaridis et al. 2016 ; Ferreira et al. 2021 ). Given its hub location geographically, Arabians have also experienced series of admixtures, and the present-day Arabians could have shared ancestries with various groups including Africans, South Asians, Europeans, Levantines, and Iranians ( Fernandes et al. 2019 ; Almarri et al. 2020 ; Martiniano et al. 2024 ). For these geographical and historical reasons, one may expect Arabian genomes to show greater heterogeneity and diversity. 
On the other hand, despite the rich history of ancestries and being in a major geographical hub, for centuries the genetic pool of the Arab countries and the Greater Middle East (GME) have been greatly influenced and refined by mating practices. Arab countries have a high rate of endogamous and consanguineous marriages ( Tadmouri et al. 2009 ; Khayat et al. 2024 ), especially in Saudi Arabia with rates as high as 58% ( el-Hazmi et al. 1995 ; Ben Halim et al. 2013 ). These endogamous marriages preserve family structure and strengthen bonds, and can ensure cultural, religious, financial and social stability ( Bittles 2008 ; Ben Halim et al. 2013 ; Alkuraya 2014 ). Many of the consanguineous marriages are found between first cousins (e.g. 28.4% ( el-Hazmi et al. 1995 )), but are also found extended to members of the same or related tribal groups. Endogamy leads to regional genetic isolation and population substructure. A recent study analyzing the population structure of Saudi Arabia showed a signature of tribal stratification within the population ( Mineta et al. 2021 ). Co-inheritance of recessively-acting alleles with functional consequences could increase the prevalence of genetic disorders, some of which have indeed been observed in Saudi Arabia ( Aleissa et al. 2022 ; Temaj et al. 2022 ). While in the long run these deleterious recessive alleles may be exposed to purifying selection due to increased homozygosity ( Hedrick and Garcia-Dorado 2016 ; Delatycki et al. 2020 ), consanguinity and/or reproductive compensation have been suggested to counteract the effectiveness of purifying selection in endogamous populations ( Overall et al. 2002 ; Alsalem et al. 2013 ; Greater Middle East Variome Consortium et al. 2016 ; Sahoo et al. 2021 ). Therefore, the long-term isolation could lead to reduced heterogeneity and enrichment of bioinformatically-annotated deleterious alleles ( Ober et al. 1999 ; Castellano et al. 2014 ; Lohmueller 2014 ; Simons and Sella 2016 ). 
That is, cultural practices may counteract the impacts due to geography and ancestry, resulting in an abundance of deleterious alleles. Prior genomic studies of Saudi Arabia have described broad population structure and identified signatures of tribal stratification ( Mineta et al. 2021 ), but have not been able to address how that structure relates to the formation of the Saudi gene pool over deep time, nor how it translates into measurable genomic risk. We tackle both questions. By integrating whole-genome sequencing with ancient DNA from Epipaleolithic through Medieval Arabian Peninsula and surrounding populations, we reconstruct the ancestry components and admixture timing that shaped each Saudi sub-cluster, moving the field from pattern description to mechanistic history. By characterizing allelic architecture using three independent functional annotation tools across 25,488,981 variants, we directly quantify the net impact of admixture and endogamy on deleterious variant burden, a question not yet addressed by genomics studies in Saudi Arabians. Finally, by identifying nearly 2.5 million variants absent from gnomAD, 19.63% of which are predicted deleterious, we demonstrate that current reference databases are systematically incomplete for this population in ways that matter clinically. By bridging the gap between deep-time demography and modern functional variation, this study provides a framework for understanding how the intersection of history and culture dictates the landscape of human disease. RESULTS Genetic substructure of Saudi Arabians We merged 3,352 genotyped Saudi individuals after quality control (see Methods ) with 302 whole genome sequencing (WGS) samples and explored the population structure based on 603,833 shared segregating sites. Principal component analysis (PCA) on the combined set revealed substantial structure within our sample population ( Figure S1 ), consistent with geographical isolations within the population. 
We defined discrete sub-populations for downstream comparisons and analysis by performing a clustering analysis, which suggested that twelve genetic sub-clusters within the Saudi population best fit the data ( Methods; Figure S2A ). We found that the 12 clusters corresponded to geographical regions within Saudi Arabia, with each cluster generally consisting of a majority of its members from a single geographical region (Central, West, North, South, or East) whether using harmonized regional labels or self-reported labels when available ( Methods ; Table S1, Figure 1A and Figure S2B ). The main exceptions are clusters 11 and 12, both of which consisted of individuals from multiple prevalent regions. In regard to their unique genetic structure, cluster5 from the Western region appeared to be most differentiated from the rest of the cohort ( Figure 1A ; also see PCs 6 and 7 in Figure S1 ). Multiple genetic clusters can be affiliated to the same geographic regions (e.g. clusters 2, 3, and 9 from the Central region; 4, 5, 7, and 10 from the Western region, etc.), reflecting the limited demographic resolution of our data, since we do not have access to the specific tribal affiliation of each participant due to privacy protections. Even though multiple tribes can inhabit a specific region and inter-tribal marriages within a region are expected to be limited ( Mineta et al. 2021 ), our definition of genetic clusters may not necessarily correspond to distinct or closely related tribes. Instead, they are used to reveal broad geographical and genetic patterns in our study. Download figure Open in new tab Figure 1. The genetic structure of Saudi Arabians and its relation to global populations. (A) A two-dimensional UMAP of Saudi Arabians based on the top 10 principal components. Each individual is colored based on the affiliated tribal region (see (D)). WGS samples did not have self-reported or harmonized tribal affiliation and are assigned their own color.
(B) PCA of Saudi Arabian clusters and HGDP populations. Saudi Arabians are grouped in a single group. Inset shows clusters 1 - 10 colored according to the most prevalent tribal region represented in the cluster (see Table S1 ). Because clusters 11-12 have no single dominating tribal region, they were assigned distinct colors. (C) Admixture analysis of Saudi Arabian clusters and HGDP populations for K = 4 (top) and K = 9 (bottom). ME – Middle Eastern, AFR – African, EA – East Asian, CSA – Central & South Asian, EUR – European, OC – Oceania, AMR – American. The names of Saudi clusters and HGDP populations are shown on the bottom X-axis. However, due to limited space some of the labels for smaller populations from HGDP are omitted. Grouped regional labels are shown on the top X-axis of plots. We show the admixture results of the Saudi clusters alone in Figure S3B . (D) A regional map of Saudi Arabia with matching colors to the regional labels in (A) and (B). Genetic ancestries that can increase diversity Situated at the crossroads of Africa, Eastern Eurasia and Western Eurasia, the Saudi Arabians may be expected to represent multiple components of ancestry and overall to exhibit greater genetic diversity than populations in Europe or Asia. To investigate the genetic and ancestral diversity, we compared the Saudi clusters to the populations from the Human Genome Diversity Panel (HGDP) ( Bergström et al. 2020 ). Consistent with previous reports ( Mineta et al. 2021 ; Elliott et al. 2022 ), the Saudi individuals clustered between Africans, Central & South Asians and Europeans and were the most distant to East Asians ( Figure 1B ). The Saudis’ genetic affinity towards African reference individuals, implied by a clinal distribution in the first two PCs, is largely driven by cluster12 and cluster3 ( Figure 1B inset), while cluster11 showed a mixed affinity towards Europeans, Africans and Central & South Asians.
The remaining clusters co-localized mainly with the Middle Eastern reference individuals from the HGDP panel ( Figure 1B ). Our observation from PCA is also corroborated by unsupervised ADMIXTURE analysis at two different resolutions of K (K = 4 and 9), where clusters 11 and 12 exhibited the highest levels of admixture ( Figure 1C , Figure S3A, S3B, and Table S2 ). At K = 4 where major continental ancestries relevant to Saudi Arabians are differentiated, the most dominating ancestry in the Saudi clusters was one largely shared with the HGDP Middle Eastern populations (Druze, Bedouin, Mozabite, and Palestinian; cyan ancestry in Figure 1C , top). Clusters 12, 11, and 3 had on average less than two-thirds of this Middle Eastern (ME)-like ancestry component and were enriched with African-like (red) and/or European-like (green) ancestries. At K = 9, the relationship between cluster11 and the Central & South Asians (CSA) that we observed on PCA can also be observed ( Figure 1C bottom, Table S3 ), where cluster11 carried more (average proportion = 0.223) of such CSA-like ancestry compared to other clusters (average proportions less than 0.1). To further elucidate Saudi genetic history and its ancestral components, we integrated whole genome sequencing data from 302 Saudi individuals with ancient DNA (aDNA) datasets (1240k AADR v62.0 ( Mallick et al. 2024 ); Methods ). Despite its importance, the Arabian Peninsula remains underrepresented in aDNA records with available samples limited to two locations: Bahrain in the Persian Gulf (four Tylos-period individuals, ∼2200–1400 BP; ( Martiniano et al. 2024 )) and Medieval Soqotra at the mouth of the Gulf of Aden (twelve individuals, ∼1300 BP; ( Sirak et al. 2024 )). Using D-statistic symmetry tests of cladality ( Methods ), we assessed whether each of the 12 Saudi sub-clusters are consistent with a model of population continuity with these ancient Arabian groups ( Figure S4, Table S4 ). 
Across all tests, we rejected a model of strict population continuity between ancient Arabian groups and present-day Saudi populations, finding instead a consistent pattern of African and Levantine-related ancestry input. This observation was corroborated through affinity D -statistics which showed that clusters 1–10 have increased genetic affinity to ancient Levantine populations (e.g., Syria_TellQarassa_Umayyad.SG, Israel_Natufian.AG, and Neolithic/EBA samples; Figure S5, Table S5 ) dating to the 7th and early 8th centuries, Epipaleolithic, and Neolithic to Early Bronze Age periods (∼1200 to 11,500 years ago). Moreover, outgroup f 3-statistics revealed all clusters, with the exception of cluster 12, showed maximal shared drift along a Levant-Anatolia-Mediterranean-Balkan axis ( Figure 2A , Figure S6, Table S6 ). These results suggest that since the Late Tylos period, the ancient Arabian gene pool received additional input from a population best represented by Neolithic or Early Bronze Age Levantine samples. Download figure Open in new tab Figure 2: Genetic affinities and admixture modeling of Saudi clusters. (A) Outgroup f3-statistics of the form f3(Han.DG; Saudi, WorldPop) mapping shared genetic drift between specific Saudi clusters c_5 and c_12 and global populations, stratified by temporal periods (years BP). Darker colors indicate higher shared drift. See Figure S6 for full results across Saudi clusters. (B) Admixture signals illustrated via admixture f3-statistics, demonstrating gene flow trajectories and affinities across the region. The color scale denotes the Z-score, with red lines indicating significant admixture. Ancient Bahrain MH1MH2 samples were used in admixture f3-statistics here; for admixture f3-statistics involving other ancient Arabian samples and Saudi clusters, see Figure S7 . (C, D) Genetic ancestry modeling using qpAdm , displaying the admixture weights based on more proximal (T2 in C) or deep (Distal in D) temporal source populations. 
Error bars denote +/- 1 standard error. Top two plausible models ranked by p-value for each Saudi cluster, if available, are shown from left to right. On top of the Levantine-related West Eurasian ancestry, several Saudi clusters exhibit a major sub-Saharan African ancestry component, represented in the ancient DNA dataset by Eastern and Northeastern African populations, including the Kenya-associated pastoralists, foragers, and their modern proxies (e.g. Kenya_KisimaFarmA5_PN, Kenya_HyraxHill_PN, Kenya_Kakapel_LIA, Kenya_Somali) as well as populations from the coastal / Great Lakes region (e.g. Tanzania_Lindi_Swahili, Tanzania_Pemba_600BP/1400BP, Uganda_Munsa_LIA) ( Figure S4 ). This pattern is most pronounced in cluster 12, followed by cluster 3. Outgroup f 3-statistics identify cluster 12 as a clear outlier in the estimated shared drift with ancient sub-Saharan African samples over at least the last 10,000 years ( Figure 2A ), and formal admixture f 3-statistics confirm the signal of a mixed ancient Arabian + sub-Saharan African ancestry in clusters 3, 8, 11, and 12 ( Figure 2B , Figure S7, Table S7 ). Taken together, the shared drift and admixture tests with ancient samples confirmed the strong affinity between some Saudi sub-clusters and sub-Saharan Africans inferred from modern reference populations described above ( Figure 1C ). Other than sub-Saharan African ancestry, cluster 11 exhibits an additional distinct pattern. Its affinity D-statistics profile shifts away from an exclusively Levantine-related ancestry toward a mixed Anatolian and European Neolithic/Mediterranean and CSA profile ( Figure S5 ). This is most evident in the admixture f 3-statistic, which reveal exclusive connections to Eastern European, CSA, and Steppe groups ( Figure 2B , Figure S7 ). These findings corroborate the European and CSA-like ancestry profile observed in ADMIXTURE analysis with modern reference samples described above ( Figure 1C ). 
We employed qpAdm to model the genetic ancestry of modern Saudi Arabians through a time-stratified lens whereby we grouped the available aDNA reference populations into thirteen broad geographical regions encompassing major regions in Europe, Caucasus, Levant, Anatolia, Arabia, and Africa ( Methods, Tables S8-S11 ). Our analysis successfully modeled the genetic ancestry profile of the Saudi Arabians at two different temporal strata (T2: 1,000-3,000 BP; Distal: ancestrally representative populations ∼ > 7,500 BP). In the proximal period to the present (T2; 1,000–3,000 BP), parsimonious genetic models for the Saudi clusters fall into two broad patterns. Seven clusters (c1, c2, c5, c6, c7, c9, c10) fit a simple two-source structure of ∼24–49% Arabian-related ancestry and ∼51–76% North African-related ancestry ( Figure 2C ). Within these models, the Late Tylos groups MH1-MH2 and MH3 appear most frequently as the Arabian source, with the Early-to-Middle Tylos sample observed less frequently. These results are consistent with affinity D-statistics ( Figure S5 ), where the North African aDNA sources appear to capture the Levantine-Anatolian and sub-Saharan components. For the remaining clusters, an additional ancestry component from Southern/Eastern Africa (c3, c4 and c11) and/or Europe (c3 and c11) was required to achieve a parsimonious fit ( Figure 2C ). Cluster12 initially failed to fit any tested qpAdm model with up to six sources and required a focused analysis ( Methods ) to identify a single plausible four-source model: Late Tylos MH3 (∼20%), Northern Africa (∼12%), Southern Africa (∼56%), and Southern Europe (∼12%). These results underscore the ancestral diversity and heterogeneity of cluster12, particularly its significant Sub-Saharan ancestry. Such complexity possibly derives from the fact that both cluster11 and cluster12 include individuals from multiple distinct geographical regions. 
For the most distant temporal stratum utilizing representative ancestral populations of available regions, a common baseline ancestry structure is shared among many clusters (c1, c3, c4, c5, c7, c8, c9, c10). This baseline is adequately modeled as a three-source mixture: Iranian Neolithic/Caucasus Hunter-Gatherer-related ancestry (up to ∼33%), Neolithic Anatolian Farmer via Greece and Crete-related ancestry (up to ∼30%), and Epipaleolithic Levantine ancestry via Israel Natufian (∼32-57%) ( Figure 2D ). These findings are consistent with the demographic modeling of Marchi et al. ( Marchi et al. 2022 ), wherein three meta-populations emerged following the Last Glacial Maximum (LGM) recolonization of the Middle East. Cluster12 departs from the three-source ancestry baseline by integrating a Morocco-Iberomaurusian-related component (∼46%) with Neolithic Anatolian (∼19%) and Iranian/Caucasus (∼35%) elements. This Iberomaurusian-related ancestry, which possesses distant genetic connectivity to Levantine Epipaleolithic Natufians, appears to also capture ancestry related to the hypothesized ‘Basal Eurasian’ lineage – a ghost lineage that diverged from the primary out-of-Africa lineage prior to Neanderthal introgression ( Lazaridis et al. 2014 ; Fregel et al. 2018 ; Van De Loosdrecht et al. 2018 ; Yang and Fu 2018 ). The ancestry of the three remaining Saudi clusters (c2, c6, c11) could not be explained by the baseline structure alone. Cluster11 required a four-source model, adding a European Hunter-Gatherer ancestry component – either Eastern (∼4%) or Western (∼8%) – to the three-source baseline ( Figure 2D ), a finding supported by admixture f3-statistics ( Figure 2B ; Figure S7 ). Conversely, clusters 2 and 6 incorporate varying proportions of a Pre-Pottery Neolithic Mesopotamian source (c2: < 1%; c6: ∼19%), which itself represents a composite of Levantine, Anatolian, and CHG/Zagros lineages ( Altınışık et al. 2022 ).
The observation with clusters 2 and 6 suggests that while the currently available distal sources best represent the broad ancestral diversity of the Saudi clusters during the Neolithic-Paleolithic periods, they are likely still imperfect proxies. Finally, previous research has identified a genetic component related to Basal Eurasian ancestry among present-day AP populations ( Fregel et al. 2018 ; Van De Loosdrecht et al. 2018 ). Using f4-statistics ( Methods ), we estimated the relative amount of Basal Eurasian ancestry using various ancient and present-day African outgroups and identified a latitudinal gradient where Basal Eurasian proportions decrease in the northern Middle East, placing most Saudi clusters alongside the southern Levant and Arabian Bedouins ( Figure S8A, Table S12 ). At the extreme of this gradient is cluster12, which possesses estimates comparable to present-day East Africans (e.g., from Eritrea and Ethiopia, and the Afar and Shaigi peoples). Whilst this outlier effect is more pronounced when using a present-day African source as the outgroup (e.g., Yoruba) rather than the ancient North African Basal Eurasian surrogate, Iberomaurusian ( Figure S8B, Table S12 ), distinguishing African admixture from putative Basal Eurasian ancestry remains a challenge. The social structure that can reduce genetic variation Genetic ancestries, preeminently driven by genetic similarity to African-related ancestries but also possibly by Basal Eurasian ancestries, could dictate the pattern of genetic variation of the Saudi Arabians. On the other hand, another major force not to be overlooked is cultural practice, such as endogamy, which establishes the social structure in this population and could impact the pattern of variation. We thus examined one of the genetic hallmarks of endogamy, runs of homozygosity (ROH). Comparing the number of ROH (NROH) vs. 
the sum total length of ROH (SROH), we found that the Saudi Arabians exhibit a diverse distribution of ROH between clusters and between the individuals ( Figure 3A and 3B ). Across Saudi clusters, the median SROH ranged from 38.12 Mb to 232.6 Mb, while the median NROH ranged from 42 to 150 ROHs. Clusters 12, 11, and 3 had the shortest mean length and smallest mean number of ROHs ( Figure 3A ), consistent with greater admixture from more diverse African ancestral populations ( Figure 1C ). Cluster5 had the highest burden of ROH, with highest average number and total length of ROH, reflecting a consequence of both long-term small effective population size and/or consanguinity ( Ceballos et al. 2018 ) ( Figure 3A ). Download figure Open in new tab Figure 3. Runs of homozygosity in Saudi Arabians. (A) Average total length and number of ROH per cluster. The numbers next to the symbol represents the mean ME-like ancestry proportion. (B) Total length and number of ROH per individual across the Saudi Arabian cohort. For (A) and (B), symbols are colored by the geographical region associated with each cluster ( Figure 1D ). (C) Total length of ROH vs ancestry proportion per individual stratified by three length classes of ROHs. Short ROHs indicate homozygosity from ancient or distant ancestry, i.e. background relatedness. Intermediate ROHs likely arise from background relatedness with moderate level of inbreeding from past few generations, often due to reduced population sizes or reproductive isolation (e.g. due to geographic or cultural preferences), or from recent bottlenecks followed by recovery. Long ROHs arise through recent inbreeding and are common in populations with high levels of consanguinity ( Pemberton et al. 2012 ; Thompson 2013 ; Ceballos et al. 2018 ). ROH – Runs of homozygosity, ME - Middle Eastern, EA – East Asia. We also followed a previous approach ( Pemberton et al. 2012 ) and divided the ROHs based on length into short, intermediate, and long classes. 
When classified by the sizes, we can observe that the overall pattern of NROH vs. SROH ( Figure 3B ) is driven by the long ROHs ( Figure S9 ), where contributions of SROHs are driven by fewer but longer ROHs in the long length class. Long ROHs tend to arise from recent inbreeding. The pattern of NROH and SROH across Saudi sub-clusters is also consistent for ROH of the short and intermediate length classes ( Figure S10A and S10B ), but varied for the long ROH class, implying a different pattern of recent consanguinity across sub-clusters in contrast to their shared ancient demographic events. This is consistent with the known consequences of consanguinity, which not only increases SROH but also increases the variance of SROH in a population ( Ceballos et al. 2018 ; Ceballos et al. 2021 ). When examined in light of estimated genetic ancestry (at the continental level, K = 4; Figure 1C ), the pattern of NROH vs. SROH showed a distinct relationship with the proportion of ME-like ancestries, with greater ME-like ancestry also showing greater ROH in length and number ( Figure 3A ). In fact, we found that SROH is positively correlated with ME-like ancestry proportions, and negatively correlated with the proportion of African, European and East Asian ancestries ( Figure S10C ). This observation is seen across length classes of ROHs, though more attenuated for long ROHs ( Figure 3C ). We reasoned that this ancestry effect is again likely reflecting the commonly practiced endogamous marriages and recent consanguinity associated with the ME-like ancestry. Isolation by endogamy may also be reflected in persistent small population sizes. We leveraged the dense marker information from the 302 WGS individuals to reconstruct genome-wide genealogies and infer the population size trajectories within the Saudi sub-clusters ( Figure 4 ). 
We found that all clusters experienced similar history through the out-of-Africa bottleneck (∼100 kya), followed by a recovery to a local maximum in effective population size (Ne) 10-20 kya, a period consistent with the early Holocene Wet Phase / Holocene Humid Period, characterized by wet conditions which resulted in expansion of lakes and rivers and extensive grasslands ( Petraglia et al. 2020 ). Following the Holocene period, sub-clusters began to diverge around 6-10kya, coinciding with the Arabian aridification, which is responsible for the desert conditions in most of the Arabia as we know it today ( Petraglia et al. 2020 ; Martiniano et al. 2024 ). Clusters with less ME-like ancestries and stronger signature of admixture (such as clusters 12, 11, 8, & 3), showed less severe decline in Ne compared to those that have high ME-like ancestry. Cluster5 in particular, showed the most severe bottleneck and remained low in Ne in the recent times. Cluster5 appears to resemble the pattern of the tribe labelled as T25 in a previous study ( Mineta et al. 2021 ); both originated from the Western region showing the highest level of inbreeding within the respective study. T25 is said to have been subjected to strict intratribal marriages. Such social practices can indeed result in persistent small Ne as observed here, as well as our observed pattern in ROH ( Figure 3 ). Download figure Open in new tab Figure 4. Population size trajectories between the Saudi Arabian sub-clusters. Effective population sizes were computed from genealogical trees using RELATE (see Methods ). The number of samples per cluster used for the estimates can be found in Table S1 . Allelic architecture of Saudi Arabians Genetic variation in Saudi Arabian WGS data Having investigated extensively the genetic signature of ancestry, admixture history, and endogamy in the Saudi, we next sought to determine how these demographic and cultural factors influence the patterns of genetic variation in Saudi. 
There are potentially two competing effects: first, significant contributions from African-related and Basal Eurasian ancestries, at least in some sub-clusters, would increase genetic diversity and facilitate more effective purifying selection. Conversely, culturally embedded practice of endogamy is expected to decrease genetic diversity, reduce effective population sizes, and relax the effect of purifying selection, potentially leading to the enrichment of functionally deleterious alleles. There is empirical evidence of observing an excess of functionally deleterious alleles in isolated populations ( Lim et al. 2014 ; Pedersen et al. 2017 ; Locke et al. 2019 ). Driven by bottlenecks, these populations exhibit two hallmark signatures of their allelic architecture: the paucity of rare alleles and the enrichment of deleterious alleles at intermediate frequencies. The pattern has not been examined among populations like the Saudi Arabians, where both admixtures and the long-standing practice of endogamy could both affect the allelic architecture. Here we leverage the whole genome sequences (WGS) from 302 Saudi individuals to investigate the net impact of these opposing forces on pattern of variation. In total, 25,488,981 autosomal variants were called and retained after quality control (QC) ( Methods ). We first compared allelic frequency spectra and allelic homozygosity in the Saudi Arabians (all WGS individuals) to the Middle Eastern population in gnomAD (gnomAD-MID). The two have relatively similar patterns in the genome-wide alternative allele frequency spectra though Saudi had proportionally slightly fewer common variants ( Figure S11A ). The allele frequencies are highly concordant (r = 0.98) between the two populations ( Figure S11B ), but Saudi Arabians have approximately 2x more homozygous genotypes than gnomAD-MID (e.g. 
an average of 20% and 10% of the genotypes are homozygous for variants with alternative allele frequency > 5% in Saudi and gnomAD-MID, respectively; Figure S11A and S11C ). The higher proportion of homozygous variants confirms that the Saudi and the gnomAD-MID population are not reflective of the same underlying populations. However, because the frequency spectra and correlation of allele frequencies are highly similar ( Figure S11 ), we thus utilize both samples to compare the pattern of variation with gnomAD African/African Americans (gnomAD-AFR) and non-Finnish Europeans (gnomAD-EUR) to better understand the impact of the unique history in the Arabian Peninsula on its current pattern of variation. Distribution of functionally deleterious variants We annotated the variants using three different annotation tools: VEP (v.110) ( McLaren et al. 2016 ), AlphaMissense ( Cheng et al. 2023 ), and Genomic Pre-trained Network (GPN) ( Benegas et al. 2023 ). AlphaMissense predicts the pathogenicity of missense variants while GPN predicts the deleteriousness for both coding and non-coding variants. The distribution of the variants by functional classes are shown in Table S13 . Of the called variants, 2,459,950 (9.7%) variants were not previously identified in gnomAD v4.1 ( Karczewski et al. 2020 ; Chen et al. 2024) and thus are potentially novel or Saudi-specific. We refer to these variants as the “previously unknown variants”, or PUVs. As expected, the PUVs are highly enriched with rare alleles ( e.g. 83% of them are singletons in our dataset, compared to 32% singletons among the known variants; Figure S12 ). In addition, proportionally more PUVs (19.63%) were annotated to be deleterious than known ones found in gnomAD (7.2%). This implies that the PUVs are not just sequencing errors distributed randomly across the genome, but are enriched for rare variants of functional consequences that are maintained in the Saudi population. 
We then compared the allelic architecture of functionally deleterious alleles in the Saudi population to other continental populations from gnomAD. Compared to gnomAD-AFR individuals, the Saudi tend to show proportionally more deleterious alleles than those annotated to be benign or neutral across algorithms ( Figure 5A ), particularly for variants up to ∼5% frequency. Overall, relative to gnomAD-AFR, between the 0.5 - 5% frequency, we found a 13% proportional increase of deleterious (likely pathogenic) alleles annotated by AlphaMissense in the Saudi Arabians compared to 7% proportional decrease of the benign alleles ( P < 0.01; Figure 5A ). When annotated by VEP and GPN, at the same frequency range, we observed a consistent pattern i.e. a 3% proportional increase in loss-of-function variants in the Saudi Arabians compared to 10% proportional decrease in neutral (synonymous) ones ( P < 0.01) by VEP, and an 11% proportional increase in the first percentile of alleles by deleteriousness compared to 3% proportional decrease in the 99 th percentile (e.g. the most likely neutral) of alleles when annotated by GPN ( Figure 5A ). We also qualitatively replicated these patterns by comparing the exome samples from gnomAD-MID to gnomAD-AFR ( Figure S13 ), or by comparing Saudi to gnomAD-EUR ( Figure S14 ). Download figure Open in new tab Figure 5. Distribution of minor allele frequency across functional classes. (A) Ratio of Saudi to gnomAD-AFR variants. The sample size of gnomAD-AFR is based on downsampling to Saudi sample size, n = 302. (B) Ratio of Saudi cluster groupA to cluster groupB variants. The sample size of cluster groupB is based on downsampling to groupA sample size, n = 124. Variant functional consequences were annotated based on VEP (loss-of-function, missense, or synonymous variants), AlphaMissense (likely pathogenic, likely benign, and ambiguous), and GPN. AC and AF refer to allele count and allele frequency, respectively. 
AFg5 refers to allele frequency greater than 5%. Top_1p refers to variants with the top 1% of GPN scores (more deleterious) and Bottom_1p refers to variants with the bottom 1% of GPN scores (more neutral). AFR denotes the gnomAD-AFR sample. LOF refers to Loss of function. ** and * denote frequency bins with significant difference between the most deleterious (red) and most neutral (green) through bootstrapping at p < 0.01 and < 0.05, respectively. We also compared the enrichment of deleterious alleles between Saudi sub-clusters. Because of the smaller number of individuals within each cluster having WGS data ( Table S1 ), we grouped the clusters into two groups: groupA which contained clusters with greater amount of SROH and lower effective population sizes (clusters 2, 4, 5, 6, 9, and 10), and groupB which showed less SROH and higher effective population sizes (clusters 12, 11, 3, and 8). We left out cluster1 from this analysis as it tends to fall in the middle of the two groups. GroupA had generally fewer number of variants compared to groupB ( Figure S15 ), consistent with its lower genetic diversity, smaller effective population sizes, and greater SROH, and also showed greater enrichment of deleterious alleles ( Figure 5B ). Notably, across these analyses and particularly when comparing Saudi or gnomAD-MID to gnomAD-AFR or gnomAD-EUR, we did not observe one of the hallmark signatures often seen in other isolated populations – the paucity of rare alleles. In general, both the Saudi and gnomAD-MID had more rare alleles ( e.g. variants with alternative allele counts of 2 or 3) than the gnomAD-AFR population, potentially due to the admixture history in the Arabian Peninsula, such as through the Basal Eurasian ancestry. Consistent with this hypothesis, we found a correlation between measures of Basal Eurasian ancestry (based on the f4-ratio, Methods ) and observed heterozygosity per individual (R 2 = 0.612; Figure S16A ). 
However, a similar trend is also observed between estimated African-like ancestries (based on K = 4, Figure 1C ) and heterozygosity ($R^2$ = 0.814; Figure S16B ). Both a linear mixed-model and nested model comparison testing the association of Basal Eurasian ancestry and heterozygosity were not significant after accounting for African-like ancestries (LMM P = 0.519, LRT P = 0.5; Figure S16C-E ). However, together, these ancestry predictors explained a large proportion of the variance in heterozygosity across the dataset (Marginal $R^2$ ∼80.5%). This explained variance was almost entirely driven by the African-like ancestry component (Partial $R^2$ = 0.698), a finding consistently observed even if we iteratively left one Saudi cluster out of the analysis at a time ( Figure S17 ). Importantly, because Basal Eurasian ancestries and African-like ancestries estimated here are moderately collinear (variance inflation factor (VIF) = 3.62), it remains unclear whether the excess of rare alleles in Saudi compared to gnomAD-AFR populations is driven by elevated Basal Eurasian ancestry or differences between Eastern- and Western-African-like ancestries. DISCUSSION Scholars have long emphasized the complex demographic histories shaping the genetic architecture of populations across the Arabian Peninsula and have called for improved characterization to better understand regional genetics and health implications ( Charati 2021 ; Elliott et al. 2022 ). Situated at the crossroads of Africa and Eurasia, Arabian populations are expected to exhibit elevated heterozygosity due to extensive intercontinental interactions. Conversely, deeply rooted traditions of endogamy in Saudi Arabia promote homozygosity, with potential health consequences ( Sahoo et al. 2021 ; Aleissa et al. 2022 ). 
Here, we characterize the fine-scale population structure of Saudi Arabia using 3,352 genotyped and 302 whole-genome sequenced individuals sampled across the country, examining how genomic history and social structure jointly shape genetic variation and disease risk. Integration of our WGS data with published ancient genomes clarifies the formation of the contemporary Saudi gene pool. For most clusters (c1, c2, c5, c6, c7, c9, c10), we identified a recent deviation from ancient Arabian ancestry driven by the integration of Levantine/Anatolian-related components. This shift is reflected in both D-statistics ( Figure S4, Figure S5 ) and T2-period qpAdm modeling ( Figure 2C ), where the most parsimonious framework for many clusters involves a two-way mixture of T2_nAfrica (∼51–76%) and ancient Arabian groups (Late Tylos MH1, MH2, or MH3 ∼24–49%). The efficacy of North African samples during this period as a proxy is explained by its inclusion of ancient Egyptian ( Schuenemann et al. 2017 ) and Kulubnarti Nubian ( Sirak et al. 2021 ) samples, both of whom harbor significant shared ancestry with Levantine Near Eastern populations. In the case of the Kulubnarti Nubians, this is thought to reflect continuous gene flow along the Nile corridor from 490 BCE to 850 CE. Superimposed on this base model, African-related admixture distinguishes cluster12 and, to a lesser extent, clusters 3, 8, and 11 from other Saudi clusters. Previous studies attributed African ancestry in Arabia primarily to Bantu-speaking sources from Eastern or Southern Africa, dating to 400–1754 years ago ( Hellenthal et al. 2014 ; Fernandes et al. 2019 ; Almarri et al. 2020 ), consistent with the Arab slave trade. However, the Central, Eastern, and Southern African affinities observed here suggest additional contributions linked to 17th–20th century Red Sea and trans-Saharan slave trades, when Eastern (e.g. Ethiopia, Eritrea, Somalia, and Sudan) and Central (e.g. 
Chad and Congo Basin) African regions were major sources of enslaved individuals ( Miran 2022 ). These historical processes are reflected in elevated effective population sizes ( Figure 4 ) and reduced ROH burdens ( Figure 3 ) in the most admixed Saudi clusters. Against this backdrop of intercontinental mobility, persistent consanguineous and endogamous practices have maintained substantial reproductive isolation among clans and tribes, generating genetically distinct sub-clusters in close geographic proximity, consistent with previous reports ( Mineta et al. 2021 ). Although recent admixture with Africans, Europeans, and Central/South Asians may continue, its spread is likely constrained by social structure, as suggested for the neighboring Emirati populations ( Elliott et al. 2022 ). Endogamy and consanguinity increase runs of homozygosity (SROH), elevating health risks. Consistent with high consanguinity rates reported in Madinah and the Western region ( El-Mouzan et al. 2007 ), we observe pronounced signatures of endogamy in Western clusters (clusters 4, 5, 7, and 10), including longer ROHs, reduced effective population sizes, and limited admixture ( Figure 3 ). These practices likely counteract the diversity-enhancing effects of admixture, promoting reduced diversity and enrichment of functionally consequential alleles. Demographic history is known to influence the pattern of variation, particularly for deleterious variation. Classical bottleneck models predict depletion of rare variants and enrichment of low-frequency deleterious alleles due to drift. While this pattern is typical in isolated European populations ( Wang et al. 2014 ; Pedersen et al. 2017 ; Locke et al. 2019 ), Saudi genomes display a distinct profile. We detect an excess of rare variants (AF < 1%) relative to gnomAD-AFR, possibly reflecting heterogeneous African-related ancestries (e.g. 
from Eastern Africa) and/or contributions from Basal Eurasian lineages uncommon outside of Arabia, though we were unable to disentangle between the two due to their collinearity ( Figure S16 ). Because both lineages harbor ancestry that predates or bypassed the primary bottleneck of the out-of-Africa expansion, distinguishing them requires either ancient Arabian genomes from the period of Basal Eurasian isolation, which do not yet exist, or reference panels that better resolve Eastern versus Western African ancestry components; the latter is more tractable. Beyond the excess of rare variants, deleterious alleles are nonetheless enriched within the 0.5–5% frequency range. This pattern likely reflects the combined effects of historical bottlenecks and persistent endogamy. Although increased homozygosity may theoretically facilitate purging, reduced effective population sizes limit the efficiency of purifying selection. Subgroups with higher endogamy show greater enrichment of deleterious alleles than more admixed clusters ( Figure 5B ), consistent with findings from other bottlenecked populations ( Lohmueller et al. 2008 ; Lim et al. 2014 ; Locke et al. 2019 ) and supporting prior evidence of limited genetic purging in Saudi Arabia (Greater Middle East Variome Consortium et al. 2016). We emphasize that sub-clusters were defined based on genetic similarity and used as analytical units due to limited tribal affiliation data. In admixed populations, such clustering may obscure finer-scale structure, particularly when distinct Arabian-origin groups share external admixture sources. Although our clusters broadly correlate with geography and show partial concordance with previously described indigenous tribal groupings ( Mineta et al. 2021 ), they should be interpreted as genetically similar groups rather than direct representations of tribal lineages. 
Now with a better understanding of the genetic history of Saudi Arabians, future studies incorporating improved demographic metadata and ancestral reference panels may refine these inferences using ancestry-specific approaches ( Moreno-Estrada et al. 2013 ; Browning et al. 2016 ; Tang and Chiang 2025 ). These limitations notwithstanding, collectively, our results demonstrated that Saudi Arabia’s demographic history has shaped its contemporary genetic landscape with potential implications for health. The persistence of endogamy and consanguinity continues to elevate genetic risk and remains common ( Warsy et al. 2014 ; Albanghali 2023 ). Public health initiatives, including mandatory premarital screening and genetic counseling ( Saffi and Howard 2015 ; Delatycki et al. 2020 ; Aleissa et al. 2022 ), have increased awareness and informed reproductive decision-making. Although such programs have not substantially reduced at-risk marriages, they have contributed to reductions in affected births through prenatal diagnosis and related interventions ( Saffi and Howard 2015 ; Albanghali 2023 ; Khayat et al. 2024 ). Continued education and socioeconomic development may further influence the prevalence of consanguineous unions, which remain more common in rural and less-educated communities ( Tadmouri et al. 2009 ). Finally, our study helps address the persistent under-representation of Arabian populations in global genomic datasets (Greater Middle East Variome Consortium et al. 2016; Almarri et al. 2020 ; Elfatih et al. 2024 ; Oleksyk et al. 2025 ). In gnomAD, individuals of Middle Eastern origin comprise only 0.2% of genomes and 0.38% of exomes, compared to substantially higher representation of European and African populations (e.g. 44.6% and 27.3% of genomes). This imbalance limits clinical knowledge and application of regionally enriched variants (Greater Middle East Variome Consortium et al. 
2016) and reduces the accuracy of polygenic prediction and imputation in Arabian populations ( Thareja et al. 2021 ; Cahoon et al. 2024 ). Notably, 9.7% of the variants identified here are previously unreported and disproportionately predicted to be deleterious. Expanding representation of Middle Eastern populations in reference panels such as gnomAD and TOPMed is therefore essential ( Oleksyk et al. 2025 ). Acknowledging that public data sharing requires careful consideration of national and community interests, we provide allele frequencies for 25,488,981 high-quality variants to enhance genomic resources for this underrepresented population. METHODS Data collection, processing and quality control For all studied samples, written informed consent was obtained from each participant, all of whom were above 18 years of age. The human subjects for this study were derived from a comprehensive collection of Institutional Review Board (IRB)-approved research protocols focused on the genetics of various diseases and control groups. These protocols, approved under the Saudi Genome Project Satellite site, include approval numbers 16-300, 16-310, and 20-211, and were reviewed and approved by the IRBs at King Abdulaziz City for Science and Technology (KACST) and King Fahad Medical City (KFMC). Tribal affiliations reported by some participants were anonymized and referenced solely by their geographical locations. In compliance with Saudi privacy legislation and the protection of human subject confidentiality, the sharing of raw genotyping and clinical data is restricted. Access to this data requires prior approval from the Saudi National Bioethics Committee. 
Array data Sample collection, genotyping and quality control A total of 3,752 samples were collected in Saudi Arabia between the years 2017 - 2020 as control individuals for various projects, such as the GenOMICC International project and covid19 host genetics consortium studies ( COVID-19 Host Genetics Initiative et al. 2022 ; Pairo-Castineira et al. 2023 ). Individuals were genotyped on the Axiom Genome-wide CEU 1 Array including customized variants following the manufacturer’s specifications for sample preparation, including whole genome amplification, fragmentation, denaturation, and hybridization. Genome-wide SNP genotyping was performed using the automated, high-throughput GeneTitan system from Affymetrix. We filtered individuals with sample call rates < 0.9 using PLINK v1.9 ( Chang et al. 2015 ) on each plate individually before merging the autosomal SNPs across the different plates, resulting in a merged set of 757,790 SNPs. We removed duplicates and non-biallelic variants, retaining 703,986 SNPs. We then filtered SNPs with greater than 10% missing rate and SNPs that did not pass Hardy Weinberg Equilibrium (HWE) test ( $P < 10^{-6}$ ) using PLINK, resulting in a total of 606,349 SNPs for analysis. We lifted over the genomic coordinates from human reference genome hg19 to hg38. We phased the data using Beagle v5.2 ( Browning et al. 2021 ). Removing close relatives and filtering out outliers Using the 3,752 Saudi samples and 606,349 SNPs, we pruned the dataset by linkage disequilibrium (LD) (using the command --indep-pairwise 50 5 0.8 in PLINK), resulting in 547,307 SNPs to estimate individuals’ relatedness using King v2.2.5 ( Manichaikul et al. 2010 ). We removed twins (or duplicated individuals) as well as first degree relatives, retaining 3,403 samples. 
Furthermore, we performed PCA and performed two iterations of outlier removal (outliers defined as being > 6 standard deviation (SD) away from the mean in any of the first 10 PCs), resulting in 3,352 samples left for further analyses. Defining and imputing the samples’ regional affiliation Due to privacy protection and ethical restrictions, we did not have access to the specific tribal name of each individual to help define the unit of analysis in our study or help interpret our findings. Instead, we have limited information of the geographic region of origin of the participant, inferred based on their family name that is linked to geographical location. We thus aimed to use available regional geographical affiliations to validate and interpret the results of clustering based on genetic data. However, 82% of the individuals in our data (2,740 of the 3,352) did not have self-reported geographical information. We thus imputed such information using the software HARE (harmonized ancestry and race/ethnicity) package ( Fang et al. 2019 ) based on the available Self-identified Race/Ethnicity (SIRE) regional information of 612 individuals. SIRE in our data were derived from either self-report or individual’s family name that is presumed to reflect their tribal affiliation linked to a particular geographical location ( Table S1 ). The HARE package combines genetically inferred structure based on PCA with available SIRE information to train a support vector machine (SVM) classifier that could correct for potentially mislabeled SIRE and predict the race/ethnicity, in this case geographical regional label, for those individuals missing SIRE. We used the HARE to impute regional information of the samples missing a SIRE label in our dataset using the first 30 PCs as the input data. We used the highest predicted membership probability (L 1 ) labels to aid in the interpretation of the population sub-clusters that we infer from genetic data. 
Whole genome sequencing (WGS) data Sequencing information and processing In addition to the genotyped samples, 349 samples were whole-genome sequenced (WGS) to a targeted depth of 30x. The samples were prepared following the Illumina’s TruSeq Nano sample preparation protocol and sequenced on an Illumina HiSeq X-ten machine. The raw sequences were aligned against the human reference genome GRCh38 using the Burrows-Wheeler Aligner (BWA) version 0.7.10 ( Li and Durbin 2009 ). Picard tools version 1.117 was used to mark duplicates ( McKenna et al. 2010 ). All sample preparation, sequencing, sequence alignment, pre-processing, quality control before calling of variants and BAM file augmentation were performed by deCODE genetics ( https://www.decode.com ), and more detailed information on these steps is documented in Jónsson et al. ( Jónsson et al. 2017 ). Variant calling and filtering We merged the gVCFs of the 349 samples using CombineGVCFs in GATK ( Van Der Auwera et al. 2013 ) and subsequently performed a joint genotyping calling using GenotypeGVCFs. We performed variant quality score recalibration (VQSR) on the combined samples using VariantRecalibrator and ApplyVQSR in GATK ( Van Der Auwera et al. 2013 ). We supplied the homo sapiens reference assembly 38 (Homo_sapiens_assembly38.fasta) and used the following resources: HapMap III variants were used as training and truth sets with prior priority of 15, 1000G omni2.5 sites were used as training set with prior priority of 12, 1000G phase1 high confidence SNPs was used as training set with prior priority of 10 and the dbSNP138 as known SNPs with prior probability of 2. For the annotations, we included the QD, MQ, MQRankSum, ReadPosRankSum, FS and SOR. We used 99% sensitivity level to filter the SNPs. Quality control on samples and SNPs All 349 samples had a missing genotyping rate < 10%. We removed SNPs with a missing rate > 10% and 53,981 SNPs based on the HWE threshold ( $P < 10^{-6}$ ), leaving 26,781,476 SNPs. 
We removed non-biallelic sites which left 26,408,559 variants. Further filtering was applied on specific downstream analyses when appropriate. To exclude outliers in our samples, we merged the 349 WGS samples with our array data and the HGDP dataset at segregating SNPs shared across all datasets. A principal component analysis (PCA) was performed using PLINK and we used HARE to impute missing self-reported individual nationalities (e.g. self-identified nationality as Saudi or not). We excluded 8 samples which were not imputed as Saudi. We then removed monomorphic sites which were introduced by calling the variants including these potentially non-Saudi samples, leaving 25,488,981 variants. We filtered samples based on relatedness using King software v2.2.5 ( Manichaikul et al. 2010 ). To estimate relatedness, we randomly sampled 550,000 SNPs with minor allele frequency > 1% after LD pruning ( --indep-pairwise 50 5 0.5 using PLINK ). We removed 37 twins/duplicates and first-degree relatives. Using the PCA, we further removed 2 samples that appeared as extreme outliers (> 6 SD on any of the first 10 PCs), leaving 302 samples. Haplotype phasing was performed on the remaining 302 samples and 25,488,981 variants using Eagle v2.4.1 ( Loh et al. 2016 ). Annotation of variants We annotated the variants using the popular VEP (v.110) ( McLaren et al. 2016 ) as well as two recently published annotation tools, AlphaMissense ( Cheng et al. 2023 ) and Genomic Pre-trained Network (GPN) ( Benegas et al. 2023 ). The AlphaMissense only annotates missense variants and has three functional classes, “likely pathogenic”, “ambiguous” and “likely benign”. The GPN annotates all genomic variants and assigns a deleteriousness score to each variant in gnomAD (v3). We downloaded the pre-computed scores from https://huggingface.co/datasets/songlab/gnomad/resolve/main/test.parquet , accessed 2/9/2024. 
Merging of Saudi whole-genome-sequence data with ancient genomes We first back-converted the genomic coordinates of the Saudi whole-genome sequencing (WGS) data from the hg38 reference assembly to hg19 using LiftOver. We filtered the dataset using PLINK v1.9-beta7 (--snps-only, --geno 0, --allow-no-sex) to retain only biallelic SNPs with a 100% genotyping rate. This filtering step removed 725,421 sites containing missing data from the baseline dataset. We then converted the filtered dataset into EIGENSTRAT format and merged the Saudi WGS data with the Allen Ancient DNA Resource (AADR) v.62.0 Eigenstrat files with the Eigensoft mergeit function which merges two data sets into a third producing the union of the individuals and the intersection of the SNPs in the first two. We then merged this dataset with the Eigenstrat genotype data from Soqotra resulting in a final dataset of 1,032,250 SNPs and 18,017 individuals. We generated an additional dataset merging the Saudi WGS and Soqotra Eigenstrat genotype data with the AADR v.62.0 modern human origins dataset, retaining 506,776 SNPs and 22,333 individuals. A complete list of the individuals and their associated group labels for each of the downstream analyses can be found in Table S14 . Population genetic analyses We used the larger collection of Saudi genotyped samples to investigate the genetic substructure and historic admixtures of the population. We then utilized the high-density genome-wide marker information from the WGS data to investigate differences in genetic ancestries with aDNA, population size trajectories, and allelic architecture of functional variants within the social structure of the Saudi population. Evaluation of population structure and identifying discrete population clusters We merged the fully filtered array and WGS datasets, based on segregating markers. We performed PCA followed by UMAP ( McInnes et al. 2018 ; Diaz-Papkovich et al. 
2019 ) to combine the first 10 PCs and reduce them to two dimensions in order to explore the population structure. Based on the UMAP results, we assigned individuals to subpopulations using K-means clustering from the R package stats . To determine the optimal number of K clusters, we used the Average Silhouette Width (ASW), which is a popular and trusted method to produce quality clustering ( Rousseeuw 1987 ; Batool and Hennig 2021 ). The ASW uses values between -1 and 1 to measure how similar an object is to others within its cluster relative to objects in different clusters, with higher numbers representing a better fit and appropriateness of clustering. Likewise, a high ASW value corresponds to an optimal number of K clusters for partitioning a particular set of objects. Based on ASW, we determined that K = 12 is the optimal value that best fits the data, although K = 5 or 9 could be equally sensible ( Figure S18 ). We validated this clustering by evaluating the concurrence between the clusters and the tribal region assignments. We used these clusters as representative of the social structure and also used them in the whole genome sequencing samples to evaluate patterns of genetic diversity within the Saudi population. Analysis of ancestry components We conducted the unsupervised admixture analysis using ADMIXTURE software v1.3 ( Alexander et al. 2009 ). We conducted 10 independent runs of admixture analysis for each K and retained the run with maximum likelihood. We used the cross-validation procedure, implemented in the program, to identify the best number of ancestral populations K which fits our data ( Figure S3C ). Saudi and ancient genome analyses We focused on investigating the genomic history and genetic ancestry of the Saudi, leveraging the available ancient DNA data in the AP and surrounding regions. We utilized multiple formulations of the D and f -statistics (both f 3 and f 4 statistics) as well as admixture modeling from qpAdm. 
All of these analyses were performed using the functions in the admixtools R package. D-statistics We computed two configurations of D-statistics to assess genetic affinity and symmetry of Saudi clusters to four ancient Arabian populations. We filtered results to include only statistics based on at least 50,000 SNPs. For both analyses, the parameters for the qpdstat function include f4mode = FALSE, boot = FALSE and allsnps = TRUE. We performed two specific configurations of the D-statistic: D-symmetry: To test for cladality, we computed statistics of the form D(Test, SaudiCluster; WorldPop, Karitiana.DG), where “Test” represented either ancient Bahrain or Soqotra, “WorldPop” comprised diverse ancient and present-day Eurasian and African populations, and for the outgroup population we selected Karitiana, an indigenous Brazilian population following Sirak et al. ( Sirak et al. 2024 ). Significantly negative D-statistic (Z-score < -3) estimates indicate greater shared drift between WorldPop and Saudi clusters relative to the ancient Arabian Test populations, while significantly positive values demonstrate the inverse relationship, with either result rejecting the hypothesis of population continuity. D-affinity: To identify populations sharing excess genetic drift with Saudi clusters relative to ancient Arabia, we computed statistics of the form D(WorldPop, Test; SaudiCluster, Karitiana.DG). Significantly negative D-statistics indicate that Saudi clusters share more excess drift with the Arabian Test population than with WorldPops. Conversely, significantly positive values denote the inverse: greater shared drift between Saudi clusters and WorldPops. f 4-statistics for Basal Eurasian ancestries We used two approaches to estimate Basal Eurasian ancestry. First, we used the f 4-statistic of form f 4(TestPops, Han.DG, Russia_UstIshim_IUP_snpAD.DG, Outgroup) and implemented this via the qpdstat function with f4mode = TRUE. 
The TestPops included the Saudi clusters and a selection of present-day Middle Eastern, Arabian, and East African populations, while Yoruba.DG, Saharawi.DG, Mozabite.DG, and Morocco_Iberomaurusian.AG served as the outgroups ( Figure S8A, S8B ; Table S12 ). Second, on an individual level within the Saudi clusters we modeled the relationship between African and Basal Eurasian ancestry and genomic diversity using a linear mixed-effects model (LMM) and a nested model comparison via likelihood ratio test. For these model comparisons, Basal Eurasian ancestry proportions were estimated in each Saudi individual using f4-ratio statistics via the admixtools2 package, with the ratio defined as f4(Saudi, WHG; UstIshim, Kostenki14) / f4(Iberomaurusian, WHG; UstIshim, Kostenki14). These estimates were merged with African ancestry components (K=4 in ADMIXTURE analysis) and observed heterozygosity (O_HET) calculated from phased whole-genome sequencing (WGS) data via vcftools. Prior to modeling, both continuous ancestry predictors (f4-ratio and K4) were z-score standardized (mu = 0, sigma = 1) to allow for direct comparison of their effect sizes (standardized beta coefficients). Linear Mixed-Effects Modeling To model the relative impact of Basal and African ancestries while accounting for Saudi substructure, we fitted a Linear Mixed-Effects Model (LMM) using the lme4 and lmerTest packages. Ancestry components were treated as fixed effects, and Saudi cluster membership was included as a random intercept. P-values for the fixed effects were estimated using Satterthwaite’s degrees of freedom method. To assess the impact of correlation between the two ancestries we calculated the Variance Inflation Factor (VIF) using the car package in R. 
We evaluated model performance using Nakagawa and Schielzeth’s approach to partition variance into Marginal R 2 and Conditional R 2 where the former estimates the proportion of total variance explained exclusively by the fixed ancestry effects, while the latter measures the variance explained by the entire model (fixed ancestries + random clusters). To evaluate the unique contribution of each ancestry, we calculated the Partial R 2 via nested model comparison. This was achieved by subtracting the Marginal R 2 of a reduced model (lacking the predictor of interest) from the Marginal R 2 of the full model. This metric isolates the percentage of heterozygosity variance strictly attributable to a specific ancestry, independent of the other. Finally, we adjusted for between-cluster variance by computing within-cluster partial correlations using the ppcor package. Here we residualized the observed heterozygosity, f4-ratio Basal Eurasian ancestry estimates and ADMIXTURE African ancestry estimate against only the Saudi cluster random intercept and calculated the Pearson correlation between these residuals. To formally test whether the addition of Basal Eurasian or African ancestry significantly improved the model’s fit, we performed Likelihood Ratio Tests (LRT) using an Analysis of Variance (anova) in R. The significance of model improvement was derived from a chi-squared distribution of the difference in deviance (-2 times the log-likelihood) between the Base Model (one ancestry) and the Full Model (two ancestries). We additionally evaluated the Akaike Information Criterion (AIC) to compare relative model quality. f 3 -statistics We computed two forms of f 3 -statistics, the outgroup f 3 and admixture f 3 , using the Admixtools2 qp3pop function to further investigate genetic affinity between Saudi clusters and ancient samples. 
To quantify the shared genetic drift between Saudi clusters and diverse set of 1698 Eurasian and African populations dating between the present-day and 50 thousand years ago, we computed the outgroup f 3 -statistic of form f 3 (Han.DG; SaudiCluster, WorldPop), where the Han.DG population served as the outgroup to anchor the comparison. Higher f 3 -statistic values in this configuration indicate increased shared genetic drift between the Saudi cluster and the test population relative to the outgroup. We merged the statistical results with metadata to associate each test population with its mean sample age (BP) and geographic coordinates. We generated geospatial heatmaps using ggplot2, plotting test populations on a world map background. Point colors representing f 3 -statistic magnitude were mapped using the “Zissou1” continuous palette from the wesanderson package. To analyze temporal trends, the data were stratified into four time periods (T1: 10,000 BP) and faceted by Saudi cluster. To test whether Saudi clusters are the result of admixture between specific ancient Arabian populations and diverse global groups, we computed the admixture f 3 statistic of form f 3 (SaudiCluster; Test, WorldPop), where “Test” was either ancient Bahrain or Soqotra, and “WorldPop” included diverse Eurasian and African populations. Negative f 3 statistic values with significant Z-scores (Z < -3) were interpreted as evidence that the Saudi cluster is a mixture of lineages related to the Ancient Arabian Source and the WorldPop Population. We visualized significant admixture signals using “flight-path” maps generated with ggplot2, sf, and rnaturalearth. For the subset of four target Saudi clusters that produced significant f 3 statistic values (c3, c8, c11, and c12), we plotted curved lines connecting the fixed Ancient Arabian Source to the Global Population. 
Ancestry Modeling with qpAdm We performed temporal qpAdm admixture modeling to reconstruct the ancestry of Saudi clusters through time and space. To form a set of consistent candidate source populations for qpAdm analyses we utilized a series of initial outgroup f 3 and PCA analyses. Specifically, we repeated our outgroup f 3 -statistic test as described above with a pooled Saudi population consisting of individuals from each of the 12 clusters. From the f 3 -statistic test results we stratified the Test populations into three time periods (T1 = 3000 BP) and the following 13 global regions: Eastern Europe, Western Europe, Southern Europe, Caucasus, Levant, Iran, Anatolia, Arabia, Central Africa, Western Africa, Northern Africa, Southern Africa, and Eastern Africa. Within each time period, we merged into meta-population source for downstream qpAdm analysis those populations that had similar outgroup f 3 -statistic estimates at the top of the range and also displayed evidence of clustering in principal component analysis (PCA) space ( Figure S19, Table S14 ). PCA analysis was performed with the smartpca function in EIGENSOFT v.7.2.0 using the parameter file options, numoutlieriter: 0, numoutlierevec: 0, numoutevec: 10, newshrink: YES, hiprecision: YES, and lsqproject: YES, and projecting the ancient genomes genotyped on the Affymetrix Axiom Genome-Wide Human Origins 1 array (“HO” AADR v.62.0) onto 70 global modern populations and the Saudi cluster individuals. The list of meta-population sources derived from AADR can be found in Table S14 , with their age distributions visualized in Figure S19 . We then performed qpAdm across four distinct time periods: the three proximal time-stratified models (T3, T2, and T1) plus a distal model (regional samples older than T3). For all models, we utilized the parameters allsnps = TRUE and fudge_twice = TRUE. 
Distal Modeling For the Distal model, we employed a “rotation” strategy where no populations were fixed in the reference (right) set. Instead, we rotated a pool of nine source populations between the target (left) and reference (right) positions. The rotating sources were: CHG, Morocco_Iberomaurusian.AG, Greece_Crete_Aposelemis_N.AG, Turkey_Central_Boncuklu_PPN.AG, Turkey_Southeast_Cayonu_PPN.SG, Iran_GanjDareh_N.AG, Israel_Natufian.AG, WHG, and EEHG. We tested all possible combinations of 1 to 5 sources. Proximal Modeling (T3, T2, T1) For the time-stratified models, we used a fixed set of outgroup (right) populations: CHG, Morocco_Iberomaurusian.AG, Greece_Crete_Aposelemis_N.AG, Iran_GanjDareh_N.AG, Turkey_Southeast_Cayonu_PPN.SG, WHG, EEHG, Israel_Natufian.AG, and Turkey_Central_Boncuklu_PPN.AG. We performed a combinatorial testing of 1 to 6 sources from specific candidate pools for each period. Due to the challenge of controlling for qpAdm model violations resulting from left and right-set admixture ( Flegontova et al. 2025 ), especially with modern target and ancient source populations ( Agranat-Tamir et al. 2020 ), we restricted our proximal modeling to a nonrotating protocol using the following sources: T3 Candidate Sources: T3_Caucasus, T3_Iran, T3_wEurope, T3_Levant, T3_Anatolia, T3_sAfrica, T3_eAfrica, T3_nAfrica, T3_eEurope, T3_cAfrica, and T3_sEurope. T2 Candidate Sources: T2_nAfrica, T2_Anatolia, T2_Iran, T2_eAfrica, T2_sAfrica, T2_Caucasus, T2_wEurope, T2_Levant, T2_eEurope, T2_sEurope, and three ancient Arabian sources (T2_Arabia_EMTylos_SeleucidCharacene.SG, T2_Arabia_LTylos_Sasanian_MH1MH2.SG, T2_Arabia_LTylos_Sasanian_MH3.SG). T1 Candidate Sources: T1_Iran, T1_eAfrica, T1_Caucasus, T1_sEurope, T1_sAfrica, T1_Levant, T1_eEurope, T1_cAfrica, T1_wEurope, T1_Anatolia, T1_nAfrica, and T1_Arabia. We filtered results to retain only plausible models, defined as those with valid admixture weights [0-1] and p-value ≥ 0.01. 
For each target population, we selected the two plausible representative models based on the highest p-values and fewest sources ( Figure 2C-D ). Error bars representing standard errors were included for all admixture components. In addition to the qpAdm model p -value, we also reported the p -value testing whether the difference between two models of rank difference 1 is significant (nested p -value) in Tables S8-S11 . Only the T2 and the Distal modeling produced plausible models based on these criteria, suggesting that in T1 and T3 we may not have adequate ancestry references for modeling. Runs of homozygosity ROH are continuous segments of homozygous genotypes inherited from a common ancestor ( Ceballos et al. 2018 ). Following Choudhury et al. ( Choudhury et al. 2020 ), we used the PLINK --homozyg option to identify runs of homozygosity (ROH) using the following parameters: we considered at least 100 SNPs for ROH, with a total length ≥ 100 kilobases and at least one SNP per 50 kb on average; we set a scanning window to contain 100 SNPs, allowed 1 heterozygous call and 5 missing calls per scanning window. We used a three-component Gaussian mixture model from the Mclust package (v.6.1) in R following Pemberton et al. ( Pemberton et al. 2012 ) to classify the ROHs into short ( 1671 kb) sizes. Short ROHs indicate homozygosity from ancient or distant ancestry, i.e. background relatedness. Intermediate ROHs likely arise from background relatedness with a moderate level of inbreeding from the past few generations, often due to reduced population sizes or reproductive isolation (e.g. due to geographic or cultural preferences), or from recent bottlenecks followed by recovery. Long ROHs arise through recent inbreeding and are common in populations with high levels of consanguinity ( Thompson 2013 ; Ceballos et al. 2018 ). 
Demographic history Utilizing the phased WGS data, we estimated effective population sizes at different time points within the Saudi sub-clusters using RELATE v1.1.9 ( Speidel et al. 2019 ). We used the RelateFileFormats in the Relate package to convert files from VCF format into haps/sample file format. For ancestral allele flipping, we provided RELATE with the human ancestor sequences release 107. We computed the genealogical trees using the parameters -m 1.25e-8 -N 30,000 and subsequently used the EstimatePopulationSize.sh script provided with the Relate package to estimate the effective population sizes. To lessen the bias in inference due to excessive amount of ROH, we performed the analysis by randomly selecting one phased haplotype per individual and ran Relate in haploid mode. Enrichment of functionally deleterious alleles We compared the allelic architecture between the Saudi Arabian and the gnomAD v4 African/ African American (gnomAD-AFR), non-Finnish European (gnomAD-EUR) and Middle Eastern (gnomAD-MID) populations. To check for potential enrichment or purging of deleterious alleles in the Saudi, we computed the ratio of the proportional site frequency spectra for the deleterious alleles in Saudi to gnomAD-AFR or gnomAD-EUR, and contrasted it to the same ratio based on neutral or benign alleles. Utilizing the gnomAD exomes, which have a larger number of Middle Easterners compared to the genomes, we also made comparisons between the Middle Easterners and gnomAD-AFR and gnomAD-EUR. Significant differences in the ratios between variant functional classes were tested through bootstrapping. For every comparison between populations or subpopulations, we used the hypergeometric distribution in R (v 3.6.2) to down-sample both populations to equal sample sizes. All exome comparisons were down-sampled to gnomAD-MID sample size. 
To account for technical differences in data generation of WGS call sets between gnomAD and Saudi data, we used the proportions of variants from the normalized allele frequency spectra rather than number of variants to compare the ratio between the Saudi and the gnomAD populations at a given allele count or frequency bin. However, when comparisons were made between two gnomAD populations or between two Saudi subpopulations, the actual numbers of variants were used. DATA AVAILABILITY In compliance with Saudi privacy legislation and the protection of human subject confidentiality, access to individual and clinical data requires prior approval from the Saudi National Bioethics Committee. The Saudi variants discovered through WGS and their estimated allele frequencies are deposited in the Figshare repository and can be accessed through this link: https://doi.org/10.6084/m9.figshare.28059686.v1 and the array allele frequencies estimated per Saudi sub-cluster can be accessed via https://doi.org/10.6084/m9.figshare.28280060 . AUTHOR CONTRIBUTIONS M.A. and C.W.K.C. conceived the study. D.K.M., M.P.W. and C.W.K.C. designed the analysis. C.W.K.C., S.M., and M.A. acquired funding for the data generation and analysis in this study. M.A. performed sample acquisition and data generation and processing. D.K.M., M.P.W., L.T., and J.T. analyzed the data. J.T. and C.D.H. provided analysis tools and resources. D.K.M., M.P.W., M.A., C.D.H. and C.W.K.C. interpreted the results. D.K.M., M.P.W. and C.W.K.C. wrote the manuscript with input from all co-authors. DECLARATION OF INTERESTS The authors declare no competing interests. SUPPLEMENTAL INFORMATION Word document: Supplementary Figures S1 – S19 Word document: Supplementary Tables S1 – S3, S13 Excel spreadsheet: Supplementary Tables S4-S12, S14 ACKNOWLEDGMENTS We are very grateful to the study participants who donated the samples used in the study. 
We would like to thank all the volunteers over the years for their invaluable contributions to this research. Special thanks are extended to Albandari Alowayn, Rasha Aljelaify, Mariam AlSaeed, Amal Almutairi, Fatimah Alqubaishi, and Ebtehal AlSolme for their assistance with sample collection and processing, as well as to Hadeel Elbardisy and Junghyun Jung for their efforts in data processing and preparation. We thank Michael Campbell and Arun Durvasula for providing feedback on the drafted manuscript, and Rui Leite Portela Martiniano for providing us with the Bahrain ancient DNA data. This study was supported by the National Institute of General Medical Sciences (NIGMS) of the National Institutes of Health (NIH) under award number R35GM142783 (to C.W.K.C.), the National Institute of Allergy and Infectious Diseases (NIAID) of the NIH under the award number R01AI173172 (to S.M.), and the National Science Foundation under the award number 2135954 (to S.M.). This study was also partially funded by King Abdulaziz City for Science and Technology (KACST) (to M.A.), as part of various international genomics health research initiatives, conducted under approved agreements between KACST and the Karolinska Institute, University of Southern California, Brigham and Women’s Hospital, and deCODE Genetics. M.A. has served as the principal investigator for the Saudi Genome Project, funded by KACST, and as the director of a satellite site at King Fahad Medical City between 2016-2023. We also extend our gratitude to deCODE Genetics and the KACST Genotyping and Sequencing Facilities/Saudi Genome Project for their technical support. Computation for this work was supported by University of Southern California’s Center for Advanced Research Computing ( https://www.carc.usc.edu ). Funder Information Declared National Institute of General Medical Sciences (NIGMS) of the National Institute of Health (NIH) , R35GM142783 (to C.W.K.C.) 
National Institute of Allergy and Infectious Diseases, https://ror.org/043z4tv69 , R01AI173172 (to S.M.) National Science Foundation , 2135954 (to S.M.) King Abdulaziz City for Science and Technology (KACST) (to M.A.) Footnotes we have largely re-framed the manuscript to be more precise and included additional analyses. REFERENCES ↵ Agranat-Tamir L , Waldman S , Martin MAS , Gokhman D , Mishol N , Eshel T , Cheronet O , Rohland N , Mallick S , Adamski N , et al. 2020 . The Genomic History of the Bronze Age Southern Levant . Cell 181 : 1146 – 1157 .e11. OpenUrl CrossRef PubMed ↵ Albanghali MA . 2023 . Prevalence of Consanguineous Marriage among Saudi Citizens of Albaha, a Cross-Sectional Study . Int J Environ Res Public Health 20 : 3767 . OpenUrl PubMed ↵ Aleissa M , Aloraini T , Alsubaie LF , Hassoun M , Abdulrahman G , Swaid A , Eyaid WA , Mutairi FA , Ababneh F , Alfadhel M , et al. 2022 . Common disease-associated gene variants in a Saudi Arabian population . Ann Saudi Med 42 : 29 – 35 . OpenUrl CrossRef PubMed ↵ Alexander DH , Novembre J , Lange K . 2009 . Fast model-based estimation of ancestry in unrelated individuals . Genome Res . 19 : 1655 – 1664 . OpenUrl Abstract / FREE Full Text ↵ Alkuraya FS . 2014 . Genetics and genomic medicine in Saudi Arabia . Mol Genet Genomic Med 2 : 369 – 378 . OpenUrl PubMed ↵ Almarri MA , Haber M , Lootah RA , Hallast P , Turki SA , Martin HC , Xue Y , Tyler-Smith C. 2020 . The Genomic History of the Middle East . Available from: http://biorxiv.org/lookup/doi/10.1101/2020.10.18.342816 ↵ Alsalem AB , Halees AS , Anazi S , Alshamekh S , Alkuraya FS. 2013 . Autozygome Sequencing Expands the Horizon of Human Knockout Research and Provides Novel Insights into Human Phenotypic Variation.Gojobori T, editor . PLoS Genet 9 : e1004030 . OpenUrl CrossRef PubMed ↵ Altınışık NE , Kazancı DD , Aydoğan A , Gemici HC , Erdal ÖD , Sarıaltun S , Vural KB , Koptekin D , Gürün K , Sağlıcan E , et al. 2022 . 
A genomic snapshot of demographic and cultural dynamism in Upper Mesopotamia during the Neolithic Transition . Sci. Adv . 8 : eabo3609 . OpenUrl CrossRef PubMed ↵ Armitage SJ , Jasim SA , Marks AE , Parker AG , Usik VI , Uerpmann H-P . 2011 . The Southern Route “Out of Africa”: Evidence for an Early Expansion of Modern Humans into Arabia . Science 331 : 453 – 456 . OpenUrl Abstract / FREE Full Text ↵ Batool F , Hennig C . 2021 . Clustering with the Average Silhouette Width . Computational Statistics & Data Analysis 158 : 107190 . OpenUrl ↵ Ben Halim N , Ben Alaya Bouafif N , Romdhane L , Kefi Ben Atig R , Chouchane I , Bouyacoub Y , Arfa I , Cherif W , Nouira S , Talmoudi F , et al. 2013 . Consanguinity, endogamy, and genetic disorders in Tunisia . J Community Genet 4 : 273 – 284 . OpenUrl CrossRef PubMed ↵ Benegas G , Batra SS , Song YS . 2023 . DNA language models are powerful predictors of genome-wide variant effects . Proc. Natl. Acad. Sci. U.S.A . 120 : e2311219120 . OpenUrl CrossRef PubMed ↵ Bergström A , McCarthy SA , Hui R , Almarri MA , Ayub Q , Danecek P , Chen Y , Felkel S , Hallast P , Kamm J , et al. 2020 . Insights into human genetic variation and population history from 929 diverse genomes . Science 367 : eaay5012 . OpenUrl Abstract / FREE Full Text ↵ Bittles AH . 2008 . A Community Genetics Perspective on Consanguineous Marriage . Public Health Genomics 11 : 324 – 330 . OpenUrl ↵ Browning BL , Tian X , Zhou Y , Browning SR . 2021 . Fast two-stage phasing of large-scale sequence data . The American Journal of Human Genetics 108 : 1880 – 1890 . OpenUrl CrossRef PubMed ↵ Browning SR , Grinde K , Plantinga A , Gogarten SM , Stilp AM , Kaplan RC , Avilés-Santa ML , Browning BL , Laurie CC . 2016 . Local Ancestry Inference in a Large US-Based Hispanic/Latino Study: Hispanic Community Health Study/Study of Latinos (HCHS/SOL) . G3 Genes|Genomes|Genetics 6 : 1525 – 1534 . 
OpenUrl ↵ Cahoon JL , Rui X , Tang E , Simons C , Langie J , Chen M , Lo Y-C , Chiang CWK . 2024 . Imputation accuracy across global human populations . The American Journal of Human Genetics 111 : 979 – 989 . OpenUrl CrossRef PubMed ↵ Castellano S , Parra G , Sánchez-Quinto FA , Racimo F , Kuhlwilm M , Kircher M , Sawyer S , Fu Q , Heinze A , Nickel B , et al. 2014 . Patterns of coding variation in the complete exomes of three Neandertals . Proc. Natl. Acad. Sci. U.S.A . 111 : 6666 – 6671 . OpenUrl Abstract / FREE Full Text ↵ Ceballos FC , Gürün K , Altınışık NE , Gemici HC , Karamurat C , Koptekin D , Vural KB , Mapelli I , Sağlıcan E , Sürer E , et al. 2021 . Human inbreeding has decreased in time through the Holocene . Current Biology 31 : 3925 – 3934 .e8. OpenUrl CrossRef PubMed ↵ Ceballos FC , Joshi PK , Clark DW , Ramsay M , Wilson JF . 2018 . Runs of homozygosity: windows into population history and trait architecture . Nat Rev Genet 19 : 220 – 234 . OpenUrl CrossRef PubMed ↵ Chang CC , Chow CC , Tellier LC , Vattikuti S , Purcell SM , Lee JJ . 2015 . Second-generation PLINK: rising to the challenge of larger and richer datasets . Gigascience 4 : s13742-015 – 0047–0048 . OpenUrl CrossRef ↵ Charati H . 2021 . Patterns of Genetic Structure and Evidence of Gene Flow between Arabian Peninsula and European Populations . AJBSR 12 : 285 – 291 . OpenUrl Chen S , Francioli LC , Goodrich JK , Collins RL , Kanai M , Wang Q , Alföldi J , Watts NA , Vittal C , Gauthier LD , et al. 2024a . A genomic mutational constraint map using variation in 76,156 human genomes . Nature 625 : 92 – 100 . OpenUrl CrossRef PubMed ↵ Cheng J , Novati G , Pan J , Bycroft C , Žemgulytė A , Applebaum T , Pritzel A , Wong LH , Zielinski M , Sargeant T , et al. 2023 . Accurate proteome-wide missense variant effect prediction with AlphaMissense . Science 381 : eadg7492 . 
OpenUrl CrossRef PubMed ↵ Choudhury A , Aron S , Botigué LR , Sengupta D , Botha G , Bensellak T , Wells G , Kumuthini J , Shriner D , Fakim YJ , et al. 2020 . High-depth African genomes inform human migration and health . Nature 586 : 741 – 748 . OpenUrl CrossRef PubMed ↵ COVID-19 Host Genetics Initiative , COVID-19 Host Genetics Initiative , Leadership , Pathak GA , Karjalainen J , Stevens C , Neale BM , Daly M , Ganna A , Writing group , et al. 2022 . A first update on mapping the human genetic architecture of COVID-19 . Nature 608 : E1 – E10 . OpenUrl CrossRef PubMed ↵ Delatycki MB , Alkuraya F , Archibald A , Castellani C , Cornel M , Grody WW , Henneman L , Ioannides AS , Kirk E , Laing N , et al. 2020 . International perspectives on the implementation of reproductive carrier screening . Prenatal Diagnosis 40 : 301 – 310 . OpenUrl PubMed ↵ Diaz-Papkovich A , Anderson-Trocmé L , Ben-Eghan C , Gravel S. 2019 . UMAP reveals cryptic population structure and phenotype heterogeneity in large genomic cohorts.Tishkoff SA, editor . PLoS Genet 15 : e1008432 . OpenUrl CrossRef PubMed ↵ Elfatih A , Saad C , The Qatar Genome Program Research Consortium , Qatar Genome Project Management , Ismail S , Al-Muftah W , Badji R , Darwish D , Fadl T , Yasin H , et al. 2024 . Analysis of 14,392 whole genomes reveals 3.5% of Qataris carry medically actionable variants . Eur J Hum Genet 32 : 1465 – 1473 . OpenUrl CrossRef PubMed ↵ Elliott KS , Haber M , Daggag H , Busby GB , Sarwar R , Kennet D , Petraglia M , Petherbridge LJ , Yavari P , Heard-Bey FU , et al. 2022 . Fine-Scale Genetic Structure in the United Arab Emirates Reflects Endogamous and Consanguineous Culture, Population History, and Geography.Heyer E, editor . Molecular Biology and Evolution 39 : msac039 . OpenUrl PubMed ↵ El-Mouzan MI , Al-Salloum AA , Al-Herbish AS , Qurachi MM , Al-Omar AA . 2007 . Regional variations in the prevalence of consanguinity in Saudi Arabia . Saudi Med J 28 : 1881 – 1884 . 
OpenUrl Abstract / FREE Full Text ↵ Fang H , Hui Q , Lynch J , Honerlaw J , Assimes TL , Huang J , Vujkovic M , Damrauer SM , Pyarajan S , Gaziano JM , et al. 2019 . Harmonizing Genetic Ancestry and Self-identified Race/Ethnicity in Genome-wide Association Studies . The American Journal of Human Genetics 105 : 763 – 772 . OpenUrl CrossRef PubMed ↵ Fernandes V , Alshamali F , Alves M , Costa MD , Pereira JB , Silva NM , Cherni L , Harich N , Cerny V , Soares P , et al. 2012 . The Arabian Cradle: Mitochondrial Relicts of the First Steps along the Southern Route out of Africa . The American Journal of Human Genetics 90 : 347 – 355 . OpenUrl CrossRef PubMed ↵ Fernandes V , Brucato N , Ferreira JC , Pedro N , Cavadas B , Ricaut F-X , Alshamali F , Pereira L. 2019 . Genome-Wide Characterization of Arabian Peninsula Populations: Shedding Light on the History of a Fundamental Bridge between Continents.Mulligan C, editor . Molecular Biology and Evolution 36 : 575 – 586 . OpenUrl CrossRef PubMed ↵ Ferreira JC , Alshamali F , Montinaro F , Cavadas B , Torroni A , Pereira L , Raveane A , Fernandes V. 2021 . Projecting Ancient Ancestry in Modern-Day Arabians and Iranians: A Key Role of the Past Exposed Arabo-Persian Gulf on Human Migrations.Huerta-Sanchez E, editor . Genome Biology and Evolution 13 : evab194 . OpenUrl PubMed ↵ Flegontova O , Işıldak U , Yüncü E , Williams MP , Huber CD , Kočí J , Vyazov LA , Changmai P , Flegontov P. 2025 . Performance of qpAdm -based screens for genetic admixture on graph–shaped histories and stepping stone landscapes.Barton N, editor . GENETICS 230 : iyaf047 . OpenUrl PubMed ↵ Fregel R , Méndez FL , Bokbot Y , Martín-Socas D , Camalich-Massieu MD , Santana J , Morales J , Ávila-Arcos MC , Underhill PA , Shapiro B , et al. 2018 . Ancient genomes from North Africa evidence prehistoric migrations to the Maghreb from both the Levant and Europe . Proc. Natl. Acad. Sci. U.S.A . 115 : 6774 – 6779 . 
OpenUrl Abstract / FREE Full Text ↵ Greater Middle East Variome Consortium , Scott EM , Halees A , Itan Y , Spencer EG , He Y , Azab MA , Gabriel SB , Belkadi A , Boisson B , et al. 2016 . Characterization of Greater Middle Eastern genetic variation for enhanced disease gene discovery . Nat Genet 48 : 1071 – 1076 . OpenUrl CrossRef PubMed ↵ el-Hazmi MA , al-Swailem AR , Warsy AS , al-Swailem AM , Sulaimani R , al-Meshari AA . 1995 . Consanguinity among the Saudi Arabian population . J Med Genet 32 : 623 – 626 . OpenUrl Abstract / FREE Full Text ↵ Hedrick PW , Garcia-Dorado A . 2016 . Understanding Inbreeding Depression, Purging, and Genetic Rescue . Trends in Ecology & Evolution 31 : 940 – 952 . OpenUrl PubMed ↵ Hellenthal G , Busby GBJ , Band G , Wilson JF , Capelli C , Falush D , Myers S . 2014 . A Genetic Atlas of Human Admixture History . Science 343 : 747 – 751 . OpenUrl Abstract / FREE Full Text ↵ Henn BM , Cavalli-Sforza LL , Feldman MW . 2012 . The great human expansion . Proc. Natl. Acad. Sci. U.S.A . 109 : 17758 – 17764 . OpenUrl Abstract / FREE Full Text ↵ Jónsson H , Sulem P , Kehr B , Kristmundsdottir S , Zink F , Hjartarson E , Hardarson MT , Hjorleifsson KE , Eggertsson HP , Gudjonsson SA , et al. 2017 . Whole genome characterization of sequence diversity of 15,220 Icelanders . Sci Data 4 : 170115 . OpenUrl PubMed ↵ Karczewski KJ , Francioli LC , Tiao G , Cummings BB , Alföldi J , Wang Q , Collins RL , Laricchia KM , Ganna A , Birnbaum DP , et al. 2020 . The mutational constraint spectrum quantified from variation in 141,456 humans . Nature 581 : 434 – 443 . OpenUrl CrossRef PubMed ↵ Khayat AM , Alshareef BG , Alharbi SF , AlZahrani MM , Alshangity BA , Tashkandi NF. 2024 . Consanguineous Marriage and Its Association With Genetic Disorders in Saudi Arabia: A Review . Cureus [Internet] . 
Available from: https://www.cureus.com/articles/222916-consanguineous-marriage-and-its-association-with-genetic-disorders-in-saudi-arabia-a-review ↵ Lazaridis I , Nadel D , Rollefson G , Merrett DC , Rohland N , Mallick S , Fernandes D , Novak M , Gamarra B , Sirak K , et al. 2016 . Genomic insights into the origin of farming in the ancient Near East . Nature 536 : 419 – 424 . OpenUrl CrossRef PubMed ↵ Lazaridis I , Patterson N , Mittnik A , Renaud G , Mallick S , Kirsanow K , Sudmant PH , Schraiber JG , Castellano S , Lipson M , et al. 2014 . Ancient human genomes suggest three ancestral populations for present-day Europeans . Nature 513 : 409 – 413 . OpenUrl CrossRef PubMed Web of Science ↵ Li H , Durbin R . 2009 . Fast and accurate short read alignment with Burrows–Wheeler transform . Bioinformatics 25 : 1754 – 1760 . OpenUrl CrossRef PubMed Web of Science ↵ Lim ET , Würtz P , Havulinna AS , Palta P , Tukiainen T , Rehnström K , Esko T , Mägi R , Inouye M , Lappalainen T , et al. 2014 . Distribution and Medical Impact of Loss-of-Function Variants in the Finnish Founder Population. Cutler D, editor . PLoS Genet 10 : e1004494 . OpenUrl CrossRef PubMed ↵ Locke AE , Steinberg KM , Chiang CWK , Service SK , Havulinna AS , Stell L , Pirinen M , Abel HJ , Chiang CC , Fulton RS , et al. 2019 . Exome sequencing of Finnish isolates enhances rare-variant association power . Nature 572 : 323 – 328 . OpenUrl CrossRef PubMed ↵ Loh P-R , Danecek P , Palamara PF , Fuchsberger C , Reshef YA , Finucane HK , Schoenherr S , Forer L , McCarthy S , Abecasis GR , et al. 2016 . Reference-based phasing using the Haplotype Reference Consortium panel . Nat Genet 48 : 1443 – 1448 . OpenUrl CrossRef PubMed ↵ Lohmueller KE . 2014 . The distribution of deleterious genetic variation in human populations . Current Opinion in Genetics & Development 29 : 139 – 146 . 
OpenUrl PubMed ↵ Lohmueller KE , Indap AR , Schmidt S , Boyko AR , Hernandez RD , Hubisz MJ , Sninsky JJ , White TJ , Sunyaev SR , Nielsen R , et al. 2008 . Proportionally more deleterious genetic variation in European than in African populations . Nature 451 : 994 – 997 . OpenUrl CrossRef PubMed Web of Science ↵ Mallick S , Micco A , Mah M , Ringbauer H , Lazaridis I , Olalde I , Patterson N , Reich D . 2024 . The Allen Ancient DNA Resource (AADR) a curated compendium of ancient human genomes . Sci Data 11 : 182 . OpenUrl PubMed ↵ Manichaikul A , Mychaleckyj JC , Rich SS , Daly K , Sale M , Chen W-M . 2010 . Robust relationship inference in genome-wide association studies . Bioinformatics 26 : 2867 – 2873 . OpenUrl CrossRef PubMed Web of Science ↵ Marchi N , Winkelbach L , Schulz I , Brami M , Hofmanová Z , Blöcher J , Reyna-Blanco CS , Diekmann Y , Thiéry A , Kapopoulou A , et al. 2022 . The genomic origins of the world’s first farmers . Cell 185 : 1842 – 1859 .e18. OpenUrl CrossRef PubMed ↵ Martiniano R , Haber M , Almarri MA , Mattiangeli V , Kuijpers MCM , Chamel B , Breslin EM , Littleton J , Almahari S , Aloraifi F , et al. 2024 . Ancient genomes illuminate Eastern Arabian population history and adaptation against malaria . Cell Genomics 4 : 100507 . OpenUrl PubMed ↵ Mbarek H , Devadoss Gandhi G , Selvaraj S , Al-Muftah W , Badji R , Al-Sarraj Y , Saad C , Darwish D , Alvi M , Fadl T , et al. 2022 . Qatar genome: Insights on genomics from the Middle East . Human Mutation 43 : 499 – 510 . OpenUrl CrossRef PubMed ↵ McInnes L , Healy J , Saul N , Großberger L. 2018 . UMAP: Uniform Manifold Approximation and Projection . JOSS 3 : 861 . OpenUrl CrossRef ↵ McKenna A , Hanna M , Banks E , Sivachenko A , Cibulskis K , Kernytsky A , Garimella K , Altshuler D , Gabriel S , Daly M , et al. 2010 . The Genome Analysis Toolkit: A MapReduce framework for analyzing next-generation DNA sequencing data . Genome Res . 20 : 1297 – 1303 . 
OpenUrl Abstract / FREE Full Text ↵ McLaren W , Gil L , Hunt SE , Riat HS , Ritchie GRS , Thormann A , Flicek P , Cunningham F . 2016 . The Ensembl Variant Effect Predictor . Genome Biol 17 : 122 . OpenUrl CrossRef PubMed ↵ Mineta K , Goto K , Gojobori T , Alkuraya FS. 2021 . Population structure of indigenous inhabitants of Arabia. Idaghdour Y, editor . PLoS Genet 17 : e1009210 . OpenUrl CrossRef PubMed ↵ Miran J . 2022 . Red Sea Slave Trade . In: Spear T , editor. Oxford Research Encyclopedia of African History . 1st ed. Oxford University Press New York, NY . Available from: https://academic.oup.com/edited-volume/61663/chapter/553479929 ↵ Moreno-Estrada A , Gravel S , Zakharia F , McCauley JL , Byrnes JK , Gignoux CR , Ortiz-Tello PA , Martínez RJ , Hedges DJ , Morris RW , et al. 2013 . Reconstructing the Population Genetic History of the Caribbean. Tarazona-Santos E, editor . PLoS Genet 9 : e1003925 . OpenUrl CrossRef PubMed ↵ Ober C , Hyslop T , Hauck WW . 1999 . Inbreeding Effects on Fertility in Humans: Evidence for Reproductive Compensation . The American Journal of Human Genetics 64 : 225 – 231 . OpenUrl CrossRef PubMed Web of Science ↵ Oleksyk TK , Wolfsberger WW , Chhugani K , Huang Y-N , Pokrytiuk V , Shchubelka K , Zelikovsky A , Pasaniuc B , Jinga V , Bucur O , et al. 2025 . Challenges and Recommendations in Establishing National Human Diversity Genomic Projects . Available from: https://arxiv.org/abs/2510.19869 ↵ Overall ADJ , Ahmad M , Nichols RA . 2002 . The effect of reproductive compensation on recessive disorders within consanguineous human populations . Heredity 88 : 474 – 479 . OpenUrl CrossRef PubMed ↵ Pairo-Castineira E , Rawlik K , Bretherick AD , Qi T , Wu Y , Nassiri I , McConkey GA , Zechner M , Klaric L , Griffiths F , et al. 2023 . GWAS and meta-analysis identifies 49 genetic variants underlying critical COVID-19 . Nature 617 : 764 – 768 . 
OpenUrl CrossRef PubMed ↵ Pedersen C-ET , Lohmueller KE , Grarup N , Bjerregaard P , Hansen T , Siegismund HR , Moltke I , Albrechtsen A . 2017 . The Effect of an Extreme and Prolonged Population Bottleneck on Patterns of Deleterious Variation: Insights from the Greenlandic Inuit . Genetics 205 : 787 – 801 . OpenUrl Abstract / FREE Full Text ↵ Pemberton TJ , Absher D , Feldman MW , Myers RM , Rosenberg NA , Li JZ . 2012 . Genomic Patterns of Homozygosity in Worldwide Human Populations . The American Journal of Human Genetics 91 : 275 – 292 . OpenUrl CrossRef PubMed ↵ Petraglia MD , Groucutt HS , Guagnin M , Breeze PS , Boivin N . 2020 . Human responses to climate and ecosystem change in ancient Arabia . Proc. Natl. Acad. Sci. U.S.A . 117 : 8263 – 8270 . OpenUrl Abstract / FREE Full Text ↵ Project Team SG . 2015 . The Saudi Human Genome Program: An oasis in the desert of Arab medicine is providing clues to genetic disease . IEEE Pulse 6 : 22 – 26 . OpenUrl ↵ Rodriguez-Flores JL , Fakhro K , Agosto-Perez F , Ramstetter MD , Arbiza L , Vincent TL , Robay A , Malek JA , Suhre K , Chouchane L , et al. 2016 . Indigenous Arabs are descendants of the earliest split from ancient Eurasian populations . Genome Res . 26 : 151 – 162 . OpenUrl Abstract / FREE Full Text ↵ Rousseeuw PJ . 1987 . Silhouettes: A graphical aid to the interpretation and validation of cluster analysis . Journal of Computational and Applied Mathematics 20 : 53 – 65 . OpenUrl ↵ Saffi M , Howard N . 2015 . Exploring the Effectiveness of Mandatory Premarital Screening and Genetic Counselling Programmes for β-Thalassaemia in the Middle East: A Scoping Review . Public Health Genomics 18 : 193 – 203 . OpenUrl CrossRef PubMed ↵ Sahoo SA , Zaidi AA , Anagol S , Mathieson I. 2021 . Long Runs of Homozygosity Are Correlated with Marriage Preferences across Global Population Samples . Human Biology 93 : 201 – 216 . 
OpenUrl CrossRef PubMed ↵ Schuenemann VJ , Peltzer A , Welte B , Van Pelt WP , Molak M , Wang C-C , Furtwängler A , Urban C , Reiter E , Nieselt K , et al. 2017 . Ancient Egyptian mummy genomes suggest an increase of Sub-Saharan African ancestry in post-Roman periods . Nat Commun 8 : 15694 . OpenUrl CrossRef PubMed ↵ Simons YB , Sella G . 2016 . The impact of recent population history on the deleterious mutation load in humans and close evolutionary relatives . Current Opinion in Genetics & Development 41 : 150 – 158 . OpenUrl PubMed ↵ Sirak K , Jansen Van Rensburg J , Brielle E , Chen B , Lazaridis I , Ringbauer H , Mah M , Mallick S , Micco A , Rohland N , et al. 2024 . Medieval DNA from Soqotra points to Eurasian origins of an isolated population at the crossroads of Africa and Arabia . Nat Ecol Evol 8 : 817 – 829 . OpenUrl PubMed ↵ Sirak KA , Fernandes DM , Lipson M , Mallick S , Mah M , Olalde I , Ringbauer H , Rohland N , Hadden CS , Harney É , et al. 2021 . Social stratification without genetic differentiation at the site of Kulubnarti in Christian Period Nubia . Nat Commun 12 : 7283 . OpenUrl CrossRef PubMed ↵ Speidel L , Forest M , Shi S , Myers SR . 2019 . A method for genome-wide genealogy estimation for thousands of samples . Nat Genet 51 : 1321 – 1329 . OpenUrl CrossRef PubMed ↵ Tadmouri GO , Nair P , Obeid T , Al Ali MT , Al Khaja N , Hamamy HA. 2009 . Consanguinity and reproductive health among Arabs . Reprod Health 6 : 17 . OpenUrl CrossRef PubMed ↵ Tang J , Chiang CWK . 2025 . A genealogy-based approach for revealing ancestry-specific structures in admixed populations . Am J Hum Genet 112 : 1906 – 1922 . OpenUrl PubMed ↵ Temaj G , Nuhii N , Sayer JA . 2022 . The impact of consanguinity on human health and disease with an emphasis on rare diseases . J Rare Dis 1 : 2 . 
OpenUrl ↵ Thareja G , Al-Sarraj Y , Belkadi A , Almotawa M , The Qatar Genome Program Research (QGPR) Consortium , Qatar Genome Project Management , Ismail S , Al-Muftah W , Badji R , Mbarek H , et al. 2021 . Whole genome sequencing in the Middle Eastern Qatari population identifies genetic associations with 45 clinically relevant traits . Nat Commun 12 : 1250 . OpenUrl PubMed ↵ Thompson EA . 2013 . Identity by Descent: Variation in Meiosis, Across Genomes, and in Populations . Genetics 194 : 301 – 326 . OpenUrl Abstract / FREE Full Text ↵ Vallini L , Zampieri C , Shoaee MJ , Bortolini E , Marciani G , Aneli S , Pievani T , Benazzi S , Barausse A , Mezzavilla M , et al. 2024 . The Persian plateau served as hub for Homo sapiens after the main out of Africa dispersal . Nat Commun 15 : 1882 . OpenUrl CrossRef PubMed ↵ Van De Loosdrecht M , Bouzouggar A , Humphrey L , Posth C , Barton N , Aximu-Petri A , Nickel B , Nagel S , Talbi EH , El Hajraoui MA , et al. 2018 . Pleistocene North African genomes link Near Eastern and sub-Saharan African human populations . Science 360 : 548 – 552 . OpenUrl Abstract / FREE Full Text ↵ Van Der Auwera GA , Carneiro MO , Hartl C , Poplin R , Del Angel G , Levy-Moonshine A , Jordan T , Shakir K , Roazen D , Thibault J , et al. 2013 . From FastQ Data to High-Confidence Variant Calls: The Genome Analysis Toolkit Best Practices Pipeline . CP in Bioinformatics [Internet] 43 . Available from: https://currentprotocols.onlinelibrary.wiley.com/doi/10.1002/0471250953.bi1110s43 ↵ Wang SR , Agarwala V , Flannick J , Chiang CWK , Altshuler D , Flannick J , Manning A , Hartl C , Agarwala V , Fontanillas P , et al. 2014 . Simulation of Finnish Population History, Guided by Empirical Genetic Data, to Assess Power of Rare-Variant Tests in Finland. The American Journal of Human Genetics 94 : 710 – 720 . OpenUrl ↵ Warsy A , Al-Jaser M , Albdass A , Al-Daihan S , Alanazi M . 2014 . Is consanguinity prevalence decreasing in Saudis?: A study in two generations. Afr Health Sci . 14 : 314 .
OpenUrl ↵ Yang MA , Fu Q . 2018 . Insights into Modern Human Prehistory Using Ancient Genomes . Trends in Genetics 34 : 184 – 196 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted May 06, 2026. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Patterns of population structure and genetic variation within the Saudi Arabian population Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Patterns of population structure and genetic variation within the Saudi Arabian population D.K. Malomane , M.P. Williams , Leqi Tian , Ji Tang , C.D. Huber , S. Mangul , M. Abedalthagafi , C. W. K. 
Chiang bioRxiv 2025.01.10.632500; doi: https://doi.org/10.1101/2025.01.10.632500 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Evolutionary Biology Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41913) Biophysics (21436) Cancer Biology (18578) Cell Biology (25482) Clinical Trials (138) Developmental Biology (13372) Ecology (19890) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15600) Genomics (22483) Immunology (17728) Microbiology (40365) Molecular Biology (17164) Neuroscience (88540) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15136) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9818) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00