Disentangling complex language contact and admixture in the broad Gansu-Qinghai region

doi:10.1101/2025.04.28.650932

Disentangling complex language contact and admixture in the broad Gansu-Qinghai region

2025 · doi:10.1101/2025.04.28.650932

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 61,719 characters · extracted from preprint-html · click to expand

Disentangling complex language contact and admixture in the broad Gansu-Qinghai region | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Disentangling complex language contact and admixture in the broad Gansu-Qinghai region Hongye Jin , Yuxin Tao , Chengkun Yang , Sizhe Yang , Wenjing Sun , Linguistic Silk Road Research Consortium , Dan Xu , View ORCID Profile Menghan Zhang doi: https://doi.org/10.1101/2025.04.28.650932 Hongye Jin 1 State Key Laboratory of Genetics and Development of Complex Phenotypes, Center for Evolutionary Biology, Human Phenome Institute, Zhangjiang Fudan International Innovation Center, 
School of Life Science, Fudan University , Shanghai, 200438, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yuxin Tao 1 State Key Laboratory of Genetics and Development of Complex Phenotypes, Center for Evolutionary Biology, Human Phenome Institute, Zhangjiang Fudan International Innovation Center, School of Life Science, Fudan University , Shanghai, 200438, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Chengkun Yang 1 State Key Laboratory of Genetics and Development of Complex Phenotypes, Center for Evolutionary Biology, Human Phenome Institute, Zhangjiang Fudan International Innovation Center, School of Life Science, Fudan University , Shanghai, 200438, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Sizhe Yang 2 Research Institute of Intelligent Complex Systems, Fudan University , Shanghai, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site Wenjing Sun 1 State Key Laboratory of Genetics and Development of Complex Phenotypes, Center for Evolutionary Biology, Human Phenome Institute, Zhangjiang Fudan International Innovation Center, School of Life Science, Fudan University , Shanghai, 200438, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site 3 Department of English and Linguistics Language Typology, Johannes Gutenberg-Universität Mainz , Mainz, Germany Dan Xu 3 Department of English and Linguistics Language Typology, Johannes Gutenberg-Universität Mainz , Mainz, Germany 4 Le Centre de recherches linguistiques sur l’Asie orientale (CRLAO) , Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: mhzhang{at}fudan.edu.cn dxusong{at}uni-mainz.de Menghan Zhang 1 State Key Laboratory of Genetics and Development of Complex 
Phenotypes, Center for Evolutionary Biology, Human Phenome Institute, Zhangjiang Fudan International Innovation Center, School of Life Science, Fudan University , Shanghai, 200438, China 2 Research Institute of Intelligent Complex Systems, Fudan University , Shanghai, China 5 Institute of Modern Languages and Linguistics, Fudan University , Shanghai, China Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Menghan Zhang For correspondence: mhzhang{at}fudan.edu.cn dxusong{at}uni-mainz.de Abstract Full Text Info/History Metrics Preview PDF Abstract Language evolution in the Gansu-Qinghai (GQ) region provides a key perspective for understanding cultural development along the Silk Road. Previous genetic and archaeological studies revealed complex multiethnic interactions shaped by migration and sociocultural exchange. However, the lack of structured linguistic data and computational tools for studying language contact has limited rigorous analysis of sociocultural evolution in this area. Here, we presented a new hybrid dataset of phonological and morpho-syntactic features from languages sampled across the GQ region. We introduced a computational framework to assess language contact and admixture, allowing us to quantify interaction among GQ languages and trace their origins. Our results showed that GQ languages have distinct contact patterns in their phonological and morphosyntactic systems, with some languages exhibiting clear evidence of mixture. Using a new statistical method, Trait Sharing Test ( TS -TEST), based on tree topology, we identified significant influences from Sinitic, Tibetan, Mongolic, and Turkic languages in shaping GQ linguistic diversity. These findings highlight the GQ region as a linguistic convergence zone on the Silk Road and provide a foundation for quantitative research on language contact and admixture. 
Our work strengthens the linguistic perspective on cultural evolution in the GQ region and supports future interdisciplinary studies integrating languages, genes, and material cultures. 1. Introduction As a corridor linking Western and Eastern cultural spheres, the Silk Road has historically facilitated profound cultural, economic, and technological exchanges between China and Central/Western Asia 1 , 2 . Positioned along the eastern section of this ancient trade network, the Gansu-Qinghai (GQ) region is notable for its extraordinary ethnic diversity 3 - 5 , hosting communities such as the Han, Hui, Tibetan, and various Mongolic- and Turkic-speaking populations. It represents an enduring hub for multi-ethnic interaction and population admixture, spanning prehistoric and modern eras. Archaeological evidence indicates that at around 4,000 BP, populations cultivating wheat and barley from the west encountered millet-farming communities from the Loess Plateau in this region, leading to extensive technological exchanges 6 . Genetic studies further demonstrate that the populations in the GQ region exhibit complex genetic compositions, with contributions from East Asia, the Tibetan Plateau, Siberia, and Central/Western Asia 7 - 9 . Demographic studies also show that the GQ region remains a focal point for multiethnic migration and coexistence in contemporary times. 10 These all underscore its historical and modern significance as a cultural and genetic crossroads. Language evolution is a bridge between demographic dynamics and sociocultural development. The interplay of population activities and socio-cultural interactions in the GQ region has given rise to high linguistic diversities, e.g., phylogenetic diversity and complex geographic patterns. Extensive efforts have been devoted to the linguistic fieldwork and documentation of languages in this region 11 - 15 . 
Accordingly, several languages in the GQ region, such as Wutun 16 and Tangwang 17 , have been categorized as “mixed” or “mixing” languages. Previous studies have also highlighted the GQ region as a linguistic area or Sprachbund , in which genetically unrelated languages share areal features due to subsequent contacts 18 - 20 . In the GQ area, these features include word order, case-marking system, and strategies to express possession and presence 19 . The susceptibility of these structural features to borrowing is consistent with established linguistic models 21 . When language contact is minimal, borrowing typically begins with non-basic vocabulary; as contact intensifies, structural features are increasingly affected. Structural borrowing starts with new phonemes with new phones in loanwords and shifts in the use of adpositions. It then extends to new phonemic contrasts and syllable structures in native vocabulary, as well as changes in word order and case-marking systems. Throughout this process, basic vocabulary remains little affected 21 . Studies of language evolution primarily rely on basic vocabulary, as it shows fewer contact-induced changes 22 and provides divergence information. In contrast, structural features provide a more effective basis for investigating language contact and admixture, offering a lens for examining the interplay of language, population, and sociocultural dynamics in the GQ region 22 , 23 . Quantifying language contact and admixture presents a significant challenge. Over the past two decades, introducing computational methods inspired by evolutionary biology has provided new perspectives on the study of language evolution 24 - 26 . Numerous biological approaches utilize genetic marker data to trace ancestral contributions within individual genomes 27 or to analyze genetic admixture at the population level 28 , 29 . 
These approaches can be adapted to linguistics by treating languages as analogous to individuals and linguistic features comparable to genetic markers 30 . Several studies of language evolution have demonstrated the effectiveness of evolutionary biology methods in exploring linguistic structures and interactions. Examples include investigations into the languages of the ancient Sahul continent 31 , the proposed Transeurasian language family 23 , and Chinese dialects 26 . Applying these methods offers a promising avenue for unraveling the GQ region’s complex language contact and admixture history. Noting this, we aim to quantitatively disentangle language contact and admixture in the GQ region. Therefore, we compiled a dataset of 146 structural traits across 23 languages spoken geographically covering the GQ region ( Figure 1 ). These languages can be classified into four linguistic phyla of northern China: Sinitic, Tibetan, Mongolic, and Turkic. We utilized a computational framework inspired by population genetics. This framework included linguistic relationships investigation, network analysis, population structure estimation, and admixture inference. Additionally, we developed a statistical test to quantify the influence of different sources on GQ languages. This test was based on comparisons of tree topology patterns, a method commonly employed to study introgression within evolutionary biology 32 , 33 . By adopting this interdisciplinary approach, our study sheds light on the intricate multiethnic interactions that have shaped the GQ region. Download figure Open in new tab Figure 1. The 23 languages are included in this study. Colors indicate main phyla to which languages belong. Triangles with black borders represent languages in the GQ region, and circles with white borders represent non-GQ languages. 2. Results 2.1. 
Principal component analysis of linguistic structures in the GQ region To investigate linguistic relationships among the 23 sampled languages, we conducted Principal Component Analysis using 146 structural traits of phonological and morpho-syntactic systems. The phonological PCA ( Figure 2a ) showed four primary clusters: (1) Sinitic languages and Lhasa Tibetan, (2) Amdo Tibetan dialects (Jianzha and Arou), (3) two Turkic languages (Uygur and Kazak), and (4) all Mongolic languages alongside two Turkic languages in the GQ region (Salar and West Yugur). By comparison, the morpho-syntax PCA ( Figure 2b ) produced different clusters: (1) Sinitic varieties excluding Wutun, (2) Tibetan and Wutun, (3) Turkic languages with two GQ Mongolic languages (Bonan and East Yugur), and (4) other Mongolic languages. The key differences between the phonological and morpho-syntactic patterns were observed for Lhasa, Wutun, and four Altaic samples. Specifically, while Lhasa Tibetan and Wutun grouped with Sinitic in phonology, they aligned with Tibetan in morpho-syntax. Similarly, from the phonological perspective, Salar and West Yugur clustered with the Mongolic, while in morpho-syntax, Bonan and East Yugur clustered with the Turkic. Overall, the PCA results illuminated structural affinities within the linguistically diverse GQ region, offering insights into the complex interplay of phonological and morpho-syntactic characteristics. Download figure Open in new tab Figure 2. PC plots, Neighbor-nets, and delta scores of 23 languages. a-b : PC plots based on phonological ( a ) and morpho-syntactic ( b ) traits. c - d : Neighbor-nets based on pair-wise distances derived from phonological ( c ) and morpho-syntactic ( d ) traits. e-f : Phonological ( e ) and morpho-syntactic ( f ) delta scores for GQ and non-GQ languages. p -values were calculated using two-sample Wilcoxon test. *, p -value < 0.05; **, p -value < 0.01. 2.2. 
Neighbour-Net and delta score measurement of languages in the GQ region To elucidate the interrelations among the 23 sampled languages, we calculated Hamming distances based on the presence or absence of phonological and morpho-syntactic traits, respectively. Pairwise distances were analyzed using the neighbor-net algorithm 34 ( Figures 2c – d ). Both networks exhibited extensive reticulations, highlighting widespread linguistic contact in both systems. In the phonological network ( Figure 2c ), Sinitic languages, East Yugur, Bonan, and Tu showed relatively short branch lengths and abundant box-like structures. On the other hand, the morpho-syntactic network ( Figure 2d ) showed the most intricate reticulations in GQ-Sinitic, East Yugur, West Yugur, and Bonan. These results suggested that these languages were involved in substantial horizontal influence. To quantify the extent of language contact, we calculated delta scores 35 for each language. A two-sample Wilcoxon test comparing GQ and non-GQ languages showed that GQ languages had significantly higher delta scores in both phonological and morpho-syntactic systems (p < 0.05, Figure 2e-f ), confirming more extensive contact within the GQ region. To assess whether contact dynamics in phonology and morpho-syntax were correlated, we computed Spearman and Kendall correlation coefficients for delta scores of GQ languages. Neither test showed statistical significance (Spearman: ρ = 0.257, p -value = 0.354; Kendall: τ = 0.143, p -value = 0.495), indicating no monotonic relationship between the two systems. These findings highlighted strong but distinct processes of language contact among GQ languages at the phonological and morpho-syntactic systems. 2.3. Fine-scale structure of languages from multiple ethnic groups To investigate the fine-scale structure of the 23 languages, we used STRUCTURE to dissect their linguistic compositions based on phonological and morpho-syntactic traits, respectively. 
The Evanno method 36 favoured K = 3 as the optimal number of components for both analyses (Figure S1). In the phonological analysis ( Figure 3a ), the three components were Sinitic, Tibetan, and Altaic-related (blue, yellow, and green, respectively). In the GQ region, Sinitic languages were predominantly associated with the Sinitic-related component, except for Wutun, which displayed a significant Tibetan-related signal. Altaic languages in the GQ region showed varying similarities, with East Yugur and Bonan manifesting the highest proportions of Sinitic-related components. In the morpho-syntactic analysis ( Figure 3b ), the three major compositions were also Sinitic, Tibetan, and Altaic-related (red, gray, and orange, respectively). Within the GQ region, most Sinitic languages displayed notable Tibetan and Altaic-related components. The situation is particularly extreme for Wutun, which showed only a minimal proportion of the Sinitic-related component. Among Altaic languages in the GQ region, varying proportions of Sinitic and Tibetan-related components were observed. Bonan contained the highest proportion of Tibetan-related components, while East and West Yugur displayed considerable Sinitic-related components. Overall, these results shed light on classification and sharing patterns of structural features. Download figure Open in new tab Figure 3. Fine-scale structure and compositional entropy of languages. a-b : Fine-scale structure of 23 languages inferred by STRUCTURE software using phonological ( a ) and morpho-syntactic ( b ) traits, respectively. c-d : Compositional entropy values derived from the K = 3 compositions of phonology ( c ) and morpho-syntax ( d ) for each GQ language. Upper-right windows show Wilcoxon tests comparing the GQ Altaic and GQ Sinitic entropy differences. n.s., not significant, p -value ≥ 0.05; *, p -value < 0.05. 
To quantify the diversity of language components of GQ languages, we estimated compositional entropies for each language based on fine-scale structures at optimal K = 3 in both systems ( Figure 3c-d ). We observed that the compositional entropies of the two Amdo Tibetan dialects (Jianzha and Arou) were consistently low, while the patterns for Sinitic and Altaic languages in the GQ region were more complicated. We compared entropy values between GQ Sinitic and GQ Altaic languages and found that in morpho-syntax, GQ Sinitic have significantly higher entropy values than GQ Altaic (two-sample Wilcoxon test, p -value < 0.05), while in phonology, GQ Altaic languages displayed higher mean and median entropy values than GQ Sinitic (mean: GQ Altaic = 0.46, GQ Sinitic = 0.31; median: GQ Altaic = 0.42, GQ Sinitic = 0.19), although the difference did not achieve statistical significance. These findings suggest that Amdo Tibetan in the GQ region has relatively simple structures in both systems, GQ Altaic languages exhibit more complex phonological structure of Altaic-, Sinitic-, and Tibetan-related components compared to GQ Sinitic languages, and GQ Sinitic languages show significantly more complex morpho-syntactic structure (See SI). In addition, we further examined the relationship between structural complexity in phonology and morpho-syntax across GQ languages based on their compositional entropy values. There was no statistically significant correlation (Spearman: ρ = 0.061, p -value = 0.832; Kendall: τ = -0.029, p -value = 0.923), suggesting independent evolutionary trajectories for phonological and morpho-syntactic complexity. Our results provide quantitative signals for language contact and potential admixture in the GQ region. 2.4. 
Assessing the linguistic contributions of neighboring languages to the GQ region To examine whether the structural influences of GQ languages were from neighboring linguistic groups, we classified four language groups (GQ, Sinitic [Sn], Tibetan [Tb], and Altaic [Alt]) according to STRUCTURE results. Admixture F3 statistics, computed from phonological and morpho-syntactic traits, showed significantly negative values for F3(GQ; Sn, Alt) ( Figure 4a ; phonology: p = 0; morpho-syntax: p = 0.042; bootstrap permutation test; see Table S1-2 for full results). This indicated a mixture of Sinitic and Altaic influences on the GQ language group. To corroborate this finding, Bayesian network modeling further identified directed edges from Sn, Alt, and Tb language groups to the GQ group ( Figure 4b , S9; see Table S4-5 for full results), highlighting their contributions. These findings demonstrate that combined influences from Sinitic, Altaic, and Tibetic languages have shaped the structural features of languages in the GQ region. Download figure Open in new tab Figure 4. Admixture statistics and Bayesian network. a : Dots represent admixture statistics measured using morpho-syntactic and phonological traits. The violin plots show distributions derived from bootstrap permutations. b : A summary plot of Bayesian network results, adapted from the network constructed using overall data, integrating results from phonological and morpho-syntactic analyses. Solid arrows indicate edges that appeared in the analysis using overall data; dashed arrows indicate the edge did not appear in the analysis using overall data but appeared in the phonological or morpho-syntactic analysis. The dashed line without arrows indicates conflicting results across the three analyses (See SI). The thickness of the arrows is related to the strength of edge importance. 2.5. 
Inferring admixture sources on individual GQ languages To investigate the influence of four major language groups in northern China—Sinitic, Tibetan, Mongolic, and Turkic—on languages in the GQ region, we developed a statistical method, the Trait Sharing Test ( TS -Test). Based on tree topology comparisons, this method assesses whether the trait sharing between a target language (a language in the GQ region) and a source language group (one of the four) is statistically significant. A significant result indicates a substantial contribution from the source group. Phonological and morpho-syntactic traits were analyzed separately. Based on the TS -Test, the phonological analysis ( Figure 5a ) revealed that most GQ languages primarily reflected contributions from their respective phyla. For example, Xining, a Sinitic language, exhibited substantial contributions from the Sinitic group. West Yugur showed significant input from both Turkic and Mongolic sources. We found strong Sinitic influence in East Yugur. Wutun had pronounced influences from both Tibetan and Sinitic. In contrast, Salar displayed no statistically significant evidence of phonological trait sharing with any of the four groups. This may be due to minor or diffuse contributions from multiple sources, as reflected in its fine-scale structure ( Figure 3a ), which resulted in a random-like pattern. In addition, the small number of phonological traits in our dataset (n = 26) may have limited the resolution necessary to detect these influences. On the other hand, the morpho-syntactic results ( Figure 5b ) revealed more complex patterns of language admixture. West Yugur, East Yugur, Tu, Salar, Wutun, and Gangou exhibited substantial contributions from two source groups. Bonan demonstrated influence by the greatest number of sources, receiving significant contributions from three groups: Tibetan, Mongolic, and Turkic. 
These findings provided a detailed perspective on how linguistic interactions with neighboring groups have shaped individual GQ languages. Download figure Open in new tab Figure 5. Significant trait sharing between languages in the GQ region and the four sources. The segments in the pie charts represent substantial contributions from sources corresponding to respective colors. A gray circle for Salar in the phonological plot indicates that it showed no significant influence from any of the sources. 3. Discussion The Silk Road was one of the most transformative exchange networks in human history, facilitating the convergence of diverse peoples, goods, and cultures. Positioned along the eastern stretch of this historic corridor, the Gansu-Qinghai (GQ) region has been a critical nexus of multi-dimensional interactions that have profoundly shaped the cultural and linguistic landscape of Northwest China. Understanding the patterns of multi-ethnic contact in this region is central to exploring its rich sociolinguistic complexity. In this study, we compiled a dataset of 146 structural traits of phonological and morphosyntactic systems from 23 languages, which offered robust material for examining linguistic admixture in the GQ region. According to the dataset, we proposed a new computational framework to quantitatively disentangle the language contact and admixture. Our findings revealed that linguistic interactions in the GQ region exhibited high intensity, with distinct patterns governing phonological and morphosyntactic systems. By applying population genetic approaches, we inferred the fine-scale linguistic structure of these languages and demonstrated that the observed structural patterns in the GQ region reflect the combined influence of Sinitic, Altaic, and Tibetan language groups. Additionally, we developed a statistical tool to test trait sharing and showed how Sinitic, Tibetan, Mongolic, and Turkic sources have shaped the structural diversity of GQ languages. 
The GQ region is characterized by its striking linguistic diversity, with different languages from distinct phyla coexisting in proximity. Our PCA of 23 language samples identified four major clusters in both phonological and morpho-syntactic systems, which broadly corresponded to the Sinitic, Amdo Tibetan, Turkic, and Mongolic language groups. Language contacts among these groups led to the complex process of admixture, as exemplified by Wutun and four Altaic languages spoken locally (Bonan, East Yugur, Salar, and West Yugur). Wutun is linguistically recognized as a mixed language 16 . Its grammatical structure has been profoundly shaped by Amdo Tibetan; meanwhile, most of its lexicon derives from Mandarin Chinese, which results in a phonological system compatible with other Chinese dialects 37 . Consistent with these descriptions, our PCA showed Wutun clustering with Sinitic languages in phonology, while with Amdo Tibetan in morpho-syntax. Among Altaic languages in the GQ region, previous linguistic studies proposed that East and West Yugur form a “mini- Sprachbund ,” with Bonan and Salar constituting another 20 . In this region, several phonological features, such as consonant strength and specific phonemic merging 20 , were shared by Mongolic and Turkic languages. Additionally, morpho-syntactic traits in GQ Mongolic, such as plural markers and conditional converbs, appear to reflect Turkic impact 14 . Our findings in this study support these observations as Turkic languages (Salar, West Yugur) grouped with Mongolic languages in the phonological PC space, while Bonan and East Yugur (Mongolic) clustered with Turkic languages in morpho-syntax. These patterns highlighted language contact and admixture among Mongolic and Turkic languages in the GQ region. Moreover, GQ Sinitic languages exhibited distinctive areal features compared to other Chinese varieties, such as OV word order and case marking 19 , resulting from contacts with non-Sinitic groups. 
Our morpho-syntactic PCA exhibited a similar pattern in which five GQ Sinitic languages clustered close to each other. These results aligned with previous studies and strengthened the evidence for GQ as a linguistic area. Different language systems followed different dynamics during language contact and admixture in the GQ region. Previous studies report that Mongolic and Turkic languages in this region experienced varying degrees of Tibetanization or Sinicization in their phonological systems 38 . At the same time, GQ Sinitic varieties extensively borrow morpho-syntactic features from Altaic and Tibetan languages 18 - 20 . Our computational analyses provided new evidence supporting these patterns. We found that the intensities of phonological and morpho-syntactic contacts and admixture in GQ languages are largely independent. Specifically, GQ Altaic languages showed more complex phonological structure than GQ Sinitic, whereas GQ Sinitic languages exhibited significantly more complex morpho-syntactic structure. Notably, language contact and admixture in the GQ region can be partially explained by the genetic history of its populations. None of the groups speaking GQ languages are genetically homogeneous 19 . For example, speakers of Bonan and Salar prevalently carry Y-haplogroups associated with Sino-Tibetan populations 9 , 39 , and we showed that both languages contain notable Sinitic and Tibetan-related components. However, some cases in the GQ region illustrate a mismatch between linguistic affiliation and population activities. The ancestors of Bonan, Dongxiang, and Salar speakers likely originated from Turkic-speaking populations in Central or Western Asia 9 , 40 , 41 ; however, Bonan and Dongxiang peoples shifted to Mongolic languages, while Salar people retained their Turkic language. Other examples include the Wutun speakers. Linguists have previously hypothesized that ancestors of Wutun speakers may have migrated from Nanjing or Sichuan 42 . 
Currently, Wutun bears strong Amdo Tibetan influence, and its speakers self-identify as Tibetan. Overall, detailed genetic studies of speakers of languages in the GQ region remain limited. Current genetic research has primarily focused on large-scale east–west admixture patterns, and the genetic background of each population in this region still requires further investigations. More comprehensive interdisciplinary research is needed to clarify the complex interplay of languages, genes, and cultures in the GQ region. In the GQ region, certain linguistic structural features have played key roles in the dynamics of language contact and admixture and have been shaped by intensified selective pressures. Among ten structural traits identified in our study (Table S7), case-marking systems were particularly noteworthy, including AccDat (identical accusative and dative case markers), ErgInst (identical ergative and instrumental markers), and NomAcc (nominative-accusative alignment). The adoption of case marking in Sinitic languages is a defining characteristic of the GQ linguistic area 18 - 20 . A previous study, based on an early 20th-century Gansu dialect corpus, suggested that case markers in GQ Sinitic originated from prosodic fillers used as topic markers, arising through sustained contact between Sinitic and non-Sinitic speakers 43 . Our findings echoed this view, highlighting the pivotal role of case-marking features in language contacts in the GQ region. In addition, several traits were instrumental in distinguishing the three linguistic components identified in the region’s fine-scale language structure 23 (See Materials and methods and Figure S2). For example, vowel harmony in the past ( HarmVPast ) distinguishes the Altaic-related component in phonology, while prepositions and postpositions ( PrepPost ) mark the Sinitic-related component in morpho-syntax. 
These findings provided data-driven analyses of crucial features during language contact and admixture in the GQ region. In addition, we employed tree topology comparisons to disentangle the influences of distinct source languages on target languages and introduced a statistical approach, TS -Test. This approach assesses the proportion of shared traits between targets and sources while controlling similarities arising by chance through statistical evaluation. As the number of traits increases, we expect the accuracy of this method to improve accordingly. Applying this approach, we delineated the respective contributions of Sinitic, Tibetan, Mongolic, and Turkic languages to those spoken in the GQ region. We anticipate that TS -Test will facilitate broader culture-related quantitative investigations, offering a useful tool for exploring linguistic and cultural exchanges across diverse contexts. 4. Conclusions In this study, by analyzing structural features across 23 languages, we confirm extensive language contact and admixture in the GQ region, supporting previous proposals of GQ region as a linguistic Sprachbund 18 - 20 . Our results reveal distinct dynamics of contact and admixture in phonological and morphosyntactic systems. The overall structural pattern of GQ languages can largely be attributed to the combined influences of Sinitic, Mongolic, Turkic, and Tibetan languages ( Figure 6 ). However, the extent of each influence varies across individual GQ languages. Key structural features, such as case-marking strategies, highlight the role of multi-ethnic contact in shaping the linguistic landscape of this region. Future studies should expand the number of language samples and linguistic features to better understand the GQ region, Northwestern China, and the Silk Road. Integrating linguistic, genetic, and archaeological evidence will illuminate the intricate history of multi-ethnic exchanges along the Silk Road. Download figure Open in new tab Figure 6. 
Summary of linguistic contact and admixture in the GQ region as influenced by surrounding linguistic groups. 5. Materials and methods 5.1. Data Collection The linguistic data were collected by linguists working in different languages. Among the 23 language datasets, 18 were obtained from recent fieldwork, and 5 were sourced from existing documents. A list of these authors is annexed at the end of this paper. The dataset contained 26 phonological traits and 120 morpho-syntactic traits across 23 languages. We designated this dataset as GQ_admix. In this dataset, there are 15 languages sampled from the GQ region. These comprised seven Sinitic varieties (Xining, Linxia, Tangwang, Gangou, Wutun, Xiahe, and Lanzhou), two Amdo Tibetan (Jianzha and Arou), four Mongolic (Dongxiang, Bonan, Tu, and East Yugur), and two Turkic (Salar and West Yugur). Besides, eight languages outside the GQ area are included—Standard Mandarin (Beijing), Baoding, Zhangjiakou, Lhasa, Chahar, Horqin, Uygur, and Kazak. 5.2. PCA, Neighbor-net, and delta score To give profiles of the 23 languages, we conducted principal component analysis (PCA) based on their structural traits. The PCA was performed using the prcomp() function from R base package, and the proportion of variance explained by each principal component was calculated using the fviz_eig() function from the factoextra package. Next, we implemented the Neighbor-net algorithm 34 to further examine language contact. Neighbor-Net is a phylogenetic network model based on an agglomerative method. A higher number of box-like structures in a derived network indicates greater horizontal exchange. To quantify the contact strength of languages, we calculated the delta scores 35 for each language. The delta score is a statistical measure for detecting non-treelike signals in data; a higher delta score indicates that a language has obscured more treelike signals and suggests stronger contact. 
These methods have been widely used to assess tree-like patterns in linguistic data 22 , 26 , 44 . The Neighbor-net was constructed using R package phangorn . Delta scores were computed using SplitsTree v6.3.35 45 . We specified Hamming distance as the distance measure. 5.3 Fine-scale Structure of Languages To investigate the nuanced structure of languages, we conducted population stratification analysis using the software STRUCTURE v2.3.4 27 . Following a computational process similar to that of previous studies 23 , 26 , 31 , we performed separate clustering analyses on phonological and morpho-syntactic data, respectively. Considering the nature of linguistic data, we set the ploidy level of language individuals to 1. We employed the “admixture” model and allowed the model to estimate the Dirichlet parameter α for admixture proportions. We allowed independent allele frequencies across populations and fixed the Allele Frequencies Parameter λ at 1.0. We made the STRUCTURE software calculate from the hypothetical population number K = 2 to K = 10, and for each K value, we performed 30 repetitions. We set the Markov chain to run for 10,000 generations after 10,000 generations of burn-in. We analyzed STRUCTURE outputs using structureHarvester 46 , which implemented the Evanno method 36 to calculate ΔK statistics based on the rate of change in log probability between successive K-values. The hypothetical number corresponding to the highest ΔK values represents the optimal number of components. Replicates for each K-value were summarized using the Greedy algorithm in CLUMPP 47 . Results were visualized through stacked bar plots. To quantify the diversity of language components, we calculated the compositional entropy using Shannon entropy estimation for each language based on the fine-scale structure inferred at the optimal K number. Higher entropy values indicate a higher linguistic diversity of languages. We calculated such entropies using the R package entropy . 
We performed two-sample Wilcoxon tests to assess differences in linguistic diversity of languages between GQ Sinitic and GQ Altaic. To identify instrumental traits in delineating components, we examined inferred allele frequencies for each component provided in STRUCTURE outputs. We selected the replicate with the highest Estimated Prob of Data at optimal K in phonological and morpho-syntactic analyses, respectively. For each hypothetical component, we manually extracted the frequency of each trait being present (value = 1). Traits that were present (frequency > 2/3) in at least one component and were absent (frequency < 2/3) in one other component were selected as critical traits. 5.4. F3 Statistics and Bayesian Networks To investigate whether the admixture of surrounding language groups can explain the structural features in the GQ region, we employed two methods: the admixture F3 statistic and the Bayesian network. The admixture F3 statistic, introduced by Patterson 28 , is a widely adopted measure in population genetics for assessing whether a population represents an admixture between two other populations. In the genetic analysis, F3(C; A, B) is defined as the average product of allele frequency differences at each genetic marker between a target population C and two source populations A and B. A negative F3 value indicates that the target population C is a mix of populations A and B. Analogously, in our analysis, we treated linguistic structural traits as genetic markers and language groups as populations. We defined four language groups (GQ, Sinitic [Sn], Tibetan [Tb], and Altaic [Alt]) based on STRUCTURE results. These groups included languages that maximally represent each of the three inferred components and potentially mixed GQ languages. Notably, although Dongxiang was constituted by a relatively pure Altaic-related component in both systems, we did not include it in the Alt language group but in the GQ language group. 
This was based on the fact that Dongxiang is located deep in the GQ linguistic area geographically, and it was reported to have some highly distinctive characteristics, making it not readily intelligible to any other Mongolic language speakers 14 . We then computed F3(GQ; Sn, Alt) , F3(GQ; Sn, Tb) , and F3(GQ; Tb, Alt) using the phonological and morpho-syntactic traits in our dataset. The computational procedure involved two main steps. First, we calculated trait frequencies within each language group. For each trait, we computed the product of frequency differences between GQ and each pair of source groups. The F3 statistic was then derived as the mean of these products across all traits. Second, to assess whether the obtained F3 values were significantly negative, we utilized a permutation test framework. We performed a bootstrap resampling process with 500 iterations. A value was considered significantly negative if more than 95% of bootstrap replicates yielded negative values. The distribution of bootstrap results was visualized using a violin plot. All F3 calculations were implemented through a custom R script. Bayesian networks are directed acyclic graph models that represent a set of variables and their conditional dependencies 48 . In a Bayesian network, nodes are sets of variables, and edges (arrows) between nodes indicate direct probabilistic dependencies. The state of a downstream node depends on the state of an upstream node. In the current study, we interpreted this as indicating a directed contribution from one language group to another. We constructed Bayesian networks using structural trait frequencies from the four language groups (i.e., GQ, Sn, Tb, Alt) and employed a penalized node-average likelihood approach 49 for network scoring. The model determined the presence and directionality of arrows between nodes, yielding an optimal network structure, and assessed the importance of each connection. 
To ensure comparability across networks, we reported each edge’s importance as its proportional contribution to the overall model likelihood. We interpreted this value as the relative influence of an upstream node on a downstream node. The Bayesian networks were constructed using the hc() function from the R package bnlearn , and the importance of each connection was scored using the package’s score() function. 5.5 Language trait sharing testing To quantify the trait sharing between a target language and multiple source languages, we derived a statistical method based on the comparison of tree topology patterns. We named this tool Trait Sharing Test ( TS -Test). This method used triadic combinations as the basic analytical unit (See SI). Given a target language $t$ and a set of source languages $S$ (with at least two sources), the system iterated through all possible triadic combinations formed by the target and different source pairs $\{t, s_i, s_j\}$. In the first step, within each triadic combination, the analysis filtered traits by removing those with missing values and eliminating invariant traits (where $t$, $s_i$, and $s_j$ share identical values), resulting in a subset of valid traits. Subsequently, each trait can be used to construct a tree involving the target and two sources, whose topology indicates whether the target aligns with one of the sources or is distinct from both. It then calculated the matching rate between the target and each source language, i.e., the proportion of trees with the target and a source clustered together. In the second step, at the global level, each source language’s local matching rates across all its participating triadic combinations were averaged, and the results were normalized to produce a global sharing score □ ∈ [0,1]. We implemented a permutation test framework to assess the statistical significance of the trait sharing between target and source languages. 
We produced 5000 permutation samples of each target language’s binary trait vector while maintaining the marginal trait counts. For instance, if the actual data contained 50 traits with 20 taking value 1 and the remaining 30 traits taking value 0, the permuted samples would preserve this 20:30 distribution. This process generated a null distribution of similarity scores, from which we constructed the 95% Highest Density Region (HDR). If the observed sharing score □ exceeded the upper threshold of the corresponding 95% HDR, we would reject the null hypothesis that the target and source language trait distributions were independent, concluding that their trait sharing was statistically significant. In TS -Test, both the target and source can represent individual languages or language groups. If a target or source contains multiple languages, the program uses vector replication during triadic combination calculations to ensure all possible inter-language combinations are considered. Specifically, if a triadic combination contains two source groups with m and n languages, respectively, and a target group with k languages, the trait vectors of languages in these source groups will be replicated n × k and m × k times respectively. In contrast, the target language vectors will be replicated m × n times. This expanded vector set will contain m × n × k combination configurations. We represented the observed sharing scores using stacked bar plots and showed the null distributions of different sources using stacked kernel density plots. Sources showing statistically significant trait sharing were marked with asterisks. The R scripts required to execute the TS -Test are provided in the supplementary files. 5.6. Outlier Traits Detection To better understand the traits that are crucial in shaping the scenario of languages, we conducted a PCA-based outlier detection to identify outlier traits showing distinct distributions compared to most traits. 
We conducted outlier detection separately using phonological, morpho-syntactic, and overall traits in our dataset. We excluded traits with variant frequencies below 5%. After PCA, we selected the top three principal components for the following use, as they collectively explained over 50% of the variance in all three analyses. This method performed regression between traits and principal components, generating z-scores that indicate the degree of deviation from the overall mean. Based on these z-scores, it calculated distance statistics that should follow a chi-square distribution under the null hypothesis of no outlier traits, thereby identifying potential outliers. We applied false discovery rate (FDR) correction using the R package qvalue , and we used 5% as the threshold for determining significant outlier traits. Results were visualized using Manhattan plots. The final set of outlier traits comprised the union of significant outliers detected across all three analyses. The outlier detection was conducted using the R package pcadapt 50 . Data and Codes Availability Data and codes sharing are not applicable until the manuscript has been accepted and published formally. 
Detailed member information in the Linguistic Silk Road Research Consortium Department of English and Linguistics, Johannes Gutenberg University Mainz, Mainz, Germany Dan Xu and Hao Li Institute of Modern Languages and Linguistics, Fudan University, Shanghai, China Menghan Zhang and Hongye Jin Le Centre de recherches linguistiques sur l'Asie orientale (CRLAO), Paris, France Saiyinjiya Caidengduoerji Institute of Ethnology and Anthropology, Chinese Academy of Social Sciences, Beijing, China Siqinchaoketu and Weibin Yin School of Culture, History & Language, Australian National University, Canberra, Australia Yarjis Xueqing Zhong School of Liberal Arts, Nanjing University, Nanjing, China Kai Sun School of Sociology and Anthropology, Xiamen University, Xiamen, China Jin Sun Department of Chinese Language and Literature, Sun Yat-sen University, Guangzhou, China Mingyuan Shao and Feiyiming Yan Chinese Language and Culture College, Huaqiao University, Xiamen, China Bingli Liu School of Literature, Zhejiang University, Hangzhou, China Xingzhou Wei School of Humanities, Shanghai Normal University, Shanghai, China Shuangcheng Wang School of Literature and Communication, Qinghai Minzu University, Qinghai, China Wei Ma School of Chinese Ethnic Minority Languages and Literatures, Minzu University of China Zhencao Zhong Acknowledgements This research was supported by the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation program (Grant Agreement No. 883700 TRAM), the National Natural Science Foundation of China (32470656 and T2122007), the National Key R&D Program of China (2020YFE0201600), the National Social Science Foundation (23&ZD317 and 20&ZD301), Shanghai Municipal Science and Technology Major Project (2017SHZDZX01). We also thank the Shanghai Institute for Mathematics and Interdisciplinary Sciences (SIMIS) for their financial support (grant number SIMIS-ID-2024-ZL). 
We thank the members of the Linguistic Silk Road Research Consortium: Saiyinjiya Caidengduoerji, Siqinchaoketu, Yarjis Xueqing Zhong, Hao Li, Kai Sun, Jin Sun, Feiyiming Yan, Mingyuan Shao, Weibin Yin, Bingli Liu, Xingzhou Wei, Shuangcheng Wang, Wei Ma, and Zhencao Zhong for valuable assistance in data collection, collation, and annotation. Funder Information Declared European Research Council (ERC) under the European Union's Horizon 2020 research and innovation program, No. 883700 TRAM; National Natural Science Foundation of China, 32470656, T2122007; National Key R&D Program of China, 2020YFE0201600; National Social Science Foundation, 23&ZD317, 20&ZD301; Shanghai Municipal Science and Technology Major Project, 2017SHZDZX01; Shanghai Institute for Mathematics and Interdisciplinary Sciences, SIMIS-ID-2024-ZL References ↵ Liu , J. Xibei shidi yu silu wenming (in Chinese) . ( Gansu Education Publishing House , 2023 ). ↵ Rong , X. The Silk Road and Cultural Exchanges between East and West . Vol. 14 ( Brill , 2022 ). ↵ Hao , S. Gansu teyou minzu wenhua xingtai yanjiu (in Chinese) . ( Ethnic Publishing House , 1999 ). Qin , Y. Gan-Ning-Qing diqu duominzu geju xingchengshi yanjiu (in Chinese) . ( Ethnic Publishing House , 2005 ). ↵ Zhu , W. Jindai Gan-Ning-Qing diqu minzu guanxi yanjiu (in Chinese) . ( Social Sciences Academic Press (China ), 2017 ). ↵ Dong , G. et al. Prehistoric trans-continental cultural exchange in the Hexi Corridor, northwest China . The Holocene 28 , 621 – 628 ( 2018 ). doi: 10.1177/0959683617735585 OpenUrl CrossRef ↵ He , G. et al. 
Pilot work of the 10K Chinese People Genomic Diversity Project along the Silk Road suggests a complex east-west admixture landscape and biological adaptations . SCIENCE CHINA Life Sciences 68 , 914 – 933 ( 2025 ). doi: 10.1007/s11427-024-2748-4 OpenUrl CrossRef PubMed Xiong , J. et al. Inferring the demographic history of Hexi Corridor over the past two millennia from ancient genomes . Sci Bull (Beijing) 69 , 606 – 611 ( 2024 ). doi: 10.1016/j.scib.2023.12.031 OpenUrl CrossRef PubMed ↵ Dan Xu & Hui Li Xu , D. & Wen , S. in Languages and Genes in Northwestern China and Adjacent Regions (eds Dan Xu & Hui Li ) 55 – 78 ( Springer Singapore , 2017 ). ↵ Zhang , B. Multi-ethnic migration in Northwest China . ( University of Groningen , 2018 ). ↵ Mo , C. Gan-Qing yuyan quyu hanyu fangyan zhi xingcheng ji yanbian yanjiu (in Chinese) . ( Ethnic Publishing House , 2023 ). Xu , D. Tangwanghua yanjiu (in Chinese) . ( Ethnic Publishing House , 2014 ). Dwyer , A. M. Salar: a study in Inner Asian language contact processes . Vol. 37 ( Otto Harrassowitz Verlag , 2007 ). ↵ Janhunen , J. The Mongolic Languages . ( Routledge , 2006 ). ↵ Roos , M. E. The Western Yugur (Yellow Uygur) Language: Grammar, Texts, Vocabulary . ( 2000 ). ↵ Sandman , E. Wutun as a mixed language . New perspectives on mixed languages: From core to fringe , 325 – 359 ( 2021 ). ↵ Dan Xu & Hui Li Xu , D. & Wen , S. in Languages and Genes in Northwestern China and Adjacent Regions (eds Dan Xu & Hui Li ) 87 – 105 ( Springer Singapore , 2017 ). ↵ Slater , K. W. A grammar of Mangghuer: a Mongolic language of China’s Qinghai-Gansu Sprachbund . ( Routledge , 2005 ). ↵ Xu , D. & Peyraube , A. Zhongguo jingnei gansu qinghai yidai de yuyan quyu (in Chinese) . Hanyu xuebao , 2 – 15 +95 ( 2018 ). ↵ Nugteren , H. Mongolic phonology and the Qinghai-Gansu languages . ( Leiden University , 2011 ). ↵ Thomason , S. G. & Kaufman , T. Language contact, creolization, and genetic linguistics . ( Univ of California Press , 2023 ). 
↵ Greenhill , S. J. et al. Evolutionary dynamics of language systems . Proc Natl Acad Sci U S A 114 , E8822 – E8829 ( 2017 ). doi: 10.1073/pnas.1700388114 OpenUrl Abstract / FREE Full Text ↵ Hübler , N. & Greenhill , S. J. Modelling admixture across language levels to evaluate deep history claims . Journal of Language Evolution 7 , 166 – 183 ( 2023 ). doi: 10.1093/jole/lzad002 OpenUrl CrossRef ↵ Gray , R. D. , Drummond , A. J. & Greenhill , S. J. Language Phylogenies Reveal Expansion Pulses and Pauses in Pacific Settlement . Science 323 , 479 – 483 ( 2009 ). doi: 10.1126/science.1166858 OpenUrl Abstract / FREE Full Text Heggarty , P. et al. Language trees with sampled ancestors support a hybrid model for the origin of Indo-European languages . Science 381 , eabg0818 ( 2023 ). doi: 10.1126/science.abg0818 OpenUrl CrossRef PubMed ↵ Yang , C. et al. Large-scale lexical and genetic alignment supports a hybrid model of Han Chinese demic and cultural diffusions . Nature Human Behaviour 8 , 1163 – 1176 ( 2024 ). doi: 10.1038/s41562-024-01886-9 OpenUrl CrossRef PubMed ↵ Pritchard , J. K. , Stephens , M. & Donnelly , P. Inference of Population Structure Using Multilocus Genotype Data . Genetics 155 , 945 – 959 ( 2000 ). doi: 10.1093/genetics/155.2.945 OpenUrl Abstract / FREE Full Text ↵ Patterson , N. et al. Ancient Admixture in Human History . Genetics 192 , 1065 – 1093 ( 2012 ). doi: 10.1534/genetics.112.145037 OpenUrl Abstract / FREE Full Text ↵ Lipson , M. et al. Reconstructing Austronesian population history in Island Southeast Asia . Nat Commun 5 , 4689 ( 2014 ). doi: 10.1038/ncomms5689 OpenUrl CrossRef PubMed ↵ Syrjänen , K. , Honkola , T. , Lehtinen , J. , Leino , A. & Vesakoski , O. Applying Population Genetic Approaches within Languages: Finnish Dialects as Linguistic Populations . Language Dynamics and Change 6 , 235 – 283 ( 2016 ). doi: 10.1163/22105832-00602002 OpenUrl CrossRef ↵ Reesink , G. , Singer , R. & Dunn , M. 
Explaining the Linguistic Diversity of Sahul Using Population Models . PLOS Biology 7 , e1000241 ( 2009 ). doi: 10.1371/journal.pbio.1000241 OpenUrl CrossRef PubMed ↵ Shang , H.-Y. et al. Phytop: a tool for visualizing and recognizing signals of incomplete lineage sorting and hybridization using species trees output from ASTRAL . Horticulture Research 12 ( 2024 ). doi: 10.1093/hr/uhae330 OpenUrl CrossRef ↵ Santos , S. H. D. et al. Massive inter-species introgression overwhelms phylogenomic relationships among jaguar, lion, and leopard . Systematic Biology ( 2025 ). doi: 10.1093/sysbio/syaf021 OpenUrl CrossRef ↵ Bryant , D. & Moulton , V. Neighbor-Net: An Agglomerative Method for the Construction of Phylogenetic Networks . Molecular Biology and Evolution 21 , 255 – 265 ( 2004 ). doi: 10.1093/molbev/msh018 OpenUrl CrossRef PubMed Web of Science ↵ Holland , B. R. , Huber , K. T. , Dress , A. & Moulton , V. δ Plots: A Tool for Analyzing Phylogenetic Distance Data . Molecular Biology and Evolution 19 , 2051 – 2059 ( 2002 ). doi: 10.1093/oxfordjournals.molbev.a004030 OpenUrl CrossRef PubMed Web of Science ↵ Evanno , G. , Regnaut , S. & Goudet , J. Detecting the number of clusters of individuals using the software STRUCTURE: a simulation study . Mol Ecol 14 , 2611 – 2620 ( 2005 ). doi: 10.1111/j.1365-294X.2005.02553.x OpenUrl CrossRef PubMed Web of Science ↵ Janhunen , J. , Peltomaa , M. , Sandman , E. & Xiawu , D. in Languages of the world ( Lincom Europa , 2008 ). ↵ Janhunen , J. Describing and transcribing the phonologies of the Amdo Sprachbund . Centering the Local. A Festschrift for Dr. Charles Kevin Stuart on the Occasion of his Sixtieth Birthday. Asian Highlands Perspectives 37 , 122 – 137 ( 2015 ). OpenUrl ↵ Shou , W.-H. et al. Y-chromosome distributions among populations in Northwest China identify significant contribution from Central Asian pastoralists and lesser influence of western Eurasians . Journal of Human Genetics 55 , 314 – 322 ( 2010 ). 
doi: 10.1038/jhg.2010.30 OpenUrl CrossRef PubMed Web of Science ↵ Jianzhong , M. & Stuart , K. “Stone Camels and Clear Springs” The Salar’s Samarkand Origins . Asian folklore studies , 287 – 298 ( 1996 ). ↵ Szeto , P. Y. Revisiting the Amdo Sprachbund: Genes, languages, and beyond . Himalayan Linguistics 20 ( 2022 ). doi: 10.5070/h920353645 OpenUrl CrossRef ↵ Chen , N. Wutunhua yinxi (in Chinese) . Minzu yuwen , 1 – 10 ( 1988 ). ↵ Xu , D. Gan-Qing yidai yu/bin ge biaoji “a/ha” tanyuan (in Chinese) . Zhongguo yuwen , 3 – 19 +126 ( 2024 ). ↵ Ferraz Gerardi , F. et al. Lexical phylogenetics of the Tupi-Guarani family: Language, archaeology, and the problem of chronology . PLoS One 18 , e0272226 ( 2023 ). doi: 10.1371/journal.pone.0272226 OpenUrl CrossRef PubMed ↵ Huson , D. H. & Bryant , D. The SplitsTree App: interactive analysis and visualization using phylogenetic trees and networks . Nature Methods 21 , 1773 – 1774 ( 2024 ). doi: 10.1038/s41592-024-02406-3 OpenUrl CrossRef ↵ Earl , D. A. & VonHoldt , B. M. STRUCTURE HARVESTER: a website and program for visualizing STRUCTURE output and implementing the Evanno method . Conservation genetics resources 4 , 359 – 361 ( 2012 ). OpenUrl CrossRef ↵ Jakobsson , M. & Rosenberg , N. A. CLUMPP: a cluster matching and permutation program for dealing with label switching and multimodality in analysis of population structure . Bioinformatics 23 , 1801 – 1806 ( 2007 ). doi: 10.1093/bioinformatics/btm233 OpenUrl CrossRef PubMed Web of Science ↵ Jensen , F. V. & Nielsen , T. D. Bayesian networks and decision graphs . Vol. 2 ( Springer , 2007 ). ↵ Bodewes , T. & Scutari , M. Learning Bayesian networks from incomplete data with the node-average likelihood . International Journal of Approximate Reasoning 138 , 145 – 160 ( 2021 ). doi: 10.1016/j.ijar.2021.07.015 OpenUrl CrossRef Privé , F. , Luu , K. , Vilhjálmsson , B. J. & Blum , M. G. B. Performing Highly Efficient Genome Scans for Local Adaptation with R Package pcadapt Version 4 . 
Molecular Biology and Evolution 37 , 2153 – 2154 ( 2020 ). doi: 10.1093/molbev/msaa053 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted April 30, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Disentangling complex language contact and admixture in the broad Gansu-Qinghai region Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Disentangling complex language contact and admixture in the broad Gansu-Qinghai region Hongye Jin , Yuxin Tao , Chengkun Yang , Sizhe Yang , Wenjing Sun , Linguistic Silk Road Research Consortium , Dan Xu , Menghan Zhang bioRxiv 2025.04.28.650932; doi: https://doi.org/10.1101/2025.04.28.650932 Share This Article: Copy Citation Tools Disentangling complex language contact and admixture in the broad Gansu-Qinghai region Hongye Jin , Yuxin Tao , Chengkun Yang , Sizhe Yang , Wenjing Sun , Linguistic Silk Road Research Consortium , Dan Xu , Menghan Zhang bioRxiv 2025.04.28.650932; doi: https://doi.org/10.1101/2025.04.28.650932 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Evolutionary Biology Subject Areas All Articles Animal Behavior and Cognition (7634) Biochemistry (17690) Bioengineering (13892) Bioinformatics (41935) Biophysics (21451) Cancer Biology (18587) Cell Biology (25499) Clinical Trials (138) Developmental Biology 
(13375) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24316) Genetics (15609) Genomics (22505) Immunology (17736) Microbiology (40393) Molecular Biology (17180) Neuroscience (88598) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15151) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9824) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-06-28T06:30:42.658729+00:00