CagY sequence and structural motifs are associated with ancestry and disease in world-wide Helicobacter pylori strains

preprint OA: closed CC-BY-4.0
📄 Open PDF Full text JSON View at publisher
Full text 71,113 characters · extracted from preprint-html · click to expand
CagY sequence and structural motifs are associated with ancestry and disease in world-wide Helicobacter pylori strains | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results CagY sequence and structural motifs are associated with ancestry and disease in world-wide Helicobacter pylori strains View ORCID Profile Mario Angel López-Luis , Roberto C Torres , Jay V. Solnick , M. Constanza Camargo , View ORCID Profile Alfonso Méndez-Tenorio , Javier Torres doi: https://doi.org/10.1101/2025.06.20.660796 Mario Angel López-Luis 1 Departamento de Bioquímica, Laboratorio de Biotecnología y Bioinformática Genómica, Escuela Nacional de Ciencias Biológicas, Instituto Politécnico Nacional , Campus Lázaro Cárdenas Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mario Angel López-Luis Roberto C Torres 2 Unidad de Investigación en Enfermedades Infecciosas, UMAE Pediatría, Instituto Mexicano del Seguro Social , Ciudad de México, México Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jay V. Solnick 3 Department of Medicine and Department of Microbiology and Immunology, University of California , Davis, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site M. Constanza Camargo 4 Division of Cancer Epidemiology and Genetics, National Cancer Institute , Rockville, MD, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Alfonso Méndez-Tenorio 1 Departamento de Bioquímica, Laboratorio de Biotecnología y Bioinformática Genómica, Escuela Nacional de Ciencias Biológicas, Instituto Politécnico Nacional , Campus Lázaro Cárdenas Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Alfonso Méndez-Tenorio For correspondence: amendezt{at}ipn.mx uimeip{at}gmail.com Javier Torres 2 Unidad de Investigación en Enfermedades Infecciosas, UMAE Pediatría, Instituto Mexicano del Seguro Social , Ciudad de México, México Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: amendezt{at}ipn.mx uimeip{at}gmail.com Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Helicobacter pylori ( Hp ) interacts with gastric epithelial cells using a multi-protein type 4 secretion system (T4SS) whose function is regulated by CagY. CagY is the T4SS backbone, connecting the bacterial cytoplasm with the epithelial cytoplasm and interacting with most other T4SS proteins. However, its mechanisms of action are unknown. We aimed to comprehensively analyse cagY in a worldwide collection of strains to unmask any correlation between ancestry and/or disease and sequence diversity, including an analysis of critical domains and associated structural changes. We used 674 cagY sequences derived from1,012 Hp genomes, sequenced with Single Molecule Real-Time from the Hp Genome Project. Phylogenetic and principal component analyses (PCA) revealed a strong population structure among European, Asian, African and American clusters, which interfered with other analyses. To address this heterogeneity, dimensional reduction analysis with PCA and Linear Discriminant Analyses were used to minimize interference before studying possible association of cagY diversity with disease. For this analysis, cagY genes were fragmented using unitig-caller with a designed analytical pipeline that clearly separated the non-atrophic gastritis (NAG), advanced intestinal metaplasia and gastric cancer (GC) groups. Sixty-four unitigs significantly separated GC from NAG based on cagY sequences, and the most GC-associated unitigs were localized in the middle repeat region of the protein (MRR), particularly in the A module, including the cysteine-containing motif YLDCVSQ. Proline was enriched in the MRR and VirB10 regions. The consequences of the diversity on the protein structure were analysed using ChimeraX. The validated and stabilized 14-multimer model showed major structural changes along the multimer-structure, particularly in the cysteine-rich MRR region, that might be one mechanism by which CagY regulates T4SS function. Introduction Helicobacter pylori ( Hp ) colonizes the gastric epithelium of about 43% of the human world population, and it is usually acquired in early childhood due to intimate contact with mother and other family members infected with the bacteria ( Li et al., 2023 ). The infection is asymptomatic in most individuals, although about 2% of those infected will develop gastric cancer (GC) after decades of gastric colonization ( Herrera & Parsonnet, 2009 ) . Hp has coevolved with humans most probably since the origin of our species ( Falush et al., 2003 ), resulting in an exquisite adaption to live in the hostile environment of the stomach. Accordingly, it is not a surprise to learn that the population structure of Hp mirrors that of humans and can be used as an excellent marker of ancient and recent human migrations ( Falush et al., 2003 ). In addition, the ancestry of Hp strains may be studied with fineSTRUCTURE, a program developed to study human populations ( Moodley et al., 2009 ). The best documented Hp virulence factor is the pathogenicity island, cagPAI, consisting of operons with about 17 genes that codes for a T4SS, a complex structure that spans from the Hp cytoplasm through the bacterial membrane and facing out to interact with the membrane of the gastric epithelial cell. This T4SS allows the translocation of DNA, bacterial cell envelope precursor metabolites and the oncogenic protein CagA to the gastric epithelial cells. Once in the cytoplasm, the EPIYA motifs of CagA become phosphorylated in order to interact with kinases and other cellular molecules, leading to activation of inflammatory mediators and cytoskeletal modifications ( Alipour, 2029 ; Sharafutdinov et al., 2021 ), increasing the risk for the development of gastric diseases. CagY is a key protein for the function of the T4SS, regulating its pro-inflammatory activity, modulating the intensity of gastric inflammation and hence the tissue damage ( Delahay et al., 2008 ). CagY is a 2,000 aa protein with an extremely unusual high number of cysteines (around 58 residues) and it is one of the largest and more complex bacterial proteins. Its sequence contains different domains that function as the tunnel of the T4SS “syringe”, connecting the cytoplasm of Hp with the cytoplasm of the epithelial cell, spanning the bacterial internal membrane (IM), periplasm and the outer membrane (OM), to fa+-ce outside ( Hu et al., 2019 ). Furthermore, CagY includes two polymorphic repeated regions, the Five Repeat Region (FRR) an intrinsically disorder region (IDR) with unknown function and the Middle Repeat Region (MRR) an 800 amino acids domain with two repetitive modules, A and B. Each module includes the motifs: delta (δ), mu (μ) and alpha (α) for module A and epsilon ((ε), lambda (λ) and beta (β) for module B ( Delahay et al., 2008 ; Liu et al., 1999 ). In addition, it has the antenna projection (AP), a region located at the tip of the T4SS that interacts with the gastric epithelial cells ( Tran et al., 2023 ). It is important to note that CagY does not share homology with any known protein, except for a short region in the carboxyl terminus, homologous to VirB10 ( Barrozo et al., 2016 ; Delahay et al., 2008 ; Tran et al., 2023 ). Furthermore, several of the other Cag proteins bind along the different domains of CagY to form the complex T4SS ( Backert & Tegtmeyer, 2017 ; Delahay et al., 2008 ; Fischer et al., 2020 ). It is not a surprise then that CagY seems to have the key function of modulating the activity of the T4SS working as a rheostat to turn on and off the adaptative immune response ( Barrozo et al., 2016 ; Sierra et al., 2019 ; Tegtmeyer et al., 2020 ). The mechanism to regulate this process is uncertain, although there is evidence suggesting that recombination between or mutations within the repeat modules in the MRR region are associated with the rheostat function ( Barrozo et al., 2013 , 2016 ; Delahay et al., 2008 ; Sierra et al., 2019 ). In addition, it has been suggested that CagY may also induce inflammation by interacting with TLR5, to produce Interleukin 8 (IL8) ( Tegtmeyer et al., 2020 ).The VirB10 homologous region is also reported to bind to integrins in the outer membrane of gastric epithelium cells, an activity that may depend on the composition of the MRR region ( Koelblen et al., 2017 ). Despite its importance for the function and virulence of the T4SS, very little is known about the mechanisms of action and about structure-function of CagY. The study of this protein is very challenging because of its high sequence diversity, multiple domains, large size including the 800 amino acids repeats region and its multimer structure ( Delahay et al., 2008 ). Furthermore, because of its highly diverse gene sequence and unusually long repeats region, the sequences deposited in databases are often not reliable because they are usually truncated or incorrectly annotated ( Liu et al., 1999 ). Only a fraction of the protein has been crystalized and its structure as a monomer or multimer is yet unknown ( Sheedlo et al., 2020 ). We were recently able to model the structure of the CagY multimer using several different approaches and considering the reported crystalized regions of the protein ( López-Luis et al., 2023 ). In this work we aimed to comprehensively analyse cagY and its encoded protein to learn about the correlation of sequence-structure-function-disease. For this analysis, we used the genomes from the H. pylori Genome Project ( Hp GP) ( Thorell et al., 2023 ), the largest worldwide collection of strains sequenced using long-read sequencing. The cagY sequences were analysed with an array of machine learning methods, because of their ability to discern complex patterns or to make high confidence modelling simulations ( Jumper et al., 2021 ) plus other recently developed bioinformatic tools, including by ours ( López-Luis et al., 2023 ). Methods and material Helicobacter pylori dataset The Hp GP was launched by the US National Cancer institute ( Hp GP, NCI, USA) and the H. pylori strains were sequenced with Single Molecule Real-Time (SMRT)/Pacific Biosciences (PacBio) ( Eid et al., 2009 ). A total of 1,012 H. pylori genomes were provided by the NCI with metadata including ancestry of the strains and clinical condition of the individuals ( Thorell et al., 2023 ). The genomes were filtered for the presence of cagPAI and then the cagY gene was extracted from Prokka v1.14.6 ( Seemann, 2014 ) and HMMer v3.2.1 ( Wheeler & Eddy, 2013 ) outputs, extracting 1000 nucleotides upstream and downstream of cagY , to avoid truncated sequences. For validation of results nine previously reported cag Y sequences were included in the analyses ( Barrozo et al., 2013 ; Jackson et al., 2020 ). We finally studied 682 sequences of cagY from different countries ( Suppl Table 1 ). Phylogenetic analysis of cagY gene The cagY gene was aligned using a Hidden Markov Model (HMM) with the 100 longest cagY genes, constructed with HMMer v3.2.1 software ( Wheeler & Eddy, 2013 ) followed by a progressive alignment of all cagY genes with ClustalW v2 ( Larkin et al., 2007 ). Then, a phylogenetic tree was constructed with Maximum Likelihood using GTR model followed by independent rate of inversions and transversions with MEGA v11 Software ( Tamura et al., 2021 ) and visualized with itol v7.2 ( Letunic & Bork, 2024 ). The Single Nucleotide Polymorphisms (SNPs) were extracted from the cagY alignment with SNP-sites v2.4.1 ( Page et al., 2016 ) and a Principal Component Analysis (PCA) of the presence/absence of SNPs constructed with a script in python using the scikit-learn library ( Pedregosa et al., 2011 ) and RStudio and finally plotted with ggplot2 library ( Wickham, 2016 ). Estimation of Virtual Genomic fingerprints with VAMPhyRE Virtual Genomic Fingerprints (VGFs) were built with Virtual Analysis Method for Phylogenomic fingeRprint Estimation (VAMPhyRE) using a library of 65,536 probes of 8-mers, as previously described ( Muñoz-Ramírez et al., 2017 ). The obtained table of phylogenetic distances was analyzed with MEGA v11 ( Tamura et al., 2021 ) using Unweighted Pair Group Method with Arithmetic Mean (UPGMA) and itol v7.2 ( Letunic & Bork, 2024 ). In addition, a dimensional reduction analysis was done with PCA and informative features with an importance value (Mean Decrease Gini, MDG) >zero selected using RandomForest (RF) and displayed by ggplot2 library ( Wickham, 2016 ) in R. Phylogenetic network of cagY gene Genetic distances between each pair of strains were obtained with Nexus and formatted as a triangular distance matrix, as previously described ( Muñoz-Ramirez et al., 2021 ). Each distance was normalized between 0 and 1 according to: where NWij is the normalized weights between strains i and j, Wij is the genetic distance between strains i and j, and Wij(max), Wij(min) are the maximum and minimum value for the genetic distance in the whole network respectively. With this normalization a value of 0 means the two strains have the highest genetic similarity, and a value of 1 means the two strains are the most dissimilar. Analyses of cagY variants and their probable association with disease To evaluate the association between cagY diversity and gastric diseases, we implemented the fragmentation of cagY gene into unitigs using unitig-caller v1.3.1 ( Holley & Melsted, 2020 ). The program constructs a database of unique unitigs with length of at least 31 pb in each cagY sequence, and a presence/absence matrix was built for each of the 674 cagY sequences with a python script. Next, a dimensional reduction analysis with PCA and LDA was run. For the LDA analyses we selected the most informative unitigs with MDG values >zero using Random Forest (RF). For the PCA we used the first 20 principal components (PC) with all unitigs and select the more informative for each PC components. For comparation of the presence/absence of unitigs in GC vs NAG and IM vs NAG a Fisher Exact Test was used, and the p-value corrected with Bonferroni. Finally, we aligned the unitigs to cagY of the reference strain 26695-ATCC of Hp to localize the significant unitigs (p-value <0.01) using a Python script. The results were represented in a Manhattan plot, illustrating the unitig position in cagY and its p-value in each disease group. Classification and construction of Machine Learning model To train and evaluate the prediction capability of Machine Learning (ML) models for disease, a frequency matrix with all unitigs was constructed (including those that significantly distinguished the groups) with a pipeline developed in Python v3.9 and scikit-learn library ( Pedregosa et al., 2011 ). The following models were tested, Multinomial Naive Bayes (MNB), Support Vectorial Machine (SVM), Logistic Regression (LR), RandomForest (RF) and Multi-Layer Perceptron (MLP) as simple Neural Network (NN). For RF method we determined the importance of each unitig with the MDG value. All classification algorithms were used with default values. The performance of informative unitigs was estimated using ROC-AUC and F1 Score used for training and 20% to test data, samples we paired using the Random over sampler of scikit learn, and a 10-training with different seed was done ( Asnicar et al., 2024 ; Mahesh, 2020 ; Raimondi et al., 2019 ). All methods were compared with Wilcoxon Rank Sums Test and a p-value < 0.05 was considered as significant. Localization and co-presence of informative unitigs along CagY To investigate whether informative unitigs exhibit disease-specific co-occurrence patterns suggestive of distinct secondary-structural arrangements, we used the cagY sequence of reference strain 26695 as a scaffold for unitig mapping. All informative unitigs were aligned to the 26695 loci via a custom Python pipeline. We then assessed the pairwise presence/absence of variants across the gene using Fisher’s exact test, and plotted significative co-occurrence events, together with their genomic coordinates and association p-values, in a Manhattan plot generated with ggplot2 ( Wickham, 2016 ) in RStudio. Homology modelling and Molecular dynamics of CagY The effect of the sequence variation on the structure of cagY was estimated doing a homology modeling of the CagY protein with MODELLER (v10.4) ( Eswar et al., 2006 ), with python 3.9, using CagY of Hp 26695 as a reference, as previously described ( López-Luis et al., 2023 ). 1000 homology models were constructed for each CagY sequence to select the best model based on the DOPE score. The tridimensional model was aligned with ChimeraX software ( Pettersen et al., 2021 ) based on CagY 26695 topology. To stabilize and refine the model, we constructed a trimmer of each CagY protein, and the system of explicit solvent was constructed with VMD software ( Humphrey et al., 1996 ), using the recommended script with NAMD3 software ( Phillips et al., 2020 ) for 10 to 50 ns. The Root Mean Square Deviation (RMSD) and Root Mean Square Fluctuation (RMSF) along Equilibrium Molecular Dynamics (EMD) were calculated with Python MDAnalysis ( Cohen et al., 2023 ). Finally, the comparation of position of RMSD between CagY structures was made with ChimeraX. The plots were constructed and displayed with ggplot2 in Rstudio. Results Phylogenetic analyses reveal a strong population structure for cagY gene and for the MRR region of the gene Phylogenetic reconstruction of cagY using a kmer based reconstruction ( Figure 1a ). showed a strong population structure with clusters for European, Asian, African and Latin American populations, similar to what we and others have previously observed with the analyses of the core genome ( Muñoz-Ramírez et al., 2017 ; Yahara et al., 2013 ) suggesting a strong coevolution of cagY enough to mirror the ancestry of its human host. Another approach, a phylogenetic network analyses also confirmed the strong population structure of cagY ( Figure 1b ), depicting the grouping of the gene sequences following their ancestry. Even the analyses of only the MRR region showed a population structure like the whole cagY gene ( Figure 1c ). Furthermore, an LDA supervised reduction dimension of cagY and MRR sequences also showed a clear separation following ancestry ( Suppl Figure 1a and 1b ). Thus, phylogenetic and reduction dimension approaches confirmed the strong population structure of cagY and its MRR fragment. Download figure Open in new tab Figure 1. Phylogenetic analysis of the c agY gene using the VAMPhyRE strategy. Colours represent the ancestry of Helicobacter pylori . a) Phylogenetic analysis of the complete cagY gene, showing the clustering of H. pylori ancestries, which mirrors the clustering of the core genome. b) The tree cluster related Hp populations, distinguishing the ancestries of Asia-Europe and the America-Africa regions. c) Phylogenetic analysis of the MRR region of the cagY gene, showing the ancestry clustering of the MRR with lower resolution compared to the complete cagY gene. Bioinformatic tools using unitigs reveal association of cagY diversity with disease Once the strong population structure of cagY was revealed, the challenge was to avoid its effect that confounds other analyses and search for any correlation of cagY variants with disease. After trying several tests, we found an approach that could distinguish the clinical groups, the fragmentation of each of the cagY genes into unitigs to construct a library and scan back all genes, a strategy previously used to more efficiently compare genomes in Genome Wide Association Studies (GWAS). After scanning each gene with the unitigs library, a presence/absence matrix was built to run a dimensional reduction analysis with PCA ( Suppl Figure 2 ) and as expected, components C1 and C2 clearly clustered following the Hp ancestries ( Suppl Figure 2a ), but not the clinical source of the cagY sequence ( Suppl Figure 2b ). A variance analysis showed that the first 20 components explained almost 60% of the variance. According to these results another dimensional reduction analysis using a supervised LDA was implemented, to detect “hidden” patterns associated with disease. The resulted LDA plot showed a clear discrimination of the clinical groups, NAG, IM and GC ( Figure 2a ), LD1 discriminated GC from IM, whereas LD2 discriminate NAG from GC and IM. We then asked to which components of the PCA analysis ( Suppl Table 2 ) were associated the unitigs most informative to discriminate the disease group and for that we determined the mean decrease gini value (MDG) of each informative unitig ( Figure 2b ). Most of the unitigs with the highest MDG value were associated with components 10 or higher, and although some were also informative for ancestry (PC1-associated) it was possible to identify unitigs informative for disease but not for population structure. These results indicate that our pipeline analyses made a correct adjustment for population structure, resulting in a reliable approach to predict the clinical origin based on cagY diversity. Download figure Open in new tab Figure 2. Machine Learning analysis of cagY unitigs. a) Linear Discriminant Analysis (LDA) of the Presence/Absence matrix of unitigs. The analysis shows the phenotype-based clustering. b) Mean Decrease Gini (MDG) values of significant unitigs, where the colour represents the Principal Component (PC) associated with each unitig. The highest MDG values are related to higher PCs, while the lowest values are associated with PC 1 and PC 2. Short fragments of cagY (unitigs) may distinguish variants of cagY associated with GC The unitigs significantly associated with each clinical group were identified and aligned to CagY from Hp 26695 reference strain (Suppl. Table 2). Sixty-five unitigs were significantly associated with GC, whereas no unitig was found significantly associated with IM. Accordingly, the group of IM was not included in the following analyses, which was performed using NAG as controls and GC as cases. The ability of the 65 GC-associated unitigs to predict disease group was validated with supervised ML methods. Five supervised ML models were trained using three set of unitigs, all 22,721 unitigs from the generated library, the 65 GC-associated unitigs and 65 randomly selected unitigs ( Suppl Table 3 ). The RF model was the best prediction model with a ROC-AUC value over 83% and a F1 Score of 80% with the 65 GC-associated unitigs. In contrast, the analyzes with all-library unitigs, resulted in significantly lower ROC-AUC values and F1 Scores ( Suppl Figure 3 ). Sequence of the unitigs informative for GC locate mostly in the MRR of CagY Once the informative value of the GC-associated unitigs was validated, their position was located by aligning the sequences in the CagY protein of Hp-26695 reference strain ( Figure 3a ) and the functional domain identified. The 65 unitigs were identified in 79 different positions, 65 were informative for GC and the other 14 for NAG ( Suppl Table 4 ). Most of the GC-associated unitigs located at the MRR region and 13 mapped repeatedly in 37 different positions, starting at the 8 th repetitive module corresponding to an A module. Among the 37 unitigs, 32 mapped in the A module mostly in the delta-mu motifs, and only 5 located in the B module. The predominant location of GC-associated unitigs at the MRR is in accordance with previous reports that have associated this region with modulation of the function of the T4SS ( Barrozo et al., 2013 , 2016 ; Bella et al., 2021 ; Delahay et al., 2008 ; Liu et al., 1999 ; Tegtmeyer et al., 2020 ). Additionally, we found the presence of cysteine codifying codons in 25 of the 37 unitigs mapping within MRR ( Figure 3a ), an important observation because of the role of these many cysteine residues in the structure of the protein and its final multimer, as described below. Other informative unitigs located outside the MRR, mainly in the VirB10 region and between MRR and VirB10 ( Figure 3a ). All the NAG-associated unitigs mapped outside MRR, between MRR and VirB10 and in the FRR domain. In addition, we performed an analysis of copresence of pairs of unitigs, to identify those that were significantly co-present in the GC group, as illustrated in the Manhattan plot of Figure 3a . The analyses showed that some unitigs in the MRR region were significantly co-present with unitigs outside MRR, particularly in the VirB10 domain. Download figure Open in new tab Figure 3. Unitig association analysis of the cagY gene. a) Manhattan plot of unitig-phenotype associations, where colours represent the frequency in each phenotype, green to NAG and Red to GC. Below the plot, the copresence analysis, functional domains, and cysteine and proline codons positions are displayed. The analysis highlights significant associated regions in cagY represented as unitigs. The copresence analysis illustrates intragenic GC interactions. b) Three-dimensional structure of the MRR region, showing the representation of unitigs and cysteine positions. c) Three-dimensional structure of the 14 subunits of the MRR region, illustrating the presence of unitigs and cysteine positions. Tridimensional structure of CagY-MRR highlights the importance of associated unitigs Once the informative unitigs were mapped in the amino acid sequence of CagY we moved to elucidate the effects these variants may have on the tertiary structure of the protein using the analytical pipeline we recently developed to model CagY ( López-Luis et al., 2023 ). We first mapped the GC-informative unitigs in the structure of MRR, where many included a Cys residue as described before ( Figure 3a ). In the monomer the unitigs concentrated in a highly folded Cys-rich region sitting in the periplasm of the membrane ( Figure 3b ). The multimer (14-mer) of the protein forms a donut at the base of the T4SS with the MRR region, which lumen is supposed to be the channel through which CagA and other molecules are translocated ( Figure 3c ). The figure shows that unitigs including cysteine form concentric rings shaping the donut where the AYLDCVSQAK uniting (U27) located in the inner ring, facing to the lumen of the tunnel. The composition of this structure suggests that variation in the aa sequences around the Cys bridges could result in modification of the diameter of the tunnel. To further study this possibility, we next analyzed the structure of the 14-multimer and began with the reference HP26695 CagY to spot the known domains and the location of the informative unitigs ( Figure 4a ). Whereas most of the GC-associated unitigs located at the base on the amino side of the protein, some also localized up in the neck and in the antenna region (AP) within the VirB10 homologue motif. Figure 4b shows the localization of the informative unitigs in the 14-multimer, where it is clear how the unitigs interact to form rings along the complex CagY-multimer that constitute the skeleton of the T4SS. Download figure Open in new tab Figure 4. Three-dimensional representation of unitigs in CagY 26695. Purple represents the MRR region, dark gray represents the VirB10 homologous region, yellow represents the TM region, red and green represent the association of unitigs, and orange represents cysteine positions. a) Three-dimensional structure of the CagY multimer, showing the representation of unitigs within the CagY multimer. b) Monomer structure of CagY, in this structure we see the unitigs amog CagY sequence. c) log2 of Fold Change of unitig presence whitin CagY phenotypes. Once the model with the reference sequence was ready, the analyses was done with sequence variants associated to different diseases but with similar ancestry (hspSWEuropeMiscAmerica) two NAG, two IM and two GC. The stabilized models of the six CagY sequences ( Suppl. Figure 4a ) were different from each other, as evidenced by the RMSD values of comparing structures from the same disease ( Table 1 ), even when we compare the structures of same phenotype ( Suppl. Figure 5 ). Validation analyses for the six models are shown in suppl Figure 5a . The stabilized and validated 14-mer model for the six proteins is presented in Figure 5 , where marked differences between proteins can be observed all along the multimer, from the antenna and down to the MRR region. In the above row of figure 5 , CagY from the NAG strain HpGP-COL-120 shows the lumen of the tunnel closed, whereas in IM (strain HpGP-MEX-001) and GC (strain HpGP-PER-114) the tunnel is open. However, in the row below, all three multimers from NAG, IM and GC, respectively, present the tunnel open. Among the many changes, we noticed that the structure of the two GC sequences had a narrower diameter in the neck region (orange arrow in Figure 5 ). It seems that the structure of the CagY multimer is as diverse as the sequences, and it is hard to identify any changes associated with the disease of origin. View this table: View inline View popup Download powerpoint Table 1. Determination of the Root Mean Square Deviation value comparating between each of the eight CagY structures analyzed Download figure Open in new tab Figure 5. HpGP SWEurope CagY multimeric stabilized structures. The figure shows six structures of the CagY protein, where columns represent the phenotype and are color-coded accordingly. The GC-associated CagY structure displays a stretched pore. The IM-associated CagY structure shows the least reduction in pore size, while the NAG-associated CagY does not exhibit structural similarity. Another approach was to model CagY from strains after monkey (rhesus macaque) passage of the J66 IL-8 inducer strain, as described previously by Solnick et al. ( Barrozo et al., 2013 ). The output strain rOut1 did not induce the release of IL-8, whereas the rOut3 and J166 did induce IL-8, despite coming from the same IL-8 inducing strain and only after a brief monkey colonization. Of note, CagY from the non-IL-8 induction strain had marked differences in the virB10 sequence, particularly in the AP region that associated with a reduced dimension of the crown at the multimer tip and changes in the MRR region associated with marked structural differences at the base of the multimer ( Figure 6 ). In both regions the changes led to a closed tunnel, which would cancel the translocation of molecules through the t4SS. In contrast, the IL8+ strain (J166-IL8+ and rOut3-IL8+) presented an opened tunnel, plus other differences in the rest of the multimer. Download figure Open in new tab Figure 6. J166 CagY variants multimeric stabilized structures. The figure shows four structures of the CagY protein. The variants displays a high structure variability within structures, interestingly, the non-IL8 producer, shows pore channel fully closed, meanwhile, the IL8 producer CagY displays a fully open pore channel. Proline, a key amino acid for the structure of complex proteins is abundant in CagY A highly specialized structure like CagY multimer that interact with DNA, proteins and even other metabolites, all with specialized functions, requires a very tight control of the structure and besides cysteine we wondered if the protein was rich in proline. This was the case and CagY presents an average of 50 prolines, most present in the MRR and in the VirB10 homolog regions, shaping rings in the MRR, AP and VirB10 regions ( Figure 7 ). In contrast to cysteine present only in MRR, proline is also important in the structure of the VirB10 region, including AP. One proline is present in each of the A and B repeats and of note, like the cysteines, proline seems to be highly conserved. Download figure Open in new tab Figure 7. Three-dimensional representation of Cysteine and Proline aminoacid within CagY of Hp 26695. a) CagY monomer structure shows the Cysteine and Proline, this last is responsible of globular and VirB10 homologous region conformation as it’s shown in the arrowed regions. b) CagY multimer structure shown the high presence of Cysteine and Proline aminoacid. Finally, we illustrated the cysteines and prolines in the same six sequences from patients ( Figure 8 ). In the only strain showing closed the upper tip (NAG-HpGP-COL-120), proline in AP seems to be fully closing the tunnel. In the two GC strains showing stretched the neck an extra proline is in the more stretched spot. The same extra copy was present in an IM and a NAG strain but here the neck was not stretched. Major differences were observed in the distribution of proline in the MRR and VirB10 regions, associated with important structural changes, but no evident association with disease. Download figure Open in new tab Figure 8. HpGP SWEurope CagY multimeric stabilized structures. The figure shows six structures of the CagY protein, where columns represent the phenotype and are color-coded accordingly. The structures displays the presence of cysteine and proline aminoacid. In the strains before and after monkey passage of J166 ( Figure 9 ) the prolines in the VirB10 and in the MRR regions varied significatively, formed a ring large enough to possible help maintain open the tunnel. In contrast, proline rings in the rOut1 IL8(-) strain had a very tight ring, favoring a closed tunnel ( Figure 9 ). Download figure Open in new tab Figure 9. J166 CagY variants multimeric stabilized structures. The figure shows four structures of the CagY protein. The models display presence of cysteine and proline aminoacids Discussion CagY has shown to be a highly complex protein, the largest protein in Hp and among the largest bacterial virulence proteins, with close to 2,000 aa, 800 of which are a large region of repeating units. It is a protein that traverses from the bacterial cytoplasm to the outside of the bacterium. Furthermore, the sequence of the gene is hard to assemble using short-reads sequencing because of the large number of repeat-sequences, and there is no known homologous gene among all described proteins ( Delahay et al., 2008 ; López-Luis et al., 2023 ). Working with a large collection of cagY genes sequenced using long reads we were able to overcome previous troubles after short reads sequencing. Phylogeny, PCA and networking analyses confirmed the strong population structure of cagY , like what has been observed with the core genome ( Yahara et al., 2013 ). We previously reported this high population structure in other virulence genes like cagA and vacA ( Muñoz-Ramirez et al., 2021) , which suggests these genes have coevolved with its human host, probably because they are under strong selective pressure due to its interaction with human proteins. Furthermore, the analyses limited to the MRR region also showed a clear population structure, indicating this is a CagY region under strong selective pressure to coevolve with its human host. It has been suggested that MRR is an important region to regulate the proinflammatory activity of the t4SS ( Barrozo et al., 2016 ; Bella et al., 2021 ; Sierra et al., 2019 ) but its complexity has hindered proper functional analyses. We recently showed the importance of MRR in the structure of CagY because of its unusual high number of cysteines concentrated in this region of the protein ( López-Luis et al., 2023 ), a density of S-S bridges not observed in any other protein that we are aware of, not even in mammals. High cysteine content has been found in high life domains as vertebrates, reptiles and plants, particularly in kinases or important for stress response and selective pressure, homeostasis and in immune response as well as in protein-structure ( Barrozo et al., 2016 ; Delahay et al., 2008 ; Tran et al., 2023 ; Wang et al., 2014 ; Zhang et al., 2017 ). It is important to highlight that even in higher life domains 48 Cys in one protein is an extremely high and unusual number, making CagY an extraordinarily rare protein, with no known homology to any other protein ( Barrozo et al., 2016 ; Pace & Weerapana, 2013 ). We recently suggested that these Cys residues might be important in the highly folded protein structure formed by the MRR region, but also for the structure of the multimer where 14 MRRs form the base of the donut and the tunnel for the translocation of molecules from HP to the epithelial cell ( López-Luis et al., 2023 ). This means that as much as 672 (14X48) Cys residues are necessary to make this complex structure possible. The study of the origin and evolution of CagY is a challenging and exciting subject, even more if we consider that the HP T4SS evolved to interact specifically with human gastric epithelial cells and is the result of at least 100,000 years of co-evolution human-Hp ( Falush et al., 2003 ). In this sense it is not a surprise the strong population structure of cagY and even of the MRR region of the gene. It was important to understand the extent of population structure in the gene because when present, it may mask or confound any other analyses. Once we learned that common analytical methods reveal a strong population structure, we tried different approaches to analyze if there was a way to distinguish the clinical source of cagY by studying its sequence diversity. In short, the pipeline that worked was the following: a library with unitigs generated after fragmenting each cagY sequence was constructed and used to scan each of the 687 cagY genes studied followed by dimensional reduction analyses. Next, the unitigs that resulted informative to differentiate the clinical groups were selected with both, RandomForest (RF) and LDA. The results showed a clear separation of the genes according to disease, and although some of the selected unitigs were also informative for ancestry, many of them were useful to distinguish between GC, IM or NAG. Thus, we found that the cagY gene sequence does have information that may identify Hp strains with an increased risk of association with GC, a task that has been elusive in previous works. We built a supervised RF model using disease associated CagY fragments and assembled a variant database comprising those informative unitigs linked to clinical outcomes. It is important to highlight that almost 40% of the informative unitigs were also associated to ancestry, documenting the strong population structure of cagY . Still, the other 60% unitigs were not associated with population structure and most of these included unitigs with the highest informative value. With the above information, we were able to locate the position of each unitig associated with GC in the cagY sequence. It is interesting to note that most of the GC informative unitigs mapped to the MRR region, from the A8 to the A21 repeating-module and 25 mapped specifically in motifs δμ. Fifteen of the 25 included cysteine codons, indicating that they should be very relevant to the structure in both, the CagY monomer and the multimer and hence in the activity of the T4SS( Barrozo et al., 2016 ; Bella et al., 2021 ; Kylarova et al., 2016 ; Sierra et al., 2019 ). If the unusually high number of cysteines in the MRR region play a role other than in structure is something that needs to be further investigated ( Zhang et al., 2017 ). It has also been suggested that the A module is responsible for the binding of CagY to α5β1 integrins, and recombination within this module may alter the binding ( Skoog et al., 2018 ); however, we now know that the MRR is located in the periplasmic region and not exposed on the surface. This would then suggest that recombination within the A module may have an indirect effect on the binding to α5β1 integrins, probably as a result of modification of the protein structure. Nine GC-associated unitigs located in the VirB10 homologous region, a region that includes the outer transmembrane region, but also sites of interaction with the other cagPAI proteins CagX and CagT ( Skoog et al., 2018 ). Furthermore, VirB10 also includes the antenna region (AP) a fragment that is very important for the translocation of CagA to gastric epithelial cells, and hence in the regulation of IL8 induction ( Tran et al., 2023 ). Thus, this small region of the CagY protein has multiple functions that might be sensitive to mutations and our results show variants that affect the risk for GC. In addition, the AP showed a strong and significant co-presence with other informative unitigs present in the MRR and FRR regions, suggesting that a modification in one region may affect other functional regions of this large multifunctional protein. Considering the complex multidomain, multifunction and high sequence diversity nature of CagY ( Delahay et al., 2008 ), searching for any structure-function relationship is a very challenging task. Yet, it is a task we should start pursuing and for that, we modelled CagY from NAG, IM and GC strains all with the same hspSWEuropeMiscAmerica ancestry to minimize the influence of the population. As expected, major differences between disease strains were observed along the 14-multimer, with difference in diameter of the central tunnel, in the structure of the MRR formed donut, or in the AP region at the tip of the multimer where interaction with cell receptors occurs ( Tran et al., 2023 ). However, no evident disease-associated differences were observed, most probably because of the low number of sequences tested (two of each disease group) and so, a larger number of cases need to be modelled to find if there is any disease-associated pattern. Another approach was to study CagY from the J166 Hp strain (IL-8 inducer) after inoculated into rhesus monkeys ( Barrozo et al., 2013 ), where we modelled one output strain inducing IL8 and one that did not. A noticeable difference was that in the non-inducing strain the multimer presented the central tunnel closed (like one NAG did), whereas in the inducing strain the tunnel was open (Like in the two GC strains). Thus, changes in the lumen of the tunnel might be one mechanism to regulate the translocation of CagA and hence the induction of inflammation. Although other mechanisms, such as the alteration of the interaction of CagY with the other proteins of the T4SS might also be involved. Also, the cysteine residues could play a key regulatory function in CagY protein, thus, in our previous work ( López-Luis et al., 2023 ), we discussed about how these residues contain reductor power, flexibility and could help to CagA translocation. Prolines are known for their functional roles in transmembrane proteins such as transporters or ion channels ( Sansom, 1992 ). Because of its properties, proline play an important role in the structure of membrane proteins facilitating formation and stabilization of helixes ( Woolfson & Williams, 1990 ) and modeling studies suggest they may also function as molecular switches or hinges by allowing conformational changes (S.P. Sansom & Weinstein, 2000). These observations encouraged us to search for the presence of proline rich regions in CagY, since it is a transmembrane multimer that function as a channel for diverse molecules and has regions with densely packed helices arranged in concentering rings that may work as switch. Not surprisingly, we found these proline rich regions along the protein that in the multimer resulted in rings surrounding the channel at specific locations. It was more abundant in the MRR region regularly spaced between the cysteines, suggesting both amino acids are important in the structure and function of the CagY complex. The VirB10 homolog region was also proline enriched, including the apical AP site. As mentioned before, the modelled 14 CagY-multimer contains approximately 672 cysteine residues and may also include around 700 proline residues, an extraordinarily high concentration of these amino acids in a transmembrane transport system. This work is the first reporting a detail modelling of most of the backbone of the HP T4SS, a bacterial system highly specialized to work with human gastric epithelial cells. In summary, we found that cagY gene present a strong population structure, which attest for an ancient co-evolution of the protein with humans, a fact that interferes with other analyses. Still, a proper informatic analyses of cagY variants may distinguish the clinical groups of NAG, IM and GC and identified the sequences of the short probes (unitigs) that distinguished these groups. The position of each probe in the protein was localized and found that most of the informative unitigs mapped in the complex repetitive region (MRR) of the gene where cysteine bridges and prolines are enriched. The modelling of some of the variants illustrated major changes in the structure of the 14-multimer complex that might be part of the mechanisms by which CagY regulate the function of the T4SS and hence modulate the inflammatory response. Financial support The study was partially supported by a grant from Coordinación de Investigación en Salud, Instituto de Seguro Social Mexico (FIS-IMSS, Reg. PRIO/18/083, 2023). Disclosure statement No potential conflict of interest was reported by the authors. Supplementary Figures Download figure Open in new tab Supplementary Figure 1. Linear Discriminant Analysis (LDA) of the cagY gene based on virtual hybridization. a) LDA of the complete cagY gene, where clusters are clearly defined. b) LDA of the cagY MRR region, showing lower resolution and the clustering of European-American sequences. Download figure Open in new tab Supplementary Figure 2. Principal Component Analysis (PCA) of the unitig presence/absence matrix. a) PCA of the first 5 principal components (PCs). The first 2 components show a clear association with the H. pylori population structure, while the last 3 components do not exhibit a relationship with the population structure of H. pylori . b) Explained variance of the PCA, showing the variance captured by the first 20 PCs. Download figure Open in new tab Supplementary Figure 3. Comparison of three datasets of unitigs tested with the Random Forest algorithm. The figure shows two metrics obtained from training the three datasets: the F1 score and ROC-AUC. In both metrics, the best-performing models were constructed using the significant GC-associated unitigs. Download figure Open in new tab Supplementary Figure 4. Root Mean Square Deviation (RMSD) plot of Molecular Dynamics (MD) stabilization. a) RMSD plot of 50 ns for the CagY models from Barrozo et al., 2013 . b) RMSD plot of 10-20 ns for the HpGP SWEurope CagY models. The bottom plots show the stabilization during the final nanoseconds of the MD simulations. Download figure Open in new tab Supplementary Figure 5. Validation analysis with Ramachandran plot (PROCHECK) and ERRAT server. a) Ramachandran plots of the CagY models from Barrozo et al., 2013 . b) Ramachandran plots of the HpGP SWEurope CagY models. Download figure Open in new tab Supplementary Figure 6. Nucleotide alignment of AP and the three most significative regions of cagY gene based Hp phenotype. The phenotype-based alignments show differences in the proportion of multiple nucleotides in the AP and GC and NAG which is associated with the differences of presence/absence. The MRR unitig (U27) display many variations among sequence. Download figure Open in new tab Supplementary Figure 7. Root Mean Square Deviation comparison of CagY structures of same phenotype. NAG structures show structural homology in almost all protein, only shows differences in the connection of VirB10 homology region with MRR. IM structures show different in most of the structure, only in the inner pore of MRR the structures are similar. The GC structures show structural differences in the MRR on the outside of CagY. Supplementary Tables View this table: View inline View popup Supplementary Table 1. CagY gene status of the HpGP set View this table: View inline View popup Supplementary Table 2. Unitigs significantly associated with GC (p-value) and its principal component. View this table: View inline View popup Supplementary Table 3. ROC-AUC and F1 scores of Machine Learning training for the six models tested, including all values, only the significantly informative and a set randomly choose View this table: View inline View popup Download powerpoint Supplementary Table 4. Position of the 89 significatively informative unitigs in the cagY gene of the reference strain HP-26695; p-value of the disease association Acknowledgment We thank Dr. Santiago Sandoval for his help to perform the phylogenetic network analyses of cagY . Bibliography ↵ Alipour , M. ( 2029 ). Molecular Mechanism of Helicobacter pylori-Induced Gastric Cancer . doi: 10.1007/s12029-020-00518-5/Published OpenUrl CrossRef ↵ Asnicar , F. , Thomas , A. M. , Passerini , A. , Waldron , L. , & Segata , N. ( 2024 ). Machine learning for microbiologists . Nature Reviews Microbiology , 22 ( 4 ), 191 – 205 . doi: 10.1038/s41579-023-00984-1 OpenUrl CrossRef ↵ Backert , S. , & Tegtmeyer , N. ( 2017 ). Type IV secretion and signal transduction of Helicobacter pylori caga through interactions with host cell receptors . Toxins , 9 ( 4 ), 1 – 16 . doi: 10.3390/toxins9040115 OpenUrl CrossRef ↵ Barrozo , R. M. , Cooke , C. L. , Hansen , L. M. , Lam , A. M. , Gaddy , J. A. , Johnson , E. M. , Cariaga , T. A. , Suarez , G. , Peek , R. M. , Cover , T. L. , & Solnick , J. V. ( 2013 ). Functional Plasticity in the Type IV Secretion System of Helicobacter pylori . PLoS Pathogens . doi: 10.1371/journal.ppat.1003189 OpenUrl CrossRef PubMed ↵ Barrozo , R. M. , Hansen , L. M. , Lam , A. M. , Skoog , E. C. , Martin , M. E. , Cai , L. P. , Lin , Y. , Latoscha , A. , Suerbaum , S. , Canfield , D. R. , & Solnick , J. V. ( 2016 ). CagY Is an Immune-Sensitive Regulator of the Helicobacter pylori Type IV Secretion System . Gastroenterology . doi: 10.1053/j.gastro.2016.08.014 OpenUrl CrossRef PubMed ↵ Bella , C. Della , Soluri , M. F. , Puccio , S. , Benagiano , M. , Grassi , A. , Bitetti , J. , Cianchi , F. , Sblattero , D. , Peano , C. , & D’elios , M. M. ( 2021 ). The helicobacter pylori cagy protein drives gastric TH1 and TH17 inflammation and B cell proliferation in gastric malt lymphoma . International Journal of Molecular Sciences , 22 ( 17 ). doi: 10.3390/ijms22179459 OpenUrl CrossRef ↵ Cohen , O. A. , Macdermott-Opeskin , H. , Lee , L. , Hou , T. , Fong , K. D. , Kingsbury , R. , Wang , J. , & Persson , K. A. ( 2023 ). SolvationAnalysis: A Python toolkit for understanding liquid solvation structure in classical molecular dynamics simulations . Journal of Open Source Software , 8 ( 84 ), 5183 . doi: 10.21105/joss.05183 OpenUrl CrossRef ↵ Delahay , R. M. , Balkwill , G. D. , Bunting , K. A. , Edwards , W. , Atherton , J. C. , & Searle , M. S. ( 2008 ). The Highly Repetitive Region of the Helicobacter pylori CagY Protein Comprises Tandem Arrays of an ??-Helical Repeat Module . Journal of Molecular Biology . doi: 10.1016/j.jmb.2008.01.053 OpenUrl CrossRef PubMed ↵ Eid , J. , Fehr , A. , Gray , J. , Luong , K. , Lyle , J. , Otto , G. , Peluso , P. , Rank , D. , Baybayan , P. , Bettman , B. , Bibillo , A. , Bjornson , K. , Chaudhuri , B. , Christians , F. , Cicero , R. , Clark , S. , Dalal , R. , deWinter , A. , Dixon , J. , … Turner , S. ( 2009 ). Real-Time DNA Sequencing from Single Polymerase Molecules . Science , 323 ( 5910 ), 133 – 138 . doi: 10.1126/science.1162986 OpenUrl Abstract / FREE Full Text ↵ Eswar , N. , Webb , B. , Marti-Renom , M. A. , Madhusudhan , M. S. , Eramian , D. , Shen , M. , Pieper , U. , & Sali , A. ( 2006 ). Comparative Protein Structure Modeling Using Modeller . Current Protocols in Bioinformatics , 15 ( 1 ). doi: 10.1002/0471250953.bi0506s15 OpenUrl CrossRef PubMed ↵ Falush , D. , Wirth , T. , Linz , B. , Pritchard , J. , Stephens , M. , Kidd , M. , Blaser , M. , Graham , D. , Vacher , S. , Perez-perez , G. , Yamaoka , Y. , Mégraud , F. , Otto , K. , Reichard , U. , Katzowitsch , E. , Wang , X. , Achtman , M. , & Suerbaum , S. ( 2003 ). Traces of Human Migrations in Helicobacter pylori populations . Science , 299 ( 5612 ), 1582 – 1585 . doi: 10.1126/science.1080857 OpenUrl Abstract / FREE Full Text ↵ Fischer , W. , Tegtmeyer , N. , Stingl , K. , & Backert , S. ( 2020 ). Four Chromosomal Type IV Secretion Systems in Helicobacter pylori: Composition, Structure and Function . Frontiers in Microbiology , 11 . doi: 10.3389/FMICB.2020.01592 OpenUrl CrossRef ↵ Herrera , V. , & Parsonnet , J. ( 2009 ). Helicobacter pylori and gastric adenocarcinoma . In Clinical Microbiology and Infection (Vol. 15 , Issue 11 , pp. 971 – 976 ). Blackwell Publishing Ltd . doi: 10.1111/j.1469-0691.2009.03031.x OpenUrl CrossRef PubMed Web of Science ↵ Holley , G. , & Melsted , P. ( 2020 ). Bifrost: Highly parallel construction and indexing of colored and compacted de Bruijn graphs . Genome Biology , 21 ( 1 ). doi: 10.1186/s13059-020-02135-8 OpenUrl CrossRef PubMed ↵ Hu , B. , Khara , P. , Song , L. , Soe Lin , A. , Frick-Cheng , A. E. , Lorena Harvey , M. , Cover , T. L. , Christie , P. J. , & Scott Hultgren , E. J. ( 2019 ). In Situ Molecular Architecture of the Helicobacter pylori Cag Type IV Secretion System . https://doi.org/10 ↵ Humphrey , W. , Dalke , A. , & Schulten , K. ( 1996 ). VMD: Visual molecular dynamics . Journal of Molecular Graphics , 14 ( 1 ), 33 – 38 . doi: 10.1016/0263-7855(96)00018-5 OpenUrl CrossRef PubMed Web of Science ↵ Jackson , L. K. , Potter , B. , Schneider , S. , Fitzgibbon , M. , Blair , K. , Farah , H. , Krishna , U. , Bedford , T. , Peek , R. M. , & Salama , N. R. ( 2020 ). Helicobacter pylori diversification during chronic infection within a single host generates sub-populations with distinct phenotypes . PLoS Pathogens , 16 ( 12 ). doi: 10.1371/journal.ppat.1008686 OpenUrl CrossRef PubMed ↵ Jumper , J. , Evans , R. , Pritzel , A. , Green , T. , Figurnov , M. , Ronneberger , O. , Tunyasuvunakool , K. , Bates , R. , Žídek , A. , Potapenko , A. , Bridgland , A. , Meyer , C. , Kohl , S. A. A. , Ballard , A. J. , Cowie , A. , Romera-Paredes , B. , Nikolov , S. , Jain , R. , Adler , J. , … Hassabis , D. ( 2021 ). Highly accurate protein structure prediction with AlphaFold . Nature , 596 ( 7873 ), 583 – 589 . doi: 10.1038/s41586-021-03819-2 OpenUrl CrossRef PubMed ↵ Koelblen , T. , Bergé , C. , Cherrier , M. V. , Brillet , K. , Jimenez-Soto , L. , Ballut , L. , Takagi , J. , Montserret , R. , Rousselle , P. , Fischer , W. , Haas , R. , Fronzes , R. , & Terradot , L. ( 2017 ). Molecular dissection of protein–protein interactions between integrin α5β1 and the Helicobacter pylori Cag type IV secretion system . FEBS Journal , 284 ( 23 ), 4143 – 4157 . doi: 10.1111/febs.14299 OpenUrl CrossRef PubMed ↵ Kylarova , S. , Kosek , D. , Petrvalska , O. , Psenakova , K. , Man , P. , Vecer , J. , Herman , P. , Obsilova , V. , & Obsil , T. ( 2016 ). Cysteine residues mediate high-affinity binding of thioredoxin to ASK 1 . The FEBS Journal , 283 ( 20 ), 3821 – 3838 . doi: 10.1111/febs.13893 OpenUrl CrossRef PubMed ↵ Larkin , M. a , Blackshields , G. , Brown , N. P. , Chenna , R. , McGettigan , P. a , McWilliam , H. , Valentin , F. , Wallace , I. M. , Wilm , A. , Lopez , R. , Thompson , J. D. , Gibson , T. J. , & Higgins , D. G. ( 2007 ). Clustal W and Clustal X version 2.0 . Bioinformatics (Oxford, England) , 23 ( 21 ), 2947 – 2948 . OpenUrl CrossRef PubMed Web of Science ↵ Letunic , I. , & Bork , P. ( 2024 ). Interactive Tree of Life (iTOL) v6: Recent updates to the phylogenetic tree display and annotation tool . Nucleic Acids Research , 52 ( W1 ), W78 – W82 . doi: 10.1093/nar/gkae268 OpenUrl CrossRef PubMed ↵ Li , Y. , Choi , H. , Leung , K. , Jiang , F. , Graham , D. Y. , & Leung , W. K. ( 2023 ). Global prevalence of Helicobacter pylori infection between 1980 and 2022: a systematic review and meta-analysis . The Lancet Gastroenterology and Hepatology , 8 ( 6 ), 553 – 564 . doi: 10.1016/S2468-1253(23)00070-5 OpenUrl CrossRef PubMed ↵ Liu , G. , Mcdaniel , T. K. , Falkow , S. , & Karlin , S. ( 1999 ). Sequence anomalies in the Cag7 gene of the Helicobacter pylori pathogenicity island (Vol. 96 ). www.pnas.org . ↵ López-Luis , M. A. , Soriano-Pérez , E. E. , Parada-Fabián , J. C. , Torres , J. , Maldonado-Rodríguez , R. , & Méndez-Tenorio , A. ( 2023 ). A Proposal for a Consolidated Structural Model of the CagY Protein of Helicobacter pylori . International Journal of Molecular Sciences , 24 ( 23 ). doi: 10.3390/ijms242316781 OpenUrl CrossRef ↵ Mahesh , B. ( 2020 ). Machine Learning Algorithms - A Review . International Journal of Science and Research (IJSR) , 9 ( 1 ), 381 – 386 . doi: 10.21275/art20203995 OpenUrl CrossRef ↵ Moodley , Y. , Linz , B. , Yamaoka , Y. , Windsor , H. M. , Breurec , S. , Wu , J.-Y. , Maady , A. , Bernhöft , S. , Thiberge , J.-M. , Phuanukoonnon , S. , Jobb , G. , Siba , P. , Graham , D. Y. , Marshall , B. J. , & Achtman , M. ( 2009 ). The Peopling of the Pacific from a Bacterial Perspective . Science , 323 ( January ), 527 – 530 . OpenUrl Abstract / FREE Full Text ↵ Muñoz-Ramírez , Z. , Mendez-Tenorio , A. , Kato , I. , Bravo , M. M. , Rizzato , C. , Thorell , K. , Torres , R. , Aviles-Jimenez , F. , Camorlinga , M. , Canzian , F. , & Torres , J. ( 2017 ). Whole Genome Sequence and Phylogenetic Analysis Show Helicobacter pylori Strains from Latin America Have Followed a Unique Evolution Pathway . Frontiers in Cellular and Infection Microbiology , 7 ( 50 ), 1 – 12 . doi: 10.3389/fcimb.2017.00050 OpenUrl CrossRef ↵ Muñoz-Ramirez , Z. Y. , Pascoe , B. , Mendez-Tenorio , A. , Mourkas , E. , Sandoval-Motta , S. , Perez-Perez , G. , Morgan , D. R. , Dominguez , R. L. , Ortiz-Princz , D. , Cavazza , M. E. , Rocha , G. , Queiroz , D. M. M. , Catalano , M. , Palma , G. Z. De , Goldman , C. G. , Venegas , A. , Alarcon , T. , Oleastro , M. , Vale , F. F. , … Torres , J. ( 2021 ). A 500-year tale of co-evolution, adaptation, and virulence: Helicobacter pylori in the Americas . ISME Journal , 15 ( 1 ), 78 – 92 . doi: 10.1038/s41396-020-00758-0 OpenUrl CrossRef PubMed ↵ Pace , N. J. , & Weerapana , E. ( 2013 ). Diverse Functional Roles of Reactive Cysteines . ACS Chemical Biology , 8 ( 2 ), 283 – 296 . doi: 10.1021/cb3005269 OpenUrl CrossRef PubMed Web of Science ↵ Page , A. J. , Taylor , B. , Delaney , A. J. , Soares , J. , Seemann , T. , Keane , J. A. , & Harris , S. R. ( 2016 ). SNP-sites: rapid efficient extraction of SNPs from multi-FASTA alignments . Microbial Genomics , 2 ( 4 ), e000056 . doi: 10.1099/mgen.0.000056 OpenUrl CrossRef PubMed ↵ Pedregosa , F. , Michel , V. , Grisel , O. , Blondel , M. , Prettenhofer , P. , Weiss , R. , Vanderplas , J. , Cournapeau , D. , Pedregosa , F. , Varoquaux , G. , Gramfort , A. , Thirion , B. , Grisel , O. , Dubourg , V. , Passos , A. , Brucher , M. , Perrot , M. Duchesnay , & Duchesnay , E. ( 2011 ). Scikit-learn: Machine Learning in Python Gaël Varoquaux Bertrand Thirion Vincent Dubourg Alexandre Passos PEDREGOSA, VAROQUAUX, GRAMFORT ET AL. Matthieu Perrot . In Journal of Machine Learning Research (Vol. 12 ). http://scikit-learn.sourceforge.net . ↵ Pettersen , E. F. , Goddard , T. D. , Huang , C. C. , Meng , E. C. , Couch , G. S. , Croll , T. I. , Morris , J. H. , & Ferrin , T. E. ( 2021 ). UCSF ChimeraX: Structure visualization for researchers, educators, and developers . Protein Science , 30 ( 1 ), 70 – 82 . doi: 10.1002/pro.3943 OpenUrl CrossRef PubMed ↵ Phillips , J. C. , Hardy , D. J. , Maia , J. D. C. , Stone , J. E. , Ribeiro , J. V. , Bernardi , R. C. , Buch , R. , Fiorin , G. , Hénin , J. , Jiang , W. , McGreevy , R. , Melo , M. C. R. , Radak , B. K. , Skeel , R. D. , Singharoy , A. , Wang , Y. , Roux , B. , Aksimentiev , A. , Luthey-Schulten , Z. , … Tajkhorshid , E. ( 2020 ). Scalable molecular dynamics on CPU and GPU architectures with NAMD . The Journal of Chemical Physics , 153 ( 4 ). doi: 10.1063/5.0014475 OpenUrl CrossRef PubMed ↵ Raimondi , D. , Orlando , G. , Vranken , W. F. , & Moreau , Y. ( 2019 ). Exploring the limitations of biophysical propensity scales coupled with machine learning for protein sequence analysis . Scientific Reports , 9 ( 1 ). doi: 10.1038/s41598-019-53324-w OpenUrl CrossRef ↵ Sansom , M. S. P. ( 1992 ). Proline residues in transmembrane helices of channel and transport proteins: a molecular modelling study . “Protein Engineering, Design and Selection,” 5 ( 1 ), 53 – 60 . doi: 10.1093/protein/5.1.53 OpenUrl CrossRef PubMed Web of Science ↵ Seemann , T. ( 2014 ). Prokka: rapid prokaryotic genome annotation . Bioinformatics , 30 ( 14 ), 2068 – 2069 . doi: 10.1093/bioinformatics/btu153 OpenUrl CrossRef PubMed Web of Science ↵ Sharafutdinov , I. , Backert , S. , & Tegtmeyer , N. ( 2021 ). The Helicobacter pylori type IV secretion system upregulates epithelial cortactin expression by a CagA- and JNK-dependent pathway . Cellular Microbiology , 23 ( 10 ). doi: 10.1111/cmi.13376 OpenUrl CrossRef ↵ Sheedlo , M. J. , Chung , J. M. , Sawhney , N. , Durie , C. L. , Cover , T. L. , Ohi , M. D. , & Lacy , D. B. ( 2020 ). Cryo-EM reveals species-specific components within the helicobacter pylori cag type IV secretion system core complex . ELife , 9 , 1 – 22 . doi: 10.7554/ELIFE.59495 OpenUrl CrossRef PubMed ↵ Sierra , J. C. , Suarez , G. , Piazuelo , M. B. , Luis , P. B. , Baker , D. R. , Romero-Gallo , J. , Barry , D. P. , Schneider , C. , Morgan , D. R. , Peek , R. M. , Gobert , A. P. , & Wilson , K. T. ( 2019 ). α-Difluoromethylornithine reduces gastric carcinogenesis by causing mutations in Helicobacter pylori cagY . Proceedings of the National Academy of Sciences of the United States of America , 116 ( 11 ), 5077 – 5085 . doi: 10.1073/PNAS.1814497116/SUPPL_FILE/PNAS.1814497116.SAPP.PDF OpenUrl Abstract / FREE Full Text ↵ Skoog , E. C. , Morikis , V. A. , Martin , M. E. , Foster , G. A. , Cai , L. P. , Hansen , L. M. , Li , B. , Gaddy , J. A. , Simon , S. I. , & Solnick , J. V. ( 2018 ). CagY-Dependent Regulation of Type IV Secretion in Helicobacter pylori Is Associated with Alterations in Integrin Binding . https://doi.org/10 SP Sansom , M. , & Weinstein , H. ( 2000 ). Hinges, swivels and switches: the role of prolines in signalling via transmembrane α-helices . Trends in Pharmacological Sciences , 21 ( 11 ), 445 – 451 . doi: 10.1016/S0165-6147(00)01553-4 OpenUrl CrossRef PubMed Web of Science ↵ Tamura , K. , Stecher , G. , & Kumar , S. ( 2021 ). MEGA11: Molecular Evolutionary Genetics Analysis Version 11 . Molecular Biology and Evolution , 38 ( 7 ), 3022 – 3027 . doi: 10.1093/molbev/msab120 OpenUrl CrossRef PubMed ↵ Tegtmeyer , N. , Neddermann , M. , Lind , J. , Pachathundikandi , S. K. , Sharafutdinov , I. , Gutiérrez-Escobar , A. J. , Brönstrup , M. , Tegge , W. , Hong , M. , Rohde , M. , Delahay , R. M. , Vieth , M. , Sticht , H. , & Backert , S. ( 2020 ). Toll-like Receptor 5 Activation by the CagY Repeat Domains of Helicobacter pylori . Cell Reports , 32 ( 11 ). doi: 10.1016/j.celrep.2020.108159 OpenUrl CrossRef PubMed ↵ Thorell , K. , Muñoz-Ramírez , Z. Y. , Wang , D. , Sandoval-Motta , S. , Boscolo Agostini , R. , Ghirotto , S. , Torres , R. C. , Romero-Gallo , J. , Krishna , U. , Peek , R. M. , Piazuelo , M. B. , Raaf , N. , Bentolila , F. , Aftab , H. , Akada , J. , Matsumoto , T. , Haesebrouck , F. , Colanzi , R. P. , Bartelli , T. F. , … Rabkin , C. S. ( 2023 ). The Helicobacter pylori Genome Project: insights into H. pylori population structure from analysis of a worldwide collection of complete genomes . Nature Communications , 14 ( 1 ). doi: 10.1038/s41467-023-43562-y OpenUrl CrossRef PubMed ↵ Tran , S. C. , McClain , M. S. , & Cover , T. L. ( 2023 ). Role of the CagY antenna projection in Helicobacter pylori Cag type IV secretion system activity . Infection and Immunity , 91 ( 9 ). doi: 10.1128/iai.00150-23 OpenUrl CrossRef PubMed ↵ Wang , Y.-T. , Piyankarage , S. C. , Williams , D. L. , & Thatcher , G. R. J. ( 2014 ). Proteomic Profiling of Nitrosative Stress: Protein S-Oxidation Accompanies S-Nitrosylation . ACS Chemical Biology , 9 ( 3 ), 821 – 830 . doi: 10.1021/cb400547u OpenUrl CrossRef PubMed ↵ Wheeler , T. J. , & Eddy , S. R. ( 2013 ). Nhmmer: DNA homology search with profile HMMs . Bioinformatics , 29 ( 19 ), 2487 – 2489 . doi: 10.1093/bioinformatics/btt403 OpenUrl CrossRef PubMed Web of Science ↵ Wickham , H. ( 2016 ). ggplot2. Elegant Graphics for Data Analysis . Springer International Publishing . doi: 10.1007/978-3-319-24277-4 OpenUrl CrossRef ↵ Woolfson , D. N. , & Williams , D. H. ( 1990 ). The influence of proline residues on α-helical structure . FEBS Letters , 277 ( 1–2 ), 185 – 188 . doi: 10.1016/0014-5793(90)80839-B OpenUrl CrossRef PubMed Web of Science ↵ Yahara , K. , Furuta , Y. , Oshima , K. , Yoshida , M. , Azuma , T. , Hattori , M. , Uchiyama , I. , & Kobayashi , I. ( 2013 ). Chromosome painting in silico in a bacterial species reveals fine population structure . Molecular Biology and Evolution . ↵ Zhang , X. , Nguyen , N. , Breen , S. , Outram , M. A. , Dodds , P. N. , Kobe , B. , Solomon , P. S. , & Williams , S. J. ( 2017 ). Production of small cysteine-rich effector proteins in Escherichia coli for structural and functional studies . Molecular Plant Pathology , 18 ( 1 ), 141 – 151 . doi: 10.1111/mpp.12385 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted June 25, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following CagY sequence and structural motifs are associated with ancestry and disease in world-wide Helicobacter pylori strains Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share CagY sequence and structural motifs are associated with ancestry and disease in world-wide Helicobacter pylori strains Mario Angel López-Luis , Roberto C Torres , Jay V. Solnick , M. Constanza Camargo , Alfonso Méndez-Tenorio , Javier Torres bioRxiv 2025.06.20.660796; doi: https://doi.org/10.1101/2025.06.20.660796 Share This Article: Copy Citation Tools CagY sequence and structural motifs are associated with ancestry and disease in world-wide Helicobacter pylori strains Mario Angel López-Luis , Roberto C Torres , Jay V. Solnick , M. Constanza Camargo , Alfonso Méndez-Tenorio , Javier Torres bioRxiv 2025.06.20.660796; doi: https://doi.org/10.1101/2025.06.20.660796 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7637) Biochemistry (17705) Bioengineering (13899) Bioinformatics (41970) Biophysics (21463) Cancer Biology (18605) Cell Biology (25526) Clinical Trials (138) Developmental Biology (13385) Ecology (19911) Epidemiology (2067) Evolutionary Biology (24329) Genetics (15615) Genomics (22514) Immunology (17743) Microbiology (40424) Molecular Biology (17194) Neuroscience (88650) Paleontology (667) Pathology (2835) Pharmacology and Toxicology (4827) Physiology (7648) Plant Biology (15160) Scientific Communication and Education (2046) Synthetic Biology (4302) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-23T02:00:01.238055+00:00
License: CC-BY-4.0