upsAI: A high-accuracy machine learning classifier for predicting Plasmodium falciparum var gene upstream groups

doi:10.1101/2025.05.19.654848

upsAI: A high-accuracy machine learning classifier for predicting Plasmodium falciparum var gene upstream groups

2025 · doi:10.1101/2025.05.19.654848

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 57,545 characters · extracted from preprint-html · click to expand

upsAI: A high-accuracy machine learning classifier for predicting Plasmodium falciparum var gene upstream groups | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results upsAI: A high-accuracy machine learning classifier for predicting Plasmodium falciparum var gene upstream groups View ORCID Profile Elcid Aaron Pangilinan , Mathieu Quenu , View ORCID Profile Antoine Claessens , View ORCID Profile Thomas D. 
Otto doi: https://doi.org/10.1101/2025.05.19.654848 Elcid Aaron Pangilinan 1 School of Infection & Immunity, University of Glasgow , United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Elcid Aaron Pangilinan Mathieu Quenu 2 LPHI, CNRS, INSERM, Université de Montpellier , France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Antoine Claessens 2 LPHI, CNRS, INSERM, Université de Montpellier , France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Antoine Claessens Thomas D. Otto 1 School of Infection & Immunity, University of Glasgow , United Kingdom 2 LPHI, CNRS, INSERM, Université de Montpellier , France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Thomas D. Otto For correspondence: thomasdan.otto{at}glasgow.ac.uk Abstract Full Text Info/History Metrics Preview PDF Abstract Plasmodium falciparum erythrocyte membrane protein 1 ( Pf EMP1), encoded by the hypervariable var gene family, is central to malaria pathogenesis, influencing both disease severity and immune evasion. Classifying var genes into upstream groups (upsA, upsB, upsC, upsE) is important for understanding parasite biology and clinical outcomes, but remains challenging, especially with partial sequences, such as the DBLα tag or RNA-Seq assemblies. We developed upsAI, a machine learning-based classifier trained on 2,530 curated var genes, to accurately assign upstream groups using sequence features from different partial gene regions. We compared seven different methods, including support vector machines, random forest, XGBoost and HMMer models. The best model of upsAI achieves an overall accuracy of 83% for DBLα-tag sequences and 92% for full-length var genes, therefore significantly outperforming existing tools.
Further, we propose a new model to distinguish between internal and subtelomeric var genes with high accuracy and scalability. upsAI is available at https://github.com/sii-scRNA-Seq/upsAI , providing a robust and efficient resource for large-scale var gene analysis. It can classify var genes from 20 genomes in under one second. Introduction Malaria remains a major global health burden, with Plasmodium falciparum responsible for the vast majority of the half-million annual deaths. Transmitted by female Anopheles mosquitoes, P. falciparum causes disease during its intraerythrocytic life cycle. While early ring-stage parasites are concealed within infected red blood cells (iRBCs), trophozoite-stage parasites export virulence proteins such as P. falciparum erythrocyte membrane protein 1 (PfEMP1) to the iRBC surface( 1 ). PfEMP1 enables iRBCs to adhere to endothelial cells via host receptors such as CD36, EPCR, or ICAM1, allowing parasites to avoid splenic clearance but contributing to microvascular obstruction and inflammation, hallmarks of malaria pathology. Each parasite expresses only one PfEMP1 variant at a time, due to epigenetically regulated mutually exclusive transcription of var genes, the ∼60-member gene family that encodes PfEMP1. This controlled expression allows the parasite to switch to a new var gene in subsequent cycles, facilitating immune evasion and persistence during chronic, asymptomatic infections( 2 ). Despite their extreme sequence diversity, var genes can be grouped into four major upstream sequence (ups) classes: A, B, C, and E, corresponding to the promoter region upstream of exon 1( 3 ). These ups groups correlate with gene location and orientation: group A and E var genes are sub-telomeric and transcribed toward the telomere; group B genes are either sub-telomeric or within internal clusters; internal group B genes are also called BC, and group C genes are all internally located. 
Group E comprises the unusual var gene named var2csa , which has a conserved sequence. Internal var genes are clustered together in six different chromosomal regions that also include other variant surface antigens such as rifin and stevor . PfEMP1 proteins share a conserved domain architecture: an N-terminal segment followed by combinations of Duffy-binding-like (DBL) and cysteine-rich interdomain region (CIDR) domains, ending with a transmembrane domain and acidic terminal sequence (ATS). Different combinations of DBL and CIDR domains can further be classified into domain cassettes (DCs) ( 4 ), with different biological properties and consequences for disease aetiology ( 5 , 6 ). The DBLα-CIDRα “head” structure is nearly universal and biologically critical, as it determines host receptor binding and thus influences disease severity. For instance, group A and some B variants bind to EPCR and are associated with cerebral malaria, whereas group B and C variants tend to bind to CD36 and are linked to milder infections ( 5 – 8 ). VAR2CSA is associated with pregnancy-associated malaria as it is the ligand that binds to Chondroitin Sulfate A in the placenta ( 9 ). The DBLα domain is also widely used as a molecular marker. Due to the presence of conserved motifs, degenerate primers can amplify ∼450 bp of the DBLα “tag” region across nearly all var genes. This tag exhibits high diversity between parasite clones and provides a scalable tool for population-level studies of var diversity and expression, even in the absence of reference genomes ( 10 ). DBLα tag sequencing has proven to be a cost-effective alternative to whole-genome or transcriptome approaches, capable of tracking population structure and var expression patterns. Recently, Tan and colleagues ( 12 ) highlighted the use of DBLα tags as a scalable solution in epidemiological surveillance. Thereafter, a novel tool “cUps” was developed to take a short DBLα ‘tag’ sequence and predict its var gene ups group ( 13 ).
The tool uses a database of var gene tag sequences and links those sequences to ups groups. This was achieved using Machine Learning (ML) Hidden Markov Models (HMM), which were fitted to the different DBLα subclasses and var gene ups groups. The tool has an overall reported accuracy of 74.3% trained on 846 tag sequences( 13 ). cUps was found to be very accurate at classifying upsA, but lacks accuracy with upsB and upsCs types, consistently. In light of this, we initially endeavoured to conduct a comparative study of other machine learning techniques to discern the most effective approach for var gene annotation. We developed a new tool, “upsAI”, a suite to run different models for different partial var sequences of different lengths and explore alternate algorithms widely used in classification problems: Support Vector Machines (SVM): These models aim to find an optimal hyperplane that best separates the data points of the training dataset into different, pre-defined classes or labels. They take advantage of the kernel tricks, which transform the space of input features into a higher-dimensional space( 14 ). Random Forest: an ensemble learning method that combines multiple decision trees. Each tree is trained on a different random subset of the data and features; then, the final classification is based on the majority vote of these trees. It is robust to overfitting and can perform well on imbalanced datasets( 15 ). eXtreme Gradient Boosting (XGBoost): a powerful ensemble learning method known for use in imbalanced datasets and sparse training data. It builds decision trees sequentially, where each new tree attempts to correct the errors made by the previous one( 16 ). We trained these models to link different partial sequences of a var gene (tag sequences, domain cassette, exon 1, and full-length var gene) to their upstream group on an existing dataset of 2,530 var gene sequences. 
Methods Preparation of var Gene Database A database of full-length var genes was extracted from long-read assemblies and used as training data for models. For this, 23 Plasmodium falciparum long read assemblies were recovered from literature and publicly available datasets ( 17 – 19 ). Genome sequences were downloaded from NCBI, and assemblies were annotated using AUGUSTUS ( 20 ). Var gene sequences were identified through a BLAST search on the list of protein-coding sequences. To this dataset, we added 2,835 var gene sequences from 43 isolates from The Gambia, which were extracted from high-quality PacBio HiFi long-read assemblies. The genomes were annotated through Companion v2 ( 21 ). Var genes were retrieved from all assemblies by screening for genes annotated as potentially coding PfEMP1 by the Companion pipeline and filtering for nucleotide lengths greater than 2500 bp. All identified var genes were passed through pre-established HMM models ( 4 ) to define their domain composition. Copies of var2csa were identified by extracting var genes that had a DBLpam domain. Those genes were then extracted from the dataset and annotated as upsE. Annotation of ups Classifications To annotate and link var genes to their upstream groups (ups), we used a combination of cluster-based analyses and phylogenetic trees. Var genes of known ups groups were taken from published datasets ( 4 ), and we mapped the ups annotations for those genes using SAMtools v1.18 ( 22 ) and blastn v2.14.0 ( 23 ) onto our sequences. Then, to cluster all var genes into ups groups, we used phylogenetic tree clustering. First, the 500bp upstream sequences of all var genes were extracted, then aligned with MAFFT v7.520 ( 24 ) and a phylogenetic tree generated using FastTree 2v.1.11 ( 25 ). The var genes of the known ups group were then used to identify and label phylogenetic clades in the tree. 
Sequences clustering with var genes representative of ups group A, B and C were assigned the corresponding annotation. To generate our final training set, we added the upsE sequences and excluded 3D7 reference sequences. After assigning the ups types, we obtained a final set of 780 upsA, 2,545 upsB, and 861 upsC var genes. Additionally, 110 var2csa sequences from the long read assemblies were then included in the dataset as upsE ( Supplemental Table 1 ). To enhance the diversity found in the training datasets, highly similar sequences were filtered out. CD-HIT v4.8.1 ( 26 ) was used to cluster sequences at a 98% identity threshold within each gene section set. This retained the unique representatives with the longest sequence in each cluster. A final diverse dataset of 2,530 var genes was obtained. This resulted in 468 upsA, 1,516 upsB, 507 upsC, and 39 upsE. Extraction of partial var sequences To prepare training datasets for ML models, var genes were translated to protein sequences. We generated four types of datasets ( Figure 1 ) by pulling out partial sequences corresponding to: Tags: a sequence of ∼150 amino acids located in the DBLα domain that can be amplified through degenerate PCR primers in field isolates. The N-terminal sequence always starts with “LARSFADIG” ( 27 ). Cassettes: defined as the DBLα-CIDRα cassette pair. Exon1: A sequence that contains the NTS domain and is at least 3.5 kb long, but contains neither a var intron nor the ATS domain. Full-length var gene: The complete coding sequence of the var gene, including both exons. This was performed through custom Python scripts. Download figure Open in new tab Figure 1. Structure of var gene-encoded PfEMP1 domains. var genes can be found in subtelomeric regions, upsA/B/E, and internal var gene clusters, upsC/B types. Most var genes comprise an upstream regulatory region and two exons.
Exon 1 encodes the extracellular domains, including the N-terminal segment (NTS), and a head structure with a DBLα domain and a cysteine-rich interdomain region (CIDRα). Exon 2 encodes the intracellular acidic terminal segment (ATS). Tag DBLα sequences are conserved fragments of the DBLα domain used in PCR amplification. Domain cassettes are conserved DBLα-CIDRα combinations found across strains. Figure adapted from ( 11 ). Amongst the var genes, some sequences lacked one or more of these components, but to maximise the information gathered from these var genes and gather the largest training datasets, all identified gene sections were used. Additionally, var genes of the canonical Pf3D7 reference genome were removed from this dataset, as it was used for model verification. After excluding identical var genes, a final diverse dataset of 2,530 full-length var genes, 2,379 exon 1s, 1,995 domain cassettes, and 1,970 tags was then used to train the machine learning models ( Supplemental Table 2 ). Creating and Testing Machine Learning Models To classify var genes sequences, amino acid sequences of each partial sequence type were decomposed into their raw peptides (k-mer = 1), dipeptides (k-mer = 2), and tetrapeptides (k-mer = 4) count tables. Those were then used as features for six different machine learning algorithms: Random Forest, XGBoost, and SVMs with four different kernels (Linear, polynomial, Radial Basis Function and Sigmoid). All implementations were done in Python; the library scikit-learn v1.3.2 ( 28 ) was used for all SVM models and random forest, while xgboost v2.1.1 was used for XGBoost. The hyperparameters of the models were optimised through the Python library optuna v4.1.0. 
The complete var gene datasets were split into 80% training and 20% test datasets, and overall accuracy, precision, sensitivity and specificity were calculated and reported for each model using the Python libraries scikit-learn 1.3.2, Biopython 1.83, Pandas 2.0.3, and NumPy 1.24.4. Comparison to cUps To ensure a fair comparison with cUps, we first downloaded their original reference dataset of 846 DBLα tag sequences ( 11 ). We then applied leave-one-out cross-validation (LOOCV) using the same dataset on the upsAI models. This allowed a direct comparison of classification accuracy under the same evaluation strategy reported in the original cUps study ( 11 ). We then trained cUps with the same input as upsAI for a comparison with our data. Validation on Reference 3D7 Genome We used the upsAI models to annotate the ups type of the 61 3D7 var genes ( 29 ), with the tag sequences isolated from the full sequences using the previously mentioned method. The occurrence heatmap was done in Python with pandas 2.0.3, Seaborn 0.13.2, and Matplotlib 3.7.5. Var Gene Localisation Model Refined models with added information about var gene localisation were trained in a similar manner. To do this, labels of the var sequences were changed to upsA, upsE, upsB_subtelomeric and upsBC_internal. We previously ( 18 ) manually annotated the chromosomal position of the var genes. For the remaining genes, we verified their position on chromosomes known to harbour internal var genes and annotated the var genes accordingly. From the 2,545 upsB dataset, 1,016 subtelomeric and 297 internal sequences were successfully identified. This resulted in an annotated dataset of 780 upsA, 110 upsE, 1,016 upsB_subtelomeric, and 1,158 upsBC_internal full-length var genes. To produce the models using this alternate dataset, the gene regions were extracted as previously mentioned. This led to final unique datasets of 1,550 tags, 1,541 cassettes, 1,890 exon 1s, and 1,980 full-length var genes.
Explainable AI To visualise how a model determines the category to which a sequence is classified, the model contribution weight of each tetrapeptide was extracted from an SVM linear model trained and optimised on only two types of samples for a binary classification (e.g., upsB and upsC). A sliding window for each sequence of interest computes the local average weight of all tetrapeptides at each position, and these values are aggregated to reveal regions of high positive or negative influence, indicating a shift in the model’s classification towards one direction or the other. 3K Genomes In 2018, we published the var genes of over 2,500 field isolates ( 30 ). To attribute their ups type, the full and normalised datasets of var genes as amino acids were downloaded from https://github.com/ThomasDOtto/varDB/tree/master/Datasets . The upsAI SVM linear models were used to attribute the ups and localisation types, with the results uploaded to varDB GitHub. All custom programs were written in Python and can be found on our GitHub site ( https://github.com/sii-scRNA-Seq/upsAI ). The models used in this analysis are also available on our GitHub. Results Preparation of var Gene Database For this study, var genes from 66 long-read assemblies were collected (see Methods). As the var genes in these assemblies were annotated using different approaches, we re-annotated them as described in the Methods section. On average, 61.7 var genes were annotated per assembly, which aligns with the expected ∼60 var genes per isolate (see Supplemental Table 1 ). An additional dataset comprising 43 isolates from The Gambia was also included. Of the 3,409 var genes identified, 2,940 passed our quality control criteria, resulting in an average of 66.8 var genes per isolate. This yielded a dataset of 4,357 var genes. We then removed duplicate var genes using CD-HIT, resulting in a final dataset of 2,530 unique var genes (see Supplemental Table 2 ).
Annotation of ups Classifications To classify the upstream sequences of our var genes by ups type, we used previously published annotations ( 4 ), which included known 600 bp upstream sequences for upsA, upsB, and upsC types. These reference sequences were aligned against our dataset, resulting in confident mapping to 172 of our var genes. Sequences annotated as upsE (n = 110), corresponding to the conserved var2csa , were excluded from phylogenetic-based clustering due to their distinct and well-characterised nature. We then used the annotated sequences from the Rask et al ( 4 ) as guides to label a phylogenetic tree constructed from all 600 bp upstream sequences in our dataset ( Figure 2 ).The resulting tree showed distinct and well-supported clusters for upsA (red) and upsC (blue). In contrast, upsB-associated sequences (yellow) formed multiple subclades, reflecting higher diversity. Clades containing upsB sequences from the Rask database or in-between such clades were classified as ‘upsB’. Download figure Open in new tab Figure 2. Phylogenetic tree of upstream 600 bp sequences from var genes, including mapped Rask 2010 annotations. Manual classification of subtrees into upsA (red), upsB (yellow), and upsC (blue) groups was based on consistency with the reference annotations. After removing duplicates and excluding the 3D7 reference genes, the final dataset included 468 upsA, 1,516 upsB, 507 upsC, and 39 upsE sequences (total: 2,530). These formed the training and validation dataset used throughout our study. Exploration of upsAI Candidate Models To predict upstream types from var gene sequences, we tested Support Vector Machines (SVMs), Random Forest (RanFor), and XGBoost (XGB) across four categories of partial sequence types: tag, cassette, exon 1, and full-length var genes. Each sequence set was filtered to remove highly similar sequences, retaining only the longest representative ( Supplemental Table 2 ). 
Furthermore, we excluded var genes from the 3D7 reference genome, as these were later used as a ground truth for testing. An 80/20 split was used for model training and testing, with 80% of the data allocated for training and 20% reserved for evaluation. We explored amino acid k-mers of length 1, 2, and 4 as input features. Best results were obtained with tetrapeptides, so we performed all further analyses with that model ( Table 1 , Supplemental Table 3 ). View this table: View inline View popup Download powerpoint Table 1. Overall accuracies for classifying different var gene sequence regions. Three machine learning approaches were evaluated: Random Forest (RanFor), XGBoost (XGB), and SVMs with four kernels: Linear, Polynomial (Poly), Radial Basis Function (RBF), and Sigmoid. Only results using tetrapeptides (k-mer = 4) as input features are shown. An 80/20 split was used for training and testing. The single amino acid features consistently performed worst, with overall accuracies ranging from 0.70 to 0.79. Increasing the k-mer to 2 improved performance to 0.70–0.86. The highest accuracies were obtained using tetrapeptides, with overall performance ranging from 0.75 to 0.91 ( Supplemental Table 3 ). The best-performing method varied by sequence type: SVM Poly and RBF yielded the highest accuracy (0.85) for tag sequences, while SVM Linear performed best for cassettes (0.87), see Table 1 . For exon 1, both SVM Sigmoid and XGBoost achieved the highest accuracy (0.90). For full-length var genes, SVMs with Linear, RBF, and Sigmoid kernels all achieved an accuracy of 0.92. The differences between models within each sequence type were small (1–2%). Notably, there is a difference in runtime for the models ( Supplemental Table 3 ). SVM Linear was consistently over 100 times faster than the others, making it more energy efficient to use ( Supplemental Table 4 ). Extending the input sequence length from tags to full-length genes increased the accuracy by approximately 2–3% at each step.
For instance, the best models achieved 0.85 accuracy for tags, 0.87 for cassettes, 0.90 for exon 1, and 0.92 for full-length sequences. In summary, no single model consistently outperformed the others. However, using tetrapeptides as input features and longer input sequences consistently yielded the best results. Accuracies vary between upstream types Next, we were interested in understanding how the tools would perform on the four different ups groups. We generated confusion matrices ( Table 2 ) and compared specificity and sensitivity of each upstream group across all tetrapeptide models for the four sequence types: tag, cassette, exon 1, and full-length var genes ( Table 3 , Supplemental Table 5 ). View this table: View inline View popup Download powerpoint Table 2. Confusion matrices of tetrapeptide models across the four ups types and four sequence classes. Green cells indicate correct classifications; red cells indicate misclassifications. “n” denotes the number of sequences used for the assignment. View this table: View inline View popup Download powerpoint Table 3. Specificity and sensitivity of tetrapeptide models. Sensitivity refers to the proportion of true positives; specificity refers to the proportion of true negatives. Top-performing models are shown. Full results are available in Supplemental Table 5 . As expected, upsE classifications were perfect, due to the strain-transcendent nature of the var2csa genes. The upsA group was also classified with high accuracy, except in a few cases involving SVM Poly and Random Forest models. However, 0.39% of upsB sequences were misclassified as upsA when predicting from tag sequences, and 0.35% were misclassified when using full-length var sequences. Most errors occurred in the classification of upsC. Approximately 25–75% of upsC sequences were misclassified as upsB, depending on the model and sequence type. This may be due to the higher prevalence of upsB in P. 
falciparum genomes, which made them over-represented in the training datasets. Random Forest and XGBoost are often reported to perform better on unbalanced datasets. However, in this case, we did not observe any substantial advantage over the simpler SVM models. These misclassifications were further examined through per-group specificity and sensitivity analysis ( Table 3 ). Across all models, upsB predictions tended to yield more false positives (i.e., lower specificity), while upsC predictions produced more false negatives (i.e., lower sensitivity). The low specificity of upsB was due to a substantial proportion of upsC sequences being misclassified as upsB. Conversely, upsC predictions were highly specific (specificities above 0.92), indicating that when a sequence was predicted as upsC, it was almost always correct. However, sensitivity was lower, meaning many true upsC sequences were not detected and instead classified as upsB. Notably, the prediction accuracy using full-length sequences reached 92%, compared to 90% using exon 1 only ( Table 2 ). When including exon 2, upsC classification improved, while upsB classification slightly decreased: 78 upsC sequences were correctly classified from full-length genes compared to 64 from exon 1. This suggests a possible linkage between ups type and exon 2 in internal upsC var gene clusters. Overall, longer sequence inputs yielded better classification performance. SVM Poly and XGBoost offered the best trade-off between accuracy and sensitivity. However, for tag sequences, misclassification between upsB and upsC still occurred in approximately 15% of cases. Comparison of upsAI with cUps Currently, no tool predicts the upstream type from a full-length var gene, or other partial sequences, apart from tag sequences. The most recent method, cUps ( 13 ), uses tag sequences as input and applies HMM-based classification, trained on 846 tags with a leave-one-out cross-validation (LOOCV) accuracy of 0.74 ( Table 4 ). 
To enable a direct comparison, we retrained our classifiers using the same 846-tag dataset and implemented LOOCV. Using tetrapeptide features, our SVM models (RBF and Sigmoid kernels) achieved an accuracy of 0.84, which is 10 percentage points higher than cUps. This performance is comparable to that obtained with our larger training set (∼2500 sequences), prompting us to assess the uniqueness of the cUps dataset. Applying CD-HIT with parameters similar to those used on our unique tag set, we found that only 700 of the 846 sequences were unique, indicating 17% redundancy. This helps explain the similarity in performance despite the smaller dataset. View this table: View inline View popup Download powerpoint Table 4: Comparison of cUps with upsAI on “cUps dataset”. Using the leave-one-out cross-validation accuracies with 846 Tag Sequences of cUps and the upsAI methods. Reported is the overall accuracy. In a complementary analysis, we evaluated cUps using our dataset, with an 80/20 training/testing split (as in Tables 1 - 3 ). For ups types A, B, and C, cUps achieved sensitivities of 1.0, 0.5, and 0.42, respectively. In comparison, upsAI achieved sensitivities of 1.0, 0.9, and 0.3. The specificities were 0.91, 0.83, and 0.67 for cUps, versus 0.99, 0.65, and 0.96 for upsAI. Although individual metrics appear comparable, the overall accuracy of cUps was 0.59, significantly lower than upsAI’s 0.83. This discrepancy could be due to cUps performing better on the underrepresented upsC class, which contributes less to the overall accuracy due to class imbalance. Validation on reference 3D7 genome As further validation, we tested upsAI on the well-annotated P. falciparum 3D7 reference genome ( Figure 3 ). This serves as a strong baseline due to the high confidence in the annotated ups group labels. As expected, all models accurately classified upsA and upsE sequences. Most misclassifications occurred between upsB and upsC , consistent with previous results. 
Download figure Open in new tab Figure 3. Classification of 3D7 var gene for tag sequences and full var gene. ”Known” indicate the ground truth. The models used here are the highest accuracy models for each category, except for the Tag SVM linear, as this was included as the fastest model. For example, using the SVM linear model on tag sequences, three upsB sequences were misclassified as upsC , and four upsC sequences as upsB . Interestingly, several upsC genes, Pf3D7_0412700 , Pf3D7_0420900 , and Pf3D7_0712000 , were classified differently depending on whether the tag or the full sequence was used. Only one sequence, Pf3D7_0461300 , was misclassified with both sequence types. This suggests that combining tag- and full-sequence models may improve overall classification performance. From a biological perspective, such discrepancies may reflect recent recombination events between var gene groups in 3D7 that differ from those in the training set. Additionally, we observed that upsB classifications were nearly perfect, with errors limited to the so-called BC-type genes - internal upsB variants. In these cases, upsAI misclassified three internal upsB sequences as upsC . These results led us to hypothesise that it may be possible to develop a classifier to distinguish var genes based on chromosomal location, specifically, internal var genes versus subtelomeric ones. A novel classifier for genome location To further investigate classification errors ( Table 2 ), we found that 90% of misclassifications occurred between upsB and upsC genes located within internal var gene clusters. This supports our hypothesis from the 3D7 analysis: that recombination events are more frequent within internal clusters( 11 ), and thus these sequences may form a biologically meaningful composite class ( Figure 3 ). Based on this rationale, we re-annotated 1,980 of the 2,530 genes into four categories: upsA , upsE , upsB_subtelomeric , and upsBC_internal , as detailed in the Methods. 
We then ran all tetrapeptide-based models on this subset ( Table 5 ). Overall, classification performance for tags remained limited and 3 percentage points lower than with the first model ( Table 1 ). This slightly worse result could be explained by the reduced training set (∼75%). View this table: View inline View popup Download powerpoint Table 5. Overall Accuracies Classifying Internal and Subtelomeric var Gene Sequence Sections. The same ML models were used with a reannotated subset of var genes classified as upsA, upsE, upsB_subtelomeric, and upsBC_internal. The best overall tag accuracy reached 0.78, with sensitivity for distinguishing B_subtelomeric from BC_internal ranging from 0.60 to 0.83 (Supplemental Table). Using exon 1 sequences improved sensitivity to 0.67–0.85 and specificity to 0.80–0.91. Notably, full-length var sequences yielded results up to 10 percentage points higher, but these are less relevant given that long-read assemblies are still required for complete var reconstruction and would indicate location. Importantly, while accuracy for tag sequences is lower compared to the original classifier ( Table 1 ), classification of upsA and upsE remains near-perfect. We then compared performance across sequence types ( Table 6 , Supplementary Tables 6, 7). For tag sequences, although the overall accuracy dropped, classification between B_subtelomeric and BC_internal was more balanced than in the previous model. For instance, the polynomial SVM yielded sensitivity and specificity between 0.58 and 0.88. Most predictions were correct, contrasting with earlier results where misclassification between upsB and upsC was frequent. View this table: View inline View popup Download powerpoint Table 6. Classification of var genes from 660 isolates across twelve regions using upsAI. The linear SVM model was applied to classify exon 1 sequences by genomic position (left) and ups type (right). Each region includes 60 isolates. 
Values represent the percentage of genes assigned to each class. As expected, performance improved with exon 1 sequences, where over 75% of sequences per class were correctly classified. While full-length sequences yielded the highest performance (>85% accuracy), exon 1 might be the most practical input type. Here, overall accuracy reached 0.84, with B_subtelomeric sequences classified at >83% sensitivity and 0.90 specificity using the linear SVM model. In summary, although overall accuracy is comparable to the previous model, classification is more balanced across classes. Moreover, this analysis introduces a novel classifier for predicting the chromosomal position of var genes. Regional tetrapeptide contributions reveal internal recombination signals When testing upsAI on the 3D7 reference genome, we observed that certain sequences were classified differently depending on whether the full-length var gene or only the tag sequence was used ( Figure 3 ). For example, Pf3D7_0420900 has an upstream region classified as upsC, and the full-length model correctly assigned it as such. However, when only the tag sequence was used, the tag model classified it as upsB. A notable advantage of using SVM models is the interpretability of their decision function: the contribution weights of individual tetrapeptides can be extracted. This allows us to explore which regions of a gene - and which specific tetrapeptides - are most informative for classification into ups groups. In Figure 4 , we visualise the tetrapeptide decision weights across the sequence of Pf3D7_0420900 . In the top panel, the decision weights fluctuate along the length of the gene, suggesting no single dominant signal, indicative of a complex or mosaic sequence. To interpret this more systematically, we calculated the cumulative area under the curve (AUC) score across the gene. For Pf3D7_0420900 , the cumulative score strongly supports upsC. 
However, at the position corresponding to the tag (approximately amino acid position 184), there is a distinct dip in the cumulative score. The tag signal (green line) in this model is strongly associated with upsB, explaining the discrepancy in classification between the tag and full-length models. This suggests that the tag region is more similar to upsB, whereas the remainder of the gene aligns more closely with upsC. Download figure Open in new tab Figure 4. Regional contributions of tetrapeptides to ups classification. Top panel: Decision weights for each tetrapeptide along the sequence of Pf3D7_0420900 , generated by the SVM classifier. Middle and bottom panels: cumulative AUC-based decision scores for two genes, Pf3D7_0420900 (middle) and Pf3D7_0412700 (bottom). Red lines represent cumulative scores from the full-length gene; green lines represent scores from the tag sequence. Positive values indicate stronger support for upsC, while negative values indicate stronger support for upsB. Localised shifts suggest complex sequence composition and potential ancient recombination. A second example is shown in the bottom panel of Figure 4 , where both the tag and full-length sequence of Pf3D7_0412700 are classified as upsC. The cumulative AUC score supports this, although the initial portion of the tag shows some signal associated with upsB, again suggesting a mixed origin. These regional fluctuations in signal, where different tetrapeptides contribute positively or negatively to distinct classes, may be indicative of historical recombination events. The ability to track such regional contributions across var genes provides new insights into their complex evolutionary history. Application of upsAI to global P. falciparum genome data Finally, we applied upsAI to the varDB dataset, which includes approximately 2,500 P. falciparum genomes. Most var gene sequences in this dataset are near-complete exon 1 sequences. 
We utilised the linear SVM model for exon 1 classification, as it is the most computationally efficient, capable of classifying 100 sequences in approximately 5-15 seconds, while maintaining comparable accuracy to more complex models. Both classification models, the ups type and genomic location (internal vs subtelomeric), were applied. The resulting annotations have been uploaded to the varDB GitHub repository ( https://github.com/ThomasDOtto/varDB ). To reduce potential bias from uneven sampling, we analysed a normalised subset of the dataset consisting of 600 isolates from eleven countries (60 isolates per region), with each isolate containing between 52 and 65 var genes—except for the Senegal dataset, which had slightly fewer ( Table 6 ). While the primary goal of this analysis was to demonstrate the scalability and speed of upsAI for rapid exon 1 annotation, taking less than one second to annotate 1,200 var genes, it is also informative to observe geographical variation in upsA and upsE frequencies, which are the most confidently classified types. However, we advise caution in interpreting these patterns, as the underlying dataset is subject to biases related to sequencing quality and assembly methods. To draw robust biological conclusions regarding var gene distributions, more controlled datasets, such as RNA-Seq-derived contigs from clinical isolates stratified by disease phenotype, would be necessary. Discussion Understanding the ups type of var genes is valuable for interpreting their functional properties and roles in Plasmodium falciparum biology. However, obtaining upstream sequence information is often challenging. This is especially true for tag sequences or RNA-Seq data, where the upstream region may not be captured, and even in whole-genome Illumina datasets, low-complexity and repetitive features of ups sequences hinder assembly( 30 ). 
To address this, we developed upsAI, a machine learning–based tool that predicts ups types based on sequence features. We evaluated multiple classifiers using different input types. This is a non-trivial task, as var genes are known to recombine frequently, especially within internal gene clusters( 11 , 31 , 32 ). The tool performs well for upsA and upsE genes, which are distinct and less prone to recombination. In contrast, classification of upsB and upsC types is more challenging due to frequent recombination, particularly in internal clusters. Consequently, sensitivity and specificity for these types can be as low as 0.27. Because of class imbalance, a B call is correct 93% of the time, whereas 73% of C types are misclassified as B. These metrics improve with sequence length: overall accuracy rises from 0.82 (tags) to 0.90 (exon 1) and 0.92 (full-length genes). Interestingly, the inclusion of exon 2 shifts the sensitivity and specificity between upsB and upsC. While not formally tested, this suggests that exon 2 may be more tightly linked to the upstream region, potentially because recombination events are more likely to involve exon 1. To further investigate classification errors, we applied an explainable AI approach by analysing the contribution of individual tetrapeptides to upsB or upsC classification in a Pf3D7 var gene that showed discordant results between the tag and var gene sequences. For this gene, the tag sequence showed a strong upsB signal, whereas the remainder of the gene clearly leaned towards upsC. Interestingly, local drops in the cumulative classification signal aligned with potential recombination breakpoints, highlighting the mosaic structure of the gene. The mosaic nature of var genes has been frequently described ( 11 , 33 , 34 ), but our ability to observe it using tetrapeptides linked to upstream sequence types offers a novel approach to identifying sequence features associated with recombination events. 
We also assessed whether increasing training data would improve performance. While this is likely, the recombinogenic nature of var genes imposes an inherent ceiling on classification accuracy. In comparing models, we found that performance was generally consistent across algorithms. However, for tag sequences, some models offered marginally higher accuracy at significant computational cost - up to 100-fold slower than others. Given this, we opted for the fastest-performing model (linear SVM) for most input types and the polynomial SVM for tags. With trained models, annotating 1,200 var genes from 20 genomes takes approximately one second using linear SVM and 100 seconds using the polynomial model. We benchmarked upsAI against cUps, an HMM-based tool that utilises tag sequences. On their dataset (using leave-one-out cross-validation), upsAI outperformed cUps by 10 percentage points (0.83 vs. 0.74 accuracy, Table 4 ). Notably, 17% of sequences in the cUps dataset were duplicates, suggesting the true accuracy is likely lower. When tested on our dataset, cUps’ overall accuracy was lower, although it showed improved performance on upsC types. The predominance of upsB sequences per genome obscures this distinction. Given the difficulty in distinguishing upsB and upsC, and the interest in chromosomal positioning of var genes, we developed an additional classifier to distinguish subtelomeric from internal var genes ( Table 5 ). For tag sequences, accuracy reached 78%, and for full-length genes, up to 92%. This again suggests that exon 2 may be a strong indicator of chromosomal location. Since most researchers rely on tag sequences or partial assemblies, this classifier enables accurate prediction of genomic location in approximately 78–84% of cases. More broadly, despite the extreme polymorphism and recombination of var genes, our work demonstrates that machine learning methods can be effectively applied to the classification of these genes. 
Future work could focus on predicting functional features, such as domain cassettes, and linking these to clinical phenotypes. In conclusion, upsAI offers a robust and efficient tool for annotating var genes by their ups type and genomic location. It holds promise for enabling deeper insights into var gene expression patterns in natural infections and facilitating large-scale comparative studies. Declarations Availability of Data and Materials The different genomes were downloaded from the papers mentioned in the method. Code Availability upsAI is an open-source project and MIT licensed application, available here: https://github.com/sii-scRNA-Seq/upsAI Competing interests No competing interests Funding TDO was supported by the ExposUM Institute of the University of Montpellier (grants ANR-21-EXES-0005 and Occitanie Region) and the Wellcome Trust: 104111/Z/14/Z & A. The LaCaixa Foundation funded EAP. Authors’ contribution EAP implemented upsAI, generated the datasets and performed the analysis. MQ and AC assisted with the annotation of ups types and generated P. falciparum genomes from The Gambia. TDO conceptualised the project, TDO and AC organised funding. EAP and TDO wrote the manuscript. All authors gave feedback on the manuscript, performed edits on the manuscript and agreed with the content. Supplemental View this table: View inline View popup Download powerpoint Supplemental Table 1: Complete Training Set of Full Var Genes and ups Assignments used. View this table: View inline View popup Download powerpoint Supplemental Table 2: All and Unique Training Sets of Isolated Var Genes Components Regions View this table: View inline View popup Download powerpoint Supplemental Table 3: Overall accuracy of upsAI, including shorter peptide models View this table: View inline View popup Download powerpoint Supplemental Table 4. 
upsAI Model Speeds when training then classifying 60 3D7 complete var sequences View this table: View inline View popup Download powerpoint Supplemental Table 5. Specificity and Sensitivity of Tetrapeptide Models for first model. Sensitivity refers to the proportion of true positives, while specificity refers to the proportion of true negatives. View this table: View inline View popup Download powerpoint Supplemental Table 7. Confusion Matrixes of Internal / Subtelomeric Models. View this table: View inline View popup Download powerpoint Supplemental Table 8: Sensitivity and Specificity of Internal/Subtelomeric Models. Split of results for the different sequence types. Acknowledgements We used ChatGPT and Grammarly to edit the grammar of the manuscript. Funder Information Declared Agence Nationale de la Recherche, https://ror.org/00rbzpz17 , ANR-21-EXES-0005 Wellcome Trust, https://ror.org/029chgv08 , 104111/Z/14/Z & A References 1. ↵ Hadjimichael E , Deitsch KW . Variable surface antigen expression, virulence, and persistent infection by Plasmodium falciparum malaria parasites . Microbiol Mol Biol Rev . 2025 ; 89 ( 1 ): e0011423 . OpenUrl CrossRef PubMed 2. ↵ Zhang X , Deitsch KW . The mystery of persistent, asymptomatic Plasmodium falciparum infections . Current Opinion in Microbiology . 2022 ; 70 : 102231 . OpenUrl CrossRef PubMed 3. ↵ Lavstsen T , Salanti A , Jensen ATR , Arnot DE , Theander TG . Sub-grouping of Plasmodium falciparum 3D7 var genes based on sequence analysis of coding and non-coding regions . Malaria Journal . 2003 ; 2 ( 1 ): 27 . OpenUrl CrossRef PubMed 4. ↵ Rask TS , Hansen DA , Theander TG , Gorm Pedersen A , Lavstsen T . Plasmodium falciparum erythrocyte membrane protein 1 diversity in seven genomes--divide and conquer . PLoS Comput Biol . 2010 ; 6 ( 9 ). 5. ↵ Lavstsen T , Turner L , Saguti F , Magistrado P , Rask TS , Jespersen JS , et al. 
Plasmodium falciparum erythrocyte membrane protein 1 domain cassettes 8 and 13 are associated with severe malaria in children . Proceedings of the National Academy of Sciences of the United States of America . 2012 ; 109 ( 26 ): E1791 – 800 . OpenUrl Abstract / FREE Full Text 6. ↵ Claessens A , Adams Y , Ghumra A , Lindergard G , Buchan CC , Andisi C , et al. A subset of group A-like var genes encodes the malaria parasite ligands for binding to human brain endothelial cells . Proceedings of the National Academy of Sciences of the United States of America . 2012 ; 109 ( 26 ): E1772 – 81 . OpenUrl Abstract / FREE Full Text 7. Tonkin-Hill GQ , Trianty L , Noviyanti R , Nguyen HHT , Sebayang BF , Lampah DA , et al. The Plasmodium falciparum transcriptome in severe malaria reveals altered expression of genes involved in important processes including surface antigen-encoding var genes . PLoS Biol . 2018 ; 16 ( 3 ): e2004328 . OpenUrl CrossRef PubMed 8. ↵ Warimwe GM , Fegan G , Musyoki JN , Newton CR , Opiyo M , Githinji G , et al. Prognostic indicators of life-threatening malaria are associated with distinct parasite variant antigen profiles . Sci Transl Med . 2012 ; 4 ( 129 ): 129ra45 . OpenUrl Abstract / FREE Full Text 9. ↵ Salanti A , Staalsoe T , Lavstsen T , Jensen AT , Sowa MP , Arnot DE , et al. Selective upregulation of a single distinctly structured var gene in chondroitin sulphate A-adhering Plasmodium falciparum involved in pregnancy-associated malaria . Mol Microbiol . 2003 ; 49 ( 1 ): 179 – 91 . OpenUrl CrossRef PubMed Web of Science 10. ↵ Tonkin-Hill G , Ruybal-Pesantez S , Tiedje KE , Rougeron V , Duffy MF , Zakeri S , et al. Evolutionary analyses of the major variant surface antigen-encoding genes reveal population structure of Plasmodium falciparum within and between continents . PLoS Genet . 2021 ; 17 ( 2 ): e1009269 . OpenUrl CrossRef PubMed 11. ↵ Claessens A , Hamilton WL , Kekre M , Otto TD , Faizullabhoy A , Rayner JC , Kwiatkowski D . 
Generation of antigenic diversity in Plasmodium falciparum by structured rearrangement of Var genes during mitosis . PLoS Genet . 2014 ; 10 ( 12 ): e1004812 . OpenUrl CrossRef PubMed 12. ↵ Tan MH , Shim H , Chan YB , Day KP . Unravelling var complexity: Relationship between DBLalpha types and var genes in Plasmodium falciparum . Front Parasitol . 2022 ; 1 . 13. ↵ Tan MH , Tiedje KE , Feng Q , Zhan Q , Pascual M , Shim H , et al. A paradoxical population structure of var DBLalpha types in Africa . bioRxiv . 2023 . 14. ↵ Burges CJC . A Tutorial on Support Vector Machines for Pattern Recognition . Data Mining and Knowledge Discovery . 1998 ; 2 ( 2 ): 121 – 67 . OpenUrl CrossRef 15. ↵ Breiman L . Random Forests . Machine Learning . 2001 ; 45 ( 1 ): 5 – 32 . OpenUrl CrossRef 16. ↵ Chen T , Guestrin C. XGBoost: A Scalable Tree Boosting System. Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining ; San Francisco, California, USA : Association for Computing Machinery ; 2016 . p. 785 – 94 . 17. ↵ Ruiz JL , Reimering S , Escobar-Prieto JD , Brancucci NMB , Echeverry DF , Abdi AI , et al. From contigs towards chromosomes: automatic improvement of long read assemblies (ILRA) . Briefings in bioinformatics . 2023 ; 24 ( 4 ). 18. ↵ Otto TD , Bohme U , Sanders M , Reid A , Bruske EI , Duffy CW , et al. Long read assemblies of geographically dispersed Plasmodium falciparum isolates reveal highly structured subtelomeres . Wellcome Open Res . 2018 ; 3 : 52 . OpenUrl CrossRef PubMed 19. ↵ Moser KA , Drabek EF , Dwivedi A , Stucke EM , Crabtree J , Dara A , et al. Strains used in whole organism Plasmodium falciparum vaccine trials differ in genome structure, sequence, and immunogenic potential . Genome Med . 2020 ; 12 ( 1 ): 6 . OpenUrl CrossRef PubMed 20. ↵ Stanke M , Keller O , Gunduz I , Hayes A , Waack S , Morgenstern B . AUGUSTUS: ab initio prediction of alternative transcripts . Nucleic Acids Research . 
2006 ; 34 (Web Server issue): W435 – 9 . OpenUrl CrossRef PubMed Web of Science 21. ↵ Haese-Hill W , Crouch K , Otto TD . Annotation and visualization of parasite, fungi and arthropod genomes with Companion . Nucleic Acids Res . 2024 ; 52 ( W1 ): W39 – W44 . OpenUrl CrossRef PubMed 22. ↵ Li H , Handsaker B , Wysoker A , Fennell T , Ruan J , Homer N , et al. The Sequence Alignment/Map format and SAMtools . Bioinformatics . 2009 ; 25 ( 16 ): 2078 – 9 . OpenUrl CrossRef PubMed Web of Science 23. ↵ Camacho C , Coulouris G , Avagyan V , Ma N , Papadopoulos J , Bealer K , Madden TL . BLAST+: architecture and applications . BMC Bioinformatics . 2009 ; 10 ( 1 ): 421 . OpenUrl CrossRef PubMed 24. ↵ Katoh K , Standley DM . MAFFT multiple sequence alignment software version 7: improvements in performance and usability . Molecular biology and evolution . 2013 ; 30 ( 4 ): 772 – 80 . OpenUrl CrossRef PubMed Web of Science 25. ↵ Price MN , Dehal PS , Arkin AP . FastTree 2--approximately maximum-likelihood trees for large alignments . PLoS One . 2010 ; 5 ( 3 ): e9490 . OpenUrl CrossRef PubMed 26. ↵ Fu L , Niu B , Zhu Z , Wu S , Li W . CD-HIT: accelerated for clustering the next-generation sequencing data . Bioinformatics . 2012 ; 28 ( 23 ): 3150 – 2 . OpenUrl CrossRef PubMed Web of Science 27. ↵ Taylor HM , Kyes SA , Harris D , Kriek N , Newbold CI . A study of var gene transcription in vitro using universal var gene primers . Mol Biochem Parasitol . 2000 ; 105 ( 1 ): 13 – 23 . OpenUrl CrossRef PubMed Web of Science 28. ↵ Pedregosa F , Varoquaux G , Gramfort A , Michel V , Thirion B , Grisel O , et al. Scikit-learn: Machine Learning in Python . J Mach Learn Res . 2011 ; 12 ( null ): 2825 – 30 . OpenUrl CrossRef PubMed 29. ↵ Bohme U , Otto TD , Sanders M , Newbold CI , Berriman M . Progression of the canonical reference malaria parasite genome from 2002-2019 . Wellcome Open Res . 2019 ; 4 : 58 . OpenUrl CrossRef PubMed 30. 
↵ Otto TD , Assefa SA , Böhme U , Sanders M , Kwiatkowski D , Pf3k consortium , et al. Evolutionary analysis of the most polymorphic gene family in falciparum malaria . Wellcome Open Res . 2019 ; 4 ( 193 ). 31. ↵ Sander AF , Lavstsen T , Rask TS , Lisby M , Salanti A , Fordyce SL , et al. DNA secondary structures are associated with recombination in major Plasmodium falciparum variable surface antigen gene families . Nucleic Acids Res . 2014 ; 42 ( 4 ): 2270 – 81 . OpenUrl CrossRef PubMed 32. ↵ Bopp SE , Manary MJ , Bright AT , Johnston GL , Dharia NV , Luna FL , et al. Mitotic evolution of Plasmodium falciparum shows a stable core genome but recombination in antigen families . PLoS Genet . 2013 ; 9 ( 2 ): e1003293 . OpenUrl CrossRef PubMed 33. ↵ Bull PC , Buckee CO , Kyes S , Kortok MM , Thathy V , Guyah B , et al. Plasmodium falciparum antigenic variation. Mapping mosaic var gene sequences onto a network of shared, highly polymorphic sequence blocks . Mol Microbiol . 2008 ; 68 ( 6 ): 1519 – 34 . OpenUrl CrossRef PubMed Web of Science 34. ↵ Zilversmit MM , Chase EK , Chen DS , Awadalla P , Day KP , McVean G . Hypervariable antigen genes in malaria have ancient roots . BMC Evol Biol . 2013 ; 13 : 110 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted May 23, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following upsAI: A high-accuracy machine learning classifier for predicting Plasmodium falciparum var gene upstream groups Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share upsAI: A high-accuracy machine learning classifier for predicting Plasmodium falciparum var gene upstream groups Elcid Aaron Pangilinan , Mathieu Quenu , Antoine Claessens , Thomas D. Otto bioRxiv 2025.05.19.654848; doi: https://doi.org/10.1101/2025.05.19.654848 Share This Article: Copy Citation Tools upsAI: A high-accuracy machine learning classifier for predicting Plasmodium falciparum var gene upstream groups Elcid Aaron Pangilinan , Mathieu Quenu , Antoine Claessens , Thomas D. Otto bioRxiv 2025.05.19.654848; doi: https://doi.org/10.1101/2025.05.19.654848 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41910) Biophysics (21436) Cancer Biology (18576) Cell Biology (25480) Clinical Trials (138) Developmental Biology (13368) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40360) Molecular Biology (17163) Neuroscience (88534) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00