PILOT-GM-VAE: Patient-Level Analysis of single cell Disease Atlas with Optimal Transport of Gaussian Mixture Variational Autoencoders

doi:10.1101/2025.04.10.648188

PILOT-GM-VAE: Patient-Level Analysis of single cell Disease Atlas with Optimal Transport of Gaussian Mixture Variational Autoencoders

2025 · doi:10.1101/2025.04.10.648188

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 54,136 characters · extracted from preprint-html · click to expand

PILOT-GM-VAE: Patient-Level Analysis of single cell Disease Atlas with Optimal Transport of Gaussian Mixture Variational Autoencoders | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results PILOT-GM-VAE: Patient-Level Analysis of single cell Disease Atlas with Optimal Transport of Gaussian Mixture Variational Autoencoders Mehdi Joodaki , Mina Shaigan , Samaneh Samiei , James Nagai , Tiago Maié , Christoph Kuppe , View ORCID Profile Ivan G. Costa doi: https://doi.org/10.1101/2025.04.10.648188 Mehdi Joodaki 1 Institute for Computational Genomics, Joint Research Center for Computational Biomedicine, RWTH Aachen University Medical School , 52074 Aachen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mina Shaigan 1 Institute for Computational Genomics, Joint Research Center for Computational Biomedicine, RWTH Aachen University Medical School , 52074 Aachen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Samaneh Samiei 2 Laboratory for Quantitative Cell Dynamics and Translational Systems Biology, RWTH Aachen University Medical School , 52074 Aachen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site James Nagai 1 Institute for Computational Genomics, Joint Research Center for Computational Biomedicine, RWTH Aachen University Medical School , 52074 Aachen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Tiago Maié 1 Institute for Computational Genomics, Joint Research Center for Computational Biomedicine, RWTH Aachen University Medical School , 52074 Aachen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Christoph Kuppe 2 Laboratory for Quantitative Cell Dynamics and Translational Systems Biology, RWTH Aachen University Medical School , 52074 Aachen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site Ivan G. Costa 1 Institute for Computational Genomics, Joint Research Center for Computational Biomedicine, RWTH Aachen University Medical School , 52074 Aachen, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ivan G. Costa For correspondence: ivan.costa{at}rwth-aachen.de Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Motivation The analysis of single cell disease atlases represents a challenge due to the presence of batch effects, low quality of disease samples, and the multi-scale nature of the data, i.e., samples are described by different cell distributions. Because of these, few computational approaches are performing sample-level disease progression analysis so far. Results Here, we introduce Patient-Level Analysis with Optimal Transport based on Gaussian Mixture Variational Autoencoders (PILOT-GM-VAE). PILOT-GM-VAE explores the power of GM-VAE to estimate models describing complex single cell distributions through efficient optimal transport algorithms for estimating the distance between Gaussian Mixtures. Extensive benchmarking on several single cell disease atlases and competing approaches demonstrates the performance of PILOT-GM-VAE in sample-level clustering, sample-level trajectory inference, and batch correction tasks. Moreover, we performed a case study on a breast cancer disease atlas, where PILOT-GM-VAE highlighted cellular and molecular changes associated with breast cancer disease progression. Availability The software, code, and data for benchmarking are available at https://github.com/CostaLab/PILOT-GM-VAE/tree/main 1 Introduction Single cell sequencing technologies have revolutionized our understanding of cellular diversity and function, enabling researchers to profile gene expression at an unprecedented resolution ( Tang et al ., 2009 ; Macosko et al ., 2015 ), as shown by the recent efforts of the Human Cell Atlas measuring millions of cells in various organs ( Yanai et al ., 2024 ). Although most of the atlases focuses on healthy tissues, there are increasing efforts to create disease single cell atlases, such as in lung diseases ( Sikkema et al ., 2023 ), covid ( Ren et al ., 2021 ), myocardial infarction ( Kuppe et al ., 2022 ), and kidney diseases ( Lake et al ., 2021 ), among others. The computational analysis of disease atlases is challenging due to the presence of experimental artifacts such as batch effects ( Luecken et al ., 2022 ), which are potentially worsened by the inherently lower quality of disease tissues compared to healthy tissue. Another challenge arises from the multi-scale nature of this data, i.e., every sample/patient is represented by a distribution of cells, which are in turn characterized by their genes. Such multi-scale data (samples x cells x genes) 1 cannot be straightforwardly analyzed by “standard” machine learning methods, which usually require tabular data (cells x genes or samples x genes). Recently, a few computational approaches have been proposed for modeling a single cell experiment of a sample (or patient) as a distribution of cells. First, PhEMD (Phenotypic Earth Mover’s Distance; Chen et al . (2020) ) employed Earth Mover’s Distance to measure distances between single cell samples based on diffusion-based clustering. This method was effective in assessing drug responses in cell lines. Yet, it relies on diffusion maps and pseudotime estimates that assume a continuous trajectory of cellular states, which do not necessarily exist in organs with heterogeneous cell populations. We have recently proposed PILOT (Patient-Level Analysis with Optimal Transport; ( Joodaki et al ., 2024 )), which models samples as discrete distributions of cell types. PILOT uses optimal transport-based Wasserstein Distances(Peyré et al ., 2019) to measure the distance between samples. A benchmarking analysis of eight disease scRNA-seq atlases indicated that PILOT had higher performance in clustering and trajectory inference problems when compared to PhEMD or pseudo-bulk patient’s single cell libraries. PILOT also includes the use of non-linear models for interpretation of results, i.e., associated cell types and genes related to the trajectory. Later, Quantized Optimal Transport (QOT) was proposed, building upon the PILOT framework but using a mixture of Gaussian distributions to model the distribution of a particular cell type ( Wang et al ., 2025 ). As with PILOT, this method requires a previous annotation of cells to cell types. GloScope ( Wang et al ., 2024 ) estimates Gaussian Mixture Models (GMMs) independently for each sample to model cell distributions, while it uses a symmetric version of the Kullback-Leibler (KL) divergence to estimate a distance between distributions representing samples. Note also that GloScope was evaluated with limited benchmarking with few data sets and without a comprehensive comparison to all existing competing approaches. 2 Approach In this work, we introduce a PatIent Level Optimal Transport approach with Gaussian Mixture Variational AutoEncoders (PILOT-GM-VAE). It uses VAE to estimate Gaussian mixture models representing the distribution of a single cell disease atlas. Next, PILOT-GM-VAE explores the fact that the Wasserstein Distance between Gaussian Mixtures (GMs) can be efficiently estimated ( Chen et al ., 2018 ) by using an analytical solution to transport two Gaussian distributions ( Takatsu, 2011 ). In detail, for a given integrated and dimensionally reduced single cell disease data, PILOT-GM-VAE uses an unsupervised version of the Variational Autoencoder (GM-VAE) proposed in Kingma et al . (2014) to estimate Gaussian Mixture models ( Fig.1A ). 2 Next, the GM-VAE latent variables are used to obtain a GM representing the distributions of cells in a given sample. The sample-specific GM models are used for estimation of the Wasserstein Distance between samples ( Chen et al ., 2018 ) ( Fig.1B ). The distances can be used for downstream analysis, such as sample-level clustering or sample-level disease progression, as well as to associate cellular states and molecular features (genes) associated with clusters or trajectories as before ( Joodaki et al ., 2024 ) ( Fig.1C ). Download figure Open in new tab Fig. 1: Schematic of the PILOT-GM-VAE framework. (A) PILOT-GM-VAE receives as input data an integrated scRNA-seq data matrix x from a disease atlas. Then, it uses a GM-VAE to encode x into a latent variable z conditional on the discrete variable y. The latter represents the component assignment in a mixture model. The GM-VAE outputs an estimated data x 0 and posterior estimates on the association of data points to components q ( y | x ). (B) PILOT-GM-VAE explores the q ( y | x ) as estimated by the GM-VAE sample-specific GMs (Θ t and Θ s ). These are provided as input to the Wasserstein Distance based on the Bures-Wasserstein formulation. (C) The Wasserstein Distance between samples is used for downstream analysis, such as patient-level disease progression estimation. We benchmark PILOT-GM-VAE against PILOT, PhEMD, QOT, GloScore, and baseline approaches (PILOT-GM, Pseudobulk and cellular Proportions) in ten large single cell disease atlases with up to 250 samples and 1 million cells. We demonstrate that PILOT-GM-VAE has superior results on inferring sample-level clusters or disease trajectories, recapitulating sample disease status. Moreover, we revisit and evaluate the fact that some multi-center disease atlases [covid-19 ( Ren et al ., 2021 ); lung ( Sikkema et al ., 2023 )] have been previously shown to be affected by batch effects ( Joodaki et al ., 2024 ), i.e., the city of sampling or study of the samples dominated the signal of the sample-level analysis. Finally, we showcase the power of PILOT-GM-VAE in analyzing a sample-level trajectory on a breast cancer disease atlas ( Kumar et al ., 2023 ). 3 Methods PILOT-GM-VAE receives as input a matrix X ∈{ℝ} N × D representing an integrated disease atlas of single cell data, where N represents the number of cells and D the dimension of the data (genes or principal components, etc.). Here, x corresponds to an observation (cell) from X . Moreover, the vector s = { s 1 ,…, s N }, where s i ∈ {1,…, L } indicates which sample (patient) the cells belong to. 3.1 Mixture of Gaussians VAE 3.1.1. Generative Process The generative model defines the joint probability of the observed data x , the latent variable z , and a discrete variable y , which describes the component assignments: where where K is the number of mixture components, µ y and Σ y are the mean and covariance of the latent variable z conditioned on y , and f ( z ) is a neural network decoder that maps the latent variable z to the reconstructed data x′. 3.1.2. Inference Process The posterior distribution of the latent variable and component assignment is approximated as: where q ( y | x ) is modeled using a categorical distribution parameterized by an encoder neural network with a Gumbel-Softmax layer as in Jang et al . (2016) ; and q ( z | x, y ) is modeled as a Gaussian distribution with parameters µ y and Σ y produced by the encoder network. To sample from this distribution, we use the VAE re-parameterization trick ( Kingma et al ., 2014 ): 3.2. Objective Function The VAE-GM optimizes a weighted Evidence Lower Bound (ELBO) composed of three terms as follows: where the first term is the mean square error between the input x and the reconstructed output x rec , the second and third terms represent the Kullback-Leibler divergence between the distributions of the latent spaces of z and y and the prior distributions. The weights w rec , w gauss , w cat control the contribution of each term. For the encoder, we used two hidden layers with 50 units each, followed by ReLU activation functions ( Nair and Hinton, 2010 ). The decoder network reconstructs the input data x rec from the latent variable z using two fully connected layers. The latent space z is a 50-dimensional vector. The model was trained using the Adam optimizer (Kinga et al ., 2015) with a learning rate of 0.001. We use a training/validation split of 80% for training and 20% for validation. The batch size is set to 32, ensuring efficient and stable training. To check the convergence of the loss function, we provide an example for the kidney data set ( Lake et al ., 2021 ) in Supp. Fig. S1A. Also, we checked the clustering performance of distinct weight parameters in the kidney data set (Supp. Fig. S1B). We adopt the optimal value of this evaluaiton ( w rec = 2, w gauss = 1, and w cat = 1) for all data sets. 3.2.1. Assignment of Cells to Components After the training of the GM-VAE, we use final posterior probabilities ( q ( y | x )) to relate cells to components. These estimates are given as input for a standard GM estimation ( Pedregosa et al ., 2011 ) with K components for refinement. This is used to to define a cluster I k as: where r ik is the posterior probability of cell i to belong to component k . We can similarly define a sample specific cluster by only considering cells i associated with a sample j , i.e. s i = j 3.3 Patient Specific Mixture Models and Optimal Transport Next, we estimate parameters for a Gaussian of mixture as Θ j = (μ j , Σ j , α j ), where , and , for patient j as follows: where is the sets of cells associated to a component k and patient j ,and is the number of cells associated with sample j allocated to component k . Similarly, the covariance and mixing coefficients can be estimated as: and 3.3.1. Wasserstein Distance between Patients To quantify the difference between the distributions of two patients (source and target) represented by a mixture of gaussian’s with parameters Θ s and Θ t , we employ the Wasserstein Distance between two Gaussian mixtures defined as follows( Bonneel et al ., 2011 ): where K is the number of components in the Gaussian mixture model, and the cost function as the Bures-Wasserstein Distance ( Bhatia et al ., 2019 ) is given by: The Bures term (B) quantifies the difference in shape between two covariance matrices, which is defined as follows: where Tr is the trace operator. 3.3.2. Sample-level clustering and trajectory analysis With the distance matrix W ( Eq. 12 ), we can perform sample-level clustering by using the Leiden algorithm ( Traag et al ., 2019 ); or disease trajectories through diffusion maps ( Coifman and Lafon, 2006 ). In short, diffusion maps construct a similarity matrix using a Gaussian kernel with a bandwidth function: where ∈ is the scale parameter, and ρ s , ρ t are local bandwidth functions that adapt to the density of the data ( Berry and Harlim, 2016 ). The diffusion map embedding is derived from dominant eigenvectors (eigenvectors with the highest eigenvalues) of the normalized similarity matrix. Next, the EIPLGraph algorithm ( Albergante et al ., 2020 ) is used to fit a graph to the diffusion map and to infer the backbone of the trajectory. Start points of trajectories were selected by visual inspection, i.e., selection of regions related to control samples. This provides a ranking of samples based on a disease progression score t = { t 1 , t 2 ,…, t L }, where t l represents the rank of the sample l . In this study, only the two dominant eigenvectors were utilized for diffusion maps. Of note, this supervised step, i.e. selection of root cells, is only important for data interpretation and do not influence benchmarking results, which are invariant to this. 3.4 Datasets We collected ten publicly available single cell disease atlases for the benchmarking of sample-level analysis methods ( Table 1 ). These datasets were chosen based on the availability of at least two sample-level labels (control and disease). We used the same normalization, dimension reduction, and cell type annotation as in the original study unless this was not available. The cell type annotation is used as input by some of the algorithms that require it (such as PILOT, QOT, and Proportions). Eight datasets were used as previously described in Joodaki et al . (2024) . Two of these datasets were updated to include additional or novel samples: kidney disease ( Lake et al ., 2021 ) and myocardial infarction ( Kuppe et al ., 2022 ). The novel and re-analyzed datasets are described below. View this table: View inline View popup Download powerpoint Table 1. Characteristics of the datasets used for benchmarking, including the total number of cells, the total number of samples, the total number of cell types reported in the original study, and the number of classes (disease status) of the samples. We also provide the disease severity scores associated with the disease status. 3.4.1. Processing of single cell sequencing data We used single cell and single-nucleus assays from kidney samples provided by the Kidney Precision Medicine Project ( Lake et al ., 2021 ). Specifically, we included single cell RNA-seq from kidney tissues encompassing 104,314 cells organized into 14 major cell types. This dataset consists of samples from 45 donors: 18 controls, 12 with acute kidney failure, and 15 with chronic kidney disease. This data was obtained from CellxGene. We now include an extended version of the myocardial infarction atlas ( Kuppe et al ., 2022 ), which includes 23 samples representing controls (13), ischemic (5), and fibrotic (5) samples (MI-2). The data contains 132,888 cells clustered at 11 cell types. The data was obtained from CellxGene. The breast cancer atlas provides a detailed genomic and spatial characterization of the adult human breast. This comprehensive dataset comprises 714,331 single cells from 126 donors ( Kumar et al ., 2023 ), including samples from 86 normal breast tissues and 40 cancerous tissues. The cells are categorized into 10 distinct cell types. The pre-processed data was obtained from GEO (GSE195665). The single cell transcriptional profiling of clear cell renal cell carcinoma (ccRCC) ( Zvirblyte et al ., 2024 ) provides a comprehensive dataset characterizing the tumor microenvironment using single cell RNA sequencing. The dataset encompasses 17 samples, including 8 normal and 9 tumor samples, and identifies 6 major cell types. The pre-processed data was obtained from GEO (GSE242299). 3.5 Competing methods We describe here the evaluated competing approaches. By default, we provided the same input data for all methods. Some methods require cell types such as QOT, PILOT, and Proportions, which were defined as explained above. Other approaches have internal clustering or mixture model estimates. These methods were executed to return the same number of cluster/components as the number of cell types described in the data ( Table 1 ). All methods provided results as a sample-level distance matrix, which was used as input for the same clustering (Leiden) and trajectory analysis (diffusion maps followed by EIPLGraph pseudotime inference) pipeline described above. PhEMD processes single cell data including clustering of cells with the Monocle pipeline ( Trapnell et al ., 2014 ). For all datasets, we executed PhEMD with default parameters. An exception is the distribution models, which were chosen per dataset in order to optimize the clustering results 3 . For breast cancer, kidney cancer, kidney, MI (2), covid, lupus, and diabetes, we selected the Gaussian distribution; for lung, we used the Tobit distribution, and the negative binomial was used for the remaining datasets. Quantized Optimal Transport (QOT) ( Wang et al ., 2025 ) estimates Gaussian Mixture Models for every cluster in integrated single cell data and uses the estimated parameters to find distances using optimal transport. We provided integrated, normalized, and cell-labeled scRNA-seq data as input and ran QOT (exact method) with standard parameters. By default, two components were estimated per cell type. As an exception, for MI(1), MI(2), and kidney datasets, the number of components was set to one per cluster, due to failures when running QOT with the default parameter. GloScope is another sample-level approach, which uses GMMs to model sample-specific single cell data and measures distances using the KL divergence. Here, we set the number of components in GMMs to correspond to the number of cell types in each dataset. PILOT is a parameter-free approach, but requires cell type annotations for estimation of sample-level distributions ( Joodaki et al ., 2024 ). We also implemented several baseline approaches. The first one builds Pseudo-bulk libraries by summing gene expression counts for all cells. The Poisson distance Witten (2011) is then used to calculate differences between samples based on these aggregated counts. Another baseline approach is to determine the Proportions of each cell type within each sample from the single cell data, using the cosine distance between samples. A final baseline approach here is to use the Gaussian Mixture estimation algorithm from scikitlearn ( Pedregosa et al ., 2011 ) on the scRNA-seq data (at the sample-level) and use the OT approach proposed before ( Eq. 12 ) to detect distances. This is equivalent to skipping the GM-VAE part of PILOT-GM-VAE and is denoted as PILOT-GM . Notably, for a fair comparison, the number of components in PILOT-GM, PILOT-GM-VAE, QOT, and GloScope is set to the number of cell types in the data. 4 Results 4.1 Benchmarking of Patient-Level Clusters and Trajectories We applied PILOT-GM-VAE and eight competing methods [QOT ( Wang et al ., 2025 ); GloScope ( Wang et al ., 2024 ); PhEMD ( Chen et al ., 2020 )); PILOT ( Joodaki et al ., 2024 )] and baseline methods (PILOT-GM, Pseudobulk, Proportions, and PhEMD) to ten publicly available disease cell atlas with varying numbers of cells, samples and disease entities ( Table 1 ). First, we evaluated the accuracy of methods in a clustering task. For this, we provided all estimated distance matrices to the Leiden algorithm ( Traag et al ., 2019 ), where the resolution was set to identify the corresponding number of classes (sample labels) in the data; and we contrasted the clustering labels with the sample labels using the adjusted Rand Index ( Hubert and Arabie, 1985 ), and used the Friedman-Nemenyi ( Nemenyi, 1963 ) test to evaluate the ranking of methods over all datasets. Regarding clustering accuracy ( Fig. 2.A ), PILOT-GM-VAE has the highest mean ARI value among all competing approaches. The FN test indicates that PILOT-GM-VAE’s ranking is significantly higher (p-value < 0.05) than PhEMD, Proportions, and GloScope methods (see Supp. Table S1 for complete results). Download figure Open in new tab Fig. 2: (A) Box plots showing clustering evaluation based on the Adjusted Rand Index (ARI) across methods. The x-axis represents the methods, and the y-axis represents the ARI score indicating the accuracy of patient-level clustering. Below the box plots, we show the ranking of methods (x-axis; lower values indicate best performer) in accordance with the Friedman–Nemenyi test ranks. The gray area represents the confidence interval (p-value < 0.05) of the top-performing method (lowest rank value). (B) and (C) similar to (A) for the AUCPR and Spearman correlation analysis in the evaluation of the trajectory inference problem. Another evaluated task is the ability to infer disease progression trajectories. For this, we use the sample-level distances as input to a diffusion map algorithm ( Coifman and Lafon, 2006 ) followed by a trajectory and pseudotime inference ( Albergante et al ., 2020 ). First, we evaluate whether the trajectories produced by the algorithms (patient-level pseudotimes) discriminate the sample classes (disease vs. non-disease) using the area under the precision and recall curve (AUCPR). As seen in Fig. 2B , PILOT-GM-VAE has the highest AUCPR than all competing approaches. The FN test indicates that PILOT-GM-VAE significantly outperforms PhEMD, Proportions and PILOT. Examples of diffusion maps and trajectories for PILOT-GM-VAE, PILOT-GM, QOT and Pseudo-bulk can be seen at Supp. Fig. S2. Finally, for multi-class datasets, we can derive a disease severity score (last column of Table 1 ). For example, in the covid-19 dataset ( Ren et al ., 2021 ), we score controls as 1, mild infections as 2, and severe infections as 3. We estimate the Spearman Correlation between the pseudotime and disease severity score. As seen in Fig. 2C , PILOT-GM-VAE has the highest Spearman correlation among all competing approaches. The FN test indicates that PILOT-GM-VAE ranking is significantly higher (p-value < 0.05) than PhEMD. Altogether, these results collectively underscore the advantage of PILOT-GM-VAE in accurately capturing patient-level clusters and complex trajectories compared to other state-of-the-art approaches. 4.2 Benchmarking: computation resources Another important aspect is the computing resources required for the methods. Here, we focus on the top four methods of trajectory benchmarking: QOT, GloScope, PILOT-GM-VAE, and PILOT-GM. Note that other approaches (PILOT, Pseudo-bulk, and Proportions) require fewer computational resources, as they are based on simpler representations of the data, however, these methods were mostly outperformed in the previously described cluster and trajectory accuracy benchmarking. Altogether, PILOT-GM was the fastest approach followed by QOT, PILOT-GM-VAE and GloScope. Regarding memory, GloScope had the lowest requirement, while QOT, PILOT-GM, and PILOT-GM-VAE had overall similar requirements (Supp. Fig. S3 and Tables S2-S3). In the largest dataset, lupus PBMC with 1 million cells and 261 samples, PILOT-GM-VAE performed the analysis in less than seven hours, while requiring 41GB of memory. These results indicate that PILOT-GM-VAE (and the competing approaches) can be used to analyze complex disease single cell atlas data with millions of cells on a high-end notebook or desktop computer. 4.3 Effect of Batches in Sample-Level Analysis A previous study on sample-level analysis of disease single cell atlases ( Joodaki et al ., 2024 ) indicated that the batch effect had a high impact on overall results. This is particularly true for large disease atlases such as the lung ( Sikkema et al ., 2023 ), breast cancer ( Kumar et al ., 2023 ), and covid-19 ( Ren et al ., 2021 ), which were based on multi-center and/or meta-analysis of distinct scRNA-seq studies. To investigate the effect of batch effects on our benchmarking analysis, we estimated the association of the inferred trajectory and sample covariates such as the location of the sample collection, study of origin, and demographic information using an ANOVA F-statistic metric. Here, we consider that methods less affected by batch effects should have a higher F-statistic for the class label than any other covariate of the study. Our results ( Fig. 3 ) show that PILOT-GM-VAE is the only approach, which ranks disease status as the most significant covariate in all three multi-center datasets. On the other hand, the QOT prediction for covid-19 data was associated with the city of sample collection, and PILOT-GM results were mostly related to the sample source (breast cancer atlas) and study source (lung disease atlas) ( Fig. 3 ). These results support that PILOT-GM-VAE predictions are less influenced by potential batch artifacts than competing methods. Download figure Open in new tab Fig. 3: (A) Log-transformed ANOVA F-Statistic (y-axis) for the covariate with maximum F-statistic against evaluated methods (x-axis) in the lung scRNA-seq atlas. The bar color indicates whether the inferred trajectory is mostly associated with the disease label (green) or with covariates associated with batch effects (red or pink). Similar plots for (B) covid and (C) breast cancer disease atlas. 4.4 Disease progression in breast cancer disease atlas To exemplify the power of PILOT-GM-VAE in interpretation, we apply PILOT-GM-VAE to a breast cancer disease atlas ( Kumar et al ., 2023 ). Trajectory analysis can discriminate the majority of controls from cancer samples ( Fig. 4A-B ). For this, PILOT-GM-VAE used mixture models with ten components. Out of these, seven components corresponded directly to one of the ten major cell types described in Kumar et al . (2023) (Supp.Fig. S4). Among other differences, PILOT-GM-VAE found two components associated with fibroblast cells (components three and six), which we denote here as Fibroblast 1 and 2. Moreover, these subpopulations were not related to fibroblast subtypes described in Kumar et al . (2023) (Supp. Fig. S4C). Download figure Open in new tab Fig. 4: (A) Diffusion map of the breast cancer disease atlas. The line represents the predicted disease trajectory. (B) Diffusion map of the breast cancer disease atlas, colored by pseudotime (disease progression). (C) Proportion (y-axis) of the top six components (annotated) (p-value < 0.05 and highest R2) vs. disease progression (x-axis). (D) Gene modules detected in genes with expression specific to Fibroblast 1 (C3) and with significant changes across disease progression (x-axis). (E) GO enrichment of the genes for each cluster. (F) Kaplan-Meier plot showing survival of TCGA breast cancer patients stratified by median expression of the PSAP gene. p-values were estimated based on the top 15 genes (log-rank test) and corrected with the Benjamini-Hochberg procedure. One interesting downstream analysis is to check for how the proportion of cells associated with particular components changes over the inferred disease trajectory by the use of non-linear regression models ( Huber, 1965 ). This detected six components, in which the proportion of cells are related to the inference disease trajectories (p-value < 0.0001; Fig. 4C ). This indicated a loss of epithelial cells (basal C1 and lumsec cells C8) and an increase in immune cells (C2) and endothelial cells (C9). Interestingly the two components related to fibroblast cells (C3 and C6) have distinct changes in cell trajectories, i.e., while Fibroblast 1 (C3) increases along the disease trajectory, Fibroblast 2 (C6) decreases, particularly for samples at the end of the trajectory. To gain further insights into the mechanisms associated with the cancer-associated Fibroblast 1 (C3), we performed a non-linear regression test (p-value < 0.05) to find genes with changes associated with the disease trajectory. This was followed by a Wald test ( Van den Berge et al ., 2020 ) (p-value 0.5) to find genes whose expression is specific to Fibroblast 1 (C3) cells ( Joodaki et al ., 2024 ). A clustering analysis of these genes reveals four main gene modules (1- down-regulation, 2 - transient expression, and 3 and 4 up-regulation). Functional analysis (GO enrichment ( Reimand et al ., 2007 )) indicates that genes with transient expression (module 2) are related to cell differentiation, and clusters associated with up-regulation (modules 3-4) are related to collagen protein and the extracellular matrix ( Fig. 4E ). This indicates that these cells undergo cellular differentiation, leading to fibrosis-associated expression programs. Fibrosis is supportive of the tumor environment and related to poor prognosis in cancers ( Piersma et al ., 2020 ), including breast cancer ( Quintela-Fandino et al ., 2024 ). We next performed survival analysis using gene expression data from the TCGA breast cancer database (13 et al ., 2012) among the top 15 genes related to Fibroblast 1 cluster, we found that the PSAP gene was associated with an increase in mortality ( Fig. 4F ). This gene is part of gene module 3 ( Fig. 4D ) and shows increased expression in disease progression. This gene has been previously related to the activation of androgen and the promotion of breast cancer growth ( Ali et al ., 2015 ; Wu et al ., 2012 ). Altogether, this analysis highlights how PILOT-GM-VAE predicted trajectories can be used to highlight cellular and molecular changes in a disease single cell atlas. 5 Conclusion In this manuscript, we describe PILOT-GM-VAE, which integrates a Gaussian Mixture Variational Autoencoder with optimal transport for patient-level clustering and trajectory analysis. The use of Gaussian mixtures was previously explored by GloScope ( Wang et al ., 2024 ). However, GloScope uses a Kullback–Leibler (KL) distance to contrast GM distributions. In contrast to the Wasserstein Distance, KL is not a metric and ignores the expression information of cells. While QOT ( Wang et al ., 2025 ) also used a similar Wasserstein Distance as PILOT-GM-VAE, it estimates mixtures based on previous cell clustering. Since single cell clustering are mostly based on non-parametric graph-based algorithms such as Leiden ( Traag et al ., 2019 ), it is possible that this leads to poor GM fits. Moreover, the use of VAE-GMs indicated better results in contrast to EM estimated GMs (see PILOT-GM vs PILOT-GM-VAE). This highlights the effectiveness of GM-VAE in modeling complex and noisy single cell data. Our evaluation also included the influence of batch-related covariates in the sample-level analysis, an issue that is largely overlooked in the literature. Once again, PILOT-GM-VAE was the only approach not directly influenced by variables such as the city of sampling or the study origin. A final aspect of our benchmarking is the computational requirements, which indicates higher computational costs for more complex approaches as PILOT-GM-VAE and GloScope. However, all methods can be executed in less than a day with a high-end desktop computer, which is a minimal effort compared to the generation of the disease cell atlas. Altogether, these results highlight the advantage of PILOT-GM-VAE in sample-level analysis tasks such as clustering and trajectory analysis. Another previously poorly explored is the interpretability of sample-level analysis results. While previous approaches, such as PILOT and QOT, rely on previous clustering (or cell type annotation) to facilitate the interpretation of cell type compositional changes, the use of mixtures requires additional annotation of their clusters (or component associations). In a case study of the breast cancer disease atlas. We showed that clustering based on the GM aligned well with the major cell types reported in the original study. Interestingly, one exception supported the sub-clustering of fibroblasts, where compositional changes were inverted in the disease trajectory. Functional analysis indicated an increase in a fibrosis-related fibroblast population with disease progression. Moreover, genes with expression changes related to Fibroblast 1 cells were prognostic of disease outcome. This further supports the power of PILOT-GM-VAE in uncovering novel biological insights in into progression. Some open/future research questions are the extension of PILOT-GM-VAE in spatial disease atlas; or multimodal cell atlas. The first would require the definition of spatially-aware Gaussian mixture estimation approaches. The latter problem could be answered with kernel fusion approaches. Nevertheless, both multimodal and spatial disease atlas data are scarce, which currently impairs a comprehensive benchmarking of computational approaches. Source Code and Data Availability The software, code, trained models, estimated distances for all datasets, and tutorials are available at https://github.com/CostaLab/PILOT-GM-VAE/tree/main . All pre-processed and annotated data is available as python AnnData objects in https://zenodo.org/records/7956950 , https://zenodo.org/records/7957118 and https://zenodo.org/records/14615923 . Acknowledgments This work was funded by grants of the Deutsche Forschungsgemeinschaft KFO5011 (Projekt ID: 445703531) and by the Bundesministerium für Bildung und Forschung (BMBF e:Med Consortia Fibromap and Graphs4Patients). Funding Deutsche Forschungsgemeinschaft https://ror.org/018mejw64 KFO5011 (Projekt ID: 445703531) Footnotes https://github.com/CostaLab/PILOT-GM-VAE/tree/main https://zenodo.org/records/7956950 https://zenodo.org/records/7957118 https://zenodo.org/records/14615923 ↵ 1 Note that a disease single cell atlas cannot be defined as a 3-dimensional tensor, as single cell dimensions cannot be trivially mapped across samples. ↵ 2 The original method from Kingma was based on a semi-supervised problem. ↵ 3 This procedure made PhEMD results over-optimistic. References 13, B.. W. H.. H. M. S. C. L… P. P. J.. K. R., data analysis: Baylor College of Medicine Creighton Chad J. 22 23 Donehower Lawrence A. 22 23 24 25, G., for Systems Biology Reynolds Sheila 31 Kreisberg Richard B. 31 Bernard Brady 31 Bressler Ryan 31 Erkkila Timo 32 Lin Jake 31 Thorsson Vesteinn 31 Zhang Wei 33 Shmulevich Ilya 31, I ., et al. ( 2012 ). Comprehensive molecular portraits of human breast tumours . Nature , 490 ( 7418 ), 61 – 70 . OpenUrl CrossRef PubMed Web of Science ↵ Albergante , L. , Mirkes , E. , Bac , J. , Chen , H. , Martin , A. , Faure , L. , Barillot , E. , Pinello , L. , Gorban , A. , and Zinovyev , A. ( 2020 ). Robust and scalable learning of complex intrinsic dataset geometry via elpigraph . Entropy , 22 ( 3 ), 296 . OpenUrl CrossRef PubMed ↵ Ali , A. , Creevey , L. , Hao , Y. , McCartan , D. , O’Gaora , P. , Hill , A. , Young , L. , and McIlroy , M. ( 2015 ). Prosaposin activates the androgen receptor and potentiates resistance to endocrine treatment in breast cancer . Breast Cancer Research , 17 , 1 – 19 . OpenUrl CrossRef PubMed ↵ Berry , T. and Harlim , J. ( 2016 ). Variable bandwidth diffusion kernels . Applied and Computational Harmonic Analysis , 40 ( 1 ), 68 – 96 . OpenUrl CrossRef ↵ Bhatia , R. , Jain , T. , and Lim , Y. ( 2019 ). On the bures–wasserstein distance between positive definite matrices . Expositiones Mathematicae , 37 ( 2 ), 165 – 191 . OpenUrl CrossRef ↵ Bonneel , N. , Van De Panne , M. , Paris , S. , and Heidrich , W. ( 2011 ). Displacement interpolation using lagrangian mass transport . In Proceedings of the 2011 SIGGRAPH Asia conference , pages 1 – 12 . ↵ Chen , W. S. , Zivanovic , N. , van Dijk , D. , Wolf , G. , Bodenmiller , B. , and Krishnaswamy , S. ( 2020 ). Uncovering axes of variation among single-cell cancer specimens . Nature Methods , 17 ( 3 ), 302 – 310 . OpenUrl CrossRef PubMed ↵ Chen , Y. , Georgiou , T. T. , and Tannenbaum , A. ( 2018 ). Optimal transport for gaussian mixture models . IEEE Access , 7 , 6269 – 6278 . OpenUrl PubMed ↵ Coifman , R. R. and Lafon , S. ( 2006 ). Diffusion maps . Applied and computational harmonic analysis , 21 ( 1 ), 5 – 30 . OpenUrl CrossRef Hrovatin , K. , Bastidas-Ponce , A. , Bakhti , M. , Zappia , L. , Buttner , M. , Sallino , C. , Sterr , M. , Bottcher , A. , Migliorini , A. , Lickert , H. , et al. ( 2022 ). Delineating mouse, 8-cell identity during lifetime and in diabetes with a single cell atlas . bioRxiv , pages 2022 – 12 . ↵ Huber , P. J. ( 1965 ). A robust version of the probability ratio test . The Annals of Mathematical Statistics , pages 1753 – 1758 . ↵ Hubert , L. and Arabie , P. ( 1985 ). Comparing partitions . Journal of classification , 2 ( 1 ), 193 – 218 . OpenUrl CrossRef ↵ Jang , E. , Gu , S. , and Poole , B. ( 2016 ). Categorical reparameterization with gumbel-softmax . arXiv preprint arXiv: 1611.01144 . ↵ Joodaki , M. , Shaigan , M. , Parra , V. , Bülow , R. D. , Kuppe , C. , Hölscher , D. L. , Cheng , M. , Nagai , J. S. , Goedertier , M. , Bouteldja , N. , et al. ( 2024 ). Detection of patient-level distances from single cell genomics and pathomics data with optimal transport (pilot) . Molecular systems biology , 20 ( 2 ), 57 – 74 . OpenUrl CrossRef PubMed Kinga , D. , Adam , J. B. , et al. ( 2015 ). A method for stochastic optimization . In International conference on learning representations (ICLR) , volume 5 . San Diego, California ;. ↵ Kingma , D. P. , Mohamed , S. , Jimenez Rezende , D. , and Welling , M. ( 2014 ). Semi-supervised learning with deep generative models . Advances in neural information processing systems , 27 . ↵ Kumar , T. , Nee , K. , Wei , R. , He , S. , Nguyen , Q. H. , Bai , S. , Blake , K. , Pein , M. , Gong , Y. , Sei , E. , et al. ( 2023 ). A spatially resolved single-cell genomic atlas of the adult human breast . Nature , 620 ( 7972 ), 181 – 191 . OpenUrl CrossRef PubMed ↵ Kuppe , C. , Ramirez Flores , R. O., Li , Z. , Hayat , S. , Levinson , R. T. , Liao , X. , Hannani , M. T. , Tanevski , J. , Wünnemann , F. , Nagai , J. S. , et al. ( 2022 ). Spatial multi-omic map of human myocardial infarction . ↵ Lake , B. B. , Menon , R. , Winfree , S. , Hu , Q. , Ferreira , R. M. , Kalhor , K. , Barwinska , D. , Otto , E. A. , Ferkowicz , M. , Diep , D. , et al. ( 2021 ). An atlas of healthy and injured cell states and niches in the human kidney . bioRxiv , pages 2021 – 07 . ↵ Luecken , M. D. , Büttner , M. , Chaichoompu , K. , Danese , A. , Interlandi , M. , Müller , M. F. , Strobl , D. C. , Zappia , L. , Dugas , M. , Colomé- Tatché , M. , et al. ( 2022 ). Benchmarking atlas-level data integration in single-cell genomics . Nature methods , 19 ( 1 ), 41 – 50 . OpenUrl CrossRef PubMed ↵ Macosko , E. Z. , Basu , A. , Satija , R. , Nemesh , J. , Shekhar , K. , Goldman , M. , Tirosh , I. , Bialas , A. R. , Kamitaki , N. , Martersteck , E. M. , et al. ( 2015 ). Highly parallel genome-wide expression profiling of individual cells using nanoliter droplets . Cell , 161 ( 5 ), 1202 – 1214 . OpenUrl CrossRef PubMed ↵ Nair , V. and Hinton , G. E. ( 2010 ). Rectified linear units improve restricted boltzmann machines . In Proceedings of the 27th international conference on machine learning (ICML-10) , pages 807 – 814 . ↵ Nemenyi , P. B. ( 1963 ). Distribution-free multiple comparisons . Princeton University . ↵ Pedregosa , F. , Varoquaux , G. , Gramfort , A. , Michel , V. , Thirion , B. , Grisel , O. , Blondel , M. , Prettenhofer , P. , Weiss , R. , Dubourg , V. , Vanderplas , J. , Passos , A. , Cournapeau , D. , Brucher , M. , Perrot , M. , and Duchesnay , E. ( 2011 ). Scikit-learn: Machine learning in Python . Journal of Machine Learning Research , 12 , 2825 – 2830 . OpenUrl Peng , J. , Sun , B.-F. , Chen , C.-Y. , Zhou , J.-Y. , Chen , Y.-S. , Chen , H. , Liu , L. , Huang , D. , Jiang , J. , Cui , G.-S. , et al. ( 2019 ). Single-cell rna-seq highlights intra-tumoral heterogeneity and malignant progression in pancreatic ductal adenocarcinoma . Cell research , 29 ( 9 ), 725 – 738 . OpenUrl CrossRef PubMed Perez , R. K. , Gordon , M. G. , Subramaniam , M. , Kim , M. C. , Hartoularos , G. C. , Targ , S. , Sun , Y. , Ogorodnikov , A. , Bueno , R. , Lu , A. , et al. ( 2022 ). Single-cell rna-seq reveals cell type–specific molecular and genetic associations to lupus . Science , 376 ( 6589 ), eabf1970 . OpenUrl CrossRef PubMed Peyré , G. , Cuturi , M. , et al. ( 2019 ). Computational optimal transport: With applications to data science . Foundations and Trends® in Machine Learning , 11 ( 5-6 ), 355 – 607 . OpenUrl CrossRef ↵ Piersma , B. , Hayward , M.-K. , and Weaver , V. M. ( 2020 ). Fibrosis and cancer: A strained relationship . Biochimica et Biophysica Acta (BBA)-Reviews on Cancer , 1873 ( 2 ), 188356 . OpenUrl ↵ Quintela-Fandino , M. , Bermejo , B. , Zamora , E. , Moreno , F. , García-Saenz , J.Á. , Pernas , S. , Martínez-Jañez , N. , Jiménez , D. , Adrover , E. , de Andrés , R. , et al. ( 2024 ). High mechanical conditioning by tumor extracellular matrix stiffness is a predictive biomarker for antifibrotic therapy in her2-negative breast cancer . Clinical Cancer Research , 30 ( 22 ), 5094 – 5104 . OpenUrl CrossRef PubMed ↵ Reimand , J. , Kull , M. , Peterson , H. , Hansen , J. , and Vilo , J. ( 2007 ). g: Profiler—a web-based toolset for functional profiling of gene lists from large-scale experiments . Nucleic acids research , 35 (suppl_2), W193 – W200 . OpenUrl CrossRef PubMed Web of Science ↵ Ren , X. , Wen , W. , Fan , X. , Hou , W. , Su , B. , Cai , P. , Li , J. , Liu , Y. , Tang , F. , Zhang , F. , et al. ( 2021 ). Covid-19 immune features revealed by a large-scale single-cell transcriptome atlas . Cell , 184 ( 7 ), 1895 – 1913 . OpenUrl CrossRef PubMed ↵ Sikkema , L. , Ramírez-Suástegui , C. , Strobl , D. C. , Gillett , T. E. , Zappia , L. , Madissoon , E. , Markov , N. S. , Zaragosi , L.-E. , Ji , Y. , Ansari , M. , Arguel , M.-J. , Apperloo , L. , Banchero , M. , Bécavin , C. , Berg , M. , Chichelnitskiy , E. , Chung , M.-i. , Collin , A. , Gay , A. C. A. , Gote-Schniering , J. , Hooshiar Kashani , B. , Inecik , K. , Jain , M. , Kapellos , T. S. , Kole , T. M. , Leroy , S. , Mayr , C. H. , Oliver , A. J. , Von Papen , M. , Peter , L. , Taylor , C. J. , Walzthoeni , T. , Xu , C. , Bui , L. T. , De Donno , C. , Dony , L. , Faiz , A. , Guo , M. , Gutierrez , A. J. , Heumos , L. , Huang , N. , Ibarra , I. L. , Jackson , N. D. , Kadur Lakshminarasimha Murthy , P. , Lotfollahi , M. , Tabib , T. , Talavera-López , C. , Travaglini , K. J. , Wilbrey-Clark , A. , Worlock , K. B. , Yoshida , M. , Lung Biological Network Consortium , Chen , Y. , Hagood , J. S. , Agami , A. , Horvath , P. , Lundeberg , J. , Marquette , C.-H. , Pryhuber , G. , Samakovlis , C. , Sun , X. , Ware , L. B. , Zhang , K. , Van Den Berge , M. , Bossé , Y. , Desai , T. J. , Eickelberg , O. , Kaminski , N. , Krasnow , M. A. , Lafyatis , R. , Nikolic , M. Z. , Powell , J. E. , Rajagopal , J. , Rojas , M. , Rozenblatt-Rosen , O. , Seibold , M. A. , Sheppard , D. , Shepherd , D. P. , Sin , D. D. , Timens , W. , Tsankov , A. M. , Whitsett , J. , Xu , Y. , Banovich , N. E. , Barbry , P. , Duong , T. E. , Falk , C. S. , Meyer , K. B. , Kropski , J. A. , Pe’er , D. , Schiller , H. B. , Tata , P. R. , Schultze , J. L. , Teichmann , S. A. , Misharin , A. V. , Nawijn , M. C. , Luecken , M. D. , and Theis , F. J. ( 2023 ). An integrated cell atlas of the lung in health and disease . Nature Medicine , 29 ( 6 ), 1563 – 1577 . OpenUrl CrossRef PubMed ↵ Takatsu , A. ( 2011 ). Wasserstein geometry of gaussian measures . ↵ Tang , F. , Barbacioru , C. , Wang , Y. , Nordman , E. , Lee , C. , Xu , N. , Wang , X. , Bodeau , J. , Tuch , B. B. , Siddiqui , A. , et al. ( 2009 ). mrna-seq whole-transcriptome analysis of a single cell . Nature methods , 6 ( 5 ), 377 – 382 . OpenUrl CrossRef PubMed ↵ Traag , V. A. , Waltman , L. , and Van Eck , N. J. ( 2019 ). From louvain to leiden: guaranteeing well-connected communities . Scientific reports , 9 ( 1 ), 5233 . OpenUrl CrossRef PubMed ↵ Trapnell , C. , Cacchiarelli , D. , Grimsby , J. , Pokharel , P. , Li , S. , Morse , M. , Lennon , N. J. , Livak , K. J. , Mikkelsen , T. S. , and Rinn , J. L. ( 2014 ). The dynamics and regulators of cell fate decisions are revealed by pseudotemporal ordering of single cells . Nature biotechnology , 32 ( 4 ), 381 – 386 . OpenUrl CrossRef PubMed ↵ Van den Berge , K. , Roux de Bézieux , H. , Street , K. , Saelens , W. , Cannoodt , R. , Saeys , Y. , Dudoit , S. , and Clement , L. ( 2020 ). Trajectory-based differential expression analysis for single-cell sequencing data . Nature communications , 11 ( 1 ), 1201 . OpenUrl CrossRef PubMed ↵ Wang , H. , Torous , W. , Gong , B. , and Purdom , E. ( 2024 ). Visualizing scrna-seq data at population scale with gloscope . Genome Biology , 25 ( 1 ), 259 . OpenUrl CrossRef PubMed ↵ Wang , Z. , Zhan , Q. , Yang , S. , Mu , S. , Chen , J. , Garai , S. , Orzechowski , P. , Wagenaar , J. , and Shen , L. ( 2025 ). Qot: Quantized optimal transport for sample-level distance matrix in single-cell omics . Briefings in Bioinformatics , 26 ( 1 ), bbae713 . OpenUrl ↵ Witten , D. M. ( 2011 ). Classification and clustering of sequencing data using a poisson model . ↵ Wu , Y. , Sun , L. , Zou , W. , Xu , J. , Liu , H. , Wang , W. , Yun , X. , and Gu , J. ( 2012 ). Prosaposin, a regulator of estrogen receptor alpha, promotes breast cancer growth . Cancer science , 103 ( 10 ), 1820 – 1825 . OpenUrl CrossRef PubMed ↵ Yanai , I. , Haas , S. , Lippert , C. , and Kretzmer , H. ( 2024 ). Cellular atlases are unlocking the mysteries of the human body . ↵ Zvirblyte , J. , Nainys , J. , Juzenas , S. , Goda , K. , Kubiliute , R. , Dasevicius , D. , Kincius , M. , Ulys , A. , Jarmalaite , S. , and Mazutis , L. ( 2024 ). Single-cell transcriptional profiling of clear cell renal cell carcinoma reveals a tumor-associated endothelial tip cell phenotype . Communications Biology , 7 ( 1 ), 780 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted April 16, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following PILOT-GM-VAE: Patient-Level Analysis of single cell Disease Atlas with Optimal Transport of Gaussian Mixture Variational Autoencoders Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share PILOT-GM-VAE: Patient-Level Analysis of single cell Disease Atlas with Optimal Transport of Gaussian Mixture Variational Autoencoders Mehdi Joodaki , Mina Shaigan , Samaneh Samiei , James Nagai , Tiago Maié , Christoph Kuppe , Ivan G. Costa bioRxiv 2025.04.10.648188; doi: https://doi.org/10.1101/2025.04.10.648188 Share This Article: Copy Citation Tools PILOT-GM-VAE: Patient-Level Analysis of single cell Disease Atlas with Optimal Transport of Gaussian Mixture Variational Autoencoders Mehdi Joodaki , Mina Shaigan , Samaneh Samiei , James Nagai , Tiago Maié , Christoph Kuppe , Ivan G. Costa bioRxiv 2025.04.10.648188; doi: https://doi.org/10.1101/2025.04.10.648188 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41910) Biophysics (21436) Cancer Biology (18576) Cell Biology (25480) Clinical Trials (138) Developmental Biology (13368) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40360) Molecular Biology (17163) Neuroscience (88534) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00