Diagnosis of Blood Diseases and Disorders with Topological Deep Learning

doi:10.1101/2025.01.21.25320908

Diagnosis of Blood Diseases and Disorders with Topological Deep Learning

2025 · doi:10.1101/2025.01.21.25320908

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 47,262 characters · extracted from preprint-html · click to expand

Diagnosis of Blood Diseases and Disorders with Topological Deep Learning | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Diagnosis of Blood Diseases and Disorders with Topological Deep Learning Philmore F. Koung , Saba Fatema , View ORCID Profile Nagehan Pakasticali , View ORCID Profile Hung S. Luu , View ORCID Profile Baris Coskunuzer doi: https://doi.org/10.1101/2025.01.21.25320908 Philmore F. Koung 1 University of Texas at Dallas , Richardson, TX USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Saba Fatema 1 University of Texas at Dallas , Richardson, TX USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Nagehan Pakasticali 2 UT Southwestern Medical Center , Dallas, TX USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nagehan Pakasticali Hung S. Luu 2 UT Southwestern Medical Center , Dallas, TX USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hung S. Luu Baris Coskunuzer 1 University of Texas at Dallas , Richardson, TX USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Baris Coskunuzer For correspondence: coskunuz{at}utdallas.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Blood diseases and disorders, including leukemia and infectious diseases of red blood cells, pose significant diagnostic challenges due to their complex presentations and reliance on time-consuming cytomorphological analysis to detect subtle morphologic features. While microscopic examination remains the gold standard in their diagnosis, its dependence on expert interpretation high-lights the need for advanced methods to enhance the diagnostic workflow. Recently, deep learning (DL) methods have shown promise in medical imaging by automating and improving accuracy. However, these approaches often require large, annotated datasets, which are scarce for rare diseases, and they often face interpretability issues, limiting their integration into clinical practice. In this study, we present a novel framework that integrates topological features with state-of-the-art DL techniques to enhance the analysis of cytomorphological images for diagnosing blood disorders. By combining global topological information with the localized patterns captured by DL models, our approach improves diagnostic accuracy while ensuring robustness, interpretability, and reproducibility. Experimental results demonstrate that the inclusion of topological features not only enhances model performance but also proves particularly effective in limited data settings. This methodology addresses critical limitations of existing techniques, advancing the classification and diagnosis of blood disorders while improving efficiency and reliability in medical imaging workflows. 1 Introduction Blood diseases and disorders, including various types of blood cancers, represent a significant global health burden, often characterized by complex diagnostic processes and variable clinical presentations. Accurate diagnosis and classification of these entities rely on cytomorphology and histopathology, including detailed microscopic examination of blood smears and bone marrow samples. While this method remains the gold standard for the diagnosis, it is inherently time consuming and highly dependent on the expertise of the pathologists. With an increasing demand for accurate and timely diagnostic methods, the limitations of traditional approaches highlight the urgent need for advanced technologies to streamline and augment pathological workflows. Recent breakthroughs in machine learning (ML) and deep learning (DL) have opened new avenues in medical imaging, providing opportunities to enhance diagnostic capabilities. DL models, with their ability to detect intricate patterns in images, have tremendous potential for automating and improving histopathological analysis. However, these models often face challenges such as the need for extensive labeled datasets, which are particularly difficult to curate in the context of rare diseases due to their low prevalence and limited availability of annotated data. To overcome these issues, we propose the integration of topological data analysis (TDA) with DL frameworks for the analysis of histopathological images in rare blood diseases and blood cancers. TDA offers a unique perspective by capturing the topological structures and global features in images, which complement the localized patterns typically identified by DL models. By combining these global and local insights, the proposed methodology aims to create more robust, interpretable, reproducible and accurate diagnostic models, ultimately enhancing the effectiveness of the diagnosis and classification of blood diseases. Our contributions can be summarized as follows: - We introduce topological machine learning methods to cytomorphology, providing a novel approach to analyzing cellular structures in critical blood disorders. - We show that topological features are highly effective in diagnosis of various blood disorders, including acute lymphoblastic leukemia (ALL), acute myeloid leukemia (AML), malaria, and babesiosis. - We demonstrate that integrating topological feature vectors with DL models significantly enhances the performance, making these models more robust and reliable. - In limited data settings where DL models typically struggle, topological vectors alone achieve strong results and substantially boost the performance of DL models. - We highlight the interpretability of topological features, addressing a crucial need for leveraging black-box ML models in this domain. 2 Related Work 2.1 Machine Learning Methods for Rare Blood Disease Diagnosis Machine learning (ML) and deep learning (DL) have emerged as transformative tools in detecting and diagnosing blood disorders through histopathological and cytomorpho-logical image analysis. These techniques enhance the ability to identify subtle pathological patterns, improve diagnostic accuracy, and assist clinicians in decision-making. Notable works include the DL-based models in [ 27 ] and [ 35 ], which demonstrated expert-level performance in classifying acute myeloid leukemia (AML) and acute lymphoblastic leukemia (ALL) using high-resolution cytomorphological and blood smear images. To address the scarcity of annotated data for rare diseases, transfer learning has been effectively employed. For instance, [ 24 ] integrates pre-trained CNN architectures like ResNet and DenseNet to classify subtypes of myelodysplastic syndromes (MDS) and other rare hematologic conditions, leveraging general medical imaging datasets to improve model performance. Recent studies have also explored multimodal frameworks and feature selection techniques to enhance model interpretability and diagnostic accuracy. Multimodal approaches, such as [ 22 ] and [ 44 ], integrate cytomorphological image data with genomic and clinical information to improve the diagnosis and risk stratification of complex blood disorders. Feature selection techniques, like those used in [ 25 ], refine extracted features from cytological images, optimizing classification performance for various hematological malignancies. Comprehensive reviews, such as [ 39 ] and [ 16 ], highlight the progress and challenges in applying ML and DL methods to rare blood diseases, emphasizing their potential to bridge clinical gaps. These advancements demonstrate the promise of ML and DL in transforming diagnostic workflows and improving patient outcomes in hematological oncology. 2.2 TDA in Medical Image Analysis Cubical persistence has proven to be a powerful tool for extracting topological features in medical image analysis, with applications in tumor segmentation [ 33 ], brain connectivity studies [ 6 , 36 ], glioblastoma outcome prediction [ 10 ], thoracic disease diagnosis [ 2 ] and genomic data exploration [ 34 ]. Resources like the TDA Applications Library [ 14 ] and DONUT [ 15 ], alongside recent surveys [ 38 ], further highlight its adaptability. Advances in scalable computational frameworks [ 4 , 12 ] have enabled the application of persistent homology (PH) to large-scale biomedical datasets. The integration of PH with machine learning has unlocked new possibilities in diagnostic workflows. PH has been incorporated to enhance image segmentation frame-works [ 17 , 37 , 41 ] and to predict treatment responses in breast cancer [ 42 ]. Recent advancements in topological deep learning underscore its ability to augment CNN-based models for segmentation, classification, and precision medicine applications [ 32 , 45 ]. For instance, PH has been utilized for leukemia classification and the diagnosis of rare blood disorders, offering novel insights into cytomorphological imaging [ 40 , 43 ]. 3 Methodology Our methodology involves two key steps. First, we extract topological feature vectors from the images. Next, we leverage these vectors to diagnose blood disorders by effectively integrating them with recent DL models. 3.1 Topological Vectors for Images Persistent Homology (PH) is a foundational tool in topological data analysis (TDA) that captures the underlying shape and structure of complex datasets. By systematically tracking the emergence and disappearance of topological features at different resolutions, PH provides a robust framework for understanding patterns in diverse data types, including point clouds, networks, and images [ 7 ]. Here, we focus on the application of cubical persistence , a variant of PH tailored specifically for analyzing image data. While we aim to present the construction and application of PH in an accessible manner, detailed theoretical discussions can be found in [ 11 , 9 ]. Here, we outline the process of using cubical persistence for feature extraction from images through three main steps: (1) constructing filtrations, (2) generating persistence diagrams, and (3) vectorizing the persistence information for use in ML models. Step 1: Constructing Filtrations Filtration is the process of building a sequence of nested topological spaces that represent the progressive inclusion of features in the dataset. For image analysis, this involves constructing cubical complexes —a specific type of topological structure derived from binary images. From a grayscale or color image 𝒳 (dimensions r × s ), we extract pixel intensities γ ij for each pixel Δ ij ⊂𝒳. A sublevel filtration is constructed by sequentially activating (coloring black) pixels as their grayscale values reach predefined thresholds t 1 < t 2 < ··· < t N = 255. This creates a sequence of binary images 𝒳 1 ⊂ 𝒳 2 ⊂ · · · ⊂ 𝒳 N , where 𝒳 n = { Δ ij ⊂𝒳 | γ ij ≤ t n }. Alternatively, a superlevel filtration can be constructed by deactivating pixels in descending order of intensity. This results in another sequence, 𝒴 1 ⊂ 𝒴 2 ⊂ · · · ⊂ 𝒴 M , where 𝒴 n = { Δ ij ⊂ 𝒳 | γ ij ≥ s n }, with s 1 > s 2 > · · · > s M = 0. Both filtrations provide complementary views of the image’s topology, enabling comprehensive feature extraction. Step 2: Persistence Diagrams PH captures the evolution of topological features (e.g., connected components, loops, cavities) across the filtration. This information is recorded in a persistence diagram (PD) , which maps each feature σ to its birth and death thresholds ( b σ , d σ ). For a sublevel filtration {𝒳 n }, Birth time b σ marks the threshold t m where a feature σ first appears, and Death time d σ indicates the threshold t n where σ disappears. The persistence diagram PD k (𝒳) for dimension k contains all such pairs for k -dimensional features (e.g., PD 0 for connected components, PD 1 for loops). Features with long lifespans ( d σ − b σ ) are typically more significant, as they represent prominent topological structures. For instance, in the filtration example in Figure 2 , PD 0 (𝒳) captures connected components, while PD 1 (𝒳) records loops (holes). Download figure Open in new tab Fig 1: Sample single cell images for ALL, AML, Malaria and Babesia. Download figure Open in new tab Fig 2: For the 5 × 5 image 𝒳 with the given pixel values, the sublevel filtration is the sequence of binary images 𝒳 1 ⊂ 𝒳 2 ⊂ 𝒳 3 ⊂ 𝒳 4 ⊂ 𝒳 5 . Download figure Open in new tab Fig 3: Topo-Vita Model. In our basic topological model, we first extract topological feature vectors for each image through cubical persistence and feed them into standard ML models. Step 3: Vectorization for Machine Learning Persistence diagrams, while informative, are collections of 2D points that are challenging to use directly in ML models. Thus, vectorization techniques are employed to convert persistence diagrams into a fixed-dimensional representation. One widely used approach is the Betti function , which tracks the number of alive features at each threshold. For an image 𝒳, the Betti function β k ( t ) at threshold t is defined as the count of k -dimensional features in 𝒳 n . This function can be represented as a vector . For example, in Figure 2 , we get 5-dimensional vectors, and . Other advanced vectorization methods, such as persistence landscapes [ 5 ], persistence images [ 1 ], and kernel-based approaches [ 3 ], offer alternatives with varying computational and interpretive benefits [ 3 ]. However, we emphasize Betti functions in this study due to their efficiency and interpretability. 3.2 ML Models In this study, we propose a novel approach to diagnosing blood disorders by integrating topological vectors with DL models. For each image in our dataset, we generate a 400-dimensional feature vector that captures its topological characteristics (Section 3.1). These vectors can be utilized independently or in conjunction with the medical images to train ML models. To evaluate the utility of these features, we design two distinct types of models, each exploring different aspects of their integration and performance. Topo-Vita: Evaluating Standalone Topological Features This is our basic model to initially test the effectiveness of our topological vectors for classifying blood disorders. We simply incorporate two standard ML models with our topological vectors. Multilayer Perceptron A flexible feedforward neural network that has demonstrated robustness in handling diverse feature sets. By using MLP, we assess how well topological vectors alone can capture the diagnostic information embedded in the images. TabTransformer A transformer-based model designed specifically for tabular data [ 21 ]. Given that our topological features can also be interpreted as sequential data , the Tab-Transformer provides an effective mechanism to explore their potential in a more structured and context-aware manner. Topo-VitaX: Enhancing DL Models with Topological Features Our advanced model evaluates the impact of integrating topological vectors into pre-trained DL architectures. The rationale behind this approach is that topological features capture global patterns within an image, offering embeddings that remain robust regardless of data quantity. In contrast, convolutional layers emphasize localized patterns, with their performance heavily reliant on the volume of available data. To explore this synergy, we employed two distinct types of DL architectures. Pre-trained CNNs We purposefully choose a simple architecture ( Figure 4 ) to isolate and highlight the contribution of topological features in improving DL models for classifying rare blood disorders. The fusion involves concatenating the topological vectors with the CNN’s feature embeddings before the final classification layers. Download figure Open in new tab Fig 4: Topo-VitaX Model. In our augmented CNN, we integrate our topological features of images obtained in the Topo-Vita model with convolutional vectors generated by a CNN backbone, followed by processing through a fully connected layer. Vision Transformers (ViTs) In our approach, topological vectors are directly fed into the attention mechanism of ViTs as an additional input, enabling the model to leverage these global structural features during the attention computation. This integration allows the topological features to influence the contextual representation learning within the ViT, enhancing its ability to classify complex patterns in blood smear images. We give the details of our backbone DL models and hyperparameters in Section 4.1. 4 Experiments 4.1 Experimental Setup Datasets For our experiments, we aimed to classify single-cell images of blood disorders, including leukemia and infectious diseases of red blood cells, distinguishing them from normal cells. Specifically, we focused on acute lymphoblastic leukemia (ALL), acute myeloid leukemia (AML), Babesiosis, and Malaria (See Figure 1 for image samples). We used the following publicly available datasets. For ALL, we utilized the ALL-IDB2 dataset from the Acute Lymphoblastic Leukemia Image Database (ALL-IDB) [ 23 ]. For AML, we employed the AML-Cytomorphology MLL Helmholtz dataset from the Cancer Imaging Archive [ 19 ], which includes four subclasses: (i) Acute Promyelocytic Leukemia (APL) with PML::RARA fusion, (ii) AML with NPM1 mutation, (iii) AML with CBFB::MYH11 fusion (excluding NPM1 mutation), and (iv) AML with RUNX1::RUNX1T1 fusion, along with a control group. Each class comprises 1000 images. We conducted experiments using two setups: binary classification (normal vs. abnormal) and 5-way multi-class classification. For Malaria, we relied on the Malaria dataset from the National Library of Medicine [ 30 ]. To differentiate between Malaria, Babesiosis, and noninfected cells, we used the Microscopic Images of Parasite Species dataset from the National Institute of Allergy and Infectious Diseases (NIAID) [ 29 ]. For all datasets, we applied a 70:10:20 random split for training, validation, and testing, respectively. Topological Vectors For all of our datasets, we initially generate the topological feature vectors. First, we apply sublevel filtration using 50 evenly spaced thresholds from 0 to 255 for each color channel (red, blue, green, and grayscale). Following this step, we then apply Betti vectorization to the resulting persistence diagrams and concatenate the resulting vectors for each color channel, creating a 400-dimensional feature vector (or Betti vector) for each image in each dataset. Figure 7 illustrates how our topological vectors distinguish different class in blood disorder datasets. ML Model Hyperparameters To test the standalone effectiveness of Betti vectors, we trained an MLP on 300 epochs with early stopping monitoring validation loss and patience set to 5. Our model consists of 3 hidden layers, the first hidden layers containing 256 units with the last 2 layers containing 128 units. We used ReLU activation functions, cross-entropy loss, and the Adam optimizer with learning rate set to 0.0001 initially, then multiplied by 0.1 after the 50th epoch, and multiplied again by 0.1 after the 75th epoch. Our TabTransformer model was also trained on 300 epochs with dimension embedding of 128, 4 heads, and 5 layers. Additionally, we used the same learning rate scheduler and early stopping setup. DL Model Hyperparameters We used a batch size of 128 for all datasets with 128×128 images for the AML, Malaria, and Babesia datasets and 224×224 images for the ALL dataset. Again, we used cross-entropy loss with the Adam optimizer with a learning rate of 0.0001 multiplied by 0.1 after the 50th epoch and multiplied by 0.1 again after the 75th epoch. Key performance metrics are incorporated such as accuracy and AUC. For our auxiliary Betti vector MLP we used 3 hidden layers, the first with 256 units and 128 units for the last 2 layers. Our classifier MLP consists of 2 hidden layers with 256 units each. We selected feature dimensions of 128 for both the CNN and Betti MLP architectures, based on comparative evaluations of different feature size ratios in our earlier experiments. All MLPs and CNNs used ReLU activation functions. Similar to our ML model, we trained our DL models on 300 epochs and used early stopping monitoring validation loss with patience set to 5. Our code is available at 3 . Baseline Models To evaluate the effect topological features have when combined with DL models, we first choose a base model from a range of CNN models. We used ResNet50 [ 18 ], DenseNet121 [ 20 ], and Xception [ 8 ] as baselines. We also include a vision transformer model, Swin Transformer V2-Tiny [ 26 ]. These models were chosen due to their robust feature extraction capabilities and performance on image classification tasks. The details of our architectures are given in Section 3.2. Computational Complexity & Runtime The computation of topological vectors can be resource-intensive, particularly when applied to high-dimensional datasets [ 31 ]. For 2D images, however, the process is relatively efficient, with a time complexity of approximately 𝒪 (|𝒫| 2.3 ), where |𝒫| represents the total number of pixels [ 28 ]. This implies that the computational demand increases nearly quadratically with the size of the image. By comparison, subsequent tasks such as vectorization and machine learning are significantly less computationally demanding. We conducted all experiments on a high-performance computing system containing one Nvidia L40S. The L40S contains 8 vCPU, 62 GB RAM, and 48 GB VRAM. Generating both Betti-0 and Betti-1 vectors from the Malaria dataset, which contains 27,558 images of size 128 × 128, took 2 hours 32 minutes and 55 seconds. The remaining ML tasks were significantly faster. The most complex model T-SwinV2 for Malaria was completed in 17 minutes and 32 seconds. For the remaining datasets, the generation of topological vectors and the runtime of the ML models were significantly faster. Performance Metrics We used widely recognized performance metrics in the biomedical domain, including Accuracy, ROC-AUC, Precision, and Recall. Given the challenges posed by unbalanced datasets, we primarily focus on reporting ROC-AUC results across most settings. However, detailed results for all metrics are given in Table 4 . 4.2 Results This study had two primary objectives. The first was to demonstrate the effectiveness of topological vectors in detecting blood disorders from blood smear images. To achieve this, we developed a basic ML model that combines topological vectors with traditional ML techniques, allowing us to evaluate the standalone performance of topological vectors. The second objective was to assess the impact of integrating topological vectors with state-of-the-art deep learning models to enhance their performance. Standalone Performance of Topological Vectors The first column of Table 2 and the first two columns of Table 4 present the standalone performance of our topological vectors (Topo-Vita model). It is important to note that this simple model does not involve any pre-training and relies solely on topological features. Across all datasets, the performance is notable, with topological vectors outperforming most DL models and achieving competitive, head-to-head results with the rest. View this table: View inline View popup Download powerpoint Table 1: Dataset Statistics View this table: View inline View popup Download powerpoint Table 2: ROC-AUC Results. Each dataset includes a control group as one class. The Topo column highlights the standalone performance of topological vectors, while the T-Net models represent DL models enhanced with these vectors. For each dataset, the highest improvement for T-Net models is marked blue, and the best-performing model is marked bold . The further performance metrics are given in Table 4 . Additionally, the distinguishing capability of topological vectors is visualized in t-SNE projections ( Figure 5 ). In Figure 5a , the topological vectors clearly separate the ALL and normal classes, achieving a perfect AUC score of 1.0. Furthermore, ALL images form a distinct cluster in the lower left corner, while normal classes are more dispersed. This pattern suggests the potential existence of multiple ALL subtypes, which could be explored in future studies. In Figure 5b , the separation between classes is less pronounced compared to the ALL dataset. However, this visualization is a 2D projection of a 400-dimensional latent space, and the very high AUC score highlights the strong discriminative power of topological vectors in the original latent space. Finally, in Figure 5c , we observe clear and well-defined clusters for normal, Babesia, and malaria classes, reflecting the robust performance of topological vectors with an AUC score of 0.99. This visualization underscores their effectiveness in distinguishing these categories. Download figure Open in new tab Fig 5: t-SNE Figures. For each image, we extract a 400-dimensional topological feature vector as described in Section 3. The figures above illustrate 2D t-SNE projections of these topological embeddings for each dataset, where each point represents an image. Download figure Open in new tab Fig 6: ROC-AUC results. The performance of DL models for each dataset is illustrated with solid lines, while topologically enhanced models are represented with dotted lines in matching colors. Detailed performance metrics are provided in Table 4 . Download figure Open in new tab Fig 7: Betti Curves. 40% confidence bands around the median curves are shown for each class. The x-axis represents color values, while the y-axis indicates the count of components (Betti-0) and the count of holes (Betti-1) in the corresponding binary image 𝒳 t . Improvements of DL Models with Topological Features Our second objective was to evaluate the impact of incorporating topological vectors into popular deep learning (DL) models. Notably, all baseline models were pre-trained, meaning any observed improvement is particularly significant. As shown in Tables 2 and 4 , the inclusion of topological vectors consistently enhances the performance of these pre-trained models, achieving up to a 20% improvement in AUC. These findings highlight the substantial potential for performance enhancement by integrating topological features with DL models. Even with a straightforward method of injecting topological features into CNN architectures, the results demonstrate meaningful gains. With domain-specific adaptations, this synergy between topological and DL features presents promising opportunities for further advancements in performance. Limited Data Performance One of the primary challenges deep learning (DL) models face is their reliance on large quantities of annotated data. In many medical domains, particularly in the context of rare diseases, labeled datasets are often scarce. While DL methods typically learn vector representations through backpropagation, topological methods generate direct embeddings by capturing the inherent topological patterns in the data. This approach makes topological embeddings less dependent on labeled data for effective performance. In this study, we assessed the utility of topological vectors in classifying blood disorders. Table 3 presents the results of our experiments. For consistency, we held the 20% test set constant across all datasets and evaluated model performance using training sub-sets containing 10, 20, and 50 samples per class. To provide a fair comparison of learning performance, we utilized untrained, “vanilla” deep learning models rather than pre-trained ones. Our results demonstrate that in limited data settings, topological models consistently outperform DL models across all datasets. Furthermore, integrating topo-logical vectors with DL models significantly enhances their performance, showcasing the complementary strengths of these approaches. View this table: View inline View popup Download powerpoint Table 3: Limited Data Settings. AUC results of our models using 10, 25, and 50 training samples per class, with the original test set unchanged. For each row, the highest improvement for T-Net models is marked blue, and the best-performing model is marked bold . View this table: View inline View popup Download powerpoint Table 4: All Performance Metrics. Below we present all performance metrics for our models. The first two Topo columns show the standalone performance of topological vectors, while T-Net models represent DL models enhanced with these topological vectors. 4.3 Interpretability of Topological Features In this section, we delve into the interpretability of topological features, emphasizing Betti vectors as a simple yet powerful method for vectorizing persistence diagrams, prized for their high interpretability. Figure 7 illustrates the Betti curves for our datasets, augmented with nonparametric confidence bands [ 13 ]. These visualizations reveal significant differences in topological patterns across classes, underscoring the discriminative power of these features. In the [0, 255] color scale, 0 represents black, and 255 corresponds to white. Thus, darker colors are activated earlier, while lighter regions are activated later. For a given threshold t , the binary image 𝒳 t includes all pixels Δ ij with color values γ ij ≤ t , meaning these pixels are activated (colored black). In Figure 7a , we observe a distinct difference between the normal and ALL classes around the color value of 180. This is reflected in the β 1 (180) values, which represent the number of holes in the binary image 𝒳 𝒳 180 for the blue color channel (Section 3.1). For the normal class, 𝒳 180 typically contains approximately 50 holes, whereas the ALL class exhibits around 300 holes. Figure 8 provides sample 𝒳 180 binary images for both classes, visually highlighting the significantly greater number of holes in the ALL image compared to the normal image. Download figure Open in new tab Fig 8: Sample single cell images and their binary 𝒳 180 images for Normal (left two) and ALL (right two) classes. Similarly, Figure 7c high-lights the distinct behavior between the Malaria and Normal classes, particularly in their β 0 (50) values. In the binary images 𝒳 50 , the Malaria class typically exhibits only a few connected components, whereas the Normal class develops 20–30 components. This clear contrast is evident in the sample images shown in Figure 9 , clearly illustrating the difference in connectivity between the two classes. Download figure Open in new tab Fig 9: Sample single cell images and their binary 𝒳 50 images for Normal (left two) and Malaria (right two) classes. 5 Conclusion In this work, we introduced topological machine learning methods to the domain of cytomorphology and demonstrated their potential in diagnosing critical blood diseases and disorders, including ALL, AML, malaria, and babesiosis. By incorporating topological features into deep learning frameworks, we achieved significant improvements in diagnostic accuracy and model robustness, particularly in data-limited settings where traditional deep learning models often falter. These results highlight the complementary nature of topological methods and deep learning, offering a pathway to more interpretable and effective diagnostic tools. Future work will focus on expanding the methodology to other hematological and rare diseases, exploring real-time integration into clinical workflows, and leveraging advancements in self-supervised learning to further mitigate the challenges posed by limited labeled datasets. Data Availability All data produced are available online at https://www.cancerimagingarchive.net/collection/aml-cytomorphology_mll_helmholtz/ https://www.kaggle.com/datasets/sizlingdhairya1/all-idb-images https://www.kaggle.com/datasets/sizlingdhairya1/all-idb-images Acknowledgments This work was partially supported by the National Science Foundation under grants DMS-2202584, 2229417, and DMS-2220613 and by Simons Foundation under grant # 579977. The authors acknowledge the Texas Advanced Computing Center (TACC) at UT Austin for computational resources which contributed to the research results reported within this paper. Footnotes philmore.koung, saba.fatema, coskunuz{at}utdallas.edu { hung.luu{at}utsouthwestern.edu , nagehan.pakasticali{at}utsouthwestern.edu } 3 https://anonymous.4open.science/r/TopoVita-X-555C References 1. ↵ Adams , H. , et al : Persistence images: A stable vector representation of persistent homology . Journal of Machine Learning Research 18 ( 1 ), 218 – 252 ( 2017 ) 5 OpenUrl 2. ↵ Ahmed , F. , Nuwagira , B. , Torlak , F. , Coskunuzer , B. : Topo-cxr: Chest x-ray tb and pneumonia screening with topological machine learning . In: Proceedings of the IEEE/CVF International Conference on Computer Vision . pp. 2326 – 2336 ( 2023 ) 3 3. ↵ Ali , D. , et al : A survey of vectorization methods in topological data analysis . IEEE Transactions on Pattern Analysis and Machine Intelligence ( 2023 ) 5 4. ↵ Bauer , U. , Kerber , M. , Reininghaus , J. : Distributed computation of persistent homology . Computer Graphics Forum 34 ( 5 ), 173 – 182 ( 2015 ) 3 OpenUrl 5. ↵ Bubenik , P. , Dłotko , P. : A persistence landscapes toolbox for topological statistics . Journal of Symbolic Computation 78 , 91 – 114 ( 2017 ) 5 OpenUrl CrossRef 6. ↵ Caputi , L. , Pidnebesna , A. , Hlinka , J. : Promises and pitfalls of topological data analysis for brain connectivity analysis . NeuroImage 238 , 118245 ( 2021 ) 3 OpenUrl PubMed 7. ↵ Chazal , F. , Michel , B. : An introduction to topological data analysis: fundamental and practical aspects for data scientists . Frontiers in Artificial Intelligence 4 ( 2021 ) 4 8. ↵ Chollet , F. : Xception: Deep learning with depthwise separable convolutions . In: CVPR . pp. 1251 – 1258 ( 2017 ) 8 9. ↵ Coskunuzer , B. , Akçora , C.G. : Topological methods in machine learning: A tutorial for practitioners . arXiv preprint arxiv: 2409.02901 ( 2024 ) 4 10. ↵ Crawford , L. , Monod , A. , Chen , A.X. , Mukherjee , S. , Rabadán , R. : Predicting clinical outcomes in glioblastoma: an application of topological and functional data analysis . Journal of the American Statistical Association 115 ( 531 ), 1139 – 1150 ( 2020 ) 3 OpenUrl CrossRef 11. ↵ Dey , T.K. , Wang , Y. : Computational topology for data analysis . Cambridge University Press ( 2022 ) 4 12. ↵ Dłotko , P. , Skraba , P. : Distributed persistence computation for large datasets . In: Topological Methods in Data Analysis and Visualization IV . pp. 49 – 64 ( 2020 ) 3 13. ↵ Gibbons , J.D. , Chakraborti , S. : Nonparametric statistical inference . CRC press ( 2014 ) 10 14. ↵ Giunti , B. : Tda applications library ( 2022 ), available at: https://www.zotero.org/groups/2425412/tda-applications/library 3 15. ↵ Giunti , B. , Lazovskis , J. , Rieck , B. : Donut: Database of original & non-theoretical uses of topology ( 2022 ), available at: https://donut.topology.rocks 3 16. ↵ Gupta , A. , Singh , R. , Sharma , P. : A review of classical and deep learning approaches for hematological malignancy classification . IEEE Access 9 , 139206 – 139224 ( 2021 ) 3 OpenUrl 17. ↵ Gupta , S. , Chen , C. , et al : Learning topological interactions for multi-class medical image segmentation . In: ECCV . pp. 701 – 718 ( 2022 ) 3 18. ↵ He , K. , Zhang , X. , Ren , S. , Sun , J. : Deep residual learning for image recognition . In: CVPR . pp. 770 – 778 ( 2016 ) 8 19. ↵ Hehr , M. , et al : A morphological dataset of white blood cells from patients with four different genetic aml entities and non-malignant controls . (No Title) ( 2023 ) 6 20. ↵ Huang , G. , Liu , Z. , Van Der Maaten , L. , Weinberger , K.Q. : Densely connected convolutional networks . In: CVPR . pp. 4700 – 4708 ( 2017 ) 8 21. ↵ Huang , X. , Khetan , A. , Cvitkovic , M. , Karnin , Z. : Tabtransformer: Tabular data modeling using contextual embeddings ( 2020 ), https://arxiv.org/abs/2012.06678 5 22. ↵ Kainz , P. , Pfeiffer , D. , Urschler , M. : Fully convolutional neural networks for peripheral blood cell image segmentation . Nature Methods 16 ( 5 ), 555 – 558 ( 2019 ) 3 OpenUrl 23. ↵ Labati , R.D. , Piuri , V. , Scotti , F. : All-idb: The acute lymphoblastic leukemia image database for image processing . In: 2011 18th IEEE international conference on image processing . pp. 2045 – 2048 . IEEE ( 2011 ) 6 24. ↵ Li , W. , Zhang , Y. , Chen , F. : Transfer learning for rare blood disease classification using pretrained cnns . Computers in Biology and Medicine 125 , 103988 ( 2020 ) 2 OpenUrl 25. ↵ Lin , L. , Zhang , J. , Wang , X. : Feature selection and classification of rare blood diseases using cytomorphological data . Medical Image Analysis 68 , 101906 ( 2021 ) 3 OpenUrl PubMed 26. ↵ Liu , Z. , et al : Swin transformer: Hierarchical vision transformer using shifted windows . In: ICCV . pp. 10012 – 10022 ( 2021 ) 8 27. ↵ Matek , C. , Schwarz , S. , Marr , C. , Spiekermann , K. : Human-level recognition of blast cells in acute myeloid leukaemia with convolutional neural networks . Nature Machine Intelligence 1 ( 10 ), 538 – 544 ( 2019 ) 2 OpenUrl 28. ↵ Milosavljević , N. , Morozov , D. , Skraba , P. : Zigzag persistent homology in matrix multiplication time . In: SoCG . pp. 216 – 225 ( 2011 ) 8 29. ↵ NIAID: Microscopic images of parasites species . https://data.niaid.nih.gov/resources?id=mendeley_38jtn4nzs6 ( 2020 ) x7 30. ↵ NLM: Malaria project . https://lhncbc.nlm.nih.gov/LHC-research/LHC-projects/image-processing/malaria-project.html ( 2018 ) 7 31. ↵ Otter , N. , Porter , M.A. , Tillmann , U. , Grindrod , P. , Harrington , H.A. : A roadmap for the computation of persistent homology . EPJ Data Science 6 , 1 – 38 ( 2017 ) 8 OpenUrl CrossRef PubMed 32. ↵ Papamarkou , T. , et al : Position paper: Challenges and opportunities in topological deep learning ( 2024 ), arXiv preprint arxiv: 2402.08871 3 33. ↵ Qaiser , T. , et al : Fast and accurate tumor segmentation of histology images using persistent homology and deep convolutional features . Medical Image Analysis 55 , 1 – 14 ( 2019 ) 3 OpenUrl CrossRef PubMed 34. ↵ Rabadán , R. , Blumberg , A.J. : Topological data analysis for genomics and evolution: topology in biology . Cambridge University Press ( 2019 ) 3 35. ↵ Rehman , A.u. , et al : Classification of acute lymphoblastic leukemia using deep learning . Microscopy Research and Technique 83 ( 6 ), 702 – 711 ( 2020 ) 2 OpenUrl 36. ↵ Rieck , B. , et al : Uncovering the topology of time-varying fmri data using cubical persistence . In: NeurIPS . vol. 33 , pp. 6900 – 6912 ( 2020 ) 3 OpenUrl 37. ↵ Santhirasekaram , A. , Winkler , M. , Rockall , A. , Glocker , B. : Topology preserving compositionality for robust medical image segmentation . In: CVPR . pp. 543 – 552 ( 2023 ) 3 38. ↵ Skaf , Y. , Laubenbacher , R. : Topological data analysis in biomedicine: A review . Journal of Biomedical Informatics 104082 ( 2022 ) 3 39. ↵ Smith , A. , Roberts , C. , Lee , D. : Artificial intelligence in rare blood disease pathology: A systematic review . Hematological Oncology 39 ( 3 ), 250 – 262 ( 2023 ) 3 OpenUrl 40. ↵ Smith , A. , Roberts , C. , Lee , D. : Topological insights for leukemia classification in blood smear images . Pattern Recognition 135 , 108932 ( 2023 ) 3 OpenUrl 41. ↵ Stucki , N. , et al : Topologically faithful image segmentation via induced matching of persistence barcodes . In: ICML ( 2023 ) 3 42. ↵ Wang , F. , Kapse , S. , Liu , S. , Prasanna , P. , Chen , C. : Topotxr: a topological biomarker for predicting treatment response in breast cancer . In: International Conference on Information Processing in Medical Imaging . pp. 386 – 397 . Springer ( 2021 ) 3 43. ↵ Yogesh , K. , et al : A topological approach to rare blood disease classification in cytomorphological data . Hematology Review 14 ( 3 ), 120 – 133 ( 2023 ) 3 OpenUrl 44. ↵ Zhang , X. , Liu , W. , Huang , F. : Multimodal data integration for risk stratification in hematological malignancies . Journal of Hematology & Oncology 14 ( 1 ), 90 ( 2021 ) 3 OpenUrl PubMed 45. ↵ Zia , A. , et al : Topological deep learning: A review of an emerging paradigm ( 2023 ), arXiv preprint arxiv: 2302.03836 3 View the discussion thread. Back to top Previous Next Posted January 23, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Diagnosis of Blood Diseases and Disorders with Topological Deep Learning Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Diagnosis of Blood Diseases and Disorders with Topological Deep Learning Philmore F. Koung , Saba Fatema , Nagehan Pakasticali , Hung S. Luu , Baris Coskunuzer medRxiv 2025.01.21.25320908; doi: https://doi.org/10.1101/2025.01.21.25320908 Share This Article: Copy Citation Tools Diagnosis of Blood Diseases and Disorders with Topological Deep Learning Philmore F. Koung , Saba Fatema , Nagehan Pakasticali , Hung S. Luu , Baris Coskunuzer medRxiv 2025.01.21.25320908; doi: https://doi.org/10.1101/2025.01.21.25320908 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Hematology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4423) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15219) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6587) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5438) Public and Global Health (9218) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1195) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ff82f70980a300f',t:'MTc3OTQxNDc4Ng=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00