SHEST: Single-cell-level artificial intelligence from haematoxylin and eosin morphology for cell type prediction and spatial transcriptomics reconstruction

doi:10.1101/2025.11.19.689364

SHEST: Single-cell-level artificial intelligence from haematoxylin and eosin morphology for cell type prediction and spatial transcriptomics reconstruction

2025 · doi:10.1101/2025.11.19.689364

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 68,887 characters · extracted from preprint-html · click to expand

SHEST: Single-cell-level artificial intelligence from haematoxylin and eosin morphology for cell type prediction and spatial transcriptomics reconstruction | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results SHEST: Single-cell-level artificial intelligence from haematoxylin and eosin morphology for cell type prediction and spatial transcriptomics reconstruction View ORCID Profile Hoyeon Jeong , Junghan Oh , Donggeon Lee , Jae Hwan Kang , View ORCID Profile Yoon-La Choi doi: https://doi.org/10.1101/2025.11.19.689364 Hoyeon Jeong 1 Medical Research Institute, Sungkyunkwan University , 81 Irwon-Ro, 06351, Seoul, Republic of Korea 2 Research Institute for Future Medicine, Samsung Medical Center , 81 Irwon-Ro, 06351, Seoul, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Hoyeon Jeong Junghan Oh 3 Department of Health Sciences and Technology, Samsung Advanced Institute for Health Sciences and Technology , 81 Irwon-Ro, 06351, Seoul, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site Donggeon Lee 2 Research Institute for Future Medicine, Samsung Medical Center , 81 Irwon-Ro, 06351, Seoul, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site Jae Hwan Kang 2 Research Institute for Future Medicine, Samsung Medical Center , 81 Irwon-Ro, 06351, Seoul, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site Yoon-La Choi 3 Department of Health Sciences and Technology, Samsung Advanced Institute for Health Sciences and Technology , 81 Irwon-Ro, 06351, Seoul, Republic of Korea 4 Department of Pathology and Translational Genomics, Samsung Medical Center, Sungkyunkwan University School of Medicine , 81 Irwon-Ro, 06351, Seoul, Republic of Korea Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Yoon-La Choi For correspondence: ylachoi{at}skku.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract A comprehensive understanding of cancer progression requires integrating tissue morphol-ogy with spatial molecular profiles. We present SHEST, a multi-task profiling framework that leverages haematoxylin and eosin morphology to predict cellular composition and re-construct spatial gene expression at single-cell resolution. SHEST employs a quadruple-tile input capturing nuclear and contextual information, combined with a neighbourhood-informed clustering algorithm to filter ambiguous cellular signals. It comprises a shared morphological encoder with two task-specific heads: a classifier for cell type prediction and a reconstruc-tor for gene expression. Multi-task optimisation uses cross-entropy and zero-inflated negative binomial losses, specifically addressing the sparsity of spatial transcriptomic data. Evalua-tion on human lung adenocarcinoma datasets demonstrated high accuracy for the principal reciprocal constituents of the tumour–immune axis ( F 1 : 0.97 for tumour cells and 0.91 for lym-phocytes). External validation confirmed its generalisability, revealing alveolar cells and their early neoplastic transitions. Reconstructed gene expression reproduced spatially resolved, cell-type-specific marker patterns— EPCAM in tumour cells, LTBP2 in fibroblasts, and CD3E in lymphocytes—recovering biologically coherent transcriptional architecture. SHEST also pre-served distance-dependent spatial relationships and gene-level autocorrelation, reflecting the multicellular niche structure of the tumour microenvironment. By unifying cell type iden-tification, gene expression reconstruction, and spatial mapping within a single interpretable framework, SHEST provides a synergistic and cost-efficient bridge between histopathology and spatial transcriptomics. This approach facilitates comprehensive tissue characterisation and forms a foundation for precision oncology through spatially informed, cell-level insights into tumour–immune ecosystems. 1 Introduction Achieving a comprehensive understanding of cancer progression demands the simultaneous examination of tissue morphological organisation and the precise spatial distribution of functional molecular signatures [ 1 , 2 ]. Over the past century, cancer diagnosis and interpretation have primarily depended on morpholog-ical examination of tissue sections, most commonly through haematoxylin and eosin (H&E) staining [ 3 ]. This classical histopathological approach, which visualises the structure of cell nuclei and cytoplasm, pro-vides essential information about tissue architecture and disease progression. In recent decades, advances in molecular biology and next-generation sequencing technologies have transformed the understanding of cellular function by revealing that the state of a cell is defined by its gene expression profile [ 4 ]. The quantitative study of complex tissue systems, such as the tumour microenvironment (TME), now depends on the capacity to link cellular morphology with molecular function [ 5 , 6 ]. Establishing this correlation is fundamental for developing precision medicine and gaining deeper insights into disease mechanisms. H&E staining produces morphological contrast by colouring nuclei blue with haematoxylin and cyto-plasm or extracellular components pink with eosin. The development of digital pathology (DP), which converts glass slides into high-resolution whole-slide images (WSIs), has enabled faster and more consistent diagnostic processes. This digital transformation has also facilitated quantitative analysis, improving re-producibility compared to traditional, observer-dependent assessments [ 7 ]. In parallel, molecular profiling technologies have progressed rapidly. The emergence of single-cell RNA sequencing (scRNA-seq) has been followed by the introduction of spatial transcriptomics (ST) technologies [ 8 ]. These methods measure RNA expression while preserving spatial context, allowing researchers to map gene expression patterns, infer cell type distributions, and analyse cell-cell interactions within the tissue architecture. Such analyses are particularly valuable for studying cancer biology and immune regulation within the TME [ 9 ]. DP and ST exhibit complementary strengths and limitations. DP provides high-resolution morpho-logical context that is indispensable for histological interpretation and cancer grading. However, it lacks molecular information and is influenced by inter-observer variability. In contrast, ST yields spatially re-solved molecular profiles that capture the functional state of cells, yet it is limited by relatively coarse visual resolution and high experimental costs. Integrating these two modalities enables a comprehensive approach that combines morphological context with molecular characterisation at the single-cell level [ 10 ]. 1.1 Related work Research linking H&E histology with molecular gene expression data can be broadly categorised into two main directions: morphological classification of individual cells or nuclei, and computational inference of spatial molecular profiles from histological images. Accurate single-cell or nuclei segmentation and classification are essential prerequisites for spatial analy-sis at cellular resolution. One of the representative methods in this area is HoVer-Net [ 11 ], which introduced a dual-task network capable of performing nuclei instance segmentation and classification simultaneously by predicting horizontal and vertical distance maps to separate clustered nuclei. HD-Yolo [ 12 ] adopted a YOLO [ 13 ]-based framework for nuclei localisation and segmentation, reporting that morphological features extracted from the model were correlated with patient survival in breast cancer. CellViT [ 14 ] applied a Vi-sion Transformer (ViT) [ 15 ] architecture to enable parallel processing and local feature extraction through attention mechanisms. While these methods advanced morphological analysis, the overall classification accuracy remains still low, and most models have not undergone comprehensive external validation. A complementary research stream focuses on predicting molecular information directly from H&E im-ages. Most prior studies, such as mclSTExp [ 16 ] and TCGN [ 17 ], have been developed for sequencing-based ST platforms such as Visium, where each spot corresponds to an area of approximately 55 µm that contains multiple cells rather than a single cell. These models employ a regression head trained with the mean squared error (MSE) loss function to infer gene expression levels at the Visium spot level. In addition, two studies, Hist2Gene [ 18 ] and THItoGene [ 19 ], which modelled local features using convolution, global features using ViT, and connectivity features using GNN on breast cancer and squamous cell carcinoma tissues, are also noteworthy. Despite their methodological diversity, all the aforementioned models share a fundamental limitation: they are mostly trained using publicly available Visium data from breast cancer tissue, which represents unpurified or unseparated profiles where multiple cells are mixed within a single spot. That is, the prediction outcome represents an average or aggregate of the multiple cells within the spot, which inherently precludes true single-cell-level analysis. The recent development of imaging-based ST technologies, such as Xenium, enables molecular profiling at single-cell resolution. Xenium supports 5K gene panels and provide aligned H&E images. Utilising this advancement, SPiRiT [ 20 ] introduced a framework employing a ViT backbone to map single-cell H&E image tiles to corresponding cellular gene expression profiles. This model demonstrated the feasibility of inferring cell-level molecular signatures from cellular morphology. However, SPiRiT still uses the MSE loss function for gene expression regression, which is not optimal for modelling zero-inflation characteristic of RNA sequencing data. Furthermore, the limited external validation restricts confidence in the model’s generalisability across diverse tissue types and cohorts. Despite these pioneering efforts, a methodological discontinuity remains in simultaneously and coher-ently addressing the multifaceted nature of cellular state, encompassing both morphology and molecular function. The development of a unified framework that jointly performs cell-level morphological encoding, cell type classification, and gene expression reconstruction directly from H&E images is therefore presented as the necessary next step to advance single-cell spatial inference. 1.2 Proposed framework: SHEST SHEST (Single-cell-level artificial intelligence from H&E morphology for ST reconstruction and cell type prediction) is a multi-task profiling framework that integrates morphological and molecular information at single-cell resolution ( Figure 1 ). The framework aligns morphological features extracted from H&E images with molecular features obtained from ST on shared spatial coordinates, enabling a direct correspondence between each cell’s morphology and its gene expression profile ( Figure 1A ). Through this, SHEST simul-taneously performs two learning tasks. The first is the classification of cell types based on morphological context. The second is the reconstruction of single-cell gene expression from image-derived embeddings ( Figure 1B and C ). This joint optimisation enables the model to share representations across tasks, ensuring consistency between morphological classification and molecular prediction. Overall, SHEST offers an integrated computational approach for single-cell molecular profiling from H&E images, establishing a direct and interpretable connection between tissue morphology and gene expression ( Figure 1E ). Download figure Open in new tab Figure 1: Overview of the SHEST framework and its workflow. A Training and test data: H&E images are processed through cell segmentation and quadruple tiling to generate per-cell image tiles. Single-cell–resolution spatial transcriptomics (ST) data are annotated with cell types using reference single-cell RNA sequencing (scRNA-seq) profiles. These multimodal datasets are spatially aligned to construct paired image–transcriptome–label sets for supervised training. B Model: A shared encoder extracts morphological embeddings from the input image tiles. Two task-specific heads operate on these embeddings: a classifier that predicts cell types and a reconstructor that generates reconstructed spatial gene expression profiles for each cell. C Training: Multi-task learn-ing is supervised by two loss functions — cross-entropy (CE) loss for classification and zero-inflated negative binomial (ZINB) loss for gene expression reconstruction. Dashed arrows in the diagram indicate training-time data flow and supervision paths connecting the ground-truth labels and mea-sured expression to their respective losses. D External validation data: Independent H&E images, single-cell–resolution ST, and scRNA-seq annotation datasets are used to assess generalisation per-formance. E Inference: The trained model generates cell-type predictions and reconstructs spatial gene expression maps for each localised cell. Solid arrows indicate the forward data flow through the encoder and both task heads. Model performance is quantitatively evaluated using F 1 scores for classification accuracy and Pearson’s R and Moran’s I for assessing expression-level agreement and spatial autocorrelation. 2 Materials and methods The SHEST framework was implemented in Python 3.10 using PyTorch 2.6.0+cu124 [ 21 ] for modelling, and SpatialData 0.3.0 [ 22 ] and Scanpy 1.10.4 [ 23 ] for processing Xenium data. 2.1 Data acquisition The study utilises ST data coupled with matched H&E WSIs from lung adenocarcinoma (LUAD) samples, a highly representative and clinically significant tumour type. Detailed information regarding the datasets is summarised in Table 1 . View this table: View inline View popup Download powerpoint Table 1: Overview of datasets used for SHEST model development and validation. The foundation for our ground truth cell type labels is the comprehensive CZ CELLxGENE Discover - Human Lung Cancer Atlas (LuCA) [ 24 ]. The primary dataset used for training and internal testing of the SHEST model is the 10X Genomics Xenium Prime Human Lung Cancer dataset ( Figure 1A ). For external validation and assessment of the model’s generalisability, we employed two of the earlier 10X Genomics Xenium V1 Human Lung Cancer datasets, which are characterised by a lower gene count panel compared to the Xenium Prime panel ( Figure 1D ). All H&E slides were scanned at a magnification of ×40 objective. 2.2 Cell type annotation Cell identities were initially annotated on the Xenium Prime data using the TACCO [ 25 ] package against the LuCA single-cell references ( Figure 1A ). Following this initial computational step, the resulting anno-tations were subjected to thorough pathologist (Y.C.) review for refinement and validation. This review utilised the rich morphological information provided by the H&E image, integrating not only the distinct cellular morphology but also the spatial and locational context. Specifically, the pathologist assessed crit-ical biological patterns, such as tumour cells lining the boundary, the floating status of macrophages, the spatial relationship between endothelial cells and neighbouring erythrocytes (RBCs), and the formation of tertiary lymphoid structures (TLS) by lymphocytes. These validated and detailed subtypes were then systematically grouped into higher-level cell types, as summarised in Supplementary Table S1, to facilitate robust classification and downstream analysis within the SHEST framework. 2.3 Image preparation for encoding The preparation of H&E image data for morphological encoding involves sequential steps of spatial align-ment, cell filtering, patch extraction, and colour standardisation. First, precise spatial co-registration between the H&E image coordinates and the ST coordinate system is established ( Figure 1A ). This align-ment is achieved by scaling the raw image coordinates according to the known pixel size and subsequently applying an inverse affine transformation. Following alignment, individual cells, identified via segmentation, are filtered based on their bounding box size. Only cells with dimensions within the range of 2 µm × 2 µm to 16 µm × 16 µm are retained ( Figure 2A ); excessively small or large cells are excluded to prevent misclassification stemming from insufficient or distorted morphological information. Next, we constructed the local morphological context for each retained cell using four concatenated tiles: the whole cell, the nucleus-masked cell, the haematoxylin extraction, and the nucleus-masked haematoxylin extraction (Fig-ure 2B-G). This process generates a set of image patches designed to encompass both the local cellular environment and focused nuclear features. This strategy ensures the encoder receives comprehensive mor-phological data, prioritising nuclear-specific details while reducing extraneous peripheral context. Finally, the extracted H&E image windows are standardised to fulfil the input requirements of the H-optimus-0 [ 26 ] encoder. All patches are resized to a fixed resolution of 224 × 224 pixels, after which the pixel values are normalised by dividing the image tensor by 255. A colour normalisation process is then applied for RGB standardisation, using the pre-calculated mean (0.707223, 0.578729, 0.703617) and standard deviation (0.211883, 0.230117, 0.177517) specific to the H-optimus-0 model. Download figure Open in new tab Figure 2: Morphological and molecular characteristics of cell types before and after annotation re-finement for the 10X Xenium Prime Human Lung Cancer sample. Panel A shows the distribution of cell size parameters (width and height in µm) as violin plots for the six cell types. Panels B-G show morphological examples of six cell types, with each panel displaying a 224 × 224 pixel im-age patch corresponding to a single cell: Alveolar_cell ( B ), Tumor_cell_LUAD ( C ), Macrophage ( D ), Endothelial_cell ( E ), Fibroblast ( F ), and Lymphocyte ( G ). Quadruple tiles with nucleus extraction and haematoxylin extraction are used for each patch. Panels H - J present the raw data 25 before annotation refinement: H UMAP visualisation based on raw gene expression profiles, I UMAP visualisation based on H-optimus-0 morphological embeddings, and J cell count distribu-tion across the six cell types. Panels K - L show the data after annotation refinement: K UMAP visualisation of the curated population based on gene expression, L UMAP visualisation based on morphological features, and M cell count distribution of the high-purity, curated dataset. 2.4 Annotation refinement for high-fidelity training To ensure the SHEST model is trained on data with high confidence and distinct molecular-morphological correspondence, we implemented an annotation refinement step prior to model training (Algorithm 1). This procedure maximises the clarity of learnt features by filtering out cells situated in low-density regions or near cluster boundaries within the latent space, where cell signatures are often ambiguous or mixed. The refinement process operates on low-dimensional representations derived from both modalities. First, the gene expression profile and the corresponding morphological features of all cells are projected into separate low-dimensional UMAP [ 27 ] spaces, creating distinct embeddings that represent molecular and morphological cell clustering, respectively. Starting with the cell type class that has the smallest cell count (to prevent disproportionate pruning of minority populations), we iteratively curate the dataset using the DBSCAN [ 28 ] clustering algorithm. Only the single dominant DBSCAN cluster for the target cell type is retained. Concurrently, any cells of other types that neighbour this dominant cluster within the defined k -nearest neighbours are effectively marked for removal, preventing the inclusion of cells at ambiguous class boundaries. This systematic curation, performed on both the expression and morphological feature spaces, efficiently removes cells that exhibit ambiguous or contradictory signatures, ultimately retaining only those cells that are common to both modalities. Morphological analysis of the six cell types reveals significant size variations that serve as distinguishing features ( Figure 2G ). The Tumor_cell_LUAD population exhibits the largest relative size among all cell types, characterised by nuclear pleomorphism [ 29 ]. Conversely, the Lymphocytes are characterised by consistently displaying the smallest physical dimensions and large cell numbers. Beyond size, visualisation of the data in the low-dimensional feature space before refinement ( Figure 2H and 2I ) shows considerable overlap, indicating ambiguity in raw cell classification based on either expression or morphology. Applying the annotation refinement algorithm resulted in a reduction in the total cell count by effectively removing low-confidence and boundary cells ( Figure 2K and 2L ). The UMAP visualisations of the refined dataset demonstrate markedly enhanced cluster purity and more distinct cluster boundaries in both the expression and morphological feature spaces. Algorithm 1 Representative cell selection with neighbour-aware clustering View this table: View inline View popup Download powerpoint 2.5 Data augmentation and split To mitigate the effects of class imbalance inherent in biological samples, a data augmentation strategy is implemented during the dataset-loading phase. This procedure effectively mimics random oversampling by applying geometric transformations to the H&E image patches. Specifically, the augmentation includes random rotations (one of 0 ◦ , 90 ◦ , 180 ◦ , 270 ◦ ) and horizontal or vertical flipping. By generating multiple transformed copies of minority class samples, this strategy balances the number of samples across all cell classes, thereby promoting robust and unbiased learning by the subsequent model layers. The final dataset was divided into training and testing splits (80:20) using a fixed generator. 2.6 Architecture of the SHEST Model The SHEST model is implemented as a unified multi-task learning framework that jointly reconstructs single-cell gene expression and classifies cell types from H&E-stained image patches. The architecture is composed of three integrated modules—an encoder, a classifier, and a reconstructor—that share a common latent embedding to couple morphological and molecular representations ( Figure 1B and 3 ). This multi-head design thereby moves beyond a mere juxtaposition of tasks, enabling complementary learning dynamics that enhance interpretability by aligning morphological cues with their underlying molecular correlates. Download figure Open in new tab Figure 3: Overview of the SHEST model architecture. The model jointly classifies cell types and reconstructs single-cell gene expression from H&E-stained image patches. An input patch is encoded by a pretrained H-optimus-0 encoder into a shared embedding z . The Classifier branch predicts the refined cell identity through multiple fully connected (FC) layers with Layer Normalisation optimised by the cross-entropy loss L CE . The Reconstructor branch predicts the parameters µ, θ, and π of the zero-inflated negative binomial (ZINB) distribution through separate FC heads and reconstructs the expected gene expression E[y | z ], optimised by the ZINB loss L ZINB . The encoder is based on the H-optimus-0 ViT backbone, which is pre-trained on a large-scale biological image dataset. During SHEST training, all encoder parameters are frozen to preserve the morphological representations learnt during pre-training. The encoder processes an input H&E image of size 224 × 224 pixels and outputs a 1536-dimensional embedding vector z ∈ P 1536 that serves as the common latent representation for all downstream tasks. The classifier receives the latent embedding z and predicts the refined cell identity. It is structured as a multi-layer perceptron (MLP) with Layer Normalisation (LN) [ 30 ], ReLU activation, and Dropout (0.3). The final layer outputs logits o for C high-level cell type categories: where W l and b l are the learnable parameters for the l -th layer (where l ∈ {1, 2, 3}), and o i is the logit corresponding to class c i . The class probability for category c i is obtained from the logits using the Softmax function: The reconstructor maps the latent embedding z to the predicted single-cell gene expression levels. It employs an MLP structure, generating an intermediate representation h 2 : This representation h 2 is then projected into three separate linear heads to estimate the parameters of the zero-inflated negative binomial (ZINB) distribution [ 31 ]—mean ( µ ), dispersion ( θ ), and zero-inflation probability ( π ): The ZINB distribution is used to model the count nature and high sparsity of single-cell expression data. The probability mass function for an observed gene count y = k given the latent embedding z is P ( y = k | z ) and is defined by its predicted parameters: where Ι( k = 0) is the indicator function, and NB( k | µ, θ ) is the negative binomial distribution. The reconstructed expression y ^ is derived from the expected value E[ y | z ] = (1 − π ) µ . 2.7 Loss functions and optimisation The SHEST model is trained by jointly optimising the classification and reconstruction objectives via a multi-task loss function ( Figure 1C ). The classification task minimises the cross-entropy (CE) loss L CE , which is equivalent to the negative log-likelihood (NLL) of the true class distribution for N cells: where y ic is the ground-truth indicator for class c of cell i , and P ( c | z i ) is the class probability predicted by the classifier based on the embedding z i . The reconstruction task minimises the NLL of the ZINB distribution L ZINB for the observed gene count vector y i : where P ( y i | z i ) is the likelihood of observing the ground-truth gene counts y i under the ZINB parameters predicted from z i . The final multi-task loss L is the unweighted sum of the two objectives: This joint optimisation ensures consistency between the predicted cell type and reconstructed gene expres-sion. The model is optimised using the AdamW [ 32 ] optimiser with a learning rate of 0.01, trained for 40 epochs with a batch size of 1024, and utilises mixed precision training and early stopping. 2.8 Deterministic learning for reproducibility To ensure the reproducibility and reliability of the SHEST framework, strict determinism was enforced across both configuration and model initialisation stages. A fixed global seed of 42 was set for NumPy and PyTorch to control stochastic operations. During data splitting, PyTorch used a fixed generator initialised with this seed. Computational determinism was further guaranteed by configuring the cuDNN [ 33 ] back-end to use deterministic algorithms and by disabling benchmarking, ensuring that identical inputs yield consistent outputs regardless of parallelisation. Consistent parameter initialisation is critical for reproducible training trajectories. All FC layers in the multi-task prediction heads, as well as any custom convolutional layers, were initialised using the Kaiming He method [ 34 ] to maintain stable gradient variance. This standardised initialisation ensures that the model starts from the same parameter state in every training run, providing reliable and repeatable results. 3 Results 3.1 SHEST enables simultaneous cell type prediction and transcriptomic profiling from H&E patches with high accuracy The proposed dual-task model was evaluated on the test dataset derived from the 10X Xenium Prime Human Lung Cancer case to assess its capability to classify cell types and reconstruct gene expression profiles from histological image patches. SHEST achieved high classification performance across all six major lung cancer–associated cell types ( Figures 4A and 4B ). The confusion matrices demonstrate that the model distinguished morphologically ambiguous but functionally distinct populations. SHEST achieved particularly high accuracy for tumour cells ( F 1 = 0.97) and lymphocytes ( F 1 = 0.91), the two pivotal cellular populations that define the tu-mour–immune landscape. In this context, the weighted F 1 score is 0.87 ( Figure 4C ). Download figure Open in new tab Figure 4: Performance evaluation of the SHEST model on the test dataset derived from the 10X Xenium Prime Human Lung Cancer cohort. A Confusion matrix of predicted versus actual cell types, showing the number of correctly and incorrectly classified cells across six cell types. B Normalised confusion matrix displaying classification accuracy per cell type. C Weighted and macro F 1 -scores of the classifier. D Violin plot of the Pearson R distribution of the actual and reconstructed expression of each gene across all cells. E Dot plot of marker gene expression in actual data. Dot size represents the fraction of cells expressing each gene, and colour intensity indicates mean expression levels within each cell type. F Corresponding dot plot from reconstructed gene expression. To evaluate the fidelity of the multi-task model in predicting transcriptomic profiles, we first examined the correlation distribution between the actual and reconstructed expression of all genes, which showed that most values exhibited a positive correlation ( Figure 4D ). This confirms that morphological features encoded by SHEST convey sufficient information to infer functional molecular states. Subsequently, the top 10 marker genes for each cell type were identified from the ranked gene groups using the actual gene expression data ( Figure 4E ). The reconstructed expression profile ( Figure 4F ) was then compared with the actual profile for these selected markers. The results demonstrated high reconstruction fidelity, particularly in reproducing the expression patterns, where the distinct marker gene signatures of each cell type were preserved. The consistency of these patterns indicates that the model effectively captures the morphological determinants of cell identity embedded in histological features, enabling cross-modality translation from image to transcriptome space. 3.2 The SHEST model demonstrates external validity in both cell type identification and spatial organisation To evaluate the generalisability of SHEST, the trained model was applied to two independent Xenium V1 Human Lung Cancer datasets, which differ in gene panel composition and tissue characteristics, in order to assess the transferability of its learnt morphological-molecular representations across distinct experimental and biological environments ( Figure 5 and Supplementary Figure S1). Cells located mainly at the tissue periphery, where the H&E image boundaries did not align with the nuclear boundaries in the ST data, were excluded from inference. These correspond to cells with a mean grey value exceeding 230 in their grayscale representation. Download figure Open in new tab Figure 5: Cell type prediction by the SHEST model on the validation dataset derived from a 10X Xenium V1 Human Lung Cancer Addon panel case. A UMAP embedding from the image encoder. B - D Confusion matrices of predicted cell types, comparing the reference annotation to the predic-tions from SHEST ( B ), HD-Yolo ( C ), and HoVer-Net ( D ) models. E Cell type count comparison between annotations and the models. From HD-Yolo and HoVer-Net, cell types were mapped to the category most similar to the reference. F W28eighted and macro F 1 scores of the classification results. G - H Comparison of spatial cell-cell distances between annotated and predicted cell types. Violin plots showing the distribution of distances from each cell type to its nearest neighbour. The numbers below represent the median. G Distances derived from annotated cell types. H Distances derived from model-predicted cell types. The UMAP embedding generated by the image encoder ( Figure 5A ) demonstrates the model’s ability to cluster cells based on image features while preserving distinct cellular identities. This indicates that the morphological features learnt from the Xenium Prime samples are sufficiently generic to capture cross-sample variability while preserving lineage-specific characteristics. When comparing the annotated cell types with predictions from SHEST, HD-Yolo, and HoVer-Net ( Figures 5B–D ), SHEST shows superior concordance with the reference annotations ( Figure 5B ). This observation is further supported by the mapped cell type counts ( Figure 5E ), indicating that SHEST provides more comprehensive and detailed predictions. In the cross-comparison of F 1 scores ( Figure 5F ), SHEST achieved the highest performance in both weighted and macro F 1 metrics. In addition to cell-level accuracy, we examined whether SHEST can preserve spatial organisation at the tissue scale. To assess the fidelity of the spatial organisation inferred by the model, cell–cell distances were compared between the annotated and predicted cell types ( Figures 5G and H ). The areas of distribution are comparable, and notably, the median distances derived from the annotated data ( Figure 5G ) and predicted data ( Figure 5H ) show consistent agreement across all cell type combinations. Particularly, the close clustering of lymphocytes—a hallmark of tertiary lymphoid structure (TLS) formation within the TME [ 35 ]—was faithfully recapitulated. This alignment between predicted and actual cell–cell proximities implies that the model’s predictions are not indiscriminate classifications but are spatially coherent in a manner consistent with biological tissue organisation. We examined the morphological characteristics of cases misclassified as lymphocytes by SHEST (Sup-plementary Figure S2). The analysis revealed that Macrophages, which are originally large and pale, were misclassified when they appeared smaller and darker; Endothelial_cells, typically located near red blood cells (RBCs), were misclassified when they were positioned away from RBCs and exhibited smaller nuclei; and Fibroblasts, which are normally elongated, were misclassified when they appeared round and small. These observations substantiate that the encoder’s latent space encodes biologically stable morphological cues rather than cohort-specific visual artefacts. Another noteworthy observation arises when examining the predicted cell maps with tissue histology. In the normal alveolar regions, the model correctly classifies the cells as Alveolar_cell ( Figure 6B ), whereas within lepidic-pattern areas, indicative of early adenocarcinomatous transformation, it identifies alveolar cells as Tumor_cell_LUAD based on morphological features ( Figure 6E ). This spatial concordance demon-strates that the model captures histopathologically meaningful transitions, effectively linking structural changes in the tissue to shifts in cellular identity. Download figure Open in new tab Figure 6: Comparison of annotated and predicted cell types in Xenium V1 Human Lung Cancer datasets with corresponding histology. A - C Results from the sample with Addon panel: ( A ) annotated cell types by the LuCA single-cell reference, ( B ) predicted cell types by the SHEST model, and ( C ) H&E staining image providing morphological context. D - F Results from the sample with Multitissue panel: ( D ) annotated cell types by the LuCA single-cell reference, ( E ) predicted cell types by the SHEST model, and ( F ) H&E staining image providing morphological context. Collectively, these results confirm that SHEST exhibits strong external validity in both cell-type recog-nition and spatial context preservation. The model retains interpretability and spatial coherence even across different material sources and gene panels, proving that the learnt morphology–molecular correspondence reflects intrinsic cellular biology. 3.3 The SHEST model reconstructs not only cell-level gene expression but also global expression patterns from H&E images The capability of the SHEST model to accurately reconstruct cell-level transcriptomic profiles from histo-logical images is validated through an external validity assessment by comparing the actual and predicted gene expression data ( Figure 7 and Supplementary Figure S3). Download figure Open in new tab Figure 7: Spatial expression reconstruction by the SHEST model on the validation dataset derived from the 10X Xenium V1 Human Lung Cancer Addon panel case. A Top 20 Pearson correlation coefficients for genes comparing actual and reconstructed expression across all cells. B Distribu-tion of actual expression for five key marker genes per cell type. C Distribution of reconstructed expression for the same marker genes shown in B , demonstrating the model’s fidelity in capturing cell-type-specific signatures. D Spatial distributi 3 o 0 n map of Tumor_cell_LUAD cells (red) across the tissue section. E and F Spatial expression map of the tumour marker gene EPCAM : actual ( E ) and reconstructed ( F ). G Spatial distribution map of Fibroblasts (green). H and I Spatial expression map of the fibroblast marker gene LTBP2 : actual ( H ) and reconstructed ( I ). J Spatial distribu-tion map of Lymphocytes (blue). K and L Spatial expression map of the lymphocyte marker gene CD3E : actual ( K ) and reconstructed ( L ). We ranked genes by Pearson correlation between observed and reconstructed single-cell expression and found canonical marker genes among the top-correlating set ( Figure 7A ). In particular, the tumour epithelial marker EPCAM , the fibroblast-associated extracellular matrix gene LTBP2 , and the lymphocyte marker CD3E showed high reconstruction fidelity ( Figure 7B and C ). In further detail, a comparison of the actual expression ( Figure 7B ) and reconstructed expression ( Figure 7C ) for the top five marker genes per cell type confirms that the prominent expression of these marker genes is maintained in both profiles. EPCAM reflects the progression and metastasis of cancer in epithelial-derived malignancies [ 36 ]. LTBP2 , secreted by activated fibroblasts and cancer-associated fibroblasts, embodies extracellular matrix remodelling and stromal activation processes essential to tumour progression [ 37 ]. CD3E defines T-cell identity and immune activation within TLS [ 38 ]. These results indicate that the image-derived embeddings are sufficient to recover both lineage identity and quantitative marker expression at the single-cell level, capturing the functional convergence between cell morphology and molecular phenotype. Another noteworthy point is that SHEST was trained using the more comprehensive Xenium Prime dataset encompassing approximately 5K genes, thereby enabling expression profiling of genes such as LTBP2 that are not included in the Xenium V1 validation panel. This expanded molecular scope demonstrates the model’s ability to generalise beyond the directly measured transcriptomic range, allowing the inference of biologically relevant genes absent from the validation dataset and supporting a more integrative and mechanistically interpretable reconstruction of the TME. Beyond single-cell reconstruction, SHEST reproduced global gene-expression landscapes that closely matched the actual data. Spatial maps of reconstructed marker expression aligned well with the actual distributions across tissue sections ( Figures 7D–L ). In detail, both the actual and reconstructed profiles consistently showed that EPCAM expression was concentrated within tumour nests, LTBP2 delineated fibroblast-rich stromal regions, and CD3E was specifically expressed within peritumoral TLS. The high accuracy of this spatial expression reconstruction validates the model’s utility as a tool for generating synthetic ST data directly from histology. Notably, alveolar cells exhibiting a lepidic pattern, although not annotated as tumour cells, also demonstrated that both the actual measurements (Figure S3E) and the reconstructed spatial profiles (Figure S3F) consistently exhibit high EPCAM expression. These observations further suggest that the reconstruction framework not only reliably predicts gene expression at single-cell resolution but also preserves subtle microanatomical patterns, enabling the detection of tumour cells that might otherwise be misclassified or overlooked. Preservation of the inherent spatial structure of gene expression was further assessed using Moran’s I spatial autocorrelation ( Figure 8 and Supplementary Figure S4). The spatial correlation of key marker genes essential for defining the TME is notable ( Figure 8B and Supplementary Figure S4B). This preservation suggests that gene expression reconstruction from histomorphological images by SHEST achieves a high spatial correlation with actual data, indicating that the model captures not only the molecular identity of individual cells but also the multicellular niche architecture based on spatially reconstructed transcriptomics within the tissue. Download figure Open in new tab Figure 8: Scatter plot of actual versus reconstructed Moran’s I spatial correlation for individual genes, based on expression values reconstructed by the SHEST model from the Xenium V1 Human Lung Cancer Addon panel dataset. A A scatter plot showing the actual versus reconstructed Moran’s I values across the full range. B A zoomed-in view of the dense region, with only genes within a 20% relative error margin of the line of identity labelled. Marker genes are colour-coded according to their associated cell type. 4 Discussion The SHEST framework establishes a direct computational bridge between H&E histology and single-cell ST by simultaneously encoding cellular morphology, classifying cell types, and reconstructing transcriptomic profiles at single-cell resolution. This unified modelling approach demonstrates that spatially coherent molecular surrogates can be derived directly from conventional histological slides, connecting tissue archi-tecture and molecular function. Methodologically, SHEST introduces three principal breakthroughs. (1) The quadruple-tile input struc-ture integrates four complementary image views—whole-cell, nucleus, haematoxylin extraction, and nucleus-specific hematoxylin—thereby encompassing nuclear detail, cytoplasmic morphology, and contextual ex-tracellular matrix information critical for accurate cellular discrimination. (2) A neighbour-aware, cluster-ing–based refinement algorithm selectively removes ambiguous cells from the latent feature space, increasing data purity and ensuring that model training relies on high-confidence examples. This approach reduces multimodal noise and stabilises the convergence of joint optimisation. (3) A ZINB loss function is employed to match the statistical properties of single-cell expression data, effectively modelling overdispersion and sparsity that are inherent to count-based transcriptomic measurements. Together, these elements enable the framework to maintain biological realism in its morphology-to-expression translation while enhancing performance in both classification and reconstruction tasks. Building upon the research findings, the SHEST framework offers several insights and implications. (1) The framework demonstrates stable inference performance across diverse staining conditions and gene pan-els, preserving both accuracy and interpretability. Having been trained on diverse staining conditions and extensive gene panels, the SHEST model complements the inherent functional information of a given tissue morphology while maintaining inference consistency. This can serve as a foundational approach for inte-grative translational medicine, bridging histopathological interpretation with molecular-level diagnosis. (2) The model’s capacity to extrapolate molecular states from morphology implies that it captures higher-order relationships within the morphology–molecular transformation space. This capacity suggests that SHEST can infer unmeasured or functionally latent features, potentially revealing previously uncharacterised cellu-lar functions or molecular signatures [ 9 ]. (3) The reconstructed spatial organisation reflects not only cellular identities but also their proximities and densities, elucidating the topological dynamics of the TME. Such spatial coherence provides a valuable substrate for downstream analyses, including cell–cell interaction inference, immune infiltration mapping, and microenvironmental remodelling studies [ 39 ]. (4) Trained on multimodal data where morphological and transcriptional identities are jointly validated, SHEST recognises cell types consistent with both morphological integrity and molecular fidelity. This dual validation enables the discrimination of early neoplastic transitions from benign epithelial states and the characterisation of progressive malignant phenotypes, providing a biologically grounded framework for understanding cellular differentiation trajectories and the morphological manifestations of tumour evolution [ 40 , 41 ]. Despite its innovative outcomes, several limitations remain in this study. Unlike cell type prediction, gene expression reconstruction could not be benchmarked, as previous studies were trained on Visium patches from breast cancer, whereas our work was trained on Xenium WSI images of LUAD. Our study focuses primarily on LUAD, a tumour type selected for the accessibility of case acquisition and cohort construction. Extending the framework to encompass diverse tumour types and tissue microenvironments represents an important direction for future work. In addition, although the training dataset was derived from the state-of-the-art Xenium 5K platform with single-cell resolution, a full-transcriptome gene panel has not yet been made publicly available [ 42 ]. As ST technologies advance toward whole-transcriptome coverage, the proposed model can be further extended to achieve more comprehensive and biologically enriched molecular representations of the tissue microenvironment. SHEST demonstrates potential beyond mere technological advancement, contributing to personalised patient care and transformative clinical decision-making. In practice, SHEST enables identification of cel-lular composition within the TME directly from accessible H&E slides, offering the potential to predict immune responses and therapeutic resistance, particularly in the context of immuno-oncology. By com-putationally substituting high-cost ST with morphological inference, the framework achieves comparable informational value at an estimated cost reduction of approximately 99.8%. Such efficiency could sub-stantially accelerate the adoption of spatially informed precision medicine. Ultimately, SHEST represents a step toward computationally enabled translational medicine, where histological morphology is not only descriptive but also quantitatively connected to molecular function. This framework holds potential to enhance patient outcomes and contribute to the long-term goal of overcoming refractory cancers through integrated and personalised therapeutic approaches. 5 Abbreviations CE Cross-Entropy DP Digital Pathology FC Fully Connected H&E Haematoxylin and Eosin LN Layer Normalisation LUAD Lung Adenocarcinoma LuCA Lung Cancer Atlas MLP Multi-Layer Perceptron MSE Mean Squared Error NLL Negative Log-Likelihood RBC Red Blood Cell scRNA-seq Single-cell RNA Sequencing SHEST Single-cell-level artificial intelligence from Haematoxylin and Eosin morphology for cell type prediction and Spatial Transcriptomics reconstruction ST Spatial Transcriptomics TME Tumour Microenvironment TLS Tertiary Lymphoid Structures ViT Vision Transformer WSI Whole-Slide Image ZINB Zero-Inflated Negative Binomial 6 Data availability Single-cell reference data were obtained from the CELLxGENE Discover datasets: https://cellxgene.cziscience.com/datasets . Public data on lung cancer samples from Xenium platforms were obtained from the 10X Genomics datasets page: https://www.10xgenomics.com/datasets/ . 7 Code availability The code for the SHEST model is maintained in the following GitHub repository: https://github.com/ recognizability/shest. 8 Declarations 8.1 Funding This study was supported by a grant from the National R&D Program for Cancer Control, Ministry of Health & Welfare, Republic of Korea (Grant Number RS-2023-CC138390). This study was supported by Samsung Medical Center (Grant Number #SMO125034). 8.2 Ethical approval This study was approved by the Institutional Review Board (IRB) of Samsung Medical Center (IRB No. 2022-11-132). 8.3 Competing interests The authors have no competing interests to declare. 8.4 Authors contributions Conceptualisation: H.J.; data curation: H.J., J.O., D.L., J.H.K.; formal analysis: H.J.; funding acquisition: H.J. and Y.C.; investigation: H.J.; methodology: H.J.; project administration: H.J.; resources: H.J. and Y.C.; software: H.J.; supervision: H.J. and Y.C.; validation: H.J., J.O., and Y.C.; visualisation: H.J.; writing — original draft: H.J.; writing — review and editing: all authors. Download figure Open in new tab Supplementary Figure S1: Cell type prediction by the SHEST model on the validation dataset derived from a 10X Xenium V1 Human Lung Cancer Multitissue panel case. A UMAP embedding from the image encoder. B - D Confusion matrices of predicted cell types, comparing the reference annotation to the predictions from SHEST ( B ), HD-Yolo ( C ), and HoVer-Net ( D ) models. E Cell type count comparison between annotations and the models. From HD-Yolo and HoVer-Net, cell types were mapped to the category most similar3t2o the reference. F Weighted and macro F 1 scores of the classification results. G - H Comparison of spatial cell-cell distances between annotated and predicted cell types. Violin plots showing the distribution of distances from each cell type to its nearest neighbour. The numbers below represent the median. G Distances derived from annotated cell types. H Distances derived from model-predicted cell types. Download figure Open in new tab Supplementary Figure S2: Representative examples of correctly classified and misclassified cases by the SHEST model for the Xenium V1 Human Lung Cancer Addon panel dataset. A Macrophage → Macrophage (correct) vs. Macrophage → Lymphocyte (misclassified). The far right is for compar-ison, showing Lymphocyte → Lymphocyte (correct). B Endothelial_cell → Endothelial_cell (correct) vs. Endothelial_cell → Lymphocyte (misclassified). C Fibroblast → Fibroblast (correct) vs. Fibroblast → Lymphocyte (misclassified). Download figure Open in new tab Supplementary Figure S3: Spatial expression reconstruction by the SHEST model on the validation dataset derived from the 10X Xenium V1 Human Lung Cancer Multitissue panel case. A Top 20 Pearson correlation coefficients for genes comparing actual and reconstructed expression across all cells. B Distribution of actual expression for five key marker genes per cell type. C Distribution of reconstructed expression for the same marker genes shown in B , demonstrating the model’s 34 fidelity in capturing cell-type-specific signatures. D Spatial distribution map of Tumor_cell_LUAD cells (red) across the tissue section. E and F Spatial expression map of the tumour marker gene EPCAM : actual ( E ) and reconstructed ( F ). G Spatial distribution map of Fibroblasts (green). H and I Spatial expression map of the fibroblast marker gene LTBP2 : actual ( H ) and reconstructed ( I ). J Spatial distribution map of Lymphocytes (blue). K and L Spatial expression map of the lymphocyte marker gene CD3E : actual ( K ) and reconstructed ( L ). Download figure Open in new tab Supplementary Figure S4: Scatter plot of actual versus reconstructed Moran’s I spatial correlation for individual genes, based on expression values reconstructed by the SHEST model from the Xenium V1 Human Lung Cancer Multitissue panel dataset. A A scatter plot showing the actual versus reconstructed Moran’s I values across the full range. B A zoomed-in view of the dense region, with only genes within a 20% relative error margin of the line of identity labelled. Marker genes are colour-coded according to their associated cell type. View this table: View inline View popup Download powerpoint Supplementary Table S1: Categorisation of cell subtypes into cell types within the lung cancer tissue. Funder Information Declared Ministry of Health and Welfare , RS-2023-CC138390 Samsung Medical Center , #SMO125034 Footnotes https://cellxgene.cziscience.com/datasets https://www.10xgenomics.com/datasets/ References [1]. ↵ Jorge Almagro , Hendrik A Messal , Alberto Elosegui-Artola , Jacco van Rheenen , and Axel Behrens . Tissue architecture in tumor initiation and progression . Trends in cancer , 8 ( 6 ): 494 – 505 , 2022 . OpenUrl PubMed [2]. ↵ Eduard Chelebian , Christophe Avenel , and Carolina Wählby . Combining spatial transcriptomics with tissue morphology . Nature Communications , 16 ( 1 ): 4452 , 2025 . OpenUrl [3]. ↵ Metin N Gurcan , Laura E Boucheron , Ali Can , Anant Madabhushi , Nasir M Rajpoot , and Bulent Yener . Histopathological image analysis: A review . IEEE reviews in biomedical engineering , 2 : 147 – 171 , 2009 . OpenUrl PubMed [4]. ↵ Daniel C Koboldt , Karyn Meltz Steinberg , David E Larson , Richard K Wilson , and Elaine R Mardis . The next-generation sequencing revolution and its impact on genomics . Cell , 155 ( 1 ): 27 – 38 , 2013 . OpenUrl CrossRef PubMed Web of Science [5]. ↵ Tong Fu , Lei-Jie Dai , Song-Yang Wu , Yi Xiao , Ding Ma , Yi-Zhou Jiang , and Zhi-Ming Shao . Spatial architecture of the immune microenvironment orchestrates tumor immunity and therapeutic response . Journal of hematology & oncology , 14 ( 1 ): 98 , 2021 . [6]. ↵ Michael A Cilento , Christopher J Sweeney , and Lisa M Butler . Spatial transcriptomics in cancer research and potential clinical impact: a narrative review . Journal of cancer research and clinical oncology , 150 ( 6 ): 296 , 2024 . [7]. ↵ Hamid Reza Tizhoosh and Liron Pantanowitz . Artificial intelligence and digital pathology: challenges and opportunities . Journal of pathology informatics , 9 ( 1 ): 38 , 2018 . [8]. ↵ Cameron G Williams , Hyun Jae Lee , Takahiro Asatsuma , Roser Vento-Tormo , and Ashraful Haque . An introduction to spatial transcriptomics for biomedical research . Genome medicine , 14 ( 1 ): 68 , 2022 . [9]. ↵ Yang Jin , Yuanli Zuo , Gang Li , Wenrong Liu , Yitong Pan , Ting Fan , Xin Fu , Xiaojun Yao , and Yong Peng . Advances in spatial transcriptomics and its applications in cancer research . Molecular Cancer , 23 ( 1 ): 129 , 2024 . [10]. ↵ Joseph Bergenstråhle , Ludvig Larsson , and Joakim Lundeberg . Seamless integration of image and molecular analysis for spatial transcriptomics workflows . BMC genomics , 21 ( 1 ): 482 , 2020 . [11]. ↵ Simon Graham , Quoc Dang Vu , Shan E Ahmed Raza , Ayesha Azam , Yee Wah Tsang , Jin Tae Kwak , and Nasir Rajpoot . Hover-net: Simultaneous segmentation and classification of nuclei in multi-tissue histology images . Medical image analysis , 58 : 101563 , 2019 . [12]. ↵ Ruichen Rong , Hudanyun Sheng , Kevin W Jin , Fangjiang Wu , Danni Luo , Zhuoyu Wen , Chen Tang , Donghan M Yang , Liwei Jia , Mohamed Amgad , et al. A deep learning approach for histology-based nucleus segmentation and tumor microenvironment characterization . Modern Pathology , 36 ( 8 ): 100196 , 2023 . [13]. ↵ Joseph Redmon , Santosh Divvala , Ross Girshick , and Ali Farhadi . You only look once: Unified, real-time object detection . In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 779 – 788 , 2016 . [14]. ↵ Fabian Hörst , Moritz Rempe , Lukas Heine , Constantin Seibold , Julius Keyl , Giulia Baldini , Selma Ugurel , Jens Siveke , Barbara Grünwald , Jan Egger , et al. Cellvit: Vision transformers for precise cell segmentation and classification . Medical Image Analysis , 94 : 103143 , 2024 . [15]. ↵ Alexey Dosovitskiy . An image is worth 16×16 words: Transformers for image recognition at scale . arXiv preprint arXiv:2010.11929 , 2020 . [16]. ↵ Wenwen Min , Zhiceng Shi , Jun Zhang , Jun Wan , and Changmiao Wang . Multimodal contrastive learning for spatial gene expression prediction using histology images . Briefings in Bioinformatics , 25 ( 6 ):bbae551, 2024 . [17]. ↵ Xiao Xiao , Yan Kong , Ronghan Li , Zuoheng Wang , and Hui Lu . Transformer with convolution and graph-node co-embedding: An accurate and interpretable vision backbone for predicting gene expres-sions from local histopathological image . Medical Image Analysis , 91 : 103040 , 2024 . [18]. ↵ Yuansong Zeng , Zhuoyi Wei , Weijiang Yu , Rui Yin , Yuchen Yuan , Bingling Li , Zhonghui Tang , Yutong Lu , and Yuedong Yang . Spatial transcriptomics prediction from histology jointly through transformer and graph neural networks . Briefings in Bioinformatics , 23 ( 5 ), 2022 . [19]. ↵ Yuran Jia , Junliang Liu , Li Chen , Tianyi Zhao , and Yadong Wang . Thitogene: a deep learning method for predicting spatial transcriptomics from histological images . Briefings in Bioinformatics , 25 ( 1 ), 2023 . [20]. ↵ Yue Zhao , Elaheh Alizadeh , Yang Liu , Ming Xu , J Matthew Mahoney , and Sheng Li . Inferring single-cell spatial gene expression with tissue morphology via explainable deep learning . bioRxiv , pages 2024–06, 2024 . [21]. ↵ Adam Paszke , Sam Gross , Francisco Massa , Adam Lerer , James Bradbury , Gregory Chanan , Trevor Killeen , Zeming Lin , Natalia Gimelshein , Luca Antiga , et al. Pytorch: An imperative style, high-performance deep learning library . Advances in neural information processing systems , 32 , 2019 . [22]. ↵ Luca Marconato , Giovanni Palla , Kevin A Yamauchi , Isaac Virshup , Elyas Heidari , Tim Treis , Wouter-Michiel Vierdag , Marcella Toth , Sonja Stockhaus , Rahul B Shrestha , et al. Spatialdata: an open and universal data framework for spatial omics . Nature methods , 22 ( 1 ): 58 – 62 , 2025 . OpenUrl PubMed [23]. ↵ F Alexander Wolf , Philipp Angerer , and Fabian J Theis . Scanpy: large-scale single-cell gene expression data analysis . Genome biology , 19 ( 1 ): 15 , 2018 . [24]. ↵ CZI Cell Science Program, Shibla Abdulla , Brian Aevermann , Pedro Assis , Seve Badajoz , Sidney M Bell , Emanuele Bezzi , Batuhan Cakir , Jim Chaffer , Signe Chambers , et al. Cz cellxgene discover: a single-cell data platform for scalable exploration, analysis and modeling of aggregated data . Nucleic acids research , 53 (D1): D886 – D900 , 2025 . OpenUrl CrossRef PubMed [25]. ↵ Simon Mages , Noa Moriel , Inbal Avraham-Davidi , Evan Murray , Jan Watter , Fei Chen , Orit Rozenblatt-Rosen , Johanna Klughammer , Aviv Regev , and Mor Nitzan . Tacco unifies annotation transfer and decomposition of cell identities for single-cell and spatial omics . Nature biotechnology , 41 ( 10 ): 1465 – 1473 , 2023 . OpenUrl CrossRef PubMed [26]. ↵ Charlie Saillard , Rodolphe Jenatton , Felipe Llinares-López , Zelda Mariet , David Cahané , Eric Durand , and Jean-Philippe Vert . H-optimus - 0 , 2024 . [27]. ↵ Leland McInnes , John Healy , and James Melville . Umap: Uniform manifold approximation and pro-jection for dimension reduction . arXiv preprint arXiv:1802.03426 , 2018 . [28]. ↵ Martin Ester , Hans-Peter Kriegel , Jörg Sander , Xiaowei Xu , et al. A density-based algorithm for discovering clusters in large spatial databases with noise. In kdd , volume 96 , pages 226–231, 1996 . [29]. ↵ Yoshimasa Nakazato , Yuko Minami , Hiromi Kobayashi , Kaishi Satomi , Yoichi Anami , Koji Tsuta , Ryota Tanaka , Masafumi Okada , Tomoyuki Goya , and Masayuki Noguchi . Nuclear grading of primary pulmonary adenocarcinomas: correlation between nuclear size and prognosis . Cancer , 116 ( 8 ): 2011 – 2019 , 2010 . OpenUrl CrossRef PubMed Web of Science [30]. ↵ Jimmy Lei Ba , Jamie Ryan Kiros , and Geoffrey E Hinton . Layer normalization . arXiv preprint arXiv:1607.06450 , 2016 . [31]. ↵ William H Greene . Accounting for excess zeros and sample selection in poisson and negative binomial regression models . 1994 . [32]. ↵ Ilya Loshchilov and Frank Hutter . Decoupled weight decay regularization . arXiv preprint arXiv:1711.05101 , 2017 . [33]. ↵ Sharan Chetlur , Cliff Woolley , Philippe Vandermersch , Jonathan Cohen , John Tran , Bryan Catanzaro , and Evan Shelhamer . cudnn: Efficient primitives for deep learning . arXiv preprint arXiv:1410.0759 , 2014 . [34]. ↵ Kaiming He , Xiangyu Zhang , Shaoqing Ren , and Jian Sun . Delving deep into rectifiers: Surpassing human-level performance on imagenet classification . In Proceedings of the IEEE international confer-ence on computer vision, pages 1026–1034, 2015 . [35]. ↵ Ton N Schumacher and Daniela S Thommen . Tertiary lymphoid structures in cancer . Science , 375 ( 6576 ):eabf9419, 2022. [36]. ↵ Lucia Tombolan , Elisabetta Rossi , Angelica Zin , Luisa Santoro , Paolo Bonvini , Rita Zamarchi , and Gianni Bisogno . Pediatric sarcomas display a variable epcam expression in a histology-dependent manner . Translational Oncology , 13 ( 11 ): 100846 , 2020 . [37]. ↵ Jiarong Zhan , Mengqing Li , Lei Li , Ting-Ting Zeng , Jun Liu , Qingyun Chen , Chen Jiang , Xin Wang , Dan Xie , Xin-Yuan Guan , et al. Targeting ltbp2 derived from cancer-associated fibroblasts sensitizes esophageal squamous cell carcinoma to chemotherapy . Cancer Research , 85 ( 13 ): 2412 – 2428 , 2025 . OpenUrl PubMed [38]. ↵ Zixu Chen , Bang Hu , Keyu Cai , Han Gao , Zhenyu Xian , Shuang Zhang , Zhen Fang , Qian Zhou , Donglin Ren , and Qi Zou . Dynamics of tertiary lymphoid structures and immune cross talk in early versus advanced colorectal cancer: potential implications for immunotherapy. Cancer Immunology , Immunotherapy , 74 ( 6 ): 185 , 2025 . [39]. ↵ Khalaf Mohamed Almazrouei , Vartika Mishra , Hetal Pandya , Kumar Sambhav , Sainath Narayan Bhavsar , and Sainath Narayan N Bhavsar Sr. Tumor microenvironment and its role in cancer progres-sion: An integrative review . Cureus , 17 ( 9 ): e92707 , 2025 . OpenUrl [40]. ↵ Kaimin Hu , Yinan Wu , Yajing Huang , Meiqi Zhou , Yanyan Wang , and Xingru Huang . Annotation-free deep learning algorithm trained on hematoxylin & eosin images predicts epithelial-to-mesenchymal transition phenotype and endocrine response in estrogen receptor-positive breast cancer . Breast Cancer Research , 27 ( 1 ): 6 , 2025 . [41]. ↵ Rohit Arora , Christian Cao , Mehul Kumar , Sarthak Sinha , Ayan Chanda , Reid McNeil , Divya Samuel , Rahul K Arora , T Wayne Matthews , Shamir Chandarana , et al. Spatial transcriptomics reveals distinct and conserved tumor core and edge architectures that predict survival and targeted therapy response . Nature communications , 14 ( 1 ): 5029 , 2023 . OpenUrl [42]. ↵ Yichen Si , Joo Sang Lee , Goo Jun , Hyun Min Kang , and Jun Hee Lee . Spatial omics enters the microscopic realm: opportunities and challenges . Trends in Genetics , 2025 . View the discussion thread. Back to top Previous Next Posted November 19, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following SHEST: Single-cell-level artificial intelligence from haematoxylin and eosin morphology for cell type prediction and spatial transcriptomics reconstruction Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share SHEST: Single-cell-level artificial intelligence from haematoxylin and eosin morphology for cell type prediction and spatial transcriptomics reconstruction Hoyeon Jeong , Junghan Oh , Donggeon Lee , Jae Hwan Kang , Yoon-La Choi bioRxiv 2025.11.19.689364; doi: https://doi.org/10.1101/2025.11.19.689364 Share This Article: Copy Citation Tools SHEST: Single-cell-level artificial intelligence from haematoxylin and eosin morphology for cell type prediction and spatial transcriptomics reconstruction Hoyeon Jeong , Junghan Oh , Donggeon Lee , Jae Hwan Kang , Yoon-La Choi bioRxiv 2025.11.19.689364; doi: https://doi.org/10.1101/2025.11.19.689364 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41913) Biophysics (21436) Cancer Biology (18578) Cell Biology (25482) Clinical Trials (138) Developmental Biology (13372) Ecology (19889) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15599) Genomics (22483) Immunology (17728) Microbiology (40365) Molecular Biology (17163) Neuroscience (88540) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15130) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9818) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-06-17T06:32:23.968882+00:00