Self-Supervised AI Reveals a Hidden Landscape of Prognostic Spatial Patterns in Multiplex Immunofluorescence Images

doi:10.1101/2025.10.16.682563

Self-Supervised AI Reveals a Hidden Landscape of Prognostic Spatial Patterns in Multiplex Immunofluorescence Images

2025 · doi:10.1101/2025.10.16.682563

preprint OA: gold CC-BY-4.0

📄 Open PDF Full text JSON View at publisher

Full text 74,607 characters · extracted from preprint-html · click to expand

Self-Supervised AI Reveals a Hidden Landscape of Prognostic Spatial Patterns in Multiplex Immunofluorescence Images | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Self-Supervised AI Reveals a Hidden Landscape of Prognostic Spatial Patterns in Multiplex Immunofluorescence Images View ORCID Profile Lucas Farndale , View ORCID Profile Kai Rakovic , View ORCID Profile Phimmada Hatthakarnkul , View ORCID Profile Sona Szalma , Leonor Schubert Santana , View ORCID Profile Silvia Martinelli , Fiona Ballantyne , View ORCID Profile Rachel L Baird , View ORCID Profile Ian R Powley , View ORCID Profile Leah Officer-Jones , Noori Maka , View ORCID Profile Campbell Roxburgh , View ORCID Profile Crispin J Miller , View ORCID Profile David Chang , View ORCID Profile Joanne Edwards , View ORCID Profile Edward W Roberts , View ORCID Profile John Le Quesne , View ORCID Profile Ke Yuan doi: https://doi.org/10.1101/2025.10.16.682563 Lucas Farndale 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lucas Farndale Kai Rakovic 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom 3 Department of Pathology, Queen Elizabeth University Hospital , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Kai Rakovic Phimmada Hatthakarnkul 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Phimmada Hatthakarnkul Sona Szalma 4 School of Medicine, Medical Sciences and Nutrition, University of Aberdeen , Aberdeen, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sona Szalma Leonor Schubert Santana 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site Silvia Martinelli 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Silvia Martinelli Fiona Ballantyne 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rachel L Baird 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Rachel L Baird Ian R Powley 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ian R Powley Leah Officer-Jones 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Leah Officer-Jones Noori Maka 3 Department of Pathology, Queen Elizabeth University Hospital , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site Campbell Roxburgh 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom 5 Academic Unit of Surgery, University of Glasgow , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Campbell Roxburgh Crispin J Miller 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Crispin J Miller David Chang 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom 5 Academic Unit of Surgery, University of Glasgow , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for David Chang Joanne Edwards 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joanne Edwards Edward W Roberts 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Edward W Roberts John Le Quesne 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom 3 Department of Pathology, Queen Elizabeth University Hospital , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for John Le Quesne For correspondence: john.lequesne{at}glasgow.ac.uk ke.yuan{at}glasgow.ac.uk Ke Yuan 1 School of Cancer Sciences, University of Glasgow , Glasgow, United Kingdom 2 Cancer Research UK Scotland Institute , Glasgow, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ke Yuan For correspondence: john.lequesne{at}glasgow.ac.uk ke.yuan{at}glasgow.ac.uk Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Modern spatial proteomic methods, such as multiplex immunofluorescence (mIF) imaging, offer a data-rich view of spatial biology in intact tissues. However, interpreting its complexity is a major bottleneck, limiting its potential for biological discovery and clinical translation. Current computational methods often rely on segmentation-based approaches that discard crucial morphological information and are limited to testing pre-defined hypotheses. Here, we introduce a self-supervised learning (SSL) framework that enables hypothesis-agnostic, context-aware discovery of biomarkers directly from mIF images. Our approach extracts rich feature representations that capture holistic architectural patterns, which integrate cellular morphology, marker interactions, and microenvironmental context without human supervision. Applying this framework to over 7,000 mIF tissue images from over 1,800 patients in two distinct cancer types, we demonstrate superior prognostic performance over conventional segmentation analyses. The method autonomously identified previously unknown and potentially clinically actionable biological patterns. In lung adenocarcinoma, these include a Ki67-mediated immune evasion phenotype, a sub-cellular pattern of GLB1 expression which aligns with low-grade EGFR -driven tumours, and distinct modes of tumour-immune interaction in PD-L1 + patients. We also find a regulatory T-cell-mediated immunosupressive environment promoting tumour budding in colorectal carcinoma. Our work establishes SSL as a powerful, scalable, and unbiased platform to decode tissue ecosystems while being fully explainable without pre-defined hypotheses. This paradigm shift transforms high-plex imaging from a hypothesistesting tool into a hypothesis-generating engine that can accelerate the discovery of next-generation spatial biomarkers. 1 Main mIF allows simultaneous quantification of many protein markers within a single intact tissue section, potentially unlocking a new era of spatially informed precision oncology. However, the very richness of mIF data has outpaced our ability to interpret it, leaving its transformative potential unrealised. Clinical histopathology is routinely supplemented by additional single-plex proteomic assays to help guide patient management, for example to select patients for personalised therapies in lung adenocarcinoma (LUAD) [ 1 , 2 , 3 ] or to augment diagnosis in colorectal carcinoma (CRC) [ 4 ]. Sequential single-plex assays are tissue-exhaustive, a key limitation when personalised therapies and associated biomarkers proliferate [ 5 ] as diagnostic tissue is a scarce resource. Widespread use of mIF faces two fundamental analytical challenges. First, human perception is illequipped to interpret high-plex images: simultaneously viewing dozens of markers exceeds cognitive limits, and composite visualisations become crowded, obscuring critical interactions. Second, conventional computational tools rely on cell segmentation, reducing complex tissue ecosystems to simplistic celltype counts or neighbourhood graphs. Segmentation pipelines are imperfect, discard morphological information, and, crucially, the information generated from their data is unlikely to lead to discovery beyond the confines of pre-defined hypotheses. Even with perfect segmentation, the combinatorial explosion of cell states and interactions makes exhaustive analysis prohibitively expensive. SSL offers a paradigm shift by learning directly from image data without manual supervision. In contrast to supervised methods constrained by annotated training sets [ 6 , 7 ] joint-embedding SSL extracts semantically meaningful features by identifying invariant patterns across augmented views of the same image. Typically, redundant patterns such as biological noise are obscured by the augmentations, while meaningful patterns such as protein localisation are unaffected. This approach is uniquely suited to mIF: it preserves spatial and morphological information, integrates signals across all markers simultaneously, and avoids biases introduced by segmentation or pre-defined cell states. SSL applications in pathology have focused on brightfield images [ 8 , 9 , 10 , 11 ], but the potential to discover high-order, cross-protein spatial biomarkers in full mIF datasets remains untapped. Our framework learns rich spatial features directly from image pixels, bypassing the need for error-prone cell segmentation in the discovery phase. We then use these learned image features as a sub-strate for Leiden community detection, discovering spatial proteomic clusters (SPCs) - clusters of similar image tiles that represent recurrent spatial neighbourhoods. We then characterise these discovered patterns using traditional segmentation for interpretation and biological validation only, bridging the gap between data-driven discovery and established pathology. The key contribution of this work is that we are able to discover spatial biomarkers by holistically considering tissue architecture and cell interactions without wasting information about their morphology or marker distribution. No existing study has enabled hypothesis-agnostic discovery of biomarkers from mIF which have withstood validation in holdout patient sets. While recent work has applied SSL to multiplexed imaging [ 6 , 12 , 13 , 14 , 15 ], these approaches lack validation, with the focus generally on characterising cells or patients with little consideration for tissue morphology. Here, we introduce a discriminative SSL framework that identifies prognostic biomarkers validated in 7,000+ mIF tissue microarray (TMA) cores across two cancers. Our approach identifies prognostic tissue patterns, integrating marker interactions, cellular morphology, and tumour microenvironment context without prior assumptions. We validate the method in three key ways: (i) demonstrating the ability to predict survival in a held-out test set, (ii) uncovering novel candidate biomarkers, and (iii) showing cross-panel generality through integrated analysis of independent staining panels. Crucially, all biomarkers emerge from the data itself — without manual supervision or pre-formed hypotheses — yet remain interpretable through pathologist-validated patterns. 2. Results A typical, hypothesis-driven mIF analysis would involve cell segmentation followed by assignment of phenotypes to cells using the mean fluorescence intensities of key lineage markers ( Figure 1 ). In our analysis, we take a new approach, allowing the SSL model to generate hypotheses which we can then investigate using classical methods. We used three mIF panels from two common solid malignancies to explore the utility of our methodology. The Leicester Archival Thoracic Tumour Investigatory Cohort - Adenocarcinoma (LATTICeA) cohort [ 16 ] was stained with two panels: an immuno-oncology (LATTICeA-IO) panel, and a hallmarks of cancer panel (LATTICeA-Hallmarks). We also used an immuno-oncology panel on cores from the Glasgow Royal Infirmary (GRI) colorectal cancer cohort [ 17 ]. The details of the constituents of each panel are detailed in Supplementary Figure S1 and in the methods. Download figure Open in new tab Figure 1. Comparison of (a) a classical analysis and (b) our self-supervised approach. Segmentation-based approaches typically lose informative features such as extracellular matrix components, cell morphology, and subcellular marker intensity variation. SSL-based approaches are designed to lose redundant information but can encode the features missed by conventional approaches. A classical initial analysis would involve calculation of cell lineage densities per case, and straightforward linking of these to prognosis. In the LATTICeA-IO panel, this gives us a modest ability to stratify patients by CD8 + T-cell density (Supplementary Figure S2a, b). This is a useful preliminary insight into the data, but there is considerably more supervision required to identify more complex spatial patterns in the tissue. As an alternative to using this level of supervision, we used an SSL approach to generate patterns of interest by clustering 224 × 224px tiles from each core based on their latent representations. In parallel, we performed classical cell segmentation-based workflows to assign cell identities within tiles in a spatially resolved manner. This means that for each tile we were able to quantify the SSL-discovered phenotypes by classically discovered cell types, aiding the interpretation of the clusters. SSL cluster frequencies were then linked to outcome and other key clinicopathological states ( Figure 2 ). Download figure Open in new tab Figure 2. Method overview. Core images are tiles at 224 × 224 px (100 μ m), and tiles containing less than 90% tissue area are discarded. The remaining tiles are used to train a self-supervised model to map tiles to latent representations. Leiden clustering is used to obtain spatial proteomic clusters, and the distribution of these clusters is used to construct a vector characterising each patient in terms of the clusters. These vectors are then investigated for associations with survival, aligned to classical cell phenotype analysis, neighbourhood discovery, pathologists’ visual interpretation, and gene expression/mutation data. 2.1. Automated discovery of recurrent spatial clusters Our methodology identifies recurrent spatial patterns across the data, encoding these features into SPCs. For each mIF panel, we annotate each SPC with a short description based on a visual histopathological assessment ( Figure 3a, c ). We saw broad colocalisation of major cell lineages in distinct locales on the uniform manifold approximation and projection (UMAP) plot ( Figure 3a , Supplementary Figures S3a-e). We saw similar patterns replicated across the LATTICeA-Hallmarks panel ( Figure 3b , Supplementary Figures S3f-m). Download figure Open in new tab Figure 3. Clusters of mIF image tiles have consistent morphologies. (a, b) Uniform manifold approximation and projection plot of the immuno-oncology panel (a) and hallmarks of cancer panel (b) embedding spaces. Tile images 100 μ m in width. (c) Mean cell densities of cell lineages by cluster (LATTICeA-IO), as z-scores. (d) Mean cell fluorescence intensity of markers by cluster (LATTICeA-Hallmarks), as z-scores. (e) Kaplan Meier curve by risk group using information from both LATTICeA panels. High risk n = 469, low risk n = 514. (f) Kaplan Meier curve by risk group using information from LATTICeA-IO. High risk n = 451, low risk n = 539. (g) Kaplan Meier curve by risk group using information from LATTICeA-Hallmarks. High risk n = 509, low risk n = 482. cC3 cleaved caspase-3. Despite the clear segregation of immune-, stromal- and tumour-dominated regions in the latent space, in the LATTICeA-IO panel we identified that heterogeneous higher-order interactions between cells are captured by the latent representations. For example, we saw clear division in the latent space between SPCs with and without inflammatory cells, as well as neighbourhoods characterised by recurrent relative compositions of these cells ( Figure 3b ). Similarly, in the LATTICeA-Hallmarks panel we saw SPCs characterised by tumour cells expressing PD-L1, or by hypoxia (which show concomitant downregulation of angiogenesis; Figure 3d ). We used two-fold cross validation to establish the prognostic power of the SPCs we discovered, yielding substantial test set sizes with which to validate our observations. Our predictive model was able to successfully stratify patients into high- and low-risk groups using combined information from both panels and individually ( Figures 3e-g , Supplementary Figures S4a-d). 2.2 Proliferating tumour cells evade immune attack Several SPCs from the LATTICeA-IO panel were characterised by their constituent cells high Ki67 expression ( Figure 4a ). We sought to look at SPC 22 in closer detail, as it was characterised by proliferating tumour cells adjacent to a mixed inflammatory population ( Figure 4b ). The ubiquity of this interesting phenotype led us to hypothesise that T-cell density may be less beneficial for survival in the context of high tumour proliferation. Therefore we explored its clinicopathological significance, by investigating the relationship between T-cell density, proliferation and prognosis. We stratified all patients by their tumour cell mean Ki67 fluorescence intensity and T-cell density, and found that T-cell density is a positive prognostic factor only in the low-proliferation setting ( Figures 4c, d ). This remained in a sub-analysis of PD-L1 positive patients, which is more common in the high proliferative group (Supplementary Figures S5a-c). Download figure Open in new tab Figure 4. (a) Mean Ki67 fluorescence intensity by cell type and SPC. (b) Example pseudocoloured tile images from SPC 22. Tile width 100 μ m. (c) Kaplan Meier curve showing all Ki67-low cases stratified by T-cell density. T-cell high n = 224, T-cell low n = 218. (d) Kaplan Meier curve showing all Ki67-high cases stratified by T-cell density. T=cell high n = 235, T-cell low n = 246. (e) Enrichment of MCP counter cell type deconvolution estimates in Ki67 high versus low cases expressed as log 2 fold change. Statistical significance tested with a two-tailed Mann-Whitney-U test; * p < 0.05 after correction for multiple testing with the Benjamini-Hochberg method. (f) Schematic of validation on TCGA-LUAD bulk RNA-Seq data. (g) Kaplan-Meier curve showing all MKI67 -low patients from TCGA-LUAD stratified by estimated T-cell abundance. T-cell high n = 129, T-cell low n = 123. (h) Kaplan-Meier curve showing all MKI67 -high patients from TCGA-LUAD stratified by estimated T-cell abundance. T-cell high n = 123, T-cell low n = 129. Using RNA-Seq-based cell type deconvolution, we found small but significant differences in microenvironmental cell abundance estimates between Ki67-high/T-cell-high and Ki67-low/T-cell-high tumours. The Ki67-high tumours had increased fibroblast numbers and more cytotoxic lymphocytes compared to the Ki67-low tumours, which were better vascularised, and had increased myeloid dendritic cells and neutrophils ( Figure 4e ). In order to validate this observation, we similarly classified patients using bulk transcriptomic data from The Cancer Genome Atlas (TCGA). We estimated T cell abundance using microenvironment cell type deconvolution [ 18 ] and used MKI67 gene expression as a marker of proliferation ( Figure 4f ). We found, again, that the ability of T-cell density to stratify patients into risk groups is successful only in low-proliferation tumours ( Figures 4g, h ). This has implications for translatability, suggesting that we should consider proliferation rates alongside measures of immune cell engagement when assessing prognosis and the likelihood of response to immunotherapies. 2.3 Regulatory T-cells contribute to the high-stroma phenotype in colorectal carcinoma Next, we turned our attention to the GRI cohort. As with the LUAD panels, we found broad structure in the tile embedding space, with predominantly-stromal and -tumour clusters occupying separate niches ( Figure 5a , Supplementary Figures S6a-g). We were, again, able to link the frequency of SPCs to prognosis and stratify patients into risk groups in the held-out test set ( Figure 5b ), We again calculated cell density measures for each cluster on this panel and found distinctions between stroma and tumour clusters ( Figure 5c ). In particular, there was marked heterogeneity in stromal composition amongst those stromal clusters. SPC 11 was notable, as this was enriched in regulatory T-cells () and macrophages while being depleted for CD8 + T-cells, suggesting a protumour microenvironment. T reg Download figure Open in new tab Figure 5. (a) Uniform manifold approximation and projection plot of the embedding space from the GRI-CRC panel and example SPC tile images. Tile diameters 100 μ m. (b) Kaplan Meier curve by risk group using information from the GRI-CRC panel. High risk n = 276, low risk n = 250. (c) Mean cell type densities by cluster, as z-scores. (d) SPC 11 score by tumour budding grade. High grade n = 225, low grade n = 532. Statistical significance from a two-tailed Mann-Whitney-U test. (e) SPC 11 score by consensus molecular subtype. CMS1 n = 28, CMS2 n = 91, CMS3 n = 10, CMS4 n = 33. Statistical significance from the Kruskal-Wallis test. (f) Correlation matrix showing the cluster proportion score against the single sample gene set enrichment analysis score of hallmark pathways. Pathways with at least one significant relationship with a cluster are shown. Values are Spearman correlation coefficients; * p< 0.01 after correction for multiple testing applied with the Benjamini-Hochberg method. We found that enrichment for SPC 11 was associated with the high stroma content consensus molecular subtype (CMS) of CRC, CMS4 [ 19 ] as well as high grade tumour budding ( Figures 5d, e ). This is corroborated through pathway analysis, where we found that the proportion of SPC 11 on the slide correlated with transcriptomic modules of stromal remodelling such as epithelial-mesenchymal transition (EMT), myogenesis and angiogenesis, as well as those of immunosuppression via IL6/STAT3 signalling ( Figure 5f ) that has been shown to suppress anti-tumour effectors and promote regulatory immune cell function including T reg and to promote tumour cell growth and survival [ 20 , 21 ]. These observations support the notion that STAT3 signalling may promote an immunosuppressive environment throughout the tumour, which permits the evolution to a highly infiltrative phenotype with high grade tumour budding. While similar observations have been subsequently made [ 22 ], these were not apparent at the time of data-generation. This demonstrates the potential for this framework to make real, independently reproducible biological discovery autonomously, without human supervision. 2.4 Apical GLB1 expression is associated with low-grade EGFR-driven LUAD In order to integrate the information obtained from both LATTICeA panels, we next used non-negative matrix factorisation (NMF) to combine the SPCs from both panels to 12 components ( Figure 6a ). These 12 components incorporate the information obtained from all SPCs, and enable patients to be described in terms of combined features from each panel ( Figure 6a, c ; Supplementary Figure S7a). We performed an associative survival analysis to guide subsequent interrogation of components, finding components 6 and 11 to be associated with favourable prognosis (Supplementary Figure S7b, c). On visual inspection of tumours with a high component 6 score, we observed that many of these tumours were of lepidic or papillary growth pattern, which are lower grade LUAD phenotypes ( Figure 6b, d ). Interestingly, the defining spatial proteomic feature was epithelial GLB1 expression localising to the apical aspect of the cell ( Figure 6b, e ). Our approach allows for direct discovery of subtle image features such as the subcellular localisation of expressed proteins, which would never be identified using a classical cellsegmentation based approach. Download figure Open in new tab Figure 6. Apical GLB1 staining is associated with differentiation in LUAD. (a) Schematic of the integration of the two panels. SPC frequency vectors from the two panels are concatenated. A non-negative matrix factorisation model gives 12 components. (b) mIF image of a high component 6 core of LUAD with papillary architecture, displaying apical GLB1 intensity. Scale bars: core image 100 μ m; insert 10 μ m. (c) Heatmap showing the definitions of the 12 NMF components. A high score indicates a strong contribution of that SPC to that component’s score. (d) Association between core-level component score and growth pattern; * p < 0.05. (e) Scatter plot of core-level component 6 score against mean tumour cell GLB1 fluorescence intensity and Spearman correlation. (f) Component 6 score by EGFR mutation status; WT n = 559, mutant n = 79. (g) Component 6 score by smoking status; current or previous n = 736, never smoked n = 65. (h) Overall survival by dichotomised component 6 score in IASLC grade 3 tumours; C6 high n = 385, C6 low n = 137. (i) Gene set enrichment analysis based on differential expression analysis between component 6 high and low tumours. All statistical significance tested with a two-tailed Mann-Whitney-U test and p-value correction with the Benjamini-Hochberg method where appropriate unless otherwise specified. In addition to a relationship with low grade morphology, component 6 was also associated with EGFR mutation and never-smoking ( Figure 6f , g). This is a recognised phenotype in LUAD, with poorly understood biology, seen classically in young patients of East Asian ancestry. This observation gives us a glimpse into the biology of this phenomenon, and allows us to follow this up with focused functional experiments to understand its significance. Having discovered the relationship between component 6 and well-differentiated morphologies, we went back to the survival associations. We found that the component 6 score successfully stratifies all patients into survival risk groups (Supplementary Figure S7d). Given the relationship with low grade tumours, we performed a sub-group analysis and found that component 6 is only prognostic in IASLC grade 3 patients ( Figure 6h , Supplementary Figure S7e). This raises the possibility that this pattern of GLB1 expression is a marker of reduced biological virulence across the whole tumour, its presence in one area indicating that conventionally high-grade appearances elsewhere might actually be less deadly than expected from H&E morphology alone. This idea is further supported at the transcriptomic level, where component 6-high tumours showed downregulation of protumourigenic pathways such as those relating to proliferation and EMT ( Figure 6i ). 2.5 Spatial Organisation of Tumour Infiltrating Lymphocytes Predicts Prognosis in PD-L1 + Patients While the previous sections have focused on obtaining a global set of SPCs for each panel/cohort, we can obtain more granular insights by examining SPCs discovered within targeted patient subsets. For example, we examined the morphological heterogeneity within only the PD-L1 + patients, and discovered a new set of SPCs ( Figure 7a, b ), which we refer to as PD-L1 + SPCs. We found that many of these PD-L1 + SPCs highlighted recurrent patterns of tumour/immune interaction. For example PD-L1 + SPCs 24 and 3 show mutual exclusivity of tumour and immune cells, whereas in PD-L1 + SPCs 12 and 26 they tend to co-localise within the 100 μ m tile region ( Figure 7c ). These patterns were strongly predictive of survival in the held-out test set ( Figure 7d, e ). Download figure Open in new tab Figure 7. (a) SPCs are rediscovered from the LATTICeA-IO panel using only PD-L1 + patients. (b) UMAP plot of the embedding space of the PD-L1 + subset. (c) Example tiles of SPCs from the PD-L1 + set. Tile diameter 100 μ m. (d) Cox proportional hazards model linking PD-L1 + SPCs to overall survival (train/test split inherited from overall splits: train n = 101, test n = 77). Error bars are 95% confidence intervals. (e) Kaplan Meier plot of test set patients stratified by risk from the PD-L1 + SPC-based model (high-risk n = 85, low-risk n = 93). (f, g) Enrichment plot of cell types showing a tendency of cell types on the x-axis to be within the neighbourhood of cell types on the y-axis in (f) SPC 24 and (g) SPC 26. We quantified these spatial relationships by carrying out a neighbourhood analysis, measuring the differential enrichment of T-cells near tumour cells between patients enriched or depleted in the target PD-L1 + SPC. We found that in cores which were high for PD-L1 + SPC 24 there was a significant enrichment of both CD4 + and CD8 + T-cells as well as macrophages near tumour cells ( Figure 7f ), but in PD-L1 + SPC 26 we saw depletion of those same cell types ( Figure 7g ). We found that the frequency of these co-localisation events were key drivers of prognosis in the PD-L1 + subset (Supplementary Figures S8a-d). We also quantified the relationship between PD-L1 + SPCs and the original set. We found that despite near equivalence between SPC 17 and PD-L1 + SPC 26, this SPC was only predictive of prognosis in the PD-L1 + patient group (Supplementary Figure S9, Figure 7d ). These findings highlight the importance of context for the immune response to exhibit successful anti-tumour activity. The mere presence of tumourinfiltrating lymphocytes in PD-L1 + LUAD patients is insufficient to predict a good prognosis, and spatial measures are perhaps generally required to do so. To extend the idea further, it seems likely that a spatial measure of T-cell engagement may also prove valuable in predicting patient response to anti-PDL1 immunotherapy. 3 Discussion Our approach democratises the interpretation of multiplex imaging by introducing unsupervised, imagefirst analysis as a powerful hypothesis-generating engine. We shift the focus of mIF analysis from intractable whole core images to SPCs, recurrent morphological and spatial proteomic patterns in LUAD and CRC which encode the spatial architecture of each sample. By using SPCs first, we are unconstrained by cell segmentation-based analyses which can be reductionist and prone to losing subtle morphological cues such as patterns of subcellular staining. Crucially, SPCs are immediately interpretable and explainable, avoiding the black-box nature of many other AI methods. Some of these SPCs are candidate spatial biomarkers which are immediately translatable. For example, patient selection for immunotherapy is imperfect [ 23 ] and biomarkers beyond simple PD-L1 status have previously been explored [ 23 , 24 ]. The density and spatial positioning of immune cells has been previously shown to be related to immunotherapy treatment response in LUAD [ 25 , 26 ] and other solid malignancies [ 27 ]. Our findings support the notion that T-cell engagement is prognostically favourable for those with PD-L1 + tumours, and we identified these recurrent patterns within our data without any supervision. A limitation of our cohort is that we lack treatment-paired immunotherapy response data, but we envisage our methodology to be useful for providing candidate spatial biomarkers both for immunotherapy or for novel therapeutic agents in clinical trials in the future. Furthermore, we expect that the method could be used to derive highly accurate spatial biomarkers from biologically focused singleplex immunohistochemistry, or oligo-plex (eg 3 or 4 coloured dyes) chromogenic assays, both of which are compatible with current diagnostic laboratory workflows, unlike mIF. Others raise interesting avenues for further functional research. GLB1 is classically used as a protein marker of cellular senescence [ 28 ], but the punctate pattern of apical staining we observed in low-grade LUAD is distinct from that often seen in senescence, which appears much more diffuse [ 29 , 30 , 31 , 32 ]. The prognostic implications of GLB1 in solid malignancies are also mixed [ 29 , 30 , 31 , 32 ]. It is unclear whether this spatial proteomic feature is a manifestation of senescence, or a reflection of a metabolic state unique to low-grade EGFR-driven LUAD but is a fascinating avenue for further work. Three key limitations guide future work. First, while SSL automates pattern discovery, mechanistic understanding requires integration with orthogonal, functional experimental data. Second, tile-based analysis captures local but not global tissue architecture; graph-based extensions or hierarchical transformers could help integrate spatial information from multiple scales. Third, panel-specific training currently limits generalisability, as models are specific to the order and composition of panels. This makes the area ripe for innovation via marker-agnostic SSL architectures. As mIF and related spatial technologies mature, SSL offers a framework to decode the grammar of tumour ecosystems, where spatial relationships between cells or intracellular protein distribution may be as prognostic as molecular alterations. By transforming images into hypothesis-generating engines, this approach has the potential to greatly accelerate the discovery of next-generation biomarkers that integrate morphology, multiplex protein data, and cellular interactions, as well as driving biological discovery. Ultimately, such tools may shift pathology from descriptive reporting to predictive ecosystem modelling, ushering in an era of truly spatial precision medicine. We envision SSL as the foundation for a new class of spatial biomarkers, where AI-driven discovery guides targeted clinical assays. 4 Methods 4.1 Datasets We used 1 mm TMA tumour cores from two cancer cohorts: the Leicester Archival Thoracic Tumour Investigatory Cohort – Adenocarcinoma (LATTICeA) lung cancer cohort, which features > 2,500 cores from 1,025 lung adenocarcinoma patients, and 0.6 mm cores from the Glasgow Royal Infirmary (GRI) colorectal cancer cohort, featuring 829 cores from 787 colorectal cancer patients. The LATTICeA cohort was originally constructed under REC 14/EM/1159 (East Midlands REC), and the ongoing management of the resource by the Greater Glasgow and Clyde Biorepository was approved under an amendment granted by the Leicester South REC. Ongoing use of the collection is now managed under REC 16/WS/0207. The GRI cohort included 787 patients with stage IIII colorectal cancer (CRC) who underwent primary surgical resection at the Glasgow Royal Infirmary between 1997 and 2013. Patients were excluded if they received neoadjuvant therapy, had an emergency operation, or died within 30 days of surgery. Tumours were staged using the 5th Edition AJCC/UICC-TNM system and clinicopathological data were collected with a minimum 5-year follow-up post-resection. The study was approved by the West of Scotland Research Ethics Committee (REC 4: 22/ws/0207) and data are stored in the Greater Glasgow and Clyde Safehaven (SH21ON012). On LATTICeA, we applied two distinct staining panels: the immune cell phenotyping panel LATTICeA-IO and the hallmarks of cancer panel LATTICeA-Hallmarks . LATTICeA-IO uses markers DAPI, pan-CK, CD68, SMA, CD4, CD8, and Ki67. LATTICeA-Hallmarks uses markers DAPI, pan-CK, N-cadherin, Ca9, cC3, VWF, Ki67, PD-L1, and GLB1. We stained the GRI cohort with another immuno-oncology panel, featuring DAPI, pan-CK, CD68, SMA, FoxP3, CD3, and CD8. Datasets were split randomly into two folds on the patient level to avoid data leakage where cores from the same patient were assigned to different folds. Where applicable, e.g. for survival analysis, two-fold cross validation was used, and for analysis of clusters, the first fold was always used for training with the second fold kept as a holdout test set of equal size. Any findings which are associative across the whole cohort are clearly marked in the text. 4.2 Image Acquisition The GRI panel was prepared and imaged as previously described [ 17 ]. 4.2.1 LATTICeA-IO A multiplex assay consisting of 6 markers was applied to twenty-three 4 μ m-thick FFPE sections from the LATTICeA cohort using the Ventana Discovery Ultra autostainer (Roche Tissue Diagnostics, RUO Discovery Universal v21.00.0019). Discovery CC1 (Roche Tissue Diagnostics, 950-123) was applied to the section for 64 minutes at 95 °C for antigen retrieval. The antibodies were applied as follows: CD68 (D4B9C) (Cell Signaling Technologies, 76437) at 1:200 for 32 minutes, followed by Discovery OmniMap anti-Rb HRP (Roche Tissue Diagnostics, 760-4311) for 16 minutes, detected by Opal 690 (Akoya Bioscience, FP1497001KT) at 1:100 for 8 minutes; Pan-Cytokeratin (AE1/AE3) (Leica Biosystems AE1/AE3-601-L-CE) at 1:250 for 28 minutes, followed by Discovery OmniMap anti-Ms HRP (Roche Tissue Diagnostics, 05269652001) for 12 minutes, detected by Opal 620 (Akoya Bioscience, FP1495001KT) at 1:100 for 8 minutes; Smooth Muscle Actin (1A4) (Roche Tissue Diagnostics, 760-2833) for 32 minutes, followed by Discovery OmniMap anti-Ms HRP (Roche Tissue Diagnostics, 05269652001) for 12 minutes, detected by Opal 540 (Akoya Bioscience, FP1494001KT) at 1:200 for 8 minutes; CD4 (SP35) (Roche Tissue Diagnostics, 790-4423) for 1 hour, followed by Discovery OmniMap anti-Rb HRP (Roche Tissue Diagnostics, 760-4311) for 16 minutes, detected by Opal 570 (Akoya Bioscience, FP1488001KT) at 1:50 for 8 minutes; CD8 ω (C8/144B) (Cell Signaling Technologies, 70306) at 1:100 for 56 minutes, followed by Discovery OmniMap anti-Ms HRP (Roche Tissue Diagnostics, 05269652001) for 12 minutes, detected by Opal 520 (Akoya Bioscience, FP1487001KT) at 1:100 for 8 minutes; and Ki67 (30-9) (Roche Tissue Diagnostics, 790-4286) for 20 minutes, followed by Discovery OmniMap anti-Rb HRP (Roche Tissue Diagnostics, 760-4311) for 12 minutes, detected by Opal 650 (Akoya Bioscience, FP1496001KT) at 1:400 for 8 minutes. DAPI was applied as a nuclear counterstain (Roche Tissue Diagnostics, 05268826001). Whole slide images were acquired at 20x magnification using the PhenoImager HT (Akoya Bioscience, v1.0.13) and unmixed using Phenochart (Akoya Bioscience, v1.1). 4.2.2 LATTICeA-Hallmarks For LATTICeA-Hallmarks, 23 tissue micro arrays containing three 1.0mm cores from 1025 adenocarcinoma lung cancer patients (3075 cores in total) were cut at 4μm on TOMO slides (Matsunami, ref. TOM-11) and baked at 60°C for 60 minutes. The staining was performed using the Ventana Discovery Ultra, Roche Tissue Diagnostics, v21.00.0019. Slides were dewaxed and retrieved using Discovery CC1 (Roche Tissue Diagnostics, ref. 06414575001) for 32 minutes at 95°C. A stripping step was performed after each opal using CC2 (Roche Tissue Diagnostics, ref. 05279798001) at 100°C. The antibodies were applied in the following order: Cleaved Caspase 3 (Asp175), (Cell Signaling Technology, ref. 9661) at 1:200 for 32 minutes followed by Omnimap anti-rabbit HRP (Roche Tissue Diagnostics, ref. 05269679001) for 12 minutes, detected by Opal 650 (Akoya Biosciences, ref. FP1496001KT) at 1:200. N Cadherin [EPR1791-4], (Abcam, ref. 76011) at 1:50 for 64 minutes, followed by Omnimap anti-rabbit HRP (Roche Tissue Diagnostics, ref. 05269679001) for 32 minutes, detected by Opal 520, (Akoya Biosciences, ref. FP1487001KT) at 1:100. GLB1/Beta-galactosidase, (Abcam, ref. 96239) at 1:150 for 32 minutes, followed by OmniMap anti-rabbit HRP (Roche Tissue Diagnostics, ref. 05269679001) for 16 minutes, detected by Opal 540 (Akoya Biosciences, ref. FP1494001KT) at 1:200. Von Willebrand Factor [36B11] (Leica Biosystems, ref. NCL-L-vWF) at 1:25 for 32 minutes followed by Omnimap anti-mouse HRP, (Roche Tissue Diagnostics, ref. 05269652001) for 12 minutes, detected by Opal 570(Akoya biosciences, ref. FP1488001KT) at 1:100. Carbonic Anhydrase 9, (Abcam, ref. 15086) at 1:250 for 32 minutes, followed by Omnimap anti-rabbit HRP (Roche Tissue Diagnostics, ref. 05269679001) for 12 minutes, detected by Opal 690 (Akoya Biosciences, ref. FP1497001KT) at 1:200. Pan-Cytokeratin [AE1/AE3], (Leica Biosystems, ref. AE1/AE3-601-L-CE) at 1:250 for 32 minutes, followed by Omnimap anti-mouse HRP, (Roche Tissue Diagnostics, ref. 05269652001) for 12 minutes, detected by Opal 620 (Akoya Biosciences, ref. FP1495001KT) at 1:100. PD-L1 [E1L3N] (Cell Signalling Technology, ref. 13684) at 1:50 for 120 minutes followed by Ultramap anti-rabbit HRP (Roche Tissue Diagnostics, ref. 05269717007), detected by Opal 480 (Akoya Biosciences, ref. FP1500001KT) at 1:50. KI67 [ 30 - 9 ] (Roche Tissue Diagnostic, ref. 05278384001) for 20 minutes, followed by Omnimap anti-rabbit HRP (Roche Tissue Diagnostics, ref. 05269679001) for 12 minutes, detected by TSA-DIG (Akoya Biosciences, ref. FP1501001KT) for 24 minutes and Opal 780 (Akoya Biosciences, ref. FP1501001KT) for 60 minutes. QD DAPI (Roche Tissue Diagnostics, ref. 05268826001) was applied for 24 minutes as counterstain. All antibodies were previously validated in a chromogenic DAB assay using the Ventana Discovery Ultra. A single staining slide was created for each antibody – fluorophore combination to create a spectral library in InForm (Akoya Biosciences, v2.4.2) and an unstained section was used as autofluorescence reference. Slides were mounted and whole slide images were collected using PhenoImagerHT (Akoya Biosciences, v1.0.13) at 10x magnification, TMA maps were applied using Phenochart (v1.1.0) and individual core images were acquired at 20x magnification, and all images were spectrally unmixed using a project-specific spectral library using InForm. 4.3 Image Preprocessing Images were tiled using a 224 × 224px sliding window approach. Finding a tissue outline in mIF images is nontrivial, as there is a large amount of black space throughout the tissue in each image, so standard Otsu thresholding segments out large portions of the tissue. To obtain the tissue mask, the DAPI channel was dilated with a kernel size of 50px, and the image was then downsampled by a factor of 10. A median filter was applied with a filter size of 20px, and the image was resized back to its native resolution. This created images with an approximate outline of the tissue based on the DAPI channel. Otsu thresholding was then used to obtain a tissue mask. These operations were carried out using scikit-image [ 33 ]. To prevent the model overfitting on the background noise or the shape of the tissue edge, tiles that were less than 90% tissue were excluded. Any core images containing less than 20 tiles were rejected due to the lack of sample size. 4.4 Model Architecture and Training Every iteration, tiles were distinctly augmented to create two views of the same image, and each augmented tile was mapped to a latent representation by an eight-channel ResNet-50 [ 34 ]. To accommodate the increased number of imaging channels compared to standard RGB images, the augmentation regime used was: random left/right/up/down flipping ( p = 1), random cropping (scale between 0.75 and 1, p = 1), random augmentation of the background (additional of uniform values to all 0-valued pixels, p = 0.3), random rotation ( p = 0.4), solarisation ( p = 0.3), and gaussian noise ( p = 1). The VICReg [ 35 ] loss was used. This is a non-contrastive loss that does not require negative samples to avoid dimensional collapse, is computationally efficient, and has shown good results in other medical imaging settings [ 36 ]. In comparison to generative models that fixate on pixel-level reconstruction [ 37 , 13 , 12 ], joint-embedding architectures such as VICReg are designed to capture more meaningful, abstract features while ignoring more redundant information such as their exact appearance or location. The loss consists of three terms: variance v ( z i ), invariance s ( z 1 , z 2 ), and covariance c ( z i ), for embeddings z i ℝ D , i = 1, 2. The variance and covariance terms regularise the embedding to avoid collapse to a trivial solution, either by predicting the same score for all inputs or predicting the same value for each element of the embedding, respectively. The invariance term minimises the difference between the embeddings of each augmented image, meaning that the model should only learn features which are invariant to the augmentation regime–those features which have semantic meaning. Models were trained for 100 epochs on an Nvidia H100 GPU with 80Gb memory. All models and training were implemented in Tensorflow 2, using code from [ 38 , 39 , 40 ]. A batch size of 256 was used, and a warmup-cosine learning rate was used, with a maximum learning rate of 10 ×4 and a 10 epoch warmup schedule, starting from 0. A representation size of 2048 was used, with an embedding size of 8192 generated from a three layer projection head with learned batch normalisation and a ReLU activation at every layer, and a linear activation on the output. 4.5 Clustering and Analysis Methods Representations of each tile in the train set were clustered using the Leiden community detection algorithm with a resolution of 1. The learned mappings were then used to assign the test set to the same clusters. The frequency of each cluster label in all tiles across all cores from each patient was then used to construct a cluster frequency vector for that patient, normalised to 1. As the test set representations were mapped to the same clusters as the train set, this means each patient’s vector would have the same dimension. Where multiple panels were combined (LATTICeA-IO and LATTICeA-Hallmarks), we concatenated the cluster vectors and normalised the resulting vector to magnitude 1 for consistency. The clusters were visualised using UMAP projections into two dimensions to assess their distributions and check for any evidence of issues with the representation, cluster imbalance, or patient specificity. This was quantified in Figure S10 . The distribution of markers across clusters was visualised to ensure there was no overfitting to marker intensity, as illustrated in Figures 3 and S3. Marker intensities were scaled to the range 0-1 for visualisation purposes. For the PD-L1 positive patient cohort sub-analysis, patients were selected based on a tumour proportion score of ≥ 1%. Clustering was then performed on the train set tiles from these patients, with tiles from PD-L1 negative patients excluded. These clusters were distinct from those obtained from the whole cohort, and were used to more closely interrogate the tissue architecture of this patient subset. A map illustrating the relationship between the clusters in each cohort is shown in Figure S9c. 4.6 Survival Analysis Cox Proportional Hazards models with a L2 penalty of 0.05 were used to predict survival in each cohort. Survival data were truncated at 60 months, due to the paucity of events past this time. To control for the large number of variables relative to the sample size, variables were selected using two-fold cross validation on the train set. The train set was split into two folds, and each column was used to fit a univariate survival model with a penalty of 1. Only variables with mean c index greater than 0.5 were used in the Cox model. Forest plots were produced showing the log hazard ratios with 95% confidence intervals (Figure S4). 4.7 Cluster Characterisation Images were imported into Visiopharm (version 2021.09.2.10918). A deep learning analysis protocol package (APP), using the U-Net architecture, was trained over 288,000 iterations using 106 images with varied tissue morphologies as the training dataset. The APP utilised DAPI, autofluorescence, and cytokeratin channels for training to classify regions of necrosis, tumour, stroma, and background tissue in all LUAD core images to prepare for cell-level analysis. For cell-level analysis, a second, custom deep learning APP based on the U-Net architecture was developed using annotated images from 15 LUAD cores, selected to capture a broad range of cell types and sizes. Manual annotations were used to generate background labels, with additional boundary labels assigned to each nucleus for training purposes. The APP was trained over 127,000 iterations using DAPI and Ki67 channels as inputs. Cytoplasmic labels were generated by nuclear expansion by 20 px (9 μ m) for tumour cells and 10 px (4.5 μ m) for stromal cells. Cells were then delineated by using the boundary probability heatmap. For each segmented cell, mean marker intensities for each compartment spatial coordinates were extracted in the output variables section in the APP design for downstream analysis. 4.7.1 Preprocessing All fluorescence intensities were normalised prior to analysis using the combat function in scanpy v1.10.1. For LATTICeA-IO, mean whole cell intensity was used for all markers, except Ki67 where the mean nuclear intensity was used. Cells were phenotyped using k-means clustering, as implemented in scikit learn v1.4.0 [ 41 ] using k = 10. Where clusters were annotating the same cell type, these were merged to produce the final classification. For LATTICeA-Hallmarks, tissue segmentation labels were used to phenotype cells into two classes (tumour or stroma). For GRI, cell phenotyping was performed using Visiopharm image analysis software. The ready-to-use app Phenoplex was applied, with the thresholds used to assign marker positivity described in Supplementary Table S1. Cells which had positivity for conflicting markers (eg. CD68 and SMA), as well as cells which were not positive for any of the markers were considered unclassifiable. Cluster labels were projected onto original images and verified by a pathologist in all cases. 4.7.2 Quantification Cells were assigned to tiles based on the centroid of their nuclei, inclusive of those cells which overlap tile boundaries. Measures of cell density per cluster were derived from cell density per tile and the mean taken of all tiles in the cluster. A similar approach was taken to mean marker intensity. A sample of 100 tiles from the test set in each cluster was also manually inspected by a senior pathologist, and textual descriptions were assigned to each cluster based on their marker enrichment and visual appearance. These descriptions aimed to be holistic, and capture the recurring patterns among the sampled tiles, which were largely consistent in visual appearance. To integrate the information from both LATTICeA panels, the cluster sets from each panel were merged using non-negative matrix factorisation with k = 12. This yielded a set of twelve components which contained combined information from both panels. Neighbourhood analysis was performed using the Squidpy [ 42 ] and CellCharter [ 43 ] packages in Python. Each cell was assigned a label based on its lineage, and a directed adjacency matrix A is constructed by finding the 6 nearest spatial neighbours of each cell on a core-by-core basis. These adjacency matrices were used to calculate the asymmetric neighbourhood enrichment of each combination of cell types, which is the difference between observed and expected numbers of edges between cell types. Differential neighbourhood enrichment is the difference between the neighbourhood enrichment matrices in two conditions. The p value is calculated by randomly sampling by condition label for each sample with replacement. The p value is the proportion of cases where the magnitude of the permuted value is greater than the magnitude of the observed value. We sort patients by the prevalence of the PD-L1 + SPC of interest, splitting them into two groups based on the median value (where the PD-L1 + SPC is present in less than half of patients, the groups are presence of the PD-L1 + SPC against no presence). We calculate the differential neighbourhood enrichment between patients in these groups for each cell type. Co-localisation scores are calculated as the directed neighbourhood enrichment between the two cell types. We consider scores in both directions, and a combined score, which is the sum of the enrichments in each direction ( Figure 7h and Supplementary Figure S9a, b). These scores are used to separate patients into high- and low-enrichment groups, whose survival probabilities are presented. 4.8 Gene Expression and Genotyping LATTICeA gene expression was acquired and prepro-cessed as previously described [ 44 ]. For GRI, FFPE tissue digestion was carried out to obtain the tissue lysate and combined with detector oligos which annealed to the targeted RNA and ligated. Amplification of ligated oligos was then performed using a unique set of primers for each sample, introducing a sample-specific barcode and Illumina adaptors. Barcoded samples were pooled into a single library and run on an Illumina HiSeq 2500 High Output v4 flowcell. Sequencing reads were demultiplexed using BCL2FASTQ software (Illumina, USA). FASTQ files were aligned to the Human Whole Transcriptome v2.0 panel, which consists of 22,537 probes. Differential expression analysis was performed in DESeq2 v1.40.2 [ 45 ], GSEA in fgsea v1.26.0 [ 46 ] and ssGSEA in GSVA v1.48.3 [ 47 ] using the MSigDb hallmarks database. TCGA-LUAD RNA-Seq was obtained from Xena Browser [ 48 ]. Cell deconvolution was done with MCP Counter v1.2.0 [ 18 ]. 1 mm cores of tumour tissue were taken for DNA sequencing at the same time as cores were taken for TMA construction. Cores for sequencing were taken last to minimise DNA contamination from the previous case (ie typically after 12 passes of the coring needle through the case to be sequenced). DNA was extracted from tumour cores using the QIAmp FFPE DNA tissue kit (Qiagen) according to the manufacturer’s instructions. Recovered DNA was subsequently assessed for quantity and quality (DIN) using an Agilent Tapestation 4200 with Genomic screen tape. Where possible, 10 ng (but not less than 2 ng) of DNA was then sequenced using an ISO 15189 accredited custom Ampliseq based assay (MGP-4-DNA, Sarah Cannon Molecular Diagnostics). Library preparation was performed manually with templating and chip loading on an Ion ChefTM and sequencing on an Ion GeneStudioTM S5 Prime. Data analysis was undertaken using a hybrid bioinformatics pipeline comprising the native Torrent Suite (5.10.2) Variant Caller (5.10.1.20) combined with in-house developed applications for error checking, quality control, variant ‘binning’ and HGVS nomenclature construction. The assay was validated with a limit of detection for SNVs and small indels of 2.5% VAF in hotspot regions from > 30 genes. The absolute VAF threshold for reporting variants as detected was adjusted upwards as required in many of the older specimens in order to allow for extensive ‘fixation’ associated background noise. The assay was also validated for the detection of copy number gains in 6 genes (EGFR, ERBB2, KRAS, KIT, MET & PIK3CA) in specimens with greater than 20% tumour cellularity. Samples which did not yield full panel coverage or which failed to meet other minimum quality control criteria were excluded. Author Contributions LF and KR designed and executed the experiments, and prepared the manuscript. PH provided the GRI cohort images and contributed to their analysis. SS helped with the analysis of the LATTICeA-Hallmarks images. LSS preprocessed and prepared the gene expression data for the GRI cohort. IRP designed the panels and FB, RLB, SM, and LOJ performed the staining and imaging for the LATTICeA images. NM provided the histological assessment for the GRI cohort. CJM provided computational resources and feedback on experimental design. DC provided helpful feedback in the development of the method and production of the manuscript. JE and CR supervised the production of the GRI cohort. EWR supervised the experiments and provided feedback on experimental design and manuscript production. JLQ and KY designed the experiments, edited the manuscript, and jointly supervised the research. Data and Code Availability Statement The codebase used to produce the results in this manuscript are github.com/lucasfarndale/MultiplexSSL . data will be made available on publication. Funding Statement LF is supported by the MRC, United Kingdom grant MR/W006804/1. KR is supported by a Jean Shanks Foundation and Pathological Society of Great Britain clinical PhD fellowship. The JLQ lab (SM, RLB, FB, IRP, JLQ) is funded by the Mazumdar-Shaw Chair Endowment. LOJ is funded by the CRUK Scotland Centre CTRQQR-2021 \ 100006. The JE lab (JE, PH, LSS) is funded by CRUK (CTRQQR-202 \ 100006) and Innovate UK (42497). CJM acknowledges support a core programme award to CJM (A29801) and NC3Rs DA3RT grant (APP51238). EWR acknowledges CRUK core funding (A1920). EWR and KY are supported by funding from Prostate Cancer UK (MA-TIA22-001). KY acknowledges support from Cancer Research UK (EDDPGM-Nov21 100001 and DRCMDP-Nov23 100010), BBSRC BB V016067 1, and EU Horizon 2020 grant ID: 101016851. This work was supported by Cancer Research UK core funding to the Scotland Institute (A31287). Competing Interest Statement DC, JLQ, and KY are co-founders and shareholders of TileBio Ltd. Acknowledgements The authors would like to extend their gratitude to: Catherine Winchester, CRUK Scotland Institute, for critically appraising this manuscript; Naveed Khan, CRUK Scotland Institute, for management of the high performance computing facility; the NHS Research Scotland (NRS) Greater Glasgow and Clyde Biorepository and Glasgow Tissue Research Facility for their ongoing management of the LATTICeA and GRI cohorts; the CRUK Scotland Institute Deep Phenotyping Advanced Technology Core Facility (Research Resource Identifier [RRID]: SCR 027366) for the preparation of the images; the Academic Unit of Surgery at the Glasgow Royal Infirmary for the clinical database management; Claire Smith and Marco Sereno from the University of Leicester for LATTICeA data collection and construction; and Philip Bennett from Sarah Cannon Molecular Diagnostics, Part of HCA Healthcare UK for the preparation of the LATTICeA genomic dataset. Funder Information Declared Medical Research Council, https://ror.org/03x94j517 , MR/W006804/1 Cancer Research UK, https://ror.org/054225q67 , CTRQQR-2021\100006 , CTRQQR-202\100006 , A29801 , A1920 , EDDPGM-Nov21 100001 , DRCMDP-Nov23 100010 Innovate UK, https://ror.org/05ar5fy68 , 42497 NC3Rs DA3RT , APP51238 Prostate Cancer UK, https://ror.org/04dkv6329 , MA-TIA22-001 Biotechnology and Biological Sciences Research Council, https://ror.org/00cwqg982 , BB V016067 1 EU Horizon 2020 , 101016851 Cancer Research UK, https://ror.org/054225q67 , A31287 Footnotes https://github.com/lucasfarndale/MultiplexSSL/ References [1]. ↵ A. T. Shaw et al. Crizotinib in ROS1-Rearranged Non–Small-Cell Lung Cancer . New England Journal of Medicine , 371 ( 21 ): 1963 – 1971 , November 2014 . OpenUrl CrossRef PubMed [2]. ↵ M. Reck et al. Pembrolizumab versus Chemotherapy for PD-L1–Positive Non–Small-Cell Lung Cancer . New England Journal of Medicine , 375 ( 19 ): 1823 – 1833 , November 2016 . OpenUrl CrossRef PubMed [3]. ↵ E. L. Kwak et al. Anaplastic Lymphoma Kinase Inhibition in Non–Small-Cell Lung Cancer . New England Journal of Medicine , 363 ( 18 ), October 2010 . [4]. ↵ H. Hampel et al. Screening for the Lynch Syndrome (Hereditary Nonpolyposis Colorectal Cancer) . New England Journal of Medicine , 352 ( 18 ): 1851 – 1860 , May 2005 . OpenUrl CrossRef PubMed Web of Science [5]. ↵ A. Passaro et al. Cancer biomarkers: Emerging trends and clinical implications for personalized treatment . Cell , 187 ( 7 ): 1617 – 1635 , March 2024 . Publisher: Elsevier . OpenUrl CrossRef PubMed [6]. ↵ D. Jiménez-Sánchez et al. Weakly supervised deep learning to predict recurrence in lowgrade endometrial cancer from multiplexed immunofluorescence images . NPJ Digital Medicine , 6 ( 1 ): 48 , 2023 . OpenUrl PubMed [7]. ↵ M. Harary et al. Fluoroformer: Scaling multiple instance learning to multiplexed images via attention-based channel fusion . arXiv preprint arXiv: 2411.08975 , 2024 . [8]. ↵ R. J. Chen et al. Towards a general-purpose foundation model for computational pathology . Nature Medicine , 30 ( 3 ): 850 – 862 , March 2024 . Publisher: Nature Publishing Group. OpenUrl CrossRef PubMed [9]. ↵ X. Wang et al. Transformer-based unsupervised contrastive learning for histopathological image classification . Medical Image Analysis , 81 : 102559 , October 2022 . OpenUrl CrossRef PubMed [10]. ↵ E. Vorontsov et al. A foundation model for clinical-grade computational pathology and rare cancers detection . Nature Medicine , pp. 1 – 12 , July 2024 . Publisher: Nature Publishing Group. [11]. ↵ H. Xu et al. A whole-slide foundation model for digital pathology from real-world data . Nature , 630 ( 8015 ): 181 – 188 , June 2024 . Publisher: Nature Publishing Group. OpenUrl CrossRef PubMed [12]. ↵ E. Wu et al. Rosie: Ai generation of multiplex immunofluorescence staining from histopathology images . Nature Communications , 16 ( 1 ): 1 – 13 , 2025 . OpenUrl PubMed [13]. ↵ J. Tan et al. Characterization of tumour heterogeneity through segmentation-free representation learning on multiplexed imaging data . Nature Biomedical Engineering , pp. 1 – 15 , 2025 . [14]. ↵ G. Atarsaikhan et al. Self-supervised learning enables unbiased patient characterization from multiplexed cancer tissue microscopy images . bioRxiv , pp. 2025 – 03 , 2025 . [15]. ↵ J. Wenckstern et al. Ai-powered virtual tissues from spatial proteomics for clinical diagnostics and biomedical discovery . arXiv preprint arXiv: 2501.06039 , 2025 . [16]. ↵ D. A. Moore et al. In situ growth in early lung adenocarcinoma may represent precursor growth or invasive clone outgrowth—a clinically relevant distinction . 32 ( 8 ): 1095 – 1105 . [17]. ↵ P. Hatthakarnkul et al. Protein expression of s100a2 reveals it association with patient prognosis and immune infiltration profile in colorectal cancer . Journal of Cancer , 14 ( 10 ): 1837 , 2023 . OpenUrl PubMed [18]. ↵ E. Becht et al. Estimating the population abundance of tissue-infiltrating immune and stromal cell populations using gene expression . Genome Biology , 17 ( 1 ): 218 , October 2016 . OpenUrl CrossRef PubMed [19]. ↵ J. Guinney et al. The consensus molecular subtypes of colorectal cancer . Nature Medicine , 21 ( 11 ): 1350 – 1356 , November 2015 . Publisher: Nature Publishing Group. OpenUrl CrossRef PubMed [20]. ↵ D. E. Johnson et al. Targeting the il-6/jak/stat3 signalling axis in cancer . Nature reviews Clinical oncology , 15 ( 4 ): 234 – 248 , 2018 . OpenUrl PubMed [21]. ↵ K. A. Pennel et al. Jak/stat3 represents a therapeutic target for colorectal cancer patients with stromal-rich tumors . Journal of Experimental & Clinical Cancer Research , 43 ( 1 ): 64 , 2024 . OpenUrl CrossRef PubMed [22]. ↵ P. Hatthakarnkul et al. The characteristic of epithelial-specific phenotypes and immunosuppressive microenvironment in the context of tumour budding in colorectal cancer . bioRxiv , pp. 2025 – 03 , 2025 . [23]. ↵ S. Lu et al. Comparison of Biomarker Modalities for Predicting Response to PD-1/PD-L1 Checkpoint Blockade: A Systematic Review and Metaanalysis . JAMA Oncology , 5 ( 8 ): 1195 – 1204 , August 2019 . OpenUrl PubMed [24]. ↵ N. A. Rizvi et al. Cancer immunology. Mutational landscape determines sensitivity to PD-1 blockade in non-small cell lung cancer . Science (New York, N.Y .), 348 ( 6230 ): 124 – 128 , April 2015 . OpenUrl [25]. ↵ F. Ghiringhelli et al. Immunoscore immune checkpoint using spatial quantitative analysis of CD8 and PD-l1 markers is predictive of the efficacy of anti-PD1/PD-l1 immunotherapy in non-small cell lung cancer . 92 : 104633 . [26]. ↵ J. Monkman et al. Spatial insights into immunotherapy response in non-small cell lung cancer (NSCLC) by multiplexed tissue imaging . 22 ( 1 ): 239 . [27]. ↵ D. Dejardin et al. A composite decision rule of CD8+ t-cell density in tumor biopsies predicts efficacy in early-stage, immunotherapy trials . 30 ( 4 ): 877 – 882 . [28]. ↵ J. Wagner et al. Overexpression of the novel senescence marker ε-galactosidase (glb1) in prostate cancer predicts reduced psa recurrence . PloS one , 10 ( 4 ): e0124366. 2015. [29]. ↵ M. Ilic et al. Overexpression of Senescence-Associated Beta-Galactosidase (SA-B-GAL) as a Prognostic Marker of Invasive Breast Carcinoma . EABR. Experimental and Applied Biomedical Research , 26 ( 1 ): 39 – 52 , January 2025 . OpenUrl [30]. ↵ C. L. Cotarelo et al. Detection of Cellular Senescence Reveals the Existence of Senescent Tumor Cells within Invasive Breast Carcinomas and Related Metastases . Cancers , 15 ( 6 ): 1860 , March 2023 . OpenUrl PubMed [31]. ↵ M. L. B. Jr et al. Persistence of senescent prostate cancer cells following prolonged neoadjuvant androgen deprivation therapy . PLOS ONE , 12 ( 2 ): e0172048 , February 2017 . Publisher: Public Library of Science. OpenUrl PubMed [32]. ↵ A. M. Haugstetter et al. Cellular senescence predicts treatment outcome in metastasised colorectal cancer . British Journal of Cancer , 103 ( 4 ): 505 – 509 , August 2010 . Publisher: Nature Publishing Group. OpenUrl CrossRef PubMed [33]. ↵ S. Van der Walt et al. scikit-image: image processing in python . PeerJ , 2 : e453 , 2014 . OpenUrl CrossRef PubMed [34]. ↵ K. He et al. Deep residual learning for image recognition . In Proceedings of the IEEE conference on computer vision and pattern recognition , pp. 770 – 778 , 2016 . [35]. ↵ A. Bardes et al. Vicreg: Variance-invariance-covariance regularization for self-supervised learning . arXiv preprint arXiv: 2105.04906 , 2021 . [36]. ↵ A. Kondepudi et al. Foundation models for fast, label-free detection of glioma infiltration . Nature , pp. 1 – 7 , 2024 . [37]. ↵ R. Balestriero and Y. LeCun . Learning by reconstruction produces uninformative features for perception . arXiv preprint arXiv: 2402.11337 , 2024 . [38]. ↵ L. Farndale et al. Trident: Triple deep network training for privileged knowledge distillation in histopathology . arXiv preprint arXiv: 2312.02111 , 2023 . [39]. ↵ L. Farndale et al. Synthetic privileged information enhances medical image representation learning . arXiv preprint arXiv: 2403.05220 , 2024 . [40]. ↵ L. Farndale et al. Divide and conquer self-supervised learning for high-content imaging . arXiv preprint arXiv: 2503.07444 , 2025 . [41]. ↵ F. Pedregosa et al. Scikit-learn: Machine Learning in Python . Journal of Machine Learning Research , 12 : 2825 – 2830 , 2011 . OpenUrl [42]. ↵ G. Palla et al. Squidpy: a scalable frame-work for spatial omics analysis . Nature methods , 19 ( 2 ): 171 – 178 , 2022 . OpenUrl PubMed [43]. ↵ M. Varrone et al. Cellcharter reveals spatial cell niches associated with tissue remodeling and cell plasticity . Nature genetics , 56 ( 1 ): 74 – 84 , 2024 . OpenUrl CrossRef PubMed [44]. ↵ H. L. Williams et al. Spatial resolution of transcriptomic plasticity states underpinning lethal morphologies in lung adenocarcinoma , June 2024 . Pages: 2024.06.11.598228 Section: New Results. [45]. ↵ M. I. Love et al. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2 . Genome Biology , 15 ( 12 ): 550 , December 2014 . OpenUrl CrossRef PubMed [46]. ↵ G. Korotkevich et al. Fast gene set enrichment analysis , February 2021 . Pages: 060012 Section: New Results. [47]. ↵ S. Hänzelmann et al. GSVA: gene set variation analysis for microarray and RNA-Seq data . BMC Bioinformatics , 14 ( 1 ): 7 , January 2013 . OpenUrl CrossRef PubMed [48]. ↵ M. J. Goldman et al. Visualizing and interpreting cancer genomics data via the Xena platform . Nature Biotechnology , 38 ( 6 ): 675 – 678 , June 2020 . Publisher: Nature Publishing Group. OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 16, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Self-Supervised AI Reveals a Hidden Landscape of Prognostic Spatial Patterns in Multiplex Immunofluorescence Images Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Self-Supervised AI Reveals a Hidden Landscape of Prognostic Spatial Patterns in Multiplex Immunofluorescence Images Lucas Farndale , Kai Rakovic , Phimmada Hatthakarnkul , Sona Szalma , Leonor Schubert Santana , Silvia Martinelli , Fiona Ballantyne , Rachel L Baird , Ian R Powley , Leah Officer-Jones , Noori Maka , Campbell Roxburgh , Crispin J Miller , David Chang , Joanne Edwards , Edward W Roberts , John Le Quesne , Ke Yuan bioRxiv 2025.10.16.682563; doi: https://doi.org/10.1101/2025.10.16.682563 Share This Article: Copy Citation Tools Self-Supervised AI Reveals a Hidden Landscape of Prognostic Spatial Patterns in Multiplex Immunofluorescence Images Lucas Farndale , Kai Rakovic , Phimmada Hatthakarnkul , Sona Szalma , Leonor Schubert Santana , Silvia Martinelli , Fiona Ballantyne , Rachel L Baird , Ian R Powley , Leah Officer-Jones , Noori Maka , Campbell Roxburgh , Crispin J Miller , David Chang , Joanne Edwards , Edward W Roberts , John Le Quesne , Ke Yuan bioRxiv 2025.10.16.682563; doi: https://doi.org/10.1101/2025.10.16.682563 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Cancer Biology Subject Areas All Articles Animal Behavior and Cognition (7622) Biochemistry (17648) Bioengineering (13870) Bioinformatics (41880) Biophysics (21423) Cancer Biology (18553) Cell Biology (25458) Clinical Trials (138) Developmental Biology (13364) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15589) Genomics (22475) Immunology (17711) Microbiology (40326) Molecular Biology (17145) Neuroscience (88471) Paleontology (666) Pathology (2826) Pharmacology and Toxicology (4815) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Outcome instruments

NRS-pain

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-21T05:10:58.409756+00:00

License: CC-BY-4.0