Full text
51,214 characters
· extracted from
preprint-html
· click to expand
Systematic Review of Hybrid Vision Transformer Architectures for Radiological Image Analysis | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Systematic Review of Hybrid Vision Transformer Architectures for Radiological Image Analysis Ji Woong Kim , Aisha Urooj Khan , Imon Banerjee doi: https://doi.org/10.1101/2024.06.21.24309265 Ji Woong Kim 1 Arizona State University , USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aisha Urooj Khan 2 Mayo Clinic , USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Imon Banerjee 2 Mayo Clinic , USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: ibanerj7{at}asu.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background Vision Transformer (ViT) and Convolutional Neural Networks (CNNs) each possess distinct strengths in medical imaging: ViT excels in capturing long-range dependencies through self-attention, while CNNs are adept at extracting local features via spatial convolution filters. However, ViT may struggle with detailed local spatial information, critical for tasks like anomaly detection in medical imaging, while shallow CNNs may not effectively abstract global context. Objective This study aims to explore and evaluate hybrid architectures that integrate ViT and CNN to lever-age their complementary strengths for enhanced performance in medical vision tasks, such as segmentation, classification, and prediction. Methods Following PRISMA guidelines, a systematic review was conducted on 28 articles published between 2020 and 2023. These articles proposed hybrid ViT-CNN architectures specifically for medical imaging tasks in radiology. The review focused on analyzing architectural variations, merging strategies between ViT and CNN, innovative applications of ViT, and efficiency metrics including parameters, inference time (GFlops), and performance benchmarks. Results The review identified that integrating ViT and CNN can mitigate the limitations of each architecture, offering comprehensive solutions that combine global context understanding with precise local feature extraction. We benchmarked the articles based on architectural variations, merging strategies, innovative uses of ViT, and efficiency metrics (number of parameters, inference time(GFlops), performance). Conclusion By synthesizing current literature, this review defines fundamental concepts of hybrid vision transformers and highlights emerging trends in the field. It provides a clear direction for future research aimed at optimizing the integration of ViT and CNN for effective utilization in medical imaging, contributing to advancements in diagnostic accuracy and image analysis. Summary Statement We performed systematic review of hybrid vision transformer architecture using PRISMA guideline and perfromed through meta-analysis to benchmark the architectures. ACM Reference Format Ji Woong Kim, Aisha Urooj Khan, and Imon Banerjee. 2018. Systematic Review of Hybrid Vision Transformer Architectures for Radiological Image Analysis. J. ACM 37, 4, Article 111 (August 2018), 16 pages. https://doi.org/XXXXXXX.XXXXXXX 1 INTRODUCTION Convolutions Neural Networks (CNN) were capable of learning inductive bias and were the state-of-the-art for many medical imaging applications [ 2 ]. Recently, vision Transformer has been adopted in a lot of applications in medical imaging domain and demonstrated comparative performance [ 12 , 25 , 34 ]. Since, the transformer has strength in capturing the long-range dependencies by using self-attention mechanism it has shown a great performance for complex natural language processing tasks[ 17 , 42 ]. To identify small anomalies in anatomical imaging, the local correlation among the neighboring pixels also matters in addition to the long-range dependencies [ 40 ]. CNN architecture is known for being capable to capture such local dependencies[ 15 , 36 , 37 ]. Thus the Vision Transformer and CNN have complimentary strengths to process medical imaging for various applications, e.g. segmentation, classification, prediction. Therefore, many works have been attempting to combine these two architectures to have strength to capture both global and local contents of the images as hybrid concept architectures[ 22 ]. While CNN with the help of spatial convolution filters can primarily learn local features, shallow CNN networks with less layers often struggle to understand the global context in the image given the limitation of abstraction. In contrast, ViT learns the long-range dependencies via self-attention between the image patches to understand the global context while the patch based positional encoding mechanism may miss relevant local spatial information and it usually cannot attain the performance of CNNs on small-scale dataset. This limitation of ViT has been highlighted in multiple recent studies in particularly radiology domain where the findings are minute and contain within a small spatial location [ 24 , 29 ]. Combining CNN and ViT in a hybrid modeling architecture can overcome the limitations of both and ultimately provide an opportunity to learn the global and local spatial context in an end-to-end model. Given many attempts to embrace strength of both ViT and CNN in radiology domain in a single end-to-end framework, we conducted the systematic review to understand various trends of how the Hybrid Vision Transformer (CNN+ViT) was used to address image processing challenges in radiology and create a comprehensive benchmarking to guide future development. Shamshad et. al. [ 34 ] recently published a comprehensive survey on transformer applications for medical images; however the survey was primarily focused on the broad transformer architectures and only briefly mentioned the hybrid structures’ pros and cons. while we primarily focus on benchmarking the hybrid architectures for radiological images. Based on the PRISMA guideline, we have selected 28 published articles for full text review that proposed novel hybrid architecture for medical vision tasks. We performed three-level meta-analysis (a) overall architectural variations for the design of the hybrid CNN and ViT; (ii) merging strategies between CNN and ViT; (iii) purpose of innovative ViT usages; and (iv) design efficiency of the architectures in terms of number of parameters and inference time efficiency. To the best of our knowledge, there exist no systematic review paper that focuses on the hybrid architectures that combine ViT and CNN for the usage in radiology domain. While there exists survey papers that analyze ViT or CNN usage separately in medical imaging domain [ 34 ], we particularly focused on how the ViT is combined with other modules as a hybrid vision transformer architecture and applied to analyze varying scale features in radiology images. Through the meta-analysis, we not only analyzed and summarized the contents of the published literature, our also paper defines fundamental concepts of the hybrid vision transformer to make clear pathway for future research in this area. 2 METHOD 2.1 Study Selection Figure 1 outlines the search strategy. Initially, papers for review were identified from Google Scholar, encompassing all articles published between Jan, 2020 and Aug, 2023. The first screening pass identified papers pertaining to six keywords [(‘Vision transformer’, ‘ViT’, ‘Hybrid ViT’) AND (‘Radiology’, ‘Image Analysis’, ‘Modeling’)]. Papers were excluded based on: (i) publication year before 2020 and after 2023 Aug, (ii) not peer-reviewed, and (iii) proposed usage not related to radiology. Download figure Open in new tab Fig. 1. Architecture variations-(a) parallel and (b) sequential Download figure Open in new tab Fig. 2. Merging Strategies variations - (a) Feature reshaping, (b) Positional encoding, and (c) Fusing Module. To properly filter papers, we parsed the methods and experiments sections to determine the architecture and usage. We selected only architectures that combine self-attention or transformer modules in visual feature extraction and integrate with CNN modules. Vanilla ViT also proposed CNN embedding for input patches [ 28 ], thus we defined hybrid ViT as the Vision Transformer architecture that integrates CNN more extensively in end-to-end learning than the variant of vanilla ViT. Additionally, hybrid transformer architectures that used transformer modules solely for text feature extraction were excluded, as we focused on transformer usage for visual radiologic features. Finally, papers that only concatenated the output of two architectures without proposing any new design modifications were excluded. Subsequently, three independent reviewers, JK, AU, and IB, scanned the retrieved papers and selected relevant papers based on predefined inclusion and exclusion criteria. Conflicts were resolved by majority voting. 2.2 Meta-Analysis We primarily focused on four meta analysis factors as described below. 2.1.1 Architecture We categorized the existing hybrid architectures into parallel and sequential design ( Fig. 1 ). Parallel design includes the one where CNN and ViT modules used in-parallel to cooperatively learn the feature representation from the data and various intermediate fusion functions is generally used to increase the co-understanding between the modules, such as cross-attention. Note that both modules are parsing the data in the same level of granularity and have less dependence on each other. In contrast, CNN and ViT are used in sequence in sequential design where output of one module is directly passed onto others. One module is used for initial feature extraction and other module for generating an abstract view based on previous module interpretation. Therefore, the dependency between the CNN and ViT modules is much higher in the sequential design. 2.2.2 Merging Strategy Based on the architectural variations (sequential or parallel), various strategies have been employed to merge the CNN and ViT to effectively utilize their respective strengths. We have identified three broad techniques for merging ViT and CNN: (i) feature reshaping this is extremely well adopted technique for the sequential architecture where output of one module is resized using simple linear function (such as flatting) to be fed into the other. For example, given that ViT only parse sequential input tokens, CNN feature maps are flattened into sequential tokens if the output is passed to ViT; (ii) positional encoding - Since flattened feature maps lose spatial information, often architecture that use feature reshaping also include positional encoding to capture the spatial information based on the feature map; (iii) fusing module - for the parallel design where both of the modules are co-learning together, most of the literature used linear combination of ViT and CNN output to create a fusion between these two modules or use a different model for learning the fusion parameters. 2.2.3 Transformer utilization Given the usage of transformer varied widely between the hybrid models, we also categorized the hybrid model based on the final usage of the transformer - (i) Encoder - when the transformer is used to embed the raw or processed data. In other words generate a compressed representation of input; (ii) Decoder - when the transformer is used for the generation of reconstruction or interpretation. 2.2.4 Application We observed that hybrid transformer architectures are adopted in various problems of computer vision for healthcare, starting from classic tasks like classification, segmentation, reconstruction, registration to regression, synthesis, view combination, and text generation. We aimed to group the designs based on the architecture to better highlight the design choices and metric for performance evaluation. 2.3 Benchmarking criteria We used five criteria to benchmark these models which are centered around understanding the utility and efficiency of the hybrid models. 2.3.1. Modality Hybrid models are primary developed to deal with high dimensional spatial data (e.g. 2D, 3D, 4D) to process them in computationally efficient way through transformer while keeping both local and global spatial context. Therefore, we consider the imaging modality (2D - X-ray, 2D + time - Ultrasound (US), 3D - Magnetic Resonance (MR), Computed Tomography (CT), Positron Emission Tomography (PET)) as a primary benchmarking criteria which is ideally a proxy to represent the data dimensionality. 2.3.1. Model size We measured the model size by the number of trainable parameters. In theory, as the number of trainable parameters increases, so does the need for more training samples [ 11 ], which limits the large model’s applicability for interesting clinical use cases due to the lack of training data and manual annotation in the clinical domain. However, the number of training parameters also depends on the input data dimension. Therefore, we defined an efficient hybrid design as a model that can handle high dimensional input data with fewer training parameters. We primarily record the number of trainable parameters either from the published paper (if documented) or by loading the model summary in python, if the model is supplied by the authors with academic open-source license. 2.3.3. Computing efficiency To measure the computing efficiency, we used the flops/image as a benchmarking creteria to demonstrate feasibility of applying the algorithm real time. Flops denotes the number of floating point operations performed to run the inference on a single image. Given the assumption that the hardware configurations may vary, flops shows a standardize hardware independent measure of the algorithmic efficiency of the hybrid model. Though, we calculated and tested all the models on 4 A100 GPU cores with 32GB memory. Similar to model size, we rely on the published manuscript for the flops if documented or calculated the flops by doing inference on a single random input image generated based on the dimension specified by the authors. 2.3.4. Training data size Though training sample size is not directly related with the hybrid model design, we incorporated it as a benchmarking creteria to highlight task and modality specific trend of the targeted use cases and how it ultimately affects the model performance. We directly capture the number of training samples from the reported documentation by the original authors. If multiple training setting was mentioned in the paper, we only selected the largest cohort setting and report the number of total exams included. 2.3.5. Performance Based on the applications (Sec. 2.2.4), we ultimately benchmark the reported performance. Given the issue of not availability of the shared codebase or a common dataset, we only documented the reported performance by the original authors on distinct datasets. However, the performance benchmark highlights overall performance for a task based on a standard task-specific metric, e.g. Dice for segmentation accuracy, AUC for classification prediction true positive and false positive trade-off at different probability threshold (F1 if AUC is not reported), Structural Similarity Index (SSIM) for reconstruction and registration quality compared to the input data, and Bleu1 score for evaluating the quality of text generation compared to the original reference text. 3 RESULTS In Fig. 3 , we present the PRISMA diagram with total articles included and excluded at each step of filtering. Finally, within the scope of this survey, we analyzed 28 articles that satisfied all our inclusion criteria. Table 1 shows the meta analysis results and Table 2 benchmarking according to the defined meta analysis and benchmarking criteria in Sec. 2.2 and Sec. 2.3 respectively. View this table: View inline View popup Table 1. Meta-Analysis of the existing hybrid architectures performed based on the criteria defined in Sec.2.2. Studies are grouped based on the targeted applications. ‘Transf.’, ‘Util.’ refers to Transformer and Utility respectively. View this table: View inline View popup Table 2. Benchmark Table. See Sec. 2.3 for benchmarking creteria definition. Inaccessibility of the models are marked as ‘–’ which restrict to calculate the benchmarking creteria. Download figure Open in new tab Fig. 3. PRISMA study selection diagram for Hybrid Vision Transformer in Radiology Meta-analysis: Architecture Despite parallel architecture enhances cooperative learning between CNN and ViT, as simplistic design option, most hybrid architectures follow sequential structure (25 out of 28) where the output of the CNN feature extractor block is fed into ViT for generating compressed representation ( Table 1 ). Particularly, the hybrid models proposed for segmentation mostly (11 out of 12) follow the U shape architecture based on U-Net, and transformer with self-attention module is included between encoder and decoder branch within the U-Net shape backbone. For the classification, restoration and reconstruction, similar sequential architecture idea has been adopted where CNNs are used as feature extractor and feature maps are flattened and fed into transformer with positional encoding which either calculated in the feature space [ 13 ] or referred to the image space using learnable parameters [ 40 ]. After the transformer encoding, CNN decoder is included based on the image generation task requirement, e.g. registration, reconstruction. Parallel architectures were particularly proposed to conserve same level features using both CNN and transformer; however fusing the feature spaces at multiple level is a challenging problem and needs innovative measure. For segmentation, CPT U-Net [ 48 ] adopted the CNN with parallel branch with ViT and fuse the each feature maps from parallel branches. CPT U-Net shows that performance also varies depending on the modification of fusing module. For the parallel classification paradigm, TECNN [ 1 ] utilizes parallel architecture using transfomer pathway where the CNN feature maps are flattened at each step and fed into the transformer, and ultimately merge module combines both feature space and compute classification score using softmax. As parallel architecture for high quality image restoration, TransCT decomposed the images into high (HF) and low (LF) frequency component and used CNN to parse LF and transformer to parse HF. To combine the transformer and CNN features, they again utilize a simplistic ResNET (2 Conv Layers) to get the final output. Meta-analysis: Merging Strategy As also highlighted by the dominance of sequential architectures (25 out of 28), irrespective of the downstream application, most of existing hybrid architectures [ 13 , 15 , 27 , 35 , 40 ] leverage the long-context learning ability of the transformers after calculating feature maps from CNN by reshaping the feature maps and by adding the positional encoding. Separate fusing modules are primarily leveraged by the parallel architectures where either linear merging or another model is used for fusing the features from transformer and CNN [ 1 , 39 ]. Interstingly, being a sequential network, DTN [ 46 ] simultaneously leverage the feature reshaping for handling exchange between CNN and transformer, and fusion module for aggregation of two parallel transformer blocks for temporal image frames. TransMorph [ 4 ] designed an unique strategy which we grouped under the broad category of feature reshaping, though they developed a patch merging module to reshape and align the features between the 3D Swin transformer blocks in the encoder and the feature maps generated at each resolution are sent into a ConvNet decoder to produce an output. Meta-analysis: Transformer utilization Transformers are solely used for additional compression of the bottleneck features to capture the global context - out 18 of 28 studies. Interestingly, 7 studies applied transformer for both encoding and decoding purposes - particularly for target image generation tasks, such as segmentation, registration, restoration. Image to generation models (R2GEN [ 7 ]) primarily use transformer for the language generation task where CNN is being used as a image feature extractor. A compelling use of transformer is observed in 3D transformer GAN which devices a 12 layers transformer network between the encoderCNN and decoderCNN where 6 transformer layers performs encoding and 6 layers performs decoding before feeding into the CNN decoder block. The primary different from the original transformer, the transformer block applies parallel decoding to achieve parallel sequence prediction that is reshaped and processed by 1×1×1 convolution. Meta-analysis: Application The hybrid architectures are adopted for all fundamental medical image analysis tasks, e.g. segmentation (12), classification (2), reconstruction (4), registration (3). Additionally, we observed some innovative applications of the hybrid architectures. For example, ResViT [ 10 ] leveraged the contextual sensitivity of vision transformers along with the precision of convolution operators to generate missing multi-Contrast MR series and compared against state-of-art convolution only GAN based models and showed that hybrid methods outperformed. TransCT [ 47 ] improvise a to enhance the final CT image quality from a low dose CT images by using content features from transformer and latent texture features from CNN. Multi-View ViT [ 38 ] designed a cross-view transformer to transfer information between unregistered image views at the level of spatial feature maps, and validated for mammogram and chest X-ray. Benchmarking Based on the five benchmarking criteria described in Sec. 2.3, we compared the existing 28 architectures. We observed that most the hybrid architectures (23/28) are applied for the high dimensional imaging modalities, such as MR, CT, PET. This deign choice could be influenced by the capability of the transformer-CNN hybrid to digest both local and global context from a high dimensional image space and generate a denser representation compared to CNN only encoder. Highest number of trainable parameters (346.8M) is observed in Ultrasound ViT[ 33 ] that process variable length echo cardiogram videos since they utilized 16 parallel BERT encoders for spatio-temporal reasoning and two parallel regression tasks where they used ResNetAE to distil the US frames into smaller dimension embedding (1024D) and the resulting embeddings are stacked for the clip, and BERT encoders are used to process variable length videos. T2Net[ 14 ] contains lowest number of trainable parameters (1.4M) where they proposed a multitask learning framework where shared parallel network backbone, leveraging knowledge from one task to speedup the learning of the other and increase flexibility for sharing complimentary features. Only CNN based model, such as DenseNet121 has 8.1M and ResNet-50 is approximately 25.6M, which shows the fact the hybrid architecture often does not increase the trainable parameter set as it helps to generate dense representation. Theoretically, light-weight models with less parameters are faster during inference with less number of floating point operations; While T2Net has the lowest parameters, Multi-view ViT [ 38 ] for chest X-ray has the fastest inference speed which could be based on the fact that it process 2D compress X-ray images. Inference speed for segmentation (TransUNet [ 6 ]) and reconstruction (Transmorph [ 4 ]) models is often higher than the classification due to their pixel or patch-wise processing strategies. Following the similar trend as CNN, hybrid models are often trained and validated on open-source datasets, e.g. BraTS [ 16 ], Figshare [ 8 ], KiTS21 [ 20 ], TCIA [ 9 ], Synapse, with limited samples of high dimensional medical images, and obtained current state-of-the-art performance for all major medical image analysis tasks by outperforming its CNN or transformer only counterparts. This trends for all major medical image analysis tasks shows the benefit of designing hybrid architectures by combining both CNN and transformers. 4 DISCUSSION Following PRISMA guideline, we performed a systematic review for the 28 hybrid CNN and transformer architectures for radiological image analysis task. The diverse roles of transformers, from encoding raw data to decoding generated outputs, highlight their versatility in medical image analysis tasks. While transformers excel at capturing long-range dependencies, their integration into hybrid architectures introduces challenges in balancing computational efficiency and performance across different applications. The wide range of applications for hybrid vision transformer architectures underscores their potential to address various challenges in radiology, from segmentation and classification to reconstruction and registration. Innovative approaches, such as missing image generation and image quality enhancement, demonstrate the versatility and adaptability of these models to clinical needs. The predominance of sequential architectures in hybrid vision transformer models suggests a preference for a structured flow of information from CNN feature extraction to transformer-based representation learning. This sequential approach aligns well with the nature of medical image analysis tasks, where hierarchical feature extraction and global context understanding are crucial. The utilization of feature reshaping and positional encoding reflects efforts to bridge the gap between CNN and transformer representations, enabling effective fusion of spatial and contextual information. While sequential architectures predominantly rely on these strategies, parallel architectures explore alternative fusion methods, emphasizing the need for further research into optimal merging techniques. However, challenges remain in ensuring robustness, interpretability, and scalability for real-world applications. Further validation on diverse datasets and clinical scenarios is essential to establish their efficacy and reliability. While hybrid architectures offer advanced capabilities in feature representation and context modeling, their computational demands pose practical challenges, particularly in resource-constrained clinical environments. Optimizing model size, training efficiency, and inference speed is crucial for facilitating widespread adoption and deployment in clinical settings. Moreover, the reliance of hybrid architectures on large-scale annotated datasets raises concerns about data availability and quality, especially for rare diseases and specialized imaging modalities. Addressing data scarcity through data augmentation, transfer learning, and domain adaptation techniques is critical for generalizing model performance across diverse clinical scenarios. Future research should explore novel hybrid architectures that leverage the complementary strengths of CNNs and transformers while addressing their inherent limitations. Investigating alternative fusion strategies, attention mechanisms, and network architectures could lead to more efficient and effective hybrid models tailored to specific clinical tasks and modalities. Comprehensive validation studies on diverse clinical datasets are essential to assess the generalization and robustness of hybrid vision transformer models across different imaging modalities and pathologies. Collaborations between clinicians, radiologists, and machine learning researchers are crucial for co-designing and evaluating models that meet clinical needs and standards. Use of distinct private datasets for evaluation and reporting their performance using different metrics makes the comparison between the models extremely challenging. Standardized benchmarking criteria and open-source dataset is needed for the model understanding. We believe that our review will set a proper benchmakring framework for the hybrid models. Efforts to optimize model inference speed, memory efficiency, and hardware compatibility are essential for enabling real-time deployment of hybrid architectures in clinical workflows. Leveraging hardware accelerators, model compression techniques, and cloud-based inference services can facilitate seamless integration into existing radiology infrastructure. As hybrid vision transformer models become increasingly integrated into clinical practice, attention must be paid to ethical and regulatory considerations, including patient privacy, data security, and algorithmic transparency. By addressing challenges related to model design, computational efficiency, data availability, and ethical considerations, these models can significantly impact patient care and healthcare delivery, paving the way for a future where AI-powered radiology transforms diagnosis, treatment, and patient outcomes. Limitation Due to non-standardize performance reporting style in literature, we were unable to create a standard performance benchmark and models were also validated on distinct datasets. If the model is not available with open-source license, we were unable to calculate the inference speed and trainable parameters. We did make an effort to reach the corresponding authors but didn’t get the model access. 5 ESSENTIALS Hybrid vision transformer architectures preserves both local and global spatial dependencies in radiological images. Such architecture holds tremendous promise for advancing medical image analysis in radiology, offering superior performance, interpretability, and clinical relevance. Comprehensive validation studies on diverse clinical datasets are essential to assess the generalization and robustness of hybrid vision transformer models. Efforts to optimize model inference speed, memory efficiency, and hardware compatibility are essential for enabling real-time deployment. Data Availability All data produced in the present work are contained in the manuscript Footnotes Authors’ Contact Information: Ji Woong Kim, jkim804{at}asu.edu , Arizona State University, Tempe, Arizona, USA; Aisha Urooj Khan, urooj.aisha{at}mayo.edu ; Imon Banerjee, Banerjee.Imon{at}mayo.edu , Mayo Clinic, Phoenix, Arizona, USA. 1 https://www.synapse.org/#Synapse:syn3193805/wiki/89480 REFERENCES [1]. ↵ Mohammed Aloraini , Asma Khan , Suliman Aladhadh , Shabana Habib , Mohammed F Alsharekh , and Muhammad Islam . 2023 . Combining the Transformer and Convolution for Effective Brain Tumor Classification Using MRI Images . Applied Sciences 13 , 6 (2023), 3680 . OpenUrl [2]. ↵ Syed Muhammad Anwar , Muhammad Majid , Adnan Qayyum , Muhammad Awais , Majdi Alnowami , and Muhammad Khurram Khan . 2018 . Medical image analysis using convolutional neural networks: a review . Journal of medical systems 42 (2018), 1 – 13 . OpenUrl CrossRef [3]. Victor M Campello , Polyxeni Gkontra , Cristian Izquierdo , Carlos Martin-Isla , Alireza Sojoudi , Peter M Full , Klaus Maier-Hein , Yao Zhang , Zhiqiang He , Jun Ma , et al. 2021 . Multi-centre, multi-vendor and multi-disease cardiac segmentation: the M&Ms challenge . IEEE Transactions on Medical Imaging 40 , 12 (2021), 3543 – 3554 . OpenUrl CrossRef [4]. ↵ Junyu Chen , Eric C Frey , Yufan He , William P Segars , Ye Li , and Yong Du . 2022 . Transmorph: Transformer for unsupervised medical image registration . Medical image analysis 82 (2022), 102615 . OpenUrl [5]. Junyu Chen , Yufan He , Eric C Frey , Ye Li , and Yong Du . 2021 . Vit-v-net: Vision transformer for unsupervised volumetric medical image registration . Medical Imaging with Deep Learning (2021). [6]. ↵ Jieneng Chen , Yongyi Lu , Qihang Yu , Xiangde Luo , Ehsan Adeli , Yan Wang , L. Lu , Alan L Yuille , and Yuyin Zhou . 2021 . Transunet: Transformers make strong encoders for medical image segmentation . arXiv preprint arxiv: 2102.04306 (2021). [7]. ↵ Zhihong Chen , Yan Song , Tsung-Hui Chang , and Xiang Wan . 2020 . Generating radiology reports via memory-driven transformer . Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) (2020). [8]. ↵ Jun Cheng , Wei Huang , Shuangliang Cao , Ru Yang , Wei Yang , Zhaoqiang Yun , Zhijian Wang , and Qianjin Feng . 2015 . Enhanced performance of brain tumor classification via tumor region augmentation and partition . PloS one 10 , 10 (2015), e0140381 . OpenUrl CrossRef [9]. ↵ Kenneth Clark , Bruce Vendt , Kirk Smith , John Freymann , Justin Kirby , Paul Koppel , Stephen Moore , Stanley Phillips , David Maffitt , Michael Pringle , et al. 2013 . The Cancer Imaging Archive (TCIA): maintaining and operating a public information repository . Journal of digital imaging 26 (2013), 1045 – 1057 . OpenUrl CrossRef PubMed Web of Science [10]. ↵ Onat Dalmaz , Mahmut Yurt , and Tolga Çukur . 2022 . ResViT: Residual vision transformers for multimodal medical image synthesis . IEEE Transactions on Medical Imaging 41 , 10 (2022), 2598 – 2614 . OpenUrl [11]. ↵ Abdelhamid Djouadi , Oe. Snorrason , and Frederick D Garber . 1990 . The quality of training sample estimates of the bhattacharyya coefficient . IEEE Transactions on Pattern analysis and machine intelligence 12 , 1 (1990), 92 – 97 . OpenUrl [12]. ↵ Alexey Dosovitskiy , Lucas Beyer , Alexander Kolesnikov , Dirk Weissenborn , Xiaohua Zhai , Thomas Unterthiner , Mostafa Dehghani , Matthias Minderer , Georg Heigold , Sylvain Gelly , et al. 2020 . An image is worth 16x16 words: Transformers for image recognition at scale . International Conference on Learning Representations (2020). [13]. ↵ Muhamad Faisal , Jeremie Theddy Darmawan , Nabil Bachroin , Cries Avian , Jenq Shiou Leu , and Chia-Ti Tsai . 2023 . CheXViT: CheXNet and Vision Transformer to Multi-Label Chest X-Ray Image Classification . In 2023 IEEE International Symposium on Medical Measurements and Applications (MeMeA) . IEEE , 1 – 6 . [14]. ↵ Chun-Mei Feng , Yunlu Yan , Huazhu Fu , Li Chen , and Yong Xu . 2021 . Task transformer network for joint MRI reconstruction and super-resolution . In Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part VI 24 . Springer , 307 – 317 . [15]. ↵ Yunhe Gao , Mu Zhou , and Dimitris N Metaxas . 2021 . UTNet: a hybrid transformer architecture for medical image segmentation . In Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part III 24 . Springer , 61 – 71 . [16]. ↵ Mina Ghaffari , Arcot Sowmya , and Ruth Oliver . 2019 . Automated brain tumor segmentation using multimodal brain scans: a survey based on models submitted to the BraTS 2012–2018 challenges . IEEE reviews in biomedical engineering 13 (2019), 156 – 168 . OpenUrl [17]. ↵ Kai Han , An Xiao , Enhua Wu , Jianyuan Guo , Chunjing Xu , and Yunhe Wang . 2021 . Transformer in transformer . Advances in neural information processing systems 34 (2021), 15908 – 15919 . OpenUrl [18]. Ali Hatamizadeh , Vishwesh Nath , Yucheng Tang , Dong Yang , Holger R Roth , and Daguang Xu . 2021 . Swin unetr: Swin transformers for semantic segmentation of brain tumors in mri images . In International MICCAI Brainlesion Workshop . Springer , 272 – 284 . [19]. Ali Hatamizadeh , Yucheng Tang , Vishwesh Nath , Dong Yang , Andriy Myronenko , Bennett Landman , Holger R Roth , and Daguang Xu . 2022 . Unetr: Transformers for 3d medical image segmentation . In Proceedings of the IEEE/CVF winter conference on applications of computer vision . 574 – 584 . [20]. ↵ Nicholas Heller , Fabian Isensee , Dasha Trofimova , Resha Tejpaul , Zhongchen Zhao , Huai Chen , Lisheng Wang , Alex Golts , Daniel Khapun , Daniel Shats , et al. 2023 . The KiTS21 Challenge: Automatic segmentation of kidneys, renal tumors, and renal cysts in corticomedullary-phase CT . arXiv preprint arxiv: 2307.01984 (2023). [21]. Qiran Jia and Hai Shu . 2021 . Bitr-unet: a cnn-transformer combined network for mri brain tumor segmentation . In International MICCAI Brainlesion Workshop . Springer , 3 – 14 . [22]. ↵ Asifullah Khan , Zunaira Rauf , Anabia Sohail , Abdul Rehman , Hifsa Asif , Aqsa Asif , and Umair Farooq . 2023 . A survey of the Vision Transformers and its CNN-Transformer based Variants . arXiv preprint arxiv: 2305.09880 (2023). [23]. Yilmaz Korkmaz , Salman UH Dar , Mahmut Yurt , Muzaffer Özbey , and Tolga Cukur . 2022 . Unsupervised MRI reconstruction via zero-shot learned adversarial transformers . IEEE Transactions on Medical Imaging 41 , 7 (2022), 1747 – 1763 . OpenUrl CrossRef [24]. ↵ Chenghao Li and Chaoning Zhang . 2024 . Toward a Deeper understanding: RetNet viewed through convolution . Pattern Recognition (2024), 110625 . [25]. ↵ Jun Li , Junyu Chen , Yucheng Tang , Ce Wang , Bennett A Landman , and S Kevin Zhou . 2023 . Transforming medical imaging with Transformers? A comparative review of key properties, current progresses, and future perspectives . Medical image analysis 85 (2023), 102762 . OpenUrl [26]. Yijiang Li , Wentian Cai , Ying Gao , Chengming Li , and Xiping Hu . 2022 . More than encoder: Introducing transformer decoder to upsample . In 2022 IEEE International Conference on Bioinformatics and Biomedicine (BIBM) . IEEE , 1597 – 1602 . [27]. ↵ Yanmei Luo , Yan Wang , Chen Zu , Bo Zhan , Xi Wu , Jiliu Zhou , Dinggang Shen , and Luping Zhou . 2021 . 3D transformer-GAN for high-quality PET reconstruction . Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part VI 24 , 276 – 285 . [28]. ↵ Xiaofeng Mao , Gege Qi , Yuefeng Chen , Xiaodan Li , Ranjie Duan , Shaokai Ye , Yuan He , and Hui Xue . 2022 . Towards robust vision transformer . In Proceedings of the IEEE/CVF conference on Computer Vision and Pattern Recognition . 12042 – 12051 . [29]. ↵ Zachary R Murphy , Kesavan Venkatesh , Jeremias Sulam , and Paul H Yi . 2022 . Visual transformers and convolutional neural networks for disease classification on radiographs: a comparison of performance, sample efficiency, and hidden stratification . Radiology: Artificial Intelligence 4 , 6 (2022), e220012 . OpenUrl [30]. Jiayi Pan , Heye Zhang , Weifei Wu , Zhifan Gao , and Weiwen Wu . 2022 . Multi-domain integrative swin transformer network for sparse-view tomographic reconstruction . Patterns 3 , 6 (2022). [31]. Olivier Petit , Nicolas Thome , Clement Rambour , Loic Themyr , Toby Collins , and Luc Soler . 2021 . U-net transformer: Self and cross attention for medical image segmentation . In Machine Learning in Medical Imaging: 12th International Workshop, MLMI 2021, Held in Conjunction with MICCAI 2021, Strasbourg, France, September 27, 2021, Proceedings 12 . Springer , 267 – 276 . [32]. Pranav Rajpurkar , Jeremy Irvin , Kaylie Zhu , Brandon Yang , Hershel Mehta , Tony Duan , Daisy Ding , Aarti Bagul , Curtis Langlotz , Katie Shpanskaya , et al. 2017 . Chexnet: Radiologist-level pneumonia detection on chest x-rays with deep learning . arXiv preprint arxiv: 1711.05225 (2017). [33]. ↵ Hadrien Reynaud , Athanasios Vlontzos , Benjamin Hou , Arian Beqiri , Paul Leeson , and Bernhard Kainz . 2021 . Ultrasound Video Transformers for Cardiac Ejection Fraction Estimation . Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part VI 24 (2021), 495 – 505 . [34]. ↵ Fahad Shamshad , Salman Khan , Syed Waqas Zamir , Muhammad Haris Khan , Munawar Hayat , Fahad Shahbaz Khan , and Huazhu Fu . 2023 . Transformers in medical imaging: A survey . Medical Image Analysis (2023), 102802 . [35]. ↵ Zhiqiang Shen , Hua Yang , Zhen Zhang , and Shaohua Zheng . 2021 . Automated kidney tumor segmentation with convolution and transformer network . In International Challenge on Kidney and Kidney Tumor Segmentation . Springer , 1 – 12 . [36]. ↵ Hoo-Chang Shin , Holger R Roth , Mingchen Gao , L. Lu , Ziyue Xu , Isabella Nogues , Jianhua Yao , Daniel Mollura , and Ronald M Summers . 2016 . Deep convolutional neural networks for computer-aided detection: CNN architectures, dataset characteristics and transfer learning . IEEE transactions on medical imaging 35 , 5 (2016), 1285 – 1298 . OpenUrl CrossRef PubMed [37]. ↵ Kenji Suzuki . 2017 . Overview of deep learning in medical imaging . Radiological physics and technology 10 , 3 (2017), 257 – 273 . OpenUrl CrossRef [38]. ↵ Gijs van Tulder , Yao Tong , and Elena Marchiori . 2021 . Multi-view analysis of unregistered medical images using crossview transformers . In Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part III 24 . Springer , 104 – 113 . [39]. ↵ Fan Wang and Bo Wang . 2022 . Hybrid Transformer and Convolution for Medical Image Segmentation . In 2022 International Conference on Image Processing, Computer Vision and Machine Learning (ICICML) . IEEE , 156 – 159 . [40]. ↵ Wenxuan Wang , Chen Chen , Meng Ding , Hong Yu , Sen Zha , and Jiangyun Li . 2021 . Transbts: Multimodal brain tumor segmentation using transformer . In Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part I 24 . Springer , 109 – 119 . [41]. Xiaosong Wang , Yifan Peng , L. Lu , Zhiyong Lu , Mohammadhadi Bagheri , and Ronald M Summers . 2017 . Chestx-ray8: Hospital-scale chest x-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases . In Proceedings of the IEEE conference on computer vision and pattern recognition . 2097 – 2106 . [42]. ↵ Thomas Wolf , Lysandre Debut , Victor Sanh , Julien Chaumond , Clement Delangue , Anthony Moi , Pierric Cistac , Tim Rault , Rémi Louf , Morgan Funtowicz , et al. 2020 . Transformers: State-of-the-art natural language processing . In Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations . 38 – 45 . [43]. Yutong Xie , Jianpeng Zhang , Chunhua Shen , and Yong Xia . 2021 . Cotr: Efficiently bridging cnn and transformer for 3d medical image segmentation . In Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part III 24 . Springer , 171 – 180 . [44]. Xiangyi Yan , Hao Tang , Shanlin Sun , Haoyu Ma , Deying Kong , and Xiaohui Xie . 2022 . After-unet: Axial fusion transformer unet for medical image segmentation . In Proceedings of the IEEE/CVF winter conference on applications of computer vision . 3971 – 3981 . [45]. Di You , Fenglin Liu , Shen Ge , Xiaoxia Xie , Jing Zhang , and Xian Wu . 2021 . Aligntransformer: Hierarchical alignment of visual regions and disease tags for medical report generation . In Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part III 24 . Springer , 72 – 82 . [46]. ↵ Yungeng Zhang , Yuru Pei , and Hongbin Zha . 2021 . Learning dual transformer network for diffeomorphic registration . In Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part IV 24 . Springer , 129 – 138 . [47]. ↵ Zhicheng Zhang , Lequan Yu , Xiaokun Liang , Wei Zhao , and Lei Xing . 2021 . TransCT: dual-path transformer for low dose computed tomography . In Medical Image Computing and Computer Assisted Intervention–MICCAI 2021: 24th International Conference, Strasbourg, France, September 27–October 1, 2021, Proceedings, Part VI 24 . Springer , 55 – 64 . [48]. ↵ Jinghua Zhu , Yue Sheng , Hui Cui , Jiquan Ma , Jijian Wang , and Heran Xi . 2023 . Cross Pyramid Transformer makes U-net stronger in medical image segmentation . Biomedical Signal Processing and Control 86 (2023), 105361 . OpenUrl View the discussion thread. Back to top Previous Next Posted June 22, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Systematic Review of Hybrid Vision Transformer Architectures for Radiological Image Analysis Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Systematic Review of Hybrid Vision Transformer Architectures for Radiological Image Analysis Ji Woong Kim , Aisha Urooj Khan , Imon Banerjee medRxiv 2024.06.21.24309265; doi: https://doi.org/10.1101/2024.06.21.24309265 Share This Article: Copy Citation Tools Systematic Review of Hybrid Vision Transformer Architectures for Radiological Image Analysis Ji Woong Kim , Aisha Urooj Khan , Imon Banerjee medRxiv 2024.06.21.24309265; doi: https://doi.org/10.1101/2024.06.21.24309265 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Radiology and Imaging Subject Areas All Articles Addiction Medicine (574) Allergy and Immunology (865) Anesthesia (304) Cardiovascular Medicine (4460) Dentistry and Oral Medicine (445) Dermatology (383) Emergency Medicine (611) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1517) Epidemiology (15251) Forensic Medicine (31) Gastroenterology (1132) Genetic and Genomic Medicine (6621) Geriatric Medicine (669) Health Economics (1002) Health Informatics (4564) Health Policy (1372) Health Systems and Quality Improvement (1617) Hematology (544) HIV/AIDS (1272) Infectious Diseases (except HIV/AIDS) (15938) Intensive Care and Critical Care Medicine (1107) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6643) Nursing (346) Nutrition (1001) Obstetrics and Gynecology (1149) Occupational and Environmental Health (957) Oncology (3350) Ophthalmology (981) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1698) Pharmacology and Therapeutics (694) Primary Care Research (714) Psychiatry and Clinical Psychology (5464) Public and Global Health (9259) Radiology and Imaging (2212) Rehabilitation Medicine and Physical Therapy (1372) Respiratory Medicine (1198) Rheumatology (598) Sexual and Reproductive Health (716) Sports Medicine (533) Surgery (715) Toxicology (100) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a03a16e93c1ff047',t:'MTc4MDEwNTg0Mg=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.