Frontal cortex organization supporting audiovisual processing during naturalistic viewing

doi:10.1101/2025.06.26.661755

Frontal cortex organization supporting audiovisual processing during naturalistic viewing

2025 · doi:10.1101/2025.06.26.661755

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 108,132 characters · extracted from preprint-html · click to expand

Frontal cortex organization supporting audiovisual processing during naturalistic viewing | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Frontal cortex organization supporting audiovisual processing during naturalistic viewing View ORCID Profile Faxin Zhou , View ORCID Profile Amirhossein Khalilian-Gourtani , View ORCID Profile Patricia Dugan , View ORCID Profile Andrew Michalak , View ORCID Profile Orrin Devinsky , View ORCID Profile Peter Rozman , View ORCID Profile Werner Doyle , View ORCID Profile Daniel Friedman , View ORCID Profile Adeen Flinker doi: https://doi.org/10.1101/2025.06.26.661755 Faxin Zhou 
1 Department of Biomedical Engineering, Tandon School of Engineering, New York University , New York, 11201, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Faxin Zhou For correspondence: fz2185{at}nyu.edu adeen{at}nyu.edu Amirhossein Khalilian-Gourtani 2 Department of Neurology, School of Medicine, New York University , New York, 10016, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Amirhossein Khalilian-Gourtani Patricia Dugan 2 Department of Neurology, School of Medicine, New York University , New York, 10016, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Patricia Dugan Andrew Michalak 2 Department of Neurology, School of Medicine, New York University , New York, 10016, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Andrew Michalak Orrin Devinsky 2 Department of Neurology, School of Medicine, New York University , New York, 10016, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Orrin Devinsky Peter Rozman 3 Department of Neurosurgery, School of Medicine, New York University , New York, 10016, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Peter Rozman Werner Doyle 3 Department of Neurosurgery, School of Medicine, New York University , New York, 10016, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Werner Doyle Daniel Friedman 2 Department of Neurology, School of Medicine, New York University , New York, 10016, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Daniel 
Friedman Adeen Flinker 1 Department of Biomedical Engineering, Tandon School of Engineering, New York University , New York, 11201, NY, USA 2 Department of Neurology, School of Medicine, New York University , New York, 10016, NY, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Adeen Flinker For correspondence: fz2185{at}nyu.edu adeen{at}nyu.edu Abstract Full Text Info/History Metrics Preview PDF Abstract Our brains dynamically adapt to a multisensory world by orchestrating diverse inputs across sensory streams. This process engages multiple brain regions, but it remains unclear how audiovisual stimuli are represented and evolve over time, especially in naturalistic scenarios. Here, we employed a movie-viewing paradigm to explore this question. We recorded intracranial electrocorticography (iEEG) to measure brain activity in 19 participants watching a short multilingual movie. Using unsupervised clustering and supervised encoding models, we identified a robust modality-specific gradient in the frontal cortex, wherein the ventral division primarily processes auditory information and the dorsal division processes visual inputs. Further, we found that this cortical organization dynamically changed, adapting to different movie contexts. This result potentially reflects flexible audiovisual-resource assignment to construct a coherent percept of the movie. Leveraging behavioral ratings, we found that the frontal cortex is the primary site in this modality assignment process. Together, our findings shed new light on the functional architecture of the frontal cortex underlying flexible multisensory representation and integration in natural contexts. Introduction The brain has a remarkable capacity to process multiple sensory signals simultaneously for a coherent percept [ 1 ]. 
This multisensory processing leverages complementary sensory information to enhance environmental awareness and guide behavior [ 2 ]. Neuroimaging and electrophysiological studies have implicated the frontal cortex as a key region in this process [ 3 – 5 ]. Increased activation in the frontal regions occurs when congruent multisensory information is presented [ 6 – 8 ] and when sensory inputs are successfully integrated [ 9 ]. Additionally, the frontal lobe selectively represents relevant information originating from perceptual cortical regions [ 7 , 10 ]. These findings are primarily derived from highly controlled, task-based experiments that contrast multi- and unisensory conditions. There remains a paucity of research addressing how the brain dynamically organizes multiple sensory inputs in natural settings. To this end, movies can serve as ideal stimuli to study multisensory processing. Movies contain abundant audiovisual information, and observers need to dynamically allocate resources for different modalities to optimize perceptual understanding. Further, naturalistic materials are ecologically valid [ 11 ] and potentially generalizable outside the laboratory [ 12 ]. Only a limited number of functional Magnetic Resonance Imaging (fMRI) studies have directly investigated audiovisual processing during naturalistic movie viewing. For instance, one employed deep learning (DL)-based encoding models to quantify cortical representations of auditory and visual features and found that audiovisual models provided better fits for the frontal cortex than unimodal models [ 13 ]. A related study implicated frontal regions in watching audiovisual movies, as well as in audio-only and visual-only controls, indicating that the frontal cortex represents cross-modal information [ 14 ]. These studies, however, did not explicitly differentiate audio and visual representations in the frontal cortex. 
Moreover, due to the relatively low temporal resolution of fMRI, these studies could not assess moment-to-moment temporal changes in audiovisual processing. Additionally, these studies primarily identified static neural representations, without focusing on how audiovisual representations change throughout the movie. As a result, it remains unclear how the frontal cortex processes auditory and visual information (i.e., in a modality-specific or general manner) and how this functional organization evolves temporally. To bridge these gaps, we examined neural responses in a cohort of neurosurgical patients using intracranial electroencephalography (iEEG) while they watched a multilingual naturalistic movie. Intracranial EEG directly records from the brain with excellent spatiotemporal resolution (millimeters and milliseconds) and high signal-to-noise ratio (SNR) [ 15 ]. Applying functional clustering and encoding models to these recordings, we identified a ventral-to-dorsal gradient in the lateral frontal cortex, associated with auditory (ventral) and visual (dorsal) representations. Further, this modality-specific frontal pattern varied with changes in language context, indicating flexible modality assignment of neural resources. Additional analyses showed that the neural responses related to the audiovisual assignment were predominantly located in the frontal cortex. Overall, these results highlight a dynamic functional organization in the lateral frontal cortex, advancing our understanding of natural audiovisual processing. Our work contributes to the development of neurobiologically plausible models for flexible multisensory processing in real-world scenarios. Results We measured neural activity during free viewing of a movie across 19 patients, including 2688 electrode contacts with wide coverage across the brain (Fig. S1A). 
The movie contained four different stories with a common theme, alternating between English and other languages (Greek, German, and French; see Methods: Task and procedure). All participants were native English speakers with no prior knowledge of any of the foreign languages in the movie. In this study, we focused on the high-gamma (70 to 150 Hz) broadband field potentials, which are robust at the single-trial level [ 16 ], reflect local neuronal firing [ 17 – 19 ], and exhibit strong correlation with the fMRI blood-oxygenation level-dependent (BOLD) response [ 17 , 20 ]. Unsupervised clustering reveals modality-specific neural responses We were first interested in understanding the neural responses to diverse auditory and visual scenes. To this end, we divided the movie into four conditions, each representing a distinct audiovisual scenario ( Fig. 1A ): (1) English (EN) condition, in which dialogue is spoken in English; (2) Foreign Language (FL) condition, in which characters communicate in non-English languages such as French or German (English subtitles are provided for movie comprehension); (3) Other Sound (OS) condition, in which non-speech sounds co-occur with the visual scene; and (4) Silent (SI) condition, in which visual scenes are presented without audio. By analyzing neural signals within a one-second post-onset epoch for each condition (onsets were defined as the beginning of the sound in EN, FL, OS or the visual scene in SI, see Methods: Active electrodes selection), we observed various neural characteristics related to audiovisual processing in different regions. For instance, contexts including auditory information (i.e., the EN, FL, and OS conditions) elicited early responses (around 200 ms) in an example electrode from superior temporal gyrus (STG) ( Fig. 1C , leftmost panel). Similarly, an electrode in primary visual cortex (V1) exhibited strong activation in the Foreign Language condition in an early epoch ( Fig. 1C , the middle left panel). 
Additionally, frontal electrodes demonstrated delayed responses exclusively for language conditions (after 400 ms), while signals in non-language conditions remained stable ( Fig. 1C , the right two panels). The electrodes with significant responses were predominantly distributed in the superior temporal gyrus (STG), occipital regions, frontal cortex, and inferior parietal lobe (IPL; Fig. 1B, S1C, and S8C; permutation test, p < 0.05, FDR corrected; see Methods: Active electrodes selection). To characterize the prototypical patterns of these diverse neural signals, we applied a non-negative matrix factorization (NMF) analysis to the significant electrodes across the four conditions (Fig. S3A, see Methods: Functional clustering analysis) [ 21 – 23 ]. This clustering approach, combined with a silhouette test, identified two distinct clusters (Fig. S3B). The first cluster exhibited higher weights for electrodes in the STG and ventrolateral prefrontal cortex (vlPFC), while the second cluster contained the occipital areas, dorsolateral prefrontal cortex (dlPFC), and pericentral gyrus ( Fig. 2A ). Based on anatomical structures and their functional responses, we labeled the two clusters as the auditory cluster (red) and the visual cluster (blue), respectively. Furthermore, we calculated the weighted average of neural responses for both clusters ( Fig. 2B ) and observed that the language conditions generally showed higher amplitudes than non-language conditions. By computing the weighted activity according to distinct anatomical regions ( Fig. 2C ; Fig. S1B), we found that the auditory and visual clusters showed earlier peaks in the STG and occipital regions, respectively (STG electrodes: 365 ± 242 ms; occipital electrodes: 389 ± 221 ms; computed based on the language conditions). In addition, the two clusters showed later responses in the frontal areas (auditory cluster: 657 ± 237 ms; visual cluster: 608 ± 278 ms; computed based on the language conditions). 
Notably, the frontal electrodes appeared to be spatially organized according to their respective modalities, wherein the vlPFC electrodes were primarily assigned to the auditory cluster, and the dlPFC electrodes were predominantly assigned to the visual cluster ( Fig. 2A ). These results indicated a modality-specific cortical activation map during movie viewing. Download figure Open in new tab Fig. 1: Neural responses during movie viewing. (A) Four conditions identified from the movie: (1) English (EN) condition: dialogue is in English; (2) Foreign Language (FL) condition: communications in other languages such as French or German (English subtitles are provided for movie comprehension); (3) Other Sound (OS) condition: non-speech sounds co-occur with the visual scenes; (4) Silent (SI) condition: visual scenes are presented without audio. (B) All active electrodes that survived the permutation test (Fig. S2) across the four conditions (Fig. S1C). (C) Averaged signals across the four conditions for the representative electrodes marked in (B). The shaded areas around the curves represent the standard error of the mean (SEM) across trials. Download figure Open in new tab Fig. 2: Functional clustering of the active responses. (A) Spatial map showing two identified clusters with the non-negative matrix factorization (NMF) approach. For both clusters, the weights were normalized to the range 0-1, with higher values indicating greater contributions to each cluster. (B) Weighted average signals of the two clusters across all electrodes. (C) Weighted average signals across the electrodes in three specific ROIs (Fig. S1B): STG (left panel), occipital regions (middle panel), and frontal regions (right panel). Shaded areas around the curves represent the SEM. 
Audiovisual encoding exhibits modality-specific representations We aimed to determine whether the spatial distributions identified by the functional clusters reflect how distinct auditory and visual information is processed during free viewing. To this end, we employed the encoding models to examine how audiovisual information is represented in the brain [ 24 ]. First, we utilized a series of computational models to extract low- and high-level features from auditory and visual streams ( Fig. 3A , see Methods: Audiovisual feature extraction). At the low level, we were interested in capturing fine-grained acoustic representations from audio (i.e., the spectrogram) and spatiotemporal frequencies from visual scenes (i.e., motion energy Gabor filters [ 25 ]). At the high level, by tapping into the significant improvements of transformer architecture in complicated downstream tasks [ 26 , 27 ], we extracted the vectorial embeddings from two representative models for auditory (the wav2vec 2.0 model [ 28 ]) and visual modalities (the vision transformer (ViT) model [ 29 ]). Then, we concatenated the low- and high-level embeddings to ensure a comprehensive feature space. We then used the multivariate temporal response function (mTRF) model to quantify neural encoding [ 30 ]. Briefly, the mTRF is a linear model that incorporates time lags ( τ ) between features ( X ) and neural activity ( y ) ( Fig. 3B ), which has been widely used in neural representational analysis of acoustic, phonological, and linguistic features in connected speech [ 23 , 31 , 32 ] as well as semantic novelty in naturalistic movie segments [ 33 ]. To prevent model overfitting and facilitate computational efficiency, we used principal component analysis (PCA) to reduce the dimensions of the audio and visual features (Fig. S4A). 
We conducted the encoding analysis on all electrodes and evaluated model performance based on the Pearson correlation ( r ) between predicted and actual neural signals in the withheld test set under a 4-fold cross-validation procedure (see Methods: Encoding modeling procedure). Auditory and visual mTRF models were trained separately. Spatially, significant auditory electrodes were primarily located in STG, vlPFC, and middle precentral gyrus (midPrCG) ( Fig. 3C and S5A), while significant visual electrodes were largely distributed in occipital regions, dlPFC, STG, and IPL ( Fig. 3D and S5B). The proportion of electrodes tuned to both modalities was relatively low (14.483%) and these electrodes were primarily located in the middle STG, with some scattered across the frontal regions (Fig. S4B). Notably, the anatomical distributions identified by auditory mTRF models replicated our previous NMF analysis of the auditory cluster (Chi-square test, χ²(9) = 8.921, p = 0.445; Fig. S8B, D). Similarly, the spatial patterns identified by the visual mTRF models were consistent with those of the visual cluster, showing no significant difference (Chi-square test, χ²(9) = 14.337, p = 0.111). Temporally, based on our functional clustering results showing that frontal regions are recruited later than perceptual cortices (e.g., STG and occipital region), we observed similar temporal patterns by examining the lags ( τ ) of the peak weights in the mTRF filters (Fig. S4C; see Methods: Timing analysis of encoding models): the STG (auditory: 194 ± 28 ms) and occipital regions (visual: 224 ± 58 ms) peaked at relatively earlier stages, while the frontal electrodes exhibited later peak latencies (auditory: 396 ± 63 ms; visual: 387 ± 68 ms; Fig. 3E ). 
Moreover, we found no significant differences between modalities in either the lower-level regions (permutation test, p = 0.512, FDR corrected) or the frontal regions (permutation test, p = 0.745, FDR corrected), but observed a significant difference between them (permutation test, p < 0.001, FDR corrected; Fig. 3E ). These results collectively suggest distinct encoding of auditory and visual information across multiple timescales. A ventral-dorsal gradient in lateral frontal cortex shows audiovisual selectivity Both the NMF and mTRF results suggest that the ventral and dorsal areas of the frontal cortex are selective for auditory and visual information, respectively. To quantify this relationship, we constructed an index to capture the audiovisual (AV) tuning strength of the significant electrodes in the frontal region. This AV index is computed for each significant electrode by taking the r value difference between the visual and auditory models, divided by their sum (see Methods: Frontal ventral-dorsal gradient analysis). Therefore, an AV index of 1 indicates that the electrode is completely visually tuned, while an index of -1 indicates complete auditory tuning. To further delineate the ventral-dorsal gradient, we constructed a frontal polar coordinate system with the intersection of the precentral sulcus and the Sylvian fissure as the origin (MNI coordinate: [-55, 15 , -8]). The smaller values of the radius (d, in mm) indicate more ventral positions, while the larger values correspond to more dorsal areas ( Fig. 4A ). We found a significant linear relationship between the radius and the AV index ( r (63) = 0.447, p < 0.001), quantitatively validating an audiovisual gradient in the frontal cortex ( Fig. 4B ). Further, we examined what type of audiovisual information drives this frontal gradient. 
Our feature construction procedure allows us to dissociate low-level perceptual effects (e.g., spectrogram and Gabor features) from high-level representations (e.g., wav2vec 2.0 and ViT features). To assess the contribution of each feature space to neural encoding performance, we computed the reduction in r -values when either low- or high-level features were removed from the model (see Methods: Encoding model partitioning). We only observed pronounced effects in the frontal cortex for the features derived from transformer-based models (one-sample t -test, auditory: t (21) = 9.097, p < 0.001, d = 1.940; visual: t (18) = 4.789, p < 0.001, d = 1.099; FDR corrected), but not for the low-level features (one-sample t -test; auditory: t (21) = 1.267, p = 0.219, d = 0.270; visual: t (18) = 2.074, p = 0.063, d = 0.476; FDR corrected; Fig. 4C ). These findings indicated that the frontal gradient is primarily driven by the high-level information embedded in transformer-based models (see Discussion). Download figure Open in new tab Fig. 3: Neural representations of the auditory and visual features in the movie. (A) Pipeline for extracting the audiovisual features, including: 1) low-level auditory (spectrogram) and visual features (motion energy filters); 2) high-level auditory (transformer embeddings from the wav2vec 2.0) and visual features (transformer embeddings from the vision transformer (ViT); see Methods: Audiovisual feature extraction). Low- and high-level features were concatenated (denoted by “||”) before undergoing dimensionality reduction using principal component analysis (PCA). (B) Overview of the multivariate temporal response function (mTRF). The processed movie features ( X , purple) were used to predict the neural responses ( y , oranges) with a set of weights ( w , green) across various time lags ( τ ). (C, D) The correlation brain map of the auditory and visual mTRF models, showing only significant electrodes. 
(E) The peak timing for all the electrodes in four anatomical regions of interest (ROIs). The distributions are plotted based on a bootstrap procedure, and the significance test between different regions was conducted with a permutation test (see Methods: Timing analysis of encoding models). The error bars represent the standard deviation (SD). Significance levels are set as p < 0.001 (***), p < 0.01 (**), p < 0.05 (*), and p ≥ 0.05 (n.s.). Additionally, we conducted a control analysis to ensure that the frontal gradient was not biased by the semantic or linguistic components of the subtitles in the Foreign Language condition. By extracting the BERT embeddings of the subtitles and applying a model partitioning procedure (see Methods: Audiovisual feature extraction and Encoding model partitioning), we found that adding the subtitle embeddings neither significantly increased r -values in the frontal cortex (Mann-Whitney U -Test, U (72) = 763, p = 0.399, r = −0.115; FDR corrected; Fig. 4D-E ) nor affected the frontal audiovisual gradient ( r (47) = 0.594, p < 0.001; Fig. 4F-G ), indicating that the modality-specific frontal pattern was not confounded by subtitle-related semantic processing. A flexible audiovisual-resource assignment supported by behavioral and neural evidence Next, we were interested in how bimodal information is represented in the brain throughout movie viewing and whether neural patterns vary across different movie contexts. To this end, we examined the mTRF results for multiple conditions. Although we did not detect frontal representations in the non-language conditions (i.e., Other Sound and Silent; Fig. S5), we observed significant representations changing with language conditions. Specifically, the auditory electrodes were more pronounced in the frontal cortex during the English condition while the visual electrodes were more prominent in the Foreign Language condition ( Fig. 5A ). 
Moreover, there was a significant interaction effect between language and modality conditions (two-way ANOVA, modality main effect: F (1, 196) = 2.780, p = 0.097, η² = 0.014; language main effect: F (1, 196) = 3.604, p = 0.059, η² = 0.019; language-by-modality interaction effect: F (1, 196) = 55.178, p < 0.001, η² = 0.220; Fig. 5B ), suggesting that audiovisual information is reallocated in the frontal cortex depending on contextual demands. Notably, this pattern was not observed in other areas typically implicated in audiovisual processing, such as the parietal cortex [ 34 , 35 ], in which auditory encoding performance exceeded visual encoding performance across both the EN and FL conditions, alongside a weaker but significant interaction effect (two-way ANOVA, modality main effect: F (1, 122) = 24.340, p < 0.001, η² = 0.166; language main effect: F (1, 122) = 0.448, p = 0.505, η² = 0.004; language-by-modality interaction effect: F (1, 122) = 7.262, p = 0.008, η² = 0.056; Fig. S7). These results further support the specificity of the frontal cortex as a key site in context switching. Why did distinct frontal patterns emerge under different language conditions? Past studies suggest that combining multisensory information operates as a weighting process [ 2 , 36 – 39 ], indicating that audiovisual processing during movie viewing involves weight-based assignment of different modalities. From this perspective, we would expect the auditory domain to receive greater weights in the English condition and visual cues to be prioritized in the Foreign Language condition, consistent with our findings ( Fig. 5A-B ). To explicitly test this idea, we conducted a series of movie rating tasks using the Amazon Mechanical Turk (AMT) platform (see Methods: Task and procedure; Fig. 5E ). 
In the task, the entire movie was segmented into 2-3 second clips, and participants were asked to evaluate (1) how important the clip is for understanding the entire movie (global context; Fig. 5C ); (2) what type of information is more important for understanding the current movie clip (local modality; Fig. 5D ). As a result, the point-wise multiplication of the global context and local modality ratings (i.e., the interaction term) could serve as an indicator of modality assignment, quantifying which modality contributes more to overall movie comprehension (global modality; Fig. 5F ). Additionally, we performed condition analyses and found that the global context ratings in the language conditions were significantly higher than those in the non-language conditions (two sample t -test, t (196) = 3.055, p = 0.003, d = 0.425; Fig. 5G ). This result is consistent with our NMF and mTRF findings showing stronger responses for the language conditions (i.e., English and Foreign Language; Fig. 2B and S5). Moreover, our findings for the language conditions also matched our prediction, where the English condition showed significant tuning to auditory information (one sample t -test, t (43) = −2.562, p = 0.014, d = 0.386, FDR corrected), while the Foreign Language condition showed greater selectivity for the visual stream (one sample t -test, t (45) = 5.027, p < 0.001, d = 0.741, FDR corrected; Fig. 5H ). Further, we reasoned that if this weight-based assignment hypothesis is applicable to audiovisual processing during naturalistic movie viewing, this modality assignment effect should also be detectable at the neural level. To this end, we fitted the mTRF models to the audiovisual assignment variable (i.e., global modality; see Methods: Encoding modeling procedure). Significant electrodes were predominantly located in the lateral frontal cortex and were also sparsely scattered in the anterior temporal lobe (ATL), temporal-parietal junction (TPJ), and other regions ( Fig. 
6A and S8E). To evaluate the robustness of these findings, we conducted a series of control analyses. First, to rule out the potential confounding effect of attention, we applied the mTRF model to the attentional engagement variable obtained from the engagement rating task (Fig. S9A, B; see Methods: Task and procedure), which revealed a brain pattern distinct from that of the global modality ( r = 0.015, p = 0.431; Fig. S9D). Moreover, the global modality mTRF brain maps trained with and without removing engagement yielded consistent patterns ( r = 0.621, p < 0.001; Fig. S9E, F), suggesting that attentional engagement did not confound our results. Second, we examined whether film cuts could influence the results, as scene transitions might alter the assignment of audiovisual resources. To address this, we computed the first derivative of the global modality and compared it with the film cuts. We defined the film cuts at each abrupt change in movie scenes (i.e., the boundaries; Fig. S9G) [ 40 ]. No significant correlations were observed between the film cuts and changes in the global modality across raters (one-sample t test against 0: T (25) = −1.817, p = 0.081, d = 0.356; Fig. S9H), indicating that film cuts did not confound the observed effects. Moreover, to exclude potential confounding influences of audiovisual features in the frontal cortex, we performed an additional partitioning procedure (see Methods: Encoding model partitioning) and detected significant effects only for the audiovisual assignment (one sample t -test, t (33) = 7.968, p < 0.001, d = 1.366, FDR corrected, Fig. S10A). Additionally, these electrodes were distinct from those representing audiovisual features. The r-values from the assignment models did not correlate with either auditory (Pearson correlation, r = −0.122, p = 0.145) or visual r-values (Pearson correlation, r = −0.086, p = 0.306; Fig. 6B ). 
In contrast to the ventral-to-dorsal frontal gradient observed for the audiovisual features, the electrodes related to audiovisual assignment were more diffusely distributed in the frontal lobe ( Fig. 6C ). Notably, 73.856% of the electrodes (113 out of 153) were associated with only one specific function, and only four electrodes showed overlap across all three models (2.614%; Fig. 6D ). An analysis of the timing (peak lag of modality assignment) showed that the audiovisual assignment in the frontal cortex (297 ± 47 ms) occurred significantly later than the audiovisual representation in the perceptual regions (permutation test, p = 0.008, FDR corrected), but earlier than the frontal encoding of audiovisual features (permutation test, p = 0.001, FDR corrected; Fig. S10B). Taken together, our neural and behavioral findings suggest that frontal cortex representations flexibly adapt to shifts in movie plots and linguistic scenarios. Furthermore, audiovisual assignment likely relies on distinct frontal neural substrates that complement the representation of audiovisual features. Download figure Open in new tab Fig. 4: Frontal gradient for audiovisual modalities. (A) The significant electrodes within the frontal sector ROI are represented by audiovisual (AV) index. The sector ROI was defined using a polar coordinate system, with the origin at the intersection of the precentral sulcus and the Sylvian fissure, where the radius (d, mm) quantifies the ventral-to-dorsal gradient. The AV index is computed for each significant electrode by taking the r-value difference between the visual mTRF and auditory mTRF, normalized by their sum (see Methods: Frontal ventral-dorsal gradient analysis). (B) A significant correlation was found between radius and AV index. (C) Results of the encoding model partitioning procedure. 
We obtained the unique low-level effects (auditory: spectrogram; visual: motion energy Gabor filter) by subtracting the high-level effects (auditory: wav2vec 2.0; visual: vision transformer) from the full model (including both the low- and high-level features). Similarly, the unique high-level effects can be obtained by reversing the procedure (see Methods: Encoding model partitioning). (D) The subtitle effect in Foreign Language condition. Significant neural encoding maps for visual features only ( M vis ; top left panel) and for both the visual and subtitle features ( M vis & txt ; top right panel). The unique subtitle effect map is the difference between the two maps above ( M vis & txt − M vis ; bottom panel). (E) ROI analysis. The x-axis shows model types ( M vis and M vis & txt ), while the y-axis indicates r -values for electrodes within each ROI. No difference was observed in the frontal or the occipital regions. However, a significant increase was found in the STG. (F-G) Frontal gradient analysis incorporating the subtitle embeddings in the visual models (same procedure as figure A-B). The error bars in figure C and E represent 95% CI. The gray area around the regression line in figure B and G represents the 95% confidence interval (CI). Significance levels are set as p < 0.001 (***), p < 0.01 (**), p < 0.05 (*), and p ≥ 0.05 (n.s.). Download figure Open in new tab Fig. 5: Neural dynamics and the behavior experiment. (A) The mTRF correlation map in the frontal region for English (EN) and Foreign Language (FL) conditions. (B) An interaction effect between the auditory and visual mTRF models in the frontal cortex for the EN and FL conditions. (C, D) The global context and local modality ratings. Upper panel: ratings across all participants over time. Lower panel: the average time series across participants. (E) The pipeline of the clip rating task conducted on the Amazon Mechanical Turk platform. 
(F) Pointwise multiplication of global context and local modality (the interaction effect). This interaction effect reflects dynamic changes in modality assignment for movie comprehension. (G) The bar plot of the global context ratings for the language (lang) condition, which includes the EN and FL conditions, and the non-language (non-lang) condition, which includes the OS and SI conditions. The dots represent trials in each condition. (H) The bar plot of the global modality for the EN and FL conditions. The shaded areas in figure C, D, and F represent SE; the error bars in figure B, G, and H represent 95% CI. The colors in figure G and H correspond to the four conditions, consistent with the color scheme defined in Fig. 1A . Significance levels are set as p < 0.001 (***), p < 0.01 (**), p < 0.05 (*), and p ≥ 0.05 (n.s.). Download figure Open in new tab Fig. 6: Neural substrates for the audiovisual assignment. (A) The mTRF results for the global modality (i.e., the audiovisual assignment). (B) The correlation analysis between r-values from the assignment mTRF models and r-values from the auditory/visual mTRF models for all significant electrodes in the frontal cortex. The shaded areas around the lines represent the 95% CI. (C) The brain map of significant frontal electrodes across the three mTRF models, rendered with the color code in (D). (D) The Venn diagram of the significant frontal electrodes from the three mTRF models. Discussion The human brain dynamically reconfigures auditory and visual information during free movie viewing. Here, we identified auditory and visual neural responses and representations spanning from the early perceptual areas to the frontal cortex. Notably, the frontal cortex exhibited a modality-specific topography, which shifted depending on the language context. 
Together with converging behavioral data, we provide evidence that observers dynamically allocate audiovisual resources in response to contextual demands, revealing a flexible and adaptive mechanism for real-world audiovisual processing. Our results demonstrate a frontal gradient in audiovisual representation during free viewing. Previous studies have shown that the frontal cortex is broadly involved in audiovisual processing during natural movie watching [ 13 , 14 ] and distinct frontal subregions respond selectively to auditory and visual information in both humans [ 41 – 44 ] and non-human primates [ 6 , 45 , 46 ]. However, these studies primarily focused on specific regions of interest (ROIs) and lacked a comprehensive view of representational organization across the frontal lobe. In contrast, we systematically examined the lateral frontal cortex using both unsupervised functional clustering and supervised encoding models, revealing a robust segregation between auditory and visual modalities. Anatomically, our findings are consistent with the established structural connectivity patterns, wherein the auditory cortices project primarily to the vlPFC (e.g., the IFG) via the arcuate fasciculus (AF) [ 47 , 48 ] and the visual cortices connect to dlPFC (e.g., the frontal eye field, FEF) through inferior fronto-occipital fasciculus (IFOF) [ 49 ]. Functionally, we found that this dorsoventral gradient was predominantly driven by high-level features extracted from transformer-based models and exhibited relatively late peak latencies (around 400 ms; Fig. 4C ). Moreover, the sustained frontal activity during the language context ( Fig. 2C ) suggests its involvement in maintaining recent narrative information over time [ 50 , 51 ]. 
These findings align with recent unimodal auditory and visual studies reporting dissociable semantic representations in the frontal cortex [ 21 , 52 – 56 ] as well as studies combining structural and functional connectivity showing a dorsoventral frontal gradient corresponding to an auditory-to-visual transition [ 57 ]. Extending these findings, our study provides the first direct iEEG evidence for an audiovisual semantic gradient along the dorsoventral axis in frontal regions during naturalistic movie viewing. Importantly, rather than restricting our analysis to predefined anatomical areas, we constructed a frontal sector coordinate system and identified a modality-specific gradient, transitioning from the ventral region associated with auditory information to the dorsal region associated with visual information. By constructing a continuous ventrodorsal index from our coordinate system, we were able to quantify modality transition trends in a fully data-driven manner without imposing rigid anatomical assumptions. This suggests a spatially continuous coordinate system along the ventral–dorsal axis of the frontal cortex [ 58 – 60 ], reflecting a functional topographic organization that supports audiovisual processing under naturalistic experiences. In addition, prior perceptual research has shown that observers integrate multimodal cues by assigning weights proportional to their reliability across different task contexts [ 34 , 38 , 39 , 61 , 62 ]. Under this view, more reliable modalities tend to exert greater influence on unified perception. Our findings suggest a similar “modality assignment strategy” during movie viewing, in which frontal neural resources are dynamically allocated between auditory and visual modalities based on contextual demands. 
Unlike prior studies that manipulated stimulus reliability by adding noise, movies are carefully crafted and contain minimal artificial distortion [ 11 , 63 ], with both auditory and visual streams maintaining consistently high fidelity. Therefore, the modality assignment effect we observed is unlikely to stem from sensory reliability per se , but instead reflects how different modalities contribute to forming a coherent perceptual object. This aligns with a framework of goal-driven, top-down control over multisensory processing [ 64 ]. However, the modality assignment effect could also reflect the influence of attention induced by language intelligibility. To exclude this alternative explanation, we conducted a series of control analyses (Fig. S9A-F), which demonstrated that attentional engagement did not significantly contribute to the observed modality assignment effect. Moreover, we found that the modality assignment effect peaked around 300 ms, consistent with the temporal window associated with multisensory integration [ 65 – 67 ]. The frontal cluster associated with the “modality assignment” also aligns with prior evidence implicating frontal regions in processing novel information during multimodal integration [ 68 – 70 ]. Collectively, our results provide an expansion from previous well-controlled laboratory experiments to naturalistic settings, revealing a goal-oriented modality assignment strategy for audiovisual integration during free viewing of movies. Several limitations should be noted. First, the constraints of bedside iEEG experiments in the hospital (e.g., variable distance between participants and the laptop) may introduce variability in cortical activity and reduce SNR. Second, the absence of behavioral measures from patients (e.g., eye-tracking recordings and direct assessments of movie comprehension) limited our ability to directly account for attention and eye movements. Third, unbalanced electrode coverage across hemispheres (Fig. 
S1A) precluded an investigation of the potential lateralization effects during movie viewing. Lastly, vlPFC responses were primarily observed during intelligible speech, raising the possibility that the frontal audiovisual gradient may be influenced by language-related processes rather than purely audiovisual. The present study did not include naturalistic stimuli lacking linguistic content, thus we could not rule out this interpretation, and we believe that the relationship between language and audiovisual processing warrants further investigation. In summary, our study demonstrates that the lateral frontal cortex serves as a key region for processing audiovisual information under naturalistic experiences. We reveal modality-specific representations in the frontal cortex, with auditory information encoded toward ventral regions and visual information toward dorsal areas. We also show that distinct frontal substrates are largely involved in the modality assignment process. Together, these findings provide evidence for a distinct spatial organization in the frontal cortex supporting flexible multisensory processing of real-world scenarios. Methods Participants The first cohort consisted of 19 patients undergoing neurosurgical evaluation for refractory epilepsy (12 females; all right-handed; 32.053 ± 11.297 years old; 12 grid and 7 stereotactic coverages; see Supplementary Table S1). These patients participated in a multilingual movie watching task during their stay in the hospital. Electrode implantation was determined exclusively based on clinical requirements. All participants were fluent in English and had no knowledge of the foreign languages used in the movie. Written informed consent was obtained from all participants either prior to the neurosurgical procedure or upon admission to the hospital unit for evaluation. A second cohort of 26 crowdsourced raters was included in an online movie rating task (11 females; 3 left-handed; 42.154 ± 8.694 years old). 
Participants self-identified as either native English speakers (23 raters) or proficient English speakers (3 raters); none reported knowledge of the foreign languages used in the movie. The current study protocol was approved by the NYU Langone Medical Center Committee on Human Research. All participants were reimbursed for their participation. Task and procedure In the intracranial experiment ( Movie watching task ), patients were asked to silently watch a 12-minute long multilingual movie called Foreign Language Movie (FLM; https://vimeo.com/61040183 ). This short movie contains four distinct storylines: 1) a Greek mother reveals her terminal disease to her daughter (Greek and English); 2) two German girls discuss saving an unborn life (German); 3) a married couple’s relationship comes to an end (English); 4) a young couple discusses getting married (French and English). The stories are intricately interwoven, forming a cohesive and unified story. English subtitles are provided in the foreign language conversations to aid in movie comprehension. The movie was presented on a 15” laptop placed 0.5–1.0 meters in front of the participants. Audio was delivered through a nearby speaker beside the laptop, with the volume adjusted to a clear and comfortable level for each participant. To synchronize neural activity with the movie, trigger pulses were transmitted to the EEG recording system at the movie onset and at one-minute intervals thereafter. For all included patients, the experiment was not interrupted by clinicians or relatives. The behavioral experiment ( Online movie-rating task ) was conducted using the Amazon Mechanical Turk (AMT) platform and required participants to watch the movie twice. Before the experiment, raters were instructed to “find a quiet place, make sure that your internet has a stable connection and will not be disturbed during the experiment”. 
During the first viewing, participants watched the entire movie and were asked to provide continuous engagement ratings by adjusting a scale bar from 1 (not engaging) to 99 (extremely engaging) [ 71 ]. The scale bar remained visible below the video window throughout the session. If participants did not rate for longer than 5 seconds (i.e., did not move the cursor), the movie would automatically pause, and a flashing reminder would prompt them to rate. Immediately after the first viewing, participants completed a free-recall task to assess comprehension. The instruction for this section was “Please describe what you remember from the movie. Please try to recount events in their original order, if possible, and describe events in writing in as much detail as possible (spend approximately 5-10 minutes). During your description, completeness and detail are more important than the order of events, typos, and grammar.” [ 72 ]. Raters whose responses lacked sufficient detail or omitted one or more storylines were excluded from further analysis. During a second viewing, the movie was presented as a sequence of 2-3 second clips. The segmentation was performed using the FFmpeg toolbox ( https://www.ffmpeg.org/ ). For each clip, the raters were asked to evaluate two aspects using scale bars: 1) how important is this clip for understanding the entire movie from 1 (not important) to 99 (extremely important); 2) what type of information is more important for understanding the current movie clip from 1 (completely visual) to 99 (completely auditory) ( Fig. 5E ). If unsure of their ratings, the raters could replay the clips before proceeding to the next clip. The resulting clip-specific ratings were concatenated (based on frame counts) to reconstruct continuous movie rating signals ( Fig. 5C-D ). Statistical testing For both behavioral and neural data, the D’Agostino test was used to determine the normality of the distribution prior to statistical analysis. 
If data followed a normal distribution, a parametric test was used (e.g., paired t -test); otherwise, a non-parametric test was used (e.g., Wilcoxon signed-rank test). In addition, we applied a permutation test to identify active electrodes (e.g., Fig. 1B ) and assess significance in the encoding models (e.g., Fig. 3C-D ). Detailed procedures are described in the corresponding Methods sections (see Methods: Active electrodes selection and Encoding modeling procedure). All statistical tests were two-tailed with a significance threshold of p < 0.05, unless stated otherwise. The false discovery rate (FDR) correction was applied when multiple comparisons were conducted. Data acquisition and preprocessing During the movie watching task, iEEG signals were recorded using one of two amplifier types (dictated by clinical location during acquisition): (1) NicoletOne amplifier (Natus Neurologics, Middleton, WI), with signals bandpass filtered from 0.16 to 250 Hz and digitized at 512 Hz; (2) Neuroworks Quantum Amplifier (Natus Biomedical, Appleton, WI), with signals recorded at a sampling rate of 2048 Hz, bandpass filtered between 0.01 to 682.67 Hz and then decimated to 512 Hz. For grid electrodes, a two-contact subdural strip facing the skull near the craniotomy site served as the reference electrode, and a similar strip screwed to the skull was used as the instrument ground. For sEEG electrodes, a five-lead subgaleal strip was placed facing the skull and used as both the ground and reference. Electrodes within the seizure onset zone (SOZ; Fig. S1A), or showing epileptiform activity, line noise artifacts, or large amplitude shifts were excluded. The data were then re-referenced using the common average reference (CAR) approach, in which the averaged signal was subtracted from all electrodes for each subject. To extract high-gamma broadband activity (70–150 Hz), we applied a multi-band averaging method. 
Specifically, we computed the z-scored analytic amplitude across eight logarithmically spaced frequency bands within this range using the Hilbert transform, and then averaged them [ 21 , 73 ]. This method enhances signal robustness as it takes the mean across multiple frequency bands and mitigates the bias of lower frequencies given the z-score normalization. Data were then corrected relative to the baseline (with silent, black scenes) for each channel. Electrode localization in individual space was performed by co-registering post-operative brain MRI or CT scans to preoperative MRI scans using a rigid-body transformation. Electrodes were then projected into the MNI space using a nonlinear DARTEL algorithm [ 74 ]. Further, anatomical locations of electrodes were determined using automated FreeSurfer segmentation of the preoperative MRI. Visualization of electrode locations on the cortical surface was performed using Mithra [ 75 ]. Active electrodes selection To identify the active electrodes involved in audiovisual processing during free viewing, we extracted the neural responses in four different audiovisual conditions ( Fig. 1A ): 1) English (EN) condition, in which dialogue is in English; 2) foreign language (FL) condition, in which characters communicate in other languages like French or German (English subtitles are provided to aid movie comprehension); 3) other sound (OS) condition, in which non-speech sounds are presented with visual scenes; and 4) silent (SI) condition, in which only visual scenes are presented without sound. Specifically, for the conditions with sound (i.e., EN, FL, and OS), we manually annotated the trial onsets and offsets based on the sound waveform and spectrogram using the Praat software ( https://www.fon.hum.uva.nl/praat/ ). The remaining intervals were then labeled as the silent period. To minimize the potential residual activity, we marked the onsets of silent periods 200-500 ms after the preceding sound offsets. 
Trials shorter than one second were excluded to ensure data stability and robustness. As a result, 44 EN trials, 46 FL trials, 58 OS trials, and 51 SI trials with comparable segment durations were included for analysis (one-way ANOVA: F (3, 198) = 1.020, p = 0.385, η² = 0.061; Fig. S3C). For each electrode, we aligned the onset of all trials and analyzed the potential changes within a one-second epoch in each condition. An electrode was considered active if its activity across trials was significantly greater than zero (one sample t -test) for a duration exceeding a threshold determined by the permutation procedure (Fig. S2). Specifically, in each permutation, we distorted the temporal alignment of the signals using a phase randomization approach [ 76 ], then identified the maximum number of consecutive significant time points (i.e., significantly above zero) using the same analysis. Repeating this procedure 1000 times yielded a null distribution of significant duration. Then, the p -value can be derived by calculating the proportion in the null distribution that exceeded the real significant duration. An electrode was considered significant if p < 0.05 and survived the false discovery rate (FDR) correction of multiple comparisons (Fig. S2C). The union of electrodes identified as active across the four conditions was used for subsequent analyses. Functional clustering analysis We performed unsupervised non-negative matrix factorization (NMF) to summarize neural underpinnings associated with audiovisual processing [ 21 , 23 ]. By introducing the non-negativity constraint, NMF is able to effectively derive the parts-based representation [ 77 ]. Mathematically, a non-negative data matrix ( D ∈ R m×n ) can be factorized into two components W ∈ R m×k and H ∈ R k×n by minimizing the error function iteratively (using the Frobenius norm): $$\min_{W \geq 0,\, H \geq 0} \lVert D - WH \rVert_F^2,$$ where the term k is a hyperparameter representing the number of factorized clusters. 
In practice, the matrix D represents the concatenated neural responses of active electrodes across four conditions (Fig. S3A). We imposed non-negativity in D by setting all negative elements to zero, as no significant negative activations were identified with the permutation test (see Methods: Active electrodes selection). W is the weighting matrix, representing the extent to which electrodes were assigned to each cluster. H is the temporal prototype matrix, demonstrating the typical temporal dynamics for each cluster. To identify the best number of clusters ( k ), we applied the silhouette method to measure the similarity of each sample to its own cluster relative to others [ 78 ]. The highest silhouette score, which indicates the optimal clustering assignment, was found when k = 2 (Fig. S3B). Further, we observed that the two factorized clusters were largely independent with minimal overlap (Fig. S3D). Therefore, electrodes were assigned to clusters based on their maximum contributions in the weighting matrix W , and were normalized (0-1) for visualization purposes ( Fig. 2A ). Audiovisual feature extraction We employed several computational models to derive generalizable feature spaces for both auditory and visual modalities ( Fig. 3A ). Specifically, we extracted features at two levels. At the low levels, we captured the spectrogram and motion energy features to represent auditory and visual information, respectively. For the spectrogram, we applied the librosa Python package [ 79 ] to extract spectral frequencies ranging from approximately 20 to 8000 Hz, which serves as an effective acoustic representation of auditory perception. For the motion energy features, we utilized a pyramid of non-linear spatiotemporal Gabor filters, which are designed to capture information across multiple positions, orientations, spatiotemporal frequencies, and motion directions [ 25 ]. 
This filter bank spans both small and large receptive field sizes, making it sensitive to both local changes and global motions. The feature extraction procedure was implemented with the pymoten package ( https://github.com/gallantlab/pymoten ). At the high levels, we leveraged transformer-based deep-learning models, which have demonstrated significant improvements in complicated downstream tasks [ 26 , 27 ]. The core mechanism of the transformer architecture is multihead self-attention, an algorithm that calculates context-dependent weights to integrate information across input vectors [ 80 ]. In practice, we extracted features from two representative models: wav2vec 2.0 for auditory information [ 28 ] and Vision Transformer (ViT) for visual information [ 29 ]. To avoid task-specific influences and capture representative features of audiovisual information, we selected vector embeddings from the middle layers, as prior research indicated that these layers produce the most comprehensive features for both the wav2vec 2.0 [ 53 , 81 ] and the ViT models [ 82 ]. In detail, for the auditory domain, we extracted 7th layer embeddings of transformer blocks in wav2vec 2.0-Base model (number of convolution layers = 7, number of transformer layers = 12, hidden size = 768, number of self-attention heads = 8, total parameters = 95 M) [ 28 ]. We kept the same sampling rate (16 kHz) and batch size (15.6 seconds) with the pretrain models during feature extraction to minimize potential input format biases. For the visual domain, we extracted the [CLS] embeddings from the 16th layer of the ViT-Huge model (number of transformer layers = 32, hidden size = 1280, number of self-attention heads = 16, total parameters = 632 M) [ 29 ]. [CLS] is an extra learnable “token” introduced to the transformer encoder, representing image representations for the classification task. Furthermore, to control the potential effect of visual subtitles in the Foreign Language condition ( Fig. 
4D ), we extracted semantic embeddings of these sentences utilizing the robustly optimized BERT approach (RoBERTa) model [ 83 ]. RoBERTa is a generalized BERT representational model, featuring a multi-layer bidirectional transformer encoder conditioned on both left and right contexts [ 84 ]. Compared with the original BERT model, RoBERTa model has a larger architecture, with a bigger batch size and more training data. Here, we applied the RoBERTa-Large model (the number of layers = 24, the hidden size = 1024, the number of self-attention heads = 16, total parameters = 355 M) [ 83 ], and selected the features from the penultimate hidden layers, as the final layers tend to be biased by the model training objectives (i.e., masked language model and next sentence prediction). Practically, we represented sentence embeddings by averaging all word vectors within each sentence [ 85 – 87 ]. All feature extraction procedures associated with deep-learning models were performed using the PyTorch (v2.1.0) framework [ 88 ] in Python (v3.10). Encoding modeling procedure We utilized encoding models to probe the neural substrates responsive to various features by leveraging the multivariate temporal response function (mTRF) [ 30 ]. Specifically, the mTRF is a linear model that incorporates time lags ( τ ) between features ( X ) and neural activities ( y ) ( Fig. 3B ). It can be formulated in a convolutional form: $$y(t) = \sum_{\tau} w(\tau)\, X(t - \tau) + \varepsilon(t),$$ where w represents the weights (also known as the mTRF filter) to be estimated, and ɛ is the additive noise. Both neural signals and features were z-scored prior to model training. Given the relatively large parameter spaces (number of lags × feature dimensions), we applied L2-norm regularization during parametric estimation to mitigate the potential over-fitting problem. The encoding model was implemented using four-fold cross-validation (CV), where the data were split into four groups - three groups used for training and the withheld one for testing. 
Model performance was assessed using Pearson’s correlation ( r value) between predicted ( ŷ ) and real signal ( y ). The ridge parameter was optimized using a grid search approach to maximize the r-value during training. Final model performance was calculated as the average r-value across all four folds. In addition, we decided to train mTRF models with delays up to one second (i.e., τ max = 1 s) for two reasons. First, since only trials longer than 1 second were included, using a consistent mTRF window ensures sufficient data for model estimation. Second, a 1-second window adequately captures key neural processes involved in audiovisual processing during natural movie viewing, including early sensory responses (50–200 ms) [ 73 , 89 ], audiovisual integration (100–300 ms) [ 65 – 67 ], and semantic processing (∼400 ms) [ 90 ]. This window choice is supported by our additional mTRF analyses using progressively increasing lag windows (0–600 ms, 0–800 ms, 0–1000 ms, and –100–1100 ms) for both auditory and visual modalities, which showed no systematic changes across window sizes ( r -values clustered around the diagonal; Fig. S6). Further, we conducted the encoding analysis on all electrodes rather than the active electrodes that survived from the permutation test, avoiding the potential double dipping problem [ 91 ]. Specifically, three types of mTRF models were primarily employed: 1) Auditory and visual mTRF models. The auditory and visual mTRF models were trained independently. For each modality, low- and high-level features were first concatenated, followed by a principal component analysis (PCA) procedure to reduce dimensionality to enhance computational efficiency ( Fig. 3A ). To determine the optimal number of PCs, we trained mTRF models using 5 to 50 principal components (PCs) in increments of 5 PCs (incremental PCs test). Then, we compared the average r-values to identify the elbow point. As a result, this procedure revealed that 15 PCs were optimal (Fig. 
S4A). 2) Audiovisual assignment mTRF model. The averaged global modality feature obtained from the behavior experiment ( Fig. 5F ) was fed into the encoding model. 3) Subtitle mTRF model. The dimension of the subtitle BERT features in the foreign language (FL) condition was also relatively large (i.e., 1024 dimensions). Therefore, PCA was also employed here to reduce the feature dimensions. In the end, 50 PCs were selected for the subtitles, explaining 95.01% variance of the feature space. Furthermore, we evaluated the statistical significance level of the model performances using a permutation test, wherein the neural data was phase-randomized for 1000 iterations. We repeated the above encoding model training procedure for each permutation to generate a null distribution of r-values. Then, the p-value can be derived by calculating the proportion of r-values in the null distribution that exceeded the real r-value. An electrode was considered significant if p < 0.05 and survived the false discovery rate (FDR) correction of multiple comparisons. Timing analysis of encoding models To estimate the latencies of the neural responses, we identified the time delays corresponding to the peak weights in the mTRF filters for each electrode (Fig. S4C). The peak timings were first grouped by ROI (Fig. S1B). Then, we applied a bootstrap procedure to assess the response lags for each ROI. Specifically, we estimated the 95% confidence intervals (CI) by sampling the data with replacement and calculated the mean for 10000 iterations. A permutation test was conducted to determine statistical significance across different ROIs ( Fig. 3E ). For each comparison, the group labels of each element were randomly shuffled for 1000 iterations, yielding a null distribution of mean differences. Then, the p -value was calculated by determining the position of the real difference in the null distribution. The resulting p -value was corrected for multiple comparisons using the FDR method. 
Frontal ventral-dorsal gradient analysis We constructed the AV index and frontal coordinate to quantify the representational distribution of audiovisual information in the prefrontal cortex. Specifically, the AV index ( $I_{AV}$ ) was applied to capture the electrode selectivity for auditory or visual modalities, based on r-values obtained from the auditory and visual mTRF models ( $r_A$ and $r_V$ ): $$I_{AV} = \frac{r_V - r_A}{r_V + r_A}$$ By definition, $I_{AV}$ ranges from -1 to 1, where 1 indicates complete visual tuning of the electrode and -1 indicates complete auditory tuning. In addition, we established a coordinate system to quantify the ventral-to-dorsal gradient of the frontal cortex. Conventionally, the ventrodorsal organization of the frontal cortex has been characterized based on cytoarchitecture and connectivity, where dlPFC corresponds to Brodmann areas (BA) 46 and 9/46 and vlPFC corresponds to BA 45 and 47/12 [ 92 , 93 ]. This dichotomy aligns with the division between the inferior frontal gyrus (IFG) and middle frontal gyrus (MFG), demarcated by the inferior frontal sulcus (IFS). Notably, this boundary is not strictly cardinal (i.e., horizontal or vertical), necessitating the use of both y and z MNI coordinates to accurately capture the ventrodorsal distinction. To this end, we defined a frontal polar coordinate system within the y - z plane of the MNI space, taking the intersection of the precentral sulcus and the Sylvian fissure as the origin (MNI coordinate: [-55, 15, -8]; please note that the x coordinate was not critical in this analysis). In this system, a smaller radius ( d , mm) denotes a more ventral position, whereas a larger radius corresponds to a more dorsal location ( Fig. 4A, F ). This approach also enables us to quantify the modality transition in a fully data-driven manner without imposing rigid anatomical assumptions. Encoding model partitioning We employed a partitioning procedure to obtain the unique effect of a specific feature space [ 13 ]. 
For instance, to quantify the unique effects of the low- and high-level audiovisual features ( Fig. 4C ), we trained and evaluated models using either low- or high-level features ( $M_l$ and $M_h$ ), and compared them with models trained using both feature spaces ( $M_{l \cup h}$ ). Then, the pure contribution of the low-level features was computed as the difference between the combined model and the high-level model ( $M_{l \cup h} - M_h$ ), and that of high-level features was determined as the difference between the combined model and the low-level model ( $M_{l \cup h} - M_l$ ). We fitted all mTRF models mentioned above with 15 dimensions using PCA to ensure comparable model complexity. Similarly, to quantify the unique contributions of audiovisual assignment and features, we trained separate models using either audiovisual assignment ( $M_a$ ) or audiovisual features ( $M_f$ ), as well as a combined model incorporating both ( $M_{a \cup f}$ ). The unique effect of the modality assignment was calculated as the difference between the combined model and the audiovisual features model ( $M_{a \cup f} - M_f$ ), while the unique contribution of audiovisual features was determined conversely ( $M_{a \cup f} - M_a$ ; Fig. S10A). In addition, to determine the influence of the subtitles, we built a full model that combined both the visual and subtitle features. Then, the unique subtitle effect was obtained by calculating the decrease in r -values when subtitle features were removed from the full model ( Fig. 4D-E ; only for the Foreign Language condition). Data and Code availability The dataset of the current study will be made available by the authors upon request, provided that documentation is supplied confirming that the data will be used strictly for research purposes and in compliance with the terms of our study IRB. The code will be available upon publication at https://github.com/flinkerlab/ . Author Contributions F.Z., A.K.-G. and A.F. conceived and designed the research; P.D. and D.F. provided clinical care; W.D. and P.R. 
provided clinical care and performed neurosurgery; A.M. and O.D. provided clinical care and critically revised the article; F.Z. and A.F. wrote the paper. Declarations The authors declare no competing interests. Supplementary Information Download figure Open in new tab Fig. S1: (A) The electrode coverage (upper panel; 2688 contacts in total) and the seizure onset zone (SOZ) electrode distribution (lower panel; 232 contacts in total, including depth and surface contacts, which were removed from analysis) for all 19 participants involved in the movie viewing task. (B) Three ROIs used in this study, including the superior temporal gyrus (STG), frontal regions (comprising the inferior frontal gyrus, middle frontal gyrus, and pre-central gyrus), and the occipital regions (comprising the lateral occipital gyri, lingual gyrus, and cuneus gyrus). (C) The brain maps of the active electrodes for all four conditions that survived the permutation test (Fig. S2; see Methods: Active electrodes selection). Download figure Open in new tab Fig. S2: Permutation procedure for selecting the active electrodes. (A) Representative signals from two electrodes (right panel): one located in the STG (purple) and the other located in the ITG (green). The STG electrode exhibited responses to auditory events (left panel), whereas the ITG electrode showed no response. (B) Event-related analysis. The neural activity during the events was aligned to the onset and averaged across time (the temporal window was set as 1 second). Then, the statistical significance was determined for each time point using a two-sided t -test with p < 0.05. The maximum length of the consecutive significant time points (i.e., maximum significant length ( L max )) was recorded. (C) Permutation test. We repeated the event-related analysis on the phase-randomized signals 1000 times, and computed the maximum significant length for each permutation ( $L_{\max}^{\mathrm{perm}}$ ). This procedure generated a null distribution of $L_{\max}^{\mathrm{perm}}$ . 
The real length L max was then compared against this null distribution, with significance determined at the top 5th percentile threshold (FDR corrected). Download figure Open in new tab Fig. S3: Unsupervised functional clustering. (A) The data for all four conditions were concatenated and then decomposed into a weight matrix and a temporal prototype matrix (see Methods: Functional clustering analysis). (B) The result of the silhouette method used to determine the optimal number of clusters (k). Higher scores indicate better clustering performance. The shaded area around the curve represents the standard error (SE). (C) Bar plots of trial lengths across four conditions. Each dot represents an individual trial. (D) The brain maps for the two NMF clusters separately. Download figure Open in new tab Fig. S4: (A) Results of the incremental principal components (PCs) test across all the mTRF models (upper panel), auditory mTRF models (middle panel), and visual mTRF models (lower panel). The red error bars indicate the elbow points identified using the kneed Python toolbox, which employs a rotation-based algorithm to detect the maximum curvature [ 94 ]. (B) Brain map showing significant electrodes for both audio and visual mTRF models. (C) Timing of peak weights across electrodes that showed significant responses in the auditory and visual mTRF models. Download figure Open in new tab Fig. S5: Results of encoding models for auditory (A) and visual (B) information in four conditions. N represents the number of significant electrodes. Download figure Open in new tab Fig. S6: Brain maps and r -value comparisons across increasing lag windows (0–600 ms, 0–800 ms, 0–1000 ms, and –100–1100 ms) for both auditory (A) and visual (B) mTRF models. No permutation test or multiple-comparison correction was applied; a hard threshold of r > 0.1 was used for visualization. Download figure Open in new tab Fig. 
S7: (A) The mTRF correlation maps in the parietal regions (including the post-central gyrus, inferior parietal lobe, and supramarginal gyrus) for the English (EN) and Foreign Language (FL) conditions. (B) Bar plots of the auditory and visual mTRF models in the parietal cortex for the EN and FL conditions. Error bars represent 95% confidence interval (CI). Download figure Open in new tab Fig. S8: Anatomical distributions for the NMF, mTRF and active electrodes. (A) Nine ROIs illustrated on the brain. (B) Pie charts showing the anatomical distribution of electrodes from the auditory and visual functional clusters (NMF approach). (C) Pie charts showing the anatomical distribution of the active electrodes. (D) Pie charts depicting the anatomical distribution of significant electrodes from the auditory and visual encoding models (mTRF approach). (E) Pie chart showing the anatomical distribution of significant electrodes responsive to the audiovisual assignment (i.e., the global modality). The total number of electrodes within each ROI, and the number (and percentage) of electrodes for figure B-E are shown in Table S2. Download figure Open in new tab Fig. S9: (A) The attentional engagement ratings from all participants over time (upper panel) and the average time series across participants (lower panel). (B) Illustration of the engagement rating task conducted on the Amazon Mechanical Turk platform. (C) Correlations between attentional engagement ratings and global context (one-sample t test; T (25) = 11.987, p < 0.001, d = 2.351), local modality (one-sample t test; T (25) = 2.068, p = 0.074, d = 0.406), as well as global modality (one-sample t test; T (25) = −0.608, p = 0.549, d = 0.119). Each dot represents an individual rater. (D) The mTRF result for the attentional engagement effect. (E) The mTRF results for the original global modality (left) and after removing the attentional effect (right). (F) Comparison of r values between the two models across all electrodes. 
(G) Markers of abrupt changes in the averaged global modality (black; quantified as its first derivative) and the film cuts (red). (H) Correlations between the film cuts and the global modality changes across participants. Each dot represents an individual rater. Error bars in figure C and H represent 95% CI. Download figure Open in new tab Fig. S10: (A) The unique contributions for modality assignment and audiovisual features in frontal electrodes shown in Fig. 6A (see Methods: Encoding model partitioning). A significant effect was observed only for the audiovisual assignment (one sample t -test, t (33) = 7.968, p < 0.001, d = 1.366, FDR corrected) but not for the audiovisual features (one sample t -test, t (33) = −0.927, p = 0.361, d = 0.159, FDR corrected). The error bars represent 95% CI. (B) Timing analysis of audiovisual (AV) assignment. The timing of AV assignment in the frontal cortex (297 ± 47 ms) was significantly later than the neural representation of AV features in the perceptual regions. A significant difference was also observed between the timing of encoding the AV features and the modality assignment in the frontal regions. The distributions were estimated based on a bootstrap procedure, and the statistical significance between regions was assessed via a permutation test (see Methods: Timing analysis of encoding models). The error bars represent the standard deviation (SD). Significance levels are set as p < 0.001 (***), p < 0.01 (**), p < 0.05 (*), and p ≥ 0.05 (n.s.). View this table: View inline View popup Download powerpoint Table S1: Patient information View this table: View inline View popup Download powerpoint Table S2: Electrode anatomical coverage and significant responses Acknowledgements We thank A. Ferrari, M. Landy, and S. Michelmann for their comments on an early version of the manuscript, A. Morton for sharing the movie license for academic use, and other members in Flinker lab for extensive discussion. 
This work was supported by National Institutes of Health grants R01NS109367, R01NS115929, and R01DC018805 (A.F.) and National Science Foundation grant IIS-2309057 (A.F.). Funder Information Declared National Institutes of Health , R01NS109367 , R01NS115929 , R01DC018805 National Science Foundation , IIS-2309057 Footnotes Figures 2 and 4 are revised. Supplementary Figures 1, 3, 4, 6, 7, and 9 are also updated. Additional control analyses have been described and discussed in the main text. References [1]. ↵ French , R.L. , DeAngelis , G.C .: Multisensory neural processing: from cue integration to causal inference . Current Opinion in Physiology 16 , 8 – 13 ( 2020 ) doi: 10.1016/j.cophys.2020.04.004 . Accessed 2023-06-06 OpenUrl CrossRef PubMed [2]. ↵ Noppeney , U .: Perceptual Inference, Learning, and Attention in a Multisensory World . Annual Review of Neuroscience 44 ( 1 ), 449 – 473 ( 2021 ) doi: 10.1146/annurev-neuro-100120-085519 . Accessed 2024-05-09 OpenUrl CrossRef PubMed [3]. ↵ Campbell , R. : The processing of audio-visual speech: empirical and neural bases . Philosophical Transactions of the Royal Society B: Biological Sciences 363 ( 1493 ), 1001 – 1010 ( 2007 ) doi: 10.1098/rstb.2007.2155 . Accessed 2024-02-09 OpenUrl CrossRef [4]. Cappe , C. , Rouiller , E.M. , Barone , P .: Multisensory anatomical pathways . Hearing Research 258 ( 1-2 ), 28 – 36 ( 2009 ) doi: 10.1016/j.heares.2009.04.017 . Accessed 2025-01-09 OpenUrl CrossRef PubMed Web of Science [5]. ↵ Gao , C. , Green , J.J. , Yang , X. , Oh , S. , Kim , J. , Shinkareva , S.V .: Audiovisual integration in the human brain: a coordinate-based meta-analysis . Cerebral Cortex 33 ( 9 ), 5574 – 5584 ( 2023 ) doi: 10.1093/cercor/bhac443 . Accessed 2024-02-02 OpenUrl CrossRef PubMed [6]. ↵ Sugihara , T. , Diltz , M.D. , Averbeck , B.B. , Romanski , L.M .: Integration of Auditory and Visual Communication Information in the Primate Ventrolateral Prefrontal Cortex . 
The Journal of Neuroscience 26 ( 43 ), 11138 – 11147 ( 2006 ) doi: 10.1523/JNEUROSCI.3550-06.2006 . Accessed 2024-02-09 OpenUrl Abstract / FREE Full Text [7]. ↵ Noppeney , U. , Ostwald , D. , Werner , S .: Perceptual Decisions Formed by Accumulation of Audiovisual Evidence in Prefrontal Cortex . The Journal of Neuroscience 30 ( 21 ), 7434 – 7446 ( 2010 ) doi: 10.1523/JNEUROSCI.0455-10.2010 . Accessed 2024-09-12 OpenUrl Abstract / FREE Full Text [8]. ↵ Romanski , L.M. : Integration of faces and vocalizations in ventral prefrontal cortex: Implications for the evolution of audiovisual speech . Proceedings of the National Academy of Sciences 109 (supplement 1), 10717 – 10724 ( 2012 ) doi: 10.1073/pnas.1204335109 . Accessed 2024-02-13 OpenUrl Abstract / FREE Full Text [9]. ↵ Bushara , K.O. , Hanakawa , T. , Immisch , I. , Toma , K. , Kansaku , K. , Hallett , M .: Neural correlates of cross-modal binding . Nature Neuroscience 6 ( 2 ), 190 – 195 ( 2003 ) doi: 10.1038/nn993 . Accessed 2024-09-12 OpenUrl CrossRef PubMed Web of Science [10]. ↵ Ozker , M. , Yoshor , D. , Beauchamp , M.S .: Frontal cortex selects representations of the talker’s mouth to aid in speech perception . eLife 7 , 30387 ( 2018 ) doi: 10.7554/eLife.30387 . Accessed 2023-09-07 OpenUrl CrossRef PubMed [11]. ↵ Sonkusare , S. , Breakspear , M. , Guo , C. : Naturalistic Stimuli in Neuroscience: Critically Acclaimed . Trends in Cognitive Sciences 23 ( 8 ), 699 – 714 ( 2019 ) doi: 10.1016/j.tics.2019.05.004 OpenUrl CrossRef PubMed [12]. ↵ Nastase , S.A. , Goldstein , A. , Hasson , U .: Keep it real: rethinking the primacy of experimental control in cognitive neuroscience . NeuroImage 222 , 117254 ( 2020 ) doi: 10.1016/j.neuroimage.2020.117254 . Accessed 2024-01-03 OpenUrl CrossRef PubMed [13]. ↵ Khosla , M. , Ngo , G.H. , Jamison , K. , Kuceyeski , A. , Sabuncu , M.R .: Cortical response to naturalistic stimuli is largely predictable with deep neural networks . 
Science Advances 7 ( 22 ) ( 2021 ) doi: 10.1126/sciadv.abe7547 OpenUrl FREE Full Text [14]. ↵ Setti , F. , Handjaras , G. , Bottari , D. , Leo , A. , Diano , M. , Bruno , V. , Tinti , C. , Cecchetti , L. , Garbarini , F. , Pietrini , P. , Ricciardi , E .: A modality-independent proto-organization of human multisensory areas . Nature Human Behaviour ( 2023 ) doi: 10.1038/s41562-022-01507-3 . Accessed 2023-01-24 OpenUrl CrossRef [15]. ↵ Parvizi , J. , Kastner , S .: Promises and limitations of human intracranial electroencephalography . Nature Neuroscience 21 ( 4 ), 474 – 483 ( 2018 ) doi: 10.1038/s41593-018-0108-2 OpenUrl CrossRef PubMed [16]. ↵ Flinker , A. , Chang , E.F. , Kirsch , H.E. , Barbaro , N.M. , Crone , N.E. , Knight , R.T .: Single-Trial Speech Suppression of Auditory Cortex Activity in Humans . Journal of Neuroscience 30 ( 49 ), 16643 – 16650 ( 2010 ) doi: 10.1523/jneurosci.1809-10.2010 OpenUrl Abstract / FREE Full Text [17]. ↵ Mukamel , R. , Gelbard , H. , Arieli , A. , Hasson , U. , Fried , I. , Malach , R .: Coupling Between Neuronal Firing, Field Potentials, and fMRI in Human Auditory Cortex . Science 309 ( 5736 ), 951 – 954 ( 2005 ) doi: 10.1126/science.1110913 . Accessed 2025-01-13 OpenUrl Abstract / FREE Full Text [18]. Nir , Y. , Fisch , L. , Mukamel , R. , Gelbard-Sagiv , H. , Arieli , A. , Fried , I. , Malach , R .: Coupling between Neuronal Firing Rate, Gamma LFP, and BOLD fMRI Is Related to Interneuronal Correlations . Current Biology 17 ( 15 ), 1275 – 1285 ( 2007 ) doi: 10.1016/j.cub.2007.06.066 . Accessed 2025-09-07 OpenUrl CrossRef PubMed Web of Science [19]. ↵ Ray , S. , Crone , N.E. , Niebur , E. , Franaszczuk , P.J. , Hsiao , S.S .: Neural Correlates of High-Gamma Oscillations (60–200 Hz) in Macaque Local Field Potentials and Their Potential Implications in Electrocorticography . The Journal of Neuroscience 28 ( 45 ), 11526 – 11536 ( 2008 ) doi: 10.1523/JNEUROSCI.2848-08.2008 . 
Accessed 2025-01-13 OpenUrl Abstract / FREE Full Text [20]. ↵ Hermes , D. , Miller , K.J. , Vansteensel , M.J. , Aarnoutse , E.J. , Leijten , F.S.S. , Ramsey , N.F .: Neurophysiologic correlates of fMRI in human motor cortex . Human Brain Mapping 33 ( 7 ), 1689 – 1699 ( 2012 ) doi: 10.1002/hbm.21314 . Accessed 2023-05-17 OpenUrl CrossRef PubMed Web of Science [21]. ↵ Yu , L. , Dugan , P. , Doyle , W. , Devinsky , O. , Friedman , D. , Flinker , A .: A left-lateralized dorsolateral prefrontal network for naming . Cell Reports ( 2025 ) doi: 10.1101/2024.05.15.594403 . Accessed 2025-02-06 OpenUrl Abstract / FREE Full Text [22]. Khalilian-Gourtani , A. , Wang , R. , Chen , X. , Yu , L. , Dugan , P. , Friedman , D. , Doyle , W. , Devinsky , O. , Wang , Y. , Flinker , A .: A Corollary Discharge Circuit in Human Speech . Proceedings of the National Academy of Sciences 121 ( 50 ) ( 2024 ) doi: 10.1101/2022.09.12.507590 . Accessed 2022-09-29 OpenUrl CrossRef [23]. ↵ Hamilton , L.S. , Edwards , E. , Chang , E.F .: A Spatial Map of Onset and Sustained Responses to Speech in the Human Superior Temporal Gyrus . Current Biology 28 ( 12 ), 1860 ( 2018 ) doi: 10.1016/j.cub.2018.04.033 OpenUrl CrossRef PubMed [24]. ↵ Holdgraf , C.R. , Rieger , J.W. , Micheli , C. , Martin , S. , Knight , R.T. , Theunissen , F.E . : Encoding and Decoding Models in Cognitive Electrophysiology . Frontiers in Systems Neuroscience 11 ( 2017 ) doi: 10.3389/fnsys.2017.00061 OpenUrl CrossRef PubMed [25]. ↵ Nishimoto , S. , Vu , A. , Naselaris , T. , Benjamini , Y. , Yu , B. , Gallant , J .: Reconstructing Visual Experiences from Brain Activity Evoked by Natural Movies . Current Biology 21 ( 19 ), 1641 – 1646 ( 2011 ) doi: 10.1016/j.cub.2011.08.031 . Accessed 2023-12-07 OpenUrl CrossRef PubMed [26]. ↵ Khan , S. , Naseer , M. , Hayat , M. , Zamir , S.W. , Khan , F.S. , Shah , M. : Transformers in Vision: A Survey . ACM Computing Surveys 54 ( 10 s), 1 – 41 ( 2022 ) doi: 10.1145/3505244 . 
Accessed 2025-01-31 OpenUrl CrossRef [27]. ↵ Gillioz , A. , Casas , J. , Mugellini , E. , Khaled , O.A .: Overview of the Transformer-based Models for NLP Tasks , pp. 179 – 183 ( 2020 ). doi: 10.15439/2020F20 . https://annals-csis.org/Volume21/drp/20.html Accessed 2025-01-31 OpenUrl CrossRef [28]. ↵ Baevski , A. , Zhou , H. , Mohamed , A. , Auli , M. : wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. arXiv. arXiv:2006.11477 [cs, eess] ( 2020 ). http://arxiv.org/abs/2006.11477 Accessed 2022-11-09 [29]. ↵ Dosovitskiy , A. , Beyer , L. , Kolesnikov , A. , Weissenborn , D. , Zhai , X. , Unterthiner , T. , Dehghani , M. , Minderer , M. , Heigold , G. , Gelly , S. , Uszkoreit , J. , Houlsby , N. : An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale . arXiv. arXiv:2010.11929 [cs] ( 2021 ). http://arxiv.org/abs/2010.11929 Accessed 2023-07-17 [30]. ↵ Crosse , M.J. , Di Liberto , G.M. , Bednar , A. , Lalor , E.C. : The Multivariate Temporal Response Function (mTRF) Toolbox: A MATLAB Toolbox for Relating Neural Signals to Continuous Stimuli . Frontiers in Human Neuroscience 10 ( 2016 ) doi: 10.3389/fnhum.2016.00604 . Accessed 2023-01-25 OpenUrl CrossRef PubMed [31]. ↵ Mesgarani , N. , Cheung , C. , Johnson , K. , Chang , E.F .: Phonetic Feature Encoding in Human Superior Temporal Gyrus . Science 343 ( 6174 ), 1006 – 1010 ( 2014 ) doi: 10.1126/science.1245994 OpenUrl Abstract / FREE Full Text [32]. ↵ Broderick , M.P. , Anderson , A.J. , Di Liberto , G.M. , Crosse , M.J. , Lalor , E.C .: Electrophysiological Correlates of Semantic Dissimilarity Reflect the Comprehension of Natural, Narrative Speech . Current Biology 28 ( 5 ), 803 ( 2018 ) doi: 10.1016/j.cub.2018.01.080 OpenUrl CrossRef PubMed [33]. ↵ Nentwich , M. , Leszczynski , M. , Russ , B.E. , Hirsch , L. , Markowitz , N. , Sapru , K. , Schroeder , C.E. , Mehta , A.D. , Bickel , S. 
, Parra , L.C .: Semantic novelty modulates neural responses to visual change across the human brain . Nature Communications 14 ( 1 ), 2910 ( 2023 ) doi: 10.1038/s41467-023-38576-5 . Accessed 2023-12-09 OpenUrl CrossRef PubMed [34]. ↵ Rohe , T. , Noppeney , U .: Cortical Hierarchies Perform Bayesian Causal Inference in Multisensory Perception . PLOS Biology 13 ( 2 ), 1002073 ( 2015 ) doi: 10.1371/journal.pbio.1002073 . Accessed 2023-09-15 OpenUrl CrossRef PubMed [35]. ↵ Ferrari , A. , Noppeney , U .: Attention controls multisensory perception via two distinct mechanisms at different levels of the cortical hierarchy . PLOS Biology 19 ( 11 ), 3001465 ( 2021 ) doi: 10.1371/journal.pbio.3001465 . Accessed 2024-08-09 OpenUrl CrossRef PubMed [36]. ↵ Körding , K.P. , Beierholm , U. , Ma , W.J. , Quartz , S. , Tenenbaum , J.B. , Shams , L. : Causal Inference in Multisensory Perception . PLoS ONE 2 ( 9 ), 943 ( 2007 ) doi: 10.1371/journal.pone.0000943 . Accessed 2023-09-15 OpenUrl CrossRef PubMed [37]. Talsma , D. , Senkowski , D. , Soto-Faraco , S. , Woldorff , M.G .: The multifaceted interplay between attention and multisensory integration . Trends in Cognitive Sciences 14 ( 9 ), 400 – 410 ( 2010 ) doi: 10.1016/j.tics.2010.06.008 . Accessed 2024-05-20 OpenUrl CrossRef PubMed Web of Science [38]. ↵ Fetsch , C.R. , Pouget , A. , DeAngelis , G.C. , Angelaki , D.E .: Neural correlates of reliability-based cue weighting during multisensory integration . Nature Neuroscience 15 ( 1 ), 146 – 154 ( 2012 ) doi: 10.1038/nn.2983 . Accessed 2023-10-04 OpenUrl CrossRef PubMed [39]. ↵ Cao , Y. , Summerfield , C. , Park , H. , Giordano , B.L. , Kayser , C .: Causal Inference in the Multisensory Brain . Neuron 102 ( 5 ), 1076 – 1087.e8 ( 2019 ) doi: 10.1016/j.neuron.2019.03.043 . Accessed 2023-01-24 OpenUrl CrossRef PubMed [40]. ↵ Zheng , J. , Schjetnan , A.G.P. , Yebra , M. , Gomes , B.A. , Mosher , C.P. , Kalia , S.K. , Valiante , T.A. , Mamelak , A.N. , Kreiman , G. 
, Rutishauser , U .: Neurons detect cognitive boundaries to structure episodic memories in humans . Nature Neuroscience 25 ( 3 ), 358 – 368 ( 2022 ) doi: 10.1038/s41593-022-01020-w . Accessed 2022-09-20 OpenUrl CrossRef PubMed [41]. ↵ Clarke , J.M. , Halgren , E. , Scarabin , J.-M. , Chauvel , P .: Auditory and visual sensory representations in human prefrontal cortex as revealed by stimulus-evoked spike-wave complexes . Brain 118 ( 2 ), 473 – 484 ( 1995 ) doi: 10.1093/brain/118.2.473 . Accessed 2025-09-07 OpenUrl CrossRef PubMed Web of Science [42]. Michalka , S. , Kong , L. , Rosen , M. , Shinn-Cunningham , B. , Somers , D .: Short-Term Memory for Space and Time Flexibly Recruit Complementary Sensory-Biased Frontal Lobe Attention Networks . Neuron 87 ( 4 ), 882 – 892 ( 2015 ) doi: 10.1016/j.neuron.2015.07.028 . Accessed 2024-01-18 OpenUrl CrossRef PubMed [43]. Noyce , A.L. , Cestero , N. , Michalka , S.W. , Shinn-Cunningham , B.G. , Somers , D.C . : Sensory-Biased and Multiple-Demand Processing in Human Lateral Frontal Cortex . The Journal of Neuroscience 37 ( 36 ), 8755 – 8766 ( 2017 ) doi: 10.1523/JNEUROSCI.0660-17.2017 . Accessed 2024-05-28 OpenUrl Abstract / FREE Full Text [44]. ↵ Noyce , A.L. , Lefco , R.W. , Brissenden , J.A. , Tobyne , S.M. , Shinn-Cunningham , B.G. , Somers , D.C .: Extended Frontal Networks for Visual and Auditory Working Memory . Cerebral Cortex 32 ( 4 ), 855 – 869 ( 2022 ) doi: 10.1093/cercor/bhab249 . Accessed 2024-04-11 OpenUrl CrossRef PubMed [45]. ↵ Romanski , L.M .: Domain specificity in the primate prefrontal cortex. Cognitive, Affective , & Behavioral Neuroscience 4 ( 4 ), 421 – 429 ( 2004 ) doi: 10.3758/CABN.4.4.421 . Accessed 2025-09-07 OpenUrl CrossRef PubMed [46]. ↵ Romanski , L.M . : Representation and Integration of Auditory and Visual Stimuli in the Primate Ventral Lateral Prefrontal Cortex . Cerebral Cortex 17 ( suppl 1 ), 61 – 69 ( 2007 ) doi: 10.1093/cercor/bhm099 . 
Accessed 2025-09-07 OpenUrl CrossRef PubMed Web of Science [47]. ↵ Plakke , B. , Romanski , L.M .: Auditory connections and functions of prefrontal cortex . Frontiers in Neuroscience 8 ( 2014 ) doi: 10.3389/fnins.2014.00199 . Accessed 2025-10-20 OpenUrl CrossRef PubMed [48]. ↵ Rilling , J.K. , Glasser , M.F. , Preuss , T.M. , Ma , X. , Zhao , T. , Hu , X. , Behrens , T.E.J .: The evolution of the arcuate fasciculus revealed with comparative DTI . Nature Neuroscience 11 ( 4 ), 426 – 428 ( 2008 ) doi: 10.1038/nn2072 . Accessed 2025-10-20 OpenUrl CrossRef PubMed Web of Science [49]. ↵ Giampiccolo , D. , Herbet , G. , Duffau , H .: The inferior frontooccipital fasciculus: bridging phylogeny, ontogeny and functional anatomy . Brain 148 ( 5 ), 1507 – 1525 ( 2025 ) doi: 10.1093/brain/awaf055 . Accessed 2025-10-20 OpenUrl CrossRef [50]. ↵ Honey , C.J. , Thesen , T. , Donner , T.H. , Silbert , L.J. , Carlson , C.E. , Devinsky , O. , Doyle , W.K. , Rubin , N. , Heeger , D.J. , Hasson , U .: Slow Cortical Dynamics and the Accumulation of Information over Long Timescales . Neuron 76 ( 2 ), 423 – 434 ( 2012 ) doi: 10.1016/j.neuron.2012.08.011 OpenUrl CrossRef PubMed Web of Science [51]. ↵ Fedorenko , E. , Scott , T.L. , Brunner , P. , Coon , W.G. , Pritchett , B. , Schalk , G. , Kanwisher , N .: Neural correlate of the construction of sentence meaning . Proceedings of the National Academy of Sciences 113 ( 41 ), 6256 – 6262 ( 2016 ) doi: 10.1073/pnas.1612132113 OpenUrl Abstract / FREE Full Text [52]. ↵ Wallach , H. , Larochelle , H. , Beygelzimer , A. , Alché-Buc , F.d. , Fox , E. , Garnett , R. [52] Toneva , M. , Wehbe , L. : Interpreting and improving natural-language processing (in machines) with natural language-processing (in the brain) . In: Wallach , H. , Larochelle , H. , Beygelzimer , A. , Alché-Buc , F.d. , Fox , E. , Garnett , R. (eds.) Advances in Neural Information Processing Systems , vol. 32. Curran Associates, Inc. ,( 2019 ). 
https://proceedings.neurips.cc/paper/2019/file/749a8e6c231831ef7756db230b4359c8-Paper.pdf [53]. ↵ Millet , J. , Caucheteux , C. , Orhan , P. , Boubenec , Y. , Gramfort , A. , Dunbar , E. , Pallier , C. , King , J.-R .: Toward a realistic model of speech processing in the brain with self-supervised learning . 36th Conference on Neural Information Processing Systems (NeurIPS 2022) ( 2022 ). arXiv:2206.01685 [cs, q-bio]. Accessed 2022-11-17 [54]. Goldstein , A. , Zada , Z. , Buchnik , E. , Schain , M. , Price , A. , Aubrey , B. , Nastase , S.A. , Feder , A. , Emanuel , D. , Cohen , A. , Jansen , A. , Gazula , H. , Choe , G. , Rao , A. , Kim , C. , Casto , C. , Fanda , L. , Doyle , W. , Friedman , D. , Dugan , P. , Melloni , L. , Reichart , R. , Devore , S. , Flinker , A. , Hasenfratz , L. , Levy , O. , Hassidim , A. , Brenner , M. , Matias , Y. , Norman , K.A. , Devinsky , O. , Hasson , U .: Shared computational principles for language processing in humans and deep language models . Nature Neuroscience 25 ( 3 ), 369 – 380 ( 2022 ) doi: 10.1038/s41593-022-01026-4 OpenUrl CrossRef PubMed [55]. Wang , A.Y. , Kay , K. , Naselaris , T. , Tarr , M.J. , Wehbe , L .: Better models of human high-level visual cortex emerge from natural language supervision with a large and diverse dataset . Nature Machine Intelligence 5 ( 12 ), 1415 – 1426 ( 2023 ) doi: 10.1038/s42256-023-00753-y . Accessed 2024-10-20 OpenUrl CrossRef [56]. ↵ Zada , Z. , Goldstein , A. , Michelmann , S. , Simony , E. , Price , A. , Hasenfratz , L. , Barham , E. , Zadbood , A. , Doyle , W. , Friedman , D. , Dugan , P. , Melloni , L. , Devore , S. , Flinker , A. , Devinsky , O. , Nastase , S.A. , Hasson , U .: A shared model-based linguistic space for transmitting our thoughts from brain to brain in natural conversations . Neuron , 0896627324004604 ( 2024 ) doi: 10.1016/j.neuron.2024.06.025 . Accessed 2024-08-20 OpenUrl CrossRef [57]. ↵ Braga , R.M. , Hellyer , P.J. , Wise , R.J.S. 
, Leech , R .: Auditory and visual connectivity gradients in frontoparietal cortex . Human Brain Mapping ( 2017 ) [58]. ↵ Huntenburg , J.M. , Bazin , P.L. , Margulies , D.S .: Large-Scale Gradients in Human Cortical Organization . Trends in Cognitive Sciences 22 ( 1 ), 21 – 31 ( 2018 ) doi: 10.1016/j.tics.2017.11.002 OpenUrl CrossRef PubMed [59]. Bernhardt , B.C. , Smallwood , J. , Keilholz , S. , Margulies , D.S .: Gradients in brain organization . NeuroImage 251 , 118987 ( 2022 ) doi: 10.1016/j.neuroimage.2022.118987 . Accessed 2025-04-12 OpenUrl CrossRef [60]. ↵ Abdallah , M. , Zanitti , G.E. , Iovene , V. , Wassermann , D .: Functional gradients in the human lateral prefrontal cortex revealed by a comprehensive coordinate-based meta-analysis . eLife 11 , 76926 ( 2022 ) doi: 10.7554/eLife.76926 . Accessed 2025-09-07 OpenUrl CrossRef PubMed [61]. ↵ Ernst , M.O. , Banks , M.S .: Humans integrate visual and haptic information in a statistically optimal fashion . Nature 415 ( 6870 ), 429 – 433 ( 2002 ) doi: 10.1038/415429a . Accessed 2024-10-01 OpenUrl CrossRef PubMed Web of Science [62]. ↵ Hillis , J.M. , Ernst , M.O. , Banks , M.S. , Landy , M.S. : Combining Sensory Information: Mandatory Fusion Within, but Not Between, Senses . Science 298 ( 5598 ), 1627 – 1630 ( 2002 ) doi: 10.1126/science.1075396 . Accessed 2025-03-12 OpenUrl Abstract / FREE Full Text [63]. ↵ Kringelbach , M.L. , Perl , Y.S. , Tagliazucchi , E. , Deco , G. : Toward naturalistic neuroscience: Mechanisms underlying the flattening of brain hierarchy in movie-watching compared to rest and task . Science Advances 9 ( 2 ), 6049 ( 2023 ) doi: 10.1126/sciadv.ade6049 . Accessed 2023-04-30 OpenUrl CrossRef [64]. ↵ Ten Oever , S. , Romei , V. , Van Atteveldt , N. , Soto-Faraco , S. , Murray , M.M. , Matusz , P.J .: The COGs (context, object, and goals) in multisensory processing . Experimental Brain Research 234 ( 5 ), 1307 – 1323 ( 2016 ) doi: 10.1007/s00221-016-4590-z . 
Accessed 2025-01-07 OpenUrl CrossRef PubMed [65]. ↵ Boyle , S.C. , Kayser , S.J. , Kayser , C .: Neural correlates of multisensory reliability and perceptual weights emerge at early latencies during audio-visual integration . European Journal of Neuroscience 46 ( 10 ), 2565 – 2577 ( 2017 ) doi: 10.1111/ejn.13724 . Accessed 2025-02-27 OpenUrl CrossRef PubMed [66]. Aller , M. , Noppeney , U. : To integrate or not to integrate: Temporal dynamics of hierarchical Bayesian causal inference . PLOS Biology 17 ( 4 ), 3000210 ( 2019 ) doi: 10.1371/journal.pbio.3000210 . Accessed 2025-02-19 OpenUrl CrossRef [67]. ↵ Rohe , T. , Ehlis , A.-C. , Noppeney , U .: The neural dynamics of hierarchical Bayesian causal inference in multisensory perception . Nature Communications 10 ( 1 ), 1907 ( 2019 ) doi: 10.1038/s41467-019-09664-2 . Accessed 2023-06-06 OpenUrl CrossRef PubMed [68]. ↵ Calvert , G.A. , Campbell , R. , Brammer , M.J .: Evidence from functional magnetic resonance imaging of crossmodal binding in the human heteromodal cortex . Current Biology 10 ( 11 ), 649 – 657 ( 2000 ) doi: 10.1016/S0960-9822(00)00513-3 . Accessed 2025-09-21 OpenUrl CrossRef PubMed Web of Science [69]. Calvert , G.A. , Hansen , P.C. , Iversen , S.D. , Brammer , M.J .: Detection of Audio-Visual Integration Sites in Humans by Application of Electrophysiological Criteria to the BOLD Effect . NeuroImage 14 ( 2 ), 427 – 438 ( 2001 ) doi: 10.1006/nimg.2001.0812 . Accessed 2025-09-21 OpenUrl CrossRef PubMed Web of Science [70]. ↵ Calvert , G.A. : Crossmodal Processing in the Human Brain: Insights from Functional Neuroimaging Studies . Cerebral Cortex 11 ( 12 ), 1110 – 1123 ( 2001 ) doi: 10.1093/cercor/11.12.1110 Accessed 2025-09-07 OpenUrl CrossRef PubMed Web of Science [71]. ↵ Song , H. , Finn , E.S. , Rosenberg , M.D .: Neural signatures of attentional engagement during narratives and its consequences for event memory . 
Proceedings of the National Academy of Sciences 118 ( 33 ), 2021905118 ( 2021 ) doi: 10.1073/pnas.2021905118 OpenUrl Abstract / FREE Full Text [72]. ↵ Chen , J. , Leong , Y.C. , Honey , C.J. , Yong , C.H. , Norman , K.A. , Hasson , U .: Shared memories reveal shared structure in neural activity across individuals . Nature Neuroscience 20 ( 1 ), 115 – 125 ( 2017 ) doi: 10.1038/nn.4450 OpenUrl CrossRef PubMed [73]. ↵ Hamilton , L.S. , Oganian , Y. , Hall , J. , Chang , E.F .: Parallel and distributed encoding of speech across human auditory cortex . Cell 184 , 1 – 14 ( 2021 ) OpenUrl CrossRef PubMed [74]. ↵ Ashburner , J .: A fast diffeomorphic image registration algorithm . NeuroImage 38 ( 1 ), 95 – 113 ( 2007 ) doi: 10.1016/j.neuroimage.2007.07.007 . Accessed 2025-05-06 OpenUrl CrossRef PubMed Web of Science [75]. ↵ Khalilian , A. , Esmaeili , Y. , Michalak , A.J. , Flinker , A .: Mithra: An Open-Source and Cross-Platform Visualization Toolbox for Human Intracranial Recordings . bioRxiv ( 2025 ) [76]. ↵ Nastase , S.A. , Gazzola , V. , Hasson , U. , Keysers , C .: Measuring shared responses across subjects using intersubject correlation . Social Cognitive and Affective Neuroscience 14 ( 6 ), 669 – 687 ( 2019 ) doi: 10.1093/scan/nsz037 OpenUrl CrossRef [77]. ↵ Lee , D.D. , Seung , H.S .: Learning the parts of objects by non-negative matrix factorization . Nature 401 ( 6755 ), 788 – 791 ( 1999 ) doi: 10.1038/44565 . Accessed 2024-04-30 OpenUrl CrossRef PubMed Web of Science [78]. ↵ Rousseeuw , P.J. : Silhouettes: A graphical aid to the interpretation and validation of cluster analysis . Journal of Computational and Applied Mathematics 20 , 53 – 65 ( 1987 ) doi: 10.1016/0377-0427(87)90125-7 . Accessed 2023-11-09 OpenUrl CrossRef PubMed Web of Science [79]. ↵ McFee , B. , Raffel , C. , Liang , D. , Ellis , D. , McVicar , M. , Battenberg , E. , Nieto , O . : librosa: Audio and Music Signal Analysis in Python, Austin , Texas , pp. 18 – 24 ( 2015 ). 
doi: 10.25080/Majora-7b98e3ed-003 . https://doi.curvenote.com/10.25080/Majora-7b98e3ed-003 Accessed 2025-02-13 OpenUrl CrossRef [80]. ↵ Vaswani , A. , Shazeer , N. , Parmar , N. , Uszkoreit , J. , Jones , L. , Gomez , A.N. , Kaiser , L. , Polosukhin , I. : Attention Is All You Need . In: Guyon , I. , Luxburg , U.V. , Bengio , S. , Wallach , H. , Fergus , R. , Vishwanathan , S. , Garnett , R. (eds.) Advances in Neural Information Processing Systems 30. Advances in Neural Information Processing Systems , vol. 30. Neural Information Processing Systems (Nips) , La Jolla ( 2017 ). WOS:000452649406008 [81]. ↵ Vaidya , A.R. , Jain , S. , Huth , A.G .: Self-Supervised Models of Audio Effectively Explain Human Cortical Responses to Speech . Proceedings of the 39th International Conference on Machine Learning ( 2022 ) [82]. ↵ Zhou , Q. , Du , C. , He , H. : Exploring the Brain-like Properties of Deep Neural Networks: A Neural Encoding Perspective . Machine Intelligence Research 19 ( 5 ), 439 – 455 ( 2022 ) doi: 10.1007/s11633-022-1348-x OpenUrl CrossRef [83]. ↵ Liu , Y. , Ott , M. , Goyal , N. , Du , J. , Joshi , M. , Chen , D. , Levy , O. , Lewis , M. , Zettlemoyer , L. , Stoyanov , V. : RoBERTa: A Robustly Optimized BERT Pretraining Approach . ArXiv abs/1907.11692 ( 2019 ) [84]. ↵ Devlin , J. , Chang , M.-W. , Lee , K. , Toutanova , K. : BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding , pp. 4171–4186. Association for Computational Linguistics, Minneapolis, Minnesota ( 2019 ). https://www.aclweb.org/anthology/N19-1423 doi: 10.18653/v1/N19-1423 OpenUrl CrossRef [85]. ↵ Anderson , A.J. , Kiela , D. , Binder , J.R. , Fernandino , L. , Humphries , C.J. , Conant , L.L. , Raizada , R.D.S. , Grimm , S. , Lalor , E.C .: Deep Artificial Neural Networks Reveal a Distributed Cortical Network Encoding Propositional Sentence-Level Meaning . 
The Journal of Neuroscience 41 ( 18 ), 4100 ( 2021 ) doi: 10.1523/JNEUROSCI.1152-20.2021 OpenUrl Abstract / FREE Full Text [86]. Huang , J. , Tang , D. , Zhong , W. , Lu , S. , Shou , L. , Gong , M. , Jiang , D. , Duan , N. : WhiteningBERT: An Easy Unsupervised Sentence Embedding Approach . In: Findings of the Association for Computational Linguistics: EMNLP 2021, pp. 238 – 244 . Association for Computational Linguistics, Punta Cana, Dominican Republic ( 2021 ). doi: 10.18653/v1/2021.findings-emnlp.23 . https://aclanthology.org/2021.findings-emnlp.23 Accessed 2025-12-14 OpenUrl CrossRef [87]. ↵ Yu , L. , Ettinger , A. : Assessing Phrasal Representation and Composition in Transformers . In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pp. 4896 – 4907 . Association for Computational Linguistics , Online ( 2020 ). doi: 10.18653/v1/2020.emnlp-main.397 . https://www.aclweb.org/anthology/2020.emnlp-main.397 Accessed 2025-12-14 OpenUrl CrossRef [88]. ↵ Paszke , A. , Gross , S. , Massa , F. , Lerer , A. , Bradbury , J. , Chanan , G. , Killeen , T. , Lin , Z. , Gimelshein , N. , Antiga , L. , Desmaison , A. , Köpf , A. , Yang , E. , DeVito , Z. , Raison , M. , Tejani , A. , Chilamkurthy , S. , Steiner , B. , Fang , L. , Bai , J. , Chintala , S. : PyTorch: An Imperative Style, High-Performance Deep Learning Library . arXiv. arXiv:1912.01703 [cs] ( 2019 ). doi: 10.48550/arXiv.1912.01703 . http://arxiv.org/abs/1912.01703 Accessed 2025-02-13 OpenUrl CrossRef [89]. ↵ Groen , I.I.A. , Piantoni , G. , Montenegro , S. , Flinker , A. , Devore , S. , Devinsky , O. , Doyle , W. , Dugan , P. , Friedman , D. , Ramsey , N.F. , Petridou , N. , Winawer , J .: Temporal Dynamics of Neural Responses in Human Visual Cortex . The Journal of Neuroscience 42 ( 40 ), 7562 – 7580 ( 2022 ) doi: 10.1523/JNEUROSCI.1812-21.2022 . Accessed 2023-12-13 OpenUrl Abstract / FREE Full Text [90]. ↵ Kutas , M. 
, Federmeier , K.D. : Thirty Years and Counting: Finding Meaning in the N400 Component of the Event-Related Brain Potential (ERP) . In: Fiske , S.T. , Schacter , D.L. , Taylor , S.E. (eds.) Annual Review of Psychology . Annual Review of Psychology, vol. 62 , pp. 621 – 647 ( 2011 ) doi: 10.1146/annurev.psych.093008.131123 . WOS:000287331200023 OpenUrl CrossRef PubMed Web of Science [91]. ↵ Kriegeskorte , N. , Simmons , W.K. , Bellgowan , P.S.F. , Baker , C.I .: Circular analysis in systems neuroscience: the dangers of double dipping . Nature Neuroscience 12 ( 5 ), 535 – 540 ( 2009 ) doi: 10.1038/nn.2303 . Accessed 2024-01-19 OpenUrl CrossRef PubMed Web of Science [92]. ↵ Curtis , C.E. , D’Esposito , M .: Persistent activity in the prefrontal cortex during working memory . Trends in Cognitive Sciences 7 ( 9 ), 415 – 423 ( 2003 ) doi: 10.1016/S1364-6613(03)00197-9 . Accessed 2025-04-28 OpenUrl CrossRef PubMed Web of Science [93]. ↵ Miller , E.K. , Cohen , J.D .: An integrative theory of prefrontal cortex function . Annual Review of Neuroscience 24 , 167 – 202 ( 2001 ) doi: 10.1146/annurev.neuro.24.1.167 OpenUrl CrossRef PubMed Web of Science [94]. ↵ Satopaa , V. , Albrecht , J. , Irwin , D. , Raghavan , B. : Finding a “Kneedle” in a Haystack: Detecting Knee Points in System Behavior . In: 2011 31st International Conference on Distributed Computing Systems Workshops , pp. 166 – 171 . IEEE, Minneapolis , MN, USA ( 2011 ). doi: 10.1109/ICDCSW.2011.20 . http://ieeexplore.ieee.org/document/5961514/ Accessed 2023-06-01 OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted March 31, 2026. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following Frontal cortex organization supporting audiovisual processing during naturalistic viewing Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Frontal cortex organization supporting audiovisual processing during naturalistic viewing Faxin Zhou , Amirhossein Khalilian-Gourtani , Patricia Dugan , Andrew Michalak , Orrin Devinsky , Peter Rozman , Werner Doyle , Daniel Friedman , Adeen Flinker bioRxiv 2025.06.26.661755; doi: https://doi.org/10.1101/2025.06.26.661755 Share This Article: Copy Citation Tools Frontal cortex organization supporting audiovisual processing during naturalistic viewing Faxin Zhou , Amirhossein Khalilian-Gourtani , Patricia Dugan , Andrew Michalak , Orrin Devinsky , Peter Rozman , Werner Doyle , Daniel Friedman , Adeen Flinker bioRxiv 2025.06.26.661755; doi: https://doi.org/10.1101/2025.06.26.661755 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neuroscience Subject Areas All Articles Animal Behavior and Cognition (7636) Biochemistry (17705) Bioengineering (13899) Bioinformatics (41967) Biophysics (21460) Cancer Biology (18600) Cell Biology (25526) Clinical Trials (138) Developmental Biology (13384) Ecology (19909) Epidemiology (2067) Evolutionary Biology (24326) Genetics (15613) Genomics (22512) Immunology (17740) Microbiology (40423) Molecular Biology (17193) Neuroscience (88645) Paleontology (667) Pathology (2835) Pharmacology and Toxicology (4825) Physiology (7647) Plant Biology (15159) Scientific Communication and Education (2046) Synthetic Biology (4302) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00