Full text
70,487 characters
· extracted from
preprint-html
· click to expand
Coarse-graining reveals collective predictive information in a sensory population | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Coarse-graining reveals collective predictive information in a sensory population Adam G. Kline , Maciej Koch-Janusz , Aleksandra M. Walczak , View ORCID Profile Thierry Mora , Stephanie E. Palmer doi: https://doi.org/10.1101/2025.10.18.683195 Adam G. 
Kline 1 Department of Physics, The University of Chicago , Chicago IL 60637 Find this author on Google Scholar Find this author on PubMed Search for this author on this site Maciej Koch-Janusz 2 University of Chicago, James Franck Institute, 929 E 57th Street, Chicago, IL 60637; Haiqu, Inc. , 95 Third Street, San Francisco, CA 94103, USA ; and Department of Physics, University of Zurich , 8057 Zurich, Switzerland Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aleksandra M. Walczak 3 Laboratoire de physique de l’ École normale supérieure, CNRS, PSL University, Sorbonne Université and Université Paris-Cité , 75005 Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site Thierry Mora 3 Laboratoire de physique de l’ École normale supérieure, CNRS, PSL University, Sorbonne Université and Université Paris-Cité , 75005 Paris, France Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Thierry Mora For correspondence: tmora{at}lps.ens.fr Stephanie E. Palmer 4 Department of Organismal Biology and Anatomy and Department of Physics, The University of Chicago , Chicago IL 60637 Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Preview PDF Abstract Biological systems perform complex computations using hundreds of individual actors, but they do so efficiently and in a way that can be read out and interpreted by other biological networks. Coarse-graining may allow for key collective features to be effectively and efficiently communicated. In the brain, early sensory systems perform prediction, which can compensate for lags in neural processing. This computation is collective, meaning it relies upon interactions between many neurons, and operates in complex, dynamic natural environments. 
Taking these two facets of biological complexity together, we search for maximally-predictive collective variables in large groups of retinal ganglion cells responding to dynamic natural visual scenes. To find collective variables that best capture predictive computations in the neurons, we apply a tractable, approximate implementation of the information bottleneck method to neural data. We infer a lower-dimensional representation that is maximally informative about the future neural activity. We observe scaling relationships between this mutual information estimate, neural subset size, and information decay timescale. Further, the structure of collective modes changes for predicting at short versus longer timescales. INTRODUCTION Prediction is ubiquitous in biology, allowing organisms to make use of correlations in their environment and overcome processing lags. The success of a species depends on its ability to survive and reproduce, and prediction is crucial for reacting quickly in life-or-death situations. Prediction has therefore been explored as a normative principle for understanding how organisms assign utility to various components of incoming signals in early stages of sensory processing [ 1 – 4 ]. In the vertebrate visual stream, signals are propagated through several layers of neurons, each with significant sensory delays, that prediction may help ameliorate. This begins in the eye, which can optimally predict visual signals in both artificial and naturalistic visual environments [ 5 – 10 ]. While optimal prediction has been observed for small groups of cells in the vertebrate retina under simple stimuli [ 7 ], we expect that this computation should not only extend to richer stimuli, but should also continue along the visual stream in order to produce the fast reaction times in natural scenarios [ 3 , 11 – 13 ]. Visual signals are first processed in the retina, then sent downstream to ultimately drive behavior. 
While neurons in the retina encode all that the brain sees, the visual information they carry must be decoded into signals or features which are useful to the organism. Abstractly, this decoding problem is difficult because the retinal output is a high-dimensional random variable consisting of the collective state of all ganglion cells. On the other hand, this output contains correlations, both because they are present in the stimulus [ 14 – 22 ] and because there are interactions between neurons [ 23 – 28 ]. These correlations reduce the space of likely codewords and may therefore enable decoding. In particular, given the importance of fast prediction to survival, correlations in the retinal output should allow for neurons downstream to read out signals which are informative of the future. To address this question, we search for predictive coarse-grained features of retinal responses to naturalistic stimuli and explore the correlation structure that allows for this predictability. To study how the collective properties of retinal responses might allow for downstream prediction, we analyze correlations in the simultaneous activity of a large number of neurons stimulated by a range of naturalistic images. Given the wide range of timescales present in natural visual environments [ 18 ] and the retina’s ability to resolve very fast features [ 29 , 30 ], we focus on long recordings with high temporal resolution. To find and study predictive features in the retinal population code, we apply recent machine learning methods which leverage concepts from information theory [ 31 – 34 ]. At its core, our method solves a variational version of the predictive information bottleneck problem [ 35 ] by simultaneously searching over predictive coarse-grained features and performing inference. 
By using the expressive power of neural networks to solve an inference problem, this method is able to estimate information quantities on a state space of unprecedented size, that is, the state space of joint activity among hundreds of neurons. In doing so, we see how many linear features are needed to be predictive, providing support for the feasibility of compression-based prediction in the retina and more broadly neural coding. RESULTS A. Recordings of retinal neuron population activity In order to investigate the statistics of retinal outputs, we examine response data taken from 93 retinal ganglion cells (RGCs) in the salamander retina under naturalistic stimuli [ 36 ], as described in [ 10 , 37 ]. The data collection process is outlined in Fig. 1 . A retina taken from a larval tiger salamander was placed on an electrode array with a density of roughly one electrode per RGC. These data were then spike-sorted, yielding the times at which each neuron fired an action potential. A binary representation of the neural code was generated by discretizing time into time bins of size Δ t = 1 / 60 s, and assigning the state “1” to a neuron in any time bin in which it fired a spike and a “0” otherwise. In our data, the marginal probability of the “1” state across all neurons and time bins can be as high as 0.016 and as low as 0.003, depending on the stimulus. Download figure Open in new tab FIG. 1. Neural data. Experimental protocol enables simultaneous recording of responses across a large population of salamander retinal ganglion cells under repeated, naturalistic stimuli. A . Stimuli are shown to a salamander retina while ganglion cell responses are recorded by an electrode array. B . Following spike sorting from electrode array, neuron spike times are binned into Δ t = 1/60 second time bins (aligned with stimulus movie frames), which yields a binary representation of the neural code. C . 
Naturalistic stimuli are shown at 60 frames per second to the retina in random order, in 20 second intervals. In our analysis, the responses from the first 20 frames (0.3 seconds) of each interval are removed, as they contain transients due to switching stimulus. At the beginning and end of the experiment are 30 minute intervals of checkerboard stimulus. D . Snapshots taken from each stimulus. E . Each stimulus is shown to the retina ∼ 80 times, allowing us to observe noise correlation effects. Five different naturalistic stimulus clips were used, each 20 seconds long. These were shown to the retina in random order for 141 minutes, yielding roughly 80-90 repetitions of each stimulus over the course of the experiment. This repeated structure allows us to compare neural responses to a given stimulus across trials, revealing both reliable response features and stochastic effects inherent to the physiology of the retina. Correlations in these latter effects are referred to as “noise correlations” and can be studied by considering the ensemble of repeated trials of the same stimulus. The stimuli themselves are greyscale videos at 60 Hz, depicting a range of visual scenes that could conceivably be present in the environment of the specimen. Such stimuli are characterized by heavy-tailed distributions of contrast, velocity, time-, and length-scales, in contrast to simpler, engineered stimuli such as moving gratings [ 18 , 38 ]. Because the retina encodes its stimuli nonlinearly, an ethologically relevant understanding of these encodings justifies probing responses with naturalistic stimuli [ 15 , 16 , 29 , 39 , 40 ]. B. Estimating mutual information We quantify the amount of information encoded in the outputs of the retina about its future outputs. Generally, given a joint distribution p ( x, y ) for two random variables X and Y , the mutual information between them is $$I(X;Y) = \left\langle \log \frac{p(x,y)}{p(x)\,p(y)} \right\rangle_{x,y}, \qquad (1)$$ where $\langle\cdot\rangle_{x,y}$ denotes the average over $p(x,y)$. In general, mutual information quantities can be very challenging to compute. 
A common difficulty arises when X or Y are variables with many dimensions, say $X \in \{0,1\}^N$ with N large. In such cases, even if given an analytical expression for p ( x, y ), the sum in ⟨·⟩ x,y must typically be approximated. More problematic, when working with data, we must provide an estimate of p ( x, y ). This problem is known as inference, and also becomes increasingly difficult as the dimensionality of random variables increases. In this work, we leverage recently-developed tools to compute a tractable lower bound to Eq. (1) [ 41 ]. This bound, which relies on an ansatz for p ( x, y ) represented using neural networks, is maximized to approach the true distribution. In other words, this method both solves an inference problem and in the process computes an approximation of I ( X ; Y ). More explicitly, we consider a trial distribution q ( x, y | θ ) which represents our best guess at p ( x, y ), where θ are free parameters. The Barber-Agakov (BA) bound on mutual information is given by $$I_{\rm BA}(X;Y) = \left\langle \log \frac{q(x,y|\theta)}{p(x)\,p(y)} \right\rangle_{x,y} = I(X;Y) - D_{\rm KL}\left[\,p(x,y)\,\|\,q(x,y|\theta)\,\right]. \qquad (2)$$ Because the Kullback-Leibler divergence D KL [ p || q ] ≥ 0, we have that I BA ( X ; Y ) ≤ I ( X ; Y ) with equality only when q ( x, y | θ ) = p ( x, y ). Therefore, the act of maximizing this bound is, precisely, inference. We parameterize this trial distribution q with the ansatz $$q(x,y|\theta) = \frac{p(x)\,p(y)\,e^{f(x,y|\theta)}}{Z(x,\theta)}, \qquad (3)$$ where p ( x ) and p ( y ) are the true marginals of x and y , and f ( x, y | θ ) is computed by a neural network with parameters θ . The partition function Z ( x, θ ) is then defined by f ( x, y | θ ) and p ( y ) through the requirement that ∑ y q ( x, y | θ ) = p ( x ) for all $x$, i.e. $Z(x,\theta) = \sum_y p(y)\,e^{f(x,y|\theta)}$. Inserting this ansatz into the BA bound yields the unnormalized Barber-Agakov bound $$I_{\rm UBA}(X;Y) = \left\langle f(x,y|\theta) \right\rangle_{x,y} - \left\langle \log Z(x,\theta) \right\rangle_{x}. \qquad (4)$$ This bound is intractable due to the log partition function. In the noise-contrastive estimate (NCE) bound, a Monte-Carlo estimate of the partition function is computed using minibatches containing B independent samples from the data, and this effectively reduces variance of the final estimate. 
Explicitly, the NCE bound is $$I_{\rm NCE}(X;Y) = \left\langle \frac{1}{B}\sum_{i=1}^{B} \log \frac{e^{f(x_i,y_i|\theta)}}{\frac{1}{B}\sum_{j=1}^{B} e^{f(x_i,y_j|\theta)}} \right\rangle, \qquad (5)$$ where the average is taken over minibatches $\{(x_i,y_i)\}_{i=1}^{B}$ of samples drawn from $p(x,y)$. In summary, we aim to compute I NCE ( X ; Y ), which is bounded in the following way $$I_{\rm NCE}(X;Y) \le I_{\rm UBA}(X;Y) \le I_{\rm BA}(X;Y) \le I(X;Y), \qquad (6)$$ with each bound becoming tighter as q ( x, y | θ ) approaches p ( x, y ). We implement the critic function f ( x, y ) using fully connected neural networks u a ( x ) and v a ( y ) with a = 1, …, N embed as: $$f(x,y) = \sum_{a=1}^{N_{\rm embed}} u_a(x)\,v_a(y). \qquad (7)$$ We refer the reader to Refs [ 32 , 41 ] for a more detailed overview of these bounds. C. Predictive information persists at long timescales The retina needs to reliably encode and transmit visual information for further processing by downstream areas of the brain. Retinal dynamics is shaped by the statistical and dynamical properties of the external inputs. However, downstream neural populations do not have direct access to those properties, other than as provided by the retinal stream itself. Taking the downstream perspective and looking at the retinal code, itself, as if produced by an autonomous dynamical system, what can we say about its predictability? We first examine predictive information, defined as the mutual information I ( X ; Y ) between the joint activity of all N = 93 neurons in a single time bin, X ∈ {0, 1} N , and the same set of degrees of freedom at a time τ into the future, denoted by Y ∈ {0, 1} N ( Fig. 2A ). Our estimate of the predictive information I ( X ; Y ) is given by I NCE ( X ; Y ) (in Eq. (5) ), evaluated on data which were held out during training of the critic function. Download figure Open in new tab FIG. 2. Long predictive timescales under naturalistic stimuli. A . We use the NCE bound, Eq. 5 , to estimate mutual information I ( X ; Y ) between X and Y . X is the binary codeword of length N = 93 representing ganglion cell responses in a single time bin, while Y is the codeword a time τ in the future. B . Predictive information versus prediction interval for each stimulus. C . Predictive information normalized by its maximum value, on a semi-log scale (same color scheme). 
Each stimulus response displays multiple timescales and correlations out to about 500 ms. Dotted lines depict predictive information estimates after shuffling the time index across repeats. Nonzero quantities below these lines are spurious, as they represent estimates when no information beyond frame index is present. In Fig. 2B , estimates of I NCE ( X ; Y ) are given as a function of the prediction interval τ , for each of the 5 stimuli. Since I NCE is a lower bound of mutual information, we can conclude that for certain stimuli, such as “fish” and “opticflow”, the true mutual information I ( X ; Y ) achieves large values even at prediction intervals as large as ∼500 ms. In comparison, for the checkerboard stimulus no correlations were observed, up to the refresh rate of the monitor, 30 fps [ 7 ]. Additionally, there is variation in the overall scale of information across stimuli, and in the change of shape of the τ dependence. To better observe this variation across natural scenes, we examine in Fig. 2C the decay of predictive information on a log scale, after normalizing by the maximum value of I NCE ( X, Y ) at τ = Δ t . These trends show approximately exponential decay for most stimuli over some initial transient period, followed by a plateau. In the “opticflow” stimulus, this profile is pronounced, with the plateau onset occurring after only about 150 ms. One potential worry might be that non-zero estimates of predictive information in this plateau region are due to overfitting. This might occur if the critic function learns to decode frame numbers from present and future frames, which is feasible given that the dataset contains repeats of each stimulus response. To address this, we also provide NCE estimates of predictive information on shuffled data, where this shuffling randomly permutes the time index in a way that is consistent across repeated trials of the same stimulus. 
If over-fitting due to “memorizing” frame number is occurring, this procedure allows the information estimate to include its contribution while removing contributions from all other correlations in time. Since these shuffle estimates (dashed lines) are below the estimates on intact data, we conclude that long-time predictive information is not likely due to over-fitting by frame memorization. Taking these shuffle estimates as a limit on information, we note that not all of our stimuli elicit responses with any predictability at a time lag of τ = 500 ms. Whereas “opticflow” responses contain predictive information at this extreme interval, responses to the “water” stimulus do not, falling below the shuffle threshold at a much earlier interval of 250 ms. D. Predictive information is compressible The substantial mutual information that we find between points separated by large time intervals τ indicates that there are features of the neural code that are predictable. Given the importance of prediction for survival, it is reasonable to expect downstream processes to leverage this predictability. However, these processes may not need to perform this prediction task based on the full knowledge of the entire repertoire of retinal activity. For one, we know that the neural code is noisy, with repeated exposure to the same stimulus giving variable responses. Secondly, we know that the optic tectum, the next stage downstream of the retina in the visual stream, has fewer neurons and acts as a bottleneck for the signal. Furthermore, signals which drive motor responses also converge to a small set of motor neurons, indicating a need for compressibility. Such considerations also suggest that an important aspect of retinal computation is its ability to make signals accessible through linear readout downstream [ 6 ]. More fundamentally, because of the code’s redundancy, predictability may be supported by an effective representation of lower dimension than the full joint activity. 
Given a stimulus, how many distinct features of the random variable $X \in \{0,1\}^{93}$ need to be measured to learn all there is to know about the future state Y , and what are these features? To answer this question, we seek coarse-grained, low-dimensional representations Z = Λ( X ) of the joint neural activity X in the present that carry maximal information about the future activity Y . This approach is the deterministic version of the information bottleneck (IB) for predictive information described in [ 7 ], and we follow [ 32 – 34 ] to find variational solutions using the NCE machine-learning estimate of the mutual information. Suppose X ∈ χ is the current state of the retinal code and Y is the future, as in Fig. 3A . We seek coarse-graining maps $\Lambda : \chi \to \mathbb{R}^K$ that preserve a maximal amount of mutual information between Z = Λ( X ) and Y : $$\max_{\Lambda,\,\theta}\; I_{\rm NCE}\big(Z = \Lambda(X);\,Y\big)(\theta). \qquad (8)$$ Download figure Open in new tab FIG. 3. Predictive information is compressible. A. Schematic of computation. Joint neural activity X is coarse-grained linearly into a compressed, K -dimensional representation Z = Λ · X . The optimal coarse-graining Λ is found by maximizing the NCE lower bound, Eq. 5 , along with its variational parameters. B . Fraction of total predictive information I NCE ( Z ; Y ) /I NCE ( X ; Y ) for a single time bin prediction interval τ = Δ t , for four values of the compressed dimension K . C . Fraction of total predictive information I NCE ( Z ; Y ) /I NCE ( X ; Y ) as a function of prediction interval τ for the “opticflow” stimulus. Each line represents a different K . D . To test predictive feature generalization, we first find optimal features Λ 1 at prediction interval τ 1 , then measure I NCE (Λ 1 ( X ); Y ( τ )) for τ across the whole range of timescales. E . Predictive feature generalization I NCE ( Z ; Y ) /I NCE ( X ; Y ) for compressed dimensionalities K = 1, 2, 4, 8 (same color code as C), at two different training timescales τ 1 = 83 ms and τ 2 = 350 ms, as explained in D. 
The parameters θ for the variational estimate q ( z, y | θ ) of the true joint distribution p ( z, y ) are optimized along with the map Λ. In this work, we restrict our search to linear maps Λ: Z = Λ · X , where Λ is a K × dim(χ) matrix. We refer to the entries of Z as “meta-neurons”, and the row-vectors of Λ as the features that these meta-neurons encode. From here on, when the θ argument is dropped from I NCE , that should be taken to mean I NCE evaluated after optimizing over θ . Due to the data processing inequality, I ( Z ; Y ) ≤ I ( X ; Y ), where I ( X ; Y ) is the total predictive information. When K is increased, I ( Z ; Y ) cannot decrease, and when K = dim χ , the coarse-graining map Λ becomes invertible, leading to I ( Z ; Y ) = I ( X ; Y ). Together, these constraints tell us that I ( Z ; Y ) is a monotonically increasing function of K which saturates at I ( X ; Y ). In particular, to quantify compressibility of the retinal code, we examine how quickly max Λ I NCE ( Z = Λ X ; Y ), as estimated using the variational method outlined above, approaches its upper bound I NCE ( X ; Y ) as K is increased. We take X ∈ {0, 1} N to be the activity of all neurons in a single time bin and Y ∈ {0, 1} N that same activity some time τ in the future ( Fig 3A ). Across stimuli, only about K = 8 meta-neurons are required to capture all of the available predictive information for the next time bin, τ = Δ t , as captured by the fraction of predictive information I NCE ( Z ; Y ) /I NCE ( X ; Y ) ( Fig 3B ). Fig. 3C shows that this result holds for larger predictive horizons τ , up to 500 ms. Recall that N = 93, meaning the predictive signal can be compressed by more than 90%. However, since each prediction interval τ represents a distinctly defined pair ( X, Y ) of input and relevance variables, these 8 predictive directions in state space could be different at different τ . 
To investigate the dependence of relevant features on prediction intervals, we pick a “training” time interval, for example τ train = τ 1 = 83 ms, and solve Eq. (8), yielding (Λ 1 , θ 1 ) ( Fig. 3D ). The coarse-graining map Λ 1 represents the K linear features of X which maximize the mutual information between Z 1 = Λ 1 · X and Y ( τ 1 ), while θ 1 represents part of an estimate of p ( z 1 , y ). With the features Λ 1 fixed, we then optimize I NCE ( Z 1 ; Y ( τ ))( θ ) over θ for a range of prediction intervals τ . This gives us an estimate of I ( Z 1 ; Y ( τ )), or the information between meta-neurons trained to predict at interval τ 1 and the state at all other intervals τ . In Fig 3E , we show the dependence of I NCE ( Z ; Y ( τ )) /I NCE ( X ; Y ( τ )) on τ for two different training intervals, τ 1 = 83 ms and τ 2 = 350 ms, and for K = 1, 2, 4, and 8 meta-neurons. For both training intervals τ 1 and τ 2 , the predictive features generalize well, but not perfectly, across testing intervals. When the testing interval is equal to the training interval, we see a peak in predictive information carried by these specialized meta-neurons. While it appears to be the case that only 8 meta-neurons trained at any timescale successfully encode the majority of predictive information at all intervals, the two examples we give reveal that these compressions are highly sub-optimal. For example, 8 meta-neurons trained to predict Y ( τ 2 ) do worse at predicting Y ( τ 1 ) than only 4 meta-neurons trained to predict at that interval. Together, these results suggest that retinal responses for complex naturalistic stimuli encode both general and timescale-specific predictive information, and that these features could be disentangled by downstream neurons solving different prediction problems. E. 
Meta-neurons carry long-timescale predictive information We have so far shown that retinal responses to naturalistic stimuli exhibit long timescale correlations, and that all of the predictive information can be captured by a few linear coarse-grained variables (meta-neurons). The predictive capabilities of a given set of meta-neurons generalizes to timescales other than the one at which they were trained to predict. Thus, by learning to predict retinal outputs at one specific timescale, neurons downstream of the retina may also learn to predict more generally. What do meta-neurons encode, and why does this lead to generally good compressed prediction at many intervals? One possible answer is that meta-neurons are encoding slow, long-timescale features. In a theoretical exploration of this idea, Schmitt and coworkers [ 42 ] have noted that, in stochastic dynamical systems governed by a time evolution operator U , the most informative features identified by predictive IB are given by the eigenfunctions of U with the largest eigenvalues, corresponding to the longest timescales. In that setting, each IB feature is associated with a timescale, which can be computed from the eigenvalue decomposition of U . Crucially, this identification implies that the compressed features being encoded do not actually depend on the prediction interval, since U generates the system’s dynamics. We want to test this idea that meta-neurons (the entries of Z = Λ ·X t ) encode features with long timescales. We begin by using dynamic mode decomposition (DMD) [ 43 ] to construct an estimate of the time evolution operator for the neural dynamics, X t , independently of the IB and its meta-neurons. There is no guarantee that neural activity is well described by a linear evolution operator, in particular because of memory (non-Markovian) effects. 
To account for non-Markovian dynamics, we operate on a dynamical variable consisting of a time-delay embedding (TDE) of the neural state: we redefine X t as the binarized neural responses of all neurons in the 7 time bins leading up to (and including) time t : $X_t \in \{0,1\}^{7N}$, where N = 93 is the number of neurons in the population. The evolution operator is then obtained by performing linear regression on the linear model $X_{t+1} = U X_t$ (Eq. 9 ), by minimizing the mean-squared residual $\sum_{t=1}^{T-1} \| X_{t+1} - U X_t \|^2$. This gives $$U = \mathbf{X}_2\,\mathbf{X}_1^{+},$$ with the data matrices $\mathbf{X}_1 = [X_1, \ldots, X_{T-1}]$, $\mathbf{X}_2 = [X_2, \ldots, X_T]$ and $\mathbf{X}_1^{+}$ the Moore-Penrose inverse of $\mathbf{X}_1$. To analyze the timescales of the dynamics and the corresponding features of the neural state, we perform an eigenmode decomposition of U : $U v_i = \lambda_i v_i$ and $w_i^{\dagger} U = \lambda_i w_i^{\dagger}$, with right and left eigenvectors normalized so that $w_i^{\dagger} v_j = \delta_{ij}$. Writing the neural state in the eigenbasis, $X_t = \sum_i a_i(t)\, v_i$ with $a_i(t) = w_i^{\dagger} X_t$, its evolution is given by $a_i(t+1) = \lambda_i\, a_i(t)$. Thus, for each mode v i and eigenvalue λ i , we can associate a timescale τ i and frequency ω i through: $$\lambda_i = \exp\!\left(-\frac{\Delta t}{\tau_i} + \mathrm{i}\,\omega_i \Delta t\right), \quad \text{i.e.} \quad \tau_i = -\frac{\Delta t}{\ln|\lambda_i|}, \quad \omega_i = \frac{\arg \lambda_i}{\Delta t}.$$ Note that the time scale τ i diverges as | λ i |→1, which imposes | λ i |≤ 1. Next, we adapt the predictive IB described in the previous section to find the compression Z t = Λ X t that maximizes I NCE ( Z t ; Y t ), where X t is now the TDE of the neural state (of dimension 7 N ) re-defined in the previous paragraph, Λ is a K × 7 N matrix, and Y t ∈ {0, 1} N is the activity in a single time bin immediately following the last time bin of X t , as depicted in Fig. 4A . The difference from the IB described earlier is that X t now contains the activity in the 7 time bins that lead to t . We made that choice to be able to make a connection with DMD, which predicts X t +1 as a function of X t , or equivalently Y t as a function of X t , since the first 6 time bins of X t +1 are trivially the last 6 time bins of X t . In training, we choose K = 50 as this captures > 95% of predictive information. Download figure Open in new tab FIG. 4. Meta-neuron features encode long timescales. All analyses were carried out on responses to opticflow stimulus. A . 
Compression of time-delay-embedded input variable X . We take a time delay embedding with 7 time steps, leading to a state X that is a 7 N = 651-entry binary array. This is transformed linearly into K = 50 meta-neurons Z a = Λ a · X , with a = 1, …, 50. That is, Z = Λ · X . B . Meta-neurons encode long-timescale DMD modes. Points are located at eigenvalues λ i of the DMD operator U . Points closer to the unit circle indicate modes with longer timescales. Color depicts the overlap q i , defined as the squared length of the projection of each DMD eigenmode v i onto the vector space spanned by the Λ a vectors. C . IB features are organized into a hierarchy of timescales. Performing PCA in the space of meta-neuron filters over 1000 repetitions of the training provides a hierarchy of IB features. Each feature ϕ µ yields a timescale through projection onto the evolution operator, Eq. 12 . Green points show the result for time-shuffled data that destroy predictability. Dotted line is at Δ t = 1 / 60 s . Error bars include uncertainty both in U and in ϕ . Light blue points show variation over 100 estimates of ϕ , each using only 500 repetitions of training for Λ. Inset depicts the eigenvalues for U for both the intact (blue) and time shuffled (green) data. In our first comparison between DMD and IB, we examine IB meta-neuron features in the eigenbasis { v i } of the DMD operator. Recall that IB meta-neuron features are encoded in the K row-vectors (each of dimension 7 N ) of the matrix Λ. Because any invertible transformation of Z leaves the mutual information invariant, all invertible linear operations Λ → L Λ yield equally good coarse-graining performance. What is important, then, is the rowspace of Λ, i.e. the linear subspace of ℝ 7 N spanned by the row-vectors of the matrix Λ, which is invariant to Λ → L Λ transformations, and corresponds to what the meta-neurons collectively pay attention to in the neural activity X . 
Therefore, to quantify how well each mode i of the DMD dynamics is encoded in the IB meta-neurons, we define its overlap with their subspace as: $q_i = \lVert P_\Lambda v_i \rVert^2 / \lVert v_i \rVert^2$, where $P_\Lambda = \Lambda^T (\Lambda \Lambda^T)^{-1} \Lambda$ is the projection operator onto the row space of Λ. For example, if a mode has an overlap q i = 1, then $X \cdot v_i$ can be read out of the meta-neuron state Z with perfect fidelity, whereas an overlap of zero indicates that Z can tell us nothing about $X \cdot v_i$. Fig 4B shows all eigenvalues λ i in the complex plane, colored by their overlap q i with the IB meta-neurons. We observe that modes with the largest overlap q i have the largest | λ i |, corresponding to the longest time scales. This result suggests that the IB picks out features of the neural state that are the most stable over time, and thus have the largest predictive power for their own evolution. Due to the invariance of Z upon linear transformation, the above analysis does not allow us to directly pair individual IB meta-neuron features (rows of Λ) with DMD modes ( v i ). To obtain an approximate hierarchy of IB features and break the symmetry, we run 1000 independent trials of the IB training, and look for the main directions of variation of the IB meta-neurons. Explicitly, denoting by $\Lambda^{(n)}$ the compression matrix in trial n , we compute its covariance over both trial n and meta-neuron a , and diagonalize it: $C = \frac{1}{N_{\rm trials} K} \sum_{n,a} \Lambda_a^{(n)} {\Lambda_a^{(n)}}^{T} = \sum_\mu l_\mu \phi_\mu \phi_\mu^{T}$, with $\phi_\mu^{T} \phi_\nu = \delta_{\mu\nu}$. IB features are then defined by the eigenvectors ϕ µ , and ranked by descending order of explained variance l µ . We then express the evolution operator in this new eigenbasis: $U = \sum_{\mu,\nu} u_{\mu\nu} \phi_\mu \phi_\nu^{T}$, with $u_{\mu\nu} = \phi_\mu^{T} U \phi_\nu$. We observe that, empirically, diagonal terms u µµ dominate that sum. Keeping only the diagonal terms and defining $\tau_\mu = -\Delta t / \ln |u_{\mu\mu}|$, we obtain: $U \approx \sum_\mu e^{-\Delta t/\tau_\mu} \phi_\mu \phi_\mu^{T}$. This rewriting of the evolution operator allows us to associate a time scale with each IB meta-neuron feature ϕ µ . Fig. 4C shows that the time scale correlates with IB rank: it is highest for the most relevant IB features (highest l µ , low rank), and decreases as a function of rank (blue points). 
We emphasize that the IB features were not selected to exhibit this timescale hierarchy. As a control, we repeated the analysis on time-shuffled data (where both time and trial index are randomly permuted), revealing an almost flat dependence of time scale versus IB rank (green points), consistent with the expectation that time-shuffled data should have no predictive ability beyond the trivial 6-bin overlap between X t and X t +1 . Indeed, the DMD spectrum{ λ i }of the intact versus timeshuffled data (inset) shows that large-modulus eigenvalues corresponding to long timescales are suppressed by this shuffling operation. In summary, this analysis shows that the features most relevant for prediction, as revealed by IB, correspond to the slowest modes of the dynamics revealed by the DMD analysis. F. Predictive information is collectively encoded From the perspective of a downstream predictor, how important are collective effects in predicting retinal activity? To address this question, we first examine the contributions of individual neurons to the predictability of random groups. Specifically, we measure the relationship between a neuron’s information about itself (selfinformation) and the change in information it provides to a collective upon its inclusion (information contribution). For each neuron A in the 93-cell population, we compute its self-information I ( X A ; Y A ) directly from an empirical histogram and sample ten random groups of 49 neurons B ( Fig 5A ). We then use NCE to estimate I ( X B ; Y B ), the predictive information in these groups without A , as well as I ( X, Y ), the predictive information of all neurons in A ∪ B at the same interval τ . The difference δ A | B = I ( X ; Y ) − I ( X B ; Y B ) represents the increase in total predictability upon including A in the collective. We call it the contextual information of neuron A . Download figure Open in new tab FIG. 5. Synergy in predictive information. A . 
Depiction of single neuron contribution test. For each neuron A we randomly sample 10 groups B of 49 neurons each ( A and B are disjoint) then estimate I ( X A ; Y A ), I NCE ( X B ; Y B ), and I NCE ( X ; Y ), with X = X A ∪ B . We perform this test at τ = Δ t = 17 ms and at τ = 100 ms. B . Contextual predictive information δ A | B = I ( X ; Y ) − I ( X B ; Y B ) of A , averaged over B , as a function of self-information I ( X A ; Y A ). Deviation above the dashed unity line indicates synergistic effects. Red dot is the average over neurons. Collective effects are more important for prediction at the longer interval. C . Scaling of the mean predictive information in 20 random groups G as a function of their size N , for four different prediction intervals. Each curve is normalized by its maximal value at N = 93. Curve convexity implies the presence of synergy, S ( A ; B ) > 0. D . Schematic depiction of hypothesized correlation structure. At short timescales, prediction synergy is positive for small groups and neutral or redundant in large groups, suggesting greater autocorrelation content, less widespread cross-correlations, and effective independence between groups past some scale. Meanwhile, neurons which are highly self-predictive at long timescales also exhibit strong prediction synergy in random collectives, which is explained by strong cross-correlations. Fig. 5B shows the contextual information, δ A | B , versus its self-information, I ( X A ; Y A ), at short (17 ms) and long (100 ms) prediction timescales, for each neuron in the population. If neurons were encoding independent features of the stimulus, they would all fall on the identity line. We call neurons that encode more predictive information in the context of other cells than by themselves, δ A | B > I ( X A ; Y A ), synergistic. A vast majority of neurons are synergistic. 
In fact, many neurons have low self-information I ( X A ; Y A ), but substantial contextual information δ A | B , especially at short τ = 17 ms (left panel). Such neurons are individually uninformative at the given timescale for downstream predictors; their utility only arises when read out within some group. These synergistic effects increase the overall predictability of the population: at short prediction interval ( τ = 17 ms) the 93-cell population has about ⟨ I ( X ; Y ) ⟩ = 1.2 bits of predictive information per time bin, or about 13 mbits/neuron, versus I ( X A ; Y A ) = 8 mbits of self-predictive information per neuron on average. Thus, correlations between weakly-firing, low self-information neurons and the other neurons cause the scaling of total predictive information to exceed the independent-neuron estimate. We find that synergistic effects are even stronger at longer timescales ( τ = 100 ms, right panel), where predictive information is much lower, suggesting a shift towards collective encoding of predictive information as the prediction interval increases. More generally, we can define a predictive synergy between two subpopulations A and B of neurons to quantify how much extra predictive information they carry together relative to their sum: $S(A;B) = I(X;Y) - I(X_A;Y_A) - I(X_B;Y_B)$, where X = X A ∪ B is the joint activity of the population. To get a summary estimate of synergy in the population, we can compute the predictive information as a function of N , averaged over random subgroups G of neurons of size N , $\bar{I}(N) = \langle I(X_G;Y_G) \rangle_{|G|=N}$. Fig. 5C shows this quantity normalized by its maximal value at N = 93, for various prediction intervals τ . We observe that $\bar{I}(N)$ is a convex function of N , which can be translated mathematically as $\bar{I}(N_A + N_B) > \bar{I}(N_A) + \bar{I}(N_B)$ (convexity together with $\bar{I}(0) = 0$ implies superadditivity), or equivalently S ( A ; B ) > 0 on average. The curve becomes more strongly convex, and hence predictive synergy larger, as the prediction interval τ is increased, consistent with the result for single neurons within a population. 
We hypothesize that this trend towards increasingly collective encoding at longer timescales is reflected in the correlation structure, as schematically depicted in Fig. 5D , with blue connections representing correlations. A simple Gaussian toy model highlighting the relationship between correlations and synergy is presented in Appendix B. We explore more explicitly how correlations in the neural population impact synergy in the next section. G. How do correlations impact predictive information? Correlations between neurons have two sources: stimulus-induced correlations, which stem from common or correlated stimulus inputs into the neurons, and noise correlations, which originate from shared noise and direct interactions between cells. To capture the contribution of noise correlations to predictability, we make use of the trial structure of the dataset ( Figs 1E and 6A ). Each stimulus was shown to the salamander retina around 80 times (in the case of the “opticflow” stimulus, 85 times). During each of these repeated trials, the stimulus driving responses in the retina stays exactly the same, so any variation in the responses comes from noisy outcomes of the internal dynamics of the retina. Formally, we can fix the stimulus by conditioning the response on the time t within the trial, p ( x, y | t ). Correlations between X and Y under this conditioning and across trials correspond to noise correlations. The unconditioned distribution studied in the previous section is then $p(x, y) = \langle p(x, y | t) \rangle_t$, the average of p ( x, y | t ) over the time t within the trial, and includes both stimulus and noise correlations. To remove noise correlations, we can randomly shuffle trials separately for each neuron. Denote by x n,t,a the state of the n th neuron on the t th time step and in the a th repeat of the stimulus. For each neuron, we draw a random permutation π n that permutes trials to get a new dataset $\tilde{x}_{n,t,a} = x_{n,t,\pi_n(a)}$ in which the noise correlation structure has been destroyed, while keeping the stimulus correlation structure through the time dependence. 
We call this operation, depicted on the left side of Fig. 6A , “per-cell” shuffle. It corresponds to making neurons conditionally independent: p per cell ( x, y | t ) = ∏ n p ( x n , y n | t ), where the product runs over cells. Fig 6B shows that this per-cell shuffle does not have a substantial impact on predictive information at short timescales ( τ = 1 / 60 s). This suggests that noise correlations between neurons do not contribute to the predictive information. Download figure Open in new tab FIG. 6. Contributions of cross- and autocorrelations to predictive information. Cross-correlations contribute significantly at short intervals and dominate at large intervals, while autocorrelations only significantly contribute at short intervals. A . Each stimulus (one of five 20 s naturalistic movies) was shown to the retina a total of 80-90 times. We use this trial structure to probe noise correlations. To remove noise correlations between neurons, we randomly permute the trial in a way that is consistent in time, but changes based on cell (left). We similarly remove noise auto-correlations by permuting the trial differently at each time step (right). Both these methods change the trial orderings but preserve the stimulus dependence. B . Predictive information from “opticflow” stimulus with different noise correlations removed by the methods described in A. Removing noise correlation between cells (per-cell shuffle) has negligible effect, but destroying noise autocorrelations (per-time shuffle) removes significant predictive information. C . Predictive information as a function of the prediction interval, with and without (per-time shuffle) noise correlations. At longer intervals, the gap narrows, meaning that noise autocorrelations matter less. D . By removing interactions between neurons in the critic function, we estimate the contributions of cross-correlations at all orders to predictive information. E . 
Difference in predictive information between fully expressive and independent critic functions, at two different prediction intervals. This difference represents the information discarded by a downstream predictor when it ignores cross correlations at all orders. As a result of the negligible effect of noise cross-correlations, any information carried from the present to the future by noise correlations must be comprised of autocorrelations, i.e. correlations between the activity x n of a single neuron and its future y n , conditioned on t . We can quantify this effect by building a “per-time” shuffle, by drawing a trial permutation for each time t, π t , and defining shuffled data through $\tilde{x}_{n,t,a} = x_{n,t,\pi_t(a)}$. This other shuffling method, depicted on the right side of Fig. 6A , destroys correlations between the activities of neurons at different times conditioned on the stimulus, p per time ( x, y | t ) = p ( x | t ) p ( y | t ). Predictive information in that “per-time” shuffle is shown in Fig. 6B for τ = Δ t = 17 ms. Out of the 1.2 total bits of predictive information available at τ = Δ t , destroying temporal noise correlations removes around 0.35 bits. For longer prediction intervals τ , the relative contribution of noise correlations to predictive information begins to fade ( Fig 6C ). This tells us that the long timescales of predictions found in the neural code (quantified in the previous sections, see Fig. 2 ) are actually due to stimulus-induced correlations between neurons, rather than through the noise-induced auto-correlation of each neuron. In summary, noise correlations only contribute to predictive information at short intervals. Since these correlations are a single-neuron effect, their disappearance at longer timescales is consistent with our hypothesis that the balance of individual to collective correlations shifts towards collective at late times. 
To quantify more directly the contribution to predictive information of correlations between neurons at different times, we need to go beyond shuffling operations. We would like to build a control in which correlations between different neurons at different times are destroyed, but not between neurons at the same time. We leverage the fact that the NCE estimate is a lower-bound of the Barber-Agakov bound; its optimization can be conceptualized as variational inference within a class of models. Recall that our variational estimate for a joint distribution p ( x, y ) is represented by a so-called critic function f ( x, y ), which encodes the interaction between x and y , but not the marginal joint distributions p ( x ) and p ( y ), which contain all same-time correlations between cells. So far we have taken the critic function to be a fully expressive neural network, able to capture any relevant correlations ( Fig 6D , left). To destroy interactions between neurons across time, we consider an independent critic function ansatz ( Fig 6D , right): $f_{\rm ind}(x, y) = \sum_{n=1}^{N} f_n(x_n, y_n)$. This independent critic yields a family of distributions q ind ( y | x ) which only capture equal-time correlations and autocorrelations. In essence, this ansatz only allows each neuron to directly inform its own future. Fig 6E shows the predictive information for the full (using a generic f ( x, y )) versus independent (using f ind ) critic for short ( τ = 17 ms) and long ( τ = 100 ms) prediction intervals, under the opticflow stimulus. The drop D in predictive information upon forcing independence may be written as: $D = I_{\rm NCE}^{\rm full}(X;Y) - I_{\rm NCE}^{\rm ind}(X;Y)$. D represents the additional information about Y which can be extracted from X from temporal cross-correlations. At τ = Δ t , observing cross-correlations provides about D = 0.7 bits, or 58% of the available predictive information, while at τ = 100 ms cross-correlations provide about D = 0.5 bits, or 84% of the total. 
This result directly confirms the picture that temporal cross-correlations between neurons become increasingly important for prediction at long timescales. DISCUSSION Combining information theory and machine learning we have demonstrated that the retina compresses information to make long-term predictions. By going beyond previous work that was limited to small sets of neurons [ 7 , 44 ], we accessed the many-cell regime by leveraging a variational inference technique. We find that all predictive information in a large retinal population is encoded collectively in a few delocalized linear features. The predictive and decodable features we have discovered may be crucial for quick reactions in dangerous situations [ 45 – 47 ]. In the large populations we studied, even once activity is binarized, the size of the space of possible states is $2^{93}$, suggesting that coarse-graining schemes are involved in neural processing. Additionally, since the neural code is sparse—neurons are much more likely to be silent— many states will never occur. Next, neural responses are correlated. These correlations are induced both by stimulus and by physiology, and both allow us to make statements about the state of given neurons given the firing patterns of other neurons. Finally, the code must ultimately be learnable, otherwise the brain could never decode visual signals. A number of studies have investigated coarse-graining in the retina, and all describe procedures that reduce the state space while preserving relevant features [ 8 , 24 , 44 , 48 – 52 ], although the definition of relevance varies between them. Here we focused on the predictive aspect and demonstrated a compression scheme which preserves predictive information in a relatively large population (93 RGCs). This allowed us to take large-scale collective effects into account and more accurately represent the problem presented to downstream neurons. 
In particular, we showed that predictive information in a population of retinal neurons can be encoded in a small number of linear features. By varying the target prediction interval, we found that compressibility extends to long timescales (500 ms), and that there are both generalized and timescale-specific predictive features present. Regardless of the timescale, these features are delocalized, spreading across the entire neural population, and capture information which is collectively encoded. From the perspective of predictive downstream neurons, is it better to treat signals from neurons individually or collectively? We found that individual effects contribute most at short timescales, and collective effects are crucial at all intervals. We quantified the prediction synergy between different subsets of neurons. Single neurons almost always provide more predictive information to a collective than they have about themselves, and at moderate to long prediction intervals, we observed finite prediction synergy between large subsets. In order to see further into the future, downstream neurons therefore need to incorporate inputs from more neurons. Equal-time cell-cell noise correlations do not significantly contribute to predictive information. Noise autocorrelations are significant at short timescales, playing a smaller role for large prediction intervals. Even at the shortest prediction interval, close to half of the predictive information received by a downstream neuron comes from correlations between different RGCs. We conclude that although optimal prediction of retinal outputs at long timescales can be done with only a few linear features, these features need to be collective. Our DMD analysis serves as a test which corroborates the idea put forward in [ 42 ], but in a complex biological context. By maximizing predictive information with a fixed set of variables, it seems we automatically encode long timescales. 
The correspondence suggests that if one chooses a number K of meta-neurons such that not all of the predictive information is captured, one still approximately recovers the K longest timescales in the system. To say with greater certainty that the K first IB features encode the K longest timescale modes, future work could try to find IB features which are true eigenfunctions of a better estimate of the DMD operator. This might be possible with more expressive, nonlinear meta-neuron features, as well as DMD with a well-chosen set of nonlinear observables. By variationally solving a predictive information bottleneck problem on data taken from vertebrate retina, we have demonstrated that downstream prediction of a large population of RGCs is plausible. Applied to data from these downstream neurons, future analyses can reveal their decoding performance relative to the optimum. Further work should constrain the coarse-graining maps and critic function Ansatz to a family of interpretable, mechanistic models of retina and downstream neurons and reveal how predictive computation might be enabled or hindered by physiology, similarly to [ 49 ]. More generally, given the ubiquitous importance of prediction as a biological function, we also anticipate that this method can be useful in finding computationally relevant coarse-graining schemes in other complex and biological systems. DATA AND CODE AVAILABILITY Data is available at https://doi.org/10.5061/dryad.4qrfj6qm8 . Code is available at https://github.com/sepalmer/metaneurons . ACKNOWLEDGMENTS This work was supported by the National Science Foundation through the Physics Frontier Center for Living Systems (PHY-2317138). This work was supported by the NSF-Simons National Institute for Theory and Mathematics in Biology, awards NSF DMS-2235451. This work was supported by a FACCTS award (AMW, SEP). This work was partially supported by the European Research Council Consolidator grant no. 
724208 (AMW), Fondation Bettencourt-Schueller (TM), Simons Foundation MP-TMPS-00005320, as well as a Schmidt Sciences Polymath award (SEP). Appendix A NCE implementation details Here we give details on how the “critic function” is computed and optimized. Recall that it takes the form: $f_\theta(x, y) = \sum_{a=1}^{N_{\rm embed}} u_a(x)\, v_a(y)$. We take v a and u a to be feed-forward multilayer perceptron (MLP) neural networks and θ are all of the model parameters. When compressing from a single time bin, both u and v have 2 hidden layers with 32 neurons, and output to the embedding dimension of size N embed = 30. When we compress from a TDE state of seven time bins, we use hidden layers with 32 neurons and an embedding dimension of 100. Hidden layers have a tanh nonlinearity. The input dimensions depend on the specific calculation. For example, when coarse-graining to K = 4 meta-neurons to predict the whole population state in a single future time bin, we would have $u : \mathbb{R}^{4} \to \mathbb{R}^{30}$ and $v : \mathbb{R}^{93} \to \mathbb{R}^{30}$. To train the models, we used the Adam optimizer [ 53 , 54 ] with a learning rate of 0.009, batch sizes of B = 800, and held out half the data for testing. All reported mutual information estimates are from the held-out test sets. We explored effects of dropout on weights within these MLPs but found that early stopping with no dropout was the most effective regularization. We chose to stop training after each data point in the test set had been used by the optimizer 16 times. In both testing and training, the first 20 frames (1/3 of a second) from each response set were removed as these contain transients from stimulus switching. For testing, we constructed 500 batches of size B = 500. Typically, this whole train/test procedure would be done on the order of tens of times, producing means across batches, training initializations, and random train/test designations. 
Uncertainty for mutual information estimates incorporated both noise due to different initializations as well as noise between batches. Appendix B Prediction synergy To understand how correlation structure may affect prediction synergy, we briefly consider a Gaussian toy model. Two sub-systems A and B have “present” states x = ( x A , x B ), and future states ( y A , y B ). Because all variables are jointly Gaussian, we can specify all parameters by choosing first- and second-order correlation functions. We choose all means to be zero, that is for α = A, B , ⟨ x α ⟩ = 0 = ⟨ y α ⟩. Next, we fix the scale of fluctuations so that all variances are unity, $\langle x_\alpha^2 \rangle = \langle y_\alpha^2 \rangle = 1$. With only pairwise correlations between degrees of freedom left, we choose three parameters, a, b , and c , representing autocorrelations, equal-time A - B correlations, and across-time A - B correlations (cross-correlations) respectively. Note that the only choices of ( a, b, c ) which are valid are those such that all correlation matrices are positive definite. The expression for prediction synergy in this toy model is given by Eq. B1. As a point of reference, note that the predictive information of either subsystem is given by $I(X_A; Y_A) = I(X_B; Y_B) = -\log(1 - a^2)$, where for ⟨ [ x A , y A ] T [ x A , y A ] ⟩ to be positive definite we must have a 2 < 1. Several limits of Eq. B1 are especially illuminating. The easiest limit is that of independence between the subsystems, b = c = 0. While this can be seen to yield S ( A ; B ) = 0 from the expression above, the deeper reason is that when A and B are statistically independent we must have I ( X ; Y ) = I ( X A ; Y A ) + I ( X B ; Y B ), and hence S ( A ; B ) = 0. Next, consider a model where B is a direct copy of A . Since x A has the same relationship with y A as it does with y B , we must have a = c . Further, x A must relate to x B in the same way it relates to itself, so we take b → 1, which yields S ( A ; B ) = log(1 − a 2 ) = − I ( X A ; Y A ). That is, the prediction synergy is negative, indicating redundancy. 
Moreover, the extent of this redundancy is given by the total predictive information of a single subsystem, in agreement with our interpretation of this model as consisting of two redundant copies. As a final limit, we consider the removal of all correlations except cross-correlations. In this case, a = 0 implies that I ( x A ; y A ) = 0, meaning each subsystem observed alone has no information about its future. Taking b = 0 as well, we find that S ( A ; B ) = − 2 log(1 − c 2 ). From the definition of S , we also see that S ( A ; B ) = I ( X ; Y ). In this limit, all predictive information is encoded collectively, in that observation of both subsystems is required to extract it. It is interesting to note that the expression − 2 log(1− c 2 ) is the same as in the case of independent systems except under the replacement c → a . The basic reason for this is that this limit also describes two independent subsystems, but x A should be grouped with y B and x B with y A , meaning our choice of subsystems does not capture this independence. References [1]. ↵ F Creutzig , H Sprekeler , Predictive Coding and the Slowness Principle: An Information-Theoretic Approach . Neural Computation 20 , 1026 – 1041 ( 2008 ). OpenUrl CrossRef PubMed [2]. F Creutzig , A Globerson , N Tishby , Past-future information bottleneck in dynamical systems . Physical Review E 79 , 041925 ( 2009 ). OpenUrl [3]. ↵ NC Rust , SE Palmer , Remembering the Past to See the Future . Annual review of vision science 7 , 349 ( 2021 ). OpenUrl PubMed [4]. ↵ A Pérez-Escudero , GG De Polavieja , Collective Animal Behavior from Bayesian Estimation and Probability Matching . PLoS Computational Biology 7 , e1002282 ( 2011 ). OpenUrl [5]. ↵ MJ Berry , IH Brivanlou , TA Jordan , M Meister , Anticipation of moving stimuli by the retina . Nature 398 , 334 – 338 ( 1999 ) Number: 6725 Publisher: Nature Publishing Group. OpenUrl CrossRef PubMed Web of Science [6]. 
↵ T Gollisch , M Meister , Eye Smarter than Scientists Believed: Neural Computations in Circuits of the Retina . Neuron 65 , 150 – 164 ( 2010 ). OpenUrl CrossRef PubMed Web of Science [7]. ↵ SE Palmer , O Marre , MJ Berry , W Bialek , Predictive information in a sensory population . Proceedings of the National Academy of Sciences 112 , 6908 – 6913 ( 2015 ) Publisher: Proceedings of the National Academy of Sciences. OpenUrl Abstract / FREE Full Text [8]. ↵ AJ Sederberg , JN MacLean , SE Palmer , Learning to make external sensory stimulus predictions using internal correlations in populations of neurons . Proceedings of the National Academy of Sciences 115 , 1105 – 1110 ( 2018 ) Publisher: Proceedings of the National Academy of Sciences. OpenUrl Abstract / FREE Full Text [9]. B Liu , A Hong , F Rieke , MB Manookin , Predictive encoding of motion begins in the primate retina . Nature Neuroscience pp. 1 – 12 ( 2021 ). [10]. ↵ BD Hoshal , et al. , Stimulus-invariant aspects of the retinal code drive discriminability of natural scenes . Proceedings of the National Academy of Sciences 121 , e2313676121 ( 2024 ). OpenUrl CrossRef PubMed [11]. ↵ R Nijhawan , Visual prediction: Psychophysics and neurophysiology of compensation for time delays . Behavioral and Brain Sciences 31 , 179 – 198 ( 2008 ). OpenUrl CrossRef PubMed Web of Science [12]. PR Roelfsema , AK Engel , P König , W Singer , Visuomotor integration is associated with zero time-lag synchronization among cortical areas . Nature 385 , 157 – 161 ( 1997 ). OpenUrl CrossRef PubMed Web of Science [13]. ↵ R Nijhawan , K Kirschfeld , Analogous mechanisms compensate for neural delays in the sensory and the motor pathways: Evidence from motor flash-lag . Current Biology 13 , 749 – 753 ( 2003 ). OpenUrl CrossRef PubMed Web of Science [14]. ↵ C Kayser , W Einhäuser , P König , Temporal correlations of orientations in natural scenes . Neurocomputing 52 , 117 – 123 ( 2003 ). OpenUrl [15]. 
↵ M Dorr , T Martinetz , KR Gegenfurtner , E Barth , Variability of eye movements when viewing dynamic natural scenes . Journal of vision 10 , 28 – 28 ( 2010 ). OpenUrl Abstract [16]. ↵ Y Dan , JJ Atick , RC Reid , Efficient coding of natural scenes in the lateral geniculate nucleus: experimental test of a computational theory . Journal of neuroscience 16 , 3351 – 3362 ( 1996 ). OpenUrl Abstract / FREE Full Text [17]. JM Salisbury , SE Palmer , A dynamic scale-mixture model of motion in natural scenes . bioRxiv pp. 2023 – 10 ( 2023 ). [18]. ↵ JM Salisbury , SE Palmer , Optimal Prediction in the Retina and Natural Motion Statistics . Journal of Statistical Physics 162 , 1309 – 1323 ( 2016 ). OpenUrl CrossRef [19]. DL Ruderman , Origins of scaling in natural images . Vision research 37 , 3385 – 3398 ( 1997 ). OpenUrl CrossRef PubMed Web of Science [20]. DL Ruderman , TW Cronin , CC Chiao , Statistics of cone responses to natural images: implications for visual coding . JOSA A 15 , 2036 – 2045 ( 1998 ). OpenUrl [21]. VA Billock , GC de Guzman , JS Kelso , Fractal time and 1/f spectra in dynamic images and human vision . Physica D: Nonlinear Phenomena 148 , 136 – 146 ( 2001 ). OpenUrl [22]. ↵ MM Roberts , MM Schira , B Spehar , ZJ Isherwood , Nature in motion: The tuning of the visual system to the spatiotemporal properties of natural scenes . Journal of Vision 22 , 7 – 7 ( 2022 ). OpenUrl CrossRef PubMed [23]. ↵ BD Hoshal , et al. , Stimulus invariant aspects of the retinal code drive discriminability of natural scenes ( 2023 ). [24]. ↵ JS Prentice , et al. , Error-Robust Modes of the Retinal Population Code . PLOS Computational Biology 12 , e1005148 ( 2016 ) Publisher: Public Library of Science. OpenUrl PubMed [25]. A Loback , J Prentice , M Ioffe , M Berry II . , Noise-Robust Modes of the Retinal Population Code Have the Geometry of “Ridges” and Correspond to Neuronal Communities . Neural Computation 29 , 3119 – 3180 ( 2017 ). OpenUrl PubMed [26]. G Tkačik , et al. 
, Thermodynamics and signatures of criticality in a network of neurons . Proceedings of the National Academy of Sciences 112 , 11508 – 11513 ( 2015 ). OpenUrl Abstract / FREE Full Text [27]. U Ferrari , et al. , Separating intrinsic interactions from extrinsic correlations in a network of sensory neurons . Physical Review E 98 , 042410 ( 2018 ). OpenUrl [28]. ↵ G Mahuas , T Buffet , O Marre , U Ferrari , T Mora , Strong, but not weak, noise correlations are beneficial for population coding . PRX Life ( 2025 ). [29]. ↵ M Carandini , et al. , Do we know what the early visual system does? Journal of Neuroscience 25 , 10577 – 10597 ( 2005 ). OpenUrl Abstract / FREE Full Text [30]. ↵ H Kirchner , SJ Thorpe , Ultra-rapid object detection with saccadic eye movements: Visual processing speed revisited . Vision research 46 , 1762 – 1776 ( 2006 ). OpenUrl CrossRef PubMed Web of Science [31]. ↵ M Koch-Janusz , Z Ringel , Mutual information, neural networks and the renormalization group . Nature Physics 14 , 578 – 582 ( 2018 ). OpenUrl [32]. ↵ DE Gökmen , Z Ringel , SD Huber , M Koch-Janusz , Statistical physics through the lens of real-space mutual information . Physical Review Letters 127 , 240603 ( 2021 ). OpenUrl PubMed [33]. DE Gökmen , Z Ringel , SD Huber , M Koch-Janusz , Symmetries and phase diagrams with real-space mutual information neural estimation . 104 , 064106 (year?). [34]. ↵ D. Gökmen , et al. , Compression theory for inhomogeneous systems . Nature Communications 15 , 10214 ( 2024 ). OpenUrl PubMed [35]. ↵ N Tishby , FC Pereira , W Bialek , The information bottleneck method . ( 2000 ) arXiv:physics/0004057. [36]. ↵ S Palmer , et al. , Larval salamander retinal population data in response to natural movies from the chicago motion database , doi: 10.5061/dryad.4qrfj6qm8 . ( 2024 ). OpenUrl CrossRef [37]. ↵ O Marre , et al. , Mapping a Complete Neural Population in the Retina . The Journal of Neuroscience 32 , 14859 – 14873 ( 2012 ). 
OpenUrl Abstract / FREE Full Text [38]. ↵ J Salisbury , SE Palmer , Optimal prediction and natural scene statistics in the retina . arXiv:1507.00125 [q-bio] ( 2015 ) arXiv: 1507.00125 . [39]. ↵ E Vig , M Dorr , E Barth , Efficient visual coding and the predictability of eye movements on natural movies . Spatial Vision 22 , 397 – 408 ( 2009 ). OpenUrl CrossRef PubMed Web of Science [40]. ↵ S Nishimoto , et al. , Reconstructing visual experiences from brain activity evoked by natural movies . Current biology 21 , 1641 – 1646 ( 2011 ). OpenUrl CrossRef PubMed [41]. ↵ B Poole , S Ozair , Avd Oord , AA Alemi , G Tucker , On Variational Bounds of Mutual Information ( 2019 ) arXiv: 1905.06922 [cs]. [42]. ↵ MS Schmitt , M Koch-Janusz , M Fruchart , DS Seara , V Vitelli , Information theory for model reduction in stochastic dynamical systems ( 2023 ) arXiv: 2312.06608 [cond-mat]. [43]. ↵ PJ Schmid , Dynamic mode decomposition of numerical and experimental data . Journal of Fluid Mechanics 656 , 5 – 28 ( 2010 ). OpenUrl CrossRef [44]. ↵ SCL Durian , K Bojanek , O Marre , SE Palmer , Preserving predictive information under biologically plausible compression ( 2025 ). [45]. ↵ P Domenici , ME Hale , Escape responses of fish: a review of the diversity in motor control, kinematics and behaviour . Journal of Experimental Biology 222 , jeb166009 ( 2019 ). OpenUrl Abstract / FREE Full Text [46]. MY Peek , GM Card , Comparative approaches to escape . Current opinion in neurobiology 41 , 167 – 173 ( 2016 ). OpenUrl CrossRef PubMed [47]. ↵ G Card , MH Dickinson , Visually mediated motor planning in the escape response of drosophila . Current Biology 18 , 1300 – 1307 ( 2008 ). OpenUrl CrossRef PubMed Web of Science [48]. ↵ J Ding , et al. , Spatially displaced excitation contributes to the encoding of interrupted motion by a retinal direction-selective circuit . eLife 10 , e68181 ( 2021 ). OpenUrl CrossRef PubMed [49]. ↵ S Wang , et al. 
, Learning low-dimensional generalizable natural features from retina using a u-net . Advances in neural information processing systems 35 , 11355 – 11368 ( 2022 ). OpenUrl PubMed [50]. J Freedland , F Rieke , Systematic reduction of the dimensionality of natural scenes allows accurate predictions of retinal ganglion cell spike outputs . Proceedings of the National Academy of Sciences 119 , e2121744119 ( 2022 ). OpenUrl CrossRef PubMed [51]. JL Puchalla , E Schneidman , RA Harris , MJ Berry , Redundancy in the Population Code of the Retina . Neuron 46 , 493 – 504 ( 2005 ). OpenUrl CrossRef PubMed Web of Science [52]. ↵ I Bomash , Y Roudi , S Nirenberg , A Virtual Retina for Studying Population Coding . PLoS ONE 8 , e53363 ( 2013 ). OpenUrl CrossRef PubMed [53]. ↵ Z Zhang , Improved adam optimizer for deep neural networks in 2018 IEEE/ACM 26th international symposium on quality of service (IWQoS). (Ieee) , pp. 1 – 2 ( 2018 ). [54]. ↵ DP Kingma , Adam: A method for stochastic optimization . arXiv preprint arXiv: 1412.6980 ( 2014 ). View the discussion thread. Back to top Previous Next Posted October 19, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Coarse-graining reveals collective predictive information in a sensory population Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Coarse-graining reveals collective predictive information in a sensory population Adam G. Kline , Maciej Koch-Janusz , Aleksandra M. 
Walczak , Thierry Mora , Stephanie E. Palmer bioRxiv 2025.10.18.683195; doi: https://doi.org/10.1101/2025.10.18.683195 Share This Article: Copy Citation Tools Coarse-graining reveals collective predictive information in a sensory population Adam G. Kline , Maciej Koch-Janusz , Aleksandra M. Walczak , Thierry Mora , Stephanie E. Palmer bioRxiv 2025.10.18.683195; doi: https://doi.org/10.1101/2025.10.18.683195 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neuroscience Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41913) Biophysics (21436) Cancer Biology (18578) Cell Biology (25482) Clinical Trials (138) Developmental Biology (13372) Ecology (19889) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15599) Genomics (22483) Immunology (17728) Microbiology (40365) Molecular Biology (17163) Neuroscience (88540) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15130) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9818) Zoology (2269)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical version.