Natural gradient Bayesian sampling automatically emerges in canonical cortical circuits

doi:10.1101/2025.07.03.663069

Natural gradient Bayesian sampling automatically emerges in canonical cortical circuits

2025 · doi:10.1101/2025.07.03.663069

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 85,155 characters · extracted from preprint-html · click to expand

Natural gradient Bayesian sampling automatically emerges in canonical cortical circuits | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Natural gradient Bayesian sampling automatically emerges in canonical cortical circuits View ORCID Profile Zimei Chen , Yi Ren , View ORCID Profile Eryn Sale , Ying Nian Wu , View ORCID Profile Wen-Hao Zhang doi: https://doi.org/10.1101/2025.07.03.663069 Zimei Chen 1 Lyda Hill Department of Bioinformatics, UT Southwestern Medical Center 2 O’Donnell Brain Institute, UT Southwestern Medical Center Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Zimei Chen Yi Ren 1 Lyda Hill Department of Bioinformatics, UT Southwestern Medical Center 2 O’Donnell Brain Institute, UT Southwestern Medical Center Find this author on Google Scholar Find this author on PubMed Search for this author on this site Eryn Sale 1 Lyda Hill Department of Bioinformatics, UT Southwestern Medical Center 2 O’Donnell Brain Institute, UT Southwestern Medical Center Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Eryn Sale Ying Nian Wu 3 Department of Statistics, University of California, Los Angeles Find this author on Google Scholar Find this author on PubMed Search for this author on this site Wen-Hao Zhang 1 Lyda Hill Department of Bioinformatics, UT Southwestern Medical Center 2 O’Donnell Brain Institute, UT Southwestern Medical Center Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Wen-Hao Zhang For correspondence: wenhao.zhang{at}utsouthwestern.edu Abstract Full Text Info/History Metrics Supplementary material Preview PDF A bstract Accumulating evidence suggests the canonical cortical circuit, consisting of excitatory (E) and diverse classes of inhibitory (I) interneurons, implements Bayesian posterior sampling. However, most of the identified circuits’ sampling algorithms are simpler than the nonlinear circuit dynamics, suggesting complex circuits may implement more advanced algorithms. Through comprehensive theoretical analyses, we discover the canonical circuit innately implements natural gradient Bayesian sampling, which is an advanced sampling algorithm that adaptively adjusts the sampling step size based on the local geometry of stimulus posteriors measured by Fisher information. Specifically, the nonlinear circuit dynamics can implement natural gradient Langevin and Hamiltonian sampling of uni- and multi-variate stimulus posteriors, and these algorithms can be switched by interneurons. We also find that the non-equilibrium circuit dynamics when transitioning from the resting to evoked state can further accelerate natural gradient sampling, and analytically identify the neural circuit’s annealing strategy. Remarkably, we identify the approximated computational strategies employed in the circuit dynamics, which even resemble the ones widely used in machine learning. Our work provides an overarching connection between canonical circuit dynamics and advanced sampling algorithms, deepening our understanding of the circuit algorithms of Bayesian sampling. 1 I ntroduction The brain lives in a world of uncertainty and ambiguity, necessitating the inference of unobserved world states. The Bayesian inference provides a normative framework for this process, and extensive studies have suggested that neural computations across domains aligns with Bayesian principles, giving rise to the concept of the “Bayesian brain” ( Knill & Pouget, 2004 ). These include visual processing ( Yuille & Kersten, 2006 ), multi-sensory integration ( Ernst & Banks, 2002 ), decision-making ( Beck et al., 2008 ), sensorimotor learning ( Körding & Wolpert, 2004 ), etc. Recent studies suggested the canonical cortical circuit may naturally implement sampling-based Bayesian inference to compute the posterior ( Hoyer & Hyvärinen, 2003 ; Buesing et al., 2011 ; Aitchison & Lengyel, 2016 ; Haefner et al., 2016 ; Orbán et al., 2016 ; Echeveste et al., 2020 ; Zhang et al., 2023 ; Terada & Toyoizumi, 2024 ; Masset et al., 2022 ; Sale & Zhang, 2024 ), in that the large cortical response variability is consistent with the stochastic nature of sampling algorithms. The canonical cortical circuit ( Fig. 1A ) – the fundamental computational building block of the cerebral cortex – consists of excitatory (E) neurons and various inhibitory interneurons (I) including neurons of parvalbumin (PV), somatostatin (SOM), and vasoactive intestinal peptide (VIP) ( Adesnik et al., 2012 ; Fishell & Kepecs, 2020 ; Niell & Scanziani, 2021 ; Campagnola et al., 2022 ). Different interneuron classes have different intrinsic electrical properties and form specific connectivity patterns ( Fig. 1B ). The canonical circuit is highly conserved across a wide spectrum of vertebrate species and likely represents a common network architecture solution discovered by evolution over millions of years. Therefore, studying the algorithms underlying canonical circuits not only advances our understanding of neural computations, but also positions these circuits as building blocks for next-generation deep network models, with their clear algorithmic understanding enabling full interpretability. Download figure Open in new tab Figure 1. (A) The canonical circuit of E and diverse types of interneurons and sampling-based Bayesian inference. (B) The circuit model in the present study consists of E and two types of interneurons (PV and SOM). (C) The recurrent connection kernel between E neurons. (D) Feedforward input represented as a continuous approximation of Poisson spike trains with Gaussian tuning across the stimulus feature. (E) The feedforward input parametrically encodes the stimulus feature likelihood. (F) The population responses of E (top) and SOM neurons (bottom). (G) Stimulus feature values sampled by the E and SOM neurons respectively. (H) The network’s sampling distribution read out from E and SOM neurons. The E neuron’s position is regarded as stimulus feature sample z E , while the sample of SOM neurons z S contributes to the auxiliary momentum variable in Hamiltonian sampling. The distribution of z E (top marginal) will be used to approximate the posterior. The field has started to identify the algorithm of the canonical circuit. For example, a very recent study has identified the Bayesian sampling algorithm in reduced canonical circuit motifs ( Sale & Zhang, 2024 ): the reduced circuit of only E and PV neurons can implement Langevin posterior sampling in the stimulus feature manifold. And incorporating SOM into the circuit introduces oscillations that accelerate sampling by upgrading Langevin sampling into more efficient Hamiltonian sampling. Nevertheless, a significant gap remains between identified Bayesian sampling algorithms and the complex, nonlinear canonical circuit dynamics. A notable distinction is that canonical circuit dynamics is inherently non-linear and substantially more complex than the linear dynamics of Langevin and Hamiltonian samplings identified in previous circuit models and used in machine learning (ML) research. Rather than dismissing the added complexity as incidental to neural dynamics without computational purpose, we explore whether these nonlinear circuit dynamics may serve some advanced function. This raises a compelling question: Can nonlinear circuit dynamics implement more advanced and efficient sampling algorithms? If so, what are advanced circuit algorithms? To address this question, we perform comprehensive theoretical analyses of the canonical circuit model composed of E neurons and two classes of interneurons (PV and SOM). Our analysis reveals that canonical circuit dynamics not only implements standard Langevin and Hamiltonian sampling as revealed in Sale & Zhang (2024) , but innately incorporate the natural gradient (NG) to automatically adjust the step size (or the “temperature”) in the circuit’s Langevin and Hamiltonian sampling based on the local geometry of the posterior distribution measured by the Fisher information (FI). Specifically, we find the total activity of E neurons monotonically increases with posteriors’ FI, and dynamically control the effective sampling step size in the low-dimensional stimulus feature manifold. Remarkably, the NG sampling in canonical circuit dynamics exhibits computational strategies analogous to established numerical techniques in ML ( Hwang, 2024 ; Girolami & Calderhead, 2011 ; Marceau-Caron & Ollivier, 2017 ). These include, 1 ) the recurrent E input acts as a regularization analogous to adding a small number during FI inversion to prevent numerical instabilities ( Eq. 5 , α ) – a common practice in NG sampling algorithms; 2 ) When coupling multiple canonical circuits interact to sample multivariate stimulus posteriors, the coupled circuit approximates the full FI matrix with its diagonal elements ( Sec. 5 ), similar to the diagonal approximation used in scalable NG samplings. In addition, our analysis reveals that when the circuit transitions from resting state (no feedforward input) to evoked state (with feedforward input), the non-equilibrium circuit dynamics further accelerates sampling beyond the efficiency of standard NG sampling. We analytically identify the neural annealing strategy within canonical circuit dynamics ( Fig. 2J , Eq. 3b ). Download figure Open in new tab Figure 2. NG Langevin sampling in the reduced circuit (E and PV). (A) The need for adaptive step size to sample different posteriors. (B) The reduced circuit with fixed weights flexibly samples posteriors with different uncertainties. (C-D) E bump height U E increases with posterior FI (C) and determines the sampling time constant in the stimulus feature manifold. (E-F) The sampling convergence with recurrent weight w EE that acts as a regularizer ( Eq. 5 ). (G-H) The sampling convergence at different posterior uncertainties. (I-K) Non-equilibrium sampling further accelerates convergence. The non-equilibrium population responses (I), bump height (J), and the KL divergence (K) from the resting state (no feedforward input) to evoked state. Significance The present study provides the first demonstration that canonical cortical circuits with diverse classes of interneurons naturally implement natural gradient Langevin and Hamiltonian sampling. We establish a precise mapping between circuit components and computational elements of advanced sampling algorithms, bridging computational neuroscience and ML. And the canonical cortical circuit may inspire the new building block for more efficient, interpretable deep networks. 2 B ackground : T he canonical cortical circuit model We consider a nonlinear canonical circuit model consisting of E neurons and two classes of interneurons (PV and SOM) ( Fig. 1A ), whose dynamics is adopted from a recent circuit modeling study ( Sale & Zhang, 2024 ). This model is biologically plausible by reproducing tuning curves of different types of neurons ( Fig. A1A-C ), and is analytically tractable so we can directly identifying the nonlinear circuit’s algorithm. Briefly, each of the N E E neurons is tuned to a preferred 1D stimulus z = θ j . The full set of these preferences, , uniformly covers the whole stimulus space. E neurons are recurrently connected with a Gaussian kernel in the stimulus space ( Eq. 1d ). Both PV and SOM neurons are driven by E neurons but with different interactions: PV neurons deliver global, divisive normalization to E neurons ( Eq. 1b ), whereas SOM neurons provide local, subtractive inhibition ( Eq. 1c ). The whole circuit dynamics is (Sec. C for detailed explanation and construction rationale). u X and r X represent the synaptic inputs and firing rates of neurons of type X respectively. In Eq. (1a) , the neuronal types X ∈ { E, F, S } representing inputs from E neurons, sensory feedforward inputs ( Eq. 1e ), and SOM neurons ( Eq. 1c ) respectively. [ x ] + = max( x , 0) is the negative rectification. E neurons receive internal Poisson variability with Fano factor F, mimicking stochastic spike generation that can provide appropriate internal variability for circuit sampling ( Zhang et al., 2023 ). In particular, g S is the “gain” of SOM neurons and can be modulated (see Discussion), which is the key circuit mechanism to flexibly switch between static inference and dynamic inference with various speeds. To facilitate math analysis, the above dynamics consider infinite number of neurons in theory ( N E → ∞), then the summation of inputs from other neurons θ j becomes an integration (convolution) over θ , e.g., ( W * r )( θ ) = ∫ W ( θ − θ ′ ) r ( θ ′ ) dθ ′ , while our simulations take finite number of neurons. ρ = N E / 2 π is the neuronal density in the stimulus feature space, a factor in discretizing the integral. 2.1 T heoretical analysis of the canonical circuit dynamics It has established theoretical approach to obtain analytical solutions of the nonlinear recurrent circuit dynamics considered in the present study ( Fung et al., 2010 ; Wu et al., 2016 ; Zhang & Wu, 2012 ; Sale & Zhang, 2024 ), including attractor states, full eigenspectrum of the perturbation dynamics, and the projected dynamics onto the dominant eigenmodes. These analytical solutions are essential to identify the circuit’s Bayesian algorithms. Below, we briefly introduce the key steps and results of the theoretical analysis, with detailed math calculations in Sec. C. Attractors E neurons in canonical circuit dynamics have the following attractor states with a bump profile over the stimulus feature space ( Fig. A1 ; Sec. C), Similar bump attractor states exist for SOM neurons ( Eq. E2) . In contrast, PV neurons don’t have a spatial bump profile since their interactions with E neurons are unstructured ( Eq. 1b ). Dimensionality reduction for stimulus sampling dynamics The perturbation analysis reveals that the first two dominant eigenmodes of the circuit dynamics correspond to the change of bump position z E and the bump height U E respectively (Sec. C, ( Fung et al., 2010 ; Wu et al., 2016 )), and similarly for SOM neurons. We project the E dynamics ( Eq. 1a ) onto the above two dominant eigenvectors (calculating the inner product of the circuit dynamics and the eigenvectors), yielding the governing dynamics of the z E and U E (the projection of SOM neurons will be shown later in Sec. 6 and E), where U XY is the population input height from population Y to X . and are constants that don’t change with network activities. The approximation comes from omitting negligible nonlinear terms in the circuit dynamics (Sec. C.3). Our following theoretical analysis on circuit algorithms will be based on the above two equations. 2.2 B ackground : N atural gradient B ayesian sampling Amari’s natural gradient is a well-known method to adaptively adjust the sampling step size based on the local geometry characterized by the Fisher information (FI) G ( z ) ( Amari, 1998 ; Amari & Douglas, 1998 ; Amari, 2016 ; Girolami & Calderhead, 2011 ) (see details in Sec. (B.2), For a Gaussian distribution 𝒩 ( z | µ , Λ − 1 ), the FI will be its precision Λ and doesn’t depend on z . Natural gradient Langevin sampling (NGLS) The FI is used to determine the step size of the Langevin sampling dynamics to sample the posterior π ( z ) ( Girolami & Calderhead, 2011 ), α is a small positive constant acting as a regularization term to improve numerical stability in inverting the FI ( Hwang, 2024 ; Marceau-Caron & Ollivier, 2017 ; Wu et al., 2024 ), which is widely used in ML. η is a small constant similar to the inverse of “learning rate”. In the naive Langevin sampling, τ L is fixed rather than proportional to the FI. In the NG Langevin sampling, the τ L scales with the FI. If the distribution is widely spread out, the sampling step size will be larger, allowing for faster exploration of the space. Conversely, if the distribution is sharply peaked, the sampling step size will be smaller to explore the local region more thoroughly. Natural gradient Hamiltonian sampling (NGHS) It defines a Hamiltonian function H ( z, p ) where the momentum distribution π ( p | z )’s variance (rather than precision) is proportional to the FI G ( z ). The NGHS dynamics with friction is governed by ( Girolami & Calderhead, 2011 ; Ma et al., 2015 ), where τ H is the time constant of the Hamiltonian sampling, and γ is the friction that dampens momentum. The Hamiltonian dynamics with friction can be interpreted as a Langevin dynamics added into the momentum dynamics ( Chen et al., 2014 ; Ma et al., 2015 ). When γ = 0, Eq. (7) reduces into the naive Hamiltonian dynamics. Our following analysis will show the canonical circuit can automatically implement the natural gradient Langevin sampling and Hamiltonian sampling. 3 F rom circuit dynamics to B ayesian sampling In our framework, the stage from external stimulus z to the feedforward input r F is regarded as a generative process ( Fig. 1A ), and then the circuit dynamics ( Eqs. 1a and 1c ) effectively performs Bayesian sampling dynamics to compute the stimulus posterior, π ( z ) ≡ p ( z | r F ) ∝ p ( r F | z ) p ( z ). We hypothesize that the circuit computes subjective posterior distributions π ( z ) based on its internal generative model ( Lange et al., 2023 ), implicitly assuming the subjective prior in brain’s neural circuits matches the objective prior (usually not known precisely) of the world. With this hypothesis, we treat the canonical circuit, strongly supported by experiments, as a “ground truth”, and aim to identify the circuit’s internal generative model and its Bayesian sampling algorithms. Subjective prior p ( z ). We will leave the subjective prior p ( s ) unspecific for now and will find it through the analysis of the circuit dynamics. This will be shown later in the Eqs. (10 and 13 ). Stimulus likelihood L ( z ). The stochastic feedforward input from the stimulus z ( Eq. 1e ) naturally specifies the stimulus likelihood that is calculated as a Gaussian likelihood (see Sec. C.4), The mean µ z and precision Λ are geometrically regarded as r F ‘s location and height respectively ( Fig. 1D-E ). A single snapshot of r F parametrically conveys the stimulus likelihood p ( r F | z ), in that all likelihood parameters are read out from r F . In particular, the Gaussian stimulus likelihood is resulted from the the Gaussian profile of feedforward input tuning λ F ( θ | z ) ( Eq. 1e , Ma et al. (2006) . Circuit’s stimulus posterior FI comes from the likelihood ( Eq. 8 ) and the prior (unspecified now), Our following analysis will focus on connecting the circuit dynamics on the position and height subspace ( Eqs. 3a and 3b ) to the NG sampling dynamics ( Sec. 2.2 ), to identify how the circuit implements NG Langevin and Hamiltonian sampling. To facilitate understanding, we will start from the reduced circuit model without SOM neurons ( Sec. 4 - 5 ) and then add the SOM back ( Sec. 6 ). 4 A reduced circuit with E and PV neurons : NG L angevin sampling To facilitate understanding, we first present how the circuit realizes the naive Langevin sampling ( Sale & Zhang, 2024 ), then conduct further analyses to reveal its mechanism of NG Langevin samplings. 4.1 N aive L angevin sampling in the reduced circuit To analyze the circuit Langevin sampling, we convert the bump position z E dynamics ( Eq. 3a ) into a Langevin sampling form by expressing its drift term as the log-likelihood gradient, where the feedforward input strength U EF ( Eq. 3a ) is proportional to the likelihood precision Λ, i.e., U EF ∝ w EF R F ∝ w EF Λ ( Eq. 8 ). Notably, the drift and diffusion terms in Eq. (10) share the same factor τ U E , a necessary condition for Langevin sampling ( Eq. 5 ). Then we investigate the how the circuit realizes Langevin sampling by comparing Eqs. (10 and 5 ), and study its sampling structure. Uniform (uninformative) circuit prior It is because the drift term in Eq. (10) is the stimulus likelihood ℒ ( z ) gradient, due to the translation-invariant recurrent weights ( Eq. 1d ). This result is consistent with the previous study ( Zhang et al. (2023) ; Sale & Zhang (2024) ; see Discussion). The circuit sampling only constrains feedforward weight w EF It requires the ratio in Eq. (10) which only constrains the feedforward weight as , irrelevant with other circuit weights like w EE and w EP as long as the circuit dynamics is stable. This suggests the robust circuit sampling and no fine-tuning of circuit parameters is needed. Flexible sampling the whole likelihood family Once the feedforward weight is set at , the circuit with fixed weights flexibly sampling likelihoods with different means and uncertainties, because in Eq. (10) the λ z and σ z are invariant with circuit activities, and τ U E is a free parameter without changing the equilibrium sampling distribution. And then the bump position z E dynamics will automatically sample the corresponding likelihood that is parametrically represented by the instantaneous feedforward input r F ( Eq. 8 ). This is also confirmed by our simulation ( Fig. 2B ). 4.2 N atural gradient L angevin sampling in the reduced circuit The NG Langevin sampling requires the sampling time constant increases with the FI G ( z ) ( Eq. 5 ). Meanwhile, the time constant of the circuit’s bump position z E dynamics is proportional to the bump height U E ( Eq. 3b and 10 ) and is confirmed by circuit simulation ( Fig. 2D ). Thus we analyze the relation between U E and the FI. For simplicity, we first focus on the equilibrium mean of U E (averaging Eq. 3b ), and the identified NGLS parameters in the circuit are shown in Fig. 4E . E bump height U E encodes Fisher information The feedforward input height U EF is proportional to the likelihood FI G ( z ) ( Eq. 9 , uniform prior), making the mean bump height Ū E increase with G ( z ). This is also confirmed by the circuit simulation ( Fig. 2C ). Consequently, the bump height Ū E effectively represents the stimulus FI and in turn scales the time constant of the circuit sampling z E dynamics ( Eq. 10 , Fig. 2D ), enabling the NGLS in the circuit. The recurrent E input (weight) acts as a regularizer Comparing Eqs. (11 and 5 ), the recurrent input strength U EE acts as a role of the regularization coefficient α , improving the numerical stability in inverting the FI when it is small or ill-conditioned ( Hwang, 2024 ; Marceau-Caron & Ollivier, 2017 ; Wu et al., 2024 ). Without recurrent E weight ( U EE = 0 via setting w EE = 0), the circuit sampling behaves similarly with the NGLS ( Fig. 2E ). Including recurrent weights enlarges the sampling time constant, slowing down the sampling as suggested by our theory ( Fig. 2E ). Nevertheless, with extremely small FI, the circuits with higher recurrent weights have faster convergence ( Fig. 2F , leftmost part), because the recurrent E input stabilizes the inversion of very small FI. Moreover, NGLS is characterized by the invariant temporal correlation of samples with the posterior uncertainties (controlled by input intensity R F ), which is also confirmed in the circuit simulation ( Fig. 2G ). The flexible scaling with various posterior FI The canonical circuit model with fixed weights flexibly scales its sampling time constant (determined by Ū E , Eq. 3a ) with various posteriors FI (controlled by the feedforward input rate R F ), all of which is automatically completed by the recurrent dynamics without the need of changing circuit parameters. For example, increasing R F increases the bump height U E ( Eq. 3b ) that leads to a larger sampling time constant, and meanwhile it changes the equilibrium sampling distribution of the circuit ( Eqs. 8 and 3a ). 4.3 N on-equilibrium circuit dynamics accelerates natural gradient sampling Our analysis so far concentrates on the equilibrium mean Ū E ( Eq. 11 ). We now extend to the non-equilibrium dynamics ( Eq. 3b ), particularly during the transient response immediately following the onset of a stimulus. After receiving a r F , the U E will gradually grow up until the equilibirum state ( Fig. 2I-J ). And meanwhile, the sampling step size will gradually decreases in that a larger U E leads to larger sampling constant and therefore smaller step size. This is similar to the annealing in stochastic computation. The larger sampling step size during the non-equilibrium implies the circuit can sample faster than the equilibrium state, confirmed by simulation ( Fig. 2K ). Furthermore, U E temporal trajectory ( Fig. 2J ) describes circuit’s annealing strategy , governed by Eq. 3b . This intrinsic annealing schedule is an emergent property of the circuit’s nonlinear dynamics. 5 C oupled circuits : NGLS of multivariate posteriors The brain needs to sample multivariate stimulus posteriors, which can be implemented by a decentralized system consisting of multiple coupled canonical circuit modules ( Fig. 3A , Zhang et al. (2016 ; 2023 ); Raju & Pitkow (2016) ). Each circuit module m receives a feedforward input stochastically evoked by a 1D stimulus z m ( Fig. 3 ), and the cross-talk between circuits enables them to read out the circuit prior, and eventually each circuit m samples the corresponding stimulus z m distributedly. As a proof of concept, we consider the smallest decentralized system of two coupled circuits to sample bivariate posteriors ( Fig. 3A ). The sampling of higher dim. posteriors can be extended by inserting more circuit modules, with the number of circuit modules determined by the stimulus dimension. Download figure Open in new tab Figure 3. Coupled canonical circuits sample multivariate posteriors, with each circuit m sampling the corresponding marginal posterior of z m . (A) The structure of decentralized circuit with each consisting of E and PV neurons (the same as Fig. 2B ). (B-C) The spatiotemporal E neuronal responses in two circuits (B) and the decoded stimulus samples (C). (D) When concatenating the samples from two circuits together, we recover the bivariate sampling distribution. Vector field: the drift term of the sampling dynamics in the circuit. (E) The vector field of natural gradient sampling with full FIM and diagonal FIM approximation. (F) The convergence speed in the decentralized circuit. The diagonal FIM approximation is scalable in high dimensions, while paying the cost of slower sampling speed. We investigate how the coupled circuits implement bivariate posteriors’ NGLS, and what kind of approximation, if there is any, is used in the circuit. The theoretical analysis of the two coupled circuits is similar to a single circuit module, but we perform the analysis on each circuit module individually, yielding the new position and height dynamics (details at Sec. D), z E = ( z 1 , z 2 ) ⊤ is two circuits’ E bump positions. Similarly for µ z and R F (feedfwd. input position and intensity respectively), and U E and R E (bump height of synaptic input and firing rate respectively). D U = diag( U E ) is a diagonal matrix of U E . The ° denots the element-wise product. The associative bivariate stimulus prior The z E dynamics ( Eq. 12a ) has a new term, i.e., − Lz E , which can be linked to the internal stimulus prior (omitting the subscript EE of U for clarity below). Hence the coupling matrix L is the prior’s precision matrix (see Sec. D.2). To ease of understanding, we expand the bivariate prior as p ( z 1 , z 2 ) ∝exp [−Λ 12 ( z 1 − z 2 ) 2 / 2], with Λ 12 ∝ ( U 12 + U 12 ) / 2. The coupled circuit stores an associative (correlational) stimulus prior with each marginal uniform, consistent with previous studies ( Sale & Zhang, 2024 ; Zhang et al., 2023 ). The identified correlational prior is confirmed by numerical simulation, where the actual sampling distribution of the circuit dynamics matches the subjective posterior predicted via the identified prior ( Fig. A2 ). Diagonal Fisher information approximation Given the identified circuit’s prior, we calculate the Fisher information matrix (FIM) G ( z ) and compare it with the actual sampling time constants, The time constant matrix D U in the circuit dynamics is proportional to the diagonal elements of the FIM, i.e., D U ∝ diag[ G ( z )], suggesting the circuits utilize the diagonal approximation of the FIM to scale the Langevin sampling step size. The diagonal FIM approximation is widely used in ML, as a trade-off of computational efficiency and accuracy ( Amari, 1998 ; Amari & Douglas, 1998 ; Amari, 2016 ; Wu et al., 2024 ), where the full FIM is hard to estimate. To illustrate the effect of diagonal FIM approximation on sampling dynamics, we plot the vector field of NGLS with full FIM and diagonal FIM respectively ( Fig. 3D ). All vector fields under full FIM directly point to the posterior mean, while the ones under diagonal FIM are curled along the long axis of the posterior. Meanwhile, the curled vector fields also exist in coupled circuits’ sampling dynamics, confirming diagonal FIM approximation in the circuit ( Fig. 3C ). The diagonal FIM simplifies the computation, while paying the cost of sampling speed ( Fig. 3E ). 6 T he circuit with SOM neurons : NG H amiltonian sampling We investigate the Bayeisan sampling in the augmented circuit model with SOM neurons providing structured inhibition to E neurons ( Eq. 1a ). The SOM’s structured inhibition can add the Hamiltonian sampling component in the circuit ( Sale & Zhang, 2024 ). We further analyze whether the augmented circuit with SOM neurons implements the natural gradient Hamiltonian sampling (NGHS). For simplicity, we consider a augmented circuit model to sample a univariate stimulus posterior. Similarly, we derive the eigenvectors of the SOM’s dynamics and then project the dynamics on dominant eigenvectors (see details in Sec. E). The position dynamics of the E and SOM neurons are, To understand the circuit’s sampling dynamics, the z E dynamics ( Eq. 15a ) is decomposed into the drift terms from Langevin and Hamiltonian parts with α H + α L = 1, and the momentum p is defined as a mixture of z E and z S . Transforming the ( z E , z S ) dynamics ( Eqs. 15a - 15b , Fig. 1H ) into the ( z E , p ) dynamics ( Fig. 4B ) shows a mixture of Langevin and Hamiltonian sampling in the circuit, Download figure Open in new tab Figure 4. Natural gradient Hamiltonian sampling in the augmented circuit with E, PV, and SOM neurons. (A) The circuit structure. (B) The decoded trajectory of stimulus sample z E and momentum p exhibits an oscillation pattern, which is a characteristic of Hamiltonian sampling. The momentum p is a weighted average of sample z E and z S as shown in Fig. 1H . (C) The circuit with fixed weights can sample posteriors with different uncertainties. (D) The momentum precision decreases with the Fisher information controlled by feedforward input strength, satisfying the requirement of natural gradient Hamiltonian sampling. (E) A table summarizing the circuit’ sampling parameters. where β p , β E and σ p are functions of the coefficients in Eq. (15a) (details at Eq. E16 ). And the momentum p dynamics has a friction term ( Eq. 16 ), corresponding to a Langevin component. A line manifold in weight space for Hamiltonian sampling It requires the ratio between the drift and diffusion coefficients are the same as the Langevin ( Eq. 5 ) and Hamiltonian sampling ( Eq. 7 ). Specifically, it requires 1) and 2) . Solving the two constraints, we can derive the requirement of circuit weights for Hamiltonian sampling, Q ( α L ) is nonlinear with α L and is invariant with network activities ( Eq. E21 ). U X and R X are the height of the population synaptic input and firing rate of neurons X ( Eq. 2 ). Eq. (17) suggests a line manifold in the circuit’s weight space ( w ES , w EF ) for correct posterior sampling, confirmed by numerical simulation ( Fig. A3 ). Moreover, once circuit weights are set within the line manifold, the circuit with fixed weights can flexibly sample posteriors with various uncertainties ( Fig. 4C ). Natural gradient Hamiltonian sampling Implementing NGHS requires the precision of the momentum p to be inversely proportional to posterior’s FI, G ( z ) ( Eq. 7 ). To verify this, we calculate the momentum distribution π ( p | z ) in the circuit (comparing Eqs. 16 and 7 ), We analyze the momentum precision Λ p in the circuit dynamics. Since β E is a complex, quadratic function of neuronal responses, we then use order analysis to provide insight (details at Sec. E.4). In the circuit dynamics, we calculate β E ~ 𝒪 ( G ( z )), U E ~ 𝒪 ( G ( z )), and G ( z ) = Λ, and then we have Λ p ∝ ( G ( z ) − 1 ), suggesting the momentum precision Λ p decreases with posterior’s FI and satisfys the requirement of NGHS. This is confirmed by simulation results ( Fig. 4D ). 7 C onclusion and D iscussion The present theoretical study for the first time discovers that the canonical circuit dynamics with E and two classes of interneurons (PV and SOM) innately implement natural gradient sampling of stimulus posteriors, deepening our understanding of circuit computations. The circuit samples stimulus posterior in the stimulus manifold that is geometrically regarded as the E neurons’ bump position. And we find the E bump height encodes the FI of the stimulus posterior, and determine the time constant of bump position’s sampling dynamics. We find the non-equilibirum dynamics of the E bump height can further accelerate sampling, and explicitly identify the circuit annealing strategy ( Eq. 3b ). Remarkably, we discover the circuit dynamics also utilizes computational approximations widely used in ML algorithms, including the regularization coefficient for inverting FI ( Eq. 11 ) and the diagonal FI matrix approximation in multivariate cases ( Eq. 12b ), which provides a direct evidence to validate the biological plausibility of artificial ML algorithms. Our work unprecedentedly links the canonical circuit with classes of interneurons to natural gradient sampling and related approximation strategies, providing deep, mechanistic insight into circuit sampling algorithms. Preliminary experimental support of NG sampling Our NG sampling circuit specifically predicts that the magnitude of the E neurons’ responses (the bump height U E , Eq. 3b ), is inversely proportional to the step size of the trajectory in the stimulus feature subspace (bump position z E , Eq. 3a ). This is supported by experiments from hippocampal place cells where the step size of the decoded spatial trajectories (akin to our z E ) was found to be negatively correlated with population firing rate ( Pfeiffer & Foster, 2015 ), providing a necessary condition for validating circuit NG sampling. Comparison with previous studies First , In our best knowledge, only one study investigated the NG sampling in recurrent networks ( Masset et al., 2022 ), while it is difficult to make direct and “fair” comparison since the network models in two studies are different: The previous study considers a spiking network without explicit defining neuron types, while the present study considers a rate-based network with diverse classes of interneurons (PV and SOM) that has rich repertoire of realizing various NG sampling algorithms (Langevin and Hamiltonian). From functional perspective, our circuit with fixed weights can flexibly realize NG sampling for posteriors with different mean and uncertainties, whereas it remains unknown whether this flexibly holds in the previous study. Second , our circuit model builds upon a recent work ( Sale & Zhang, 2024 ) that discovered the conventional Langevin and Hamiltonian sampling in the canonical circuit. Our work takes one step further and finds the same circuit can innately realize NG Langevin and Hamiltonian sampling, which is a fundamentally deeper result after more comprehensive theoretical analysis of the circuit by additionally projecting the circuit dynamics onto the second dominant height mode ( Eq. 3b ). Limitations, generalizations, and future directions First , the proposed circuit model doesn’t include VIP neurons ( Fig. 1 ), which are likely act as a “knob” modulating the SOM gain ( g S , Eq. 1c ) to adjust circuit sampling speed and the momentum (Sec. E.4). Second , Our canonical circuit model, widely used in neuroscience, only stores a uniform (marginal) prior for each stimulus as a result of an ideal case that neurons are uniformly tiling the stimulus manifold and translation-invariant recurrent weights ( Eq. 1d ). This implies the circuit has to break the neuronal homogeneity on the stimulus manifold to store a non-uniform (marginal) prior ( Ganguli & Simoncelli, 2010 ). Third , although our circuit with fixed weights automatically scale its sampling time constant with various posteriors’ FI, for each posterior it uses a globally homogeneous FI because the Gaussian posteriors have homogeneous curvature. In principle, we can change the profile of the recurrent kernel, and then the circuit can sample other posteriors in the exponential family with locally dependent FI. Fourth , we can introduce bump height U E oscillations with larger PV inhibitory weight w EP , and then the circuit has the potential to implement cosine-profile annealing. Fifth , to implement the NG sampling of general distributions, one possibility is our circuit samples baseline Gaussian distributions, and a feedforward decoder network map the base distribution into arbitrary distributions. Preserving the NG sampling in the space of arbitrary distribution probably requires the diffeomorphism of the decoder network. All of these form our future research. 8 R eproducibility S tatement All analytical calculations of the nonlinear circuit dynamics are detailed from Appendix Sec. B - E. Below is a list of the Appendix sections and their associated sections in the main text. Circuit models and theoretical analysis : is presented in Sec. 2 in the main text and the detailed introduction and rationale are presented in Appendix Sec. C 1D NG Langevin sampling : is presented in Sec. 4 in the main text and the detailed calculations are in Appendix Sec. C. Multivariate NG Langevin in coupled circuits : is presented in Sec. 5 in the main text and the detailed calculations are in Appendix Sec. D. 1D NG Hamiltonian sampling : is presented in Sec. 6 in the main text and the detailed calculations are in Appendix Sec. E. Numerical simulation details : is presented in Appendix Sec. F including the parameters for each figure. The complete code of simulation is provided in the supplementary files with detailed usage instructions. A ppendix A A ppendix F igures Download figure Open in new tab Figure A1: (A-B)The tuning curve of an example E neuron in control state compared with enhancing PV neurons (A) and SOM neurons (B). It shows the PV neurons provide divisive inhibition to the E neuron, while the SOM provides subtractive inhibition to the E neuron. (C) The tuning curves of all E neurons in the circuit tile the whole stimulus feature space z. (D-E) The temporally averaged Gaussian profile of the firing rate r E ( θ ) (D) and synaptic input u E ( θ ) (E), supporting the Gaussian ansatz of the attractor states in Eqs. (2) Download figure Open in new tab Figure A2: (A) The joint correlational prior of the stimulus z 1 and z 2 stored in the coupled circuits presented in Fig. 3 . The correlation between two stimuli is determined by width of the diagonal band. (B) The prior precision λ s increases with the coupling weight between two circuits. (C-D) The coupled circuits sample the posterior by using its internal subjective prior. Comparison of the sampling mean (C) and the prior precision (D) stored in the network with theoretical predictions. Each point represents results from a combination of feedforward inputs, connection weights. Download figure Open in new tab Figure A3: The augmented circuit with E, PV and SOM neurons have a line manifold in the parameter space to sample posteriors correctly, suggesting no fine-tuning is needed. The parameter space is spanned by feedforward weight w EF and the inhibitory weight from SOM to E neurons w ES . The red dot shows the network parameters we use for simulation B N atural gradient L angevin sampling B.1 L angevin D ynamics The dynamics of Langevin sampling performs stochastic gradient ascent on the manifold of the log-posterior of stimulus features ( Welling & Teh, 2011 ), which is written as, where ξ t is a multivariate independent Gaussian-white noise, satisfying , with I the identity matrix and d ( t − t ′ ) the Dirac delta function, and τ L is a positive-definite matrix (or a positive scalar in the 1D case) determining the sampling time constant, which is also called the pre-conditioning matrix. Importantly, τ L is a free parameter of the sampling in that it doesn’t change the equilibrium distribution of z t . B.2 N atural gradient sampling via F isher I nformation (matrix) Amari proposed the natural gradient method to utilize the geometry of the distribution to adaptively determine the sampling time constant τ L that controls the sampling step size ( Amari, 1998 ). Intuitively, for a widely spread distribution, we should choose a small time constant (large step size) that can speed up the convergence of the sampling. Vice versa, for a narrowly distributed latent variable, a large time constant (small step size) is favoured to avoid instability of the sampling. Specifically, Amari’s natural gradient method proposed the sampling time constant can be determined by using the Fisher information that is a measure of the local curvature of the distribution. In the framework of information geometry ( Amari, 2016 ), the Fisher information matrix serves as a Riemannian metric on the statistical manifold of z . Consider two neighboring (posterior) distributions π ( z ) and π ( z + d ) with an infinitesimal displacement d , a second-order Taylor series approximation reveals the Fisher information as the underlying distance metric. While the Fisher Information is often introduced in the context of the likelihood function in frequentist statistics, its definition can be generalized. For any probability distribution, its Fisher Information matrix measures the expected curvature of its logarithm. For the posterior π ( z ) ≡ p ( z | r F ) to be sampled, we get the posterior information matrix (or Bayesian Fisher Information)( Amari, 2016 ), It is symmetric and positive semi-definite. Then the posterior information matrix acts as a precondition to set up the time constant of the sampling ( Girolami & Calderhead, 2011 ): Here, the time constant increases with G ( z ), which ensures a smaller step size (larger time constant) when the posterior is more curved (larger Fisher information). This adaptation improves sampling efficiency, as it accounts for anisotropies in the posterior, preventing slow mixing along directions of low curvature. The α is a regularization term that increases the numerical stability when inverting the time constant with a very small Fisher information G ( z ). With more details, the Fisher information is the expected value of the negative Hessian matrix. It represents the curvature of the posterior on the statistical manifold where the latent variable z reside. In many practical applications, a “flat” or “non-informative” prior is used for some or all parameters. The posterior information matrix simplifies to become identical to the likelihood’s Fisher information matrix. If prior is flat, this metric tensor of posterior manifold becomes, B.3 S ampling speed measured by the decaying speed of KL divergence It has been proved that the upper-bound of the KL-divergence between the distribution of sample p t ( z ) = T − 1 ∑ t d ( z − z t ) and the equilibrium distribution p ∞ ( z ) decreases exponentially ( Dong et al., 2022 ), i.e., where p 0 ( z ) denotes the initial distribution at t = 0, and h denotes the smallest real-part of all eigenvalues of the drift matrix. C A single canonical circuit and 1D natural gradient sampling : theory We present the math of theoretical analyses of the reduced recurrent circuit model consisting of E and PV neurons based on continuous attractor network dynamics. C.1 C ontinuous attractor network dynamics To simplify the reading, we copy the network dynamics of E neurons ( Eq. 1a ), and the divisive normalization provided by PV neurons ( Eq. 1b ), and the recurrent connection kernel W EX ( Eq. 1d ) C.2 N etwork’s attractor states We verify the proposed Gaussian ansatz of the attractor states of E neurons ( Eq. 2 ), First, we substitute it into the divisive normalization ( Eq. C2 ), yielding the following expression for the firing rate of E neurons, Then we use the above E firing rate ( Eq. C5 ) to calculate the recurrent input from the neuronal population of type Y to the one with type X in the circuit model, Specifically, based on Eq. (C6) , the recurrent E population input is and the feedforward population input is, It can be checked the proposed Gausian ansatz ( Eq. C4 ) is indeed the sum of the recurrent input ( Eq. C7 ) and the feedforward input ( Eq. C8 ), i.e., and implies This completes the recurrent loop of the dynamics, and verify the validity of the Gaussian ansatz ( Eq. 2 ). C.3 D imensionality reduction by projecting on dominant modes We substitute Eqs. (C4 - C8 ) into the Eq. (C1 ), Previous studies analytically calculated the first two dominant eigenvectors (modes) ( Wu et al., 2016 ; Fung et al., 2010 ), corresponding to the change of the position and height of the Gaussian ansatz respectively, Projecting the dynamics Eq. (C9) into these 2 motion modes ( Eq. C10 ), which means calculate the inner product ∫ f ( θ ) ϕ ( θ | z E ) dθ with f ( θ ) a term in Eq. (C9) , where When the bump position z E is near the input position, i.e., µ − z E ≪ a E , which is usually the case in the circuit model, the exponential term exp[− ( µ − z E ) 2 / 8 a 2 ] is close to one and can be safely ignored, Furthermore, by using the notation we arrive at Eqs. (3a and 3b) in the main text. C.4 T he probabilistic generative model embedded in the circuit model C.4.1 T he stimulus likelihood We study how feedforward input defines the latent stimulus likelihood, i.e., ℒ ( z ) ∝ p ( r F | z ). From the Eq. (1e) , the feedforward input r F is modeled as a set of independent Poisson spike trains, where each neuron’s firing rate is Gaussian-tuned to the stimulus ( Ma et al., 2006 ): where λ F ( θ | z ) is the mean firing rate of the neuron with stimulus preference θ . r F denotes the peak input rate, and a specifies the tuning width. Explicitly writing the Poisson distribution of feedforward input spikes (we discretize the continuous θ into equally spaced θ j ), Taking the logarithm, The const. in the above equation is under the assumption that the sum of population firing rate ∑ j λ j is a constant irrelevant to latent stimulus z , which is true in a homogeneous population with a large number of neurons. Substituting the expression of the Gaussian tuning, where This implies the latent stimulus likelihood for the latent stimulus feature z given an observed feedforward input r F is derived as a Gaussian distribution, which is the Eq. (8) in the main text. Notably, the Gaussian distribution comes from the profile of the Gaussian tuning ( Eq. C14 ) ( Ma et al., 2006 ). C.4.2 U niform stimulus prior in the circuit Comparing the E bump position dynamics ( Eq. C12 ) with the Langevin sampling dynamics ( Eq. B3 ), it immediately suggests that the circuit stores a uniform (uninformative) stimulus prior, i.e., p ( z ) is uniform. This is because the gradient of the log-likelihood (∇ℒ ( z ), Eq. 8 ) has the same form with the drift term in the E position dynamics suggesting the gradient of the prior is zero, i.e., ∇ ln p ( z ) = 0. This uniform prior arises from the circuit’s homogeneous neurons (uniformly distributed in feature space) and its translation-invariant connection profile. Consequently, for the circuit to store a non-uniform prior, it must break this inherent symmetry in its neural organization and connectivity. C.5 C onditions for realizing L angevin sampling in the circuit The circuit sampling of the likelihood means the equilibrium distribution of the bump position ( Eq. C12 ) should match with the likelihood ( Eq. 8 ). We copy the circuit bump position dynamics and the likelihood Langevin sampling dynamics in below for comparison, The σ z is a constant that doesn’t change with neuronal activities. Therefore, the likelihood Langevin sampling in the circuit can be realized by setting the feedforward weight w EF appropriately to make the ratio of the drift and diffusion coefficients the same as the Langevin sampling dynamics. The optimal feedforward weight can be found as (by using Eq. C18 ) Furthermore, the time constant of the z E dynamics is which is proportional to the E bump height U E . Finally, the equation of bump position ( Eq. C12 ) can be converted into the same form with a standard Langevin sampling, C.6 N atural gradient sampling in the circuit The natural gradient Langevin sampling utilizes the Fisher information to determine the sampling time constant ( Eq. B3 ). We verify whether this can be realized in the circuit dynamics. Firstly, the Fisher information of the likelihood is ( Eqs. B4 and C17 ), Meanwhile, the time constant of the circuit sampling dynamics τ z is proportional to the bump height U E ( Eq. C20) . From the Eq. (C13) , the equilibrium mean of the bump height can be calculated as And therefore the circuit’s sampling time constant is It clearly shows the bump height Ū E increases with the Fisher information G ( z ). Moreover, the recurrent E input U EE acts as the regularization term to increase the numerical stability of inverting the Fisher information (similar to the role of α in Eq. B3 ). This proves the reduced circuit with E and PV neurons indeed implements natural gradient Langevin sampling from the likelihood. D C oupled neural circuits and multivariate posterior sampling : theory D.1 T heoretical analysis of the coupled circuit dynamics We present the math about coupled canonical neural circuits implementing multivariate stimulus posterior inference via natural gradient Langevin sampling ( Zhang et al., 2016 ; 2023 ; Raju & Pitkow, 2016 ). The model we consider is composed of M reciprocally connected coupled circuit, with each the same as a single canonical circuit in Sec. C. Each circuit m receives a feedforward input independently generated from the corresponding latent stimulus s m ( Fig. 3 ), and eventually draw the stimulus z m from the multivariate posterior. Therefore, the number of coupled circuits in the model is determined by the dimension of the multivariate posteriors. The dynamics of the coupled circuits is written as (we raise the subscript of capital latter denoting neuron and input types to the superscript, and the new subscripts of lowercase letters denote the E population indices), Each circuit receives a feedforward input that is independently generated from a latent stimulus s m via the same way in the single circuit ( Fig. 3 , Eq. C14 ), For simplicity, we consider the feedforward connection weight of each circuit is the same. Similar to the one-dimensional case ( Eq. C4 ), we consider the Gaussian ansatz for the population synaptic input at each circuit m , Performing similar calculations by substituting the Gaussian ansatz of each circuit into the dynamics of the coupled circuits ( Eq. D1 ), Projecting the above dynamics onto the two eigenfunctions (C10), and assume the differences between the bump positions of different circuits are small enough compared with the tuning width a , i.e., | z n − z m | ≪ a , where σ z and σ U are the same as E. (C11). Reorganizing the ab ve equation into the matrix form, where ° denotes the element-wise multiplication, and We obtain the bump position and height dynamics embedded in neural dynamics as presented in Eqs. (12a - 12b) in the main text. D.2 T he generative model of multivariate stimulus stored in the circuit We present the math analysis in identifying the generative model especially the subjective stimulus prior stored in the circuit. Generally, the multivariate stimulus posteriors given received feedforward inputs are, where the second last equality comes from by using the same derivations as the Sec. C.4.1 on each feedforward input . And is the likelihood precision matrix. Note that the stimulus prior p ( z ) is still unspecified at this moment. We will determine it in the following. Subjective prior stored in the coupled circuits Utilizing the Langevin sampling dynamics to sample the posterior Meanwhile, the coupled circuits’ bump position dynamics is Using the definition of U EF ( Eq. D4 ) and the feedforward input intensity with the likelihood precision ( Eq. C18 ), It is straightforward to regard the Lz term as the gradient from the stimulus prior, Specifically, the prior precision matrix is a generalized Laplacian matrix ( Eq. D4 , whose determinant is zero, i.e., | L | = 0, suggesting the marginal prior of each stimulus is uniform, i.e., p ( z m ) is uniform. As an example, for M = 2, the prior p ( z = z 1 , z 2 ) ⊤ is written as, where L 12 characterizes the correlation between z 1 and z 2 . It can be checked each marginal stimulus prior is uniform. Subjective multivariate stimulus posterior in the circuit Based on the identified stimulus prior stored in the circuit ( Eq. D5 ), the (subjective) stimulus posterior is calculated as where D.3 N atural gradient sampling via diagonal approximation of F isher information matrix Eq. (D3a) suggests the time constant of the circuit’s sampling dynamics (bump position) is determined by the matrix D U . We next analyze its relation with the Fisher information to verify whether the circuit implement natural gradient sampling for multivariate posteriors. Based on the (subjective) multivariate posterior calculated by the circuits ( Eq. D7 ), the Fisher information matrix of the multivariate stimulus is, In particular, by using the definition of the prior precision matrix ( Eq. D5 ) and the posterior precision ( Eq. D7 ), which clearly shows the circuit’s sampling time constant D U is the diagonal matrix of the full Fisher information matrix, giving rise to the Eq. (14) in the main text. E N atural gradient H amiltonian sampling in the circuit with SOM neurons E.1 C ircuit dynamics We also copy the dynamics of a single augmented circuit with SOM neurons ( Eq. 1a and Eq. 1c ) below. Similar to the Gaussian ansatz presented in Eqs. (C4 - C8 ), we also propose the same Gaussian ansatz for the synaptic inputs of E and SOM neurons respectively. Specifically, since SOM neurons have different activation function with the E neurons, the population firing rate of SOM neurons is, Substituting the Gaussian ansatz of E and SOM neurons into the circuit dynamics ( Eqs. E1 ), Since the above equations are summations of Gaussian functions, it can be checked that when the positions of Gaussian functions are the same, i.e., z E = z S = µ z , the sum of two Gaussian functions will also be a Gaussian function. In addition, to validate the Gaussian ansatz, we need the width fulfilling the following constrain of the connection width, Similar to the two motion modes for E neuron, the SOM also have two motion nodes ( Sale & Zhang, 2024 ), We project the dynamics of u E and u S onto their respective position modes ( Eq. C10 and Eq. E4 respectively). From here, we assume the difference between neuronal populations’ positions is small enough compared to the connection width a , i.e., | z E − z S | and | µ z − z E |≪ 4 a X . In this case, the projected circuit dynamics can be simplified by ignoring exponential terms in Eq. (E3) , Similarly, we project the E and SOM’s dynamics on their respective height modes, Similarly, to simplify notations, we define and σ z and σ U are the same as Eq. (C11) . The Eq. (E5) is simplified into, Reorganizing the bump position dynamcis into the matrix form, where E.2 H amiltonian sampling in the circuit In the present study, we consider a Hamiltonian sampling with friction, because it can be mapped to the proposed circuit with a diversity of interneurons. Hamiltonian sampling can sample the desired distribution π ( z ) (with π ( z ) as the equilibrium distribution), which is defined as, The previous study suggested the z E dynamics is a mixture of the Langevin sampling and the Hamiltonian sampling ( Sale & Zhang, 2024 ), and thus inspires us to decompose it into two parts, where α L ∈ [0, 1] denotes the proportion of Langevin sampling component. In this way, we can define the transformation matrix and rewrite, We are interested in z H dynamics, and investigate how the circuit parameters can be set to fulfill the Hamiltonian sampling. Without loss of generality, we consider a case of µ z = 0 that simplify the derivation of the z H dynamics, which will make M 1 = M 2 = 0. And then, Then we can derive the dynamics of z H , where and Standard form of the Hamiltonian sampling dynamics We further convert the Eq. (E14) into the standard form of Hamiltonian sampling dynamics ( Eq. 7 ), which corresponds to multiply the z E with the posterior precision Λ and then compensate the Λ − 1 into the preceding matrix, The second equality comes from we have the freedom of determining the momentum p ’s precision, and then we could choose a momentum precision to make sure the first matri on the RHS is anti-symmetric. Eventually, by using We can convert the ( z E , p ) dynamics into the standard form of Hamiltonian sampling dynamics as shown in the main text ( Eq. 16 ), i.e., E.3 C onditions for realizing H amiltonian sampling in the circuit Realizing Hamiltonian sampling in the circuit requires we set the ratio between drift and diffusion terms appropriately in Eq. (E17) . Solving Eq. (E18a) , Solving Eq. (E18b) by substituting Eq. (E16) To simplify notations, we define two intermediate variables about common factors in the above equation And utilizing the Eq. (E18a) , it simplifies the equation into Reorganizing the above equation into a quadratic equation of h E , Then the root of the h E is Combining the expression of h E in Eq. (E20) , Then substituting the detailed expression of U EF , U SE , τ E , and τ S into the above equation, we have which is the Eq. (17) in the main text. E.4 N atural gradient H amiltonian: determining the momentum precision in the circuit Eq. (E17) suggests the momentum precision in the circuit dynamics is which should be proportional to the inverse of the Fisher inforamtion of the stimulus, G ( z ) ( Eq. 7 ). We next verify whether this can be satisfied in the circuit dynamics. Substituting the expression of β E in Eq. (E16) into the above equation and using the simplified notation h E ( Eq. E20 ), we have Utilizing the relation between h E and h S in Eq. (E21) , Here the first term of α L about the proportion of Langevin sampling can be treated as a constant, and the λ z is also a constant that doesn’t change with the network activity. Substituting the detailed expression of τ E and h S ( Eq. E20 ) where the last equality comes from U S = U SE in the equilibrum state ( Eq. (E7) ) Furthermore, from the bump height dynamics in the augmented circuit with SOM ( Eq. E6 ), and using similar analysis in Eq. (C22) which clearly shows the U E in the augmented circuit increases with the Fisher information of the stimulus G ( z ). Since the momentum precision Λ p is inversely proportional to U E , it decreases with the stimulus Fisher information G ( z ), which is consistent with the natural gradient Hamiltonian sampling ( Eq. 7 ). F C ircuit simulation parameters and details F.1 C ritical weight To scale the connection strengths in our network model, we use a critical recurrent connection strength as a reference point. This critical strength is defined as the smallest value that allows the network to maintain persistent activity even when there is no feedforward input. In the absence of feedforward input, the stationary state of circuit’s bump height satisfies ( Eqs. E6 - E7 ), Furthermore, the firing rate of the E population, R E , is related to its input U E by the activation function defined in Eq. (C5) . Substituting this expression for R E into Eq. (F1) allows us to write an equation solely in terms of U E : Assuming U E ≠ 0 (for persistent activity), we can divide by U E and rearrange the equation into a quadratic form for U E : Let . This quadratic equation for U E has real solutions if and only if its discriminant is non-negative . The smallest value of w c that permits non-zero persistent activity occurs when the discriminant is zero, i.e., The network parameters used in our simulations are provided in Table 1 . This includes parameters like the number of neurons ( N E = 180, N S = 180) distributed over a feature space of width w z = 360°, leading to a neuronal density ρ = N/w z . So the critical weight value is calculated as: View this table: View inline View popup Download powerpoint Table 1: PARAMETERS FOR HAMILTONIAN SAMPLING View this table: View inline View popup Download powerpoint Table 2: Parameters for network View this table: View inline View popup Download powerpoint Table 3: PARAMETERS FOR HAMILTONIAN SAMPLING The intensity of the feedforward input is then scaled relative to U c , which is the peak synaptic input to the E population that is self-sustained by the E recurrent connections at their critical strength w c , in the absence of feedforward input and SOM inhibition. U c is given by: F.2 P arameters for network simulation For the reduced network with only PV and excitatory neuron, the network parameters is set as following. This parameter set applies for a single circuit sampling a 1D stimulus posterior, and coupled circuits sampling multivariate stimulus posteriors. For 1D and 2D, the parameters are the same aside there are not couping weight for 1d case. For the equilibrium state analysis depicted in Figure 2 , the network is first initialized using an input intensity identical to that of subsequent simulation phases, in order to remove the influence of non-equilibrium bump height. During this initialization, the input position varies across trials, drawn from a Gaussian distribution with mean µ 0 and variance V 0 . After allowing the network’s bump height to reach equilibrium post-initialization, the input position is then set to match the mean of the network’s activity bump. The simulation proceeds for a duration of 50 τ , using an integration time step of 0.01 time units. The first 20 time steps of this period are discarded to avoid transient effects. Following this, the input position is fixed at 0, and the network is simulated for an additional 450 τ with the same integration step. Throughout the latter 450 τ simulation, the bump position is recorded to calculate the KL divergence between the network’s evolving state and a target posterior distribution. The network state at the end of the initialization phase serves as the reference for the initial KL divergence value. For comparison, a separate Langevin sampling process is performed. This sampling is initialized using the network’s bump position from the end of its initialization phase. The Langevin sampling then runs for a duration of 450 τ , also using an integration time step of 0.01 time units. For the non-equilibrium state depicted in Figure 2 , the network is initially prepared by applying a substantially smaller input signal, denoted as scale ini . This input is administered uniformly to all neurons for a duration of 20 τ to initialize the network. After this initialization phase, the input to each neuron is then adjusted to its designated operational value. For the natural gradient (NG) sampling procedure, the starting position is set to the ‘bump’ location observed at the final step of the Continuous Attractor Neural Network (CANN) model’s initialization. For the different recurrent weight, we fix input intensity R F = 3. We get time constant by getting the cross-corelation of bump postion simulated from the network and fit the exponential function to get the time constant. Hamiltonian sampling parameters mostly mirror the previous set, but differ by including connection parameters that define interactions between SOM and excitatory neurons. For 500 trials and simulation 500 τ , it takes 2 hours on the 512GB cpu hpc. F.2.1 N umerical estimate of the stimulus prior in coupled circuits We numerically estimate the subjective bivariate stimulus prior stored in the coupled circuits. Given a combination of circuit parameters, we ran a large ensemble of stochastic network simulations. From the spatio-temporal firing rate patterns in each circuit r m , we decoded instantaneous population vectors z m in each time bin in each trial. Then we concatenate the z m from two circuits together, z = ( z 1 , z 2 ), and estimate its mean µ z and covariance σ z , which are used to parameterize the Gaussian sampling distribution, i.e., p ( z ) = N ( µ z , σ z ). And then we search the prior precision matrix L under which the posterior is closet to the sampling distribution p ( z ), where the posterior π ( z ) is calculated based on the parameter L to be estimated, and the likelihood mean µ and precision Λ are directly estimated from the received feedforward inputs ( Eq. C18 ), F.2.2 T he vector field (drift term) of circuits’ sampling dynamics For both the diagonal-Fisher natural-gradient Langevin sampler and the full-Fisher method, we can directly compute the gradient at each point in parameter space, evaluate the Fisher information (either the full matrix or just its diagonal), and then derive the corresponding vector field from this information. In the case of our CANN (Continuous Attractor Neural Network) model, constructing the equilibrium vector field requires a slightly different approach. The goal is to observe how the position of the bump (i.e., the localized peak of neural activity) shifts in response to changes in the input. To do this, we first stabilize the bump at a reference location. Specifically, we apply a fixed external input centered at ( x 0 , y 0 ) and run the CANN dynamics until the bump height reaches equilibrium. In our experiments, this equilibration phase lasted for 20 time constants (20 τ ). Once the bump has stabilized, we perturb the input by shifting it to a new position ( x 1 , y 1 ), and observe how the bump position responds. The resulting displacement of the bump provides the vector at the new point, essentially showing how the internal state of the network changes in response to this small input shift. Analytically, this shift can be expressed as moving the input from ( x 0 , y 0 ) to ( x 2 , y 2 ) = ( x 0 , y 0 ) + Λ − 1 ? ( x 1 − x 0 , y 1 − y 0 ), where Λ − 1 ? captures the relationship between input space and the internal dynamics of the bump. Because our 2D network structure implicitly encodes a prior, shifting the bump corresponds to translating the mean of the posterior distribution. Repetition of this process across a grid of input locations ( x 2 , y 2 ), we can scan the whole bump position grid and then we can systematically map out the equilibrium vector field of the CANN. This field describes how the network’s internal estimate-the bump position-evolves in response to perturbations in the input. F.3 P arameters fitting In our attractor network, the bump position z ( t ) = ( z E ( t ), z S ( t )) ⊤ is determined by the connection between Excitoory and SOM populations. The dynamics are described by equations(E10). By introducing a compact notation and collecting terms into matrix-vector form, we specifically define the state as z = ( z E , z S ) ⊤ and the 2D dynamics as: where which is the drift matrix, M 1 ∈ ℝ 2 is a constant input, and σξ t is noise term. Convert into the form of transition probability: where and are time-rescaled synaptic coefficients. Because the noise enters the network only through the excitatory population. We therefore estimate the four unknown parameters { h SE , h ES , h EF , σ z } in two consecutive steps. From the noiseless second equation, we have z ? S = U SE ( z E − z S ) with the closed-form discrete update z S ( t +? t ) − z S ( t ) = U SE [ z E ( t ) − z S ( t )]? t . Averaging over a trajectory of length T gives an unbiased estimator so no optimization is required. Conditioned on z S , the excitatory coordinate follows a scalar Ornstein–Uhlenbeck process Then we used maximum likelihood estimation (MLE) to estimate the parameters of { h SE , h ES , h EF , σ z } in the above equation. All parameters are regressed on a data segment of 1000 samples, corresponding to 10 τ . The parameters are explicitly reparameterized in terms of their biological interpretation and optimized via stochastic gradient descent (Adam), enabling stable and interpretable system identification. After obtaining the MLE estimate of { h SE , h ES , h EF , σ z } for each data set, we numerically find the transformation T matrix ( Eq. E13 ) by directly estimating the values of U E , U S and α L from network activities. Eventually, we use the estimated T matrix to convert the ( z E , z S ) into ( z E , p ) as described in Eq. (E13) . We evaluated Λ p under 11 values of feedforward input intensity, R F ∈ { 11, 13, …, 23}, and found the decreasing tread of kinetic energy. The decreasing trend confirms the theoretical prediction that a stronger external drive reduces the effective momentum budget required for accurate sampling. All experiments were performed on a compute node equipped with 40 CPU cores and one NVIDIA A100 GPU; the full pipeline completed in about 26 hours. Funder Information Declared UT Southwestern Endowed Scholars program Footnotes This version of the manuscript has been revised to enhance clarity and focus. The primary updates are as follows: Introduction: The introduction has been rewritten to better emphasize the contribution of our work. Background (Sec. 2): The background on the canonical circuit model has been compressed, and the material from the previous Section 4.1 has been merged into this section. Natural Gradient Langevin Sampling (Sec. 4): This section has been reorganized for improved clarity. This new structure better highlights that our proposed circuit can flexibly sample the entire likelihood family with bump height scaling with various posterior Fisher Information and emphasizes that the bump height has a regularizer that scales with the posterior Fisher Information. Natural Gradient Hamiltonian Sampling (Sec. 6): We integrated the Eq. 19a and Eq. 20 in previous version into the current Eq. 15a as the Eq. 19a is already shown in the background (Sec. 2). Conclusion and Discussion: We have expanded this section to provide a more detailed comparison with previous studies and to outline potential generalizations of our work as future research directions. General Updates: We have corrected typographical errors in equations and text throughout the manuscript. A reproducibility statement has been added to guide readers. All supplementary figures and text have been moved to the appendix. R eferences ↵ Hillel Adesnik , William Bruns , Hiroki Taniguchi , Z Josh Huang , and Massimo Scanziani . A neural circuit for spatial summation in visual cortex . Nature , 490 ( 7419 ): 226 – 231 , 2012 . OpenUrl CrossRef PubMed Web of Science ↵ Laurence Aitchison and Máté Lengyel . The hamiltonian brain: efficient probabilistic inference with excitatory-inhibitory neural circuit dynamics . PLoS computational biology , 12 ( 12 ), 2016 . ↵ Shun-Ichi Amari . Natural gradient works efficiently in learning . Neural Computation , 10 ( 2 ): 251 – 276 , 1998 . doi: 10.1162/089976698300017746 . OpenUrl CrossRef Web of Science ↵ Shun-Ichi Amari . Information geometry and its applications , volume 194 . Springer , 2016 . doi: 10.1007/978-4-431-55978-8 . OpenUrl CrossRef ↵ Shun-Ichi Amari and Scott C Douglas . Why natural gradient? In Proceedings of the 1998 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP’98 (Cat. No. 98CH36181) , volume 2 , pp. 1213 – 1216 , 1998 . doi: 10.1109/ICASSP.1998.675489 . OpenUrl CrossRef ↵ Jeffrey M Beck , Wei Ji Ma , Roozbeh Kiani , Tim Hanks , Anne K Churchland , Jamie Roitman , Michael N Shadlen , Peter E Latham , and Alexandre Pouget . Probabilistic population codes for bayesian decision making . Neuron , 60 ( 6 ): 1142 – 1152 , 2008 . OpenUrl CrossRef PubMed Web of Science ↵ Lars Buesing , Johannes Bill , Bernhard Nessler , and Wolfgang Maass . Neural dynamics as sampling: a model for stochastic computation in recurrent networks of spiking neurons . PLoS computational biology , 7 ( 11 ): e1002211 , 2011 . OpenUrl ↵ Luke Campagnola , Stephanie C Seeman , Thomas Chartrand , Lisa Kim , Alex Hoggarth , Clare Gamlin , Shinya Ito , Jessica Trinh , Pasha Davoudian , Cristina Radaelli , et al. Local connectivity and synaptic dynamics in mouse and human neocortex . Science , 375 ( 6585 ): eabj5861 , 2022 . OpenUrl CrossRef PubMed ↵ Tianqi Chen , Emily Fox , and Carlos Guestrin . Stochastic gradient hamiltonian monte carlo . in International conference on machine learning , pp. 1683 – 1691 . PMLR , 2014 . ↵ Xingsi Dong , Zilong Ji , Tianhao Chu , Tiejun Huang , Wenhao Zhang , and Si Wu . Adaptation accelerating sampling-based bayesian inference in attractor neural networks . Advances in Neural Information Processing Systems , 35 : 21534 – 21547 , 2022 . OpenUrl ↵ Rodrigo Echeveste , Laurence Aitchison , Guillaume Hennequin , and Máté Lengyel . Cortical-like dynamics in recurrent circuits optimized for sampling-based probabilistic inference . Nature Neuroscience , 2020 . ISSN 15461726 . doi: 10.1038/s41593-020-0671-1 . OpenUrl CrossRef PubMed ↵ Marc O Ernst and Martin S Banks . Humans integrate visual and haptic information in a statistically optimal fashion . Nature , 415 ( 6870 ): 429 – 433 , 2002 . OpenUrl CrossRef PubMed Web of Science ↵ Gord Fishell and Adam Kepecs . Interneuron types as attractors and controllers . Annual review of neuroscience , 43 : 1 – 30 , 2020 . OpenUrl CrossRef PubMed ↵ C. C Alan Fung , K. Y. Michael Wong , and Si Wu . A moving bump in a continuous manifold: A comprehensive study of the tracking dynamics of continuous attractor neural networks . Neural Computation , 22 ( 3 ): 752 – 792 , 2010 . OpenUrl CrossRef PubMed Web of Science ↵ Deep Ganguli and Eero P Simoncelli . Implicit encoding of prior probabilities in optimal neural populations . Advances in neural information processing systems , 2010 : 658 , 2010 . ↵ Mark Girolami and Ben Calderhead . Riemann manifold langevin and hamiltonian monte carlo methods . Journal of the Royal Statistical Society: Series B (Statistical Methodology) , 73 ( 2 ): 123 – 214 , 2011 . doi: 10.1111/j.1467-9868.2010.00765.x . OpenUrl CrossRef ↵ Ralf M Haefner , Pietro Berkes , and József Fiser . Perceptual decision-making as probabilistic inference by neural sampling . Neuron , 90 ( 3 ): 649 – 660 , 2016 . OpenUrl CrossRef PubMed ↵ Patrik O Hoyer and Aapo Hyvärinen . Interpreting neural response variability as monte carlo sampling of the posterior . in Advances in neural information processing systems , pp. 293 – 300 , 2003 . ↵ Dongseong Hwang . Fadam: Adam is a natural gradient optimizer using diagonal empirical fisher information , 2024 . URL https://arxiv.org/abs/2405.12807 . ↵ David C Knill and Alexandre Pouget . The bayesian brain: the role of uncertainty in neural coding and computation . TRENDS in Neurosciences , 27 ( 12 ): 712 – 719 , 2004 . OpenUrl CrossRef PubMed Web of Science ↵ Konrad P Körding and Daniel M Wolpert . Bayesian integration in sensorimotor learning . Nature , 427 ( 6971 ): 244 – 247 , 2004 . OpenUrl CrossRef PubMed Web of Science ↵ Richard D Lange , Sabyasachi Shivkumar , Ankani Chattoraj , and Ralf M Haefner . Bayesian encoding and decoding as distinct perspectives on neural coding . Nature Neuroscience , 26 ( 12 ): 2063 – 2072 , 2023 . OpenUrl CrossRef PubMed ↵ Wei Ji Ma , Jeffrey M Beck , Peter E Latham , and Alexandre Pouget . Bayesian inference with probabilistic population codes . Nature Neuroscience , 9 ( 11 ): 1432 – 1438 , 2006 . OpenUrl CrossRef PubMed Web of Science ↵ Yi-An Ma , Tianqi Chen , and Emily Fox . A complete recipe for stochastic gradient mcmc . Advances in neural information processing systems , 28 , 2015 . ↵ Gaétan Marceau-Caron and Yann Ollivier . Natural langevin dynamics for neural networks . in International Conference on Geometric Science of Information , pp. 451 – 459 . Springer , 2017 . ↵ Paul Masset , Jacob Zavatone-Veth , J Patrick Connor , Venkatesh Murthy , and Cengiz Pehlevan . Natural gradient enables fast sampling in spiking neural networks . Advances in neural information processing systems , 35 : 22018 – 22034 , 2022 . OpenUrl PubMed ↵ Cristopher M Niell and Massimo Scanziani . How cortical circuits implement cortical computations: mouse visual cortex as a model . Annual Review of Neuroscience , 44 : 517 – 546 , 2021 . OpenUrl CrossRef PubMed ↵ Gergő Orbán , Pietro Berkes , József Fiser , and Máté Lengyel . Neural variability and sampling-based probabilistic representations in the visual cortex . Neuron , 92 ( 2 ): 530 – 543 , 2016 . OpenUrl CrossRef PubMed ↵ Brad E Pfeiffer and David J Foster . PLACE CELLS. autoassociative dynamics in the generation of sequences of hippocampal place cells . Science , 349 ( 6244 ): 180 – 183 , July 2015 . OpenUrl Abstract / FREE Full Text ↵ Rajkumar Vasudeva Raju and Zachary Pitkow . Inference by reparameterization in neural population codes . in Advances in Neural Information Processing Systems , pp. 2029 – 2037 , 2016 . ↵ Eryn Sale and Wenhao Zhang . The bayesian sampling in a canonical recurrent circuit with a diversity of inhibitory interneurons . in The Thirty-eighth Annual Conference on Neural Information Processing Systems , 2024 . ↵ Yu Terada and Taro Toyoizumi . Chaotic neural dynamics facilitate probabilistic computations through sampling . Proceedings of the National Academy of Sciences , 121 ( 18 ): e2312992121 , 2024 . OpenUrl CrossRef PubMed ↵ Max Welling and Yee W Teh . Bayesian learning via stochastic gradient langevin dynamics . in Proceedings of the 28th international conference on machine learning (ICML-11) , pp. 681 – 688 , 2011 . ↵ Si Wu , KY Michael Wong , CC Alan Fung , Yuanyuan Mi , and Wenhao Zhang . Continuous attractor neural networks: candidate of a canonical model for neural information representation . F1000Research , 5 , 2016 . ↵ Xiaodong Wu , Wenyi Yu , Chao Zhang , and Phil Woodland . An improved empirical fisher approximation for natural gradient descent . Advances in Neural Information Processing Systems , 37 : 134151 – 134194 , 2024 . OpenUrl ↵ Alan Yuille and Daniel Kersten . Vision as bayesian inference: analysis by synthesis? Trends in cognitive sciences , 10 ( 7 ): 301 – 308 , 2006 . OpenUrl CrossRef PubMed Web of Science ↵ Wen-Hao Zhang and Si Wu . Neural information processing with feedback modulations . Neural Computation , 24 ( 7 ): 1695 – 1721 , 2012 . OpenUrl CrossRef PubMed ↵ Wen-Hao Zhang , Aihua Chen , Malte J Rasch , and Si Wu . Decentralized multisensory information integration in neural systems . The Journal of Neuroscience , 36 ( 2 ): 532 – 547 , 2016 . OpenUrl Abstract / FREE Full Text ↵ Wen-Hao Zhang , Si Wu , Krešimir Josić , and Brent Doiron . Sampling-based bayesian inference in recurrent circuits of stochastic spiking neurons . Nature Communications , 14 ( 1 ): 7074 , 2023 . OpenUrl PubMed View the discussion thread. Back to top Previous Next Posted September 30, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Natural gradient Bayesian sampling automatically emerges in canonical cortical circuits Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Natural gradient Bayesian sampling automatically emerges in canonical cortical circuits Zimei Chen , Yi Ren , Eryn Sale , Ying Nian Wu , Wen-Hao Zhang bioRxiv 2025.07.03.663069; doi: https://doi.org/10.1101/2025.07.03.663069 Share This Article: Copy Citation Tools Natural gradient Bayesian sampling automatically emerges in canonical cortical circuits Zimei Chen , Yi Ren , Eryn Sale , Ying Nian Wu , Wen-Hao Zhang bioRxiv 2025.07.03.663069; doi: https://doi.org/10.1101/2025.07.03.663069 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neuroscience Subject Areas All Articles Animal Behavior and Cognition (7629) Biochemistry (17660) Bioengineering (13881) Bioinformatics (41910) Biophysics (21436) Cancer Biology (18576) Cell Biology (25480) Clinical Trials (138) Developmental Biology (13368) Ecology (19887) Epidemiology (2067) Evolutionary Biology (24302) Genetics (15598) Genomics (22482) Immunology (17726) Microbiology (40360) Molecular Biology (17163) Neuroscience (88534) Paleontology (666) Pathology (2830) Pharmacology and Toxicology (4821) Physiology (7637) Plant Biology (15129) Scientific Communication and Education (2045) Synthetic Biology (4290) Systems Biology (9817) Zoology (2269)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00