Gradual proactive regulation of body state by reinforcement learning of homeostasis

doi:10.1101/2025.10.17.682979

Gradual proactive regulation of body state by reinforcement learning of homeostasis

2025 · doi:10.1101/2025.10.17.682979

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 56,409 characters · extracted from preprint-html · click to expand

Gradual proactive regulation of body state by reinforcement learning of homeostasis | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Gradual proactive regulation of body state by reinforcement learning of homeostasis View ORCID Profile Mana Fujiwara , View ORCID Profile Honda Naoki doi: https://doi.org/10.1101/2025.10.17.682979 Mana Fujiwara 1 Laboratory of Theoretical Biology, Graduate School of Biostudies, Kyoto University , Kyoto, Kyoto, Japan 2 Laboratory of Data-driven Biology, Nagoya University Graduate School of Medicine , Nagoya, Japan Find this author on Google Scholar Find this author on PubMed 
Search for this author on this site ORCID record for Mana Fujiwara For correspondence: fujiwara.mana.f87{at}kyoto-u.jp honda.naoki.t1{at}f.mail.nagoya-u.ac.jp Honda Naoki 1 Laboratory of Theoretical Biology, Graduate School of Biostudies, Kyoto University , Kyoto, Kyoto, Japan 2 Laboratory of Data-driven Biology, Nagoya University Graduate School of Medicine , Nagoya, Japan 3 Laboratory for Data-driven Biology, Graduate School of Integrated Sciences for Life, Hiroshima University , Higashihiroshima, Hiroshima, Japan 4 Center for One Medicine Innovative Translational Research (COMIT), Nagoya University , Nagoya, Aichi, Japan Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Honda Naoki For correspondence: fujiwara.mana.f87{at}kyoto-u.jp honda.naoki.t1{at}f.mail.nagoya-u.ac.jp Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Living systems maintain physiological variables such as temperature, blood pressure, and glucose within narrow ranges—a process known as homeostasis. Homeostasis involves not only reactive feedback but also anticipatory adjustments shaped by experience. Prior homeostatic reinforcement learning (HRL) models have provided a computational account of anticipatory regulation under homeostatic challenges. However, existing formulations lack mechanisms for gradual, trial-by-trial adjustment and for extinction learning. To address this issue, we developed a continuous HRL framework that enables trial-wise tuning of anticipatory regulation. The model incorporates biologically informed components: asymmetric reinforcement, weighting negative outcomes more than positive outcomes; and a dual-unit, context-gated inhibitory mechanism. We applied the framework to thermoregulatory conditioning with ethanol-induced hypothermia and successfully reproduced cue-triggered compensation, gradual tolerance, and rapid reacquisition after extinction. 
We then extended the framework to multiple physiological variables influenced by shared neural or hormonal control signals, and found that when regulatory priorities across variables were uneven, a deviation in one variable propagated through shared control to others, yielding a cascading, system-wide failure to return to the ideal state (non-recovery)—a pattern reminiscent of autonomic dysregulation (e.g., dysautonomia, ME/CFS). Overall, our framework provides a computational basis for advancing a systems-level understanding of multi-organ homeostatic dysregulation in vivo. Introduction Living organisms maintain internal variables such as body temperature, blood pressure, glucose levels, and hormone concentrations within narrow ranges that are optimal for survival and proper functioning—a process known as homeostasis . To maintain homeostasis in dynamically changing environments, organisms rely not only on reactive mechanisms that respond to disturbances, but also on proactive control that anticipates and prepares for upcoming changes. Such proactive regulation is acquired through experience and learning. A familiar example of this process is the development of Pavlovian-conditioned drug tolerance ( Farahbakhsh and Siciliano, 2023 ; Hou et al., 2023 ; Le et al., 1979 ; Siegel, 2001 ). Through repeated administrations, the body learns to adjust and attenuate the drug’s effect, eventually requiring increased dosage to achieve the same therapeutic outcome. However, upon moving to another context, which has never been associated with the drug, the same dosage may cause excessive effects, because the learned tolerance was not expressed in the new context. Another illustrative example in laboratory situations is thermoregulatory classical conditioning ( Mansfield et al., 1983 ; Mansfield and Cunningham, 1980 ). 
In this paradigm, a neutral cue (CS) is paired with an ethanol injection (unconditioned stimulus, US) that induces hypothermia, and over repeated pairings, animals begin to generate a compensatory response to the CS by increasing body temperature in advance to counteract the expected drop, eventually leading to full tolerance. These examples illustrate that homeostatic responses can be modulated through learning in a cue-dependent and anticipatory manner. Despite its biological importance, the computational mechanisms underlying such proactive, experience-driven regulation remain largely unknown. To address such cue-dependent and anticipatory homeostatic regulation, Keramati and Gutkin proposed a computational decision-making model of homeostasis grounded in the reinforcement learning (RL) framework ( Keramati and Gutkin, 2014 ). In their homeostatic reinforcement learning ( HRL ) model, the agent learns to select actions that reduce the deviation from the optimal internal state, which are considered rewarding and reinforced, while actions that increase the deviation are punishing and thus avoided. This model has been applied to various physiological domains, including water balance, thermoregulation, and pathological conditions such as cocaine addiction ( Keramati et al., 2017 ). Building on this framework, Uchida et al . extended the HRL model to account for salt appetite, modeling how the conditioned stimulus (CS) of salt taste predicts future sodium intake ( Uchida et al., 2022 ). While influential, the original HRL formulations face three key limitations when applied to biological systems. First, they operate with discrete actions, which makes it difficult to capture the gradual, trial-by-trial adjustment of compensatory signals. To enable gradual learning, we combined a neural circuit model with the HRL framework. The compensatory signal strength driven by the CS is updated through a reinforcement learning process to maximize net reward. 
This architecture enables the agent to acquire anticipatory, context-dependent control of internal states and to flexibly modulate learned responses. Second, the previous HRL framework commonly assumes symmetric valuation of positive (reward) and negative (punishment) outcomes. However, behavioral and neural studies, particularly in decision-making tasks such as gambling, have consistently shown loss-dominant asymmetry, in which losses are weighted more heavily than equivalent gains ( Macpherson et al., 2014 ; Sokol-Hessner and Rutledge, 2019 ). We extend this idea to the regulation of internal physiological states. Homeostatic systems are organized to defend safe operating ranges and to prevent critical thresholds from being crossed. Such regulation is robust: for example, body temperature is tightly maintained within 35–38 °C despite external perturbations. From a control-theoretic perspective, however, if a deviation does occur, rapidly restoring the variable risks overshoot and instability. Clinically, recovery from critical illness and multi-organ dysfunction is often protracted and gradual, with persistent organ dysfunction and long-lasting impairments documented in chronic or persistent critical illness ( Iwashyna, 2012 ; Voiriot et al., 2022 ). We therefore interpret this asymmetry in control—strong defense against boundary crossings versus cautious correction once deviation has occurred—as a form of loss-dominant regulation, and we incorporate this principle into our model. Finally, the previous HRL model does not accommodate inhibitory learning , such as memory extinction. In classical conditioning, extinction refers to the gradual reduction in conditioned responses when the CS is no longer followed by the US. 
However, the extinguished response can re-emerge as spontaneous recovery after a delay, reinstatement after unsignaled US re-exposure, and rapid reacquisition when CS–US pairings are resumed ( Myers and Davis, 2002 ; Napier et al., 1992 ; Rescorla and Heth, 1975 ). This phenomenon suggests that extinction does not erase the original memory, but rather involves the formation of a separate inhibitory trace, which can be unmasked by contextual cues. In the mammalian brain, such dual-memory systems are supported by the direct (excitatory) and indirect (inhibitory) pathways in the basal ganglia, mediated by D1 and D2 receptor-expressing neurons, respectively. This architecture highlights the need for models that can represent both activation and inhibition in associative learning ( Li et al., 2016 ), and context-sensitive gating mechanisms are likely essential for biologically plausible extinction learning. Here, we propose an extended HRL framework that addresses these three limitations. Our model allows (1) gradual , trial-by-trial updating of internal control signals; (2) asymmetric reward weighting , with a stronger emphasis on punishment; and (3) inhibitory learning , implemented through a gating mechanism that suppresses conditioned responses in the absence of expected outcomes. Then, we further extend this model to multi-dimensional homeostatic systems and demonstrate that local optimization can give rise to emergent trade-offs or even maladaptive, pathological states at the system level. Results Mathematical model of proactive homeostatic regulation To investigate how a self-regulating system actively controls internal states in anticipation of external disturbances, we developed a computational framework based on homeostatic reinforcement learning. The model’s objective is to maintain body states within a desirable range, even in the face of perturbations. 
Through repeated experience, the agent learns to predict upcoming disturbances, and activates a resistance function in advance to prevent dramatic state deviations. We first focused on thermoregulation in the context of classical conditioning. In this task, a neutral auditory cue (conditioned stimulus, CS) was repeatedly paired with an ethanol injection (unconditioned stimulus, US), which consistently induced hypothermia of approximately 1.5°C ( Fig. 1A ). Over successive trials, animals learned to anticipate the hypothermic effect by generating a compensatory heating response at the time of the CS presentation, ultimately developing tolerance to the US ( Fig. 1B ). Download figure Open in new tab Fig. 1. Body temperature regulation as an example of proactive homeostasis. (A) Sketch of conditioning scheme and body temperature change during the task. (B) Experimental data adapted from Keramati and Gutkin (2014), which is modified from Mansfield & Cunningham (1980) . The mouse gradually obtained tolerance against ethanol-induced hypothermia, and such tolerance is induced by the bell ring as shown in the extinction block (E1 after the 8th block). The 10 th trial, R1, represents the first trial of reacquisition. (C) Schematic of the proposed homeostatic RL model. Center: Block diagram illustrating interactions among the environment, body state, and agent. Left: At CS onset, a compensatory signal g ( t ) with the amplitude s is elicited, aiming to counteract the disturbance f ( t ) starting at US. Right: The reward r ( t ) is defined as the local slope of the drive function D ( x ) at the current state x ( t ) which determines the learning signal for policy adaptation. The dynamics of body temperature x ( t ) in i -th trial is described by a differential equation [equation not recovered from the extracted text], where f ( t ) represents the effect of the ethanol-induced disturbance and g ( t ) represents the compensatory response, scaled by compensatory signal s . 
The timings t US, i and t CS, i denote the timings of US and CS in the i -th trial, respectively. To represent the motivational force toward the homeostatic setpoint $x^{*}$, we introduced a quadratic drive function $D(x(t)) = a\,(x(t) - x^{*})^{2}$. The compensatory signal s is generated by two opposing neural units, an activation unit A and an inhibition unit I [equation not recovered from the extracted text], where both units are up-regulated by the CS and w A,I represents the inhibitory effect of I on A. χ is a context-dependent gating function, $\chi_i = \mathrm{CS}_{i-1}\,\mathrm{US}_{i-1}$, which is 1 when the previous trial was a conditioning trial and 0 when it was an extinction trial, and b is a constant parameter. The agent adjusts the compensatory signal s to maximize the cumulative instantaneous reward, which is defined by the temporal derivative of the drive function [equation not recovered from the extracted text], where η > 1 introduces an asymmetry that reflects a greater sensitivity to punishment than reward. This weighting scheme embeds a biologically motivated risk-averse strategy, whereby punishment avoidance is prioritized over reward acquisition. To resolve the ambiguity between self-induced and externally induced changes, we introduced a context-sensitive learning rule that modulates the update of the compensatory signal s , depending on the inferred attribution of outcome shifts. This mechanism prevents inappropriate reinforcement when deviations arise from external disturbances rather than internal regulation (see Methods for details). Simulations of Proactive Body Temperature Regulation We validated the proposed HRL model by simulating thermoregulatory learning in mice, examining whether the model can reproduce gradual acquisition of tolerance against hypothermia. In repeated CS-US pairings, the simulated body temperature showed progressive tolerance: the magnitude of the hypothermic deviation decreased across trials as the agent learned to proactively activate the compensatory response ( Fig. 2A , trial 1-10), replicating empirical data ( Fig. 1B ). 
Download figure Open in new tab Fig. 2: Homeostatic regulation of body temperature (A, B) Simulated body temperature from normal (A) and net reward (B) produced by the proposed model incorporating a gating function χ , which suppresses inhibition during reacquisition trials. The reward weight parameter was set to η = 1.5. (C, D) Corresponding results from the same model without the gating mechanism. (E, F) Results with gating intact but with a symmetric reward structure ( η = 1). (G, H) Results produced by the previous HRL model with discrete all-or-none responses. Moreover, we checked the behavior after extinction. The proposed model exhibited an immediate recovery of the compensatory response upon reacquisition ( Fig. 2A , trial 11), resulting in a sharp increase in reward ( Fig. 2B , trial 11). In the absence of the context-sensitive gating mechanism χ , this recovery was absent, even though both activation and inhibition units were present ( Fig. 2C and D , trial 11). These results indicate that the gating mechanism plays a critical role in enabling the re-expression of suppressed responses, in addition to the presence of dual learning units. Collectively, our findings support the notion that effective extinction learning requires not only inhibitory memory formation but also a context-dependent mechanism to unmask it. Next, we examined the role of reward asymmetry. When η > 1 (i.e., assigning greater weight to punishment), the model successfully acquired proactive tolerance ( Fig. 2A and B ). In contrast, when η = 1 (i.e., equal weighting of reward and punishment), learning failed ( Fig. 2E ). This failure arises because the net reward R inherently sums to zero ( Fig. 2F ), as the temperature starts from and returns to baseline. These results indicate that imbalanced reward weighting is essential for learning effective regulation. 
For comparison, we also implemented the previous model, which allows only binary, all-or-none responses of body temperature in individual trials in a probabilistic manner ( Keramati and Gutkin, 2014 ). This model failed to replicate the gradual development of tolerance ( Fig. 2G ) and showed unstable net rewards across trials ( Fig. 2H ), highlighting the benefit of our continuous-output model. Multi-dimensional model The one-dimensional (1D) model described above focused on regulating a single variable, namely body temperature, within a framework of HRL. However, real biological systems must maintain homeostasis across multiple variables simultaneously. To capture this complexity, we extended the model to a multi-dimensional framework in which multiple internal variables are regulated via a smaller set of shared meta-signals, such as neural and hormonal activities ( Fig. 3A ). Download figure Open in new tab Fig. 3. Homeostatic regulation of multiple variables with a meta-signal. (A) Model architecture with N local variables and K meta-signals. In the simulation, N = 3 and K = 2. (B-G) Simulated time series of the model variables with the reward weighting parameter η = 1.5. Shaded regions indicate the extinction phase. (B) The time series of variable x n . The first, second, and third local variables are shown in orange, blue, and green, respectively. (C) Time series of reward r n ( t ). (D) Trial-by-trial changes in local signal strength s n . (E) Meta-signal z 1 (purple) and z 2 (yellow), which modulate x 1 , x 2 , and x 2 , x 3 , respectively. (F) Net rewards R n for each local variable. (G) Total system reward R tot . In biological systems, control signals such as sympathetic tone, vasopressin, or cortisol can simultaneously influence body temperature, blood pressure, hydration, and glucose levels. 
These shared inputs, referred to here as meta-signals, give rise to trade-offs in regulation: an adjustment intended to stabilize one variable may inadvertently perturb others. In our model, this is captured by allowing each meta-signal to influence multiple variables through fixed regulatory weights w . The extended model consists of N internal variables ( x 1 , x 2 , …, x N ), each associated with a local control signal s n , which is computed as a weighted sum of K meta-signals ( z 1 , z 2 , …, z K ) ( Fig. 4 ) as $s_n = \sum_{k=1}^{K} w_{kn} z_k$, where N > K , and w kn is a fixed parameter representing the physiological influence of meta-signal z k on variable x n . We assume these weights take both positive and negative values, reflecting the fact that meta-signals exert heterogeneous effects. Therefore, adjusting one variable toward its optimal value may induce deviations in others. For example, in a hot environment, evaporative cooling through sweating lowers body temperature but leads to water loss. Similarly, pharmacological interventions targeting a specific variable often induce side effects by perturbing others. Download figure Open in new tab Fig. 4. Homeostatic regulation of multiple variables under three priority weight patterns. (A–C) Simulations with three different priority configurations, where variable x 1 (A), variable x 2 (B), or variable x 3 (C) is assigned the highest weight, as shown in the bar graphs (left). Red, green, and blue bars represent priority weights for variables x 1 , x 2 , and x 3 , respectively. For each configuration, the top-right panel shows the time series of deviations from each setpoint, $x_n(t) - x_n^{*}$, and the bottom-right panel shows the corresponding total reward R across trials. The dynamics of each internal variable x n are described by a differential equation [equation not recovered from the extracted text]. The drive function, defined as $D_n(x_n) = a_n\,(x_n - x_n^{*})^{2}$, quantifies the motivational force to restore x n to its fixed setpoint $x_n^{*}$, where a n represents the priority assigned to each variable. 
F n ( t ) and G n ( t ) denote the response profiles to US and CS, respectively. t US, i and t CS, i are the timings of unconditioned and conditioned stimuli in the i -th trial. Learning in this multi-dimensional HRL model proceeds by updating the meta-signals z k to maximize the cumulative reward of the local variables they influence. Since each meta-signal affects multiple variables, and each variable has its own priority weight a n , the reward function implicitly integrates across all variables under shared control. As a result, optimization of meta-signals involves navigating trade-offs among competing variables, favoring adjustments in higher-priority variables in order to improve the overall reward. Simulation of Proactive Regulation of the Multiple States To explore the dynamics of the multidimensional model and its capacity to maintain homeostasis across multiple variables, we focused on two key factors influencing the system’s behavior: (1) prioritization among local states and (2) degree of imbalance weight on positive and negative reward. To examine these effects, we considered the model comprising three variables x 1 , x 2 , x 3 ( N = 3), where x 1 was designated as the target of the external disturbance (US). The model included two meta-signals z 1 and z 2 ( K = 2). The physiological influence matrix of meta-signal, w , was given as which represents their up- and down-regulatory effects on the local variables in the multidimensional simulation. Although each variable x n in the model is an abstract representation of an internal physiological state, this formulation can be interpreted as a simplified representation of competing homeostatic dimensions such as body temperature, plasma glucose level, and hydration state, which are known to interact through shared regulatory signals. (1) Prioritization among local states Each variable xn represents a distinct physiological state with its own optimal setpoint . 
The prioritization for each variable is encoded in the priority parameter an within the drive function. Depending on a n , some variables are vital for survival and thus tightly maintained near their setpoints, while others can tolerate temporary deviations without substantially disrupting overall homeostasis. We examined model behavior under two prioritization patterns. We simulated the case with uniform prioritization pattern ( a n = 2 for all n ) ( Fig. 3B-G ). The system gradually improved x over time ( Fig. 3B ). Although external disturbances were applied only to x 1 ( t ), x 2 ( t ) deviated from its optimal value due to the influence of the meta-signal z 1 , which jointly regulates x 1 and x 2 . Subsequently, x 3 deviated from its own optimum to compensate for the accumulated deviations. Thus, local variables adjusted their setpoints to maximize the system-wide global reward R tot ( Fig. 3G ). We also simulated the model with non-uniform prioritization configurations, in which each of the three variables was assigned the highest priority in turn. When x 1 , the target of external disturbance, held the highest priority (i.e., a 1 > max( a 2 , a 3 )), the system exhibited only minor perturbation due to stronger reactive regulation mediated by the drive term ∂D 1 / ∂x 1 . As a result, the disturbed state rapidly returned to its setpoint ( Fig. 4A ). In this condition, the deviation of x 1 was effectively minimized, although deviations in x 2 and x 3 emerged as compensatory trade-offs. Consequently, the global reward R tot increased only modestly. In contrast, when either x 2 or x 3 was prioritized (i.e., a 2 > max( a 1 , a 3 ) or a 3 > max( a 1 , a 2 )), the initial perturbation in x 1 was larger and its recovery was slower. This resulted in delayed reward accumulation and greater compensatory deviations in the other variables ( Fig. 4B, C ). 
Overall, fluctuations in the higher-priority variables were effectively suppressed, while disturbances affecting lower-priority variables were tolerated and left largely uncorrected. Accordingly, when either x 2 or x 3 received the highest priority, the system failed to fully compensate for the external disturbances applied to x 1 , reflecting a prioritization strategy that protects critical variables at the expense of others. (2) Joint parameter effect on RL success We systematically investigated how reward asymmetry ( η ) and prioritization ( a ) jointly influence the learning dynamics. The prioritization pattern was parameterized by a single scalar angle θ [equation not recovered from the extracted text], which allows continuous variation across the entire spectrum of prioritization configurations ( Fig. 5A ). For each parameter pair ( η, θ ), we performed a grid-search to identify the optimal strength of the meta-signals z * ( η, θ ), z = ( z 1 , z 2 , …, z K ), that maximizes the system’s reward in a trial, R tot ( Fig. 5B , Fig. S2). Download figure Open in new tab Fig. 5. Influence of parameter pairs on multidimensional model dynamics. (A) Parameterization of priority weights a n as a function of prioritization angle θ . Orange, green, and blue lines represent a 1 , a 2 , a 3 , respectively. (B) Example heatmap of R tot depending on meta-signal strengths. The star represents the reward-optimal meta-signal z * ( η, θ ), where η = 3 and θ = 1.3878. (C) Reward-optimal z * ( η, θ ) for each parameter pair. Each star marks the reward optimum z * ( η, θ ); hue denotes θ and lightness denotes η (see the right panel). As the optimum’s location varies mainly with θ , points for lower η (darker shades of the same hue) lie directly behind those for higher η and may not be visible. (D, E) Trajectories of the two meta-signals ( z 1 , z 2 ) across 100 trials for representative values of θ . 
Trajectories were smoothed by averaging across every 3 trials in (D) balanced prioritization conditions ( θ ∉ {0.2857, 0.9796, 1.6735}) and (E) disproportionate prioritization conditions( θ ∈ {0.2857, 0.9796, 1.6735}). (F) Heatmap of the mean distance to the reward optimum z * ( η, θ ) across the ( η, θ )-plane (averaged over the final 10 trials). Because RL success or failure can be evaluated by whether the simulated meta-signal trajectory z k,i approaches the reward-optimal z * ( η, θ ), we simulated the multidimensional model for 100 CS–US trials without an external disturbance. This procedure resolved the trajectory of the meta-signals under each parameter setting ( Fig. 5D ). When the prioritization was more balanced, trajectories remained tightly confined around the optimum ( Fig. 5D left); by contrast, under extremely disproportionate prioritization, trajectories deviated from the optimum ( Fig. 5D right). Lastly, we tested whether the meta-signals reliably converge to the reward optimum z * ( η, θ ) across the ( η, θ ) space. Deviation from the optimum was quantified as the mean Euclidean distance between the simulated trajectory and z * ( η, θ ) over the final 10 trials. This systematic sweep revealed contiguous regions of the ( η, θ )plane where the meta-signals consistently converged to z * ( η, θ ), and distinct regions where trajectories moved away from the optimum, indicating failure to learn appropriate compensatory responses ( Fig. 5E ). The resulting phase diagram delineates the parameter regimes in which the proposed RL mechanism is expected to succeed. Importantly, the external disturbance acted directly only on the first variable, x 1 ( Fig. 3A ) Discussion Summary This study proposes an extended framework of homeostatic reinforcement learning (HRL) that enables proactive and gradual regulation of internal physiological states through experience. 
In a one-dimensional thermoregulatory conditioning task, the model reproduces ethanol tolerance and rapid reacquisition after extinction by learning trial-by-trial compensatory responses, consistent with CS–US thermoregulatory data ( Mansfield and Cunningham, 1980 ) Extending to multiple physiological axes, the framework shows that shared regulatory signals impose unavoidable trade-offs: priority structure determines which variables are tightly stabilized and which are allowed to fluctuate, thereby shaping convergence or failure. Extensions beyond the original HRL Model Relative to the original HRL model ( Keramati and Gutkin, 2014 ), three biologically informed ingredients are central and each is needed to capture the observed phenomena. First, continuous control signals allow graded, trial-by-trial compensation rather than all-or-none actions. In the one-dimensional thermoregulatory task, this continuity reproduces gradual tolerance and rapid reacquisition after extinction, in line with CS–US thermoregulatory findings and Pavlovian tolerance reports ( Farahbakhsh and Siciliano, 2023 ; Mansfield and Cunningham, 1980 ; Poulos and Cappell, 1991 ; Siegel, 2005 ,2001). Second, loss-dominant valuation – greater weight on increases in drive than on equivalent decreases – is required in our formulation. Without this asymmetry, the temporal-derivative reward becomes uninformative and prevents learning. This is consistent with asymmetry observed in biological decision-making ( Macpherson et al., 2014 ; Sokol-Hessner and Rutledge, 2019 ). Third, inhibitory learning with context-sensitive gating treats extinction as inhibition of expression rather than of the original memory trace. This representation aligns with evidence that extinction recruits inhibitory circuits and is context dependent, which in turn allows re-expression phenomena such as spontaneous recovery and rapid reacquisition ( Bouton, 2002 ; Bouton et al., 2006 ; Quirk and Mueller, 2008 ; Whittle et al., 2021 ). 
In the multidimensional extension, the framework jointly regulates several variables through shared meta-signals and explicitly specified priority weights. This arrangement clarifies how priority choices create trade-offs across axes. Prioritizing one variable tightens stabilization of that axis at the expense of others. Pathological Relevance In the multidimensional model, when one homeostatic variable receives disproportionately low priority, its deviation is effectively ignored during learning, leading to persistent misattribution of disturbances as externally caused and to non-convergence of compensatory responses. Over time, such model dynamics suggest a cautious link to autonomic dysregulation. In myalgic encephalomyelitis/chronic fatigue syndrome (ME/CFS), for example, orthostatic intolerance and related autonomic abnormalities are frequently reported ( Freeman and Komaroff, 1997 ; Garner and Baraniuk, 2019 ) and reduced heart-rate variability (HRV) indicates altered cardiac autonomic regulation ( Nelson et al., 2019 ), with HRV abnormalities further associated with fatigue severity ( Escorihuela et al., 2020 ). These empirical features are consistent with theoretical accounts emphasizing impaired allostatic control and aberrant interoceptive inference ( Barrett and Simmons, 2015 ; Stephan et al., 2016 ). In this framing, clinical manifestations can be mapped onto specific parameter regimes of the model, thereby linking clinical observations to the computational dynamics of HRL. Perturbation-learning protocols could be used to estimate relative priority weights from time-series data and to assess the prevalence of non-convergent adaptation. Such tests would evaluate the model’s relevance to patient populations. 
Comparison with Prior Works Classical homeostatic models grounded in negative feedback have been highly successful at explaining rapid, reactive stabilization of internal variables—for example, baroreflex control of blood pressure and thermoregulatory set-point maintenance ( Ashby, 2013 ; Cannon, 1932 ). By design, however, these formulations do not incorporate experience-dependent learning or contextual adaptation, and thus do not readily account for gradual tolerance or cue-dependent modulation of internal states. A complementary perspective is allostasis, which emphasizes “stability through change,” highlighting predictive, context-sensitive adjustments of set points ( Sterling and Eyer, 1988 ; McEwen, 2017 ; Ramsay and Woods, 2014 ; Sterling, 2012 , 2004 ). This literature provides conceptual and empirical motivation for anticipatory regulation in stress and neuroendocrine systems, but it has largely remained descriptive and does not specify how such predictive adjustments are learned from repeated experience. Predictive coding and the free-energy principle offer a normative route to anticipatory control, casting agents as inference systems that maintain and update generative models of body and environment to minimize prediction error ( Friston et al., 2015 ; Friston and Stephan, 2007 ; Pezzulo et al., 2024 ; Tschantz et al., 2022 ). These approaches have broad explanatory scope but typically assume structured priors or learned latent dynamics and entail nontrivial computational demands, which may be challenging for fast, resource-limited autonomic regulation. Our reinforcement-learning account complements these traditions by producing anticipatory regulation without constructing explicit generative models. Building on prior HRL work ( Keramati and Gutkin, 2014 ), we introduce biologically informed components that together reproduce gradual tolerance, extinction with re-expression (spontaneous recovery/rapid reacquisition), and multivariable trade-offs. 
In this way, the framework provides a learning-based, model-free complement to predictive coding/FEP and a computational instantiation of allostatic ideas, while remaining simple enough to yield testable predictions about when regulation converges or fails ( Friston et al., 2015 ; Friston and Stephan, 2007 ; Ramsay and Woods, 2014 ; Sterling, 2012 ). Limitations and Future Directions Despite its strengths, the model has limitations. Reward was defined solely as the temporal derivative of the drive function, reducing sensitivity to persistent deviations. The influence matrix w was fixed, limiting flexibility in representing adaptive reweighting of regulatory signals. Moreover, simulations were not fit directly to multi-variable physiological datasets. Future work should integrate empirical data (e.g., temperature, glucose, hydration) and allow adaptive coupling weights to capture long-term physiological changes. Parameter regimes that yielded instability in simulations may correspond to dysregulated states in real organisms; testing these predictions against pathological data represents an important next step. Beyond these technical considerations, the framework offers a compact yet extensible bridge between reinforcement learning and physiological regulation. By emphasizing experience-dependent adaptation within biologically realistic constraints, it complements control-theoretic, Bayesian, and predictive coding approaches. Looking forward, its integration with empirical datasets may transform it from a conceptual model into a translational tool—deepening our understanding of adaptive and maladaptive regulation, and guiding strategies for intervention. Methods 1D model The dynamics of body temperature x in the i-th trial are described by the equation: f ( t ) and g ( t ) are transient functions which start at t = 0, where f ( t ) = 0 and g ( t ) = 0 when t ≤ 0. t US and t CS denote the timings of US and CS, respectively. 
The function f ( t ) represents the effect of ethanol injection on body temperature, leading to hypothermia, while g ( t ) represents the compensatory response. In our simulation, the dynamics of body temperature were modeled using this equation for each trial, which was repeated multiple times. The drive function $D(x(t))$ represents the motivation to shift the body state based on the current state, which is defined as the distance of the internal state from the desired setpoint x * . In this study, we employ a quadratic function, $D(x) = \frac{a}{2}\,(x - x^{*})^{2}$. The model optimizes the control variable sn to maximize reward r. In the context of homeostatic reinforcement learning, the reward at each time t is defined as the negative temporal derivative of the drive function D, i.e., $-dD/dt$, where $dD/dt = a\,(x - x^{*})\,dx/dt$. Additionally, we introduced an asymmetry in positive and negative rewards based on prospect theory, by adding a weight on negative reward (punishment). The instant reward r(t) is defined as $r(t) = -\,dD/dt$ if $dD/dt \le 0$ and $r(t) = -\,\eta\, dD/dt$ if $dD/dt > 0$, where $\eta > 1$ represents the asymmetry. The net reward of the i -th trial R i and the cumulative deviation of body temperature X i were calculated as temporal integrals of their time series, where T i denotes the time at which the i -th trial ends. Note that R i is always negative if η > 1 and is zero if η = 1. The strength of the compensatory signal s i is assumed to be modulated by two neural populations, A and I , which respectively promote and suppress the response, both driven by the CS. Their interaction is governed by a gating variable χ i (i.e., χ i = CS i-1 US i-1 , CS i ,US i ∈ {0,1}), defined as where w A,I represents the inhibitory effect of I on A , and indicates the effect size of neural activity on temperature dynamics. This gating mechanism reflects the idea that an activatory memory can be masked by an inhibitory trace, and that such inhibition can be transiently disengaged upon presentation of a retrieval cue. 
This formulation is consistent with the biological observation that extinction does not erase original associative learning, but instead superimposes context-dependent inhibition. Both A and I are upregulated by CS input where and are weights on A i and I i , respectively. These weights are updated based on the discrepancy between actual and ideal rewards, following the assumptions introduced in the Rescorla-Wagner model, a classical conditioning model. The update of signal weights is where α represents the learning rate and m ∈ { A, I }. The update process is divided into two components: and , which represent how much reward is predicted by self-regulation of and by unexpected deviations of the body state (e.g., when the acquisition phase changes to the extinction phase and vice versa). is described by temporal reward shift R i with respect to the increment of , with a sigmoid function h to limit the update amount per step, as where and ϵ is a small constant to avoid division by zero. In contrast, is described by temporal change in reward R i with respect to the deviation of X i , as where Δ X i = X i − X i − 1 . In the second term, ω (0 ≤ ω ≤ 1) determines the weight of and as where σ (·) = 1/(1 + exp(·)), Δ s i = s i − s i –1 , and c denotes a constant parameter. This ω reflects whether the change in R i is more likely attributable to the self-regulation signal (small ω ) or to unexpected deviations of the body state (large ω ) (for intuition, we visualize the functional form of σ relative to an example dataset: see Supplementary Fig. S1). When R i improves gradually as a result of the update of signal weights , ω takes a small value, hence is dominant. In contrast, when R i experiences sudden changes (jump or drop) due to unexpected deviations of the body state, ω takes a large value, hence is applied. 
In this scenario with large ω , since the change in R i is not directly related to , the gradient for updating is approximated by calculating the temporal difference of net reward R i with respect to the deviation of X i . Multi-dimensional extension In the one-dimensional (1D) model described above, the regulation of a single internal variable, body temperature, was considered. However, real biological systems must simultaneously maintain homeostasis across multiple physiological variables. Here, we consider a scenario in which the brain controls multiple internal variables (e.g., body temperature, blood pressure, blood glucose level) through a limited number of command signals, such as peripheral neural activity or hormonal secretion. In such a case, each command signal influences multiple variables simultaneously. Consequently, adjusting one variable toward its optimal value may cause other variables to deviate from theirs. For instance, when the body lowers an elevated temperature via sweating, the body temperature approaches its optimal range, but this leads to a loss of water. Likewise, pharmacological interventions often have side effects: while a drug may alleviate one condition, it may simultaneously disturb another. Within the framework of Homeostatic Reinforcement Learning (HRL), where each internal variable is associated with a fixed set point, a single action that optimizes one variable does not necessarily bring all other variables closer to their respective set points. Regulatory actions therefore involve inevitable trade-offs, producing both beneficial and adverse consequences. At each ( i -th) trial, each variable x n,i is regulated by a local signal s n,i which is in turn influenced by a weighted sum of meta-signal z k,i where w kn represents the fixed physiological weight from the k -th meta-signal to the n -th variable (see Fig. 
3 ). The dynamics of the n -th variable x n are defined as where is the drive function, t US, i and t CS, i denote the timings of the unconditioned (US) and conditioned (CS) stimuli, respectively, and F n ( t ) and G n ( t ) are US- and CS-induced input functions for the n-th variable. The amplitude of each G n ( t ) is modulated by the local signal s n,i which itself is regulated by the meta-signals z k,i . An instantaneous reward r n ( t ) is defined independently for each variable. As in the 1D model, the net reward R n,i and the cumulative deviation of variable X n,i for the i -th trial are computed accordingly as follows. The meta-signal z k is optimized through a control variable u k , which aims to maximize the total reward . The meta-signal z k is assumed to be modulated by two neural populations, A k (excitatory) and I k (inhibitory), with the control variable (for m ∈ { A, I }) updated as with a gating variable χ i (i.e., χ i = CS i-1 US i-1 , CS i ,US i ∈ {0,1}). The parameter w A,I represents the inhibitory effect of I k on A k , and indicates the effect size of neural activity on temperature dynamics. The update consists of two components: , which reflects self-regulation based on predicted reward changes, and , which reflects adjustments due to external disturbances. These are defined as where and ϵ is a small constant to avoid division by zero. The weighting parameter ω n , which determines the relative contribution of to , is computed as where c is a parameter that shifts the sigmoid function. All model parameters are listed in Table 1 in the Supplementary Material. Competing Interests M.F. and H.N. declare no competing interests. Author contributions M.F. and H.N. conceived the research project and wrote the manuscript. M.F. developed the model and implemented the analysis. Acknowledgements We are grateful to Prof. Michiyuki Matsuda and Prof. Kazuhiro Aoki for providing the research environment. 
We thank Sana Ishihara for assistance with preliminary model construction related to this study. This study was supported in part by Japan Society for the Promotion of Science (JSPS) KAKENHI [23KJ1293 to M.F], Japan Science and Technology Agency (JST) the Moonshot R&D–MILLENNIA Program [JPMJMS2024-9 to H.N.], and Japan Agency for Medical Research and Development (AMED) Multidisciplinary Frontier Brain and Neuroscience Discoveries (Brain/MINDS 2.0) [JP25wm0625322 to H.N.]. Funder Information Declared Japan Society for the Promotion of Science, https://ror.org/00hhkn466 , 23KJ1293 Japan Science and Technology Agency , JPMJMS2024-9 Japan Agency for Medical Research and Development , JP25wm0625322 References ↵ Ashby , W. , 2013 . Design for a brain: The origin of adaptive behaviour . ↵ Barrett , L.F. , Simmons , W.K. , 2015 . Interoceptive predictions in the brain . Nat. Rev. Neurosci . 16 , 419 – 429 . OpenUrl CrossRef PubMed ↵ Bouton , M.E. , 2002 . Context, ambiguity, and unlearning: sources of relapse after behavioral extinction . Biol. Psychiatry 52 , 976 – 986 . OpenUrl CrossRef PubMed Web of Science ↵ Bouton , M.E. , Westbrook , R.F. , Corcoran , K.A. , Maren , S. , 2006 . Contextual and temporal modulation of extinction: behavioral and biological mechanisms . Biol. Psychiatry 60 , 352 – 360 . OpenUrl CrossRef PubMed Web of Science ↵ Cannon , W.B. , 1932 . The wisdom of the body . Am. J. Med. Sci . 184 , 864 . OpenUrl ↵ Escorihuela , R.M. , Capdevila , L. , Castro , J.R. , Zaragozà , M.C. , Maurel , S. , Alegre , J. , Castro-Marrero , J. , 2020 . Reduced heart rate variability predicts fatigue severity in individuals with chronic fatigue syndrome/myalgic encephalomyelitis . J. Transl. Med . 18 , 4 . OpenUrl CrossRef PubMed ↵ Farahbakhsh , Z.Z. , Siciliano , C.A. , 2023 . Pavlovian-conditioned opioid tolerance . Sci. Adv . 9 , eadg6086 . OpenUrl PubMed ↵ Freeman , R. , Komaroff , A.L. , 1997 . 
Does the chronic fatigue syndrome involve the autonomic nervous system? Am . J. Med . 102 , 357 – 364 . OpenUrl ↵ Friston , K. , Levin , M. , Sengupta , B. , Pezzulo , G. , 2015 . Knowing one’s place: a free-energy approach to pattern regulation . J. R. Soc. Interface 12 . doi: 10.1098/rsif.2014.1383 OpenUrl CrossRef PubMed ↵ Friston , K.J. , Stephan , K.E. , 2007 . Free-energy and the brain . Synthese 159 , 417 – 458 . OpenUrl CrossRef PubMed Web of Science ↵ Garner , R. , Baraniuk , J.N. , 2019 . Orthostatic intolerance in chronic fatigue syndrome . J. Transl. Med . 17 , 185 . OpenUrl PubMed ↵ Hou , Y. , Zou , G. , Wang , X. , Guo , H. , Ma , X. , Cheng , X. , Xie , Z. , Zuo , X. , Xia , J. , Mao , H. , Yuan , M. , Chen , Q. , Cao , P. , Yang , Y. , Zhang , L. , Xiong , W. , 2023 . Coordinated activity of a central pathway drives associative opioid analgesic tolerance . Sci. Adv . 9 , eabo5627 . OpenUrl CrossRef PubMed ↵ Iwashyna , T.J. , 2012 . Trajectories of recovery and dysfunction after acute illness, with implications for clinical trial design . Am. J. Respir. Crit. Care Med . ↵ Keramati , M. , Durand , A. , Girardeau , P. , Gutkin , B. , Ahmed , S.H. , 2017 . Cocaine addiction as a homeostatic reinforcement learning disorder . Psychol. Rev . 124 , 130 – 153 . OpenUrl CrossRef PubMed ↵ Keramati , M. , Gutkin , B. , 2014 . Homeostatic reinforcement learning for integrating reward collection and physiological stability . Elife 3 . doi: 10.7554/eLife.04811 OpenUrl CrossRef PubMed ↵ Le , A. , Poulos , C. , Cappell , H. , 1979 . Conditioned tolerance to the hypothermic effect of ethyl alcohol . Science . doi: 10.1126/science.493999 OpenUrl Abstract / FREE Full Text ↵ Li , Y. , Nakae , K. , Ishii , S. , Naoki , H. , 2016 . Uncertainty-Dependent Extinction of Fear Memory in an Amygdala-mPFC Neural Circuit Model . PLoS Comput. Biol . 12 , e1005099 . OpenUrl CrossRef PubMed ↵ Macpherson , T. , Morita , M. , Hikida , T. , 2014 . 
Striatal direct and indirect pathways control decision-making behavior . Front. Psychol . 5 , 1301 . OpenUrl CrossRef PubMed ↵ Mansfield , J.G. , Benedict , R.S. , Woods , S.C. , 1983 . Response specificity of behaviorally augmented tolerance to ethanol supports a learning interpretation . Psychopharmacology 79 , 94 – 98 . OpenUrl CrossRef PubMed ↵ Mansfield , J.G. , Cunningham , C.L. , 1980 . Conditioning and extinction of tolerance to the hypothermic effect of ethanol in rats . J. Comp. Physiol. Psychol . 94 , 962 – 969 . OpenUrl CrossRef PubMed Web of Science ↵ McEwen , B.S. , 2017 . Stress: Homeostasis, Rheostasis, Reactive Scope, Allostasis and Allostatic Load☆ , in: Reference Module in Neuroscience and Biobehavioral Psychology . Elsevier . ↵ Myers , K.M. , Davis , M. , 2002 . Behavioral and neural analysis of extinction . Neuron 36 , 567 – 584 . OpenUrl CrossRef PubMed Web of Science ↵ Napier , R.M. , Macrae , M. , Kehoe , E.J. , 1992 . Rapid reacquisition in conditioning of the rabbit’s nictitating membrane response . J. Exp. Psychol. Anim. Behav. Process . 18 , 182 – 192 . OpenUrl CrossRef PubMed Web of Science ↵ Nelson , M.J. , Bahl , J.S. , Buckley , J.D. , Thomson , R.L. , Davison , K. , 2019 . Evidence of altered cardiac autonomic regulation in myalgic encephalomyelitis/chronic fatigue syndrome: A systematic review and meta-analysis . Medicine (Baltimore) 98 , e17600 . OpenUrl CrossRef PubMed ↵ Pezzulo , G. , Parr , T. , Friston , K. , 2024 . Active inference as a theory of sentient behavior . Biol. Psychol . 186 , 108741 . OpenUrl CrossRef PubMed ↵ Poulos , C.X. , Cappell , H. , 1991 . Homeostatic theory of drug tolerance: A general model of physiological adaptation . Psychol. Rev . 98 , 390 – 408 . OpenUrl CrossRef PubMed Web of Science ↵ Quirk , G.J. , Mueller , D. , 2008 . Neural mechanisms of extinction learning and retrieval . Neuropsychopharmacology 33 , 56 – 72 . OpenUrl CrossRef PubMed Web of Science ↵ Ramsay , D.S. , Woods , S.C. , 2014 . 
Clarifying the roles of homeostasis and allostasis in physiological regulation . Psychol. Rev . 121 , 225 – 247 . OpenUrl CrossRef PubMed ↵ Rescorla , R.A. , Heth , C.D. , 1975 . Reinstatement of fear to an extinguished conditioned stimulus . J. Exp. Psychol. Anim. Behav. Process . 1 , 88 – 96 . OpenUrl CrossRef PubMed ↵ Siegel , S. , 2005 . Drug tolerance, drug addiction, and drug anticipation . Curr. Dir. Psychol. Sci . 14 , 296 – 300 . OpenUrl CrossRef ↵ Siegel , S. , 2001 . Pavlovian Conditioning and Drug Overdose: When Tolerance Fails . Addiction Research & Theory . doi: 10.3109/16066350109141767 OpenUrl CrossRef Web of Science ↵ Sokol-Hessner , P. , Rutledge , R.B. , 2019 . The psychological and neural basis of loss aversion . Curr. Dir. Psychol. Sci . 28 , 20 – 27 . OpenUrl CrossRef ↵ Stephan , K.E. , Manjaly , Z.M. , Mathys , C.D. , Weber , L.A.E. , Paliwal , S. , Gard , T. , Tittgemeyer , M. , Fleming , S.M. , Haker , H. , Seth , A.K. , Petzschner , F.H. , 2016 . Allostatic self-efficacy: A metacognitive theory of dyshomeostasis-induced fatigue and depression . Front. Hum. Neurosci . 10 , 550 . OpenUrl PubMed ↵ Sterling , P. , 2012 . Allostasis: a model of predictive regulation . Physiol. Behav . 106 , 5 – 15 . OpenUrl CrossRef PubMed Web of Science ↵ Sterling , P. , 2004 . Principles of allostasis: Optimal design, predictive regulation, pathophysiology, and rational therapeutics , in: Schulkin , J. (Ed.), Allostasis, Homeostasis, and the Costs of Physiological Adaptation . Cambridge University Press, Cambridge , pp. 17 – 64 . ↵ Tschantz , A. , Barca , L. , Maisto , D. , Buckley , C.L. , Seth , A.K. , Pezzulo , G. , 2022 . Simulating homeostatic, allostatic and goal-directed forms of interoceptive control using active inference . Biol. Psychol . 169 , 108266 . OpenUrl CrossRef PubMed ↵ Uchida , Y. , Hikida , T. , Yamashita , Y. , 2022 . Computational Mechanisms of Osmoregulation: A Reinforcement Learning Model for Sodium Appetite . Front. 
Neurosci . 16 , 857009 . OpenUrl CrossRef PubMed ↵ Voiriot , G. , Oualha , M. , Pierre , A. , Salmon-Gandonnière , C. , Gaudet , A. , Jouan , Y. , Kallel , H. , Radermacher , P. , Vodovar , D. , Sarton , B. , Stiel , L. , Bréchot , N. , Préau , S. , Joffre , J. , la CRT de la SRLF , 2022 . Chronic critical illness and post-intensive care syndrome: from pathophysiology to clinical challenges . Ann. Intensive Care 12 , 58 . OpenUrl PubMed ↵ Whittle , N. , Fadok , J. , MacPherson , K.P. , Nguyen , R. , Botta , P. , Wolff , S.B.E. , Müller , C. , Herry , C. , Tovote , P. , Holmes , A. , Singewald , N. , Lüthi , A. , Ciocchi , S. , 2021 . Central amygdala micro-circuits mediate fear extinction . Nat. Commun . 12 , 4156 . OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted October 17, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Gradual proactive regulation of body state by reinforcement learning of homeostasis Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Gradual proactive regulation of body state by reinforcement learning of homeostasis Mana Fujiwara , Honda Naoki bioRxiv 2025.10.17.682979; doi: https://doi.org/10.1101/2025.10.17.682979 Share This Article: Copy Citation Tools Gradual proactive regulation of body state by reinforcement learning of homeostasis Mana Fujiwara , Honda Naoki bioRxiv 2025.10.17.682979; doi: https://doi.org/10.1101/2025.10.17.682979 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neuroscience Subject Areas All Articles Animal Behavior and Cognition (7635) Biochemistry (17691) Bioengineering (13892) Bioinformatics (41937) Biophysics (21452) Cancer Biology (18588) Cell Biology (25504) Clinical Trials (138) Developmental Biology (13378) Ecology (19899) Epidemiology (2067) Evolutionary Biology (24320) Genetics (15609) Genomics (22506) Immunology (17736) Microbiology (40394) Molecular Biology (17181) Neuroscience (88605) Paleontology (666) Pathology (2832) Pharmacology and Toxicology (4824) Physiology (7641) Plant Biology (15156) Scientific Communication and Education (2045) Synthetic Biology (4294) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00