Multivariate Bayesian Inversion for Classification and Regression

preprint OA: closed CC-BY-ND-4.0
📄 Open PDF Full text JSON View at publisher
Full text 148,214 characters · extracted from preprint-html · click to expand
Multivariate Bayesian Inversion for Classification and Regression | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Multivariate Bayesian Inversion for Classification and Regression View ORCID Profile Joram Soch , View ORCID Profile Carsten Allefeld doi: https://doi.org/10.1101/2025.05.09.653015 Joram Soch 1 Institute of Psychology, Otto von Guericke University , Magdeburg, Germany 2 Bernstein Center for Computational Neuroscience, Charité – Universitätsmedizin Berlin , Berlin, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joram Soch For correspondence: joram.soch{at}ovgu.de Carsten Allefeld 3 Department of Psychology and Neuroscience, City St George’s, University of London , London, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Carsten Allefeld Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract We propose the statistical modelling approach to supervised learning (i.e. predicting labels from features) as an alternative to algorithmic machine learning (ML). The approach is demonstrated by employing a multivariate general linear model (MGLM) describing the effects of labels on features, possibly accounting for covariates of no interest, in combination with prior distributions on the model parameters. ML “training” is translated into estimating the MGLM parameters via Bayesian inference and ML “testing” or application is translated into Bayesian model comparison – a reciprocal relationship we refer to as multivariate Bayesian inversion (MBI). We devise MBI algorithms for the standard cases of supervised learning, discrete classification and continuous regression, derive novel classification rules and regression predictions, and use practical examples (simulated and real data) to illustrate benefits of the statistical modelling approach: interpretability, incorporation of prior knowledge, probabilistic predictions. We close by discussing further advantages, disadvantages and the future potential of MBI. 1 Introduction The core aims of machine learning (ML) are the identification of patterns in data and the use of these patterns for the prediction of variables of interest [ 1 ], allowing for the automatization of complex problems such as optical character recognition [ 2 ], human speech recognition [ 3 ] or strategic game play [ 4 ]. Typically, ML is based on constructing a mapping from measured variables, or features , to some variables of interest, or labels . For example, ML may proceed by separating data points in a high-dimensional feature space to classify them into discretely labelled categories, as in support-vector machines (SVMs) [ 5 , 6 ], or by transforming features using complex non-linear mappings to predict continuous target labels, as in deep convolutional neural networks (DNNs/CNNs) [ 7 , 8 ]. In the ML approach, labels are algorithmically extracted from features. From a statistical perspective, an alternative is to model the features in terms of the labels (see Table 1 ) – and in many contexts where ML is applied, this is the more plausible approach, because the statistical model is often interpretable in terms of an underlying data-generating process. A statistical model entails the specification of probability distributions for the feature variables given the label variables, a so-called generative model [ 9 , 10 ]. This typically invokes unknown parameters that can be estimated from the data and, in combination with prior distributions on those parameters, it becomes a full probability model [ 11 ]. Once the model parameters have been estimated, the model can be inverted for probabilistic inference of labels from features [ 12 , 13 ]. Fundamental advantages of the statistical over the algorithmic approach are the possibility to incorporate explicit knowledge about the relationships between labels and features [ 14 ], both through the model structure and prior distributions, and its ability to qualify predictions by associating them with probabilities. For further technical advantages, see below and Section 5 . View this table: View inline View popup Download powerpoint Table 1. Terminology. Analogous terms commonly used in the ML and statistics literatures, accompanied by an explanation of our use of them. The aim of this paper is to demonstrate this statistical approach by adapting a standard model from multivariate statistics to a predictive modelling context, showing its application in simulations and analyses of existing data sets. Multivariate features are assumed to be normally distributed and influenced by some label variable of interest as well as possibly covariates of no interest (see Table 1 ). Specifically, we use a multivariate general linear model (MGLM) [ 15 , 16 ] to describe linear relationships between features, labels and covariates (see Table 2 ). Via a fully Bayesian treatment of the MGLM, we calculate conditional probabilities for the label variables, given the multivariate features. We refer to this as multivariate Bayesian inversion (MBI) and show how it can be used for both classification and regression. View this table: View inline View popup Download powerpoint Table 2. Notation. Symbols used in the mathematical description of MBI. Quantities are either measured (empirically observed), known (experimentally controlled), unknown (statistically estimated) or predicted (reconstructed from the features). In the process, we provide a systematic review of Bayesian parameter estimation and Bayesian model comparison for the MGLM. While these techniques are very well known [ 12 , 17 , 18 ] and have been applied in many different contexts before (e.g. [ 19 , 20 ]), we here combine them to develop a comprehensive Bayes classifier (and Bayes regressor) for the MGLM. Taken together, the contributions of this work are three-fold: We rewrite the ML task of predicting from multivariate continuous data in terms of an interpretable probabilistic model which, rather than extracting labels from features, describes the effects that labels have on features. We show how inversion of this model by means of Bayesian model selection can be used to perform ML tasks such as classification and regression and, in addition, provides posterior probabilities for the predicted labels. We familiarize users of standard ML methods (e.g. SVMs) with statistical-model-based predictive analyses and demonstrate how common problems in ML – e.g. inclusion of variables of no interest, integration of prior knowledge, classification analyses for unbalanced categories, regression analyses for bounded variables, and correlated observations – can be handled naturally and elegantly by the MBI framework. We are aware of the fact that the simple model which we describe here lacks several capabilities of currently leading ML methods (i.e. DNNs) – e.g. facilitating predicition in high-dimensional spaces, coping with non-normal and heteroskedastic error distributions, or allowing for non-linear relationships between features and labels via kernel-based methods. Our goal is not to outperform these approaches in terms of predictive accuracy. Rather, we want to provide a basic framework for statistical-model-based Bayesian machine learning which may be further extended to address these issues. Some possible extensions are discussed in Section 5.3 . The present paper is structured as follows. First, we outline the statistical theory and derive the mathematical equations underlying MBI (see Section 2 ). Next, we present a series of MBI applications based on the analysis of simulated and empirical data (see Sections 3 and 4 ). In these sections, we also compare results based on MBI with results obtained using SVMs, representing a comparably flexible and currently popular ML approach [ 1 , 21 , 22 ]. Finally, we review various aspects illustrated by these simulations and discuss advantages, disadvantages and possible extensions of MBI (see Section 5 ). Terminology Note that, following the statistical modelling approach, we conceptualize an ML problem as learning a relationship of the form Y = f ( X ) + ε – in which Y are features and X are labels –, as opposed to the typical ML approach of extracting estimates of labels from observations of features, i.e. . Consequently, we refer to labels as “independent variables” and features as “dependent variables”, not the other way around. This is because the statistical model describes Y as a function of X , but when inverted, can be used to deduce X from Y . Notation In the following, we denote n -dimensional zero vectors as 0 n and n -dimensional vectors of all ones as 1 n . Similarly, we denote n × m zero matrices as 0 nm and n × m matrices of all ones as 1 nm . We write the n × n identity matrix as I n and write its i -th column, i.e. the i -th n -dimensional unit vector, as e i for i = 1, …, n . We use 𝒩 ( µ , Σ) to denote the multivariate normal distribution with expected value µ ∈ ℝ n and covariance matrix where is the set of all positive-definite symmetric matrices of size n × n . Finally, we use 𝒩 ℳ ( M, U, V ) to denote the matrix normal distribution with expected value M ∈ ℝ n × m , covariance matrix across rows and covariance matrix across columns . 2 Theory In this section, we introduce the foundations of multivariate Bayesian inversion (MBI). We first review the notions of predictive algorithms and generative models and then introduce the multivariate general linear model as a generative model (see Section 2.1 ). We go on to describe how, using Bayesian data analysis, one can either estimate the parameters of such a model or perform comparison between the models themselves (see Section 2.2 ). Using the MBI framework for predictive analyses rests on the reciprocal estimation of parameters and labels, i.e. either label values are known and model parameters have to be estimated or model parameters are known and label values have to be predicted (see Section 2.3 ). Finally, upon partitioning available data into training and test data, MBI can be used for discrete classification or continuous regression, respectively (see Section 2.4 ). 2.1 Linear models for multivariate data 2.1.1 Predictive algorithms vs. generative models Consider some features Y ∈ 𝒴 and some labels X ∈ 𝒳, i.e. random variables (dependent and independent variables, see Table 1 ) with sample spaces 𝒴 and 𝒳, respectively. Then, a function g : 𝒴 → 𝒳 is called a predictive algorithm . In supervised learning, one typically calibrates g based on a training set of features and labels ( Y 1 , X 1 ) and then evaluates the performance of g by predicting the labels X 2 of test features Y 2 , where ĝ denotes the calibrated predictive algorithm and are the predicted labels of the test features [ 23 , 24 ]. A common example for this is the multivariate continuous prediction task in which Y ∈ ℝ v is a random vector of v continuous features and either X ∈ { 1, …, C } is a discrete class index or X ∈ ℝ is a real number. In the former case, the multivariate continuous prediction task is referred to as a classification task and is typically solved using algorithms such as multinomial logistic regression or support vector classification [ 5 , 25 ]. In the latter case, the multivariate continuous prediction task is referred to as a regression problem and can be solved using e.g. multiple linear regression or support vector regression [ 6 ]. Alternatively, the relationship between features Y and labels X can be furnished by taking a statistical perspective. Here, a statement about the probability distribution of Y , given X and some model parameters θ ∈ Θ, is referred to as a generative model . The term implies that, given fixed parameters θ , features Y can be generated by sampling the data-generating mechanism, where m denotes the generative model and 𝒟 ( X, θ ) signifies an arbitrary probability distribution parametrized in terms of X and θ . This distribution implies a probability function p ( Y | X, θ, m ): 𝒴 → ℝ ≥0 , which – when viewed as a function θ for fixed Y – is referred to as the likelihood function of the generative model. The idea behind this generative approach is to model the effects that labels have on features ( X→ Y ), with the intention to later invert the model in order to estimate labels from features ( Y →X ). When labels are latent variables causally influencing observable features – which is often, but not always the case in real-world applications (see Section 4 ) –, the generative model captures the data-generating mechanism by describing p ( Y | X ) rather than p ( X | Y ) (see Section 2.7 ). A typical example for this is a Gaussian model where 𝒟 is a (potentially multivariate) Gaussian distribution and θ are weights or coefficients characterizing linear or non-linear effects of X on Y . Importantly, the fact that generative models are based on explicit probabilistic assumptions about features, labels and parameters makes them compatible to principled forms of model estimation and model selection such as maximum likelihood, Bayesian posterior inference, classical information criteria or Bayesian model comparison [ 11 , 12 , 18 , 26 , 27 ]. For example, in Bayesian inference, a prior distribution over model parameters p ( θ | m ) is specified which allows to infer the posterior probabilities of the model parameters, given the features p ( θ | Y, m ), or to calculate the marginal likelihood of the measured features, given the model p ( Y | m ). In the following, we show how the multivariate general linear model can be written as a generative model and made amenable to Bayesian analyses, where the results of these analyses can then be employed as a means to solve multivariate continuous prediction tasks in the style of predictive algorithms. 2.1.2 Multivariate general linear models Let Y ∈ ℝ n × v denote an observed random matrix of measured data comprising n observations of v features and let X ∈ ℝ n × p denote a known matrix comprising n associated observations of p predictor variables (also called “regressors”) that possibly influence the measured data. Then, we define a multivariate general linear model (MGLM) m as where B ∈ ℝ p × v denotes a matrix of unknown (regression) coefficients, E ∈ ℝ n × v denotes and unknown feature an unobserved error matrix, and ℳ 𝒩 (0 nv , V , Σ) denotes a matrix-normal distribution with zero mean, known observation covariance matrix covariance matrix . In applications to data sets comprising multiple features for multiple experimental units (e.g. participants in a study), V models the error covariation between different experimental units and Σ models the error covariation between of features within each experimental unit. In applications to spatiotemporal data sets, where observations correspond to different time points and features correspond to different locations in space, V may be referred to as a temporal covariance matrix and Σ may be referred to as a spatial covariance matrix [ 16 ]. Note that, in equation (3) , X and V are assumed to be known, whereas B and Σ are assumed to be unknown (see Table 2 ). This is compatible with the fact that observations, i.e. rows of Y , typically have some known autocorrelation structure or are assumed independent (hence, V is pre-specified), whereas features, i.e. columns of Y , typically have some unknown covariation which needs to be inferred (hence, Σ is estimated). The assumption of matrix-normally distributed errors also means that all observations have the same covariance across columns Σ and that all features have the same covariance across rows V . If observations are independent and identically distributed (i.i.d.), as is often assumed in ML applications, then V = I n and the model simplifies to (see Appendix A ) where y i ∈ ℝ 1× v , x i ∈ ℝ 1× p , ε i ∈ ℝ 1× v are row vectors and 𝒩 (0 v , Σ) denotes a multivariate normal distribution with zero mean and covariance matrix Σ ∈ ℝ n × n . Since E follows a matrix-normal distribution and XB is constant, the MGLM formulated in ( 3 ) is equivalent to the following generative model (see Appendix A ): With the matrix-normal probability density function 𝒩 ℳ: ℝ n × v → ℝ > 0 , equation ( 5 ) then implies the following likelihood function for m (see Appendix A ): For reasons of mathematical convenience, we write this in terms of the precision matrices and (see Table 2 ): Note that, among the conditioned variables, V (or equivalently, P ) and m are constant which is why, in the following sections, we will drop them for clarity of notation. In contrast to that, B and T are random entities described by some probability distribution. X is considered fixed when estimating B and T (see Section 2.3.1 ) and random when using a distribution over B and T to predict it (see Section 2.3.2 ). Taken together, we will therefore refer to the likelihood function of the MGLM as p ( Y | X, B, T ). 2.2 Bayesian inference for MGLMs 2.2.1 Bayesian parameter estimation In Bayesian inference, one usually specifies a prior density p ( θ | m ), i.e. a probability distribution for θ unconditional on Y , and the goal is to derive the posterior density p ( θ | Y, m ): Θ → ℝ ≥0 , i.e. the probability distribution for θ conditional on Y , given the model m [ 11 , 18 , 28 – 30 ]. Bayes’ theorem implies that this posterior density is proportional to the product of the likelihood function p ( Y θ, m ) and the prior density: Following ( 7 ), the MGLM model parameters are where is the set of all positive-definite symmetric matrices of size n × n . Thus, Bayesian MGLM estimation requires a prior distribution on B and T . In order to obtain a tractable posterior, we will use a conjugate prior, i.e. a prior distribution that, when combined with the likelihood function, leads to a posterior distribution that belongs to the same family of probability distributions. For the MGLM, this conjugate prior is a normal-Wishart distribution on the coefficient matrix B and the precision matrix T ([ 11 , ch. 3]; [ 31 , sect. 8 ]) where M 0 ∈ ℝ p × v is the prior mean, is the prior precision matrix, is the prior inverse scale matrix, ν 0 ∈ ℝ are the prior degrees of freedom and 𝒩 ℳ: ℝ p × v → ℝ > 0 and are the respective probability density functions. Because this distribution is a conjugate prior, the application of ( 8 ) results in a posterior distribution that is also a normal-Wishart distribution (see Appendix B ) where the posterior parameters M n , Λ n , Ω n and ν n turn out to be functions of the prior parameters, the feature variables Y and the predictor variables X (see Appendix B ): Note that, if observations are assumed to be i.i.d., we have and the matrix P simply disappears from those equations. This is the case for most applications of linear regression models and will also be assumed throughout most parts of this paper (but see Simulation 4 in Section 3 ). 2.2.2 Bayesian model comparison In addition to obtaining posterior distributions for the model parameters θ , Bayesian inference also allows to perform statistical inference on the generative model m itself. This is achieved by calculating the marginal likelihood p ( Y | m ): 𝒴 → ℝ ≥0 , i.e. the probability of the data Y , independent of particular parameter values θ , given only the model m [ 12 , 32 – 34 ]. The law of marginal probability implies that this marginal likelihood is an integral of the product of likelihood function and prior distribution: Application of ( 12 ) to the MGLM with yields and the marginal likelihood with ( 11 ) becomes (see Appendix C ) where Γ p : ℝ → ℝ is the multivariate Gamma function of order p ∈ ℕ. Once again, if observations are assumed i.i.d., such that rows of the data matrix Y are uncorrelated – as it is the case in most linear modelling contexts –, we have P = I n , such that the term | P | v in ( 14 ) would be replaced by 1. If two or more models are differing by the predictor variables X they utilize (giving rise to different likelihood functions p ( Y | X, θ, m )) or by the prior parameters ( M 0 , Λ 0 , Ω 0 , ν 0 ) they impose (giving rise to different prior distributions p ( θ | m )), this consitutes a case for model comparison. Consider a model space ℳ = { m 1 , …, m M } where M is the number of models. Then, posterior model probabilities can be calculated from marginal likelihoods ( 14 ) using Bayes’ theorem for i = 1, …, M where p ( m i ) are prior model probabilities, i.e. prior beliefs about the relative likelihoods of models m ∈ ℳ before seeing the data Y . 2.3 Multivariate Bayesian inversion 2.3.1 Estimation of parameters, given the labels In Section 2.2.1 , we have introduced how Bayesian inference can be used for MGLM parameter estimation. In this section, we explain how this method constitutes the first step of a multivariate Bayesian inversion analysis. The fact that, according to equation (7) , the probability of the measured data Y is conditional on both the model’s regressors X and the model parameters θ = ( B, T ), means that both of these quantities can be considered either estimated and known or unknown and to-be-predicted. In some situation, e.g. in a designed experiment, it could be that regressor values are known and model parameters have to be learned. In other contexts, e.g. in consumer choice prediction, it could be that model parameters are known and regressor values have to be inferred. Consider a data set ( Y, X, P ) ∈ ℝ n × v ℝ n × p ℝ n × n specified by data matrix Y , design matrix X and precision matrix P (see Section 2.1.2 ). In this case, X would be considered known and B and T have to be estimated following (8) : To do so, a prior density p ( B, T ) as in ( 9 ) has to be specified by fixing the prior parameters M 0 , Λ 0 , Ω 0 and ν 0 (see Section 2.2.1 ). If there is no prior knowledge, i.e. one does not have concrete prior beliefs about the model parameters B and T , one possibility is to use non-informative prior parameters [ 30 ] given by Using ( 10 ) and ( 11 ), the posterior distribution becomes where the posterior parameters after seeing the data are (see Appendix D ): Note that the prior parameters ( 17 ) are specified in such a way that the posterior parameters ( 19 ) are only influenced by the observed data Y and the known regressors X in this example (cf. eq. 11 ). Moreover, as we only estimate one T = Σ −1 , the feature covariance matrix Σ is assumed to be constant across observations, i.e. the same for all data points. For multivariate Bayesian classification (see Section 2.4.1 ), this means that MBI pools covariance across classes. For multivariate Bayesian regression (see Section 2.4.1 ), this means that the covariance is not dependent on the target variable. 2.3.2 Identification of labels, given the parameters In Section 2.2.2 , we have introduced how Bayesian inference can be used for MGLM model comparison. In this section, we explain how this method constitutes the second step of a multivariate Bayesian inversion analysis. Other than before, it could be that a distribution p ( B, T ) is available, for example (but not necessarily) because it has been estimated from independent data. In this case, the model parameters B and T can be assumed known – or at least partially known, or known with some uncertainty – when approaching new data. Consider a single data point ( y i , x i ) ∈ ℝ 1× v × ℝ 1× p specified by a new observation y i and the associated regressor values x i , where i is an index over previously unseen data. Let us assume that the goal is to identify the x i belonging to a single new y i . For this one observation, the multivariate model reads (cf. eq. 4 ): Now, B and T can be considered known in the sense that the distribution p ( B, T ) acts as the prior distribution for the new data point, such that the prior predictive distribution of the observed y i , given the unknown x i follows from ( 12 ). For example, assume that the distribution p ( B, T ) is given by the posterior distribution p ( B, T | Y ) in ( 18 ) specified by the parameters M 1 , Λ 1 , Ω 1 and ν 1 from ( 19 ). Then, assuming this as our prior knowledge and using ( 14 ), the marginal likelihood p ( y i | x i ) becomes where the posterior parameters after seeing the new data are (see Appendix D ): Crucially, different regressor values can be plugged in for x i in equation ( 22 ), enabling to treat the identification of labels as a model selection problem. More precisely, we are testing competing models for the observations y i by submitting different possibilities for the regressors x i , to see which of these possibilities maximizes the marginal likelihood p ( y i | x i ) and is thus the most likely value of x i (for a similar approach, see [ 20 ]). Given that we can specify some prior knowledge about the predictor variables x i , the marginal likelihoods from ( 21 ) can be used to obtain posterior probabilities for label values x i , given the measured data y i : Note that, other than in equation ( 16 ), X is not considered fixed anymore, but x i is now a random variable which has prior distribution p ( x i ) and can be estimated in the form of a posterior distribution p ( x i | y i ). Multivariate Bayesian inversion (MBI) is the alternating procedure in which first, MGLM parameters are estimated, given known labels, and in which second, labels are infered on, given known MGLM parameters. In Sections 2.4.1 and 2.4.2 , we will outline an application of this MBI framework for cross-validated predictive analyses which are based on splitting the data into training and test set in order to predict labels for left-out observations using either classification or regression. 2.4 Cross-validated MBI for prediction 2.4.1 Multivariate Bayesian classification Let be a vector of non-zero class indices (1, 2, 3, …) corresponding to the observations in the data matrix Y ∈ ℝ n × v (see Figure 1 ) and let and denote training and test set, respectively. The number of data points are n 1 and n 2 , such that n 1 + n 2 = n . Download figure Open in new tab Fig. 1. Multivariate Bayesian inversion for classification. (A) In the training set, a categorical design matrix X 1 is formed and the posterior distribution over regression coefficients B and inverse feature covariance T = Σ −1 is calculated from the measured data Y 1 . (B) In the test set, each category j is successively tested against each data point y 2 i as a possible model m ij , resulting in posterior class probabilities p ( x 2 i = j | y 2 i ). Multivariate Bayesian classification is a two-step procedure, separately estimating parameters (top) and inferring on classes (bottom). Note that between-observation covariance, denoted in the text via V and P , has been omitted (i.e. set to V = P = I n ) in this example for simplicity. First, we define a “regressor generator function” which sets the j -th column in the i -th row to 1, if the i -th observation belongs to the j -th class, where C = max( x ) is the number of classes (see Figure 1A ): The result is a “categorical design matrix” in which each column vector is a binary indicator, i.e. only taking values 0 or 1, representing one class. Since x 2 is unknown, i.e. we do not know the classes in the test set, we have to specify a set of candidate models. Let m ij be the model asserting that the single observation y 2 i in the test set belongs to the j -th class (see Figure 1B ): i.e. the i -th row of X 2 is a 1 × C vector of zeros with a one at the j -th position. Then, we can calculate the marginal likelihood of the test data point y 2 i based on ( 21 ): This allows to quantify posterior class probabilities for each observation based on ( 15 ): where p ( m ij ) is the prior probability that test set observation i belongs to class j . Note that the only terms in ( 26 ) depending on the candidate model m ij are such that the posterior class probabilities from ( 27 ) can be written as for all test data points i = 1, …, n 2 . If one were to predict the classes in the test set, picking the one which maximizes the posterior probability would be a reasonable choice: 2.4.2 Multivariate Bayesian regression Let x ∈ ℝ n ×1 be a vector of real-number target values ( x i ∈ ℝ, i = 1, …, n ) corresponding to the observations in the data matrix Y ∈ ℝ n × v (see Figure 2 ) and let and denote training and test set, respectively. The number of data points are n 1 and n 2 , such that n 1 + n 2 = n . Download figure Open in new tab Fig. 2. Multivariate Bayesian inversion for regression. (A) In the training set, a regression design matrix X 1 is formed and the posterior distribution over regression coefficients B and inverse feature covariance T = Σ −1 is calculated from the measured data Y 1 . (B) In the test set, the marginal likelihood for each data point y 2 i is a continuous function of the possible target values λ , resulting in a posterior target density p ( x 2 i = λ | y 2 i ). Target values corresponding to the three models shown are marked on top of the posterior density plot. Multivariate Bayesian regression is a two-step procedure, separately estimating parameters (top) and inferring on targets (bottom). Note that between-observation covariance, denoted in the text via V and P , has been omitted (i.e. set to V = P = I n ) in this example for simplicity. First, we define a “regressor generator function” X R : ℝ n ×1 → ℝ n ×2 which horizontally concatenates the target vector and a vector of ones (see Figure 2A ): This results in a “regression design matrix” in which the first column represents the target variable and the second column is a constant regressor. Since x 2 is unknown, i.e. we do not know the targets in the test set, we have to specify a set of candidate models. Let m i ( λ ) be the model asserting that the target value belonging to the single observation y 2 i is equal to λ (see Figure 2B ): i.e. the i -th row of X 2 is a 1× 2 vector with λ as first entry and one as second entry. Then, the marginal likelihood of the test data point y 2 i is a function of λ based on ( 21 ): This allows to derive a posterior target density over λ for each observation based on ( 15 ): where p ( λ ) and p ( ϑ ) is the prior probability density over the i -th test set label x 2 i . Note that the only terms in ( 33 ) depending on the candidate value λ are such that the posterior target density from ( 34 ) can be written as for all test data points i = 1, …, n 2 . If one was to predict the target value for each data point in the test set, reporting either the posterior expected value or the maximum-a-posteriori estimate would be reasonable choices: 2.5 Inclusion of covariates In Sections 2.4.1 and 2.4.2 , MBC and MBR were formulated without covariates, i.e. with only class indices x or target values x influencing the measured data Y . We will now write down the design matrices for MBI with inclusion of covariates, i.e. incorporating confound variables Z which are not themselves the target of prediction, but may have influences on the available features Y . Let Z ∈ ℝ n × c be a matrix of covariates corresponding to the observations in the data matrix Y ∈ ℝ n × v and let and denote covariate values from training and test set, respectively. Let ( Z 2 ) i be the 1 c vector of covariate values for the i -th observation in the test set. Then, the design matrix for multivariate Bayesian classification in the training set is and the model assuming the j -th class for the i -th observation in the test set is Similarly, the design matrix for multivariate Bayesian regression in the training set is and the model assuming target value λ for the i -th observation in the test set is In other words, different than class indices and target values which are known in the training data and considered unknown in the test data, covariates will be considered known in both parts of the data set. Given these modified design matrices X 1 and ( X 2 ) i , the process of calculating posterior probabilities is essentially the same, e.g. given by equation for MBC and ( 36 ) for MBR. 2.6 Cross-validation Multivariate Bayesian inversion, as outlined in Sections 2.4.1 and 2.4.2 , relies on cross-validation to predict labels based on features. Cross-validation (CV) successively splits a given data set into training and test sets and evaluates predictive performance on the test set, which is repeated until each data point has once been part of the test set [ 35 , 36 ]. Unless otherwise stated, the following simulations (see Section 3 ) and analyses (see Section 4 ) use k-fold cross-validation with k = 10. This means that the data set is split into 10 equally large subsets (or sets as equally large as possible, if the number of CV folds k does not divide the number of data points n 1 ) and in each CV fold, 9 subsets are used for training, i.e. estimation of the parameters (see Section 2.3.1 ), and the remaining subset is used for testing, i.e. identification of the labels (see Section 2.3.2 ). Splits into CV folds are non-random, but strictly sequential by index of data point. For multivariate Bayesian classification (Simulations 1-3, Analyses 1-3), we use 10-fold CV on points per class . This means that all classes are split into 10 equally large subsets separately, such that in each CV fold, the distribution of classes is the same in training and test set (or as close as possible, if the number of CV folds does not divide the number of data points per class). This ensures that class imbalances are preserved in all subsets and thus do not have a spurious influence on classification. For Analysis 2 (MNIST digit recognition; see Section 4.2 ) and Analysis 4 (brain age prediction; see Section 4.4 ), we did not use cross-validation, as the data sets were already partitioned in training and validation sets. In these cases, the posterior distribution over model parameters was estimated from the training set, the class labels (Analysis 2) or target values (Analysis 4) were predicted in the validation set and predictive performance measures are solely based on the validation set. 2.7 Comparison with other Bayesian methods There exist a number of Bayesian approaches to machine learning [ 12 ] some of which are closely related to the general statistical approach presented here. MBI differs from Bayesian discriminative methods such as Bayesian logistic regression [ 11 , ch. 16] and Bayesian linear regression [ 11 , ch. 14] by the fact that the latter typically model the predictive distribution of the labels given the features, i.e. the probability distribution p ( X | Y ) (cf. Section 2.1.1 ). MBI in turn is more closely related to Bayesian generative methods such as Gaussian naive Bayes classifiers [ 37 ] or Variational Gaussian mixture models [ 38 ] which describe the generative distribution of the features given the labels, i.e. the probability distribution p ( Y | X ), and later inverts this distribution to infer on labels given the features. MBC and MBR are multi-stage procedures , meaning that data are splitted into training and test sets, in order to first estimate parameters of the generative model then extract values of the label variable (see Figures 1 and 2 ). This is in contrast to expectation maximization (EM) [ 39 ] and variational Bayesian (VB) [ 40 ] techniques which treat labels as latent random variables and jointly estimate the posterior distribution over label variables and model parameters. The generative model underlying MBC is equivalent to the model behind (Bayesian) linear discriminant analysis (LDA) [ 41 ] when the design matrix only includes categorical regressors ( X = X c ( x )) and when all observations are regarded as independent ( V = I n ). In this case, all data points in the training set are assumed to follow multivariate normal distributions with class-specific means and class-independent covariance. By estimating B and T (cf. eq. 10 ), MBI implicitly estimates those class means and covariance matrix, such that new data points in the test set can be assigned to one class. This “LDA scenario” is exemplified in Simulation 3 (see Section 3.3 ) and Analyses 1/2 (see Sections 4.1 / 4.2 ). The novel contribution here is that MBI generalizes (Bayesian) LDA (i) by using the same generative model for classification and regression, (ii) by allowing for correlations between data points in this model, (iii) by systematically integrating covariates into predictive analysis and (iv) by incorporating prior knowledge about class frequencies. 3 Simulations In this section, we apply cross-validated multivariate Bayesian inversion to several synthetic data sets necessitating either classification (Simulations 1-3) or regression (Simulation 4). For each of these examples, we describe (i) the purpose behind the simulation study, (ii) the generative model underlying the data set, (iii) possible scientific interest in data sets of this form, (iv) steps of data analysis, we give a (v) summary of the results and provide (vi) a conclusion. When of interest (Simulations 2-4), we compare MBI’s performance with that of support vector machines (SVM), as a representative easy-to-implement and currently popular alternative approach [ 42 , 43 ]. 3.1 Simulation 1: two-class classification Purpose The purpose of this simulation is to show how between-class variance (quantified as the Euclidean distance between class means) as well as within-class variance (quantified as the standard deviation around class means) influence classification accuracy in multivariate Bayesian classification (MBC). Generative model We simulate two classes of equal size in two dimensions: Each observation y i ∈ ℝ 1×2 , i = 1, …, n is generated as where n = 250 (with 125 per class) and some correlation between the two dimensions is induced via the between-feature covariance matrix The terms x 1 i , x 2 i ∈ { 0, 1} are indicating the presence of the two classes and the class means are given by (see Figure 3 ) Download figure Open in new tab Fig. 3. Simulation 1: two-class classification. Simulated data ( n = 250) from two classes (blue, red) in two dimensions (x, y) when varying (A) the distance between the distributions (measured as Euclidean distance between the multivariate means) or (B) the spread of the distributions (measured as standard deviation of the noise). The color of each dot indicates the posterior probability assigned to this data point in multivariate Bayesian classification (see colorbar). Assigning classes by maximum posterior probability results in a classification accuracy for each of the scenarios (right panels). Taken together, this is equivalent to the model behind linear discriminant analysis (LDA) [ 41 ], with class-dependent multivariate means and class-independent between-feature covariance. MBC generalizes LDA to additional variables influencing the features (see Simulation 2), an arbitrary number of classes (see Simulation 3) and potential between-observation correlation (see Simulation 4). The parameters µ and σ are controlling (i) the distance between the centers of the two distributions and (ii) the amount of spreading (standard deviation) for the two distributions. In our simulations, they are specified as Scientific interest Note that µ/σ is proportional to the Mahalanobis distance between the two distributions [ 44 ] which is a measure for class separability: With large µ and small σ , observations from the two classes are very distinct, whereas with small µ and large σ , the two classes are practically overlapping (see Figure 3A/B ), which makes classification either a very easy or a very hard task. Thus, a scientific question related to this simulated data set – or a real data set of the same form – could be how well the two classes can be separated, given some difference between class means and some variation within classes. Data analysis MBC is performed according to the methodology described in Sections 2.3.1 , 2.3.2 and 2.4.1 based on the multivariate general linear model and using k -fold cross-validation (CV) with k = 10 CV folds. Results We observe that, trivially, (i) with increasing distance between the distributions, classification accuracy increases, and (ii) with increasing variance of the distributions, classification accuracy decreases (see Figure 3 ). Moreover, (iii) when the distance between distributions is zero, all points receive a posterior probability close to 1 / 2 (see Figure 3A ), and (ii) when variance of distributions is large, many close to the decision boundary points receive such an indecisive posterior probability (see Figure 3B ). Conclusion We have seen that classification accuracy in binary classification increases with distance of class means and decreases with variability of classes in multivariate space. Moreover, posterior class probabilities reflect the certainty of predictions made by MBC, with posterior probability becoming practically 1 / 2, if the two classes overlap. 3.2 Simulation 2: two classes with confound Purpose The purpose of this simulation is to show that including a confound (i.e. a covariate that systematically varies with class membership) into the classification model (MBC) results in higher classification accuracy than removing the covariate effect from the feature variables before classification (SVC). Generative model We again simulate two-dimensional data by drawing from two classes, but this time adding a continuous covariate to the outcome variables: More precisely, each observation y i ∈ ℝ 1×2 , i = 1, …, n is generated as where n = 250, the class means β 1 and β 2 are as before (see Section 3.1 ) with µ = 1, σ = 2 and Σ given in ( 43 ). The continuous variable z i ∈ [−1, +1] is sampled as where 𝒰 ( a, b ) is a uniform distribution with minimum a and maximum b . This sampling causes z i ∈ [−0.5, +1] for class 1 and z i ∈ [1, +0.5] for class 2 (see Figure 4B ), such that the covariate z is mildly correlated with class membership and tends to be higher for class 1. The effect of the covariate on the two features is specified as which means that the covariate has no effect on y i 1 and exclusively acts on the second feature y i 2 (see Figure 4B ). Download figure Open in new tab Fig. 4. Simulation 2: two classes with confound. (A) Two-dimensional data ( n = 250) is simulated as coming from two classes (class means indicated by black crosses) and (B) there is an additional effect of a confounding variable in one of the dimensions (covariate effect indicated by black arrow). (C) When performing MBC, classes can be separated with high accuracy (decision boundary shown as gray line). (D) Because this effect is in part based on the covariate, accounting for this confound reduces classification accuracy. (E) When performing SVC, a similar linear decision boundary also results in high accuracy. (F) Because accounting for the covariate is done via prior regression in SVC, the confounding nature of the covariate is not properly accounted for and classification accuracy decreases by much more. Also note that MBC as well as SVC with correction achieve classification beyond linear boundaries (see panels D and F), but for different reasons: MBC, because it includes an additional covariate into the model; SVC, because prior regression transforms the data into another space (for comparison purposes, data are plotted in the original space). Scientific interest When a data set contains a continuous variable that has an effect on the features, but also varies with class membership (such as z i ), accounting for the effect of this confounder is crucial, such that patterns observed in the features may not be mistaken as being due to class, but as (partly) due to the covariate. Thus, scientific questions related to this simulated data set – or a real data set of similar form – could be whether the two classes can be separated based on the features when accounting for the effect of the covariate associated with class membership, and whether separability differs depending on how the covariate is accounted for. Data analysis MBC is performed using the MGLM framework as before (see Section 3.1 ). In order to account for the correlated covariate (see Section 2.5 ), the multivariate linear model is extended to For comparison, SVC is performed by training a linear support vector machine with soft-margin parameter C = 1 on the data points from the two classes. In order to account for the confound in SVC, a linear regression model is fit to the features, removing the effect associated with z and keeping the residuals for classification. This is here referred to as the “prior regression” approach (see Figure 4F ) and is known to be problematic sometimes [ 45 , 46 ]. Both MBC and SVC are performed with 10-fold cross-validation. Results We observe that, the way the data are generated, values of the covariate are in fact correlated with the second feature and related to class membership (see Figure 4A/B ). Both MBC and SVC result in linear decision boundaries separating the data points into predicted classes (see Figure 4C/E ). Plausibly, because separation into classes is in part based on the covariate effect, accounting for this covariate results in a reduction of classification accuracy in MBC (see Figure 4D ). However, because the SVM approach removes the covariate effect separately from classification and thus fails to account for the correlation of class membership and covariate, the classification accuracy reduces by much more in SVC (see Figure 4F ). In addition to this prior regression approach, we also tested a second correction method for SVC. This method includes the covariate as a feature, instead of removing its effect from the other features (see Section 4.4 ). We find that this preserves the classification accuracy of SVC without correction (CA = 86.00%; results not shown). However, this only shows that both, features and covariate, are correlated with class membership and it does not show how informative the features are about class membership when correcting for the effect of the covariate. Conclusion We have seen that covariate inclusion captures a confound’s effect on the features while accounting for the correlation of covariate and class which leads to a more realistic estimation of class separability than removing the confound’s effect via prior regression before classification. 3.3 Simulation 3: three-class classification Purpose The purpose of this simulation is to show that (i) MBC, unlike SVC, facilitates posterior class probabilities resulting in gradual classification boundaries and that (ii) prior class probabilities can be used to bias decisions into direction of particular classes by incorporating prior information. Generative model We simulate three classes of equal size in two dimensions: Each observation y i ∈ ℝ 1×2 , i = 1, …, n is generated as where n = 300 (with 100 per class), σ = 2, Σ is as given in ( 43 ) and x 1 i , x 2 i , x 3 i ∈ {0, 1} where 1. The class means are specified as (see Figure 5A ) Download figure Open in new tab Fig. 5. Simulation 3: three-class classification. (A) Simulated data ( n = 300) from three classes (red, green, blue) in two dimensions (x, y). (B) Posterior class probabilities from MBC as a function of location in feature space are visualized as RGB triplets (e.g. [0.5, 0.0, 0.5] results in purple, which corresponds to equal posterior probability for classes 1 and 3; linear RGB converted to sRGB for visualization purposes). (C) Predicted classes from SVC are shown as a function of location in feature space. (D) , (E) , (F) Varying the prior class probabilities for MBC results in higher posterior probabilities for the class which is more likely a priori. Since classes were equally likely a priori, classification accuracy (CA) is highest when a uniform prior distribution is used (see panel B). Scientific interest Multi-class, or n-ary classification occurs frequently in empirical research problems such as optical character recognition, object classification or language detection. Thus, scientific questions related to this simulated data set – or a real data set of the same form – could be to what extent classes differ in multivariate space and whether some pairs of classes can be better distinguished than other pairs. Data analysis MBC is performed using the MGLM framework as before (see Section 3.1 ) based on the multivariate general linear model For exploratory reasons, we also perform MBC under manipulation of the prior class probabilities, i.e. by letting p ( m ij ) in ( 29 ) to be equal across classes, i.e. p ( m ij ) = 1 / 3 for all classes (see Figure 5B ) or by setting p ( m ij ) = 2 / 3 for one class and p ( m ij ) = 1 / 6 for the other two classes (see Figure 5D/E/F ). For comparison, SVC is performed by training a linear support vector machine with soft-margin parameter C = 1 on the data points from the three classes. Both MBC and SVC are performed with 10-fold cross-validation. Results We observe that, when classification was performed by maximizing posterior probability, MBC and SVC would give rise to nearly identical decision boundaries (see Figure 5B/C ). However, close to the decision boundaries, MBC provides graded information in the form of posterior probabilities rather than discrete predictions. Moreover, when incorporating prior knowledge by e.g. increasing the prior probability for one of the classes, the decision boundaries move accordingly, allocating more posterior probability to the respective class (see Figure 5D/E/F ). Conclusion We have seen that MBC provides probabilistic information in multi-class decision problems and that it allows to incorporate prior information about class frequency, according to possible knowledge about the prevalence of categories. 3.4 Simulation 4: continuous regression Purpose The purpose of this simulation is to show that MBR, in the presence of between-observation covariance, achieves predictive correlation comparable with SVR and results in predicted targets whose distribution is more similar to the training data than with SVR. Generative model We simulate multi-dimensional data with continuous effects: Each observation y i ∈ ℝ 1× v , i = 1, …, n is generated as where x i ∈ [−1, +1] is a continuous regressor sampled from a uniform distribution, β 1 and β 0 are 1 × v row vectors with slopes and intercepts for the features y i and Σ is a v × v matrix inducing between-feature covariance 2 : For demonstration purposes, we here also modulate the covariance between data points by applying an n × n matrix V inducing between-observation covariance: such that the simulation model for the entire data set is given by: In our simulations, we set n = 200, v = 10, σ = 1, ν = 0.5, τ = 0.25 and all entries of β 0 and β 1 were sampled from the standard normal distribution 𝒩 (0, 1). Scientific interest If the rows of Y correspond to units of time (i.e. time series measurements), τ may also be called “time constant” governing the “temporal covariance” V . If the columns of Y correspond to units of space (e.g. measurement locations), then ν may also be called “space constant” governing the “spatial covariance” Σ [ 16 ]. In this situation, a scientific question could be whether and to what extent the continuous quantity x i can be inferred based multivariate data y i , accounting for (known) temporal covariance and (estimated) spatial covariance. Moreover, values of x i may fall into a range that may not necessarily preserved by a particular algorithm. Data analysis MBR is performed according to the methodology described in Sections 2.3.1 , 2.3.2 and 2.4.2 based on the multivariate general linear model given by ( 57 ), with the aim of reconstructing the target values x i , i = 1, …, n . As the prior density p ( x i ) = p ( λ ) in ( 36 ), we use a continuous uniform distribution, i.e. p ( λ ) = 1 / 2 where −1 < λ < +1. Resulting posterior densities are calculated via discretization of the interval [−1, +1] in steps of 0.01 (see Figure 6D ). Download figure Open in new tab Fig. 6. Simulation 4: continuous prediction. (A) A continuous predictor variable x (the histogram of which is shown) is used to sample a 200 × 10 data matrix Y (for details, see text). (B) Maximum-a-posteriori (MAP) estimates from MBR and (C) support vector machine (SVM) predictions from SVR are plotted against the actual labels. (D) Exemplary posterior target densities for the first four simulated data points, along with MAP estimate (blue dots) and true values (black crosses). (E) MAP estimates and (F) SVM predictions for the reconstructed targets are plotted as histograms. For comparison, SVR is performed by training a linear support vector machine with soft-margin parameter C = 1 on the data matrix Y and the target vector x . Both, MBC and SVC are performed with 10-fold cross-validation. Results We observe that, trivially, maximum-a-posteriori (MAP) estimates from MBR fall into [−1, +1], because the prior probability is zero outside this interval. Notably, the resulting distribution of predicted target values is closer to the distribution of actual target values (see Figure 6A ) for MAP estimates from MBR (see Figure 6E ), when compared with SVM predictions (see Figure 6F ). MBR and SVR are otherwise comparable in terms of predictive correlation and mean absolute error (see Figure 6B/C ). In addition to this, it appears that posterior target densities p ( λ | y 2 i ) from MBR are univariate normal distributions when uniform prior densities are applied (see Figure 6D ). Conclusion We have seen that MBR is able to reconstruct continuous labels from a multi-dimensional feature space and that it can effectively constrain the reconstructed variable into its natural range by a prior distribution restricted to that range. 3.5 Comparison with alternative approaches Purpose In addition to comparison with SVMs (see Figures 4 - 6 ), we also compare MBI’s simulation performance against that of other ML approaches for selected simulations. This includes generative and discriminative as well as Bayesian and non-Bayesian methods for classification and regression problems. Data analysis We re-analyze data from Simulation 3 using the following techniques: naive Bayes classification (GNB; multivariate normal likelihood, uniform prior); linear discriminant analysis (LDA; linear discriminant function); multiclass logistic regression (LogReg; regularization parameter λ = 1); random forrest classification (RFC; bootstrap aggregation) neural network classification (NNC; two hidden layers with ten neurons each, sigmoid activation function) Moreover, we re-analyze data from Simulation 4 using the following techniques: naive Bayes regression (GNB; multivariate normal likelihood with linear effect of label variable on feature variables, uniform prior density over label variable); multiple linear regression (LinReg; weighted least squares estimation); random forrest regression (RFR; boot-strap aggregation) neural network regression (NNR; two hidden layers with ten neurons each, linear activation function) We distinguish between generative approaches, i.e. methods modelling feature variables as a function of the label variable (e.g. LDA, and including MBC), and discriminative approaches, i.e. methods modelling the label variable as a function of the feature variables (e.g. LogReg, and including SVC). In order to systematically compare diverse approaches, predictive performance of all techniques is measured as rank graduation accuracy (RGA) [ 47 ] which always falls between 0 and 1, with a value of 0.5 corresponding to chance-level performance. RGA is equivalent to area under the curve (AUC) in binary classification, but generalizes to n-ary classification and regression problems [ 48 ]. Results Because MBC and LDA are equivalent for multi-class classification with i.i.d. observations and without covariates (see Section 2.7 ), both show identical performance for Simulation 3 (RGA = 0.8591). As expected, they perform better than GNB which disregards covariances between features (RGA = 0.8448). The highest performance is achieved by SVC (RGA = 0.8611), all other discriminative approaches have performance lower than that of MBC (see Table 3 ). View this table: View inline View popup Download powerpoint Table 3. Simulations: comparisons with alternative approaches. Predictive performance for generative and discriminantive techniques when analyzing data from Simulation 3 and Simulation 4. For details regarding MBC and SVC, see Figure 5 , and for details regarding MBR and SVR, see Figure 6 . In all cases, predictive performance is given as rank graduation accuracy (RGA) [ 48 ]. In Simulation 4, differences between regression techniques are negligible, with MBR (RGA = 0.9327) and NNR (RGA = 0.9318) performing best among generative and discriminative approaches, respectively; GNB (RGA = 0.9218) and RFR (RGA = 0.9143) performing worst among generative and discriminative approaches, respectively. All other discriminative approaches have predictive performance between that of GNB and NNR (see Table 3 ). Conclusion MBI-based predictive analysis for classification and regression reaches predictive performance comparable with that of established (generative or discriminative, Bayesian or non-Bayesian) ML techniques in simulation studies. 4 Applications In this section, we apply cross-validated multivariate Bayesian inversion to several empirical data sets necessitating either classification (Analyses 1-3) or regression (Analysis 4). Again, we describe (i) the purpose behind each empirical example, (ii) specificities, dimension (and possibly, acquisition) of the respective data set, (iii) possible scientific interest in this data set, (iv) steps of data analysis, we give a (v) summary of the results and provide (vi) a conclusion. For all analyses, we compare MBI’s performance with that of support vector machines (SVM), as a popular ML approach that is often used for multivariate continuous data [ 42 , 43 ]. 4.1 Analysis 1: Egyptian skull data Purpose The purpose of this analysis is to show that MBC achieves (above-chance) classification accuracy comparable with SVC in different binary and multi-class classification scenarios for a single data set without covariates. Data set The Egyptian skull data set ([ 49 ]; [ 26 , p. 287]) comprises 4 skull measurements (maximal breadth of the skull, MB; basibregmatic height, BH; basialveolar length, BL; nasal height, NH) from 5 × 30 human skulls coming from 5 time periods (4000 BCE, 3300 BCE, 1850 BCE, 200 BCE, 150 CE). Thus, we have v = 4 features from n = 150 observations belonging to C = 5 classes. The entire data set is visualized in Figure 7A . Download figure Open in new tab Fig. 7. Analysis 1: Egyptian skull data. (A) Maximal breadth of the skull (MB), basibregmatic height (BH), basialveolar length (BL) and nasal height (NH) are shown for 30 skulls each from around 4000 BCE (red), 3300 BCE (green), 1850 BCE (blue), 200 BCE (cyan) and 150 CE (magenta). Note that all measurements are given in integer millimeters (mm), causing an impression of discreteness in the plots. (B) 5-class, 3-class and binary classification were exercised with the goal to detect the historical era based on the four skull measurements. Confusion matrices show the proportion of predicted classes (y-axis), given the true class (x-axis). (C) Classification accuracy, i.e. the proportion of correct predictions across all observations, is shown as a function of classification algorithm and number of classes. Classification accuracy decreases with increasing number of classes, but MBC and SVC achieve comparable performance. Scientific interest Scientific questions related to this data set could be: To what extent are skulls from various eras differing in terms of MB, BH, BL, NH? Are skulls differing more strongly, if they are from temporally more distant eras? Given a random skull, with what accuracy can it be assigned to a time period? Data analysis Analyses are performed for classification into five groups (all time periods) or distinction into three categories (−4000 vs. −1850 vs. 150) or binary classification (− 4000 vs. 150). MBC is performed as described in Section 2.4.1 , setting V = I n for no covariance between observations. SVC is performed, setting the soft-margin parameter C = 1. Both MBC and SVC are performed with 10-fold cross-validation. Results We observe that, as could be expected, classification accuracy increases with decreasing number of classes (see Figure 7C ). However, MBC as well as SVC both exceed chance level in each classification task (see Figure 7C ). Confusion matrices suggest that the most extreme time periods (−4000, 150) were predicted with highest accuracy (see Figure 7B ) and that the 3rd era (−1850) can be better recognized than the 2nd and the 4th era (−3300, −200), possibly because the latter were closer in time to the 1st and the 5th era (−4000, 150) than to the middle-most time period (see Figure 7B ). Conclusion We have seen that a comparably large number of classes ( 5 ) can be separated based on a comparably small number of samples per class ( 30 ) using a comparably low number of features ( 4 ). Moreover, confusion matrices show that misclassifications become less frequent with increasing temporal distance between true and predicted class which highlights empirical plausibility of the results. 4.2 Analysis 2: MNIST digit recognition Purpose The purpose of this analysis is to show that MBC (and SVC) accuracy is increasing with growing number of training samples and that MBC-based posterior probabilities approximate true class frequencies of predicted test data points. Data set The MNIST database [ 50 , 51 ] consists of images of handwritten digits 0 to 9. Original images from the US National Institute of Standards and Technology (NIST) were size-normalized into a 20 ×20 pixel box, converting binary black-and-white images into gray-scale levels in the interval [0, 255], and placed into a 28 ×28 pixel image by computing the center of mass of the pixels. The data set contains n 1 = 60,000 training samples, n 2 = 10,000 test samples, the number of classes is C = |{0, …, 9}| = 10 and the number of features is v = 28 × 28 = 784. Scientific interest Scientific questions related to this data set could be: With what accuracy can handwritten digits be recognized from pixelated images? Which pairs of digits are more often confused with each other than other pairs? Given a digit image Y has been assigned a probability p of being digit X , what is the frequency of this assignment being correct? Data analysis During preprocessing, 4 pixels were removed on either side of each image, extracting only the center 20 ×20 pixels, resulting in a reduced number of features v = 20 ×20 = 400. Moreover, all pixel intensity values were rescaled into the interval [0, 1]. Each preprossed image matrix was vectorized into a row vector y i ∈ ℝ 1×20·20 , giving an n 1 × v training data matrix Y 1 and an n 2 × v test data matrix Y 2 . MBC is performed as described in Section 2.4.1 , setting V = I n . SVC is performed, setting the soft-margin parameter C = 1. Both MBC and SVC are performed for classification into ten categories (digits 0-9) and with 10-fold cross-validation. Results We observe that classification accuracy on the test set depends on the number of training samples used for estimating model parameters and that SVC reaches higher classification accuracy than MBC across all numbers of training samples, with both clearly exceeding chance level in the digit classification task (see Figure 8A ). This is also evident from confusion matrices which show higher misclassification rates for MBC than for SVC, as the former especially confuses 3 with 5 and 4 with 9 (see Figure 8B/C ). Download figure Open in new tab Fig. 8. Analysis 2: MNIST digit recognition. (A) Classification accuracy in the test set ( n 2 = 10,000) is shown as a function of the number of training samples (ranging from 600 to 60k) used to estimate parameters for multivariate Bayesian classification (MBC; blue) and support vector classification (SVC; red). Classification analyses were performed to distinguish digits 0-9. For the full training set ( n 1 = 60,000), confusion matrices show the proportion of predicted classes (y-axis) given the true class (x-axis) for (B) MBC and (C) SVC. Misclassification rates are higher for MBC. (D) MBC assigns posterior class probabilities to test set samples and the optimal choice is given by predicting the class with the highest posterior probability. We plot the frequency of the most likely class being the actual class against binned highest posterior probability. Blue dots represent estimated frequencies and error bars denote 95% binomial confidence intervals. (E) MBC’s posterior inverse scale matrix Ω 1 (after training), indicating covariance between image pixels. Nearby pixels shower higher covariance than pixels which are far apart. (F) MBC’s posterior precision matrix Λ 1 (after training), indicating covariance between classes. Because classes are mutually exclusive, no between-class covariance exists. Download figure Open in new tab Fig. 9. Analysis 2: exemplary test set cases. For each class of digits, three samples from the test set are shown with the highest posterior class probability assigned to these samples (“PP(‘5’) = 0.41” means that this test sample is most likely digit 5, with a posterior probability of around 41%). The first row shows correct classifications with the highest posterior class probability for each digit (“high-confidence hits”). The second row shows correct classifications with the lowest posterior class probability for each digit (“low-confidence hits”). The third row shows one random incorrect classification in which the class with the highest posterior probability differs from the true class (“misses”). Next, we evaluated how accurately the posterior class probabilities calculated for test samples reflect the actual class of those samples. Each test sample is assigned ten posterior class probabilities (for digits 0-9) and can then be predicted as belonging to the class for which posterior class probability is highest. Therefore, we binned test data items by highest posterior probability (25 bins, bin width 0.04) and calculated, for each bin, the frequency that the class with the highest posterior probability was also the true class. There is a linear-increasing relationship between posterior probability of the most likely class and the frequency of the most likely class being the true class (see Figure 8D ), indicating that posterior probabilities are somewhat veridical. However, the relationship is slightly below the identity line (see Figure 8D ), suggesting that MBC posterior probabilities based on the MNIST training data are a bit overconfident. For illustration, we also show the posterior inverse scale matrix Ω 1 (cf. eq. 19c), embodying the covariance between features (see Figure 8E ), and the posterior precision matrix Λ 1 (cf. eq. 19b), embodying the covariance between classes (see Figure 8F ), according to the training data. As nearby image pixels tend to be correlated (digits are contiguous, so neighboring pixels have similar intensity values) and all image matrices were vectorized before training, Ω 1 is characterized by diagonal lines which become weaker, the higher the distance from the main diagonal (and thus, the more two pixels are apart from each other in the image). As classes are mutually exclusive (no data point belongs to two or more digits at the same time), Λ 1 is a diagonal matrix with all off-diagonal entries (representing between-class covariances) being equal to zero. Finally, we present exemplary test set cases which were correctly classified with either highest posterior probability (see Figure 12 , top row) or lowest posterior probability (see Figure 12 , middle row) or which were misclassified, with posterior probability being maximal for the wrong digit (see Figure 12 , bottom row). These examples anecdotally show that images are more likely to be correctly classified, if they are drawn in bold, rather than thin lines (see Figure 12 , top and middle row), and that they are more likely to be misclassified, if they are similar to other digits (see Figure 12 , bottom row). Conclusion We have seen that classification accuracy improves with increasing size of the training set, as the posterior distribution over MGLM model parameters (see Section 2.2.1 ) is more accurately estimated when more training data are available. Moreover, posterior probabilities derived from MBC are veridical with respect to the true class in the sense that the they approximately reflect the frequency with which a test set sample is the class to which the highest posterior probability was assigned. 4.3 Analysis 3: birth weight data Purpose The purpose of this analysis is to show that MBC achieves higher classification accuracy than SVC and less “classification artefacts” (i.e. (near-)exclusive prediction of one class) in situations where one categorical variable (here: smoking) is predicted while other categorical variables (here: ethnicity) and parametric variables (here: age) influence the feature variables. Data set The low birth weight data set ([ 52 ]; [ 26 , p. 288]) consists of a number of measurements from n = 189 United States mother-baby pairs which was collected to investigate potential causes of low birth weight. Collected variables include the birth weight (in kg), the mother’s weight (in kg), smoking status (non-smoker, smoker), ethnicity (white, black, other), hypertension (no, yes), the mother’s age (in yrs) and the number of visits to the doctor before birth. The entire data set is visualized in Figure 10A . Download figure Open in new tab Fig. 10. Analysis 3: birth weight data. (A) Birth weight [kg] and mother’s weight [kg] as a function of smoking status (top), ethnicity (middle), hypertension (bottom) and as a function of mother’s age [yrs] (cyan) and visits to the doctor (magenta). (B) Classification analyses were executed with the goal to distinguish the discrete variables based on the two features, accounting for covariates. Confusion matrices show the proportion of predicted classes (y-axis), given the true class (x-axis). (C) Balanced accuracy, i.e. the average over class-wise accuracies (diagonal entries in confusion matrices), is shown as a function of classification algorithm and classified variable. Whereas SVC remains at chance level, MBC exhibits above-chance prediction accuracy. Scientific interest Presumably, the original purpose of acquiring the data set was to identify causes of low birth weight, i.e. to predict birth weight from the other variables. Turning this question around, one can also ask whether the the effects of lifestyle, demographics and health on body weight are so strong that these categorical variables can be detected from the parametric ones. In our analysis, the goal therefore is to predict the 3 discrete variables (smoking, ethnicity, hypertension) from the 2 continuous features (birth weight, mother’s weight). Challenges in this prediction task are (i) strongly unequal class sizes (see Figure 10A ), (ii) controlling for the dimensions not currently predicted and (iii) confounding variables (mother’s age, visits to the doctor). Data analysis Analyses are performed for classification into two categories (smoking, hypertension) or three categories (ethnicity). MBC is performed as described in Section 2.4.1 , setting V = I n as data points were regarded independent. SVC is performed, setting the soft-margin parameter C = 1. Both MBC and SVC are performed with 10-fold cross-validation, as described in Section 2.6 . In order to account for potentially confounding variables, a design matrix Z was generated by forming difference regressors (+1 vs. − 1) for the categories not currently classified (e.g. hypertension when predicting smoking) and including continuous covariates (e.g. mother’s age) as further regressors. For MBC, this covariate design matrix Z was added to the categorical design matrix X C used for classification itself, as described in Section 2.5 . For SVC, a linear regression model is fit to the features, removing the effect associated with Z and keeping the residuals for classification analyses. To assess classification performance, we used the balanced accuracy (BA), i.e. the average of class-wise accuracies, rather than the classification accuracy (CA), as CA can lead to misleading conclusions in the case of unequal group sizes which are however compensated by the BA [ 53 , 54 ]. Results We observe that SVC with prior regression fails to accomodate the unequal group sizes in all three classification tasks (smoking, ethnicity, hypertension), frequently predicting the largest class, independent of the true class (see Figure 10B ). Note that, when samples were weighted by class size in the respective training set, predicted class was more correlated with true class for all three label variables, producing above-chance accuracies (results not shown). We here report unweighted results for SVM analyses, as no classification approach was informed with group sizes in training or test set analysis. In contrast to SVC, MBC with covariate inclusion can seemingly adjust for the unequal numbers of samples, resulting in acceptable class-wise accuracies throughout (see Figure 10B ). As a consequence, balanced accuracy based on SVC is at chance level whereas MBC achieves above-chance prediction accuracy (see Figure 10C ). We hypothesize that MBC with covariate inclusion manages to account for interactions between discrete variables and for the effects of covariates (see Figure 10A ), thereby more faithfully estimating the true pattern underlying the categories currently classified. We also ran another SVC analysis that includes the covariates as features, instead of removing their effects from the other features (see Section 4.4 ). We find that this gives above-chance MBC-like balanced accuracies for smoking (BA = 65.29%) and ethnicity (BA = 48.12%), but chance-level performance for hypertension (BA = 50.00%; results not shown). This however only demonstrates that features and covariates together are predictive of smoking and ethnicity, but it makes no statement about how informative the features are about those variables when correcting for the covariates’ effects. Conclusion We have seen that MBC outperforms SVC in a relatively small data set with few features, potentially confounding continuous variables (here: mother’s age, visits to the doctor), potentially interacting discrete variables (here: smoking, ethnicity, hypertension) and highly unequal class sizes. 4.4 Analysis 4: brain age prediction Purpose The purpose of this analysis is to show that MBR achieves higher predictive correlation than SVR when variables covarying with the features are included as covariates influencing the features (MBR), rather than as features predicting the labels (SVR). Data set In brain age prediction, the goal is to estimate an individual person’s chronological age (in years) from structural magnetic resonance imaging (MRI) scans of this person [ 55 , 56 ]. For the present analysis, we are working with dimensionality-reduced structural MRI information from n = 3,300 subjects that was obtained by averaging within the regions from an anatomical atlas of the human brain and already used in an earlier study [ 57 ]. 3 As the data were analyzed within a predictive analytics competition 4 , the analysis was not fully cross-validated, but the the data set was split into n 1 = 2,640 training samples (available during the competition) and n 2 = 660 validation samples (target values not disclosed, used to decide the competition) with chronological age between 17 and 89 years (see Figure 11 , left column). For each of these subjects, gray matter (GM) and white matter (WM) densities for 116 brain regions were extracted. Additionally, each subject’s gender (coded as + /− 1) and the site at which they were acquired (1 out of 17) were known [ 57 , 58 ]. Download figure Open in new tab Fig. 11. Analysis 4: brain age prediction. Chronological age in training and validation set (left, gray) was predicted from structural MRI features (not shown) using MBR with site and gender as covariates (middle, blue), using different prior densities on age (bottom row), and using SVR with site and gender as features (right, red), resulting in higher predictive correlation and lower absolute errors for MBR (top row) and distributions of predicted values that are closer to the actual distribution (middle row). Download figure Open in new tab Fig. 12. Analysis 4: posterior distributions. Exemplary posterior distributions over chronological age with maximum-a-posteriori estimates (blue dots) and true values (black crosses) for the first five subjects in the validation set (see Figure 11 , left column) when using multivariate Bayesian regression with uniform prior (top row), data-driven prior (middle row) or fitted prior (bottom row) over chronological age (see Figure 11 , bottom row). For explanations, see text. Scientific interest Scientific questions related to this data set could be: Which are the brain features that are most strongly varying with chronological age? With what precision can chronological age be detected based on brain features? How can the non-uniform distribution of target values in the training set be handled? Data analysis SVR was performed by calibrating a linear support vector machine with soft-margin parameter C = 1 on the training data, using chronological age as the target variable x and the 2 ×116 + 17 + 1 = 250 variables (= GM/WM × regions + sites + gender) as feature matrix Y , and then predicting chronological age in the validation data (see [ 57 , sec. 2.2 , 2.3.2 ] for details). Adding site and gender to the feature matrix, although different from the “prior regression” approach used above (see Simulation 2 and Analysis 3), is compatible with common practice in biomedical research in which covariates are added to the feature matrix when predicting class labels [ 59 ] or target values [ 60 ]. By contrast, in a statistical modelling context, it is not really useful to employ covariates such as site and gender as features for prediction of age, since gender was putatively controlled when splitting the data and the whole age range was putatively sampled at each site [ 61 ]. And even if this wasn’t the case, prediction of a subject’s chronological age from gender or site is not really something that would be demanded. Instead, structural MRI features could be varying as a function of gender or site, such that these must be included as influential covariates, not additional features. MBR was therefore performed using the MGLM framework outlined above (see Sections 2.4.2 and 2.5 ) based on the multivariate general linear model where Y is a n 1 × (2 × 116) matrix of WM and GM densities, x is an n 1 × 1 vector of chronological age, 1 n is a constant regressor of the same size and Z is a n 1 ×17 covariate design matrix including indicator regressors for site and gender effects, such that B is a (2+17) × (2 ×116) matrix of regression coefficients, describing the influences of x and Z on Y . In this way, effects of site and gender on structural MRI measures are accounted for when reconstructing chronological age from these quantities. For MBR, we applied three different prior target densities (see Figure 11 , bottom row): (i) a uniform prior, with constant prior density for ages between 0 and 100 years; (ii) a data-driven prior, the probability for each integer age being equal to its frequency of occurence in the training set; and (iii) a fitted prior, from fitting a shifted gamma distribution to the target values in the training set (see Figure 11 , top left). Like in Simulation 4, posterior densities we obtained via discretization of the interval [0, 100] in steps of 1. Results We observe that (i) all MBR variants achieve a higher predictive correlation (uniform: r = 0.91; data-driven: r = 0.91; fitted: r = 0.92) than SVR ( r = 0.83) when predicting chronological age (see Figure 11 , top row); (ii) this is underscored by the fact that the distribution of MBR predictions is closer to the actual distribution of targets than the distribution of SVR predictions (see Figure 11 , middle row); and (iii) using the fitted prior density for age results in the smallest mean absolute error (MAE = 4.72 yrs), outperforming the uniform prior (MAE = 5.77 yrs) by about a year and SVR (MAE = 6.99 yrs) by more than two years (see Figure 11 , top row). We also present exemplary posterior target densities using the three different priors (see Figure 12 ) which anecdotally show how the data-driven and the fitted prior can occasionally draw the MAP estimates closer to the true value (see Figure 12 , right column). Replicating Simulation 4, we also repeat our observation that posterior target densities from MBR seem to approximate univariate normal distributions when uniform prior densities are applied (see Figure 12 , first row). Conclusion We have seen that MBR outperforms SVR by including confounding variables as covariates influencing the features, rather than as features allowing to predict the targets. Additionally, different prior distributions can be incorporated, with an ecologically plausible prior distribution resulting in the smallest mean absolute error. 4.5 Comparison with alternative approaches Purpose In addition to comparison with SVMs (see Figures 7 - 8 , 10 - 11 ), we also compare MBI’s empirical performance against that of other ML approaches for selected analyses. This includes generative and discriminative as well as Bayesian and non-Bayesian methods for classification and regression problems. Data analysis We analyze data from Analysis 2 using the same techniques and settings described in Section 3.5 : naive Bayes classification (GNB); linear discriminant analysis (LDA); multiclass logistic regression (LogReg); random forrest classification (RFC); neural network classification (NNC; one hidden layer with 400 neurons). Moreover, we re-analyze data from Analysis 4 using the same techniques and settings described in Section 3.5 : naive Bayes regression (GNB); multiple linear regression (LinReg); random forrest regression (RFR); neural network regression (NNR; one hidden layers with 250 neurons). As before, we distinguish between generative approaches (e.g. GNB, and including MBC) and discriminative approaches (e.g. RFC, and including SVC) and measure predictive performance using the rank graduation accuracy (RGA) [ 47 ] which always falls between 0 and 1, with a value of 0.5 corresponding to chance-level performance. Results Because MBC and LDA are equivalent for multi-class classification with i.i.d. observations and without covariates (see Section 2.7 ), both show identical performance for Analysis 2 (RGA = 0.9396). As expected, they perform better than GNB which disregards covariances between features (RGA = 0.8934). The highest performance is achieved by NNC (RGA = 0.9908) and all discriminative approaches except for LogReg have performance higher than that of MBC (see Table 4 ). View this table: View inline View popup Download powerpoint Table 4. Analyses: comparisons with alternative approaches. Predictive performance for generative and discriminantive techniques when analyzing data from Analysis 2 and Analysis 4. For details regarding MBC and SVC, see Figure 8 , and for details regarding MBR and SVR, see Figure 11 . In all cases, predictive performance is given as rank graduation accuracy (RGA) [ 48 ]. In Analysis 4, MBR (RGA = 0.9510) and LinReg (RGA = 0.9546) perform best among generative and discriminative approaches, respectively. Except for GNB, which fails to account for covariances between features (RGA = 0.8123), differences between regression techniques are negligible, all performing slightly below the level of MBR and LinReg (see Table 4 ). Conclusion MBI-based predictive analysis for classification and regression reaches predictive performance comparable with that of established (generative or discriminative, Bayesian or non-Bayesian) ML techniques in empirical data sets. 5 Discussion In this section, we discuss some aspects pertaining to multivariate Bayesian inversion (MBI). We categorize these aspects into “advantages”, i.e. benefits of MBI over other approaches, “disadvantages”, i.e. shortcomings relative to other approaches, and “extensions”, i.e. possibilities for increasing the applicability of MBI. 5.1 Advantages 5.1.1 Explainability and robustness → Examples: Simulations 1-4, Analyses 1-4 The proposed MBI framework aligns well with the principles of the SAFE machine learning approach [ 62 ]. In contrast to discriminative algorithms, where model weights typically lack a direct substantive interpretation, MBI is based on an explicit generative model in which parameters quantify the effect of label variables on observed features (see Section 2.1 ). This means that the regression coefficients can be interpreted in a straightforward manner as conditional dependencies of features on labels, thereby providing a transparent link between model structure and domain-relevant mechanisms. This property enhances interpretability not only at the level of individual parameters, but also at the level of the full model, facilitating scientific insight and supporting informed decision-making. With respect to accuracy , MBI demonstrates competitive performance relative to widely used machine learning approaches. Empirical evaluations show that MBI either outperforms or achieves comparable results to established methods, particularly support vector machines, across multiple real-world data sets (see Sections 4.1 - 4.4 ). This indicates that the generative modeling strategy underlying MBI does not come at the cost of predictive performance. Instead, it combines strong accuracy with the additional benefit of probabilistic outputs, suggesting that high predictive quality and principled statistical modeling are not mutually exclusive, but can be jointly realized within a unified Bayesian framework. Finally, MBI exhibits favorable properties in terms of sustainability , understood here as robustness and reliability of inference. By basing predictions on full posterior probabilities rather than point estimates, MBI enables balanced decision-making that naturally accounts for uncertainty. This probabilistic treatment reduces the risk of overconfident or unstable predictions and supports more robust conclusions, particularly in settings with limited or noisy data. The robustness of this approach is further corroborated by comparisons based on the robustness-focused RGA measure [ 48 ] in which MBI shows consistent and stable performance compared to alternative methods (see Sections 3.5 and 4.5 ). Together, these characteristics highlight MBI as a sustainable machine learning approach in the SAFE sense, combining robustness with principled uncertainty quantification. 5.1.2 Classification and regression → Examples: Simulations 1-4, Analyses 1-4 Unlike other approaches that are solely designed for discrete prediction, e.g. logistic regression ([ 12 , sec. 4.3 ]), or continuous prediction, e.g. multiple linear regression ([ 12 , sec. 3.1 ]), the MBI framework can be used for classification into discrete categories as well as for regression of continuous variables (see Figure 13 ); it has that in common with convolutional deep neural networks [ 7 , 8 ], support vector machines [ 63 , 64 ] or k -nearest neighbor algorithms [ 65 , 66 ]. Download figure Open in new tab Fig. 13. Different kinds of predictive analysis. Conceptual comparison of traditional methods (left; here: SVMs) and probabilistic methods (right; here: MBI) for classification (top) and regression (bottom), reiterating predictive algorithms vs. generative models (see Section 2.1.1 ). (A) Support vector classification searches for a hyperplane that optimally separates multivariate patterns from different classes. (B) Multivariate Bayesian classification estimates a probability distributions of the data, conditional on class, and uses them to calculate posterior probabilities for each class, conditional on the data. (C) Support vector regression searches for a projection of the data set that best matches the features onto the targets. (D) Multivariate Bayesian regression estimates a probability distribution of the features, conditional on the targets, and uses them to calculate posterior densities across target values, conditional on the observed features. In multivariate Bayesian classification (MBC), the quantity of interest is treated as a discrete random variable, resulting in posterior class probabilities (cf. Figure 1B and see e.g. Figure 3A ). In multivariate Bayesian regression (MBR), the quantity of interest is treated as a continuous random variable, resulting in a posterior target density (cf. Figure 2B and see e.g. Figure 6D ). Importantly, MBI works identically for both cases during the training phase, except for being based on different design matrices (see Figure 1A vs. Figure 2A ). This also means that, in principle, the same model can be used for classification into categories and regression of a continuous variable at the same time, reciprocally accounting for the other, provided that both quantities of interest are included in the design matrix used in the training phase. 5.1.3 Posterior probabilities → Examples: Simulations 1/3/4, Analyses 2/4 Unlike many other approaches, e.g. linear discriminant analysis [ 41 ], support vector classification [ 5 ] or perceptrons [ 67 ], MBI provides posterior probabilities for the quantity to be predicted (see Figure 13 ); like e.g. logistic regression [ 68 , 69 ] or naive Bayes classifiers [ 37 , 70 ]. These are probabilities for observations belonging to particular classes in MBC and probability densities for targets having particular values in MBR. Notably, support vector machines can be cast as probabilistic models [ 71 ]. Although probabilistic classification can be achieved using methods such as Platt scaling [ 72 ], such posterior inferences are not informed by prior distributions, but rather based on fitting a logistic regression model to the classifier’s scores [ 73 ], making it a methodologically unrelated postprocessing step not part of the algorithm itself. Moreover, the method cannot be easily extended to continuous prediction tasks for which posterior target densities from MBR are a relatively simple solution. Probabilistic results are particularly useful for some prediction contexts, e.g. disease classification [ 74 , 75 ] or individual treatment projection [ 76 , 77 ]. Whereas non-probabilistic approaches for classification always make a distinct decision, a user of MBC can also refrain from making a decision, if the posterior class probability does not exceed a pre-defined threshold. Moreover, these probabilities can be combined with loss or gain functions to identify optimal decisions by application of Bayesian decision theory. Similarly, while non-probabilistic approaches for regression only give point estimates, the posterior density from MBR quantifies remaining uncertainty after seeing the data and informs the user about the precision with which the target variable is known. 5.1.4 Prior knowledge → Examples: Simulations 3/4, Analysis 4 The fact that MBI provides posterior probabilities comes hand in hand with the fact that it uses prior probabilities. In this way, it can integrate prior knowledge about the frequency of the distinguished classes or about the distribution of the predicted variable. For example, it might be known beforehand that one class occurs much more often than the other (e.g. healthy controls vs. patients) or that a variable tends to assume values in a specific range of its support (e.g. age or IQ). This is especially useful in cases where it is known that the quantity of interest is constrained to a certain range of values and values outside this range are not meaningful, e.g. in brain age prediction [ 56 ]. While some approaches will inevitably also return negative values for chronological age (see [ 57 , Fig. 2 ]; also see Figure 11 ), specification of a prior distribution can easily prevent this (see Figure 6E ). In such cases, the prior distribution should be motivated from the experimental design by which the variable was controlled (see Figure 6A ) or from world knowledge about the process from which the variable was sampled in case of field studies (see Figure 11 ). Although this was not practiced in the present work, it is also conceivable that not only the prior model probabilities are manipulated, but the prior hyperparameters governing the MGLM parameters themselves are modified (see eqs. 9 and 17 ). This requires another level of prior knowledge, pertaining not just to the distribution of predicted quantities, but also to the specific mechanism connecting variables X and Y ; however, it could also be helpful for regularization in the case of high-dimensional data (see Section 5.3.1 ). 5.1.5 Feature dependencies → Examples: Simulations 1/2, Analyses 1-3, Simulation 4 Like many other approaches, MBI accounts for dependencies between measured variables (between-feature covariance; Σ in eq. 3 ). This is in contrast to e.g. Gaussian naive Bayes [ 37 ] which assumes zero covariance between features (emulated by a diagonal matrix Σ). In principle, MBC implements a Bayes classifier for the multivariate GLM that is aware of non-diagonal feature covariance. This exploits the way how several features jointly vary as a function of class or target and enables to delineate patterns belonging to classes or targets in multi-dimensional space (see Figures 3 and 4 ). In fact, the between-feature covariance plays a key role, because MBI’s (unnormalized) posterior distribution is given by the ratio of two determinants (see eqs. 29 and 36 ), where the two matrices are hyperparameters governing the feature covariance (Ω) and the regressor covariance (Λ). Unlike many other approaches, MBI however also accounts for dependencies between data points (between-observation covariance; V in eq. 3 ). This is important when the features to predict from are e.g. time-series data, as accounting for the correlation between data points then drastically improves prediction accuracy (cf. [ 16 ]). 5.1.6 Covariate integration → Examples: Simulation 2, Analyses 3/4 Many classification approaches only accept a feature matrix Y and a label vector x as inputs. 5 This makes it difficult to handle variables which are not predictive with respect to the labels (like the features), but which might influence the features (like covariates) and thus increase predictive performance, if accounted for. MBI naturally integrates such covariates by simply adding them as regressors (see Section 2.5 ): For the training set, they are additional columns in the design matrix; and in the test data, they are also included as regressors, but other than the quantities of interest, their values are fixed in the model inversion process which serves to predict the quantity of interest. We have seen that including those covariates outperforms a “prior regression” approach [ 45 , 46 ] in which variation related to them is removed from the features before predictive analysis (see Figures 4 and 10 ). We have also seen that including the covariates outperforms employing them as features, i.e. adding them to the feature matrix (see Figure 11 ) in the hope that this will explain the variable to be predicted. Note that treating a variable as a covariate rather than as a feature increases the number of parameters from r (one weight per each covariate-as-feature-variable) to r × v (one weight per each covariate influencing each feature). This suggests that the latter approach can more flexibly account for possible effects of the covariates. 5.1.7 Multi-class classification → Examples: Simulation 3, Analyses 1-3 Using several examples, we have shown that MBC is capable of multi-class classification (see Figures 5 and 7 ). Unlike other approaches, e.g. support vector classification [ 78 ], MBI uses the same algorithm for binary and n-ary classification, i.e. a categorical design matrix (see Figure 1A and eq. 24 ). This means that MBC does not require separate 1-vs-1 comparisons for multi-class classification which avoids long computation times when the number of classes is very high (but see Section 5.2.5 ) and there is no need for a decision rule to merge several 1-vs-1 comparisons into a multi-class decision ([ 79 ]; [ 12 , p. 339]). 5.2 Disadvantages 5.2.1 Linear mappings → Examples: Analyses 2/4 Since MBI is based on multivariate general linear models (see eq. 3 ), it intrinsically assumes linear relationships between labels and features, and linear effects of covariates on features. As such, it is principally restricted to such linear mappings and not able to represent non-linear relationships between labels and features in multi-dimensional space. This is different from kernel-based methods [ 80 ], e.g. support vector machines, which also allow to exploit non-linear patterns in the data by replacing a linear kernel with e.g. radial basis function or polynomial kernels, leading to non-linear decision boundaries in cases of classification [ 81 ]. It is also a disadvantage when compared with convolutional or deep neural networks [ 7 ] which allow to represent non-linear relationships through inclusion of hidden layers or non-linear activation functions [ 8 ]. For example, it is plausible to expect that the mapping from chronological age to structural MRI data is non-linear in nature. Here, we did not explore this question, as we always applied linear SVMs as alternative methods for comparison with MBI. In an MBI-based approach, one way to faithfully capture those patterns would be to replace the MGLM-derived likelihood function by one allowing for non-linear relationships between X and Y – which would however lead to non-tractable posterior distributions and marginal likelihoods (see Section 5.3.2 ). Another approach would be to apply non-linear basis functions to either features or targets ([ 12 , sec. 3.1 ]) or to equip MBI with a kernel-based architecture ([ 12 , sec. 6.1]). 5.2.2 Normal errors → Examples: Analyses 1/2/4 In addition to linearity of the mapping from classes or targets to features, MBI assumes random variation in the features to exclusively consist of matrix-normally distributed errors. This also entails that the between-observation covariance V is the same across all columns and that the between-feature covariance Σ is the same across all rows of the data matrix Y . While the latter belief is not strictly problematic, because it corresponds to the commonly made “independent and identically distributed” (i.i.d.) assumption, the normal distribution itself may not always be a valid assumption. The theoretical justification for normality of the errors is (i) that residual components on top of signals in the features can typically be assumed to consist of a sum of a large number of unmodeled random variables and (ii) the central limit theorem asserts that such a sum will tend to a normal distributuion ([ 17 , sect. 3.3 ]). Univariate analyses depending on this justification such as multiple linear regression have turned out very robust and reliable, such that the assumption is also legitimate for multivariate approaches ([ 15 , p. 352]). One of our examples suggests that MBI can handle moderate levels of non-normality (cf. Figure 7A ). However, there will certainly exist situations in which the assumption is not warranted and in which generalized linear models may be a better choice than the MGLM [ 82 ]. 5.2.3 Equal covariance → Examples: Simulation 3, Analyses 1-3 Through the between-feature covariance Σ, MBI can accomodate arbitrary covariance between measured signals, and because the matrix (or its inverse, T = Σ −1 ) is fully estimated, MBI makes no assumptions about the specific structure of this covariance. However, since the MGLM describes a stationary process, multivariate Bayesian classification assumes an identical covariance for all classes. This means that if, for example, signals belonging to two classes are not differing in terms of their multivariate mean, but only in terms of the amount of variance or the direction of covariance between the features ([ 83 , Fig. 3 ]), or if classes are differing in terms of multivariate mean and feature covariance (cf. Analysis 2), the estimate of the between-feature covariance will be based on the pooled data from both classes (cf. eq. 22c). Thus, MBC will likely not be able to distinguish between the classes, because the measured signals from both classes equally well fit with the pooled estimate. In each analysis, the null hypothesis should therefore precisely specify whether classes are equal only in terms of their multivariate mean (in which case such a result is statistically correct) or in terms of the distribution as such. 5.2.4 Conjugate priors → Examples: Simulation 4, Analyses 2-4 Multivariate Bayesian inversion uses conjugate normal-Wishart priors (cf. eq. 9 ) to estimate parameters of the MGLM underlying classification and regression. When a non-informative prior distribution is used (cf. eq. 17 ), no hyperparameters need to be specified. However, as this choice is mainly made for mathematical convenience and computational efficiency, it may also be seen as a weakness. A fixed prior distribution reduces the model’s flexibility to accomodate uncertainty about model parameters and its capability to adapt inferences to the specifics of real-world situations. With that in mind, the potential of continuous shrinkage priors [ 84 ] or spike-and-slab priors [ 85 , 86 ] might be investigated. The same holds for alternative prior distributions when infering on values of the label variable (cf. eq. 34 ), e.g. heavy-tailed t-distributions for regression problems [ 87 , 88 ]. As these changes would make the model intractable in closed form, approximate inference techniques such as sampling inference and variational Bayes would have to be employed when using non-conjugate priors (see Section 5.3.2 ). 5.2.5 Unimodal posterior → Examples: Simulation 4, Analysis 4 When performing multivariate Bayesian regression and using a uniform prior density (see Section 3.4 and 4.4 ), the posterior target density typically has a global maximum and no other local maxima (see Figures 6 and 12 ). In other words, the posterior target density is inherently unimodal (and appears univariate normal). This means that if, for example, the relationship between targets and features is in a way, such that two distinct values in separated regions of the target space are equally likely a posteriori, the posterior target density in MBR will most probably end up with its mode between these two values. In the worst case, this can lead to misleading conclusions, by reporting the medium value as the reconstruction, although reporting the two extreme values with equal posterior probability would have been more appropriate. Potential remedies for this are different prior densities or non-linear basis functions. 5.2.6 Computational cost → Examples: Simulations 3/4, Analyses 2/4 For each observation in the test set, for which a posterior class probability or the posterior target density is calculated, MBI requires the determinants of two matrices for each possible value of the predicted variable (see eqs. 29 and 36 ). These two matrices are the posterior precision governing the regressor covariance (Λ 2 i ∈ ℝ p × p ) and the posterior scale governing the feature covariance (Ω 2 i ∈ ℝ v × v ). With the number of regressor ( p ) or the number of features ( v ) becoming very high, these matrices become very large, such that the calculation of determinants can potentially become slow. This especially holds for discretized MBR when the prior density is intended to be very fine-grained or spanning a large range of values, such that the calculation in equation ( 36 ) has to be repeated very often. Systematic comparison of computation times confirms this, showing a 5-fold or 20-fold increase of computation time for MBR, compared with SVR (see Table 5 ). Computation times of MBC are comparable with those of SVC and sometimes lower, especially when working with a large number of classes, e.g. in Analyses 1 and 2 (see Table 5 ). View this table: View inline View popup Download powerpoint Table 5. Computational cost. Computation times for predictive analyses based on multivariate Bayesian inversion (MBI) and support vector machines (SVM) for main analyses reported in this paper. All computations were performed using MATLAB R2025b running on a 64-bit Windows 11 PC with 32 GB RAM and an Intel Core Ultra 5 Processor 225H working at 1.70 GHz. The issue of excessively long computation times for MBR can, if necessary, be mitigated by assuming that the posterior target density is a univariate normal distribution (cf. Figures 2 , 6 and 12 ) – in which case its parameters can be calculated from just three pairs of target value λ and unnormalized posterior density . 5.3 Extensions 5.3.1 High-dimensional data → Example: Analyses 2/4 As it can be seen from the equations for posterior class probabilities (see eq. 29 ) and the posterior target density (see eq. 36 ), computation of posterior probabilities always requires the calculation of two determinants, the posterior precision Λ 2 i governing the regressor covariance and the posterior scale Ω 2 i governing the feature covariance. Note that, when the number of features v is larger than the number of observations n , the matrix Ω 2 i will be ill-conditioned and not invertible (cf. eq. 22 ), such that its determinant is zero and the posterior probabilities are not defined. One reason for this is that the prior scale matrix is set to a v × v zero matrix Ω 0 = 0 vv (cf. eq. 17 ) which has the advantage that MBI does not require user-specified hyperparameters, but causes the posterior scale matrix Ω 2 i to become singular. This can be avoided by setting the prior scale matrix to a scalar multiple of the identity matrix Ω 0 = ω 0 I v . The posterior scale matrix will then become a mixture of this prior covariance term and data-based covariance terms (cf. eqs. 11 and 22 ): The magnitude of ω 0 will determine the extent to which the features are regarded as independent signals, while the remaining terms are incorporating possible dependencies between the features based on the measured signals. Conceptually, imposing this prior knowledge on T via Ω 0 is equivalent to regularizing our estimate of the between-feature covariance, similar to Tikhonov regularization in ridge regression [ 89 , 90 ]. Provided that the dimensionality of Ω 2 i ( v × v ) allows for calculation of matrix determinants in a software package implementing MBI, this can allow for prediction of classes or target variables from high-dimensional data ( v ≫ n ), i.e. data sets with much more features than observations. 5.3.2 Non-linear mappings → Example: Analyses 2/4 As we have noted above (see Sections 5.2.1 and 5.2.2 ), MBI is restricted to linear mappings from regressors to features and matrix-normally distributed errors. Sometimes, such linear mappings are not possible or not realistic to assume, but specifying a non-linear model often renders the joint likelihood intractable and the posterior distribution cannot be expressed in closed form [ 12 , 28 , 91 ]. However, provided that such an alternative likelihood function p ( Y | X, θ, m ) and a prior distribution p ( θ | m ) are given mathematically, MBI can be finessed to handle non-linear mappings using one out of two strategies. The first strategy, sampling-based inference , calculates the posterior distribution (in the training data) numerically and approximates the marginal likelihood (in the test data) by drawing from the prior and inserting into the likelihood [ 92 , 93 ]. The second strategy, variational Bayesian inference , assumes the posterior distribution (in the training data) to factorize over sets of parameters [ 32 ] and calculates the variational free energy [ 94 ] as a lower bound to the marginal likelihood (in the test data). In conclusion, while MBI does not allow for non-linear relationships between label variables and observed features, specifying an appropriate likelihood function and prior distribution allows to perform non-standard MBI with non-linear mappings [ 39 , 95 ] via sampling or variational approaches. 6 Conclusions We have systematically reviewed fully Bayesian inference for the multivariate general linear model (MGLM) and have shown that Bayesian MGLM estimation can enable cross-validated prediction of discrete or continuous variables. The resulting techniques of multivariate Bayesian classification (MBC) and regression (MBR) have a number of desirable properties. Further research should investigate how the proposed framework can be extended to other statistical tasks such as clustering, expectation maximization, prediction from high-dimensional data, non-linear prediction or sampling-based inference. 7 Statements and Declarations 7.1 Competing Interests The authors have no conflict of interest, financial or otherwise, to declare. 7.2 Research Funding This research did not receive any third-party funding. 7.4 Data and Code MATLAB and Python implementations of MBI for classification and regression are available from GitHub 6 . These tools allow for both, customized train-and-test analyses as well as fully automatic cross-validated prediction. MATLAB and Python code underlying data analysis is provided in the same GitHub repository. Empirical data can be found within this repository 7 . Simulated data can be generated using the routines in this repository 8 . 7.3 Acknowledgements The authors wish to thank Dirk Ostwald (OvGU Magdeburg) for providing helpful comments on an earlier version of this manuscript. Additionally, the authors thank the acquisitors and curators of the Egyptian skull data set [ 49 ], the MNIST data set [ 50 ], the birth weight data set [ 52 ] and the brain age competition data set [ 61 ] for providing open access to these valuable resources. Appendix A Likelihood function of the MGLM The model equation of the multivariate general linear model (MGLM) reads: The linear transformation theorem for the matrix-normal distribution states that, if Ξ ∈ ℝ n × p is matrix-normally distributed, then any (bi-)linear transformation of Ξ in the form ϒ = A Ξ B + C with constant A ∈ ℝ r × n , B ∈ ℝ p × s and C ∈ ℝ r × s is also matrix-normally distributed [ 96 , ch. 2-3]: Combining ( A.1 ) and ( A.2 ) leads to the following distribution for Y In the independent and identically distributed (i.i.d.) case, i.e. with V = I n , we can set , B = I v , C = 0 1 v for i = 1, …, n and apply ( A.2 ) to ( A.3 ) to obtain The probability density function of the matrix-normal distribution for a random matrix Ξ ∈ ℝ n × p with mean M ∈ ℝ n × p and covariances and is given by [ 97 , sect. 2.1 ] Combining ( A.3 ) and ( A.5 ) leads to the following likelihood function for m : Substituting P = V −1 and T = Σ −1 , we finally have: Appendix B Posterior distribution for the MGLM From ( 9 ), the conjugate prior for the MGLM is given by a normal-Wishart prior distribution over the model parameters B and T = Σ −1 : From ( 8 ), the posterior distribution is proportional to the product of likelihood function and prior density, i.e. the joint likelihood function: From ( 7 ), we have the likelihood function: The probability density function of the matrix-normal distribution for a random matrix Ξ ∈ ℝ n × p with mean M ∈ ℝ n × p and covariances and is: The probability density function of the Wishart distribution for a random matrix with scale matrix and degrees of freedom n ∈ ℝ, n > p is: Combining ( B.3 ) with ( B.1 ) and plugging in ( B.4 ) and ( B.5 ), the joint likelihood of the model is given by: Collecting identical variables gives: Expanding the products in the exponent gives: Completing the square over B , we finally have with the posterior hyperparameters Ergo, the joint likelihood is proportional to with the posterior hyperparameters Taken together, we see that the joint likelihood ( B.11 ) is proportional to a normal-Wishart distribution, such that the posterior distribution is also a normal-Wishart distribution 9 and the posterior hyperparameters are given by Appendix C Marginal likelihood for the MGLM Again, we assume a normal-Wishart distribution as conjugate prior distribution over the model parameters B and T = Σ −1 : From ( 12 ), the marginal likelihood is obtained by integrating over the product of likelihood function and prior density, i.e. over the joint likelihood function: In Appendix B, the joint likelihood p ( Y, B, T ) has been obtained as (see eq. B.9 ) In the following, we will make use of the fact that probability density integrates to one over the whole domain of a random variable, e.g.: Using the matrix-normal probability density function ( B.4 ), we can rewrite this as Now, B can be integrated out easily by appling ( C.4 ): Using the Wishart probability density function ( B.5 ), we can rewrite this as Finally, T can also be integrated out by appling ( C.5 ): Combining ( C.9 ) and ( C.2 ), the marginal likelihood of the MGLM is 10 where the posterior hyperparameters M n , Λ n , Ω n and ν n are given by ( B.14 ). Appendix D Multivariate Bayesian inversion In this section, we use the superscript □; (1) to denote quantities belonging to the training data and the superscript □; (2) to denote quantities belonging to the test data. In the first step of an MBI analysis, the data set of features and labels is and from ( 17 ), the prior hyperparameters for the first step are given by: Plugging this into ( B.14 ), the posterior hyperparameters after the first step are: In the second step of an MBI analysis, features and labels of the single data point are and from ( D.3 ), the prior hyperparameters for the second step are the posterior hyperparameters obtained in the first step: Plugging this into ( B.14 ), the posterior hyperparameters after the second step are: Footnotes - minor changes in Sections 2.1.1, 2.1.2, 2.3.1 - edited captions of Figure 1 and 2 - added new Discussion in Section 2.7 - changes to Simulation 2/Section 3.2 - changes to Analysis 3/Section 4.3 - new Simulations in Section 3.5 - new Analyses in Section 4.5 - added new Tables 3 and 4 - added new Discussion in Section 5.1.1 - added new Discussion in Section 5.2.4 - minor changes in Section 5.2.6 - added new Table 5 https://github.com/JoramSoch/MBI ↵ 1 For details, see the MATLAB implementation ( https://github.com/JoramSoch/MBI/blob/main/MATLAB/ML_CV.m ) or the Python implementation ( https://github.com/JoramSoch/MBI/blob/main/Python/MBI.py#L341-L421 ) of cross-validation in the MBI GitHub repository. ↵ 2 For simplicity, this and the next matrix are specified as Toeplitz matrices with constant descending diagonals, i.e. the entries only depend on the absolute difference between row and column index ( i − j ). ↵ 3 The data set can be obtained from: https://github.com/JoramSoch/PAC_2019 . ↵ 4 See: https://github.com/ohbm/OpenScienceRoom2019/issues/10 . ↵ 5 See, for example, the Python implementation of support vector machines ( https://github.com/cjlin1/libsvm/blob/master/python/libsvm/svmutil.py ) or the MATLAB implementation of logistic regression ( https://www.mathworks.com/help/stats/fitmnr.html ). ↵ 6 See: https://github.com/JoramSoch/MBI . ↵ 7 e.g. Analysis 1: https://github.com/JoramSoch/MBI/tree/main/Examples/Analysis_1 . ↵ 8 e.g. Simulation 1: https://github.com/JoramSoch/MBI/tree/main/Examples/Simulation_1 . ↵ 9 The derivation of this posterior distribution is inlcuded in [ 98 , ch. 3.4] and can also be found at: https://statproofbook.github.io/P/mblr-post . ↵ 10 The derivation of this marginal likelihood is inlcuded in [ 98 , ch. 3.4] and can also be found at: https://statproofbook.github.io/P/mblr-lme . References [1]. ↵ Efron , B. , Hastie , T. : Computer Age Statistical Inference: Algorithms, Evidence, and Data Science . Institute of Mathematical Statistics monographs . Cambridge University Press , New York, NY ( 2016 ) [2]. ↵ Wei , T.C. , Sheikh , U.U. , Rahman , A.A.-H.A. : Improved optical character recognition with deep neural network . In: 2018 IEEE 14th International Colloquium on Signal Processing & Its Applications (CSPA) , pp. 245 – 249 . IEEE, Batu Feringghi ( 2018 ). doi: 10.1109/CSPA.2018.8368720 . https://ieeexplore.ieee.org/document/8368720/ Accessed 2022-04-12 OpenUrl CrossRef [3]. ↵ Xiong , W. , Droppo , J. , Huang , X. , Seide , F. , Seltzer , M. , Stolcke , A. , Yu , D. , Zweig , G. : Achieving Human Parity in Conversational Speech Recognition ( 2016 ) doi: 10.48550/ARXIV.1610.05256 . Accessed 2022-04-12 OpenUrl CrossRef [4]. ↵ Silver , D. , Schrittwieser , J. , Simonyan , K. , Antonoglou , I. , Huang , A. , Guez , A. , Hubert , T. , Baker , L. , Lai , M. , Bolton , A. , Chen , Y. , Lillicrap , T. , Hui , F. , Sifre , L. , Driessche , G. , Graepel , T. , Hassabis , D. : Mastering the game of Go without human knowledge . Nature 550 ( 7676 ), 354 – 359 ( 2017 ) doi: 10.1038/nature24270 . Accessed 2022-04-12 OpenUrl CrossRef PubMed [5]. ↵ Cortes , C. , Vapnik , V. : Supportvector networks . Machine Learning 20 ( 3 ), 273 – 297 ( 1995 ) doi: 10.1007/BF00994018 . Accessed 2022-04-12 OpenUrl CrossRef Web of Science [6]. ↵ Mozer , M.C. , Jordan , M. , Petsche , T. Drucker , H. , Burges , C.J.C. , Kaufman , L. , Smola , A. , Vapnik , V. : Support Vector Regression Machines . In: Mozer , M.C. , Jordan , M. , Petsche , T. (eds.) Advances in Neural Information Processing Systems , vol. 9 . MIT Press, ??? ( 1996 ). https://proceedings.neurips.cc/paper/1996/file/d38901788c533e8286cb6400b40b386d-Paper.pdf [7]. ↵ Hinton , G. , Deng , L. , Yu , D. , Dahl , G. , Mohamed , A.-r. , Jaitly , N. , Senior , A. , Vanhoucke , V. , Nguyen , P. , Sainath , T. , Kingsbury , B. : Deep Neural Networks for Acoustic Modeling in Speech Recognition: The Shared Views of Four Research Groups . IEEE Sigal Processing Magazine 29 ( 6 ), 82 – 97 ( 2012 ) doi: 10.1109/MSP.2012.2205597 . Accessed 2022-04-12 OpenUrl CrossRef [8]. ↵ Deng , L. , Hinton , G. , Kingsbury , B. : New types of deep neural network learning for speech recognition and related applications: an overview . In: 2013 IEEE International Conference on Acoustics, Speech and Signal Processing , pp. 8599 – 8603 . IEEE , Vancouver, BC, Canada ( 2013 ). doi: 10.1109/ICASSP.2013.6639344 . http://ieeexplore.ieee.org/document/6639344/ Accessed 2022-04-12 OpenUrl CrossRef [9]. ↵ Dietterich , T. , Becker , S. , Ghahramani , Z. Ng , A. , Jordan , M. : On Discriminative vs. Generative Classifiers: A comparison of logistic regression and naive Bayes . In: Dietterich , T. , Becker , S. , Ghahramani , Z. (eds.) Advances in Neural Information Processing Systems , vol. 14 . MIT Press , ??? ( 2001 ). https://proceedings.neurips.cc/paper_files/paper/2001/file/7b7a53e239400a13bd6be6c91c4f6c4e-Paper.pdf [10]. ↵ Frässle , S. , Yao , Y. , Schöbi , D. , Aponte , E.A. , Heinzle , J. , Stephan , K.E. : Generative models for clinical applications in computational psychiatry . WIREs Cognitive Science 9 ( 3 ) ( 2018 ) doi: 10.1002/wcs.1460 . Accessed 2022-04-12 OpenUrl CrossRef [11]. ↵ Gelman , A. , Carlin , J.B. , Stern , H.S. , Dunson , D.B. , Vehtari , A. , Rubin , D.B. : Bayesian Data Analysis , 3rd edition edn. Chapman and Hall/CRC , Boca Raton ( 2013 ) [12]. ↵ Bishop , C.M. : Pattern Recognition and Machine Learning , 1st ed. 2006. corr. 2nd printing 2011 edn. Springer , New York ( 2007 ) [13]. ↵ Friston , K. , Chu , C. , Mourão-Miranda , J. , Hulme , O. , Rees , G. , Penny , W. , Ashburner , J. : Bayesian decoding of brain images . NeuroImage 39 ( 1 ), 181 – 205 ( 2008 ) doi: 10.1016/j.neuroimage.2007.08.013 . Accessed 2015-10-30 OpenUrl CrossRef PubMed Web of Science [14]. ↵ Shmueli , G. : To Explain or to Predict? Statistical Science 25 ( 3 ) ( 2010 ) doi: 10.1214/10-STS330 . Accessed 2022-05-30 OpenUrl CrossRef Web of Science [15]. ↵ Allefeld , C. , Haynes , J.-D. : Searchlight-based multi-voxel pattern analysis of fMRI by cross-validated MANOVA . NeuroImage 89 , 345 – 357 ( 2014 ) doi: 10.1016/j.neuroimage.2013.11.043 . Accessed 2015-10-30 OpenUrl CrossRef PubMed Web of Science [16]. ↵ Soch , J. , Allefeld , C. , Haynes , J.-D. : Inverse transformed encoding models – a solution to the problem of correlated trial-by-trial parameter estimates in fMRI decoding . NeuroImage 209 , 116449 ( 2020 ) doi: 10.1016/j.neuroimage.2019.116449 . Accessed 2021-01-07 OpenUrl CrossRef PubMed [17]. ↵ Timm , N.H. : Applied Multivariate Analysis . Springer texts in statistics. Springer, New York Berlin Heidelberg ( 2002 ) [18]. ↵ Koch , K.-R. : Introduction to Bayesian Statistics , 2nd, updated and enlarged ed. 2007 edition edn . Springer , Berlin; New York ( 2007 ) [19]. ↵ Soch , J. , Allefeld , C. : MACS – a new SPM toolbox for model assessment, comparison and selection . Journal of Neuroscience Methods 306 , 19 – 31 ( 2018 ) doi: 10.1016/j.jneumeth.2018.05.017 . Accessed 2019-01-23 OpenUrl CrossRef PubMed [20]. ↵ Lee , D. , Yun , S. , Jang , C. , Park , H.-J. : Multivariate Bayesian decoding of single-trial event-related fMRI responses for memory retrieval of voluntary actions . PLOS ONE 12 ( 8 ), 0182657 ( 2017 ) doi: 10.1371/journal.pone.0182657 . Accessed 2022-09-13 OpenUrl CrossRef [21]. ↵ Sarker , I.H. : Machine Learning: Algorithms, Real-World Applications and Research Directions . SN Computer Science 2 ( 3 ), 160 ( 2021 ) doi: 10.1007/s42979-021-00592-x . Accessed 2025-02-05 OpenUrl CrossRef PubMed [22]. ↵ Huang , H.-H. , Xu , T. , Yang , J. : Comparing logistic regression, support vector machines, and permanental classification methods in predicting hypertension . BMC Proceedings 8 ( S1 ), 96 ( 2014 ) doi: 10.1186/1753-6561-8-S1-S96 . Accessed 2025-02-05 OpenUrl CrossRef [23]. ↵ Vapnik , V.N. : Statistical Learning Theory . Adaptive and learning systems for signal processing, communications, and control . Wiley , New York ( 1998 ) [24]. ↵ Valente , G. , Castellanos , A.L. , Hausfeld , L. , De Martino , F. , Formisano , E. : Cross-validation and permutations in MVPA: Validity of permutation strategies and power of cross-validation schemes . NeuroImage 238 , 118145 ( 2021 ) doi: 10.1016/j.neuroimage.2021.118145 . Accessed 2022-04-12 OpenUrl CrossRef PubMed [25]. ↵ Böhning , D. : Multinomial logistic regression algorithm . Annals of the Institute of Statistical Mathematics 44 ( 1 ), 197 – 200 ( 1992 ) doi: 10.1007/BF00048682 . Accessed 2022-04-12 OpenUrl CrossRef [26]. ↵ Claeskens , G. , Hjort , N.L. : Model Selection and Model Averaging . Cambridge series in statistical and probabilistic mathematics . Cambridge University Press , Cambridge; New York ( 2008 ) [27]. ↵ MacKay , D.J.C. : Information Theory , Inference and Learning Algorithms . Cambridge University Press , Cambridge, UK; New York ( 2003 ) [28]. ↵ Penny , W. , Kiebel , S. , Friston , K. : Variational Bayesian inference for fMRI time series . NeuroImage 19 ( 3 ), 727 – 741 ( 2003 ) doi: 10.1016/S1053-8119(03)00071-5 . Accessed 2015-10-30 OpenUrl CrossRef PubMed Web of Science [29]. Penny , W.D. , Trujillo-Barreto , N.J. , Friston , K.J. : Bayesian fMRI time series analysis with spatial priors . NeuroImage 24 ( 2 ), 350 – 362 ( 2005 ) doi: 10.1016/j.neuroimage.2004.08.034 . Accessed 2015-10-30 OpenUrl CrossRef PubMed Web of Science [30]. ↵ Soch , J. , Haynes , J.-D. , Allefeld , C. : How to avoid mismodelling in GLM-based fMRI data analysis: cross-validated Bayesian model selection . NeuroImage 141 , 469 – 489 ( 2016 ) doi: 10.1016/j.neuroimage.2016.07.047 . Accessed 2019-02-27 OpenUrl CrossRef [31]. ↵ Murphy , K.P. : Conjugate Bayesian Analysis of the Gaussian Distribution . Technical report, University of British Columbia ( 2007 ). https://www.cs.ubc.ca/texttildelowmurphyk/Papers/bayesGauss.pdf [32]. ↵ Penny , W. , Flandin , G. , Trujillo-Barreto , N. : Bayesian comparison of spatially regularised general lin-ear models . Human Brain Mapping 28 ( 4 ), 275 – 293 ( 2007 ) doi: 10.1002/hbm.20327 . Accessed 2015-10-30 OpenUrl CrossRef PubMed Web of Science [33]. Ostwald , D. , Starke , L. : Probabilistic delay differential equation modeling of event-related potentials . NeuroImage 136 , 227 – 257 ( 2016 ) doi: 10.1016/j.neuroimage.2016.04.025 OpenUrl CrossRef PubMed [34]. ↵ Soch , J. , Meyer , A.P. , Haynes , J.-D. , Allefeld , C. : How to improve parameter estimates in GLM-based fMRI data analysis: cross-validated Bayesian model averaging . NeuroImage 158 , 186 – 195 ( 2017 ) doi: 10.1016/j.neuroimage.2017.06.056 . Accessed 2019-08-21 OpenUrl CrossRef PubMed [35]. ↵ Molinaro , A.M. , Simon , R. , Pfeiffer , R.M. : Prediction error estimation: a comparison of resampling methods . Bioinformatics 21 ( 15 ), 3301 – 3307 ( 2005 ) doi: 10.1093/bioinformatics/bti499 . Accessed 2022-04-12 OpenUrl CrossRef PubMed Web of Science [36]. ↵ Airola , A. , Pahikkala , T. , Waegeman , W. , De Baets , B. , Salakoski , T. : An experimental comparison of cross-validation techniques for estimating the area under the ROC curve . Computational Statistics & Data Analysis 55 ( 4 ), 1828 – 1844 ( 2011 ) doi: 10.1016/j.csda.2010.11.018 . Accessed 2022-04-12 OpenUrl CrossRef Web of Science [37]. ↵ Hand , D.J. , Yu , K. : Idiot’s Bayes: Not So Stupid after All? International Statistical Review / Revue Internationale de Statistique 69 ( 3 ), 385 ( 2001 ) doi: 10.2307/1403452 . Accessed 2022-04-12 OpenUrl CrossRef [38]. ↵ Attias , H. : Inferring parameters and structure of latent variable models by variational bayes . In: Proceedings of the Fifteenth Conference on Uncertainty in Artificial Intelligence. UAI’99 , pp. 21 – 30 . Morgan Kaufmann Publishers Inc ., San Francisco, CA, USA ( 1999 ) [39]. ↵ Roweis , S. , Ghahramani , Z. : A Unifying Review of Linear Gaussian Models . Neural Computation 11 ( 2 ), 305 – 345 ( 1999 ) doi: 10.1162/089976699300016674 . Accessed 2022-09-08 OpenUrl CrossRef PubMed Web of Science [40]. ↵ Blei , D.M. , Kucukelbir , A. , McAuliffe , J.D. : Variational Inference: A Review for Statisticians . Journal of the American Statistical Association 112 ( 518 ), 859 – 877 ( 2017 ) doi: 10.1080/01621459.2017.1285773 . Accessed 2026-02-20 OpenUrl CrossRef [41]. ↵ Fisher , R.A. : The Use of Multiple Measurements in Taxonomic Problems . Annals of Eugenics 7 , 179 – 188 ( 1936 ) OpenUrl CrossRef Web of Science [42]. ↵ Koutsouleris , N. , Kahn , R.S. , Chekroud , A.M. , Leucht , S. , Falkai , P. , Wobrock , T. , Derks , E.M. , Fleischhacker , W.W. , Hasan , A. : Multisite prediction of 4-week and 52-week treatment outcomes in patients with first-episode psychosis: a machine learning approach . The Lancet Psychiatry 3 ( 10 ), 935 – 946 ( 2016 ) doi: 10.1016/S2215-0366(16)30171-7 . Accessed 2025-02-05 OpenUrl CrossRef PubMed [43]. ↵ Sanfelici , R. , Dwyer , D.B. , Antonucci , L.A. , Koutsouleris , N. : Individualized Diagnostic and Prognostic Models for Patients With Psychosis Risk Syndromes: A Metaanalytic View on the State of the Art . Biological Psychiatry 88 ( 4 ), 349 – 360 ( 2020 ) doi: 10.1016/j.biopsych.2020.02.009 . Accessed 2025-02-05 OpenUrl CrossRef PubMed [44]. ↵ Mahalanobis , P.C. : ON THE GENERALIZED DISTANCE IN STATISTICS . Sankhyā: The Indian Journal of Statistics, Series A (2008-) 80 , 1 – 7 ( 2018 ). Accessed 2023-07-05 OpenUrl [45]. ↵ Li , L. , Rakitsch , B. , Borgwardt , K. : ccSVM: correcting Support Vector Machines for confounding factors in biological data classification . Bioinformatics 27 ( 13 ), 342 – 348 ( 2011 ) doi: 10.1093/bioinformatics/btr204 . Accessed 2025-02-05 OpenUrl CrossRef [46]. ↵ Hamdan , S. , Love , B.C. , Polier , G.G. , Weis , S. , Schwender , H. , Eickhoff , S.B. , Patil , K.R. : Confound-leakage: confound removal in machine learning leads to leakage . GigaScience 12 , 071 ( 2022 ) doi: 10.1093/gigascience/giad071 . Accessed 2025-02-05 OpenUrl CrossRef [47]. ↵ Raffinetti , E. : A Rank Graduation Accuracy measure to mitigate Artificial Intelligence risks . Quality & Quantity 57 ( S2 ), 131 – 150 ( 2023 ) doi: 10.1007/s11135-023-01613-y . Accessed 2026-03-18 OpenUrl CrossRef [48]. ↵ Giudici , P. , Raffinetti , E. : RGA: a unified measure of predictive accuracy . Advances in Data Analysis and Classification 19 ( 1 ), 67 – 93 ( 2025 ) doi: 10.1007/s11634-023-00574-2 . Accessed 2026-03-18 OpenUrl CrossRef [49]. ↵ Thomson , A. , Randall-Maciver , R. : Ancient Races of the Thebaid . Oxford University Press , Oxford, United Kingdom ( 1905 ). http://www.dm.unibo.it/texttildelowsimoncin/Egyptian-Skulls.html [50]. ↵ Lecun , Y. , Bottou , L. , Bengio , Y. , Haffner , P. : Gradient-based learning applied to document recognition . Proceedings of the IEEE 86 ( 11 ), 2278 – 2324 ( 1998 ) doi: 10.1109/5.726791 . Accessed 2025-01-21 OpenUrl CrossRef [51]. ↵ Li Deng : The MNIST Database of Handwritten Digit Images for Machine Learning Research [Best of the Web] . IEEE Signal Processing Magazine 29 ( 6 ), 141 – 142 ( 2012 ) doi: 10.1109/MSP.2012.2211477 . Accessed 2025-01-21 OpenUrl CrossRef [52]. ↵ Hosmer , D.W. , Lemeshow , S. : Applied Logistic Regression . Wiley series in probability and mathematical statistics . Wiley , New York ( 1989 ). https://www.worldcat.org/title/applied-logistic-regression/oclc/19514573 [53]. ↵ Brodersen , K.H. , Ong , C.S. , Stephan , K.E. , Buhmann , J.M. : The Balanced Accuracy and Its Posterior Distribution . In: 2010 20th International Conference on Pattern Recognition , pp. 3121 – 3124 . IEEE , Istanbul, Turkey ( 2010 ). doi: 10.1109/ICPR.2010.764 . http://ieeexplore.ieee.org/document/5597285/ Accessed 2020-09-08 OpenUrl CrossRef [54]. ↵ Armada , M.A. , Sanfeliu , A. , Ferre , M. Carrillo , H. , Brodersen , K.H. , Castellanos , J.A. : Probabilistic Performance Evaluation for Multiclass Classification Using the Posterior Balanced Accuracy . In: Armada , M.A. , Sanfeliu , A. , Ferre , M. (eds.) ROBOT2013: First Iberian Robotics Conference vol. 252 , pp. 347 – 361 . Springer , Cham ( 2014 ). doi: 10.1007/978-3-319-03413-325 . http://link.springer.com/10.1007/978-3-319-03413-325 Accessed 2022-04-12 OpenUrl CrossRef [55]. ↵ Dosenbach , N.U.F. , Nardos , B. , Cohen , A.L. , Fair , D.A. , Power , J.D. , Church , J.A. , Nelson , S.M. , Wig , G.S. , Vogel , A.C. , Lessov-Schlaggar , C.N. , Barnes , K.A. , Dubis , J.W. , Feczko , E. , Coalson , R.S. , Pruett , J.R. , Barch , D.M. , Petersen , S.E. , Schlaggar , B.L. : Prediction of Individual Brain Maturity Using fMRI . Science 329 ( 5997 ), 1358 – 1361 ( 2010 ) doi: 10.1126/science.1194144 . Accessed 2020-10-28 OpenUrl Abstract / FREE Full Text [56]. ↵ Cole , J.H. , Poudel , R.P.K. , Tsagkrasoulis , D. , Caan , M.W.A. , Steves , C. , Spector , T.D. , Montana , G. : Predicting brain age with deep learning from raw imaging data results in a reliable and heritable biomarker . NeuroImage 163 , 115 – 124 ( 2017 ) doi: 10.1016/j.neuroimage.2017.07.059 . Accessed 2020-09-08 OpenUrl CrossRef PubMed [57]. ↵ Soch , J. : Distributional Transformation Improves Decoding Accuracy When Predicting Chronological Age From Structural MRI . Frontiers in Psychiatry 11 , 604268 ( 2020 ) doi: 10.3389/fpsyt.2020.604268 . Accessed 2021-06-02 OpenUrl CrossRef PubMed [58]. ↵ Fisch , L. , Leenings , R. , Winter , N.R. , Dannlowski , U. , Gaser , C. , Cole , J.H. , Hahn , T. : Editorial: Predicting Chronological Age From Structural Neuroimaging: The Predictive Analytics Competition 2019 . Frontiers in Psychiatry 12 , 710932 ( 2021 ) doi: 10.3389/fpsyt.2021.710932 . Accessed 2022-04-12 OpenUrl CrossRef PubMed [59]. ↵ Waschkies , K.F. , Soch , J. , Darna , M. , Richter , A. , Altenstein , S. , Beyle , A. , Brosseron , F. , Buchholz , F. , Butryn , M. , Dobisch , L. , Ewers , M. , Fliessbach , K. , Gabelin , T. , Glanz , W. , Goerss , D. , Gref , D. , Janowitz , D. , Kilimann , I. , Lohse , A. , Munk , M.H. , Rauchmann , B. , Rostamzadeh , A. , Roy , N. , Spruth , E.J. , Dechent , P. , Heneka , M.T. , Hetzer , S. , Ramirez , A. , Scheffler , K. , Buerger , K. , Laske , C. , Perneczky , R. , Peters , O. , Priller , J. , Schneider , A. , Spottke , A. , Teipel , S. , Düzel , E. , Jessen , F. , Wiltfang , J. , Schott , B.H. , Kizilirmak , J.M. : Machine learning-based classi-fication of Alzheimer’s disease and its at-risk states using personality traits, anxiety, and depression . International Journal of Geriatric Psychiatry 38 ( 10 ), 6007 ( 2023 ) doi: 10.1002/gps.6007 . Accessed 2024-05-02 OpenUrl CrossRef [60]. ↵ Treder , M.S. , Shock , J.P. , Stein , D.J. , Du Plessis , S. , Seedat , S. , Tsvetanov , K.A. : Correlation Constraints for Regression Models: Controlling Bias in Brain Age Prediction . Frontiers in Psychiatry 12 , 615754 ( 2021 ) doi: 10.3389/fpsyt.2021.615754 . Accessed 2025-02-05 OpenUrl CrossRef PubMed [61]. ↵ Cole , J.H. , Ritchie , S.J. , Bastin , M.E. , Valdés Hernández , M.C. , Muñoz Maniega , S. , Royle , N. , Corley , J. , Pattie , A. , Harris , S.E. , Zhang , Q. , Wray , N.R. , Redmond , P. , Marioni , R.E. , Starr , J.M. , Cox , S.R. , Wardlaw , J.M. , Sharp , D.J. , Deary , I.J. : Brain age predicts mortality . Molecular Psychiatry 23 ( 5 ), 1385 – 1392 ( 2018 ) doi: 10.1038/mp.2017.62 . Accessed 2020-10-28 OpenUrl CrossRef PubMed [62]. ↵ Giudici , P. : Safe machine learning . Statistics 58 ( 3 ), 473 – 477 ( 2024 ) doi: 10.1080/02331888.2024.2361481 . Accessed 2026-03-19 OpenUrl CrossRef [63]. ↵ Gunn , S.R. : Support Vector Machines for Classification and Regression . Project Report, s.n., University of Southampton ( 1998 ). https://eprints.soton.ac.uk/256459/ [64]. ↵ Brereton , R.G. , Lloyd , G.R. : Support Vector Machines for classification and regression . The Analyst 135 ( 2 ), 230 – 267 ( 2010 ) doi: 10.1039/B918972F . Accessed 2022-04-12 OpenUrl CrossRef PubMed [65]. ↵ Laaksonen , J. , Oja , E. : Classification with learning k-nearest neighbors . In: Proceedings of International Conference on Neural Networks (ICNN’96) , vol. 3 , pp. 1480 – 1483 . IEEE , Washington, DC, USA ( 1996 ). doi: 10.1109/ICNN.1996.549118 . http://ieeexplore.ieee.org/document/549118/ Accessed 2022-04-12 OpenUrl CrossRef [66]. ↵ Kramer , O. : K-Nearest Neighbors . In: Dimensionality Reduction with Unsupervised Nearest Neighbors vol. 51 , pp. 13 – 23 . Springer , Berlin, Heidelberg ( 2013 ). doi: 10.1007/978-3-642-38652-72 . http://link.springer.com/10.1007/978-3-642-38652-72 Accessed 2022-04-12 OpenUrl CrossRef [67]. ↵ Rosenblatt , F. : The perceptron: A probabilistic model for information storage and organization in the brain . Psychological Review 65 ( 6 ), 386 – 408 ( 1958 ) doi: 10.1037/h0042519 . Accessed 2022-04-12 OpenUrl CrossRef PubMed Web of Science [68]. ↵ Berkson , J. : Application to the Logistic Function to Bio-Assay . Journal of the American Statistical Association 39 ( 227 ), 357 ( 1944 ) doi: 10.2307/2280041 . Accessed 2022-04-12 OpenUrl CrossRef Web of Science [69]. ↵ Berkson , J. : Why I Prefer Logits to Probits . Biometrics 7 ( 4 ), 327 ( 1951 ) doi: 10.2307/3001655 . Accessed 2022-04-12 OpenUrl CrossRef [70]. ↵ Domingos , P. , Pazzani , M. : On the Optimality of the Simple Bayesian Classifier under Zero-One Loss . Machine Learning 29 ( 2-3 ), 103 – 130 ( 1997 ) doi: 10.1023/A:1007413511361 . Accessed 2026-02-20 OpenUrl CrossRef [71]. ↵ Franc , V. , Zien , A. , Schölkopf , B. : Support Vector Machines as Probabilistic Models . In: Proceedings of the 28th International Conference on Machine Learning , pp. 665 – 672 . International Machine Learning Society , Madison, WI, USA ( 2011 ) [72]. ↵ Platt , J.C. : Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Like-lihood Methods . In: Advances in Large Margin Classifiers , pp. 61 – 74 . MIT Press, ??? ( 1999 ) [73]. ↵ Lin , H.-T. , Lin , C.-J. , Weng , R.C. : A note on Platt’s probabilistic outputs for support vector machines . Machine Learning 68 ( 3 ), 267 – 276 ( 2007 ) doi: 10.1007/s10994-007-5018-6 . Accessed 2022-05-30 OpenUrl CrossRef [74]. ↵ Hackmack , K. , Paul , F. , Weygandt , M. , Allefeld , C. , Haynes , J.-D. : Multi-scale classification of disease using structural MRI and wavelet transform . NeuroImage 62 ( 1 ), 48 – 58 ( 2012 ) doi: 10.1016/j.neuroimage.2012.05.022 . Accessed 2019-03-01 OpenUrl CrossRef PubMed Web of Science [75]. ↵ Eitel , F. , Schulz , M.-A. , Seiler , M. , Walter , H. , Ritter , K. : Promises and pitfalls of deep neural networks in neuroimaging-based psychiatric research . Experimental Neurology 339 , 113608 ( 2021 ) doi: 10.1016/j.expneurol.2021.113608 . Accessed 2022-02-23 OpenUrl CrossRef PubMed [76]. ↵ Huys , Q.J.M. , Maia , T.V. , Frank , M.J. : Computational psychiatry as a bridge from neuroscience to clinical applications . Nature Neuroscience 19 ( 3 ), 404 – 413 ( 2016 ) doi: 10.1038/nn.4238 . Accessed 2020-09-08 OpenUrl CrossRef PubMed [77]. ↵ Frässle , S. , Marquand , A.F. , Schmaal , L. , Dinga , R. , Veltman , D.J. , Wee , N.J.A. , Tol , M.-J. , Schöbi , D. , Penninx , B.W.J.H. , Stephan , K.E. : Predicting individual clinical trajectories of depression with generative embedding . NeuroImage: Clinical 26 , 102213 ( 2020 ) doi: 10.1016/j.nicl.2020.102213 . Accessed 2022-04-13 OpenUrl CrossRef PubMed [78]. ↵ Chang , C.-C. , Lin , C.-J. : LIB-SVM FAQ . www.csie.ntu.edu.tw ( 2022 ). https://www.csie.ntu.edu.tw/texttildelowcjlin/libsvm/faq.html#f419 [79]. ↵ Chih-Wei Hsu , Chih-Jen Lin : A comparison of methods for multiclass support vector machines . IEEE Transactions on Neural Networks 13 ( 2 ), 415 – 425 ( 2002 ) doi: 10.1109/72.991427 . Accessed 2022-04-13 OpenUrl CrossRef PubMed Web of Science [80]. ↵ Hofmann , T. , Schölkopf , B. , Smola , A.J. : Kernel methods in machine learning . The Annals of Statistics 36 ( 3 ) ( 2008 ) doi: 10.1214/009053607000000677 . Accessed 2022-04-13 OpenUrl CrossRef [81]. ↵ Scholkopf , B. , Kah-Kay Sung , Burges , C.J.C. , Girosi , F. , Niyogi , P. , Poggio , T. , Vapnik , V. : Comparing support vector machines with Gaussian kernels to radial basis function classifiers . IEEE Transactions on Signal Processing 45 ( 11 ), 2758 – 2765 ( 1997 ) doi: 10.1109/78.650102 . Accessed 2022-04-13 OpenUrl CrossRef [82]. ↵ Nelder , J.A. , Wedderburn , R.W.M. : Generalized Linear Models . Journal of the Royal Statistical Society. Series A (General) 135 ( 3 ), 370 ( 1972 ) doi: 10.2307/2344614 . Accessed 2022-05-30 OpenUrl CrossRef [83]. ↵ Hebart , M.N. , Baker , C.I. : Deconstructing multivariate decoding for the study of brain function . NeuroImage 180 , 4 – 18 ( 2018 ) doi: 10.1016/j.neuroimage.2017.08.005 . Accessed 2022-04-13 OpenUrl CrossRef PubMed [84]. ↵ Bernardo , J.M. , Bayarri , M.J. , Berger , J.O. , Dawid , A.P. , Heckerman , D. , Smith , A.F.M. , West , M. Polson , N.G. , Scott , J.G. : Shrink globally, act locally: Sparse Bayesian regularization and prediction . In: Bernardo , J.M. , Bayarri , M.J. , Berger , J.O. , Dawid , A.P. , Heckerman , D. , Smith , A.F.M. , West , M. (eds.) Bayesian Statistics 9 , pp. 501 – 538 . Oxford University Press, ??? ( 2010 ). doi: 10.1093/acprof:oso/9780199694587.003.0022 OpenUrl CrossRef [85]. ↵ Mitchell , T.J. , Beauchamp , J.J. : Bayesian variable selection in linear regression . Journal of the American Statistical Association 83 ( 404 ), 1023 – 1032 ( 1988 ) doi: 10.1080/01621459.1988.10478694 OpenUrl CrossRef [86]. ↵ George , E.I. , McCulloch , R.E. : Variable selection via Gibbs sampling . Journal of the American Statistical Association 88 ( 423 ), 881 – 889 ( 1993 ) doi: 10.1080/01621459.1993.10476353 OpenUrl CrossRef [87]. ↵ West , M. : On scale mixtures of normal distributions . Biometrika 74 ( 3 ), 646 – 648 ( 1987 ) doi: 10.1093/biomet/74.3.646 OpenUrl CrossRef Web of Science [88]. ↵ Gelman , A. , Jakulin , A. , Pittau , M.G. , Su , Y.-S. : A weakly informative default prior distribution for logistic and other regression models . The Annals of Applied Statistics 2 ( 4 ), 1360 – 1383 ( 2008 ) doi: 10.1214/08-AOAS191 OpenUrl CrossRef Web of Science [89]. ↵ Tikhonov , A.N. : Solution of incorrectly formulated problems and the regularization method . Soviet Mathematics 4 , 1035 – 1038 ( 1963 ) OpenUrl [90]. ↵ Hoerl , A.E. , Kennard , R.W. : Ridge Regression: Biased Estimation for Nonorthogonal Problems . Technometrics 12 ( 1 ), 55 – 67 ( 1970 ) doi: 10.1080/00401706.1970.10488634 . Accessed 2022-04-13 OpenUrl CrossRef [91]. ↵ Frässle , S. , Lomakina , E.I. , Razi , A. , Friston , K.J. , Buhmann , J.M. , Stephan , K.E. : Regression DCM for fMRI . NeuroImage 155 , 406 – 421 ( 2017 ) doi: 10.1016/j.neuroimage.2017.02.090 . Accessed 2022-04-13 OpenUrl CrossRef PubMed [92]. ↵ Penny , W.D. , Stephan , K.E. , Daunizeau , J. , Rosa , M.J. , Friston , K.J. , Schofield , T.M. , Leff , A.P. : Comparing Families of Dynamic Causal Models . PLoS Computational Biology 6 ( 3 ), 1000709 ( 2010 ) doi: 10.1371/journal.pcbi.1000709 . Accessed 2015-10-30 OpenUrl CrossRef [93]. ↵ Aponte , E.A. , Yao , Y. , Raman , S. , Frässle , S. , Heinzle , J. , Penny , W.D. , Stephan , K.E. : An introduction to thermodynamic integration and application to dynamic causal models . Cognitive Neurodynamics 16 ( 1 ), 1 – 15 ( 2022 ) doi: 10.1007/s11571-021-09696-9 . Accessed 2022-04-13 OpenUrl CrossRef PubMed [94]. ↵ Friston , K. , Mattout , J. , Trujillo-Barreto , N. , Ashburner , J. , Penny , W. : Variational free energy and the Laplace approximation . NeuroImage 34 ( 1 ), 220 – 234 ( 2007 ) doi: 10.1016/j.neuroimage.2006.08.035 . Accessed 2015-10-30 OpenUrl CrossRef PubMed Web of Science [95]. ↵ Särkkä , S. : Bayesian Filtering and Smoothing . Institute of Mathematical Statistics textbooks , vol. 3 . Cambridge University Press , Cambridge, U.K.; New York ( 2013 ) [96]. ↵ Anderson , T.W. : An Introduction to Multivariate Statistical Analysis , 3rd edn. Wiley , New York ( 2003 ) [97]. ↵ Glanz , H. , Carvalho , L. : An expectation–maximization algorithm for the matrix normal distribution with an application in remote sensing . Journal of Multivariate Analysis 167 , 31 – 48 ( 2018 ) doi: 10.1016/j.jmva.2018.03.010 . Accessed 2026-01-28 OpenUrl CrossRef [98]. ↵ Ramírez-Hassan , A. : Introduction to Bayesian Econometrics: A Guided Tour Using R . Bookdown, Online ( 2025 ). https://bookdown.org/aramir21/IntroductionBayesianEconometricsGuidedTour/ View the discussion thread. Back to top Previous Next Posted May 18, 2026. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Multivariate Bayesian Inversion for Classification and Regression Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Multivariate Bayesian Inversion for Classification and Regression Joram Soch , Carsten Allefeld bioRxiv 2025.05.09.653015; doi: https://doi.org/10.1101/2025.05.09.653015 Share This Article: Copy Citation Tools Multivariate Bayesian Inversion for Classification and Regression Joram Soch , Carsten Allefeld bioRxiv 2025.05.09.653015; doi: https://doi.org/10.1101/2025.05.09.653015 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neuroscience Subject Areas All Articles Animal Behavior and Cognition (7622) Biochemistry (17648) Bioengineering (13870) Bioinformatics (41880) Biophysics (21423) Cancer Biology (18553) Cell Biology (25458) Clinical Trials (138) Developmental Biology (13364) Ecology (19866) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15589) Genomics (22475) Immunology (17711) Microbiology (40327) Molecular Biology (17145) Neuroscience (88472) Paleontology (666) Pathology (2826) Pharmacology and Toxicology (4815) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-23T02:00:01.238055+00:00
License: CC-BY-ND-4.0