Multi-model ensembles in infectious disease and public health: Methods, interpretation, and implementation in R

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 85,328 characters · extracted from preprint-html · click to expand
Multi-model ensembles in infectious disease and public health: Methods, interpretation, and implementation in R | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Multi-model ensembles in infectious disease and public health: Methods, interpretation, and implementation in R View ORCID Profile Li Shandross , View ORCID Profile Emily Howerton , View ORCID Profile Lucie Contamin , View ORCID Profile Harry Hochheiser , View ORCID Profile Anna Krystalli , Consortium of Infectious Disease Modeling Hubs , View ORCID Profile Nicholas G. Reich , Evan L. Ray doi: https://doi.org/10.1101/2024.06.24.24309416 Li Shandross 1 University of Massachusetts Amherst Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Li Shandross Emily Howerton 2 The Pennsylvania State University Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Emily Howerton For correspondence: ehowerton{at}psu.edu Lucie Contamin 3 University of Pittsburgh Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lucie Contamin Harry Hochheiser 3 University of Pittsburgh Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Harry Hochheiser Anna Krystalli 4 R-RSE SMPC Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Anna Krystalli Nicholas G. Reich 1 University of Massachusetts Amherst Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Nicholas G. Reich Evan L. Ray 1 University of Massachusetts Amherst Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Combining predictions from multiple models into an ensemble is a widely used practice across many fields with demonstrated performance benefits. Popularized through domains such as weather forecasting and climate modeling, multi-model ensembles are becoming increasingly common in public health and biological applications. For example, multi-model outbreak forecasting provides more accurate and reliable information about the timing and burden of infectious disease outbreaks to public health officials and medical practitioners. Yet, understanding and interpreting multi-model ensemble results can be difficult, as there are a diversity of methods proposed in the literature with no clear consensus on which is best. Moreover, a lack of standard, easy-to-use software implementations impedes the generation of multi-model ensembles in practice. To address these challenges, we provide an introduction to the statistical foundations of applied probabilistic forecasting, including the role of multi-model ensembles. We introduce the hubEnsembles package, a flexible framework for ensembling various types of predictions using a range of methods. Finally, we present a tutorial and case-study of ensemble methods using the hubEnsembles package on a subset of real, publicly available data from the FluSight Forecast Hub. 1 Introduction Predictions of future outcomes are essential to planning and decision making, yet generating reliable predictions of the future is challenging. One method for overcoming this challenge is combining predictions across multiple, independent models. These combination methods (also called aggregation or ensembling) have been repeatedly shown to produce predictions that are more accurate 1 , 2 and more consistent 3 than individual models. Because of the clear performance benefits, multi-model ensembles are a widely used statistical tool across fields, including weather forecasting 4 , climate modeling 5 , and economics 6 . In the last decade, the number of multi-model ensemble predictions generated and used in real time for public health planning and response has grown rapidly. In particular, predicting infectious disease outbreaks and anticipating the effects of potential interventions has demonstrated utility for public health officials and medical practitioners. Underlying these predictions are mathematical models that use historical disease incidence data to make probabilistic predictions of incidence in the future 7 – 11 . Given the performance benefits of multi-model ensembles, it is becoming increasingly common to convene multiple modeling teams into a collaborative “hub” 12 , where each team generates independent predictions that are aggregated to collectively produce an ensemble. For example, this approach has been used to make real-time, multi-model predictions for seasonal influenza 10 , 13 , dengue 14 , West Nile virus 15 , and more recently SARS-CoV-2 16 – 18 . Generating multi-model ensembles or interpreting the resulting predictions depends on understanding the underlying statistical methodology. There are many proposed methods for generating ensembles, and these methods differ in at least one of two ways: (1) the function used to combine or “average” predictions, and (2) how predictions are weighted when performing the combination. A few methodological papers have discussed theory of multi-model ensembles and tested various ensembling methods in public health applications specifically 19 – 25 , yet there is no consensus on which method should be favored. There are software packages that support various aspects of multi-model ensembling 26 – 29 . However, these packages only support a subset of methods and prediction types, illustrating the need for standard, easy-to-use implementations of common methods. Here, we provide an introduction to the statistical foundations of multi-model ensembles in applied probabilistic forecasting ( Section 2 ). In addition, to improve accessibility, reproducibility, and interoperability, we have developed a comprehensive R package hubEnsembles that implements these methods. The hubEnsembles package provides a flexible framework for generating ensemble predictions from multiple models across a range of common methods and prediction types; it is situated within the broader “hubverse” collection of open-source software and data tools to facilitate the development and management of collaborative modeling exercises 30 . These two factors together, a simple implementation framework across methods and integration with hubverse data standards and tools, make hubEnsembles accessible and easy to use. Finally, we present a basic demonstration of multi-model ensemble generation and interpretation ( Section 4 ), and a more in-depth analysis using real influenza forecasts ( Section 5 ). Together, these case studies motivate a discussion and comparison of the various methods ( Section 6 ). While the case studies focus on infectious disease applications, the software and tools presented are general and could be used for applications in other areas of biomedical and public health research, or other domains. By reviewing multi-model ensemble methodology and synthesizing these methods into an easy-to use implementation, this tutorial provides guidance on understanding, interpreting, and implementing multi-model ensembles. 2 How to generate a multi-model ensemble In this section, we provide an overview of the process to generate a multi-model ensemble, including basic definitions of key statistical concepts in probabilistic forecasting, and an overview of the classes of methods that are typically used for generating a multi-model ensemble. 2.1 Key statistical concepts in forecasting Generating an ensemble requires multiple predictions to be combined, and a combination method for calculating the ensemble from these predictions ( Figure 1 ). These predictions will often be produced by different statistical or mathematical models, and the output from these models (referred to as “model output” from here on) will vary based on the setting. For example, some public health questions, such as short-term resource allocation, may depend on a forecast of public health outcomes weeks into the future, whereas longer-term decisions about vaccination schedules may require projections months into the future across multiple possible scenarios. Some intervention decisions, such as quarantine and isolation policy, may depend on estimates of key biological parameters such as the generation interval for an infectious pathogen. Throughout, we will use the general term “prediction” to encapsulate all such outcomes that could be modeled. Predictions can also capture varying degrees of uncertainty in the outcome. A point prediction gives a single estimate of an outcome while a probabilistic prediction provides an estimated probability distribution over a set of outcomes. In either case, the basic steps required to generate an ensemble are the same. Download figure Open in new tab Figure 1: Overview of process to generate a multi-model ensemble. Predictions, p i are generated from N independent models (step 1). Then those predictions, p = { p i | i ∈ 1, …, N }, are combined with some function, C , and set of weights, w = { w i | i ∈ 1, …, N }. This figure illustrates example probabilistic forecasts for incident influenza hospitalizations, where the median (line) and 90% prediction interval are shown. In this case, the ensemble is constructed using the linear pool method ( F LOP ( x )}). 2.2 Mathematical definitions and properties of ensemble methods Here, we use N to denote the total number of individual predictions that the ensemble will combine. For example, if predictions are produced by different models, N is the total number of models that have provided predictions. Individual predictions will be indexed by the subscript i . Optionally, one can calculate an ensemble that uses a weight w i for each prediction; we define the set of model-specific weights as w = { w i | i ∈ 1, …, N }. Informally, predictions with a larger weight have a greater influence on the ensemble prediction, though the details of this depend on the ensemble method (described further below). Then, for a set of N point predictions, p = { p i | i ∈ 1, …, N }, each from a distinct model i , an ensemble of these predictions is using any function C and any set of model-specific weights w . For example, an arithmetic average of predictions yields , where the weights are non-negative and sum to 1. If w i = 1/ N for all i , all predictions will be equally weighted. More complex functions for aggregation are also possible, such as a (weighted) median or geometric mean. For probabilistic predictions, there are two commonly used classes of methods to average or ensemble multiple predictions: quantile averaging (also called a Vincent average 31 ) and probability averaging (also called a distributional mixture or linear opinion pool 32 ) 33 . To define these two classes of methods, let F ( x ) be a cumulative density function (CDF) defined over values x of the target variable for the prediction, and F −1 ( θ ) be the corresponding quantile function defined over quantile levels θ ∈ [0, 1]. Throughout this article, we may refer to x as either a ‘value of the target variable’ or a ‘quantile’ depending on the context, and similarly we may refer to θ as either a ‘quantile level’ or a ‘(cumulative) probability’. Additionally, we will use f ( x ) to denote a probability mass function (PMF) for a prediction of a discrete variable or a discretization (such as binned values) of a continuous variable. The quantile average combines a set of quantile functions, , with a given set of weights, w , as This computes the average value of predictions across different models for each fixed quantile level θ . For a normal distribution or any distribution with a location and scale parameter, the resulting quantile average will be the same type of distribution, with location and scale parameters that are the average of the location and scale parameters from the individual distributions ( Figure 2 , panel B). In other words, this method interprets the predictive probability distributions that are being combined as uncertain estimates of a single true distribution. It is also possible to use other combination functions, such as a weighted median, to combine quantile predictions. Download figure Open in new tab Figure 2: (Panel A) Example predictions from two distributions ( N (100, 10) in purple and N (120, 5) in green) shown as cumulative distribution functions (CDFs). To ease submission to a hub, the prediction can be summarized at a fixed number of points along the distribution. Here, the solid points show model output for seven fixed quantile levels ( θ = 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, and 0.99). The y-axis ticks show each of the fixed quantile levels. The associated values for each fixed quantile level are shown with vertical lines. (Panel B) Quantile average ensemble, which is calculated by averaging values for each fixed quantile level (represented by horizontal dashed gray lines). The distributions and corresponding model outputs from panel A are re-plotted and the black line shows the resulting quantile average ensemble. Inset shows corresponding probability density functions (PDFs). (Panel C) Linear pool ensemble, which is calculated by averaging cumulative probabilities for each fixed value (represented by vertical dashed gray lines). The distributions and corresponding model outputs from panel A are re-plotted. To calculate the linear pool in this case, where model outputs are not defined for the same values (i.e., vertical lines in Panel A do not line up), the model outputs are used to interpolate the full CDF for each distribution from which predicted cumulative probabilities can be extracted for fixed values (shown with open circles). The black line shows the resulting linear pool average ensemble. Inset shows corresponding PDFs. The probability average or linear pool is calculated by averaging probabilities across predictions for a fixed value of the target variable, x . In other words, for a set of CDFs ℱ = { F i ( x )| i ∈ 1, …, N } and weights w , the linear pool is calculated as For a set of PMF values, { f i ( x )| i ∈ 1, …, N }, the linear pool can be equivalently calculated as . Statistically this amounts to a mixture of the probability distributions, and the resulting probability distribution can be interpreted as one where the constituent probability distributions represent alternative predictions of the future, each of which has a probability w i of being the true one. For a visual depiction of these equations, see Figure 2 below. The different averaging methods for probabilistic predictions yield different properties of the resulting ensemble distribution. For example, the variance of the linear pool is , where μ i is the mean and is the variance of individual prediction i , and although there is no closed-form variance for the quantile average, the variance of the quantile average will always be less than or equal to that of the linear pool 33 . Both methods generate distributions with the same mean, , which is the mean of individual model means 33 . The linear pool method preserves variation between individual models, whereas the quantile average cancels away this variation under the assumption it constitutes sampling error 24 . 2.3 Applications in public health and infectious disease outbreaks Multi-model ensembles have become the gold standard for forecasting and prediction efforts that support public health in real time 12 , 34 – 36 . One prominent domain is forecasting key characteristics of infectious disease outbreaks, including weekly incidence or healthcare demand over future weeks, disease burden for the entire season, and timing and magnitude of the outbreak peak 14 – 16 , 23 , 37 . Projections of disease outcomes under multiple possible future scenarios have also been used to estimate intervention effectiveness to inform policy 38 – 40 , and it has been proposed to use short-term forecasts of incidence to inform vaccine efficacy trials 41 . Standard guidelines for reporting of prediction efforts in outbreak and public health settings have also been established 42 . Across a variety of pathogens and outbreak settings, multi-model ensembles have been shown to produce forecasts that are as good or better than the individual models that compose the ensemble 10 , 13 , 14 , 16 – 19 , 23 . Notably, the ensemble does not always outperform the best model, but typically offers improved consistency and robustness over individual models 16 , 17 , 20 , 43 . However, in one instance, a baseline historical average of West Nile Virus cases in the US outperformed most model predictions, including the ensemble 15 . Examinations of ensemble methodology in infectious disease contexts suggest there is not one method that universally performs best. In short-term forecasting settings, a simple linear pool average of component predictions tends to produce prediction intervals that are too wide (i.e., suggesting outcomes are more uncertain than in reality); beta-transformation 44 and dynamic weighting 45 have been suggested to mitigate this problem. A median quantile average has been shown to provide similar performance to a weighted mean in short-term forecasting challenges, while also offering robustness to changes in performance across individuals models 25 , and was thus used as the primary ensemble for short-term forecasts of COVID-19 16 , 17 . For longer-term predictions of COVID-19, a trimmed LOP ensemble performed best, as models tended to be more overconfident in this setting 18 . The number of models submitting real-time predictions has varied dramatically (from as few as four models for longer-term predictions of COVID-19 18 to more than 40 for short-term forecasts of COVID-19 16 ). Research including applications across influenza and COVID-19 suggests that at least 3 models are needed, with diminishing returns for every model that is added 46 . The growing body of literature on multi-model ensembles in public health domains emphasizes the utility of these approaches to inform response in real time. Future research on optimizing ensemble performance for different targets and time horizons will further improve utility. Moreover, expanding the use of these methods to other pathogens and countries will enable further methodological development. 3 How to implement ensemble calculations The methods described in Section 2 are implemented via the hubEnsembles package in a flexible, easy-to-use framework. Importantly, hubEnsembles is situated within the broader hubverse software infrastructure, which provides data standards and conventions for representing and working with model predictions 30 , including for example, collecting and manipulating predictions (hubUtils) as well as visualization ( hubVis ). In 2024-2025, the hubverse supported over a dozen collaborative modeling hubs used by public health agencies across the globe. We begin with a short overview of hubverse concepts and conventions that support the process of combining model predictions, supplemented by example predictions provided by the hubverse in hubExamples , then explain the implementation of the two primary ensembling functions included in the package, simple_ensemble() and linear_pool() . 3.1 Terminology and data standards in the hubverse In the hubverse, predictions are always represented in a standardized tabular format called “model output”, codified by the model_out_tbl S3 class in hubUtils (a package of basic utility functions). Each row represents a single, unique prediction while the columns provide information about what is being predicted, its scope, and value. A single model output object can store and organize many predictions while remaining easy to parse at a glance, which is particularly useful when collecting predictions from multiple models to combine into an ensemble. Any tabular predictions can be transformed into model output using the as_model_out_tbl () function from hubUtils (see Section 5 for an example). The model_out_tbl class is defined by four standard types of columns: (i) the model ID, which denotes which model has produced the prediction; (ii) the task IDs (also referred to as “task ID variables” or “task ID columns”), which provide details about what is being predicted; (iii) the model output representation, which specify the type of prediction and other identifying information; and (iv) the value of the prediction itself. While most of these columns are always required and have standardized column names, the task ID variables may vary according to the needs of the modeling hub or modeling task 30 . Table 1 provides an example of model output that stores short-term forecasts of weekly flu hospitalizations for different US states and territories. By reading across the table, we can see that these are quantile predictions ( output_type ) of the quartiles (output type ID: otid ) from a single model (model_id of “team1-mod”) for four distinct forecast horizons. Here, details about the prediction related to modeling task are represented by the task ID variables loc (location abbreviation), ref_date (reference date: the “starting point” of the forecasts), h (horizon: how many weeks into the future, relative to the ref_date ), and target (what is being predicted). View this table: View inline View popup Download powerpoint Table 1: Example of forecasts for incident influenza hospitalizations, formatted according to hubverse standards. Quantile predictions for the median and 50% prediction intervals from a single model are shown for four distinct horizons. The output_type_id column’s name has been shortened to otid for brevity. These predictions are a modified subset of the forecast_outputs data provided by the hubExamples package. As mentioned previously, task ID variables are not fixed in name, number, or composition to incorporate flexibility in the model_out_tbl class. Different modeling efforts may use different sets of task ID columns with different values to define their prediction goals, or may simply choose distinct names to represent the same concept. For example, the date task column was named ref_date above but could easily be called origin_date or forecast_date instead. Some standard examples of task ID variables are available on the hubverse documentation website 30 . The “model output representation” columns output_type and output_type_id contain metadata about how the predictions are conveyed. The hubverse data standards require that these columns are included in a model_out_tbl . The output_type column defines how a prediction is represented and may be “mean” or “median” for point predictions, or one of “quantile” , “cdf” , “pmf” , or “sample” for probabilistic predictions. The output_type_id provides additional identifying information for prediction and is specific to the particular output_type (see Table 2 ). The last column, value, always contains the numeric value of the prediction, regardless of output type. Requirements for the values of the output_type_id and value columns associated with each valid output type are summarized in Table 2 . View this table: View inline View popup Download powerpoint Table 2: A table summarizing how the model output representation columns are used for predictions of different output types; adapted from hubverse documentation 30 . (*Rows of sample predictions from a particular model that share an output type ID value may be assumed to represent a single sample from a joint distribution across multiple levels of the task ID variables.) All output types can summarize predictions from univariate marginal distributions, e.g. for a single location and time point. The sample output type, which represents randomly drawn values from a probabilistic predictive distribution, is unique in that it can additionally represent predictions from joint predictive distributions. This means that samples may encode dependence across combinations of multiple values for task ID variables, for example across multiple locations and/or time points. In this case, rows of sample predictions with the same index (specified by the output_type_id ) from a particular model may be assumed to correspond to a single sample from a joint distribution. For quantile predictions, the output_type_id is a numeric value between 0 and 1 specifying the cumulative probability associated with the quantile prediction. In the notation we defined in Section 2 , the output_type_id corresponds to θ and the value is the quantile prediction F −1 ( θ ). For CDF or PMF predictions, the output_type_id is the target variable value x at which the cumulative distribution function or probability mass function for the predictive distribution should be evaluated, and the value column contains the predicted F ( x ) or f ( x ), respectively. The hubverse also provides standards for target data (i.e., observed data corresponding to each prediction target), which can be stored in one of two formats: target time series or oracle output. The two tabular representations differ in terms of columns and purposes. Target time series data is a more traditional representation of the observed “truth” in a time series format with minimal columns; this format usually serves as calibration data for generating forecasts or might be used for visualization of predictions. Oracle output, on the other hand, represents prediction that an “oracle model” would have made had it known the observed values in advance. This format resembles model output data and is suited for evaluating forecasts. Some examples of target data are given in Section 4. 3.2 Ensemble functions in hubEnsembles The hubEnsembles package contains two functions that perform ensemble calculations: simple_ensemble() , which applies some function to each model prediction, and linear_pool() , which computes an ensemble using the linear opinion pool method. In the following sections, we outline the implementation details for each function and how these implementations correspond to the statistical ensembling methods described in Section 2 . A short description of the calculation performed by each function is summarized by output type in Table 3 . View this table: View inline View popup Download powerpoint Table 3: Summary of ensemble function calculations for each output type. The ensemble function determines the operation that is performed, and in the case of probabilistic output types (quantile, CDF, PMF), this also determines what ensemble distribution is generated (quantile average, , or linear pool, F LOP ( x )). The ensembled predictions are returned in the same output type as the inputs. Thus, the output type determines how the resulting ensemble distribution is summarized (as a quantile, F −1 ( θ ), cumulative probability, F ( x ), or probability f ( x )). Estimating individual model cumulative probabilities is required to compute a linear_pool() for predictions of quantile output type; see Section 3.2.2 on the linear pool operation for details. In the case of simple_ensemble() , we show the calculations for the default case where agg_fun = mean; however, if another aggregation function is chosen (e.g., agg_fun = median ), that calculation would be performed instead. For example, simple_ensemble(…, agg_fun = median) applied to predictions of mean output type would return the median of individual model means. 3.2.1 Simple ensemble The simple_ensemble() function directly computes an ensemble from component model outputs by combining them via an aggregation function ( C ) within each unique combination of task ID variables, output types, and output type IDs. This function can be used to summarize predictions of output types mean, median, quantile, CDF, and PMF. The mechanics of the ensemble calculations are the same for each of the output types, though the resulting statistical ensembling method differs for different output types ( Table 3 ). By default, simple_ensemble() uses the mean for the aggregation function C and equal weights for all models. For point predictions with a mean or median output type, the resulting ensemble prediction is an equally weighted average of the individual models’ predictions. For probabilistic predictions in a quantile format, by default simple_ensemble() calculates an equally weighted average of individual model target variable values at each quantile level, which is equivalent to a quantile average. For model outputs in a CDF or PMF format, by default simple_ensemble() computes an equally weighted average of individual model (cumulative or bin) probabilities at each target variable value, which is equivalent to the linear pool method. Any aggregation function C may be specified by the user. For example, a median ensemble may also be created by specifying median as the aggregation function, or a custom function may be passed to the agg_fun argument to create other ensemble types. Similarly, model weights can be specified to create a weighted ensemble. 3.2.2 Linear pool The linear_pool() function implements the linear opinion pool (LOP) method for ensembling predictions. Currently, this function can be used to combine predictions with output types mean, quantile, CDF, PMF, and sample. Unlike simple_ensemble() , this function handles its computation differently based on the output type. For the CDF, PMF, and mean output types, the linear pool method is equivalent to calling simple_ensemble() with a mean aggregation function (see Table 3 ), since simple_ensemble() produces a linear pool prediction (an average of individual model cumulative or bin probabilities). For the sample output type, the LOP method collects a stratified draw of the individual models’ predictions and pools them into a single ensemble distribution. By default, all samples are used to create this ensemble. Additionally, only equally-weighted linear pools of samples are supported by the hubEnsembles package during this time. Samples may also be converted to another common output type such as quantiles or bin probabilities (as the main scientific interest often concerns a summary of samples), and other ensemble methods may then be utilized for that output type. For the quantile output type, implementation of LOP is comparatively less straightforward. This is because LOP averages cumulative probabilities at each value of the target variable, but the predictions are given as quantiles (on the scale of the target variable) for fixed quantile levels. The value for these quantile predictions will generally differ between models; hence, we are typically not provided cumulative probabilities at the same values of the target variable for all component predictions. This lack of alignment between cumulative probabilities for the same target variable values impedes computation of LOP from quantile predictions and is illustrated in panel A of Figure 2 . Given that LOP cannot be directly calculated from quantile predictions, we must first obtain an estimate of the CDF for each component distribution from the provided quantiles, combine the CDFs, then calculate the quantiles using the ensemble’s CDF. We perform this calculation in three main steps, assisted by the distfromq package 47 for the first two: Interpolate and extrapolate from the provided quantiles for each component model to obtain an estimate of the CDF of that particular distribution. Draw samples from each component model distribution. To reduce Monte Carlo variability, we use quasi-random samples corresponding to quantiles of the estimated distribution 48 . Pool the samples from all component models and extract the desired quantiles. For step 1, functionality in the distfromq package uses a monotonic cubic spline for interpolation on the interior of the provided quantiles. The user may choose one of several distributions to perform extrapolation of the CDF tails. These include normal, lognormal, and Cauchy distributions, with “normal” set as the default. A location-scale parameterization is used, with separate location and scale parameters chosen in the lower and upper tails so as to match the two most extreme quantiles. The sampling process described in steps 2 and 3 approximates the linear pool calculation described in Section 2 . 4 A simple demonstration of multi-model ensembles In this section, we provide a simple example to illustrate how to compute a multi-model ensemble and compare the methods supported by the functions of hubEnsembles . In doing so, we use a number of other packages available through the hubverse, including to access example data and to visualize outputs. See the Code Availability Statement for details about implementation and required package versions. 4.1 Example data: a forecast hub The first step in generating a multi-model ensemble is to gather the predictions we wish to combine. In this example, we use some short-term forecasts already formatted as model output data from the hubExamples package. These model outputs are from a larger example modeling hub, created using a modified subset of predictions from the FluSight Forecasting challenge (discussed in further detail in Section 5 ). In addition to toy model output data, the example hub also includes observed data in the form of target time series data and oracle output. The model output is stored in a data object named forecast_outputs and contains mean, median, quantile, and sample forecasts of future incident influenza hospitalizations, as well as CDF and PMF forecasts of hospitalization intensity. Each prediction is described by five task ID variables: the location for which the forecast was made ( location ), the date on which the forecast was made ( reference_date ), the number of steps ahead ( horizon ), the date of the forecast prediction ( target_end_date , a combination of the date the forecast was made and the forecast horizon), and the forecast target ( target ). We begin by examining the predictions for weekly incident influenza hospitalizations, displaying a subset of each output type in Table 4 . View this table: View inline View popup Download powerpoint Table 4: Example model output for forecasts of weekly incident influenza hospitalizations. A subset of example model output is shown: 1-week ahead forecasts made on 2022-12-17 for Massachusetts from three distinct models; only the mean, median, select samples, and the 5th, 25th, 75th and 95th quantiles are displayed. The location, reference_date and target_end_date columns have been omitted for brevity. This example data is provided in the hubExamples package. View this table: View inline View popup Table 5: Example target time series data for incident influenza hospitalizations. This table includes target time series data for Massachusetts (FIPS code 25) between 2022-11-01 and 2023-02-01. The target data is provided in the hubExamples package. While the hubExamples package provides both formats of target data, we focus on the target time series data ( Table 5 ) which is convenient for making forecasts and plotting. The forecast_target_ts data object provides observed incident influenza hospitalizations in a given week for given location using columns observation, date , and location . The forecast-specific task ID variables reference_date and horizon are not relevant for this time series representation of the target data, and are thus not included as columns. We can plot the quantile and median forecasts and the target time series data ( Figure 3 ) shown above using the plot_step_ahead_model_output() function from hubVis , another package in the hubverse suite for visualizing model outputs. We subset the model output data and the target data to the location and time horizons we are interested in. Download figure Open in new tab Figure 3: One example set of quantile forecasts for weekly incident influenza hospitalizations in Massachusetts from each of three models (panels). Forecasts are represented by a median (line), 50% and 90% prediction intervals (ribbons). Gray points represent observed incident hospitalizations. Download figure Open in new tab Next, we examine the PMF forecasts for hospitalization intensity in the example model output data. For this target, teams forecasted the probability that hospitalization intensity will be “low”, “moderate”, “high”, or “very high”. These four categories are determined by thresholds for weekly hospital admissions per 100,000 population. In other words, “low” hospitalization intensity in a given week means few incident influenza hospitalizations per 100,000 population are predicted, whereas “very high” hospitalization intensity means many hospitalizations per 100,000 population are predicted. These forecasts are made for the same task ID variables as the quantile forecasts of incident hospitalizations except for the target, which is “wk flu hosp rate category” for these categorical predictions. We show a representative example of the hospitalization intensity category forecasts in Table 6 . Because these forecasts are PMF output type, the output_type_id column specifies the bin of hospitalization intensity and the value column provides the forecasted probability of hospitalization incidence being in that category. Values sum to 1 across bins. For the MOBS-GLEAM_FLUH and PSI-DICE models, incidence is forecasted to decrease over the horizon ( Figure 3 ), and correspondingly, there is lower probability of “high” and “very high” hospitalization intensity for later horizons ( Figure 4 ). View this table: View inline View popup Table 6: Example PMF model output for forecasts of incident influenza hospitalization intensity. A subset of predictions are shown: 1-week ahead PMF forecasts made on 2022-12-17 for Massachusetts from three distinct models. We round the forecasted probability (in the value column) to two digits. The location, reference_date and target_end_date columns have been omitted for brevity. This example data is provided in the hubExamples package. Download figure Open in new tab Figure 4: One example PMF forecast of incident influenza hospitalization intensity is shown for each of three models (panels). Each cell shows the forecasted probability of a given hospitalization intensity bin (low, moderate, high, and very high) for each forecast horizon (0-3 weeks ahead). Darker colors indicate higher forecasted probability. 4.2 Creating ensembles with simple_ensemble Using the default options for simple_ensemble() , we can generate an equally weighted mean ensemble for each unique combination of values for the task ID variables, the output_type and the output_type_id . Recall that this function corresponds to different statistical ensemble methods for different output types: for the quantile output type in our example data, the resulting ensemble is a quantile average, while for the CDF and PMF output types, the ensemble is a linear pool ( Table 3 ). Download figure Open in new tab The resulting model output has the same structure as the original model output data ( Table 7 ), with columns for model ID, task ID variables, output type, output type ID, and value. We also use model_id = “simple-ensemble-mean” to change the name of this ensemble in the resulting model output; if not specified, the default is “hub-ensemble”. View this table: View inline View popup Download powerpoint Table 7: Mean ensemble model output. The values in the model_id column are set by the argument simple_ensemble(…, model_id) . Results are generated for all output types, but only a subset are shown: 1-week ahead forecasts made on 2022-12-17 for Massachusetts, with only the mean, median, 25th and 75th quantilesfor the quantile output type and all bins for the PMF output type. The location, reference_date and target_end_date columns have been omitted for brevity, and the value column is rounded to two digits. 4.2.1 Changing the aggregation function We can change the function that is used to aggregate model outputs. For example, we may want to calculate a median of the component models’ submitted values for each quantile. We do so by specifying agg_fun = median . Download figure Open in new tab Custom functions can also be passed into the agg_fun argument. We illustrate this by defining a custom function to compute the ensemble prediction as a geometric mean of the component model predictions. Any custom function to be used must have an argument x for the vector of numeric values to summarize, and if relevant, an argument w of numeric weights. Download figure Open in new tab As expected, the mean, median, and geometric mean each give us slightly different resulting ensembles. The median point estimates, 50% prediction intervals, and 90% prediction intervals in Figure 5 demonstrate this. Download figure Open in new tab Figure 5: Three different ensembles for weekly incident influenza hospitalizations in Massachusetts. Each ensemble combines individual predictions from the example hub ( Figure 3 ) using a different method: arithmetic mean, geometric mean, or median. All methods correspond to variations of the quantile average approach. Ensembles are represented by a median (line), 50% and 90% prediction intervals (ribbons). Geometric mean ensemble and simple mean ensemble generate similar estimates in this case. 4.2.2 Weighting model contributions We can weight the contributions of each model in the ensemble using the weights argument of simple_ensemble() . This argument takes a data.frame that should include a model_id column containing each unique model ID and a weight column. In the following example, we include the baseline model in the ensemble, but give it less weight than the other forecasts. Download figure Open in new tab Download figure Open in new tab 4.3 Creating ensembles with linear_pool We can also generate a linear pool ensemble, or distributional mixture, using the linear_pool() function; this function can be applied to predictions with an output_type of mean, quantile, sample, CDF, or PMF. Our example hub includes the median output type, so we exclude it from the calculation. Download figure Open in new tab As described above, for quantile model outputs, the linear_pool function approximates the full probability distribution for each component prediction using the value-quantile pairs provided by that model, and then obtains quasi-random samples from that distributional estimate. The number of samples drawn from the distribution of each component model defaults to 1e4 , but this can be changed using the n_samples argument. In Figure 6 , we compare ensemble results generated by simple_ensemble() and linear_pool() for model outputs of output types PMF and quantile. As expected, the results from the two functions are equivalent for the PMF output type: for this output type, the simple_ensemble() method averages the predicted probability of each category across the component models, which is the definition of the linear pool ensemble method. This is not the case for the quantile output type, because the simple_ensemble() is computing a quantile average. Download figure Open in new tab Figure 6: Comparison of results from linear_pool() (blue) and simple_ensemble() (red). (Panel A) Ensemble predictions of Massachusetts incident influenza hospitalization intensity (classified as low, moderate, high, or very high), which provide an example of PMF output type. (Panel B) Ensemble predictions of weekly incident influenza hospitalizations in Massachusetts, which provide an example of quantile output type. Note, for quantile output type, simple_ensemble() corresponds to a quantile average. Ensembles combine individual models from the example hub, and are represented by a median (line), 50% and 90% prediction intervals (ribbons) ( Figure 3 ). 4.3.1 Weighting model contributions Like with simple_ensemble() , we can change the default function settings. For example, weights that determine a model’s contribution to the resulting ensemble may be provided. (Note that we must exclude the sample output type here because it cannot yet be combined into weighted ensembles.) Download figure Open in new tab 4.3.2 Changing the parametric family used for extrapolation into distribution tails When requesting a linear pool of quantiles, we may also change the distribution that distfromq uses to approximate the tails of component models’ predictive distributions to either log normal or Cauchy using the tail_dist argument (the default is normal) 47 . This choice usually does not have a large impact on the resulting ensemble distribution, though, and can only be seen in its outer edges. (For more details and other function options, see the documentation in the distfromq package at https://reichlab.io/distfromq/ .) Download figure Open in new tab 4.3.3 Requesting a subset of input sample predictions to be ensembled Recall that for the sample output type, linear_pool() defaults to creating an equally-weighted ensemble by collecting and returning all provided sample predictions, so that the total number of samples for the ensemble is equal to the sum of the number of samples from all individual models. To change this behavior, the user may instead specify a number of sample predictions for the ensemble to return using the n_output_samples argument. Then, a random subset of predictions from individual models will be selected to construct a LOP of samples so that all component models are represented equally. This random selection of samples is stratified by model to ensure approximately the same number of samples from each individual model is included in the ensemble. When requesting a linear pool composed of a subset of the input sample predictions, the user must identify the task ID variables which together identify a single modeled unit. This group of independent task ID variables is called the compound task ID set and is specified using the compound_taskid_set parameter to ensure the subsetting of sample predictions is performed correctly. Samples summarizing a marginal distribution will have a compound task ID set made up of all the task ID variables. On the other hand, samples summarizing a joint distribution will have a compound task ID set that contains only task ID variables for which the joint distribution does not capture dependence. For example, if a joint distribution is estimated across multiple forecast horizons separately for each location, location would be included in the compound task ID set but horizon would not. Derived task IDs are another subgroup of task ID variables that must be specified in a call to linear_pool() for a subsetted sample ensemble; their values are derived from a combination of the values from other task ID variables (which may or may not be part of the compound task ID set). A common example of a derived task ID variable is the target date for a prediction, which is a deterministic function of the reference date of the prediction and the prediction horizon. Generally, the derived task IDs won’t be included in the compound task ID set because they are not needed to identify a single modeled unit for an outcome of interest, unless all of the task ID variables their values depend on are already a part of the compound task ID set. Not all model outputs will contain derived task IDs, in which case the argument may be set to NULL (the default value). However, it is important to provide the linear_pool() function with any derived task IDs when calculating an ensemble of (subsetted) samples, as they are used to check that the provided compound task ID set is compatible with the input predictions and the resulting LOP is valid. Download figure Open in new tab 5 Example: in-depth analysis of forecast data To further demonstrate the differences between the two ensemble functions and the utility of the hubEnsembles package, we provide a more complex example that walks through the full process of generating multi-model ensembles. This case study gathers real forecasts collected by a modeling hub to create four equally-weighted ensembles, then evaluates their performance to determine the best approach for the application. The predictions we use to create the ensemble models are sourced from two seasons of the FluSight forecasting challenge. Since 2013, the US Centers for Disease Control and Prevention (CDC) has been soliciting short-term forecasts of seasonal influenza from modeling teams through this collaborative challenge 49 . Using simple_ensemble() and linear_pool() , we build four equally-weighted, multi-model ensembles to predict weekly influenza hospitalizations: a quantile (arithmetic) mean, a quantile median, a linear pool with normal tails, and a linear pool with lognormal tails. Then, we compare the resulting ensembles’ performance through plotting and scoring their forecasts. Only a select portion of the code used in this analysis is shown for brevity, but all the functions and scripts used to generate the case study results can be found in the associated GitHub repository ( https://github.com/hubverse-org/hubEnsemblesManuscript ). In particular, the figures and tables supporting this analysis are generated reproducibly using data from .rds files stored in the analysis/data/raw-data directory and scripts in the inst directory of the repository. 5.1 Data and Methods We collect the predictions used to generate the four ensembles by querying them from Zoltar 50 , a repository designed to archive forecasts created by the Reich Lab at UMass Amherst. For this analysis we only consider FluSight forecasts in a quantile format from the 2021-2022 and 2022-2023 influenza seasons. These quantile forecasts are stored in two data objects, split by season, called flu_forecasts-zoltar_21-22.rds and flu_forecasts-zoltar_22-23.rds , which are then joined together into a single data frame. A subset is shown below in Table 8 . View this table: View inline View popup Download powerpoint Table 8: An example prediction of weekly incident influenza hospitalizations pulled directly from Zoltar. The example forecasts were made on May 15, 2023 for California at the 1 week ahead horizon. The forecasts were generated during the FluSight forecasting challenge, then formatted according to Zoltar standards for storage. The timezero, season, unit, param1, param2 , and param3 columns have been omitted for brevity. (The season column has a value of ‘2021-2022’ or ‘2022-2023’ while the last three ‘param’ columns always have a value of NA.) Download figure Open in new tab Download figure Open in new tab Although these forecasts are in a tabular format, they are not model_out_tbl objects and thus cannot yet be fed into either of the hubEnsembles functions. Thus, we must use the as_model_out_tbl() 1 function from hubUtils to transform the raw forecasts so that they conform to hubverse standards. Below, we specify the appropriate column mappings in the call with task ID variables of forecast_date (when the forecast was made), location, horizon, and target . Download figure Open in new tab To ensure the quantile mean and median ensemble had consistent component forecast make-up at every quantile level, we only included predictions (defined by a unique combination of task ID variables) that contained all 23 quantiles specified by FluSight ( θ ∈ {.010, 0.025, .050, .100, …, .900, .950, .990}). This requirement required no further action on our part, since it was consistent with FluSight submission guidelines. However, we did remove the baseline and median ensemble models generated by the FluSight hub from the component forecasts, a choice motivated by the desire to match the composition of models in the official FluSight ensemble. Download figure Open in new tab With these inclusion criteria, the final data set of component forecasts consists of predictions from 25 modeling teams and 42 distinct models, 53 forecast dates (one per week), 54 US locations, 4 horizons, 1 target, and 23 quantiles. In the 2021-2022 season, 25 models made predictions for 22 weeks spanning from late January 2022 to late June 2022, and in the 2022-2023 season, there were 31 models making predictions for 31 weeks spanning mid-October 2022 to mid-May 2023. Fourteen of the 42 total models made forecasts for both seasons. Locations consist of the 50 US states, Washington DC, Puerto Rico, the Virgin Islands, and the entire US; horizons 1 to 4 weeks ahead, quantiles the 23 described above, and target week ahead incident flu hospitalization. The values for the forecasts are always non-negative. In Table 9 , we provide an example of these predictions, showing select quantiles from a single model, forecast date, horizon, and location. View this table: View inline View popup Download powerpoint Table 9: An example prediction of weekly incident influenza hospitalizations. The example model output was made on May 15, 2023 for California at the 1 week ahead horizon. The forecast was generated during the FluSight forecasting challenge, then formatted according to hubverse standards post hoc. The location, forecast_date, and season columns have been omitted for brevity; quantiles representing the endpoints of the central 50%, 80% and 95% prediction intervals are shown. Next, we can combine the predictions into a single model_out_tbl object used to generate forecasts for each ensemble method. Then, we call the appropriate function hubEnsembles to generate predictions for each equally-weighted ensemble, storing the results in four separate objects of model output data. Download figure Open in new tab Download figure Open in new tab We then evaluate the performance of the ensembles using scoring metrics that measure the accuracy and calibration of their forecasts. We chose several common metrics in forecast evaluation, including mean absolute error (MAE), weighted interval score (WIS) 51 , 50% prediction interval (PI) coverage, and 95% PI coverage. MAE measures the average absolute error of a set of point forecasts; smaller values of MAE indicate better forecast accuracy. WIS is a generalization of MAE for probabilistic forecasts and is an alternative to other common proper scoring rules which cannot be evaluated directly for quantile forecasts 51 . WIS is made up of three component penalties: (1) for over-prediction, (2) for under-prediction, and (3) for the spread of each interval (where an interval is defined by a symmetric set of two quantiles). This metric is a weighted sum of these penalties across all prediction intervals provided. A lower WIS value indicates a more accurate forecast 51 . PI coverage provides information about whether a forecast has accurately characterized its uncertainty about future observations. The 50% PI coverage rate measures the proportion of the time that 50% prediction intervals at that nominal level included the observed value; the 95% PI coverage rate is defined similarly. Achieving approximately nominal (50% or 95%) coverage indicates a well-calibrated forecast. We also use relative versions of WIS and MAE (rWIS and rMAE, respectively) to understand how the ensemble performance compares to that of the FluSight baseline model. These metrics are calculated as where model m is any given model being compared against the baseline. For both of these metrics, a value less than one indicates better performance compared to the baseline while a value greater than one indicates worse performance. By definition, the FluSight baseline itself will always have a value of one for both of these metrics. Each unique prediction from an ensemble model is scored against target data using the score_forecasts() 2 function from the covidHubUtils package, made for scoring tabular infectious disease data with commonly used evaluation metrics including those mentioned above. We use median forecasts taken from the 0.5 quantile for the MAE evaluation. 5.2 Performance results across ensembles The quantile median ensemble has the best overall performance in terms of WIS and MAE (and the relative versions of these metrics), and has coverage rates that were close to the nominal levels ( Table 10 ). The two linear opinion pools have very similar performance to each other. These methods have the second-best performance as measured by WIS and MAE, but they have the highest 50% and 95% coverage rates, with empirical coverage that was well above the nominal coverage rate. The quantile mean performs the worst of the ensembles with the highest MAE, which is substantially different from that of the other ensembles. View this table: View inline View popup Download powerpoint Table 10: Summary of overall model performance across both seasons, averaged over all locations except the US national location and sorted by ascending WIS. The quantile median ensemble has the best value for every metric except 50% coverage rate, though metric values are often quite similar among the models. Plots of the models’ forecasts can aid our understanding about the origin of these accuracy differences. For example, the linear opinion pools consistently have some of the widest prediction intervals, and consequently the highest coverage rates. The median ensemble, which has the best WIS, balanced interval width with calibration best overall, with narrower intervals than the linear pools that still achieved near-nominal coverage on average across all time points. The quantile mean’s interval widths vary, though it usually has narrower intervals than the linear pools. However, this model’s point forecasts have a larger error margin compared to the other ensembles, especially at longer horizons. This pattern is demonstrated in Figure 7 for the 4-week ahead forecast in California following the 2022-23 season peak on December 5, 2022. Here, the quantile mean predicted a continued increase in hospitalizations, at a steeper slope than the other ensemble methods. Download figure Open in new tab Figure 7: One to four week ahead forecasts for select dates plotted against target data for California. The first panel shows all models on the same scale. All other panels show forecasts for each individual model, with varying y-axis scales, and their prediction accuracy as compared to observed influenza hospitalizations. Averaging across all time points, the median ensemble has the best scores for every metric. (Note that we only show plots of WIS vs forecast date, faceted by season and horizon, due to similar trends being observed with MAE and 95% prediction interval coverage). The quantile median outperforms the mean ensemble by a similar amount for both WIS and MAE, particularly around local times of change (see Figure 8 ). The median ensemble also has better coverage rates than the mean ensemble in the tails of the distribution (95% intervals) and similar coverage in the center (50% intervals). The median model also outperforms the linear pools for most weeks, with the greatest differences in scores being for WIS and coverage rates ( Figure 8 ). This seems to indicate that the linear pools’ estimates are usually too conservative, with their wide intervals and higher-than-nominal coverage rates being penalized by WIS. However, during the 2022-2023 season there are several localized times when the linear pools showcased better one-week-ahead forecasts than the median ensemble ( Figure 8 ). These localized instances are characterized by similar MAE values for the two methods and poor median ensemble coverage rates. In these instances, the wide intervals from the linear pools were useful in capturing the eventually-observed hospitalizations, usually during times of rapid change. Download figure Open in new tab Figure 8: Weighted interval score (WIS) averaged across all locations. Average target data across all locations for 2021-2022 (A) and 2022-2023 (B) seasons for reference. For each season, average WIS is shown for 1-week (C-D) and 4-week ahead (E-F) forecasts. Results are plotted for each ensemble model (colors) across the entire season. Lower values indicate better performance. All of the ensemble variations outperform the baseline model in this analysis, led by the quantile median which had the best scores overall for WIS, MAE, 50% PI coverage, and 95% PI coverage. However, other models sometimes demonstrated better performance, especially around the season’s peak. These results seem to be consistent with previous findings: linear pools produce intervals generally too wide for a short-term forecasting, except under conditions with greater levels of uncertainty or where the individual models contributing to the ensemble are poorly calibrated. Thus, while the quantile median offers consistency and robustness, there may be certain influenza seasons in which other ensemble methods display better performance. The choice of an appropriate ensemble aggregation method may depend on the forecast target, the goal of forecasting, and the behavior of the individual models contributing to an ensemble. One case may call for prioritizing high coverage rates while another may prioritize accurate point forecasts. The simple_ensemble() and linear_pool() functions and the ability to specify component model weights and an aggregation function for simple_ensemble() allow users to implement a variety of ensemble methods. 6 Summary and discussion Ensembles of independent models are a powerful tool to generate more accurate and more reliable predictions of future outcomes than a single model alone. Here, we have provided an overview of multi-model ensemble methodology with the goal of improving use of ensemble results in public health and biomedical research settings, as well as other domains that use probabilistic forecasting. Moreover, we have demonstrated how to utilize hubEnsembles , a simple and flexible framework to combine individual model predictions into an ensemble. Multi-model ensembles are becoming the gold standard for prediction exercises in the public health domain. Collaborative modeling hubs can serve as a centralized entity to guide and elicit predictions from multiple independent models, as well as to generate and communicate ensemble results 12 , 36 . Given the increasing popularity of multi-model ensembles and collaborative hubs, there is a clear need for generalized data standards and software infrastructure to support these hubs. By addressing this need, the hubverse suite of tools can reduce duplicative efforts across existing hubs, support other communities engaged in collaborative efforts, and enable the adoption of multi-model approaches in new domains. When generating and interpreting an ensemble prediction, it is important to understand the methods underlying the ensemble, as methodological choices can have meaningful effects on the resulting ensemble and its performance. Although there may not be a universal “best” method, matching the properties of a given ensemble method with the features of the component models will likely yield best results 24 . Our case study on seasonal influenza forecasts in the US demonstrates this point. The quantile median ensemble performs best overall for a range of metrics, including weighted interval score, mean absolute error, and prediction interval coverage. Yet, the other ensemble methods we tested also showcased a clear improvement over the baseline model and even outperformed the quantile median for certain weeks, particularly during periods of rapid change when outlying component forecasts are likely more important. The accuracy gains from ensemble models motivate the use of a “hub-based” approach to prediction for infectious diseases, public health, and in other fields. Data Availability All code necessary to reproduce this manuscript can be found in a GitHub repository available at https://github.com/hubverse-org/hubEnsemblesManuscript , including required package versions which are available in the DESCRIPTION file. https://github.com/hubverse-org/hubEnsemblesManuscript https://github.com/hubverse-org/hubEnsembles Code availability All code necessary to reproduce this manuscript can be found in a GitHub repository available at https://github.com/hubverse-org/hubEnsemblesManuscript , including required package versions which are available in the DESCRIPTION file. Consortium of Infectious Disease Modeling Hubs Consortium of Infectious Disease Modeling Hubs authors include Alvaro J. Castro Rivadeneira (University of Massachusetts Amherst), Lucie Contamin (University of Pittsburgh), Sebastian Funk (London School of Hygiene & Tropical Medicine), Aaron Gerding (University of Massachusetts Amherst), Hugo Gruson ( data.org ), Harry Hochheiser (University of Pittsburgh), Emily Howerton (The Pennsylvania State University), Melissa Kerr (University of Massachusetts Amherst), Anna Krystalli (R-RSE SMPC), Sara L. Loo (Johns Hopkins University), Evan L. Ray (University of Massachusetts Amherst), Nicholas G. Reich (University of Massachusetts Amherst), Koji Sato (Johns Hopkins University), Li Shandross (University of Massachusetts Amherst), Katharine Sherratt (London School of Hygene and Tropical Medicine), Shaun Truelove (Johns Hopkins University), Martha Zorn (University of Massachusetts Amherst) Acknowledgements The authors thank all members of the hubverse community; the broader hubverse software infrastructure made this package possible. L. Shandross, A. Krystalli, N. G. Reich, and E. L. Ray were supported by the National Institutes of General Medical Sciences (R35GM119582) and the US Centers for Disease Control and Prevention (U01IP001122 and NU38FT000008). E. Howerton was supported by NSF RAPID awards DEB-2126278 and DEB-2220903, as well as the Eberly College of Science Barbara McClintock Science Achievement Graduate Scholarship in Biology at the Pennsylvania State University. L. Contamin and H. Hochheiser were supported by NIGMS grants U24GM132013 and R24GM153920. The content is solely the responsibility of the authors and does not necessarily represent the official views of NIGMS, the National Institutes of Health, or CDC. Footnotes ↵ 5 A list of authors and affiliations appears at the end of the paper The manuscript has been revised to serve as a tutorial piece, rather than a software manuscript. This includes adding an overview figure (new Figure 1) and a new section (2.3) that describes the application of multi-model ensembles in public health. The other major components of the manuscript remain largely unchanged, with edits for flow and consistency. ↵ 1 https://hubverse-org.github.io/hubUtils/reference/as_model_out_tbl.html ↵ 2 https://reichlab.io/covidHubUtils/reference/score_forecasts.html 3 https://reichlab.io/covidHubUtils/reference/score_forecasts.html References 1. ↵ Clemen RT . Combining forecasts: A review and annotated bibliography . International Journal of Forecasting . 1989 ; 5 ( 4 ): 559 – 583 . doi: 10.1016/0169-2070(89)90012-5 OpenUrl CrossRef 2. ↵ Timmermann A. Chapter 4 Forecast Combinations . In: Vol 1 . Elsevier ; 2006 : 135 – 196 . doi: 10.1016/S1574-0706(05)01004-9 OpenUrl CrossRef 3. ↵ Hibon M , Evgeniou T. To combine or not to combine: selecting among forecasts and their combinations . International Journal of Forecasting . 2005 ; 21 ( 1 ): 15 – 24 . doi: 10.1016/j.ijforecast.2004.05.002 OpenUrl CrossRef 4. ↵ Alley RB , Emanuel KA , Zhang F. Advances in weather prediction . Science . 2019 ; 363 ( 6425 ): 342 – 344 . doi: 10.1126/science.aav7274 OpenUrl Abstract / FREE Full Text 5. ↵ Tebaldi C , Knutti R. The use of the multi-model ensemble in probabilistic climate pro-jections . Philosophical Transactions: Mathematical, Physical and Engineering Sciences . 2007 ; 365 ( 1857 ): 2053 – 2075 . doi: 10.1098/rsta.2007.2076 OpenUrl CrossRef PubMed Web of Science 6. ↵ Aastveit KA , Mitchell J , Ravazzolo F , Dijk HK van . The Evolution of Forecast Density Combinations in Economics . Tinbergen Institute Discussion Papers . Published online 2018. https://hdl.handle.net/10419/185588 7. ↵ Shaman J , Kandula S. Improved Discrimination of Influenza Fore-cast Accuracy Using Consecutive Predictions . PLoS Currents . 2015 ; 7 : ecurrents.outbreaks.8a6a3df285af7ca973fab4b22e10911e. doi: 10.1371/currents.outbreaks.8a6a3df285af7ca973fab4b22e10911e OpenUrl CrossRef 8. Held L , Meyer S , Bracher J. Probabilistic forecasting in infectious disease epidemiology: The 13th Armitage lecture . Statistics in Medicine . 2017 ; 36 ( 22 ): 3443 – 3460 . doi: 10.1002/sim.7363 OpenUrl CrossRef PubMed 9. Ray EL , Sakrejda K , Lauer SA , Johansson MA , Reich NG . Infectious disease prediction with kernel conditional density estimation . Statistics in Medicine . 2017 ; 36 ( 30 ): 4908 – 4929 . doi: 10.1002/sim.7488 OpenUrl CrossRef PubMed 10. ↵ Reich NG , McGowan CJ , Yamana TK , et al. Accuracy of real-time multi-model ensemble forecasts for seasonal influenza in the U.S . PLOS computational biology . 2019 ; 15 ( 11 ): e1007486 . doi: 10.1371/journal.pcbi.1007486 OpenUrl CrossRef 11. ↵ Rodríguez A , Kamarthi H , Agarwal P , et al. Machine learning for data-centric epidemic fore-casting . Nature Machine Intelligence . 2024 ; 6 . doi: 10.1038/s42256-024-00895-7 OpenUrl CrossRef 12. ↵ Reich NG , Lessler J , Funk S , et al. Collaborative hubs: Making the most of pre-dictive epidemic modeling . American Journal of Public Health . 2022 ; 112 ( 6 ): 839 – 842 . doi: 10.2105/AJPH.2022.306831 OpenUrl CrossRef PubMed 13. ↵ McGowan CJ , Biggerstaff M , Johansson M , et al. Collaborative efforts to forecast seasonal influenza in the United States, 2015–2016 . Scientific Reports . 2019 ; 9 ( 1 ): 683 . doi: 10.1038/s41598-018-36361-9 OpenUrl CrossRef PubMed 14. ↵ Johansson MA , Apfeldorf KM , Dobson S , et al. An open challenge to advance proba-bilistic forecasting for dengue epidemics . Proceedings of the National Academy of Sciences . 2019 ; 116 ( 48 ): 24268 – 24274 . doi: 10.1073/pnas.1909865116 OpenUrl Abstract / FREE Full Text 15. ↵ Holcomb KM , Mathis S , Staples JE , et al. Evaluation of an open forecasting challenge to assess skill of West Nile virus neuroinvasive disease prediction . Parasites & Vectors . 2023 ; 16 ( 1 ): 11 . doi: 10.1186/s13071-022-05630-y OpenUrl CrossRef PubMed 16. ↵ Cramer EY , Ray EL , Lopez VK , et al. Evaluation of individual and ensemble probabilistic forecasts of COVID-19 mortality in the united states . Proceedings of the National Academy of Sciences . 2022 ; 119 ( 15 ): e2113561119 . doi: 10.1073/pnas.2113561119 OpenUrl CrossRef PubMed 17. ↵ Sherratt K , Gruson H , Grah R , et al. Predictive performance of multi-model ensemble forecasts of COVID-19 across European nations . eLife . 2023 ; 12 : e81916 . doi: 10.7554/eLife.81916 OpenUrl CrossRef PubMed 18. ↵ Howerton E , Contamin L , Mullany LC , et al. Evaluation of the US COVID-19 Scenario Modeling Hub for informing pandemic response under uncertainty . Nature Communications . 2023 ; 14 ( 1 ): 7260 . doi: 10.1038/s41467-023-42680-x OpenUrl CrossRef 19. ↵ Yamana TK , Kandula S , Shaman J. Superensemble forecasts of dengue outbreaks . Journal of The Royal Society Interface . 2016 ; 13 ( 123 ): 20160410 . doi: 10.1098/rsif.2016.0410 OpenUrl CrossRef PubMed 20. ↵ Ray EL , Reich NG . Prediction of infectious disease epidemics via weighted density ensembles . PLOS computational biology . 2018 ; 14 ( 2 ): e1005910 . doi: 10.1371/journal.pcbi.1005910 OpenUrl CrossRef PubMed 21. Viboud C , Sun K , Gaffey R , et al. The RAPIDD ebola forecasting challenge: Synthesis and lessons learnt . Epidemics . 2018 ; 22 : 13 – 21 . doi: 10.1016/j.epidem.2017.08.002 OpenUrl CrossRef PubMed 22. Colón-González FJ , Bastos LS , Hofmann B , et al. Probabilistic seasonal dengue forecasting in Vietnam: A modelling study using superensembles . PLOS Medicine . 2021 ; 18 ( 3 ): e1003542 . doi: 10.1371/journal.pmed.1003542 OpenUrl CrossRef 23. ↵ Paireau J , Andronico A , Hozé N , et al. An ensemble model based on early predictors to forecast COVID-19 health care demand in France . Proceedings of the National Academy of Sciences . 2022 ; 119 ( 18 ): e2103302119 . doi: 10.1073/pnas.2103302119 OpenUrl CrossRef PubMed 24. ↵ Howerton E , Runge MC , Bogich TL , et al. Context-dependent representation of within- and between-model uncertainty: Aggregating probabilistic predictions in infectious disease epidemiology . Journal of The Royal Society Interface . 2023 ; 20 ( 198 ): 20220659 . doi: 10.1098/rsif.2022.0659 OpenUrl CrossRef PubMed 25. ↵ Ray EL , Brooks LC , Bien J , et al. Comparing trained and untrained probabilistic ensemble fore-casts of COVID-19 cases and deaths in the United States . International Journal of Forecasting . 2023 ; 39 ( 3 ): 1366 – 1383 . doi: 10.1016/j.ijforecast.2022.06.005 OpenUrl CrossRef 26. ↵ Pedregosa F , Varoquaux G , Gramfort A , et al. Scikit-learn: Machine Learning in Python . Journal of Machine Learning Research . 2011 ; 12 ( 85 ): 2825 – 2830 . doi: 10.5555/1953048.2078195 OpenUrl CrossRef 27. Weiss Christoph , E , Raviv E , Roetzer G. Forecast Combinations in R using the ForecastComb Package . The R Journal . 2019 ; 10 ( 2 ): 262 . doi: 10.32614/RJ-2018-052 OpenUrl CrossRef 28. Bosse N , Yao Y , Abbott S , Funk S. Stackr: Create Mixture Models From Predictive Samples .; 2023 . 29. ↵ Couch S , Kuhn M. Stacks: Tidy Model Stacking .; 2023 . 30. ↵ Consortium of Infectious Disease Modeling Hubs. The hubverse: Open tools for collaborative forecasting . Published online 2025. https://hubverse.io/en/latest/index.html 31. ↵ Vincent SB . The Function of the Vibrissae in the Behavior of the White Rat. PhD thesis. University of Chicago ; 1912 . 32. ↵ Stone M. The opinion pool . The Annals of Mathematical Statistics . 1961 ; 32 ( 4 ): 1339 – 1342 . OpenUrl 33. ↵ Lichtendahl KC , Grushka-Cockayne Y , Winkler RL . Is it better to average probabilities or quantiles? Management Science . 2013 ; 59 ( 7 ): 1594 – 1611 . doi: 10.1287/mnsc.1120.1667 OpenUrl CrossRef 34. ↵ Hollingsworth TD , Medley GF . Learning from multi-model comparisons: Collaboration leads to insights, but limitations remain . Epidemics . 2017 ; 18 : 1 – 3 . doi: 10.1016/j.epidem.2017.02.014 OpenUrl CrossRef PubMed 35. Shea K , Runge MC , Pannell D , et al. Harnessing multiple models for outbreak management . Science . 2020 ; 368 ( 6491 ): 577 – 579 . doi: 10.1126/science.abb9934 OpenUrl Abstract / FREE Full Text 36. ↵ Borchering RK , Healy JM , Cadwell BL , et al. Public health impact of the U.S. Scenario Modeling Hub . Epidemics . 2023 ; 44 : 100705 . doi: 10.1016/j.epidem.2023.100705 OpenUrl CrossRef PubMed 37. ↵ Reich NG , Brooks LC , Fox SJ , et al. A collaborative multiyear, multimodel assessment of seasonal influenza forecasting in the United States . Proceedings of the National Academy of Sciences . 2019 ; 116 ( 8 ): 3146 – 3154 . doi: 10.1073/pnas.2200703119 OpenUrl Abstract / FREE Full Text 38. ↵ Borchering RK , Mullany LC , Howerton E , et al. Impact of SARS-CoV-2 vaccination of children ages 5–11 years on COVID-19 disease burden and resilience to new variants in the United States, November 2021–March 2022: A multi-model study . The Lancet Regional Health – Americas . 2023 ; 17 : 100398 . doi: 10.1016/j.lana.2022.100398 OpenUrl CrossRef PubMed 39. Loo SL , Howerton E , Contamin L , et al. The US COVID-19 and Influenza Scenario Mod-eling Hubs: Delivering long-term projections to guide policy . Epidemics . 2024 ; 46 : 100738 . doi: 10.1016/j.epidem.2023.100738 OpenUrl CrossRef PubMed 40. ↵ Jung S , Loo SL , Howerton E , et al. Potential impact of annual vaccination with reformulated COVID-19 vaccines: Lessons from the US COVID-19 scenario modeling hub . PLOS Medicine . 2024 ; 21 ( 4 ): e1004387 . doi: 10.1371/journal.pmed.1004387 OpenUrl CrossRef PubMed 41. ↵ Dean NE , Pastore y Piontti A , Madewell ZJ , et al. Ensemble forecast modeling for the design of COVID-19 vaccine efficacy trials . Vaccine . 2020 ; 38 ( 46 ): 7213 – 7216 . doi: 10.1016/j.vaccine.2020.09.031 OpenUrl CrossRef PubMed 42. ↵ Pollett S , Johansson MA , Reich NG , et al. Recommended reporting items for epidemic forecasting and prediction research: The EPIFORGE 2020 guidelines . PLOS Medicine . 2021 ; 18 ( 10 ): e1003793 . doi: 10.1371/journal.pmed.1003793 OpenUrl CrossRef 43. ↵ Oidtman RJ , Omodei E , Kraemer MUG , et al. Trade-offs between individual and ensem-ble forecasts of an emerging infectious disease . Nature Communications . 2021 ; 12 ( 1 ): 5379 . doi: 10.1038/s41467-021-25695-0 OpenUrl CrossRef PubMed 44. ↵ Wattanachit N , Ray EL , McAndrew TC , Reich NG . Comparison of combination methods to create calibrated ensemble forecasts for seasonal influenza in the U.S . Statistics in Medicine . 2023 ; 42 ( 26 ): 4696 – 4712 . doi: 10.1002/sim.9884 OpenUrl CrossRef PubMed 45. ↵ McAndrew T , Reich NG . Adaptively stacking ensembles for influenza forecasting . Statistics in Medicine . 2021 ; 40 ( 30 ): 6931 – 6952 . doi: 10.1002/sim.9219 OpenUrl CrossRef PubMed 46. ↵ Fox SJ , Kim M , Meyers LA , Reich NG , Ray EL . Optimizing Disease Outbreak Forecast Ensem-bles . 2024 ; 30 ( 9 ): 1967 – 1969 . doi: 10.3201/eid3009.240026 OpenUrl CrossRef 47. ↵ Ray EL , Gerding A. Distfromq: Reconstruct a Distribution from a Collection of Quantiles .; 2024 . 48. ↵ Niederreiter H. Random Number Generation and Quasi-Monte Carlo Methods . Society for In-dustrial; Applied Mathematics ; 1992 . 49. ↵ CDC . About flu forecasting . Published online 2023. https://www.cdc.gov/flu/weekly/flusight/how-flu-forecasting.htm 50. ↵ Reich NG , Cornell M , Ray EL , House K , Le K. The Zoltar forecast archive, a tool to standardize and store interdisciplinary prediction research . Scientific Data . 2021 ; 8 ( 1 ): 59 . doi: 10.1038/s41597-021-00839-5 OpenUrl CrossRef PubMed 51. ↵ Bracher J , Ray EL , Gneiting T , Reich NG . Evaluating epidemic forecasts in an interval format . PLOS Computational Biology . 2021 ; 17 ( 2 ): e1008618 . doi: 10.1371/journal.pcbi.1008618 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted May 05, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Multi-model ensembles in infectious disease and public health: Methods, interpretation, and implementation in R Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Multi-model ensembles in infectious disease and public health: Methods, interpretation, and implementation in R Li Shandross , Emily Howerton , Lucie Contamin , Harry Hochheiser , Anna Krystalli , Consortium of Infectious Disease Modeling Hubs , Nicholas G. Reich , Evan L. Ray medRxiv 2024.06.24.24309416; doi: https://doi.org/10.1101/2024.06.24.24309416 Share This Article: Copy Citation Tools Multi-model ensembles in infectious disease and public health: Methods, interpretation, and implementation in R Li Shandross , Emily Howerton , Lucie Contamin , Harry Hochheiser , Anna Krystalli , Consortium of Infectious Disease Modeling Hubs , Nicholas G. Reich , Evan L. Ray medRxiv 2024.06.24.24309416; doi: https://doi.org/10.1101/2024.06.24.24309416 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Public and Global Health Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (302) Cardiovascular Medicine (4453) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15242) Forensic Medicine (30) Gastroenterology (1131) Genetic and Genomic Medicine (6615) Geriatric Medicine (669) Health Economics (1001) Health Informatics (4552) Health Policy (1372) Health Systems and Quality Improvement (1614) Hematology (543) HIV/AIDS (1270) Infectious Diseases (except HIV/AIDS) (15929) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6625) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3344) Ophthalmology (979) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5461) Public and Global Health (9252) Radiology and Imaging (2207) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1197) Rheumatology (597) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a02f04e4adbcc13d',t:'MTc3OTk4OTc2MA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00