Machine Learning Assisted Differentiation of Low Acuity Patients at Dispatch (MADLAD): A Randomized Controlled Trial

preprint OA: closed CC-BY-4.0
📄 Open PDF Full text JSON View at publisher
Full text 49,166 characters · extracted from preprint-html · click to expand
Machine Learning Assisted Differentiation of Low Acuity Patients at Dispatch (MADLAD): A Randomized Controlled Trial | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Machine Learning Assisted Differentiation of Low Acuity Patients at Dispatch (MADLAD): A Randomized Controlled Trial View ORCID Profile Douglas Spangler , Simon Morelli , View ORCID Profile David Smekal , View ORCID Profile Lennart Edmark , Hans Blomberg doi: https://doi.org/10.1101/2025.09.19.25336143 Douglas Spangler 1 Uppsala University , Uppsala, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Douglas Spangler 
For correspondence: douglas.spangler{at}akademiska.se Simon Morelli 2 Västmanland Hospital , Västerås, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site David Smekal 1 Uppsala University , Uppsala, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for David Smekal Lennart Edmark 1 Uppsala University , Uppsala, Sweden 2 Västmanland Hospital , Västerås, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lennart Edmark Hans Blomberg 1 Uppsala University , Uppsala, Sweden Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background Resource Constrained Situations (RCS) at Emergency Medical Dispatch centers where there are more patients requiring an ambulance than there are available ambulances are common. Machine learning (ML) techniques offer a promising but largely untested approach to assessing relative risks among these patients. The study aims to establish whether the provision of ML-based risk scores predicting patient outcomes improves the ability of dispatchers to identify patients at high risk for deterioration in RCS and dispatch the first available ambulance to them. Methods A parallel-grouped, randomized trial of adult patients assessed by a dispatch nurse in the Swedish regions of Uppsala or Västmanland as requiring a low-priority ambulance response in RCS. Patients were randomized 1:1 to be prioritized with the aid of a ML-based risk assessment tool, or per current clinical practice. Prioritization accuracy was assessed primarily in terms of whether the first available ambulance was sent to the patient with the highest National Early Warning Score (NEWS 2) based on subsequently collected vital signs. 
Trial registered at ClinicalTrials.gov ( NCT04757194 ). Results A total of 1245 RCS were included in the study. In the intervention group, patients assigned the first available ambulance had the highest NEWS in 68.3% of cases vs 62.5% in the control group, corresponding to an odds ratio of 1.28 (95% CI 1.00 – 1.63, p = 0.047). Prespecified analyses also suggested that dispatchers complied with the tool in 80.9% (77.7 – 83.9) of cases, and that full compliance with the risk prediction instrument would have improved prioritization decisions further. Discussion This study suggests that clinical ML-based decision support tools have the ability to influence care provider decisions and improve their capacity to rapidly differentiate between high- and low-risk patients at dispatch. Introduction In prehospital care systems, ambulance availability places constraints on the ability of Emergency Medical Dispatch (EMD) centers to immediately provide an ambulance response for all patients determined to require one. The stochastic nature of ambulance demand via the emergency hotline entails that any cost-effective ambulance system will from time to time experience Resource Constrained Situations (RCS) in which the number of patients requiring an ambulance response exceeds the number of available ambulances. Machine learning (ML) models offer a promising approach to stratifying the risks associated with these patients. In previous research, an open source ML-based risk assessment tool was developed and retrospectively validated in a cohort of patients receiving ambulance care in the region of Uppsala ( 1 , 2 ). While a number of ML-based risk assessment tools have been proposed for use in prehospital- and emergency care and validated retrospectively ( 3 – 9 ), only a single randomized trial has been performed ( 10 , 11 ) and no models intended for risk differentiation across the full spectrum of prehospital patient types have been evaluated in a randomized trial. 
There is thus a great need to identify suitable use-cases for these tools, and to generate high-quality evidence regarding their effectiveness in achieving clinically important objectives. Given that the application of ML models in this context is novel and unproven, this must be done in a manner which minimizes the patient safety hazards associated with incorrect decisions. Aim This study aims to investigate whether the application of a ML-based risk assessment instrument improves the ability of dispatchers to identify and dispatch an ambulance to the most critically ill patient in RCS. Patient acuity was operationalized primarily as the National Early Warning Score (NEWS 2 base scale) based on the first set of vital signs obtained, and secondarily based on a composite risk score consisting of prehospital interventions and hospital outcomes. Hypotheses Primary The intervention results in a greater proportion of immediate ambulance responses in RCS being directed to the patient in the most critical condition as operationalized by subsequent NEWS value. Secondary The intervention improves differentiation with regards to a composite risk score consisting of ambulance interventions, abnormal initial ambulance findings, emergent transport, hospital admission, and mortality between patients receiving immediate vs. delayed ambulance response during RCS. The intervention increases the difference in NEWS between patients receiving immediate vs. delayed ambulance response during RCS. Pre-specified ancillary analyses Evaluation of overall personnel compliance with risk assessment instrument in intervention arm. Evaluation of compliance in intervention arm cases where the model had a high vs low level of confidence. 
Evaluation of improved/degraded compliance with risk assessment instrument over time as manifested by a slope change in a time series analysis of intervention group Evaluation of spillover effects as manifested by a significant positive slope in a time series analysis of control group outcomes. Evaluation of change in risk assessment tool predictive value over time (covariate drift). Evaluation of model calibration with regards to age, gender, and complaint category. Methods Design A parallel grouped trial, randomized 1:1 to intervention or control arms. Setting The study took place in two EMDCs in central Sweden (Uppsala and Västmanland), serving a combined population of 499 000 in 2021. The regions have a total of 32 ambulances during peak hours. Each dispatch center is staffed by 2-3 dispatch nurses that answer emergency (112) and non-emergency calls, and 1 ambulance director 24 hours per day. The dispatch nurses currently employ a self-developed, rule-based Clinical Decision Support System (CDSS) to structure patient interviews and determine a priority level ( 12 , 13 ). The CDSS results in a priority of 1A/1B (lights and sirens response) 2A/2B (non-emergency response) or referral to non-ambulance care. Priority 2A/2B patients were chosen as the target population for the intervention, given that they had been determined by a dispatch nurse to be relatively low acuity, but would also not be exposed to the risks associated with being referred to non-emergency care ( 12 , 14 , 15 ). Participants Inclusion Criteria Identification of a resource constrained situation by ambulance director (i.e., 2 or more patients awaiting an ambulance response). Assigned priority 2A or 2B by dispatch nurse. Complete call documentation in the CDSS. Valid Swedish personal identification number collected at dispatch. Age >= 18 years. Exclusion Criteria Relevant calls received more than 30 minutes apart. Logistical factors (e.g. 
the patients’ geographical locations) affect the ambulance assignment decision. On scene risk factors (e.g. a patient is outdoors and risks hypothermia) or risk mitigators (e.g. healthcare staff already on-scene with a patient) affect the ambulance assignment decision. Intervention Ambulance directors had overall responsibility for executing the study protocol, and were tasked with identifying RCS suitable for inclusion in the study. Patients were to be included in the study upon the identification of an RCS involving eligible patients by the ambulance director at the point in time when an ambulance was available for dispatch to one of the patients. Directors were instructed to consider any relevant non-clinical factors prior to randomization, and exclude any RCS where these factors would override a clinical determination per the above exclusion criteria. The above inclusion criteria were applied automatically (i.e., patients not meeting criteria could not be selected), while exclusion criteria were applied by ambulance directors. Upon selecting the relevant patients and pressing a button in the dispatch interface to compare the selected patients, the RCS was randomly assigned to a study arm via a random number generator. In the control arm, the risk scores for each patient were calculated and stored, but not displayed to the user. In the intervention arm, a mark was displayed in the interface indicating which of the included patients had the highest risk score, along with a color-coded indicator of model confidence (red for high or orange for low, with a cutoff value calibrated to include ca. 50% of patients in each group). Figure 1 below illustrates the user interface presented to the dispatcher, with Fig 1A illustrating the cases having been selected for inclusion but prior to the user pressing the “predict” button, and Fig 1B illustrating a comparison included in the intervention arm with a high level of model confidence in the selected high-risk patient. 
Download figure Open in new tab Figure 1 Illustration of Graphical User Interface (translated) Download figure Open in new tab Figure 2 Participant flow chart In both study arms, the ambulance director then conferred with the nurses involved in triaging the patients to confirm which patient should receive the available ambulance. In the intervention arm, the ambulance director noted which patient was proposed by the ML framework, and could access additional information regarding the risk assessment by clicking the risk buttons in the interface. The director then dispatched the available ambulance to the patient determined through this process to have the greatest need and cleared the prediction. This process was repeated each time an ambulance became available. The intervention was based on a risk assessment instrument validated in a previous study ( 1 ). Since the publication of the validation study, the risk assessment instrument was further developed to include free-text notes entered by dispatchers, which were found to improve the performance of the models. The source code of the tool employed in the study is available on github under an open-source license ( 2 ). The tool estimates the likelihood that a patient will be assessed by ambulance crews to 1) have abnormal initial findings, 2) be transported to the hospital with lights and sirens, 3) receive a prehospital intervention, and 4) be admitted to the hospital or die within 30 days. Hospital outcome measures are based on the first hospital visit within 72 hours, in order to capture hospital outcomes for non-conveyed patients. The predicted likelihood for each of the outcomes was then combined into a composite risk score, with the above outcomes weighted to achieve predictive properties similar to those of NEWS, resulting in weights of 4:2:1:1, respectively. 
Development of the modelling framework was frozen upon initiation of the main study phase and was updated only upon identification of negative trends in model accuracy based on assessment by the data monitoring committee, and upon initiation of the study at the second site. Outcomes The primary outcome of the study was the National Early Warning Score (NEWS) of each included patient, based on the first set of vital signs captured by the ambulance crew upon arrival to the patient. Where vital signs were not documented by an ambulance (e.g., if a patient used an alternate mode of transport to the hospital), NEWS component items were multiply imputed including the first set of vital signs documented within 24 hours from the emergency department. NEWS was selected as the primary outcome of the study for two reasons: Firstly, NEWS is widely used in acute care, and has been thoroughly validated as being predictive of outcomes in a variety of adult patient cohorts. ( 16 – 19 ) Secondly, NEWS is based on patient vital signs, and is thus conceptually distinct from-, and prior in terms of causality to the outcome measures employed to train the models included in the risk assessment tool. The latter is a subtle but important conceptual point which addresses two issues: Firstly, by selecting an evaluation measure which is not causally dependent on the outcomes used to train the models, the possibility that assignment to the intervention or control arm in and of itself affects the evaluation is minimized. Secondly, it addresses issues relating to AI system alignment. As suggested by the orthogonality thesis, the predictive performance of an AI system is thought to be independent of the goals of the system as a whole ( 20 ). Operationalizing the need for a rapid ambulance response in terms of measurable outcomes is difficult, and we cannot assume that we have done so perfectly. 
Thus, it is appropriate that both the model and human decisions are evaluated in terms of a measure which is causally independent of either. In this way, the ML framework is not given an unfair advantage over human dispatchers who may have internalized a different definition of patient risk and ambulance care need. The first secondary outcome in the study is based on a composite score consisting of each of the four outcomes included in the risk assessment instrument. While this measure of intervention effectiveness suffers from the problems noted above, this is the manner in which predictive models are typically evaluated ( 4 , 5 ). The second secondary hypothesis consists of an alternate specification regarding the difference-in-difference of NEWS between prioritized and non-prioritized patients across treatment arms. Sample Size Sample size was determined based on pilot study data indicating that available ambulances were directed to the patient with the highest NEWS 65.3% of the time (considering ties as “correct” assessments), while simulation using randomly selected pairs of potentially eligible patients suggested that the model would mark the patient with the highest NEWS correctly in 70.3% of cases. Using this effect size, an estimated power of 0.8 with an alpha of 0.05 was achieved at n ≈ 1500 using a two-sided test of proportions. Randomisation Patients were recruited by ambulance directors upon identification of an RCS with multiple eligible patients. Study arm allocation was performed automatically by the server used to generate risk assessments using a simple random number generator implemented by the numpy python package ( 21 ). Blinding Patients and the ambulance / hospital staff collecting outcome data were blind to treatment arm allocation, but by the nature of the intervention dispatchers were aware of the randomization results. 
Data analysis scripts evaluating the primary and secondary hypotheses were written prior to extracting outcome data using synthetic data simulating the null hypotheses. Where it was necessary to extract outcome data manually (hospital vital sign data at the second study site), the abstractor was blind to treatment group assignment. Statistical analysis To generate risk predictions, gradient boosting models were applied to patient demographics, structured CDSS data, and free-text notes embedded using the bag-of-words method as described elsewhere ( 1 ). The models were implemented in the openTriage platform and accessed by the dispatching system via API ( 2 ). The primary hypothesis was evaluated using logistic regression, with missing vital sign data necessary to calculate NEWS multiply imputed using multivariate imputation by chained equations using the random forest algorithm ( 22 ). 10 sets of imputed NEWS components were generated, and full scores were directly calculated from the component items in line with recommendations ( 23 ). Per the study protocol, outcomes were to be evaluated based on the median value of 5 imputations, but upon further consideration this was felt to risk inflating type-I error rates, and the more rigorous approach of pooling estimates using Rubin’s rules was used ( 24 ). Secondary hypothesis 1 was similarly evaluated using logistic regression, but the second study center had to be excluded due to technical difficulties in gathering comprehensive hospital outcome data. Secondary hypothesis 2 was evaluated using a Wilcoxon rank sum test applied to the multiply imputed data, pooling z-values using Rubin’s rules to test for significance ( 25 ). Six ancillary analyses were pre-specified. 
Analyses 1-2 investigated compliance with the instrument, and were investigated by assessing the compliance of dispatchers with the risk assessment tool overall and in the high and low confidence groups of the intervention arm, with the hypothesis that compliance would be higher in the high-confidence group. An analysis was also performed to investigate the hypothetical outcomes if dispatchers had been 100% compliant with the tool to evaluate the potential impact of compliance rates. Pre-specified analyses 3-5 regarded changes over time, and were examined using time-series analysis within a regression framework employing a variable representing the study month (or months since last model update) as the independent variable of interest. Analysis 6 regarded model calibration, and was conducted by including patient characteristics of interest (age, sex, and major complaint groups) as independent predictors in models evaluating the primary hypothesis. An analysis of loss to follow-up (i.e., patients who withdrew from the study) was performed to examine whether patients who opted out of the study differed from those who remained. An analysis of time to dispatch was not pre-specified, but was identified as an important indicator of operational efficiency in discussions of the study with clinical and administrative staff. It was anticipated that dispatchers could attempt to circumvent the randomization process (e.g., repeat the randomization if the RCS was assigned to the control arm), and code was implemented to assign repeated randomizations of the same patients to the same treatment arm. In these cases, only the last randomization was included in the analysis. In cases where randomization was repeated but additional patients were included and thus not captured by the repeated randomization check, the last randomization was only included if all randomizations had by chance been assigned to the same treatment arm. 
All other randomizations were excluded as protocol violations. All data transformation and analysis was performed using R v4.4.2 ( 26 ), and the scripts used to perform all transformations and analysis are available in a public repository ( 27 ). All code to evaluate the prespecified analyses was written prior to obtaining patient outcome data using a synthetic dataset simulating the null hypothesis. The analysis script used to generate the results presented in this manuscript and its output may be found as supplement 1 – Analysis notebook. Ethics Ethical approval for the study was sought and granted by the Swedish Ethical Review Authority (Dnr 2020-00187). An exemption from gathering prospective informed consent from patients was granted for the study by the ethics review board. Informed consent materials were instead mailed to study participants retroactively, at which point patients were given the opportunity to withdraw from the study. The study and its protocol were preregistered at ClinicalTrials.gov (ID NCT04757194 ) on 2021-02-17, ( 28 ) and is reported according to the CONSORT guidelines ( 29 ). Results Participant flow A total of 1845 RCS were included for randomization by dispatchers. 350 randomizations (13.5%) were the result of a protocol violation in which no ambulance was dispatched to any patient included in the comparison prior to the next comparison including the same patients. There was also a single case where no ambulance was assigned to any patient in the RCS included in this category of exclusions. Of the remaining 1495 RCS, 250 (16%) included a patient who upon receiving information about the study, opted to decline participation. Upon applying these exclusions, 1245 RCS remained for analysis per figure 2 below. Recruitment Patients were recruited between 2021-02-01 and 2024-12-01. 
The trial had to be extended due to a slow rate of inclusion and delays in obtaining data necessary to train the models for the second study site, which began data collection on 2024-06-01. A data monitoring committee was formed to monitor the study and address any reported incidents or patient safety issues, but none were identified. The study was ended prior to having collected the full calculated sample size (1500) for administrative reasons. Baseline data A total of 585 RCS were included in the control arm, and 660 in the intervention arm, corresponding to 1285 and 1479 individual patients, respectively. Patient demographics in terms of age and gender were similar across both treatment arms. Overall dispatch times were similar across both treatment arms, but both dispatch times and times from inclusion to dispatch were shorter for prioritized patients in the intervention group. Note that an analysis of dispatch times was not pre-specified. View this table: View inline View popup Download powerpoint Table 1 Patient / Dispatch characteristics Missingness NEWS score calculation was based primarily on ambulance vital parameters, which were missing in between 11.0% (pulse) and 14.2% (temperature) of cases. Missing ambulance data could be due either to patients not receiving an ambulance, or ambulance staff not fully documenting vital parameters. Ambulance vital sign data was supplemented with hospital data collected from the patient’s first ED visit within 24 hours of contact with the EMD center, resulting in final missingness rates of between 3.7% (consciousness) and 4.8% (temperature). These combined datasets were used to perform multiple imputation which achieved stable imputed values with good mixing properties across chains. Full data on missingness rates and imputation diagnostics may be found in supplementary materials 1. 
Primary and secondary hypotheses In the control arm, the patient with the highest NEWS value was prioritized in 62.5% of cases, while in the intervention arm the proportion was 68.3%, corresponding to an odds ratio of 1.28 (95% CI 1.00 – 1.63, p = 0.047). There is thus support for rejecting the null regarding the primary hypothesis of the study. Similarly, intervention effects regarding the first secondary hypothesis achieved statistical significance, with 63.0% of prioritized patients in the control arm having the highest composite outcome score, compared to 69.6% in the intervention arm, corresponding to an odds ratio of 1.31 (1.01 – 1.72, p = 0.041) among the 1106 RCS in the first study site. The average difference between the NEWS value of prioritized and non-prioritized patients in the control arm was 0.62, compared to 1.08 in the intervention arm, corresponding to a mean difference of 0.45, although based on a Wilcoxon rank-sum test, no significant difference could be identified (p = 0.086). Results are summarized in table 2 below. View this table: View inline View popup Download powerpoint Ancillary analyses Six additional pre-specified analyses were performed to evaluate the properties of the study and the risk prediction model itself. Per the study protocol, directors were permitted to deviate from the model recommended patient upon conferring with the involved dispatch nurses. In the control arm where information regarding the risk score was not available, the patient with the highest risk score was prioritized in 54.0% (50.0 – 58.1) of cases. In the intervention arm, the patient with the highest risk score was prioritized in 80.9% (77.7 – 83.9) of cases. To evaluate the impact of these deviations, an analysis was performed to estimate the intervention effect had the compliance with the risk score been 100% in the intervention group. 
This analysis identified a substantially stronger effect, with 72.2% of prioritized patients having the highest NEWS value, and resulting in an odds ratio of 1.56 (1.21 – 2.00, p < 0.001). An indicator of model confidence was included in the intervention as described in the methods, and while a difference in compliance between the high- and low-confidence groups was identified (81.7% vs 80.3%), the difference was not found to be significant, nor was there a significant difference in outcomes between the groups as assessed per the primary hypothesis. There was however a substantial difference when analysed in the 100% compliance scenario, with 67.4% of patients assessed by the instrument as highest risk having the highest NEWS value in the low confidence group, vs 78.5% in the high confidence group, corresponding to an OR of 1.77 (1.20 – 2.62, p = 0.004) between the confidence groups. A number of analyses investigating temporal effects during the study period were also performed. It was hypothesized that compliance with the tool might change over the study period (either positively due to routinization effects, or negatively due to loss of trust in the tool). The per-month change in the odds of compliance with the tool in the intervention group was 0.98 (0.97 – 1.001, p = 0.07). It was also hypothesized that the dispatchers might learn from the assessments of the tool, resulting in spillover to the control group, manifesting as an increased assessment accuracy over time in the control group. No evidence of this could be found however, with a per month change in the odds of correct assessment per the primary hypothesis in the control group of 1.00 (0.98 – 1.01, p = 0.944). There was also concern that model performance could degrade over time, and model performance was monitored over the course of the study by the Data Monitoring Committee in terms of the correlation between the patient's assigned risk score and their resultant NEWS value. 
A risk of degradation was thought to have been identified once early in the study, and the model was retrained with updated data. The model was updated once more during the course of the study upon initiation of the study at the second site. The updates were made on the first of December 2021 and the first of June 2024, respectively. Upon analysis of the full dataset however, no linear degradation of model performance could be identified, with an average change in Spearman correlation per month since the most recent update of -0.00 (-0.01 – 0.01, p = 0.906). To evaluate model calibration, indicators for patient age, sex, and clinical category were used as predictors of NEWS together with the ML risk score. We found no residual predictive value of patient age or sex in predicting NEWS when adjusted by the ML risk score. Significant predictive values for patient categories were identified for 2 of the 41 patient types included in the study (Fever and Difficulty breathing). Full results regarding all ancillary analyses may be found in the supplementary materials. Discussion This study evaluated the ability of a ML-based risk scoring tool to influence care providers at EMD centers, with the aim of improving prioritization decisions of low priority cases in resource constrained situations. The intervention was found to have resulted in an improved differentiation of patients, though intervention effects were at the edge of statistical significance. The intervention appeared to be stable over the nearly 4-year study timeframe, with no signs of degraded model performance. In addition to improved differentiation, the intervention group also had shorter dispatch delays for prioritized patients. While this analysis was not prespecified, these findings suggest that the intervention may also have improved the speed of the dispatching process. 
This is, to our knowledge, the first randomized trial of an ML-based risk assessment tool intended for use in the general patient population served by EMD centers. In a previous RCT of an ML-based alerting tool intended for use in identifying cardiac arrests at EMD centers, Blomberg et al. ( 10 ) identified no intervention effect, which was attributed to a lack of compliance with the intervention by EMD nurses. These findings, along with the results of our study identifying the potential for a substantially larger intervention effect had the tool been followed more closely, highlight the need to understand how to build trust in automated risk assessment tools if they are to be used to their full potential. An indicator of model confidence was included in our intervention, but it failed to achieve a statistically significant impact on compliance, despite the high-confidence intervention arm RCS containing a higher proportion of accurate assessments. While the definition of the outcome metrics employed was largely based on clinical judgement, the similar effect sizes with regard to predicting NEWS (OR 1.28) and the composite outcome the model is trained to predict (OR 1.31) suggest that the risk predictions were well aligned with a widely used risk differentiation tool. While our study identified a statistically significant intervention effect, the overall level of accuracy leaves much to be desired, highlighting the difficulty of triaging potentially emergent conditions over the telephone. In order to improve the accuracy of the ML models used, we see two general paths: the inclusion of unstructured audio data from the call, and the inclusion of additional structured data from historical patient medical records. The former may be accomplished e.g. through the processing of audio data from the emergency call through pretrained speech recognition models, and its integration with the decision support system data. 
The challenges of integrating patient medical record data are primarily legal and technical, necessitating the availability of APIs for obtaining high-quality data from medical records systems in near real-time. Nonetheless, our findings suggest that the effect size hypothesized based on retrospective validation indeed translated into a real-world impact of similar magnitude, and that applications aimed at higher-risk patient groups using risk assessment tools with similar levels of performance may be safely pursued. Limitations By the nature of the intervention, dispatchers could not be blinded to the treatment assignment. This could explain the uneven distribution between the intervention and control arms (660 vs 585), as dispatchers sometimes sought to repeat the risk scoring if they were assigned to the control group, resulting in these repeat risk assessments sometimes being excluded due to protocol violation. While this exclusion mechanism does not appear likely to bias the results, it reduced the sample size and thus the power of the analysis. Missing outcome data also reduced the power of the analysis by inducing between-imputation variability, resulting in an effect size similar to that expected based on pilot study results, but with a substantial degree of uncertainty. The multiple imputation process is also stochastic, and given that the findings are at the very edge of statistical significance, even simply changing the random seed used to generate imputations can variously produce significant or non-significant findings. We thus urge that the p-values reported here be interpreted thoughtfully as the probability of repeated trials generating an effect at least this extreme under the null hypothesis, rather than dichotomously. This study was performed at dispatch centres employing nurses in the primary call-taking role. This level of formal education is relatively rare in the context of EMD and could impact the generalizability of the results. 
It is reasonable to believe that it would be in the direction of underestimating the intervention effects were models of similar precision implemented in a context where the control group were assessed by care providers with less formal training. Similarly, the models used are based on a CDSS used only in a small number of Swedish regions. However, the modelling framework is freely available and can be adapted to structured and free-text data from other CDSS. The study suffered from a degree of post-randomization drop-out due to 306 RCS (16%) having to be excluded due to at least one patient opting out of the study. The characteristics of patients excluded due to this were however similar to those included in the study (see supplementary materials 1), and this source of loss to follow-up thus does not appear to have impacted the findings. Conclusion This randomized controlled trial suggests that ML-based interventions in the context of emergency medical dispatching have the ability to improve the capacity of care providers to identify patients most in need of an ambulance across a diverse patient cohort. The overall accuracy of the triage process, however, remains modest, and more can be done to improve the accuracy of the models and the adherence of care providers to model recommendations. Data Availability The data used in this study are owned by the regional health authorities in each respective study site, and permission to publicly distribute individual level data was not granted by the Swedish ethics review authority. The data used in this study may be obtained by researchers with appropriate ethics approvals by contacting ambulanssjukvard{at}akademiska.se . All code used to generate the reported results is available in a public repository at https://osf.io/erkv7/ and the tool evaluated is available at https://github.com/dnspangler/opentriage Funding Partly funded by the Swedish Innovation Agency grant number 2017-04652. 
The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript. Conflicts of interest The authors have declared that no competing interests exist. Patient or Public Involvement No formal Patient or Public Involvement efforts were involved in this study. References 1. ↵ Spangler D , Hermansson T , Smekal D , Blomberg H. A validation of machine learning-based risk scores in the prehospital setting . Ashkenazi I , editor. PLOS ONE . 2019 Dec ; 14 ( 12 ): e0226518 . OpenUrl PubMed 2. ↵ Spangler D. openTriage [Internet] . 2020 [cited 2020 Jun 25 ]. Available from: https://github.com/dnspangler/openTriage 3. ↵ Blomberg SN , Folke F , Ersbøll AK , Christensen HC , Torp-Pedersen C , Sayre MR , et al. Machine learning as a supportive tool to recognize cardiac arrest in emergency calls . Resuscitation . 2019 May ; 138 : 322 – 9 . OpenUrl CrossRef PubMed 4. ↵ Levin S , Toerper M , Hamrock E , Hinson JS , Barnes S , Gardner H , et al. Machine-Learning-Based Electronic Triage More Accurately Differentiates Patients With Respect to Clinical Outcomes Compared With the Emergency Severity Index . Ann Emerg Med . 2018 May 1; 71 ( 5 ): 565 - 574 .e2. OpenUrl CrossRef PubMed 5. ↵ Hong WS , Haimovich AD , Taylor RA . Predicting hospital admission at emergency department triage using machine learning . PLOS ONE . 2018 Jul 20; 13 ( 7 ): e0201016 . OpenUrl CrossRef PubMed 6. Yu JY , Xie F , Nan L , Yoon S , Ong MEH , Ng YY , et al. An external validation study of the Score for Emergency Risk Prediction (SERP), an interpretable machine learning-based triage score for the emergency department . Sci Rep . 2022 Oct 19; 12 ( 1 ): 17466 . OpenUrl PubMed 7. Pirneskoski J , Tamminen J , Kallonen A , Nurmi J , Kuisma M , Olkkola KT , et al. Random forest machine learning method outperforms prehospital National Early Warning Score for predicting one-day mortality: A retrospective study . Resusc Plus . 
2020 Dec 1; 4 : 100046 . OpenUrl PubMed 8. Almulihi QA , Alquraini AA , Almulihi FAA , Alzahid AA , Al Qahtani SSAJ , Almulhim M , et al. Applications of Artificial Intelligence and Machine Learning in Emergency Medicine Triage - A Systematic Review . Med Arch . 2024 ; 78 ( 3 ): 198 – 206 . OpenUrl PubMed 9. ↵ Chee ML , Chee ML , Huang H , Mazzochi K , Taylor K , Wang H , et al. Artificial intelligence and machine learning in prehospital emergency care: A scoping review . iScience . 2023 Jul 17; 26 ( 8 ): 107407 . OpenUrl PubMed 10. ↵ Blomberg SN , Christensen HC , Lippert F , Ersbøll AK , Torp-Pedersen C , Sayre MR , et al. Effect of Machine Learning on Dispatcher Recognition of Out-of-Hospital Cardiac Arrest During Calls to Emergency Medical Services: A Randomized Clinical Trial . JAMA Netw Open . 2021 Jan 6; 4 ( 1 ): e2032320 . OpenUrl 11. ↵ Plana D , Shung DL , Grimshaw AA , Saraf A , Sung JJY , Kann BH . Randomized Clinical Trials of Machine Learning Interventions in Health Care: A Systematic Review . JAMA Netw Open . 2022 Sep 29; 5 ( 9 ): e2233946 . OpenUrl PubMed 12. ↵ Spangler D , Edmark L , Winblad U , Colldén-Benneck J , Borg H , Blomberg H. Using trigger tools to identify triage errors by ambulance dispatch nurses in Sweden: an observational study . BMJ Open . 2020 Mar 1; 10 ( 3 ): e035004 . OpenUrl Abstract / FREE Full Text 13. ↵ Holmström IK , Kaminsky E , Lindberg Y , Spangler D , Winblad U. Registered Nurses’ experiences of using a clinical decision support system for triage of emergency calls: A qualitative interview study . J Adv Nurs . 2020 ; 76 ( 11 ): 3104 – 12 . OpenUrl PubMed 14. ↵ Ebben RHA , Vloet LCM , Speijers RF , Tönjes NW , Loef J , Pelgrim T , et al. A patient-safety and professional perspective on non-conveyance in ambulance care: a systematic review . Scand J Trauma Resusc Emerg Med . 2017 Jul 17; 25 : 71 . OpenUrl CrossRef PubMed 15. ↵ Paulin J , Kurola J , Koivisto M , Iirola T. 
EMS non-conveyance: A safe practice to decrease ED crowding or a threat to patient safety? BMC Emerg Med . 2021 Oct 9; 21 ( 1 ): 115 . OpenUrl CrossRef PubMed 16. ↵ Brangan E , Banks J , Brant H , Pullyblank A , Roux HL , Redwood S. Using the National Early Warning Score (NEWS) outside acute hospital settings: a qualitative study of staff experiences in the West of England . BMJ Open . 2018 Oct 1; 8 ( 10 ): e022528 . OpenUrl Abstract / FREE Full Text 17. Pimentel MAF , Redfern OC , Gerry S , Collins GS , Malycha J , Prytherch D , et al. A comparison of the ability of the National Early Warning Score and the National Early Warning Score 2 to identify patients at risk of in-hospital mortality: A multi-centre database study . Resuscitation . 2019 Jan 1; 134 : 147 – 56 . OpenUrl CrossRef PubMed 18. Silcock DJ , Corfield AR , Gowens PA , Rooney KD . Validation of the National Early Warning Score in the prehospital setting . Resuscitation . 2015 Apr 1; 89 : 31 – 5 . OpenUrl CrossRef PubMed 19. ↵ Pirneskoski J , Kuisma M , Olkkola KT , Nurmi J. Prehospital National Early Warning Score predicts early mortality . Acta Anaesthesiol Scand . 2019 ; 63 ( 5 ): 676 – 83 . OpenUrl CrossRef PubMed 20. ↵ Bostrom N. The superintelligent will: Motivation and instrumental rationality in advanced artificial agents . Minds Mach . 2012 ; 22 ( 2 ): 71 – 85 . OpenUrl 21. ↵ Harris CR , Millman KJ , van der Walt SJ , Gommers R , Virtanen P , Cournapeau D , et al. Array programming with NumPy . Nature . 2020 Sep ; 585 ( 7825 ): 357 – 62 . OpenUrl CrossRef PubMed 22. ↵ Buuren S van , Groothuis-Oudshoorn K. Multivariate Imputation by Chained Equations in R . J Stat Softw [Internet] . 2011 Dec 12 [cited 2017 May 4 ]; 45 ( 3 ). Available from: https://www.jstatsoft.org/article/view/v045i03 23. ↵ Gottschall AC , West SG , Enders CK . A Comparison of Item-Level and Scale-Level Multiple Imputation for Questionnaire Batteries . Multivar Behav Res . 2012 Feb 8; 47 ( 1 ): 1 – 25 . OpenUrl 24. 
↵ Marshall A , Altman DG , Holder RL , Royston P. Combining estimates of interest in prognostic modelling studies after multiple imputation: current practice and guidelines . BMC Med Res Methodol . 2009 Jul 28; 9 : 57 . OpenUrl CrossRef PubMed 25. ↵ Buuren S van . Flexible Imputation of Missing Data, Second Edition . 2nd ed. New York : Chapman and Hall/CRC ; 2018 . 444 p. 26. ↵ R Core Team . R: A Language and Environment for Statistical Computing [Internet] . Vienna, Austria : R Foundation for Statistical Computing ; 2020 . Available from: https://www.R-project.org/ 27. ↵ Spangler D. MADLAD . 2025 Sep 18 [cited 2025 Sep 18 ]; Available from: https://osf.io/erkv7/ 28. ↵ Blomberg H. Machine Learning Assisted Differentiation of Low Acuity Patients at Dispatch: A Randomized Controlled Trial [Internet] . clinicaltrials.gov ; 2023 Jun [cited 2025 Jan 7 ]. Report No.: NCT04757194 . Available from: https://clinicaltrials.gov/study/NCT04757194 29. ↵ Schulz KF , Altman DG , Moher D. CONSORT 2010 Statement: updated guidelines for reporting parallel group randomised trials . BMJ [Internet] . 2010 Mar 24 [cited 2020 Mar 4 ]; 340 . Available from: https://www.bmj.com/content/340/bmj.c332 View the discussion thread. Back to top Previous Next Posted September 21, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Machine Learning Assisted Differentiation of Low Acuity Patients at Dispatch (MADLAD): A Randomized Controlled Trial Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. 
Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Machine Learning Assisted Differentiation of Low Acuity Patients at Dispatch (MADLAD): A Randomized Controlled Trial Douglas Spangler , Simon Morelli , David Smekal , Lennart Edmark , Hans Blomberg medRxiv 2025.09.19.25336143; doi: https://doi.org/10.1101/2025.09.19.25336143 Share This Article: Copy Citation Tools Machine Learning Assisted Differentiation of Low Acuity Patients at Dispatch (MADLAD): A Randomized Controlled Trial Douglas Spangler , Simon Morelli , David Smekal , Lennart Edmark , Hans Blomberg medRxiv 2025.09.19.25336143; doi: https://doi.org/10.1101/2025.09.19.25336143 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Emergency Medicine Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6600) Geriatric Medicine (668) Health Economics (997) Health Informatics (4536) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (541) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3332) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary 
Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9232) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00acae96aefaa15',t:'MTc3OTYwOTg5OA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-23T02:00:01.238055+00:00
License: CC-BY-4.0