Justifying model complexity: evaluating transfer learning against classical models for intraoperative nociception monitoring under anesthesia

doi:10.1101/2025.07.01.25330670

Justifying model complexity: evaluating transfer learning against classical models for intraoperative nociception monitoring under anesthesia

2025 · doi:10.1101/2025.07.01.25330670

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 43,085 characters · extracted from preprint-html · click to expand

Justifying model complexity: evaluating transfer learning against classical models for intraoperative nociception monitoring under anesthesia | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Justifying model complexity: evaluating transfer learning against classical models for intraoperative nociception monitoring under anesthesia View ORCID Profile Chanseo Lee , Jaihyoung Lee , Kimon-Aristotelis Vogt , Muhammad Munshi doi: https://doi.org/10.1101/2025.07.01.25330670 Chanseo Lee 1 Yale School of Medicine , New Haven, CT 06520 2 Sporo Health , Boston, MA 02145 BS Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record 
for Chanseo Lee For correspondence: chanseo.lee{at}yale.edu Jaihyoung Lee 1 Yale School of Medicine , New Haven, CT 06520 2 Sporo Health , Boston, MA 02145 PhD Find this author on Google Scholar Find this author on PubMed Search for this author on this site Kimon-Aristotelis Vogt 2 Sporo Health , Boston, MA 02145 3 Paulson School of Engineering and Applied Sciences, Harvard University , Boston, MA 02134 MS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Muhammad Munshi 1 Yale School of Medicine , New Haven, CT 06520 BS Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Background Accurate intraoperative detection of nociceptive events is essential for optimizing analgesic administration and improving postoperative outcomes. While deep learning models promise to capture complex temporal dynamics of physiological signals, their added complexity may not always yield clinically meaningful gains compared to well-engineered classical approaches. Methods We evaluated two classical supervised models—L1-regularized logistic regression and Random Forests (with and without drug dosing features)—against a Temporal Convolutional Network (TCN) transfer-learning framework. We used a dataset of 101 adult surgical cases (~50,000 annotated nociceptive events over ~18,500 minutes) sourced from PhysioNet that tracked 30 physiologic and 18 drug-related features in 5-second windows. All models were assessed under a leave-one-surgery-out cross-validation, with AUROC and AUPRC as primary metrics. We further examined probability calibration (Platt scaling, isotonic regression) and four ensemble strategies—including a meta-learner, MLP, and a feature-conditioned gated network—to quantify the benefit of deep personalization. 
Results Drug-aware Random Forests achieved the highest discrimination (AUROC 0.716; AUPRC 0.399), significantly outperforming the TCN transfer-learning model (AUROC 0.649; AUPRC 0.311). Isotonic calibration reduced expected calibration error by over 80% but did not alter discrimination. None of the ensemble methods surpassed the standalone Random Forest, and the gated network consistently assigned > 84% weight to the classical model. Permutation importances revealed critical mechanistic features related to sympathetic physiologic response. Conclusions In this head-to-head benchmark, interpretable classical models on expertly curated features matched or exceeded the performance of a complex deep learning approach, while offering superior computational efficiency and transparency. These findings underscore the importance of rigorous comparative evaluation before adopting high-complexity AI solutions in clinical practice. Data Availability Statement All data was sourced from Subramanian et al. on PhysioNet under data usage agreement and proper citations in the manuscript. All code and analysis can be provided upon reasonable request. The authors plan to upload their code on GitHub. Competing Interests Statement The authors declare no conflict of interests or financial stakes in this work. Funding Disclosures There is no funding to declare for this work. Introduction Intraoperative detection of nociceptive events is critical for optimizing analgesic administration, as inadequate pain monitoring can lead to both acute postoperative complications and chronic pain syndromes. 1 , 2 Because of the subjective nature of pain, accurate monitoring of nociception during anesthesia remains elusive, resulting in inadequate dosing of analgesics and poor pain control. There are prior studies on applying machine learning to detect intraoperative nociceptive events to guide analgesic dosing, such as the NOL monitor. 
NOL has demonstrated substantial clinical utility, reducing the average pain of post-anesthesia care unit patients. 3 However, the proprietary pipeline required for NOL utilizes specialized hardware, and the inner workings are poorly understood, making the system an algorithmic “black box.” Instead, many classical algorithms such as Random Forests and logistic regressions have been applied to physiologic pain assessment due to their robustness and interpretability. 4 In fact, Random Forests and logistic regression classifiers have been implemented in many predictive tools, including discrimination of pain states from electrodermal activity (EDA) and cardiovascular indices. Continued advances in deep learning promise richer modeling of temporal and multimodal signals. Multimodal CNNs integrating electroencephalogram, photoplethysmography, and electrocardiogram signals demonstrate improved nociception detection compared to single-sensor methods. 5 However, these deep architectures often require significant compute resources, risk overfitting on small dataset sizes, and may offer only marginal gains over well-engineered classical models. 6 This raises the question of whether added architectural complexity of deep learning truly translates to clinical benefit. 7 To address this question, our study explores transfer learning via TCNs, adapting a globally trained network to patient-specific data, potentially enhancing personalized pain detection on small datasets. 8 Despite numerous studies on either classical or deep learning methods for pain detection, few have rigorously benchmarked these approaches head-to-head in a clinically realistic setting. Moreover, the field lacks quantitative assessment of when increased model complexity justifies its additional cost and interpretability trade-offs. We address this gap by implementing a Leave-One-Surgery-Out (LOSO) framework to compare classical models to a deep-learning model. 
We also implemented a gated ensemble network that combines these two approaches to dynamically weigh model contributions and reveal the true value of personalization and model complexity. This study is especially relevant in the context of large language models (LLMs) and AI pipelines gaining traction in medical tasks, from diagnostics to documentation. The emergence of these models raises substantial concerns regarding transparency, ethical risk, and resource allocation. 9 , 10 While emerging foundations like TabPFN show that transformer-based tabular models can excel in small-data regimes, their applicability to high-resolution time-series physiologic data remains both untested and computationally demanding. 11 Our findings highlight that well-curated feature sets and interpretable classical models can match or exceed the performance of complex deep learning frameworks on nociception detection, while dramatically reducing computational burden and enhancing clinician trust. These findings are supported by several prior studies, although the subject remains a hotspot for debate. 6 , 7 , 10 , 12 Our study underscores the importance of evaluating model complexity against utility and efficiency as the medical community explores deep learning architectures and LLMs for healthcare problems. 13 Methods Data source and pre-processing The dataset was sourced from PhysioNet. 14 , 15 Subramanian et al. compiled a prospective archive of multi-sensor, continuous physiologic recordings (derived from ECG, EDA) and real-time drug dosing from 101 adult surgical cases, paired with manual annotations by anesthesiologists of 50,000 surgical nociceptive stimuli across ~18,500 minutes of surgery. The dataset provides 15 autonomic features and their respective estimated first derivatives, for a total of 30 physiologic features, and 18 drug dosing chronology covariates (time since dose, cumulative dose) from nine drug classes. 
The data was concatenated into a single table with non-overlapping 5-second windows and then underwent quality assurance checks (e.g., zero imputation for missing values in drug dosing). Every numeric column was then standard-scaled across the entire pooled cohort to zero mean and unit variance, ensuring equal weight during model fitting. For reproducibility, we created two input matrices: a 48-column version that includes both autonomic and drug covariates, and a 30-column version that excludes the drug information. The manually annotated nociceptive stimuli recordings were used as the ground truth for comparison. Model creation and performance evaluation Each model’s creation and performance were completed using a Leave-One-Surgery-Out (LOSO) cross-validation strategy. In this approach, data from each surgery was held out in turn as the test set, while the models were trained and saved on the remaining surgeries. This process was repeated for all surgeries, ensuring that each subject contributed exactly once as a test case. For each LOSO fold, the held-out surgery was further partitioned for transfer learning experiments. For transfer-learning models, the initial segment of the surgery was used for patient-specific adaptation (fine-tuning), while the remainder was reserved for evaluation. The adaptation window was varied to assess the impact of patient-specific data on model performance. Model discrimination was quantified using the Area Under the Receiver Operating Characteristic Curve (AUROC) and the Area Under the Precision-Recall Curve (AUPRC). 95% confidence intervals for the median AUROC and AUPRC were calculated using non-parametric bootstrapping with 10,000 resamples. To assess the statistical significance of differences between models, pairwise comparisons of AUROC and AUPRC distributions were performed using the non-parametric Wilcoxon signed-rank test. 
Producing benchmark supervised models To establish a performance benchmark, we created baseline models based on Subramanian et al. 1 Four models were implemented: two logistic regression models with L1 regularization (LASSO), selected via the Akaike Information Criterion (AIC), with and without inclusion of pharmacologic features; and two random forest classifiers, each consisting of 200 decision trees with a maximum depth of 50, trained using 90% bootstrap resampling to mitigate overfitting. For ensemble experiments, an additional 50-tree random forest classifier with drug information was also trained with the same methodology. Transfer-learning models with adaptive windows Each transfer-learning experiment was structured as a two-phase, leave-one-subject-out protocol. First, a global base model was initialized by pooling all 5-second windows from 100 of the 101 surgeries and training a lightweight Temporal Convolutional Network (TCN). This network applies a single 1-D convolution across the feature channels (48 channels when drug covariates are included, 30 otherwise), followed by batch-normalization, ReLU, global max-pooling, and a two-layer dense head. We optimized all parameters for up to twelve epochs (Adam, α = 1 × 10⁻³, batch = 128) with binary cross-entropy loss weighted for the 6% event prevalence, using early stopping (patience = 3) to avoid over-fitting. The resulting weights were checkpointed as the base model for that LOSO fold. In the personalization phase, we loaded the base model, froze its convolutional and normalization layers, and fine-tuned only the dense head on the first K minutes of the held-out patient’s own data (where K ∈ {1, 2, 5, 10}, corresponding to 12, 24, 60, or 120 windows). Fine-tuning ran for three epochs (Adam, α = 1 × 10⁻□, batch = 128) to prevent forgetting and to optimize for the small dataset size. The resulting models were saved for each patient in a LOSO-fashion. 
Calibration analysis To assess the reliability of predicted probabilities from the transfer learning models, calibration analysis was performed across adaptation window lengths. For each window, the TL model was evaluated on the held-out portion of each surgery in the LOSO cross-validation framework. Predicted probabilities and true labels were aggregated for each adaptation window. Three calibration approaches were compared: raw (uncalibrated), in which direct probabilities are output from the TL model; Platt scaling, in which a logistic regression model was fit to map the raw outputs to calibrated probabilities; isotonic regression, in which a non-parametric isotonic regression model was fit to the raw outputs. Calibration performance was assessed using reliability curves, the Brier score, and the expected calibration error (ECE), computed with 10 quantile-based bins. For each method, calibration curves were plotted by comparing the mean predicted probability to the observed event frequency within each bin. Ensemble methods To further enhance predictive performance, several ensemble strategies were evaluated by combining the outputs of the RF and TL models. First, a simple linear combination was implemented, where the final prediction was a weighted average of the RF and TL model outputs, with weights either fixed or optimized via linear regression on the validation data. Additionally, a pruned version of the RF models (with 50 trees instead of 200) was employed to observe the behavior of the resulting ensemble models compared to the 200-tree baseline. Meta-learning approaches were explored beyond linear combination. A one-layer meta-learner, implemented as logistic regression, was trained to learn optimal weights for combining the base model predictions. For greater flexibility, a two-layer neural network meta-learner was also evaluated, allowing the ensemble to capture potential non-linear relationships between the base model outputs. 
Finally, a gated network (GateNet) ensemble was implemented. In this approach, a small neural network was trained to dynamically assign input-dependent weights to the RF and TL predictions, effectively learning when to rely more on one model versus the other based on the input features. All ensemble models were trained and evaluated within the same LOSO cross-validation framework as the base models, ensuring fair and unbiased comparison. Pooled AUROC/AUPRC as well as two-tailed Wilcoxon tests between ensembles and base models were calculated. Results Personalization of nociceptive signal detection Table 1 delineates the model performances of the Random Forest and Logistic Regression baselines that are drug-naïve and drug-aware. The RF models consistently outperform LR in terms of AUROC and AUPRC with statistical significance. Interestingly, the RF models also benefit from intraoperative drug information (AUROC 0.716 [0.700, 0.759]) versus without (AUROC 0.662 [0.640, 0.700]) with statistical significance. However, this pattern is not reflected in the LR models. View this table: View inline View popup Download powerpoint Table 1. Classical models as a baseline for nociceptive signal discrimination. The performance of transfer-learning models is shown in Table 2 . At face value, the models’ AUROC/AUPRCs benefit marginally without statistical significance either from drug information or increased personalization phases despite being given up to 10 times more data. However, a granular per-surgery AUROC benefit analysis between 10 minutes and 1 minute of personalization ( Fig. 1 ) showed that 67 of 101 surgeries (66%) improved from the additional adaptation in drug-aware models, while 34 (34%) declined. The median AUROC benefit was 0.019 with an interquartile range of [-0.023, 0.047]. View this table: View inline View popup Download powerpoint Table 2. Drug-aware and naïve transfer-learning models with varying adaptation windows. 
Download figure Open in new tab Figure 1. Per-patient AUROC change between 1-minute to 10-minute adaptation windows. AUROC differences were calculated for each personalized surgery (LOSO fold) then plotted in the histogram above. 66% of surgeries experienced an increase in AUROC while 34% experienced a decrease. Isotonic calibration improves model accuracy Calibration analysis, at its simplest, is meant to show whether an X % predicted risk by the transfer learning model directly translates to X % of cases experiencing a nociceptive signal. We find that while the three calibration methods do not significantly improve its discriminatory ability in terms of AUROC ( Table 3 ) , the Brier scores and ECE indeed show significant improvement from the raw model, approaching nearly zero for all adaptation windows ( Fig. 2A ) . Platt scaling also provided some improvement (ECE 10 min = 0.0198), but the reliability curves still deviated from the ideal diagonal ( Fig. 2B ) , especially in the region where fewer positive samples were available. A Wilcoxon two-tailed test showed that the isotonic calibration performed significantly better than both Platt scaling and the raw models (p < 0.001). View this table: View inline View popup Download powerpoint Table 3. Calibration of 10-minute drug-aware transfer learning models and skipped-folds analysis. Download figure Open in new tab Figure 2. Calibration analysis of transfer-learning with varying adaptation windows a . Mean-predicted probability plotted against observed frequency for the three calibration strategies. The diagonal dashed line represents the ideal calibration where observed frequency equals the predicted probability. b . Isotonic calibration across 1-, 2-, 5-, and 10-minute adaptation windows and their respective Expected Calibration Error (ECE). 
Across adaptation windows, the number of “skipped folds” ( Table 3 ) —cases where Platt scaling could not be performed due to a lack of both positive and negative events in the adaptation set—decreased substantially as the adaptation window increased. For the shortest window (1 minute), Platt scaling was feasible in only 5 out of 101 cases, with 96 folds skipped. As the adaptation window lengthened, the number of skipped folds declined, reaching 8 at the 10-minute window. Ensemble modeling reveals the key insights into intraoperative nociception prediction Our baseline Random Forest models and tailored transfer-learning models were ensembled together with the hypothesis that they could not only improve performance but could also reveal key insights about the operative data and best practices when applying deep learning to nociception. The four ensemble strategies include simple linear combination, one-layer logistic regression meta-learner, two-layer non-linear meta-learner, and dynamic GateNet. An initial comparison between a linear combination ensemble of a 200-tree RF versus a pruned 50-tree RF with the 10-minute TL model revealed that while both the RF and RF-TL ensembles outperform the TL model, there was no statistical difference in performance with increasing RF tree number ( Table 4 ) . The pruned RF(50)-TL (AUROC 0.681 [0.679, 0.684]) and unpruned RF(200)-TL (AUROC 0.683 [0.681, 0.686]) performed similarly to each other, but worse than the RF models alone. The pruned RF (AUROC 0.715 [0.713, 0.718]) and unpruned RF (AUROC 0.713 [0.711, 0.716]) also performed similarly to each other. View this table: View inline View popup Download powerpoint Table 4. Pooled AUROC and AUPRC comparison of RF-TL ensemble methods. To investigate the effects of ensemble behavior between RF and TL in nociception detection, a one-layer meta-learner was employed. 
Interestingly, while the pruned RF-TL (AUROC 0.686 [0.684, 0.688]) did not improve in performance, the unpruned RF-TL (AUROC 0.708 [0.706, 0.711]) performed significantly better than its pruned counterpart, despite the pruned RF(50) and unpruned RF(200) having no statistical difference. However, the unpruned RF-TL with the one-layer meta-learner still significantly underperformed its unpruned RF counterpart (p < 0.001). To test a further hypothesis that the interaction between model outputs may be non-linear, we utilized a two-layer meta-learner to potentially boost ensemble performance. The RF(200)-TL (AUROC 0.710 [0.708, 0.713]) performed similarly to the RF(200) (AUROC 0.713 [0.711, 0.716]) without significant difference (p = 0.221). Feature-conditioned gated neural networks allow ensemble interpretation An additional ensemble strategy employing a feature-conditioned gated neural network was used to reveal mechanistic insights of nociceptive signal prediction. The GateNet performed as well as the Random Forest, achieving an AUROC of 0.712 [0.710, 0.715]. A similar per-surgery analysis revealed that 60 of 101 (59.4%) surgeries experienced an AUROC increase while 41 declined (40.6%). However, the median ΔAUROC was 0, and the IQR was [0, 0.001], suggesting spurious improvements. Permutation importances were calculated from the Random Forest, RF(200)-TL ensemble, and the GateNet in Figure 3 , which delineates the top 10 features from each calculation. We note that both the ensemble and Random Forest have high dependencies on time since last sedative dose, tonic electrodermal activity, and the mean heart rate, indicating a high dependency on these factors for predictive power and optimal weight assignments. Download figure Open in new tab Figure 3. Permutation importance of predictive driver features in RF, ensemble, and GateNet. Top 10 permutation importance features for Random Forest (RF), GateNet arbitrator, and RF(200)-TL ensemble model. 
Note that “Mu-” stands for mean, “Sigma-” stands for standard deviation, and “d_-“ indicates first derivative features. GateNet’s top three features included cumulative doses of antinociceptives, sevoflurane, and heart rate variability. Furthermore, unlike the Random Forest and the ensemble, it had proportionally similar, lesser important features such as Tonic EDA and time since beta blocker and alpha-2 agonist dosing. These features were important in determining levels of trust between the RF versus TL models. Given the high permutation importances observed with the cumulative doses of antinociceptives and sevoflurane, the GateNet α was plotted over these two features. Figure 4A depicts α over observed evaluation windows, and Figure 4B shows a heatmap of α over all simulated evaluation windows when independently varying the two features while others are held constant. Both show that α always trends near 1 with a minimum of 0.84, indicating the ensemble relied more on RF than TL to make final prediction decisions. Download figure Open in new tab Figure 4. GateNet α analysis for across antinociceptive and sevoflurane dosing features. a . GateNet α plotted over observed evaluation windows. The colors and their corresponding α are shown in the color bar. b . Simulated α values across simulated evaluation windows by independently varying the respective features. Discussion Accurate detection and management of pain during surgery remains a critical challenge in perioperative medicine, with significant implications for patient outcomes and the advancement of personalized care. Intraoperative nociception is inherently complex, influenced by a dynamic interplay of physiological responses and pharmacological interventions. Traditional monitoring approaches often fail to capture this complexity, leading to suboptimal pain control and increased risk of adverse events. Prior literature has explored machine learning techniques to improve intraoperative pain assessment. 
However, questions surrounding the clinical utility of machine learning methodologies in the context of small data sizes and interpretability remain largely unanswered. Our study is the first to address these questions by introducing a flexible ensemble framework that integrates both classical and deep learning models, guided by a neural gating mechanism. This approach provides interpretable insights into the relative importance of physiological and drug-related features and analyzing methodologies for personalized pain detection. We harness ensemble methods and data science to assess the cost-benefit of increasing model complexity. Personalized deep learning doesn’t always outperform reliable supervised learning techniques Our study found that architecture is not the only factor in creating clinically effective machine learning and artificial intelligence in precision medicine. Despite having the advantage of additional study of up to 10 minutes of a patient’s surgery in each LOSO fold, we find that transfer learning models do not outperform the Random Forest baseline. We provide several pieces of literature evidence supporting this finding. First, the Random Forest is commonly used for performing predictions on medical tabular data 11 because of its strong performance on irregular, non-rotationally invariant data where linear combinations of features may be uninformative for predictive power. 12 In these cases, tree-based methods like Random Forest commonly outperform deep learning methods. 7 , 12 Second, the models in this study were fed expertly crafted features, such as mean, standard deviations, first derivatives, drug timings, and dosages. These features may already capture high amounts of domain knowledge and pattern discovery. 16 For example, permutation importance revealed factors such as TonicEDA, mean heart rate, or standard deviation of heart rate as high importance features in our ensemble. 
These features have been shown to be associated with sympathetic chain function and nociceptive stimuli and response. 17 , 18 Against a 200-tree Random Forest trained on 100 diverse surgeries, the TCN-based transfer learning model, despite traditionally excelling on small sample sizes on prior medical detection studies, 19 , 20 offers marginal benefits in a highly engineered feature set. In fact, fine-tuning on just the first few minutes of surgery could be detrimental, because this initial period can be noisy, not contain enough information, or not representative of the rest of the procedure. As prior studies in medical imaging have shown, 21 , 22 if the transfer learning model overfits to this small, specific window, this may potentially make it worse at prediction compared to the global RF model. On the other hand, our calibration analysis suggests that there may be some benefit of tuning beyond the first few minutes of surgery. This was indicated by the dramatic decrease in number of skipped LOSO folds with increasing adaptation windows, indicating increased capture of clinically relevant information. The unbiased judge: harnessing ensemble methods to reveal insights into medical machine learning Ensemble methods are commonly used to mix behaviors of various models to boost the accuracy for more robust predictions. 23 , 24 They can also be used to reveal insights into methodologies and data. One of the first questions we explored is the difference in performance of the ensembled RF-TL with various combination strategies and RF tree sizes. While there was no difference in performance between 50 or 200 trees, a one-layer meta-learner produced a significant difference favoring the 200-tree RF-TL. This is indicative of the increased stability (lower variance) of increasing tree size, which smooths decision boundaries at the cost of increasing computation burden and model size linearly. 
25 The performance of the 200-tree RF-TL was further improved by the employment of a two-layer meta-learner. Consistent with earlier conclusions, this suggests that the prediction mechanism using the engineered data is non-linear, which is well suited to RF’s robust predictive power. This conclusion is further supported by the GateNet’s arbitrating behavior. GateNet’s top permutation importance features closely mirror those of the Random Forest model, indicating high trust. The alpha analysis suggests that in most cases, the best strategy to minimize error is to trust RF over TL, most likely due to the reasons outlined earlier in the discussion. The clinical implications are substantial. Our findings suggest that for the purposes of nociceptive signal detection, a less computationally intensive, more interpretable, and easier-to-deploy Random Forest model is a superior approach. Especially as interpretability and scalability are at the forefront of discussion in every medical ML/AI tool, pursuing complex, “black box” models like TCNs may not be the best path forward. Instead, proper feature engineering and simple, interpretable models such as Random Forest may offer superior scalability and increase clinician trust without sacrificing accuracy. Final remarks In this comprehensive evaluation of models for nociception detection, we found that a robust, supervised Random Forest model trained on engineered physiological features established a high-performance benchmark. While deep transfer learning offers a promising paradigm for patient-specific adaptation, our results indicate it provided no significant performance gain in this setting. Furthermore, we demonstrate the utility of ensemble methods such as a gated ensemble network as a diagnostic tool, which automatically determined the marginal value of the transfer learning component to be negligible. 
These findings underscore the critical importance of benchmarking against strong classical models and suggest that for this clinical application, a simpler, more efficient model may be the better solution. Data Availability All data was sourced from Subramanian et al. on PhysioNet under data usage agreement and proper citations in the manuscript. All code and analysis can be provided upon reasonable request. The authors plan to upload their code on GitHub. https://physionet.org/content/multimodal-surgery-anesthesia/1.0/ Bibliography 1. ↵ Subramanian S , Tseng B , Del Carmen M , et al. Monitoring surgical nociception using multisensor physiological models . Proc Natl Acad Sci . 2024 ; 121 ( 40 ): e2319316121 . doi: 10.1073/pnas.2319316121 OpenUrl CrossRef PubMed 2. ↵ Shi G , Liu G , Gao Q , et al. A random forest algorithm-based prediction model for moderate to severe acute postoperative pain after orthopedic surgery under general anesthesia . BMC Anesthesiol . 2023 ; 23 ( 1 ): 361 . doi: 10.1186/s12871-023-02328-1 OpenUrl CrossRef PubMed 3. ↵ Van Der Wal I , Meijer F , Fuica R , et al. Intraoperative use of the machine learning-derived nociception level monitor results in less pain in the first 90 min after surgery . Front Pain Res . 2023 ; 3 : 1086862 . doi: 10.3389/fpain.2022.1086862 OpenUrl CrossRef 4. ↵ Fernandez Rojas R , Hirachan N , Brown N , et al. Multimodal physiological sensing for the assessment of acute pain . Front Pain Res . 2023 ; 4 : 1150264 . doi: 10.3389/fpain.2023.1150264 OpenUrl CrossRef 5. ↵ Abdel Deen OMT , Fan SZ , Shieh JS . A Multimodal Deep Learning Approach to Intraoperative Nociception Monitoring: Integrating Electroencephalogram, Photoplethysmography, and Electrocardiogram . Sensors . 2025 ; 25 ( 4 ): 1150 . doi: 10.3390/s25041150 OpenUrl CrossRef PubMed 6. ↵ Xu H , Kinfu KA , LeVine W , et al. When are Deep Networks really better than Decision Forests at small sample sizes, and how? Published online November 2, 2021. 
doi: 10.48550/arXiv.2108.13637 OpenUrl CrossRef 7. ↵ McElfresh D , Khandagale S , Valverde J , et al. When Do Neural Nets Outperform Boosted Trees on Tabular Data? Published online July 15, 2024. doi: 10.48550/arXiv.2305.02997 OpenUrl CrossRef 8. ↵ Xu P , Ji X , Li M , Lu W. Small data machine learning in materials science . Npj Comput Mater . 2023 ; 9 ( 1 ): 42 . doi: 10.1038/s41524-023-01000-z OpenUrl CrossRef 9. ↵ Armitage RC . Implications of Large Language Models for Clinical Practice: Ethical Analysis Through the Principlism Framework . J Eval Clin Pract . 2025 ; 31 ( 1 ): e14250 . doi: 10.1111/jep.14250 OpenUrl CrossRef PubMed 10. ↵ Lee C , Vogt KA , Kumar S. Prospects for AI clinical summarization to reduce the burden of patient chart review . Front Digit Health . 2024 ; 6 : 1475092 . doi: 10.3389/fdgth.2024.1475092 OpenUrl CrossRef PubMed 11. ↵ Hollmann N , Müller S , Purucker L , et al. Accurate predictions on small data with a tabular foundation model . Nature . 2025 ; 637 ( 8045 ): 319 – 326 . doi: 10.1038/s41586-024-08328-6 OpenUrl CrossRef PubMed 12. ↵ Grinsztajn L , Oyallon E , Varoquaux G. Why do tree-based models still outperform deep learning on tabular data? Published online July 18, 2022. doi: 10.48550/arXiv.2207.08815 OpenUrl CrossRef 13. ↵ Brown KE , Yan C , Li Z , et al. Large language models are less effective at clinical prediction tasks than locally trained machine learning models . J Am Med Inform Assoc . 2025 ; 32 ( 5 ): 811 – 822 . doi: 10.1093/jamia/ocaf038 OpenUrl CrossRef PubMed 14. ↵ Goldberger AL , Amaral LA , Glass L , et al. PhysioBank, PhysioToolkit, and PhysioNet: components of a new research resource for complex physiologic signals . Circulation . 2000 ; 101 ( 23 ): E215 – 220 . doi: 10.1161/01.cir.101.23.e215 OpenUrl CrossRef 15. ↵ Subramanian S , Tseng B , Barbieri R , Brown E. Multimodal Physiological Indices During Surgery Under Anesthesia . doi: 10.13026/GS4V-4Q80 OpenUrl CrossRef 16. 
↵ Ciobanu-Caraus O , Aicher A , Kernbach JM , Regli L , Serra C , Staartjes VE . A critical moment in machine learning in medicine: on reproducible and interpretable learning . Acta Neurochir (Wien) . 2024 ; 166 ( 1 ): 14 . doi: 10.1007/s00701-024-05892-8 OpenUrl CrossRef PubMed 17. ↵ Yugar LBT , Yugar-Toledo JC , Dinamarco N , et al. The Role of Heart Rate Variability (HRV) in Different Hypertensive Syndromes . Diagnostics . 2023 ; 13 ( 4 ): 785 . doi: 10.3390/diagnostics13040785 OpenUrl CrossRef PubMed 18. ↵ Forte G , Troisi G , Pazzaglia M , Pascalis VD , Casagrande M. Heart Rate Variability and Pain: A Systematic Review . Brain Sci . 2022 ; 12 ( 2 ): 153 . doi: 10.3390/brainsci12020153 OpenUrl CrossRef 19. ↵ Maray N , Ngu AH , Ni J , Debnath M , Wang L. Transfer Learning on Small Datasets for Improved Fall Detection . Sensors . 2023 ; 23 ( 3 ): 1105 . doi: 10.3390/s23031105 OpenUrl CrossRef PubMed 20. ↵ Safonova A , Ghazaryan G , Stiller S , Main-Knorn M , Nendel C , Ryo M. Ten deep learning techniques to address small data problems with remote sensing . Int J Appl Earth Obs Geoinformation . 2023 ; 125 : 103569 . doi: 10.1016/j.jag.2023.103569 OpenUrl CrossRef 21. ↵ Matas I , Serrano C , Nogales M , et al. Mitigating Overfitting in Medical Imaging: Self-Supervised Pretraining vs . ImageNet Transfer Learning for Dermatological Diagnosis . Published online May 22, 2025. doi: 10.48550/arXiv.2505.16773 OpenUrl CrossRef 22. ↵ Salehi AW , Khan S , Gupta G , et al. A Study of CNN and Transfer Learning in Medical Imaging: Advantages, Challenges, Future Scope . Sustainability . 2023 ; 15 ( 7 ): 5930 . doi: 10.3390/su15075930 OpenUrl CrossRef 23. ↵ Naderalvojoud B , Hernandez-Boussard T. Improving machine learning with ensemble learning on observational healthcare data . AMIA Annu Symp Proc AMIA Symp . 2023 ; 2023 : 521 – 529 . OpenUrl PubMed 24. ↵ Fanizzi A , Fadda F , Maddalo M , et al. 
Developing an ensemble machine learning study: Insights from a multi-center proof-of-concept study. Umer M , ed. PLOS ONE . 2024 ; 19 ( 9 ): e0303217 . doi: 10.1371/journal.pone.0303217 OpenUrl CrossRef PubMed 25. ↵ Lange TM , Gültas M , Schmitt AO , Heinrich F. optRF: Optimising random forest stability by determining the optimal number of trees . BMC Bioinformatics . 2025 ; 26 ( 1 ): 95 . doi: 10.1186/s12859-025-06097-1 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted July 03, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Justifying model complexity: evaluating transfer learning against classical models for intraoperative nociception monitoring under anesthesia Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Justifying model complexity: evaluating transfer learning against classical models for intraoperative nociception monitoring under anesthesia Chanseo Lee , Jaihyoung Lee , Kimon-Aristotelis Vogt , Muhammad Munshi medRxiv 2025.07.01.25330670; doi: https://doi.org/10.1101/2025.07.01.25330670 Share This Article: Copy Citation Tools Justifying model complexity: evaluating transfer learning against classical models for intraoperative nociception monitoring under anesthesia Chanseo Lee , Jaihyoung Lee , Kimon-Aristotelis Vogt , Muhammad Munshi medRxiv 2025.07.01.25330670; doi: https://doi.org/10.1101/2025.07.01.25330670 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Anesthesia Subject Areas All Articles Addiction Medicine (569) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4442) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1510) Epidemiology (15230) Forensic Medicine (30) Gastroenterology (1126) Genetic and Genomic Medicine (6609) Geriatric Medicine (668) Health Economics (998) Health Informatics (4542) Health Policy (1370) Health Systems and Quality Improvement (1613) Hematology (543) HIV/AIDS (1266) Infectious Diseases (except HIV/AIDS) (15923) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (147) Nephrology (668) Neurology (6607) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1146) Occupational and Environmental Health (957) Oncology (3337) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain Medicine (436) Palliative Medicine (130) Pathology (664) Pediatrics (1693) Pharmacology and Therapeutics (692) Primary Care Research (712) Psychiatry and Clinical Psychology (5448) Public and Global Health (9237) Radiology and Imaging (2202) 
Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (596) Sexual and Reproductive Health (714) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a019a2afcf313fe2',t:'MTc3OTc2NTUzNw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00