FreeSurfer version-shuffling can boost brain age predictions

doi:10.1101/2024.06.14.599070

FreeSurfer version-shuffling can boost brain age predictions

2024 · doi:10.1101/2024.06.14.599070

preprint OA: closed CC-BY-4.0

📄 Open PDF Full text JSON View at publisher

Full text 44,922 characters · extracted from preprint-html · click to expand

FreeSurfer version-shuffling can boost brain age predictions | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results FreeSurfer version-shuffling can boost brain age predictions View ORCID Profile Max Korbmacher , View ORCID Profile Lars T. Westlye , View ORCID Profile Ivan I. Maximov doi: https://doi.org/10.1101/2024.06.14.599070 Max Korbmacher 1 Department of Health and Functioning, Western Norway University of Applied Sciences , Bergen, Norway 2 Mohn Medical Imaging and Visualization Center (MMIV) , Bergen, Norway Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Max Korbmacher For correspondence: max.korbmacher{at}hvl.no Lars T. Westlye 3 Department of Psychology, University of Oslo , Oslo, Norway 4 Centre for Precision Psychiatry, Division of Mental Health and Addiction, Oslo University Hospital & Institute of Clinical Medicine, University of Oslo , Oslo, Norway Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Lars T. Westlye Ivan I. Maximov 1 Department of Health and Functioning, Western Norway University of Applied Sciences , Bergen, Norway Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Ivan I. Maximov Abstract Full Text Info/History Metrics Preview PDF Abstract / Key points The influence of FreeSurfer version-dependent variability in reconstructed cortical features on brain age predictions is average small when varying training and test splits from the same data. FreeSurfer version differences can lead to some variability in brain age dependent on the choice of algorithm and individual differences in brain morphometry, highlighting the advantage of repeated random train-test splitting. Shuffling of differently processed FreeSurfer data dependent on the FreeSurfer version increases performance and generalizability of the brain age prediction model. Background The concept of brain age is based on the prediction of participants’ age from a set of brain images using machine learning algorithms. 1 – 4 Subtracting brain age from a person’s chronological age, also called brain age gap or brain age delta, has been outlined as a promising biomarker of brain health, where a larger gap has been related to poorer health outcomes for the participant. 2 , 4 While brain age estimation has been widely applied, there are still multiple sources of prediction errors, which cannot simply be attributed to biological brain features, such as in-scanner subject motion 5 , field strength 6 or other MRI-related technical features, or machine learning workflows (feature space, algorithm, bias correction choices) 7 – 10 . Software packages used in image processing such as FreeSurfer 11 present version-dependent variability in image derived estimates, such as cortical thickness in T 1 -weighted metrics. 12 – 15 The naturally arising question is then whether and how this source of variability influences brain age predictions. A more pronounced knowledge of brain age variability induced by MRI processing software is key for inference and further comparability of brain age study results, as well as the application of trained models on unseen data processed with different FreeSurfer versions. Hence, we aimed to explore brain age predictions based on data obtained from FreeSurfer versions 5.3.0 and 7.4.1 a using the latest data release of the UK Biobank (UKB). Moreover, we introduce version-shuffling for brain age predictions, leveraging training and test data from different FreeSurfer versions boosting prediction accuracy and robustness. Methods Participants We included N = 4,395 UKB participants free of neurological and psychiatric disorders (ICD-10 categories F, G, I, and stroke), and removed MRI feature-level outliers defined as 5 standard deviations above and below the mean. The participants were scanned at three sites in the UK: Cheadle (55.38%), Newcastle (34.99%), and Reading (9.62%). The participants’ age ranged from 49.33 to 82.84 years of age (Mean±SD=64.84±7.27), including 52.53% females. Magentic Resonance Imaging T 1 -weighted images were obtained at 3T on a Siemens Skyra 3T running VD13A SP4 (as of October 2015), with a standard Siemens 32-channel radio frequency receive head coil using a 3D MPRAGE sequence (sagittal, in-plane acceleration iPAT=2, prescan-normalise) at 1x1x1 mm 3 , field-of-view: 208x256x256 matrix, taking approximately 5 minutes. Feature derivation Cortical reconstruction was performed using FreeSurfer versions 5.3.0 and 7.4.1 (from here on v5 and v7), extracting regional cortical thickness, volume and surface area using the Desikan-Killiany cortical parcellation scheme on a single T 1 -weighted 3D image per participant. 16 The data were processed on the Sigma2 Norwegian national computation cluster on RedHat Linux Release 8.10 (Ootpa), Linux kernel version: 4.18.0-513.24.1.el8_9.x86_64. Power analysis With the conservative assumption of a large parameter shrinkage of 40%, we estimated a required training sample size of N = 2,009 to train a brain age model on the selected M = 204 grey matter features using the pmsampsize package 17 in R, version 4.3.3. Hence, we could be confident that our sample size was appropriate to train a meaningful model when using a 50-50 split of the sample, to firstly train models for each FreeSurfer version using least absolute shrinkage and selection operator (Lasso) 17 , Support Vector Machine (SVM) 19 regression, eXtreme Gradient Boosting (XGBoost) 20 regression, and Light Gradient Boosting Machine (LightGBM) 21 , and conventional linear regression. Main statistical analyses We then predicted the age in the test set within and between FreeSurfer version-specific models (see Fig.1 for the procedure). For completeness, in addition to Lasso, we applied alternative machine learning algorithms, namely, SVM 19 regression, XGBoost 20 regression, and LightGBM 21 regression, as well as linear regression. For comparability, we kept all hyperparameters constant at default settings, except for Lasso, where we implemented a higher maximum number of iterations (i Lasso = 100,000) to reach optimal alpha tolerance. A single train-test iteration, from here on the initial iteration , served to compare model performance. For simplicity, the following analyses are focused on results from the best performing model only. Download figure Open in new tab Fig.1: Data processing scheme. Acronyms of the used ML algorithms: SVM = Support Vector Machine, Lasso = Least absolute shrinkage and selection operator, LM = Linear Model (Linear Regression), XGBoost = eXtreme Gradient Boosting, LightGBM = Light Gradient Boosting Machine. To account for individual differences, we used i = 1,000 permutations to randomly split training from test data into equal parts (50% each). Then, we investigated models trained on version-shuffled data, i.e. model training based on features of both FS5 and FS7 in the initial iteration , which each training and test sets were composed of 50% FS5 and 50% FS7 data. As a final step, we varied both train-test splits and version specific feature composition in i = 1,000 permutations, where the data was again equally split into train and test data, and each of these sub-sets contained 50% FS5 and 50% FS7 features. We examined common error metrics (root mean squared error (RMSE), mean absolute error (MAE)), the coefficient of determination (R 2 ), the Akaike information criterion (AIC), the Bayesian information criterion (BIC), and the Pearson correlations between chronological and predicted brain age. Moreover, we estimated differences between these correlations, using Zou 22 and Hittner tests 23 , as well as the version-dependent marginal means of the predicted ages accounting for sex, age, and scanner site, using simple linear models. We also examined scanner site closer as a potential covariate of interest when using different models to predict on the same data using linear mixed effects models where the version of the training data was the random intercept, sex and age covariates, and site the fixed effect. To give a practical example of how training and test data can influence study outcomes, we estimated marginal sex differences from linear models predicting brain age from age, sex, and site for each combination of training and test sets. Analyses of the data structure Supplemental analyses aided to further explore potential reasons for the observed variability, we examined a) the correlations between FS5 and FS7 features, b) the correlations of the most important features, ranked by permutation feature importance, between versions, c) age correlations at the feature level, d) executed a Principal Component Analysis (PCA) sorting the features into thickness, area, volume. Results Cross-version predictions In the initial iteration , we evaluated the performance of different machine learning algorithms against each other. SVM, XGB, LightGBM, and Lasso yielded similar results as the linear regression models, yet with lower prediction performance (see Supplement 1-2). In other words, linear regression was outlined as the superior model. For illustration, we present the predictions of this first iteration in Fig.2 . Note that the differences between the predictions were statistically significant b . Download figure Open in new tab Fig.2: Training and test performance for linear regression models trained on different FreeSurfer versions in the initial iteration . Panels a-b: prediction performance on training data. Panels c-f: prediction performance on test data. FS5 = FreeSurfer v5, FS7 = FreeSurfer v7, arrows indicate which trained model was used to predict which training data, e.g.: “FS7 > FS5” indicates that a model trained on FreeSurfer v7 processed features was used to predict age derived by FreeSurfer v5 processed features. The red line shows a smooth cubic fit with k = 4 knots. When randomly sampling training and test data, we obtained an acceptable model performance in both training and test data (r > 0.67, R 2 > 0.39, MAE < 4.51 years, RMSE < 9.03 years, see Table 1 ), comparable to brain age studies using a similar set of FreeSurfer features c . The variability in the findings from the i = 1,000 random train-test splits suggest that there is an influence of individual differences in brain morphometry on brain age predictions from FS5 and FS7 ( Table 1 ). View this table: View inline View popup Download powerpoint Table 1: Version-specific linear regression train-test performance across i = 1,000 iterations for random training and test splitting. R = Pearson’s Correlation coefficient, adjusted R 2 = adjusted variance explained (linear model), adjusted R 2 corrected = adjusted variance explained (linear model correcting for sex and scanner site), MAE = mean absolute error, RMSE = root mean squared error, AIC = Akaike information criterion, BIC = Bayesian information criterion. FS = FreeSurfer. RMSE and MAE are indicated in years. Values are presented as Mean ± Standard Deviation. Note : The composition of the 50-50 train-test splits were varied (i = 1,000) to examine the influence of individual differences. Training-test performance of the second-best performing algorithm, Lasso, can be found in Supplement 5. Version-shu?ling Version-shuffled models performed better than single-version models on version-shuffled data, and on cross-version predictions ( Table 2 ). Version-shuffled models performed also comparable to single-version models predicting on the data from the initial iteration of the same version as the single-version models ( Table 2 ) and provided slightly better predictions in general in comparison to single-version models where the train test split composition was varied (compare Tables 1 - 2 ), also when varying both version-specific feature composition and train-test composition (compare Table 1 and Supplement 6). View this table: View inline View popup Download powerpoint Table 2: Lasso Modelling Performance Across FreeSurfer Versions averaged across the i = 1,000 trained models. R = Pearson’s Correlation coefficient, adjusted R 2 = adjusted variance explained (linear model), adjusted R 2 corrected = adjusted variance explained (linear model correcting for sex and scanner site), MAE = mean absolute error, RMSE = root mean squared error. FS = FreeSurfer. RMSE and MAE are indicated in years. Modelling column: Mix = version-shuffled, FS5 = FreeSurfer v5 only, and FS7 = Freesurfer v7 only. Values are presented as Mean ± Standard Deviation. 1 Refers to randomly assembled version-shuffled training data. 2 Refers to randomly assembled version-shuffled test data. 3 Refers to models which were established on a single iteration ( i =1), predicting on constant data from the initial iteration . Note: The 50-50 train-test split was held constant but features from FS5 and FS7 were varied within the splits (50-50 FS5 and FS7) to probe the influence of version-shuffling. Hence, there was only a single iteration i = 1, being the initial iteration for single version predictions, as reflected in Fig.2 (as also noted in footnote 3 in this figure). Zou 22 and Hittner tests 23 comparing correlations between brain age and age from the initial iteration for FS5 and FS7 models (i = 1) and the average across predictions (i.e., bagging) when varying the composition of the version-shuffled data (as in Table 2 ) showed no differences between within single-version predictions (FS5 to FS5 and FS7 to FS7) and version-shuffled models’ predictions ( Table 3 ). However, version-shuffled models outperformed single-version models on all other accounts. View this table: View inline View popup Download powerpoint Table 3: Comparison of correlations between age and predicted age in single- and version-shuffled predictions. Version-shuffled predictions were averaged across the i = 1,000 predictions to account for variations in version-dependent feature compositions. For comparability, single version estimates are from the initial iteration (i = 1) to hold the training-test sample structure constant. rdi’ refers to the difference in correlations between age and predicted age. CIlower and CIupper refer to the Zou’s 22 95% confidence interval constructed around rdi’. Z and p values indicate Hittner test 23 derived Z-statistic and according (uncorrected) p-value. The modelling pairs indicate which correlations between each predicted age and chronological age were compared. The observed differences in correlations between chronological age and brain ages can also be observed in the same pattern as presented in Table 3 when directly comparing marginal means of brain ages in version pairs using simple linear models correcting for age, sex and site ( Table 4 ). We observe version-dependent brain age differences of up to 1.42 years using models trained on different versions to predict on the same data. View this table: View inline View popup Table 4: Marginal mean comparison of different models’ brain age predictions corrected for age, sex and site. Estimate = difference between brain ages (in years). SE = Standard Error (in years). df = degrees of freedom. Raw and false discovery corrected p-values are indicated. Version-shuffled brain age predictions were averaged across the i = 1,000 predictions to account for variations in version-dependent feature compositions. For comparability, within and between single version brain age predictions were used from the initial iteration (i = 1) to hold the training-test sample structure constant. For a test of the effect of both train-test splits and (version-dependent) feature composition see Supplement 6. Linear mixed effect models, with the FreeSurfer version being the random intercept, and scanner site, age and sex being fixed effects, did not suggest an effect of scanner site on brain age estimates (p > .340). To provide a practical example of how the different training and test data compositions can affect study results, such as group differences, we used simple linear models for each of the nine brain age predictions (representing brain age models trained on either FS5, FS7 and version-shuffled data predicting into the respective data). The results suggest partly opposing estimates of group differences ( Figure 2 ). Analyses of the data structure a) Correlations between the same features estimated by the different FreeSurfer versions were on average Pearson’s correlation coefficients’ Mean r train = 0.88 in the training data and Mean r test = 0.88 in the test data. To examine potential confounding effects of age, we additionally estimated the same version correlations for participants grouped by age decade, resulting in consistent correlations across age-bins (Supplement 4). The smallest correlations (r < 0.69) were observed for enthorinal cortex as well as temporal and frontal poles’ volume and surface area. b) The 10 highest ranked features in the permutation feature importance rankings of models trained on FS5 or FS7 data were similar for the models trained on FS5 compared to FS7 (Supplement 3). Moreover, these features were strongly correlated between versions (r > 0.90). The strongest age-associated regions in the training data reflected prominent features contributing to the age predictions (inferior parietal, superior frontal, and precentral) when ranked by permutation feature importance (see Supplement 3), in terms of thickness across versions (r < -0.42), but smaller for volume (r < -0.22), and small for surface area (r < - 0.07). c) Moreover, feature-age associations were on average influenced by the Freesurfer versions, as indicated by paired samples t-tests (Mean Pearson’s r: [FS5 train = -0.17, FS7 train = -0.15, t = -4.998, p = 1.25 x 10 -6 , d = -0.350], [FS5 test = -0.18, FS7 test = -0.13, t = -14.859, p < 2.20 x 10 -16 , d = -1.040]), yet d) PCAs produces similarly structured components across FreeSurfer versions (see Figure 4 for an overview of a-d). Download figure Open in new tab Fig.3. Age and site-corrected sex differences in brain age estimates from models trained on and predicting on data from FS5, FS7, and a version mix. Coefficients b > 0 indicate a higher brain age of females, b < 0 for males, respectively. The sex differences were significant at pFDR < .05 for all predictions except for FS5 to FS7, mix to FS7, and FS5 to mix. FS5 = FreeSurfer version 5, FS7 = FreeSurfer version 7, Mix = 50-50 mix of FS5 and FS7 data. FDR = false discovery rate. Version-shuffled models were established by randomly sampling training data in i = 1,000 iterations, and version-shuffled test data were established in the same procedure. The average of the predictions (from i > 1) was used here (bagging approach). Download figure Open in new tab Fig.4: Panel a) Shows the correlations of each feature estimated from FS5 and FS7. Panel b) presents age associations of each feature by data set (and FreeSurfer version). Panel c) indicates the variance explained for each of the first 10 components for each dataset. Panel d) presents the relative distributions of feature types for the first two components. Discussion Brain age predictions are influenced by various factors, some of which are not biological. For meaningful predictions and construction of brain age biomarkers, such factors need to be mapped and potentially controlled for. Here, we examined how different FreeSurfer versions might influence brain age predictions, comparing versions 5.3.0 and 7.4.1. While it is known that FreeSurfer version differences lead to variability in feature estimates, these differences seem to be relatively small. As brain age is a multivariate method, such systematic differences might however influence model training and predictions. While we find that the differences between brain age predictions across FreeSurfer versions are negligible when using i = 1,000 iterations to randomly split training and test sets ( Table 1 ), in individual cases these predictions can induce significant performance differences (here described on the example of one individual initial iteration ; Fig.2 - 3 ; Table 2 - 4 ). This can be both expressed in terms of direct differences in brain age estimates (of up to 1.42 years; Table 2 - 4 ), as well as resulting non-negligible differences in estimations of the effect of sex ( Figure 3 ). We present that training on features from different versions might boost model performance and generalizability. Brain age is predicted most accurately across examined models when training on version-shuffled data and predicting on version-shuffled data ( Table 1 - 2 ). Moreover, predictions from models trained on version-shuffled data on single-version data show only small differences to prediction from models trained on that same single version ( Table 4 ). Specifically, version-shuffled models’ age predictions are just as good as single-version models when predicting on the specific single version data considering FS5 (FS5 train to FS5 test ) and nearly as good as single-version models considering FS7 (FS7 train to FS7 test ). Version-shuffled models clearly outperform single-version models attempting to predict age from data processed with another FreeSurfer version (FS5 train to FS7 test or FS7 train to FS5 test ; Table 3 ). Moreover, version-shuffled models outperform single-version models in predicting age in version-shuffled data, with this difference being however only marginally significant for predictions from the FS7-trained model d . Finally, even when varying test and training splits and which features were recruited from which FreeSurfer version (Supplement 6), version-shuffled models performed similar to, yet slightly better than, single-version models ( Table 2 - 3 ), outlining the advantages of version-shuffling when attempting to build generalizable models. These findings are important considering goals such as increasing accuracy, generalizability, and at the same time the continuous updating of FreeSurfer leading to variability of versions in the data available to researchers. Particularly when attempting to train brain age models, which is usually done on large and multi-site datasets, often requiring merging data processed in different software version, computational overhead can be reduced by shuffling FreeSurfer versions during the training process (when different versions are used at random) instead of reconstructing cortical features from scratch to match a certain target version. Such advantage of reduced version-specific feature engineering is also true when considering the usage of pre-trained models with the goal of predicting brain age in new samples. The outlined version-shuffling procedure is additionally rewarded by the largest accuracy across models when predictions are made on version-shuffled test data ( Table 2 ), especially when using a bagging approach where predictions from multiple models are averaged ( Table 3 - 4 ). Moreover, we find simple linear regression to outperform more advanced machine learning approaches in this ageing sample, older than 49 years of age. This might speak to previously outlined near-linear relationships between age and brain features in that age group 1 , 3 , in contrast to non-linear relationships considering the whole lifespan 26 , 27 . Linear models have a simple structure, are fast to compute, and might be more generalizable across FreeSurfer versions than other algorithms such as Lasso, which was the second-best performing algorithm (compare Table 1 and Supplement 5). To provide additional information on differences in data structures resulting from the usage of FS5 compared to FS7, we ran supplementary analyses comparing feature estimates across versions, their age-associations, as well as principal components. The principal components/correlational structure of features was similar across versions, and cross-version feature correlations were strong (r > 0.87). The lowest correlations between the same features estimated from different FreeSurfer versions were located in the frontal and temporal poles, which is coherent with previous findings comparing similar or different FreeSurfer versions. 12 , 15 Additionally, feature-age associations differed between versions, potentially also influencing brain age predictions. In particular, age-associations of FS5 features were stronger than for FS7, also shown previously by Haddad and colleagues 15 . While brain age estimates from FS5 features were stronger associated with age than predictions based on FS7 features in the initial iteration (i = 1; Figure 2 ), this was not the case when repeatedly randomly splitting the data into training and test sets, showing no differences in predictions on test data ( Table 1 ). FS7 models from that first iteration produced better cross-version and version-shuffled predictions than FS5. Hence, FS7 might be the better choice for accurate brain age predictions across versions or on version-shuffled data. A few explanations for the version-dependent variability can be found in version-dependent differences in feature estimations (and their age-dependencies), which finally lead to differences in feature contributions to the age predictions (Supplement 3). However, such feature rankings are limited by multicollinearity in the present data and must be interpreted with care. Nevertheless, the outlined differences in both predictions ( Figure 1 , Table 2 - 4 ) and accompanying feature importances (Supplement 3) in the initial iteration compared to an absence of prediction performance differences when varying training and test splits ( Table 1 ) suggest an influence of individual differences on FreeSurfer version-specific cortical reconstructions. Repeated random train-test splits, as executed here, might hence produce more robust brain age estimates. Another, and potentially preferable, option is to use version-shuffling, where the composition of version-specific features can be varied, if data from more than one version are available for the same participants. Both strategies are computationally feasible when either using a) a fixed selection of hyperparameters for more complex models, such as Lasso, SVM, or tree boosting, or b) simple models such as linear regression. This allows for predictions from hundreds or thousands of models which can for example be averaged to provide more robust brain age estimates (bagging procedure). The attempt to establish brain age as a biomarker entails to establish robust associations of brain age with pheno- and genotypes or pheno- and genotype-based group differences. Here, we used sex differences as an example phenotype. Sex-differences in brain age usually entail a higher brain age in males in samples with comparable age spans 1 , 28 , which largely corresponds with our findings ( Figure 3 ). However, we find that when predicting on FS7 data with an FS5-trained model, this relationship can even be reversed. Yet, considering the estimated derivatives, within-version sex differences (FS5 train to FS5 test , FS7 train to FS7 test , FSmix train to FSmix test ) were highly similar. In contrast, sex differences in cross-version predicted brain ages (FS5 train to FS7 test or FS7 train to FS5 test ) deviated from within-version estimates more strongly than when obtained from version-mixed models (FSmix train to FS7 test or FSmix train to FS5 test ). These sex differences in brain ages indicated by models trained on version-shuffled data were laying somewhat between the estimates from single-version trained models. This indicates that cross-version predictions might lead to over- or underestimations of pheno- or genotype effects on brain age. Yet, there is no clear answer to which model supplies most accurate sex differences. The reason is that there is no ground truth for these differences. The goal of these analyses was also not to benchmark sex differences in brain age. Instead, with the presented sex differences, we wanted to highlight that choices of the FreeSurfer version when computing training and testing data features can impact phenotype associations with brain age or group differences. There are various limitations to this study. First, we assumed an equal share of FS5 to FS7 features within both training and test sets. Different compositions of features of different versions are more realistic and can be further examined. Second, site effects are a known phenomenon, influencing both FreeSurfer-derived metrics 15 and potentially interaction effects of version and site on brain age estimation. While we did not find site effects, both site effects and site-version interactions require further investigations. These analyses could focus on a wider range of versions and especially consider data sets with larger heterogeneity between sites, for example in terms of acquisition protocols. Third, the manuscript is limited to the Desikan-Killiany atlas and its containing regions. Other atlases and features derived from other procedures, such as vertex-wise analyses, should be examined in future investigations. Connected to this point, there should be, fourth, observed different samples beyond the UK Biobank as well as additional phenotypes. Fifth, much of the inference presented here to compare brain age models trained on and predicting on version-shuffled and single-version data is based on a single random iteration of train-test splits (besides the findings presented in Table 1 ). This choice was based on the computational overhead which would result from varying training-test splits and their feature compositions (e.g., 1,000 x 1,000 times) as well as non-trivial analyses strategies resulting from the generated data. However, we included an analysis where we varied both train-test splits and feature compositions together in i = 1,000 iterations (Supplement 6). Sixth, in an optimal scenario, features would be available for more than one version for the entire sample to being able to randomly draw training and test splits assembled of the data from the different FreeSurfer versions. In reality, most likely not all data are available for several versions, and instead systematically per version. For example, one site processed all data with FS5, another site with FS7, or all data before a certain date were processed with an earlier FreeSurfer version than at a later point in time. We did not examine such systematic version influences, but rather at random and with equal contributions. FS7-trained models outperform FS5-trained models predicting on version-shuffled data, FS7 data, and differ only slightly when predicting on FS5 data. This suggest FS7 as the preferrable version when choosing single-version training data. However, models trained on version-shuffled data perform just as good FS5 trained models when predicting on FS5 data, differ only slightly from FS7-trained models when predicting on FS7 data, and are clearly outperforming on single-version trained models when predicting on version-shuffled data. Models trained on version-shuffled data might hence be more generalizable considering variability in versions and might pose an advantage over single-version trained models. Code Availability All code is available at https://github.com/MaxKorbmacher/FS_BrainAge Data Availability All raw data are available from the UKB ( https://www.ukbiobank.ac.uk ) in accordance with data acessing procedures. Conflicts of Interest We declare no conflict of interest. Ethics Statement This study was conducted using UKB data under Application 27412. UKB received approval from the North West Multi-centre Research Ethics Committee (MREC) as a Research Tissue Bank. Author Contributions M.K.: Conceptualisation, Data curation, Formal Analysis, Methodology, Project Administration, Software, Visualization, Writing – original draft, Writing – review & editing. L.T.W: Funding acquisition, Writing – review & editing. I.I.M.: Conceptualisation, Data curation, Funding acquisition, Software, Writing – review & editing Supplement View this table: View inline View popup Download powerpoint Supplement 1: Training Model Performance Across Algorithms ( initial iteration ) View this table: View inline View popup Download powerpoint Supplement 2: Test Model Performance Across Algorithms ( initial iteration ) View this table: View inline View popup Download powerpoint Supplement 3: Permutation feature importance: top 10 features and their variance explained when predicting age ( initial iteration ) Download figure Open in new tab Supplement 4: Age stratefied correlations between FS5 and FS7 feature estimates View this table: View inline View popup Download powerpoint Supplement 5: Lasso training and test when 1000 times repeatedly randomly sampling training and test data (compare with Table 1 ) View this table: View inline View popup Download powerpoint Supplement 6: Supplement 6: Performance of models varying both training and test splits as well as feature composition (FreeSurfer version) Acknowledgements We want to thank the facilitators and all participants of the UKB. This research was funded by the Research Council of Norway (#223273, L.T.W.); and the European Union’s Horizon2020 Research and Innovation Programme (#802998 L.T.W.). The work was performed on the Service for Sensitive Data (TSD) platform, owned by the University of Oslo, operated and developed by the TSD service group at the University of Oslo IT-Department (USIT). Computations were performed using resources provided by UNINETT Sigma2—the National Infrastructure for High Performance Computing and Data Storage in Norway. Footnotes ↵ a Multiple changes have been made from Freesurfer version 5.3.0 to 7.4.1 to improve both efficiency and accuracy of the reconstructions, such as changes in denoising, white-grey-matter boundary identification, and surface reconstruction. ↵ b While the prediction performance was on average similar between FreeSurfer versions, we observed differences in predictions in the practical example of the initial iteration (Fig.2). Here, Zou 22 and Hittner tests 23 , used to compare correlations between brain age and age, indicated that FS5 predicted more accurately within version, in training data (r diff = 0.069, 95%CI[0.049, 0.085], z = 7.272, p < .0001), and test data (r diff = 0.019, 95%CI[0.006, 0.033], z = 4.493, p < .001), and FS7 between versions (r diff = -0.054, 95%CI[-0.072, -0.036], z = -5.9637, p < .0001; see also Fig.2, Supplement 1-2). ↵ c Examples: Beck et al. 24 : R 2 = 0.73, MAE = 7.2 years, RMSE = 9.11 years; Han et al. 25 : R 2 = 0.47, MAE = 7.50. ↵ d Using a bagging procedure averaging across predictions from different models. References 1. ↵ Franke , K. & Gaser , C . Ten years of BrainAGE as a neuroimaging biomarker of brain aging: what insights have we gained? Front Neurol 789 ( 2019 ). 2. ↵ Korbmacher , M. et al. Brain asymmetries from mid-to late life and hemispheric brain age . Nat. Commun . 15 , 956 ( 2024 ). 3. ↵ Korbmacher , M. et al. Brain-wide associations between white matter and age highlight the role of fornix microstructure in brain ageing . HBM 44 , ( 2023 ). 4. ↵ Kaufmann , T. et al. Common brain disorders are associated with heritable patterns of apparent aging of the brain . Nat Neurosci . 22 , 1617 – 1623 ( 2019 ). OpenUrl CrossRef PubMed 5. ↵ Moqadam , R. , Dadar , M. & Zeighami , Y . Investigating the impact of motion in the scanner on brain age predictions . Imaging Neurosci . 2 , 1 – 21 ( 2024 ). OpenUrl CrossRef 6. ↵ Korbmacher , M. et al. Considerations on brain age predictions from repeatedly sampled data across time . Brain Behav . 13 , e3219 ( 2023 ). OpenUrl 7. ↵ More , S. et al. Brain-age prediction: A systematic comparison of machine learning workflows . NeuroImage 270 , 119947 ( 2023 ). 8. de Lange , A.-M. G. & Cole , J. H . Commentary: Correction procedures in brain-age prediction . NeuroImage Clin . 26 , 102229 ( 2020 ). 9. Lee , W. H . The Choice of Machine Learning Algorithms Impacts the Association between Brain-Predicted Age Difference and Cognitive Function . Mathematics 11 , 1229 ( 2023 ). OpenUrl 10. ↵ Dörfel , R. P. et al. Prediction of brain age using structural magnetic resonance imaging: A comparison of accuracy and test–retest reliability of publicly available software packages . Hum. Brain Mapp . 44 , 6139 – 6148 ( 2023 ). OpenUrl 11. ↵ Fischl , B. FreeSurfer . NeuroImage 62 , 774 – 781 ( 2012 ). OpenUrl CrossRef PubMed Web of Science 12. ↵ Gronenschild , E. H. B. M. et al. The Effects of FreeSurfer Version, Workstation Type, and Macintosh Operating System Version on Anatomical Volume and Cortical Thickness Measurements . PLOS ONE 7 , e38234 ( 2012 ). OpenUrl CrossRef PubMed 13. Hedges , E. P. et al. Reliability of structural MRI measurements: The effects of scan session, head tilt, inter-scan interval, acquisition sequence, FreeSurfer version and processing stream . NeuroImage 246 , 118751 ( 2022 ). 14. Bigler , E. D. et al. FreeSurfer 5.3 versus 6.0: are volumes comparable? A Chronic Effects of Neurotrauma Consortium study . Brain Imaging Behav . 14 , 1318 – 1327 ( 2020 ). OpenUrl 15. ↵ Haddad , E. et al. Multisite test–retest reliability and compatibility of brain metrics derived from FreeSurfer versions 7.1, 6.0, and 5.3. Hum . Brain Mapp . 44 , 1515 – 1532 ( 2023 ). OpenUrl 16. ↵ Desikan , R. S. et al. An automated labeling system for subdividing the human cerebral cortex on MRI scans into gyral based regions of interest . NeuroImage 31 , 968 – 980 ( 2006 ). OpenUrl CrossRef PubMed Web of Science 17. ↵ Riley , R. D. et al. Calculating the sample size required for developing a clinical prediction model . BMJ 368 , m441 ( 2020 ). 18. Tibshirani , R . Regression Shrinkage and Selection via the Lasso . J. R. Stat. Soc. Ser. B Methodol . 58 , 267 – 288 ( 1996 ). OpenUrl 19. ↵ Cortes , C. & Vapnik , V. Support-vector networks . Mach. Learn . 20 , 273 – 297 ( 1995 ). OpenUrl CrossRef Web of Science 20. ↵ Chen , T. & Guestrin , C. Xgboost: A scalable tree boosting system . in Proceedings of the 22nd acm sigkdd international conference on knowledge discovery and data mining 785 – 794 ( 2016 ). 21. ↵ Ke , G. et al. LightGBM: A Highly Efficient Gradient Boosting Decision Tree . in Advances in Neural Information Processing Systems vol. 30 ( Curran Associates, Inc. , 2017 ). 22. ↵ Zou , G. Y . Toward using confidence intervals to compare correlations . Psychol. Methods 12 , 399 – 413 ( 2007 ). OpenUrl CrossRef PubMed Web of Science 23. ↵ Hittner , J. B. , May , K. & Silver , N. C . A Monte Carlo Evaluation of Tests for Comparing Dependent Correlations . J. Gen. Psychol . 130 , 149 – 168 ( 2003 ). OpenUrl CrossRef PubMed Web of Science 24. ↵ Beck , D. et al. Cardiometabolic risk factors associated with brain age and accelerate brain ageing . HBM 43 , 700 – 720 ( 2022 ). OpenUrl 25. ↵ Han , L. K. M. et al. A large-scale ENIGMA multisite replication study of brain age in depression . Neuroimage Rep . 2 , 100149 ( 2022 ). OpenUrl 26. ↵ Beck , D. et al. White matter microstructure across the adult lifespan: A mixed longitudinal and cross-sectional study using advanced diffusion models and brain-age prediction . NeuroImage 224 , 117441 ( 2021 ). OpenUrl CrossRef 27. ↵ Bethlehem , R. A. et al. Brain charts for the human lifespan . Nat 604 , 525 – 533 ( 2022 ). OpenUrl 28. ↵ Subramaniapillai , S. et al. Sex differences in brain aging among adults with family history of Alzheimer’s disease and APOE4 genetic risk . Neuroimage-Clin . 30 , ( 2021 ). View the discussion thread. Back to top Previous Next Posted June 16, 2024. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following FreeSurfer version-shuffling can boost brain age predictions Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share FreeSurfer version-shuffling can boost brain age predictions Max Korbmacher , Lars T. Westlye , Ivan I. Maximov bioRxiv 2024.06.14.599070; doi: https://doi.org/10.1101/2024.06.14.599070 Share This Article: Copy Citation Tools FreeSurfer version-shuffling can boost brain age predictions Max Korbmacher , Lars T. Westlye , Ivan I. Maximov bioRxiv 2024.06.14.599070; doi: https://doi.org/10.1101/2024.06.14.599070 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Neuroscience Subject Areas All Articles Animal Behavior and Cognition (7643) Biochemistry (17717) Bioengineering (13910) Bioinformatics (42015) Biophysics (21477) Cancer Biology (18626) Cell Biology (25536) Clinical Trials (138) Developmental Biology (13392) Ecology (19935) Epidemiology (2067) Evolutionary Biology (24356) Genetics (15617) Genomics (22530) Immunology (17755) Microbiology (40437) Molecular Biology (17200) Neuroscience (88703) Paleontology (667) Pathology (2840) Pharmacology and Toxicology (4832) Physiology (7657) Plant Biology (15171) Scientific Communication and Education (2046) Synthetic Biology (4304) Systems Biology (9828) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-24T02:00:01.246996+00:00

License: CC-BY-4.0