Full text
75,070 characters
· extracted from
preprint-html
· click to expand
Supervised and unsupervised machine learning methods for modelling current and future habitat of Peruvian anchovy | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Supervised and unsupervised machine learning methods for modelling current and future habitat of Peruvian anchovy View ORCID Profile Mariana Hill , View ORCID Profile Tianfei Xue , View ORCID Profile Jaard Hauschildt , View ORCID Profile Mariano Gutiérrez , View ORCID Profile Tronje Kemena doi: https://doi.org/10.1101/2025.05.08.652876 Mariana Hill 1 GEOMAR Helmholtz Centre for Ocean Research Kiel , Wischhofstr. 
1-3, 24148, Kiel, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mariana Hill For correspondence: mhill-cruz{at}geomar.de Tianfei Xue 1 GEOMAR Helmholtz Centre for Ocean Research Kiel , Wischhofstr. 1-3, 24148, Kiel, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tianfei Xue Jaard Hauschildt 1 GEOMAR Helmholtz Centre for Ocean Research Kiel , Wischhofstr. 1-3, 24148, Kiel, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Jaard Hauschildt Mariano Gutiérrez 2 Instituto Humboldt de Investigación Marina y Acuícola , Av. República de Panamá 3591 piso 9, San Isidro, 15036, Lima, Peru Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Mariano Gutiérrez Tronje Kemena 1 GEOMAR Helmholtz Centre for Ocean Research Kiel , Wischhofstr. 1-3, 24148, Kiel, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tronje Kemena Abstract Full Text Info/History Metrics Preview PDF Abstract Understanding the drivers and potential impacts of environmental variability on the distribution of Peruvian anchovies, the largest single-species fishery on the planet, is essential for their proper management in a changing world. However, the intricate interactions of these organisms and environmental variability require the use of complex models such as machine learning methods. In this study, we compared three methods for producing habitat maps of anchovies: the traditional Generalised Additive Models, the XGBoost which is a form of supervised machine learning and a new method based on clustering water types as a form of unsupervised machine learning. 
We optimised the three methods with a parameter grid search algorithm and compared their capability to replicate the mean state of anchovies by comparing them with presence-absence observations along the Peruvian coastline between 1990 and 2010. We used the output of a physical-biogeochemical model as input for the habitat models to produce distribution maps of anchovy. All models successfully simulated the distribution of anchovies along the Peruvian coastline in normal years and a reduced area of distribution with most of the anchovies in the southern part of the domain during the canonical El Niño 97/98. We then applied the models to predict potential changes in the distribution of anchovies under projected temperature and wind conditions by the end of the century. We observed a reduction in the probability of anchovy occurrence under conditions of higher temperature and weaker winds. Two of the three habitat models predicted a severe maximum decline by 90% (GAM) and 75% (XGBoost) whereby the clustering model predicted a moderate maximum decline in anchovy occurrence by 20%. 1 Introduction Understanding the drivers and potential impacts of environmental variability on the Peruvian anchovy ( Engraulis ringens ) is crucial to assess the potential climate change effects on this fishery of global importance as well as the economic consequences. The Peruvian anchovy is the largest single-species fishery in the world ( Chavez et al., 2003 ; Chavez et al., 2008 ), accounting for 24 (23 for fish oil) to 47 % of the global fishmeal and fish oil supply (Freón et al., 2017; Avadí, Freón, and Tam, 2014) which are an important feed source for aquacultural production (Freón et al., 2014). This species inhabits the coastal waters within the Northern Humboldt Current System (NHCS), typically spanning latitudes from about 4 ° S up to 24 ° S in two well-defined stocks ( Bouchon et al., 2021 ). 
Anchovies respond in complex ways to the strong environmental interannual variability of the southeastern Pacific Ocean. The anchovy fishery was a proliferating business that increased over the 1960’s reaching landings of 10.9 Mt in 1970 and collapsing shortly afterwards during the strong El Niño event 72/73 ( Alheit and Niquen, 2004 ). Fish populations declined by over 50% during El Niño 82/83 event ( Barber and Chavez, 1983 ) and catches dropped to only 1 Mt in 1998 (Ñiquen and Bouchon, 2004), possibly due to a re-distribution of the fish towards the more coastal waters due to El Niño 97/98 ( Bertrand et al., 2004 ). Long-term trends in environmental factors due to climate change might substantially impact the habitat of anchovies. Under climate change, the upwelling centre is projected to shift poleward ( Rykaczewski et al., 2015 ), potentially resulting in a displacement of the anchovy population ( Pinsky et al., 2013 ; Checkley, Asch, and Rykaczewski, 2017 ). However, any wind changes in the NHCS under climate change and their effects on upwelling strength and coastal sea-surface temperature are still highly uncertain, as the effects of increased stratification due to surface warming and increased upwelling-favourable winds may balance each other to some extent ( Gutiérrez et al., 2011 ; Echevin et al., 2012 ; Echevin et al., 2020 ). Furthermore, climate change is expected to increase the frequency and intensity of El Niño events ( Gergis and Fowler, 2009 ; Shin et al., 2022 ) which have affected the fishery in the past. The drivers of anchovy distribution and its response to environmental changes are not yet completely understood. Chavez et al. (2003) concluded that anchovy abundance may be correlated with cold water regimes, while Bertrand et al. (2011) point out that oxygen also plays an important role. Brochier et al. (2013) and Flores-Valiente et al. 
(2023) suggest that early stages of anchovy may benefit from climate change as a result of increased retention and potentially higher food availability due to a strengthened stratification; nonetheless, Checkley, Asch, and Rykaczewski (2017) note that a future reduction in nutrient supply could diminish plankton productivity and alter its composition, thereby reducing the food available to anchovies and impacting their population. Currently, anchovies have a competitive advantage compared to other species due to their ability to cope with low oxygen concentrations and have, therefore, been thriving in decades characterised by such conditions ( Bertrand et al., 2011 ). However, under future environmental conditions with even lower oxygen, anchovies might be replaced by smaller fish ( Salvatteci et al., 2022 ). This indicates a complex interaction between anchovy and environmental dynamics in the NHCS. Habitat models, also known as species distribution models or habitat niche models, predict potential areas of distribution of marine organisms, such as anchovies. By finding relationships between their observed occurrence and environmental drivers, these models offer valuable insight into potential climate change-driven habitat changes. To do so, habitat models can be trained using several environmental predictors such as temperature, bathymetry and wave energy, among others (e.g., Luján Paredes, 2016 ; Oliveros-Ramos, 2014 ; Gogina et al., 2016 ; Schubert, Hukriede, and Karez, 2015 ; Sequeira et al., 2014 ) to predict biomass, abundance, presence only and presence-absence data ( Guisan and Zimmermann, 2000 ; Oliveros-Ramos and Shin, 2023 ; Sequeira et al., 2014 ; Grüss, Drexler, and Ainsworth, 2014 ). The relationship between these various predictors and the target variables can be very complex and non-linear, requiring machine learning or other advanced modelling techniques to be captured.
Generalised Linear Models (GLM) and Generalised Additive Models (GAM) are commonly used methods for modelling habitat due to their capability of fitting non-normally distributed data and capturing the joint impact of several predictors ( Guisan and Zimmermann, 2000 ). In recent years supervised machine learning algorithms such as random forests, and particularly XGBoost, which is a scalable machine learning system for tree boosting ( Chen and Guestrin, 2016 ), have become a popular method with the rise of artificial intelligence applications to scientific problems, as an alternative to the traditional statistical models. Clustering is a method of unsupervised machine learning in which patterns in the data are identified. Unsupervised learning is commonly used for exploratory data analysis but, to our knowledge, has not been applied for making fish habitat predictions so far. Machine learning methods are suitable to disentangle the complex and non-linear links between environmental parameters and to predict the distribution of marine organisms; hence they might be able to capture the intricate relationship between anchovy habitat preferences and its drivers. In this study, we compared three methods for predicting the distribution of Peruvian anchovy in the northern Humboldt Current System. We applied XGBoost and GAM, both supervised learning methods and a new approach with unsupervised learning that involves clustering. In-situ observations were used to train the models. For climate change estimates, we used a physical-biogeochemical model to predict the anchovy habitat. The model parameter validations were done with model data, hence accounting for biases that might arise from the differences between the physical-biogeochemical model output and field observations. We designed a common algorithm for training and comparing the three methods. 
We compared the predictions by our three types of models during normal years between 1990 and 2010 and during 1998, when anchovy catches were low due to El Niño ( Bertrand et al., 2004 ). Finally, we used our models to simulate potential anchovy distribution under idealised climate change conditions. 2 Methods 2.1 Data description The observational data was obtained from acoustic surveys performed by the Instituto del Mar del Perú (IMARPE) along the Peruvian coastline in transects up to 200 nautical miles (nm) long from the coast towards the open ocean ( Figure 1 ) from 1990 to 2010 ( Gutiérrez et al., 2012 ). The cleaned dataset consists of 219 thousand samples. Anchovy abundance data is highly skewed to the absence class with values as high as 182.5 thousand but a mean of only 216 m²/square nautical mile (nm²). Therefore, we transformed anchovy biomass to presence-absence, which resulted in 25.3% of the samples with anchovy presence and 74.7% with absence ( Figure 1 ). In addition, we transformed all environmental variables to a range from 0 to 1 using the StandardScaler function of scikit-learn ( Pedregosa et al., 2011 ). For the supervised learning models, we transformed the distributions of the environmental variables with a quantile transformer into a Gaussian-like distribution ( Pedregosa et al., 2011 ). Download figure Open in new tab Figure 1. Map of the study area with averaged modelled sea surface temperature (background colour, °C) in CROCO-BioEBUS and circles on top showing the occurrence rate of anchovies in acoustic surveys (circle colour) and the total number of measurements available for the period of interest across the sampled region (circle size). 2.2 Ocean Physical-Biogeochemical Model We employed a coupled ocean physical-biogeochemical model to investigate how the environmental conditions affect the distribution of anchovy.
The model combines the CROCO (Coastal and Regional Ocean COmmunity model, Shchepetkin and McWilliams, 2005 ) for physical environmental conditions with the BioEBUS (Biogeochemical model for the Eastern Boundary Upwelling Systems, Gutknecht et al., 2013 ) model. The CROCO framework provides the background for simulating ocean circulation, temperature, and salinity on a regional scale that extends from 10°N to 33°S and from 69°W to 118°W with a horizontal resolution of 1/12° and 32 vertical sigma levels. The BioEBUS model is integrated with CROCO to represent the biogeochemical environment with a complex nitrogen cycle including oxygen-dependent processes like nitrification and remineralisation. The physical component of the model runs in time-steps of 500 seconds, the biogeochemical component in steps of 1500 seconds and the model output has a monthly resolution. Here, to align with the acoustic cruise data detailed in Sect. 2.1 , we have narrowed the focus of our study to a smaller domain covered by the available observations, ranging from 3°S to 19°S latitude and from 70°W to 84°W longitude, as illustrated in Fig. 1 . 2.2.1 Hindcast simulation To simulate the historical interannual variability in the environment that potentially affects fish distribution, we used an interannual configuration of CROCO-BioEBUS. Surface heat and freshwater flux forcing for CROCO are provided by CFSR (Climate Forecast System Reanalysis) data, and the wind forcing by the CCMP (Cross-Calibrated Multi-Platform) product with a temporal resolution of six hours. Initial and boundary conditions are from monthly SODA (Simple Ocean Data Assimilation, Carton, Chepurin, and Chen, 2018 ) and monthly climatology CARS (CSIRO - Commonwealth Scientific and Industrial Research Organisation Atlas of Regional Seas, Ridgway, Dunn, and Wilkin, 2002 ).
A 30-year spin-up period, by repeating the forcing of the year 1990, ensures model equilibrium before conducting simulations for the period from 1990 to 2010. After spin-up, the model is forced by monthly interannually-varying forcing during the period from 1990 to 2010, which was later used for training the habitat models. The same model set-up was previously used in an End-to-End model study investigating the bottom-up impact of plankton dynamics on higher trophic levels, including anchovy ( Hill Cruz et al., 2022 ). Model evaluations are available in ( Hill Cruz et al., 2022 ; Xue et al., 2024 [In review]). We sampled the model output at the same spatial and temporal coordinates where field observations were collected. Sea surface temperature (SST), distance from the coast and depth are well correlated between the model and observations ( Table 1 ; Figure 2 ). Meanwhile, sea surface salinity and oxycline depth have a lower correlation of 0.63 and 0.56, respectively. Furthermore, chlorophyll a has higher values in CROCO-BioEBUS than in the observations (Appendix A) and it exhibits a correlation of only 0.28 so we did not include this variable in our habitat models. A bi-modal distribution is observed in SST ( Figure 2 ) due to the sampling taking place in two main seasons (see Appendix A). On the other hand, the two peaks are less distinct in the CROCO-BioEBUS model SST, possibly due to the higher diffusion compared to the real world. View this table: View inline View popup Download powerpoint Table 1. Correlations between environmental variables in observations and in CROCO-BioEBUS Download figure Open in new tab Figure 2. Distribution for observational data and CROCO-BioEBUS hindcast output. The latter is interpolated in time and space to match observations. Lines indicate a kernel density estimate. Download figure Open in new tab Figure 3. Anchovy probability of occurrence and concentration predicted for each water type. 
2.2.2 Climate change experiments We performed a set of three sensitivity experiments (TEMP+, WIND+, WIND-) to test how our predicted anchovy distribution responds to different environmental states simulated by the coupled physical-biogeochemical model. The sensitivity experiments were designed to represent wind speed changes and warming which could realistically occur by the end of the century according to climate models. These changes were applied to a climatological reference simulation (REFERENCE), which was set up by averaging the forcing of the hindcast simulation described above between 1990 and 2010. The reference simulation was run for a 20-year spin-up period, after which the sensitivity experiments were started. The reference simulation and the sensitivity experiments were then run for a period of 10 years and the last year was used for analysis. Since the climatological forcing contains no interannual variability, the model already reaches a quasi-steady-state within this time-frame in the upper 200-300 m of the ocean, which are relevant for the anchovy habitat. The warming experiment (TEMP+) was designed to reflect an increase in surface air temperature and water column stratification consistent with end-of-century projections from climate models ( Echevin et al., 2020 ), while keeping the wind forcing constant. We applied a spatially uniform ocean temperature offset at the open boundaries and to the initial temperature fields, in the form of a vertical profile. To compute this average profile of ocean warming, we took the temperature difference between 2006-2100 in the CMIP5 (Coupled Model Intercomparison Project Phase 5, Taylor, Stouffer, and Meehl, 2012 ) multi-model ensemble average under the RCP8.5 emission scenario and averaged it spatially over our ocean model domain. 
We then applied the surface value of the previously computed water temperature change profile (2.2 ° C) as a spatially uniform offset to the surface air temperature, which is used by the model for computing air-sea heat and momentum fluxes. A similar approach of modifying surface air temperature using the bulk forcing option in CROCO has been used before in regional climate change downscaling simulations to add a forcing perturbation (e.g., Espinoza-Morriberon et al., 2017 ; Echevin et al., 2020 ). To further investigate the uncertainty of potential wind changes and their effect on the anchovy habitat in the region, we increased the wind speed by 10% (WIND+) in a first sensitivity simulation and decreased it by 10% (WIND − ) in a second one. This corresponds to a wind stress change of approximately ± 20%, as the wind speed enters quadratically into the wind stress parameterisation. All surface fluxes (momentum, heat, CO 2 , O 2 ) are calculated by a bulk formula and are thus consistent with these wind changes. Note also that the wind speed changes are small compared to some previously published sensitivity studies (e.g., Echevin et al., 2012 ; Travers-Trolet et al., 2014 ) and reflect a plausible range as suggested by global climate models and downscaling experiments in the region (see Echevin et al., 2020 , and references therein). 2.3 Habitat models We fitted a series of XGBoost classifiers to the presence-absence observations with the Python package xgboost ( Chen and Guestrin, 2016 ). To optimise the parameters for XGBoost, we conducted cross-validations together with a brute force grid search for the parameters: learning rate (0.001 to 0.3), depth (2 to 40), and number of estimators (20 to 700). 
Out of these 200 optimised model configurations, we picked the best model in reference to two metrics: balanced accuracy (in the case of binary classification area under the ROC curve) and Cohen’s Kappa (see Table 2 ), which are suitable scores for classification problems with imbalanced classes and habitat modelling ( Liu, White, and Newell, 2009 ; Ben-David, 2008 ; Wardhani et al., 2019 ). View this table: View inline View popup Download powerpoint Table 2. Comparison of XGBoost model and GAM against single observation points. The highest test scores are marked in bold. Shuffle split trained models are not considered for further use (see text). We compared the performance of these models with Generalised Additive Models (GAM) using the PyGAM package ( Table 2 ). We used the LogisticGAM model which has a Binomial distribution and a logistic link function with all the default parameters ( Servén and Brummitt, 2018 ). Both XGBoost and GAM were cross-validated with 5 splits applying two types of splitting: standard shuffle and Kfold. We then applied the best models to make predictions of anchovy presence-absence using environmental output from the CROCO-BioEBUS model as predictors. For the unsupervised learning method, we clustered the data points into water types based on their distance from the coast, SST, bathymetry, depth of the oxycline at 1 ml O 2 L −1 and the month when the sample was taken using Gaussian mixture model probability. Then we calculated the probability of finding anchovy in each water type by averaging the presence (1) and absence (0) points falling within the water type. Similarly, we calculated the average concentration of anchovies in each water type. Calculating total anchovy biomass is beyond the scope of this paper; therefore, we kept the original units from the raw data of m 2 /nm 2 which is based on acoustic surveys. 
Since clustering is an unsupervised machine learning method, it is not possible to do cross-validation and, due to the applied statistics, we cannot compare it to the other models using the standard classification metrics. In order to compare the three methods, we produced a heatmap of observations averaging at least four data points per grid cell. Then we calculated the root mean square error (RMSE) and correlation coefficient between the grid cells in the observations heatmap and each of our modelled habitat maps. Based on these values, we fine-tuned the parameters of the GAM, XGBoost, and clustering (covariance type full) using brute force grid search. Parameter ranges can be found in Tab. 3 . Finally, for the clustering approach, we also optimised a scaling factor applied to the probabilities of finding anchovies in each water type. This had no impact on the correlation but slightly reduced the RMSE. All models took the same environmental predictors as input, as described above. Once we had decided on the best parameters comparing the models with the hindcast observation heatmaps, we applied each of the three models to simulate a climatology of anchovy probability of occurrence taking as predictors the CROCO-BioEBUS output of the sensitivity studies described in Sect. 2.2.2 to evaluate the climate change impact on their habitat. The XGBoost and clustering methods can only predict habitat when the environmental variables fall within the range that was used to train the models. However, because the interannual variability in the NHCS is high in the hindcast simulation that was used to train the models, the comparatively small changes applied to the climatological simulation to represent climate change fall within the range of the training data.
3 Results 3.1 Models training In our initial approach, we performed the parameter grid search for XGBoost with a standard shuffle splitting method, achieving very high train (0.997) and test (0.923) balanced accuracy scores as shown in Tab. 2. Given the potential for spatial or temporal auto-correlation in densely sampled observational cruises, these results might be overly optimistic ( Salazar et al., 2022 ). When applying the Kfold splitting method to this complex XGBoost model configuration (maximum depth is 40), the test score dropped by over 30%, strongly indicating model overfitting on likely auto-correlated data. To address this, we also conducted a parameter optimisation with a Kfold splitting method, which splits data into k temporally consecutive folds to prevent overfitting from temporal autocorrelation. Among the models with Kfold-split data, the XGBoost configuration with the highest maximum depth (40) yielded the best training scores. However, the considerably lower test scores suggest overfitting. Conversely, a simpler model with a depth of 3 produced closer test and training scores for both Kappa (0.282 and 0.432, respectively) and the balanced accuracy (0.627 and 0.696, respectively), which were comparable to those of the GAM (Kappa: 0.286, balanced accuracy: 0.340). For further validation, we favoured Kfold splitting because this method is more conservative, although the GAM also showed slightly higher test scores when shuffle-split. Following the integration of data from the CROCO-BioEBUS model instead of observations, the balanced accuracy and Kappa for XGBoost slightly improved from 0.627 to 0.638 and 0.282 to 0.315, respectively, while for GAM, they decreased from 0.629 to 0.609 and 0.286 to 0.254, respectively ( Table 2 ).
This performance indicates that, despite slight differences, the CROCO-BioEBUS model output is close enough to the observations to produce a reliable habitat map for Peruvian anchovies, demonstrating good generalisation across the two different data sets. 3.2 Anchovy response to interannual variability All three models predict a distribution of anchovies near the coast as seen in the observations. In years other than 1998, the distribution extends over the whole Peruvian coastline. In 1998, the probability of finding anchovy decreases with respect to the average of the other years between 1990 and 2010 and the south of the Peruvian coast remains the main area with high chances of anchovy occurrence ( Figure 4 ). The XGBoost model and GAM show a similar map and the same range of probabilities as the occurrence rate in observations ( Figure 4 ). In contrast, the clustering map has a narrower range of anchovy probabilities, overestimating in regions where the occurrence rate is low and underestimating in regions where finding anchovies is more likely. However, this method exhibited the highest correlation coefficient (0.527) and the lowest RMSE (0.317, Table 3 ). The correlation coefficient describes the linear relation of the observation and model data, while the RMSE provides the error. Nevertheless, the visual comparison suggests that the XGBoost and GAM are better alternatives than the clustering for predicting anchovy probability of occurrence since they show a wider range of predictions reaching zero offshore and up to 1 near the coast. In contrast, clustering better represents the mean state. View this table: View inline View popup Download powerpoint Table 3. Comparison of habitat maps produced with XGBoost, GAM and clustering models using predictors input from the CROCO-BioEBUS model. Parameter range for the grid search is indicated in the brackets. The clustering grid search was also conducted on different covariance types: full, tied, diagonal, spherical.
Download figure Open in new tab Figure 4. Comparison of anchovy occurrence rate in the observations and probability of occurrence predicted by each habitat model. The clustering method is also able to predict average anchovy concentration in each water type ( Figure 3 ) while XGBoost and GAM were only trained as classifiers. Although concentration and probability of occurrence generally follow a similar pattern, there are differences in their relative changes across water types ( Figure 3 ). 3.3 Anchovy response to climate change The XGBoost and GAM models predicted a general increase in anchovy probability of occurrence with stronger winds and a decrease in probability with warmer temperatures and weaker winds. A severe maximum decline by 90% (GAM) and 75% (XGB) occurred due to the effect of warmer temperatures. The clustering method predicted similar but weaker responses in the coastal region ( Figure 5 ). Here, a moderate maximum decline in anchovy occurrence by 20% is observed. However, while changes in anchovy occurrence in the open ocean are negligible in the supervised learning models, the relatively high occurrence predicted by the clustering resulted in a high sensitivity to climate change offshore. This is likely due to the very weak overall response of the clustering and relatively high probability of occurrence offshore with respect to the other models which result in a weak signal-to-noise ratio ( Figure 5 ). Download figure Open in new tab Figure 5. Probability of anchovy presence predicted by each habitat model in the reference simulation (top) and relative change in the climate change scenarios. 4 Discussion The Peruvian anchovy is one of the most studied fisheries due to its economic value but also the complexity of its response to environmental variability. Despite the work done by numerous researchers in the last half of a century, anchovy’s behaviour is still to some extent unpredictable. 
In this study, we compared three methods for modelling the habitat of Peruvian anchovies. We used a XGBoost model and a GAM which is a statistical method, both trained via supervised learning. These methods have been commonly used to model habitat. We also tested a new method based on defining water types with unsupervised machine learning, or clustering, and associating anchovy occurrence to each of them. We produced habitat maps with all three models using data from a physical-biogeochemical model and compared their capability for producing realistic maps based on two statistical metrics and a visual assessment of the maps. In Sect. 4.1 we discuss the implications of using output from physical-biogeochemical models as predictors for habitat models. In Sect. 4.2 we put the hindcast and climate change predictions from our model in context. Finally in Sect. 4.3 we talk about general challenges for modelling the distribution of marine species and how we addressed them. 4.1 Applying a Physical-biogeochemical model as forcing for habitat models Producing habitat maps depends on the availability of continuous data over the whole region of interest. This can be obtained, for example, from satellites or in situ observations. However, satellite data is limited to certain variables that can be detected through remote sensing and in situ observations are limited in their spatial coverage. Physical-biogeochemical models, on the other hand, provide a wide range of environmental variables that are not easily observable such as oxycline depth (e.g., Espinoza-Morriberón et al., 2021 ) and integrated biomass and energy fluxes across the trophic web, which can comprehensively capture the impacts on anchovy feeding and potentially offer a better indicator of fish production ( Xue et al., 2024 ; Friedland et al., 2012 ). 
Furthermore, physical-biogeochemical models can be used to conduct sensitivity experiments (e.g., Echevin et al., 2012 ; Meier et al., 2019 ) taking into account the changes in all physical and biogeochemical variables that are relevant for the fish habitat such as temperature and oxygen. We trained and tested the XGBoost model and the GAM with field observations and then coupled them to a regional physical-biogeochemical model to produce habitat maps. We observed similar values in the balanced accuracy and the Cohen’s Kappa score when using predictors from a physical-biogeochemical model in comparison to observational data, indicating no loss in predictability due to changing from observations to model data. The XGBoost model and GAM were able to produce very similar maps even when using predictors from a completely different dataset than the one that was used to train them. This indicates that the trained habitat models have high capability to generalise and the relatively low test scores have a different source than the different data domains. Relevant factors could be, for example, the stochastic nature of anchovy appearance (discussed later) or better predictors being needed as model input. However, a similar predictability for both datasets opens the possibility of exploiting the potential of regional physical-biogeochemical models for climate change predictions in the field of habitat modelling. Earth System Model (ESM) simulations are necessary to predict the impact of climate change on the distribution of marine organisms due to changes in environmental conditions. One way to do this is to add the anomalies simulated by the ESM to the observations used to train the habitat model (e.g., Oliveros-Ramos and Shin, 2023 ; Sequeira et al., 2014 ). However, this makes the habitat model susceptible to an unknown error if applied to ESM data since it has been trained and tested only on observations. 
In addition, the interpolation of global results of the ESM output might not necessarily capture the nuances of a particular region ( Echevin et al., 2020 ). Furthermore, various downscaling and interpolation methods can yield differences in the magnitude of change and in spatial patterns ( Pozo Buil et al., 2023 ). A second method is to use the output of regional models to directly feed habitat models. Nevertheless, the habitat model must be able to generalise well and avoid overfitting to the training data from observations, and be stable enough to handle any noise or bias in the regional model output in order to make good predictions. While regional models also require external forcing from an ESM (e.g., Echevin et al., 2020 ), some global ESMs such as the Flexible Ocean and Climate Infrastructure (FOCI, Matthes et al., 2020 ) allow for the implementation of nests with a very fine grid resolution, eliminating the need for downscaling and interpolation altogether. Temporal variability in habitat has been suggested to have an impact on how the biomass of marine organisms responds to changes in plankton ( Hill Cruz et al., 2022 ). In addition, fish variability might as well have an impact on the biogeochemistry of the marine ecosystems ( Travers-Trolet et al., 2014 ; Getzlaff and Oschlies, 2017 ; Hill Cruz et al., 2021 ; Bianchi et al., 2021 ). Therefore, being able to run habitat models using direct output from physical-biogeochemical models opens the possibility for doing dynamic couplings between the two models and capturing the feedbacks between their interaction in end-to-end models. 4.2 Climate change and interannual variability impacts on anchovy habitat Observations indicate that Peruvian anchovies are generally adapted to cold coastal waters with high productivity and are widespread along the entire Peruvian coast ( Bakun and Broad, 2003 ; Ñiquen Carranza et al., 2000 ). 
During El Niño, Peruvian anchovies are found to move closer to the coast and migrate southward, rather than maintaining their typical widespread habitat ( Mathisen, 1989 ; Ñiquen and Bouchon, 2004). This aligns with our findings, where only the very coastal region in the southern part of Peru remains a suitable habitat according to the XGBoost model and GAM during 1998 ( Fig. 4 ). The oscillations in the range of the anchovy habitat could potentially influence changes in the fish population; however, as the fish are located further from their typical habitats, they become less accessible for fisheries, leading to an underestimation of fish biomass from commercial catches ( Bertrand et al., 2004 ). Previous studies suggest that environmental changes on different timescales (e.g., interannual and decadal) influence the anchovy habitat in a similar way, which could be indicative of broader climate change impacts ( Alheit and Niquen, 2004 ; Bertrand et al., 2004 ). Our climate change experiments indicated a decrease of anchovy occurrence probability along the Peruvian coast with increased temperature and decreased wind. This is in line with Salvatteci et al. (2022) who suggest a replacement of anchovy by smaller fish as a consequence of climate change. In contrast, increased wind energy has the potential to expand the potential habitat for anchovies or counteract the habitat contraction due to warming. Rykaczewski et al. (2015) have suggested that the coastal upwelling zones will be shifted polewards under global warming, resulting in a slightly decreased upwelling strength for the Peruvian coast. However, our simulations did not result in a latitudinal shift of the anchovy habitat for any of the experiments but rather a decrease (TEMP+, WIND-) or increase (WIND+) in the habitat suitability along the entire Peruvian coastline. 
This is likely a result of our idealised wind speed perturbation, which uses a spatially uniform scaling factor and does not capture potential changes in the wind pattern. Given the uncertainty of projected wind changes in the region, this is a reasonable first assumption. In this study, the XGBoost model and GAM predicted a similar change in both sign and magnitude in the anchovy distribution as a response to climate change. Such methods can be used for moderate climate change projections based on climatological predictors because, in the NHCS, the strong interannual variability observed in the hindcast under present-day climate conditions is stronger than the climate change impacts reflected in the climatology. However, these methods, as well as the clustering, would not be suitable for predicting extreme changes that fall outside of the range of the training data; for example, in projections where stronger and more frequent El Niño events are considered ( Shin et al., 2022 ), which could cause major changes in anchovy habitat and consequently impact fish population and catches (Arias Schreiber, Ñiquen, and Bouchon, 2011). Scenarios with a stronger environmental change should rely on simpler methods such as Generalised Linear Models with better capabilities to extrapolate. Despite the general trend observed in models of a preference for colder waters and the implications for climate change, the habitat of Peruvian anchovies remains somewhat unpredictable since they have shown impressive plasticity in recent years. In 2022, the Instituto Público de Investigación de Acuiculture Pesca (2023) confirmed the presence of anchovies in the Gulf of Guayaquil, in Ecuador, located further north than their usual distribution off Peru. This highlights the adaptable nature of anchovies and the importance of continuous monitoring schemes for ensuring that the most up-to-date changes in species behaviour are captured. 
Since our habitat model was trained with data until 2010, covering only the Peruvian Exclusive Economic Zone, it might not make reliable predictions for the Gulf of Guayaquil. Strong decreases in temperatures were reported in the Eastern Pacific Ocean as well as a shallower thermocline in the Gulf of Guayaquil in 2022 during the La Niña event ( Instituto Público de Investigación de Acuiculture Pesca, 2023 ; Senamhi, 2022 ), possibly facilitating the occurrence of anchovy further north ( Instituto Público de Investigación de Acuiculture Pesca, 2023 ). Hence, extending the configuration of our CROCO-BioEBUS model and training the habitat models with up-to-date observations is necessary to be able to simulate an updated hindcast of the anchovy habitat in more recent years and have a clearer picture of their potential response and possibilities of adaptation to climate change. 4.3 Challenges modelling fish habitat In order to overcome the challenges of fitting all habitat models used in our study with a common metric and compare both supervised (GAM and XGBoost) and unsupervised (clustering) methods, we opted for producing a heatmap of the observations and comparing the heatmap cells against predicted distribution by the habitat model with input from the physical-biogeochemical model. This method allows us to compare probabilities against occurrence rate rather than single presence-absence points. We were able to train all three models using randomised grid search against the same set of observations and produce maps that provide a realistic distribution for anchovies, concentrating most of them near the coast of Peru and leaving the open ocean free. A main challenge when modelling marine organism distributions is the extrapolation of models to regions where observational samples are not available (e.g., Sequeira et al., 2014 ) or sparse, for instance, the open ocean. 
This has been handled by adding pseudo-absence data points in areas where it is known that the species of interest does not naturally occur ( Luján Paredes, 2016 ). Another alternative for the prediction of fish occurrence is to use a method that is based on presence-only data, such as maximum entropy models ( Bang et al., 2022 ). However, these models are prone to geographical sampling bias ( Syfert, Smith, and Coomes, 2013 ). Our XGBoost model and GAM were trained without pseudo-absence points showing no extrapolation issues, indicating that these models generalise well on unknown data. A further obstacle when modelling the distribution of highly mobile marine animals such as fish is that their presence in the sampling does not only depend on the suitability of environmental conditions but also on the chances of sampling at the specific time when the school of fish is transiting through the sampled location. Predicting anchovy habitats with a classifier, like the XGBoost model or the GAM, can be challenging if the presence of anchovy schools is probabilistic even under suitable environmental conditions. A key problem is that traditional classifiers might struggle to capture the stochastic nature of anchovy appearances, as they typically aim for deterministic outcomes. This means that, even if the environmental conditions are right, the inherent unpredictability of anchovy locations is not easily learned by standard classifiers, which could lead to inaccuracies in habitat prediction. Our clustering method proved to be robust for handling non-deterministic data and, from a statistical perspective, it was better than the other two methods. It also has potential for predicting abundance which is challenging in datasets that are zero-inflated such as for sparsely distributed organisms ( Barry and Welsh, 2002 ; A. Lee-Yaw et al., 2022 ). 
However, predicting the abundance of anchovy is beyond the scope of this paper so we only evaluated the clustering probability of occurrence predictions. A caveat of the clustering method is that, even after scaling, it predicted a very narrow range of probabilities, underestimating in regions where anchovy is common and overestimating in areas where anchovy is absent. If a larger dataset is available, this might be improved by defining more water types. Supervised methods such as XGBoost and statistical methods like GAM have been extensively used in habitat modelling already, while unsupervised learning, like clustering, has received less attention. Thus, further development of unsupervised learning methods might be a promising approach for improving habitat models. 5 Summary We simulated anchovy habitat with three models combined with input from a physical-biogeochemical model. We used a method for comparing the three models with observations based on producing a heatmap from the presence-absence data points which allowed us to train the models using brute force grid search. This approach must be combined with expert judgement based on the visualisation of the maps to pick the best maps. The new unsupervised learning method that we implemented was able to produce a map with a similar distribution pattern for anchovies to the more traditional methods based on supervised learning. However, the range of probabilities of occurrence was rather narrow, over- and underestimating the predictions depending on the region. We also produced climate change predictions for anchovies which indicate a reduction in their potential habitat under warming temperatures and weaker winds and a potential increase under environmental conditions with stronger winds. Two of the three habitat models predicted a severe maximum decline by 90% (GAM) and 75% (XGBoost) whereas the clustering model predicted a moderate maximum decline in anchovy occurrence by 20%. 
A Model comparison with observations In order to compare with observations, phytoplankton concentrations ( P , in unit: mmol N m −3 ) in the CROCO-BioEBUS model were converted to chlorophyll a ( chl , in unit: mg m −3 ) using a constant chl/N ratio of 1.59 ( chl = 1.59 · P ) following Hurtt and Armstrong (1996) and Gutknecht et al. (2013) . Simulated chlorophyll in CROCO-BioEBUS tends to be higher than in observations ( Figure 6 , note that chlorophyll is shown in log-scale). This discrepancy may arise from two key factors: (1) the constant chl/N ratio used in the model is too high for the Peruvian system; and (2) the absence of iron limitation in the model, which leads to an overestimation of chlorophyll concentrations offshore where, according to observations, phytoplankton growth is limited by iron availability, as discussed in Xue et al. (2022) . Download figure Open in new tab Figure 6. Distribution of chlorophyll a in observations and CROCO-BioEBUS output sampled at the same locations. Fig. 7 shows a higher temperature in observations sampled between January and May than those sampled after May. This is also observed in the CROCO-BioEBUS model although with a less clear differentiation ( Figure 8 ). Download figure Open in new tab Figure 7. Sea surface temperature distribution in observations (°C). Download figure Open in new tab Figure 8. CROCO-BioEBUS sea surface temperature distribution of points sampled at the same location as in observations (°C). B XGBoost error type analysis Out of the environmental variables that we used to predict the habitat of Peruvian anchovies, the depth of the oxycline is the variable that showed the lowest correlation between observations and the physical-biogeochemical model CROCO-BioEBUS (see Section 2.2.1 ). 
Therefore, we performed an error type analysis to evaluate whether this variable was responsible for the decrease in the XGBoost test score when running it with predictors from CROCO-BioEBUS with respect to observations (see Section 3.2 ). Fig. 9 indicates that the model has generally higher accuracy when the oxycline is deep in both observations and the CROCO-BioEBUS model ( Figure 9 top-left). This also corresponds to the area with fewer data-points and fewer fish ( Figure 9 ). Since accuracy does not seem to be affected by a higher or lower correlation between observations and CROCO-BioEBUS output ( Figure 9 top-left), we cannot conclude that the low correlation between observations and CROCO-BioEBUS has significantly impacted the overall accuracy of the habitat map. Download figure Open in new tab Figure 9. Error type analysis of the depth of the oxycline. C Characteristics of the water types Water types were clustered according to similarities in their environmental characteristics. These are shown in Fig. 10 . Download figure Open in new tab Figure 10. Features of the clustered water types. Author contributions MH: Conceptualising the original idea and designing the study, leading the paper writing, organisation, discussion of results and preparing figures. TX: Conceptualising the original idea and designing the study, running the CROCO-BioEBUS hindcast simulation and preparing the model output, discussion of results, preparing figures and contribution to the manuscript text. JH: Running the CROCO-BioEBUS climate sensitivity simulation, discussion of results, preparing figures and contribution to the manuscript text. MG: Expertise on the northern Humboldt Current System and the Peruvian anchovy, data compilation, discussion of results and contribution to the manuscript text. 
TK: Setting up and tuning the habitat models, proposing the clustering method and the common method to evaluate all three models, data analysis, discussion of results, preparing figures and contribution to the manuscript text. MH and TK contributed equally to the paper. Model availability The CROCO-BioEBUS hindcast output is available at the GEOMAR data management server: https://hdl.handle.net/20.500.12085/b4d40ba5-48ad-48c8-99c4-fc422aa3cebd ( Xue et al., 2023 ). The source code is available and maintained at the official CROCO website: https://www.croco-ocean.org/ . Acknowledgements The authors thank Daniel Lizarbe and David Moncayo for their valuable feedback and comments to improve this paper. We also thank Anna Akimova and Klaus Huebert for their suggestions. CROCO-BioEBUS simulations were carried out using the computing facilities of the Norddeutscher Verbund zur Förderung des Hoch- und Höchstleistungsrechnens – HLRN. This project received financial support by the Bundesministerin für Bildung und Forschung (BMBF)-funded project GlobalTip: Humboldt-Tipping (01LC2323B). Funder Information Declared Bundesministerin fuer Bildung und Forschung BMBF , 01LC2323B References ↵ A. Lee-Yaw , J J L. McCune , S Pironon , and S N. Sheth ( 2022 ). “ Species distribution models rarely predict the biology of real populations ”. In: Ecography 2022 ( 6 ), e05877 . DOI: 10.1111/ecog.05877 . eprint: https://nsojournals.onlinelibrary.wiley.com/doi/pdf/10.1111/ecog.05877 . OpenUrl CrossRef ↵ Alheit , J and M Niquen ( 2004 ). “ Regime shifts in the Humboldt Current ecosystem ”. In: Progress in Oceanography 60 ( 2-4 ), pp. 201 – 222 . ISSN: 0079-6611 . DOI: 10.1016/j.pocean.2004.02.006 . OpenUrl CrossRef Web of Science Arias Schreiber , M. M Ñiquen , and M Bouchon ( 2011 ). “ Coping Strategies to Deal with Environmental Variability and Extreme Climatic Events in the Peruvian Anchovy Fishery ”. In: Sustainability 3 ( 6 ), pp. 823 – 846 . ISSN: 2071-1050 . DOI: 10.3390/su3060823 . 
OpenUrl CrossRef Avadí , A , P Fréon , and J Tam ( 2014 ). “ Coupled Ecosystem/Supply Chain Modelling of Fish Products from Sea to Shelf: The Peruvian Anchoveta Case ”. In: PLOS ONE 9 ( 7 ), pp. 1 – 21 . DOI: 10.1371/journal.pone.0102057 . OpenUrl CrossRef ↵ Bakun , A and K Broad ( 2003 ). “ Environmental ‘loopholes’ and fish population dynamics: comparative pattern recognition with focus on El Niño effects in the Pacific ”. In: Fisheries Oceanography 12 ( 4-5 ), pp. 458 – 473 . DOI: 10.1046/j.1365-2419.2003.00258.x . OpenUrl CrossRef Web of Science ↵ Bang , M , D Sohn , JJ Kim , W Choi , CJ Jang , and C Kim ( 2022 ). “ Future changes in the seasonal habitat suitability for anchovy (Engraulis japonicus) in Korean waters projected by a maximum entropy model ”. In: Frontiers in Marine Science 9 . ISSN: 2296-7745 . DOI: 10.3389/fmars.2022.922020 . OpenUrl CrossRef ↵ Barber , RT and FP Chavez ( 1983 ). “ Biological consequences of El Niño ”. In: Science 222 ( 4629 ), pp. 1203 – 1210 . DOI: 10.1126/science.222.4629.1203 . OpenUrl Abstract / FREE Full Text ↵ Barry , SC and A Welsh ( 2002 ). “ Generalized additive modelling and zero inflated count data ”. In: Ecological Modelling 157 ( 2 ), pp. 179 – 188 . ISSN: 0304-3800 . DOI: 10.1016/S0304-3800(02)00194-1 . OpenUrl CrossRef Web of Science ↵ Ben-David , A ( 2008 ). “ About the relationship between ROC curves and Cohen’s kappa ”. In: Engineering Applications of Artificial Intelligence 21 ( 6 ), pp. 874 – 882 . ISSN: 0952-1976 . DOI: 10.1016/j.engappai.2007.09.009 . OpenUrl CrossRef ↵ Bertrand , A , A Chaigneau , S Peraltilla , J Ledesma , M Graco , F Monetti , and FP Chavez ( 2011 ). “ Oxygen: A Fundamental Property Regulating Pelagic Ecosystem Structure in the Coastal Southeastern Tropical Pacific ”. In: PLOS ONE 6 ( 12 ), pp. 1 – 8 . DOI: 10.1371/journal.pone.0029558 . OpenUrl CrossRef ↵ Bertrand , A , M Segura , M Gutiérrez , and L Vásquez ( 2004 ). 
“ From small-scale habitat loopholes to decadal cycles: a habitat-based hypothesis explaining fluctuation in pelagic fish populations off Peru ”. In: Fish and Fisheries 5 ( 4 ), pp. 296 – 316 . DOI: 10.1111/j.1467-2679.2004.00165.x . OpenUrl CrossRef Web of Science ↵ Bianchi , D , DA Carozza , ED Galbraith , J Guiet , and T DeVries ( 2021 ). “ Estimating global biomass and biogeochemical cycling of marine fish with and without fishing ”. In: Science Advances 7 ( 41 ), eabd7554 . ISSN: 23752548 . DOI: 10.1126/sciadv.abd7554 . OpenUrl CrossRef ↵ Bouchon , M , E Díaz , P Marín , E Ramos , C Peña , J Salcedo , D Ulloa , and K Ttito ( 2021 ). Informe sobre la situación de la anchoveta disponible en la región sur del mar peruano y perspectivas de explotación para la primera temporada de pesca de 2022 . Instituto del Mar del Perú. Informe IMARPE , p. 15 . URL: https://www.gob.pe/institucion/imarpe/informes-publicaciones/4106537-informe-sobre-la-situacion-de-la-anchoveta-disponible-en-la-region-sur-del-mar-peruano-y-perspectivas-de-explotacion-para-la-primera-temporada-de-pesca-de-2022 (visited on June 4, 2024). ↵ Brochier , T , JM Ecoutin , LT de Morais , DM Kaplan , and R Lae ( 2013 ). “ A multi-agent ecosystem model for studying changes in a tropical estuarine fish assemblage within a marine protected area ”. In: Aquatic Living Resources 26 ( 02 ), pp. 147 – 158 . DOI: 10.1051/alr/2012028 . OpenUrl CrossRef ↵ Carton , JA , GA Chepurin , and L Chen ( 2018 ). “ SODA3: A New Ocean Climate Reanalysis ”. In: Journal of Climate 31 ( 17 ), pp. 6967 – 6983 . DOI: 10.1175/JCLI-D-18-0149.1 . OpenUrl CrossRef ↵ Chavez , FP , A Bertrand , R Guevara-Carrasco , P Soler , and J Csirke ( 2008 ). “ The northern Humboldt Current System: Brief history, present status and a… view towards the future ”. In: Progress in Oceanography 79 ( 2-4 ), pp. 95 – 105 . DOI: 10.1016/j.pocean.2008.10.012 . OpenUrl CrossRef Web of Science ↵ Chavez , FP , J Ryan , SE Lluch-Cota , and M Ñiquen ( 2003 ). 
“ From anchovies to sardines and back: multidecadal change in the Pacific Ocean ”. In: Science 299 ( 5604 ), pp. 217 – 221 . DOI: 10.1126/science.1075880 . OpenUrl Abstract / FREE Full Text ↵ Checkley , DM , RG Asch , and RR Rykaczewski ( 2017 ). “ Climate, Anchovy, and Sardine ”. In: Annual Review of Marine Science 9 ( 1 ). PMID: 28045355 , pp. 469 – 493 . DOI: 10.1146/annurev-marine-122414-033819 . OpenUrl CrossRef PubMed ↵ Chen , T and C Guestrin ( 2016 ). “ XGBoost: A Scalable Tree Boosting System ”. In: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. KDD ‘16 . San Francisco, California, USA : Association for Computing Machinery , pp. 785 – 794 . ISBN: 9781450342322 . DOI: 10.1145/2939672.2939785 . OpenUrl CrossRef ↵ Echevin , V , M Gévaudan , D Espinoza-Morriberón , J Tam , O Aumont , D Gutierrez , and F Colas ( 2020 ). “ Physical and biogeochemical impacts of RCP8.5 scenario in the Peru upwelling system ”. In: Biogeosciences 17 , pp. 3317 – 3341 . DOI: 10.5194/bg-17-3317-2020 . OpenUrl CrossRef ↵ Echevin , V , K Goubanova , A Belmadani , and B Dewitte ( 2012 ). “ Sensitivity of the Humboldt Current system to global warming: A downscaling experiment of the IPSL-CM4 model ”. In: Climate Dynamics 38 ( 3-4 ), pp. 761 – 774 . ISSN: 09307575 . DOI: 10.1007/s00382-011-1085-2 . OpenUrl CrossRef ↵ Espinoza-Morriberón , D , V Echevin , D Gutiérrez , J Tam , M Graco , J Ledesma , and F Colas ( 2021 ). “ Evidences and drivers of ocean deoxygenation off Peru over recent past decades ”. In: Scientific Reports 11 ( 1 ), p. 20292 . DOI: 10.1038/s41598-021-99876-8 . OpenUrl CrossRef ↵ Espinoza-Morriberon , D , M Graco , V Echevin , F Colas , J Tam , J Ledesma , L Vasquez , and M Graco ( 2017 ). “ Impacts of El Niño events on the Peruvian upwelling system productivity ”. In: Journal of Geophysical Research: Oceans 122 ( 5423 ), pp. 2647 – 2651 . DOI: 10.1002/2016JC012439 . 
OpenUrl CrossRef ↵ Flores-Valiente , J , C Lett , F Colas , L Pecquerie , A Aguirre-Velarde , F Rioual , J Tam , A Bertrand , P Ayón , S Sall , N Barrier , and T Brochier ( 2023 ). “ Influence of combined temperature and food availability on Peruvian anchovy (Engraulis ringens) early life stages in the northern Humboldt Current system: A modelling approach ”. In: Progress in Oceanography 215 , p. 103034 . ISSN: 0079-6611 . DOI: 10.1016/j.pocean.2023.103034 . OpenUrl CrossRef Fréon , P , H Durand , A Avadí , S Huaranca , and R Orozco Moreyra ( 2017 ). “ Life cycle assessment of three Peruvian fishmeal plants: Toward a cleaner production ”. In: Journal of Cleaner Production 145 , pp. 50 – 63 . ISSN: 0959-6526 . DOI: 10.1016/j.jclepro.2017.01.036 . OpenUrl CrossRef Fréon , P , JC Sueiro , F Iriarte , OF Miro Evar , Y Landa , JF Mittaine , and M Bouchon ( 2014 ). “ Harvesting for food versus feed: a review of Peruvian fisheries in a global context ”. In: Reviews in Fish Biology and Fisheries 24 ( 1 ), pp. 381 – 398 . DOI: 10.1007/s11160-013-9336-4 . OpenUrl CrossRef ↵ Friedland , KD , C Stock , KF Drinkwater , JS Link , RT Leaf , BV Shank , JM Rose , CH Pilskaln , and MJ Fogarty ( 2012 ). “ Pathways between Primary Production and Fisheries Yields of Large Marine Ecosystems ”. In: PLOS ONE 7 ( 1 ), pp. 1 – 11 . DOI: 10.1371/journal.pone.0028945 . OpenUrl CrossRef ↵ Gergis , JL and AM Fowler ( 2009 ). “ A history of ENSO events since A.D. 1525: implications for future climate change ”. In: Climatic Change 92 ( 3 ), pp. 343 – 387 . DOI: 10.1007/s10584-008-9476-z . OpenUrl CrossRef GeoRef Web of Science ↵ Getzlaff , J and A Oschlies ( 2017 ). “ Pilot Study on Potential Impacts of Fisheries-Induced Changes in Zooplankton Mortality on Marine Biogeochemistry ”. In: Global Biogeochemical Cycles 31 ( 11 ), pp. 1656 – 1673 . DOI: 10.1002/2017GB005721 . 
OpenUrl CrossRef ↵ Gogina , M , H Nygård , M Blomqvist , D Daunys , AB Josefson , J Kotta , A Maximov , J Warzocha , V Yermakov , U Gräwe , and ML Zettler ( 2016 ). “ The Baltic Sea scale inventory of benthic faunal communities ”. In: ICES Journal of Marine Science 73 ( 4 ), pp. 1196 – 1213 . ISSN: 1054-3139 . DOI: 10.1093/icesjms/fsv265 . eprint: https://academic.oup.com/icesjms/article-pdf/73/4/1196/31230998/fsv265.pdf . OpenUrl CrossRef ↵ Grüss , A , M Drexler , and CH Ainsworth ( 2014 ). “ Using delta generalized additive models to produce distribution maps for spatially explicit ecosystem models ”. In: Fisheries Research 159 , pp. 11 – 24 . ISSN: 0165-7836 . DOI: 10.1016/j.fishres.2014.05.005 . OpenUrl CrossRef ↵ Guisan , A and NE Zimmermann ( 2000 ). “ Predictive habitat distribution models in ecology ”. In: Ecological Modelling 135 ( 2 ), pp. 147 – 186 . ISSN: 0304-3800 . DOI: 10.1016/S0304-3800(00)00354-9 . OpenUrl CrossRef ↵ Gutiérrez , D , A Bertrand , C Wosnitza-Mendo , B Dewitte , S Purca , C Peña , A Chainegau , et al. ( 2011 ). “ Sensibilidad del sistema de afloramiento costero del Perú al cambio climático e implicancias ecológicas [Climate change sensitivity of the Peruvian upwelling system and ecological implications] ”. In: Revista Peruana Geoatmosférica 3 , pp. 1 – 24 . OpenUrl ↵ Gutiérrez , M , R Castillo , M Segura , S Peraltilla , and M Flores ( 2012 ). “ Trends in spatio-temporal distribution of Peruvian anchovy and other small pelagic fish biomass from 1966-2009 ”. In: Latin American Journal of Aquatic Research 40 ( 3 ), pp. 633 – 648 . DOI: 10.3856/vol40-issue3-fulltext-12 . OpenUrl CrossRef ↵ Gutknecht , E , I Dadou , B Le Vu , G Cambon , J Sudre , V Garçon , E Machu , T Rixen , A Kock , A Flohr , et al. ( 2013 ). “ Coupled physical/biogeochemical modeling including O2-dependent processes in the Eastern Boundary Upwelling Systems: application in the Benguela ”. In: Biogeosciences 10 , pp. 3559 – 3591 . DOI: 10.5194/bg-10-3559-2013 . 
OpenUrl CrossRef ↵ Hill Cruz , M. I Kriest , J José , R Kiko , H Hauss , and A Oschlies ( 2021 ). “ Zooplankton mortality effects on the plankton community of the northern Humboldt Current System: sensitivity of a regional biogeochemical model ”. In: Biogeosciences 18 , pp. 2891 – 2916 . DOI: 10.5194/bg-18-2891-2021 . OpenUrl CrossRef ↵ Hill Cruz , M. I Frenger , J Getzlaff , I Kriest , T Xue , and YJ Shin ( 2022 ). “ Understanding the drivers of fish variability in an end-to-end model of the Northern Humboldt Current System ”. In: Ecological Modelling 472 , p. 110097 . DOI: 10.1016/j.ecolmodel.2022.110097 . OpenUrl CrossRef ↵ Hurtt , GC and RA Armstrong ( 1996 ). “ A pelagic ecosystem model calibrated with BATS data ”. In: Deep Sea Research Part II: Topical Studies in Oceanography 43 ( 2 ), pp. 653 – 683 . ISSN: 0967-0645 . DOI: 10.1016/0967-0645(96)00007-0 . OpenUrl CrossRef ↵ Instituto Público de Investigación de Acuiculture Pesca ( 2023 ). Presencia de anchoveta (Engraulis ringens) en el Golfo de Guayaquil: Crucero de prospección hidroacústica y pesca comprobatoria . República del Ecuador. Executive report . ↵ Liu , C , M White , and G Newell ( 2009 ). “ Measuring the accuracy of species distribution models: a review ”. In: Proceedings 18th World IMACs/MODSIM Congress, Cairns, Australia 13-17 July 2009 , pp. 4241 – 4247 . ↵ Luján Paredes , DC ( 2016 ). “ Factores determinantes de la variabilidad espacial de anchoveta peruana (Engraulis ringens) en el Pacífico sudoriental ”. Master’s thesis. Universidad Peruana Cayetano Heredia . URL: https://biblioimarpe.imarpe.gob.pe/bitstream/20.500.12958/3031/1/Lujan%20Paredes%2c%20C.pdf (visited on Apr. 15, 2024). ↵ Mathisen , O ( 1989 ). “Adaptation of the anchoveta (Engraulis ringens) to the Peruvian upwelling system”. In: The Peruvian upwelling ecosystem: dynamics and interactions. ICLARM Conference Proceedings 18 . Ed. by D Pauly , P Muck , J Mendo , and I Tsukayama . 
Instituto del Mar del Peru (IMARPE) Callao, Peru; Deutsche Gesellschaft fuer Technische Zusammenarbeit (GIZ), GmbH, Eschborn, Federal Republic of Germany; and International Center for Living Aquatic Resources Management (ICLARM) , Manila, Philippines , pp. 220 – 234 . ↵ Matthes , K , A Biastoch , S Wahl , J Harlaß , T Martin , T Brücher , A Drews , D Ehlert , K Getzlaff , F Krüger , W Rath , M Scheinert , FU Schwarzkopf , T Bayr , H Schmidt , and W Park ( 2020 ). “ The Flexible Ocean and Climate Infrastructure version 1 (FOCI1): mean state and variability ”. In: Geoscientific Model Development 13 ( 6 ), pp. 2533 – 2568 . DOI: 10.5194/gmd-13-2533-2020 . OpenUrl CrossRef ↵ Meier , HEM , M Edman , K Eilola , M Placke , T Neumann , HC Andersson , SE Brunnabend , C Dieterich , C Frauen , R Friedland , M Gröger , BG Gustafsson , E Gustafsson , A Isaev , M Kniebusch , I Kuznetsov , B Müller-Karulis , M Naumann , A Omstedt , V Ryabchenko , S Saraiva , and OP Savchuk ( 2019 ). “ Assessment of Uncertainties in Scenario Simulations of Biogeochemical Cycles in the Baltic Sea ”. In: Frontiers in Marine Science 6 . ISSN: 2296-7745 . DOI: 10.3389/fmars.2019.00046 . OpenUrl CrossRef Ñiquen , M and M Bouchon ( 2004 ). “ Impact of El Niño events on pelagic fisheries in Peruvian waters ”. In: Deep Sea Research Part II: Topical Studies in Oceanography 51 ( 6 ). Oceanography of the Eastern Pacific: Volume III, pp. 563 – 574 . ISSN: 0967-0645 . DOI: 10.1016/j.dsr2.2004.03.001 . OpenUrl CrossRef Web of Science ↵ Ñiquen Carranza , M , M Bouchon Corrales , S Cahuín Villanueva , and E Díaz Acuña ( 2000 ). Pesquería de anchoveta en el mar peruano. 1950 - 1999. Instituto del Mar del Perú . Vol. 19 . 1-2 , pp. 117 – 123 . URL: https://hdl.handle.net/20.500.12958/1003 (visited on May 19, 2022). OpenUrl ↵ Oliveros-Ramos , R ( 2014 ). “ End–to–end modelling for an ecosystem approach to fisheries in the Northern Humboldt Current Ecosystem ”. PhD thesis. University of Montpellier . 
↵ Oliveros-Ramos , R and YJ Shin ( 2023 ). “ Future climate change impacts on anchoveta (Engraulis ringens) in the Northern Peru Current Ecosystem ”. In: bioRxiv . DOI: 10.1101/2023.02.14.528548 . eprint: https://www.biorxiv.org/content/early/2023/02/15/2023.02.14.528548.full.pdf . OpenUrl Abstract / FREE Full Text ↵ Pedregosa , F , G Varoquaux , A Gramfort , V Michel , B Thirion , O Grisel , M Blondel , P Prettenhofer , R Weiss , V Dubourg , J Vanderplas , A Passos , D Cournapeau , M Brucher , M Perrot , and E Duchesnay ( 2011 ). “ Scikit-learn: Machine Learning in Python ”. In: Journal of Machine Learning Research 12 , pp. 2825 – 2830 . OpenUrl ↵ Pinsky , ML , B Worm , MJ Fogarty , JL Sarmiento , and SA Levin ( 2013 ). “ Marine Taxa Track Local Climate Velocities ”. In: Science 341 ( 6151 ), pp. 1239 – 1242 . DOI: 10.1126/science.1239352 . eprint: https://www.science.org/doi/pdf/10.1126/science.1239352 . OpenUrl Abstract / FREE Full Text ↵ Pozo Buil , M , J Fiechter , MG Jacox , SJ Bograd , and MA Alexander ( 2023 ). “ Evaluation of Different Bias Correction Methods for Dynamical Downscaled Future Projections of the California Current Upwelling System ”. In: Earth and Space Science 10 ( 12 ), e2023EA003121 . DOI: 10.1029/2023EA003121 . OpenUrl CrossRef ↵ Ridgway , K , J Dunn , and J Wilkin ( 2002 ). “ Ocean interpolation by four-dimensional weighted least squares— Application to the waters around Australasia ”. In: Journal of atmospheric and oceanic technology 19 ( 9 ), pp. 1357 – 1375 . DOI: 10.1175/1520-0426(2002)019%3C1357:OIBFDW%3E2.0.CO;2 . OpenUrl CrossRef Web of Science ↵ Rykaczewski , RR , JP Dunne , WJ Sydeman , M García-Reyes , BA Black , and SJ Bograd ( 2015 ). “ Poleward displacement of coastal upwelling-favorable winds in the ocean’s eastern boundary currents through the 21st century ”. In: Geophysical Research Letters 42 ( 15 ), pp. 6424 – 6431 . DOI: 10.1002/2015GL064694 . eprint: https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1002/2015GL064694 . 
OpenUrl CrossRef ↵ Salazar , JJ , L Garland , J Ochoa , and MJ Pyrcz ( 2022 ). “ Fair train-test split in machine learning: Mitigating spatial autocorrelation for improved prediction accuracy ”. In: Journal of Petroleum Science and Engineering 209 , p. 109885 . ISSN: 0920-4105 . DOI: 10.1016/j.petrol.2021.109885 . OpenUrl CrossRef ↵ Salvatteci , R , RR Schneider , E Galbraith , D Field , T Blanz , T Bauersachs , X Crosta , P Martinez , V Echevin , F Scholz , and A Bertrand ( 2022 ). “ Smaller fish species in a warm and oxygen-poor Humboldt Current system ”. In: Science 375 ( 6576 ), pp. 101 – 104 . DOI: 10.1126/science.abj0270 . OpenUrl CrossRef PubMed ↵ Schubert , PR , W Hukriede , R Karez , and TB Reusch ( 2015 ). “ Mapping and modeling eelgrass Zostera marina distribution in the western Baltic Sea ”. In: Marine Ecology Progress Series 522 , pp. 79 – 95 . DOI: 10.3354/meps11133 . OpenUrl CrossRef ↵ Senamhi ( 2022 ). Fenómeno “El Niño/La Niña” N°04 (abril 2022) . Servicio Nacional de Meteorología e Hidrología del Perú. Boletín de monitoreo , pp. 1 – 16 . URL: https://www.senamhi.gob.pe/load/file/02216SENA-94.pdf (visited on May 14, 2024). ↵ Sequeira , AMM , C Mellin , DA Fordham , MG Meekan , and CJA Bradshaw ( 2014 ). “ Predicting current and future global distributions of whale sharks ”. In: Global Change Biology 20 ( 3 ), pp. 778 – 789 . DOI: 10.1111/gcb.12343 . eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/gcb.12343 . OpenUrl CrossRef ↵ Servén , D and C Brummitt ( 2018 ). pyGAM: Generalized Additive Models in Python . DOI: 10.5281/zenodo.1208723 . OpenUrl CrossRef ↵ Shchepetkin , AF and JC McWilliams ( 2005 ). “ The regional oceanic modeling system (ROMS): a split-explicit, free-surface, topography-following-coordinate oceanic model ”. In: Ocean modelling 9 ( 4 ), pp. 347 – 404 . DOI: 10.1016/j.ocemod.2004.08.002 . OpenUrl CrossRef ↵ Shin , NY , JS Kug , MF Stuecker , FF Jin , A Timmermann , and GI Kim ( 2022 ). 
“ More frequent central Pacific El Niño and stronger eastern Pacific El Niño in a warmer climate ”. In: npj Climate and Atmospheric Science 5 ( 1 ), p. 101 . DOI: 10.1038/s41612-022-00324-9 . OpenUrl CrossRef ↵ Syfert , MM , MJ Smith , and DA Coomes ( 2013 ). “ The Effects of Sampling Bias and Model Complexity on the Predictive Performance of MaxEnt Species Distribution Models ”. In: PLOS ONE 8 ( 2 ), pp. 1 – 10 . DOI: 10.1371/journal.pone.0055158 . OpenUrl CrossRef ↵ Taylor , KE , RJ Stouffer , and GA Meehl ( 2012 ). “ An Overview of CMIP5 and the Experiment Design ”. In: Bulletin of the American Meteorological Society 93 ( 4 ), pp. 485 – 498 . DOI: 10.1175/BAMS-D-11-00094.1 . OpenUrl CrossRef ↵ Travers-Trolet , M , YJ Shin , LJ Shannon , CL Moloney , and JG Field ( 2014 ). “ Combined fishing and climate forcing in the southern Benguela upwelling ecosystem: an end-to-end modelling approach reveals dampened effects ”. In: PLOS ONE 9 ( 4 ), e94286 . DOI: 10.1371/journal.pone.0094286 . OpenUrl CrossRef ↵ Wardhani , NWS , MY Rochayani , A Iriany , AD Sulistyono , and P Lestantyo ( 2019 ). “ Cross-validation Metrics for Evaluating Classification Performance on Imbalanced Data ”. In: 2019 International Conference on Computer, Control, Informatics and its Applications (IC3INA) , pp. 14 – 18 . DOI: 10.1109/IC3INA48034.2019.8949568 . OpenUrl CrossRef ↵ Xue , T , J Terhaar , AEF Prowe , T Frölicher , A Oschlies , and I Frenger ( 2024 ). “ Southern Ocean phytoplankton under climate change: a shifting balance of bottom-up and top-down control ”. In: Biogeosciences Discussions 21 , pp. 2473 – 2491 . DOI: 10.5194/bg-21-2473-2024 . OpenUrl CrossRef ↵ Xue , T , I Frenger , J Hauschildt , and A Oschlies ( 2023 ). CROCO-BioEBUS hindcast simulation (1990-2010) . data-set . URL: https://hdl.handle.net/20.500.12085/b4d40ba5-48ad-48c8-99c4-fc422aa3cebd (visited on June 11, 2024). Xue , T , I Frenger , J Hauschildt , and A Oschlies ( 2024 [In review]). 
“Mechanisms regulating trophic transfer in the Humboldt Upwelling System differ across time scales” . In: Environmental Research Letters . ↵ Xue , T , I Frenger , A Prowe , YS José , and A Oschlies ( 2022 ). “ Mixed layer depth dominates over upwelling in regulating the seasonality of ecosystem functioning in the Peruvian upwelling system ”. In: Biogeosciences 19 ( 2 ), pp. 455 – 475 . DOI: 10.5194/bg-19-455-2022 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted May 13, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Supervised and unsupervised machine learning methods for modelling current and future habitat of Peruvian anchovy Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Supervised and unsupervised machine learning methods for modelling current and future habitat of Peruvian anchovy Mariana Hill , Tianfei Xue , Jaard Hauschildt , Mariano Gutiérrez , Tronje Kemena bioRxiv 2025.05.08.652876; doi: https://doi.org/10.1101/2025.05.08.652876 Share This Article: Copy Citation Tools Supervised and unsupervised machine learning methods for modelling current and future habitat of Peruvian anchovy Mariana Hill , Tianfei Xue , Jaard Hauschildt , Mariano Gutiérrez , Tronje Kemena bioRxiv 2025.05.08.652876; doi: https://doi.org/10.1101/2025.05.08.652876 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Ecology Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17635) Bioengineering (13859) Bioinformatics (41846) Biophysics (21401) Cancer Biology (18534) Cell Biology (25423) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24286) Genetics (15582) Genomics (22463) Immunology (17700) Microbiology (40298) Molecular Biology (17141) Neuroscience (88429) Paleontology (666) Pathology (2825) Pharmacology and Toxicology (4813) Physiology (7633) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.