A ready-to use workflow for automated, sound-based bird identification and localization

preprint OA: closed
📄 Open PDF Full text JSON View at publisher
Full text 52,110 characters · extracted from preprint-html · click to expand
A ready-to use workflow for automated, sound-based bird identification and localization | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results A ready-to use workflow for automated, sound-based bird identification and localization View ORCID Profile Carsten M. Buchmann , View ORCID Profile Frank M. Schurr doi: https://doi.org/10.1101/2024.07.02.601711 Carsten M. Buchmann 1 Institute of Landscape and Plant Ecology, University of Hohenheim , Ottilie-Zeller-Weg 2, 70599 Stuttgart, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Carsten M. 
Buchmann For correspondence: carsten.buchmann{at}uni-hohenheim.de Frank M. Schurr 1 Institute of Landscape and Plant Ecology, University of Hohenheim , Ottilie-Zeller-Weg 2, 70599 Stuttgart, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Frank M. Schurr Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Field studies of bird communities typically require the fine-scale mapping of individuals. Passive acoustic monitoring combined with localization of individuals is a promising approach to gather such data. While various approaches for identification of species and localization of individuals have been proposed, no fully automated ready-to-use workflow is available so far. We here present a novel approach based on sound recordings with multiple cost-efficient automated recording units (Audiomoths). The workflow uses a well-established AI model (BirdNET; other models possible) for species identification and localizes the sources of all identified bird sounds with high accuracy. Tests with replayed sounds of different bird species in an agricultural landscape show that - after filtering out identifications with low identification confidence - the algorithm localizes more than 90% of the sounds within 5 m of the true location (85 % < 2 m). Recording and localization of wild birds demonstrate the applicability of the approach for avian ecology. This workflow is completely automated and ready-to-use, also for non-experts and can also be used when strong winds affect the speed of sound or if 3D localizations are of interest. By making data on individual bird locations accessible the presented work will help to advance fundamental and applied ecology as well as conservation. Introduction Localizing organisms in space is central to ecology ( Krebs 1972 ). In avian ecology, it is essential for bird censuses and for quantifying behaviour and habitat use. 
There is thus a strong need for simple, cost- and labour-efficient approaches to localize birds. The efficiency of bird monitoring has increased massively through the widespread use of passive acoustic monitoring (PAM) with automated recording units (ARU; e.g. Darras, Batáry et al., 2018, Ruff et al., 2020 and references therein for examples of different taxa, Pérez-Granados & Traba, 2021 ). Recently, low-cost ARUs for deployment in the wild have become available ( Smith et al., 2022 , Manzano-Rubio et al., 2022 ). Among these the Audiomoth by Open Acoustic Devices ( Hill et al., 2019 ) is to our knowledge the most widely used model nowadays since it provides good performance in relation to the low price of below 100 $ ( Darras et al., 2019 , Lapp et al., 2023 ). The combination of audio recordings of wildlife with artificial intelligence, e.g. convolutional neural networks (CNNs) for automated species identification, is a rapidly developing research area (e.g. Ruff et al., 2020 , Manzano-Rubio et al., 2022 , Müller et al., 2023 ). Among numerous models for automated species identification (see Xie et al., 2023 for a comprehensive review) for birds the CNN BirdNET ( Kahl et al., 2021 ) currently seems to be the state-of-the-art tool since it is open source and comparatively simple and ready-to-use. It was evaluated as the method of choice, particularly if one is interested in a large array of species and has limited knowledge in bird identification based on sounds, and hence cannot easily provide a large set of training data needed for more specific CNN approaches ( Lauha et al., 2022 , Michaud et al., 2023 , Xie et al., 2023 ). A next step besides pure species identification is the localization of the detected individuals. Pérez-Granados and Traba (2021) give a comprehensive overview of methods to use ARUs for density estimation. They assess the use of microphone arrays (i.e. 
several synchronized microphones distributed in space) that enable the localization and distinction of different individuals based on time delays of sounds on different recordings as most promising (see Rhinehart et al., 2020 for a summary on time delay-based methods). However, high costs and logistic effort as well as time required for the interpretation of the recordings are so far major disadvantages of this method ( Pérez-Granados & Traba, 2021 ). In recent years, various studies used microphone arrays to localize individuals of single species ( Wilson & Bayne, 2018 , Avots et al., 2022 ) or limited numbers of species ( Smith et al., 2022 ). They used expensive technology (beyond 1000 $ per ARU; Avots et al., 2022 ) in combination with manual species identification ( Smith et al., 2022 ), and laborious manual detection of time delays ( Wilson & Bayne, 2018 , Smith et al., 2022 ). Bistel et al. (2022) present a set of algorithms including complex CNNs that can do both identification and localization and tested it on synthetic sounds of one species in a small-scale lab-setting (<15 m). This approach is, however, far from an automated ready-to-use workflow, for instance because the AI models still need to be trained for every case study and there is no simple application or running code available that can be used directly for collected field data. What is still missing, but urgently needed, is a cheap and simple, ready-to-use integrated workflow. Moreover, such a workflow should be able to identify a large number of species and localize individuals in a fully automated manner. Here, we present a workflow that uses low-cost sound recorders (Audiomoths, Hill et al., 2019 ) and an existing AI for bird species identification (BirdNET, Kahl et al. 2021 ), to calculate accurate positions of individual birds – all entirely automated. This workflow thus offers a cost-efficient way to identify and localize individual birds in the field. 
We test the approach in both a simple and a structurally complex landscape. Materials and Methods Use of ARUs and setup in the field For localization of the origin of sounds based on time delays, the sound needs to be detected by at least three microphones (ARUs). The detection distance of the microphones should be considered for the spatial setup of the ARUs. The locations of the ARUs need to be recorded with high accuracy (e.g. high precision differential GPS). The ARUs should be set up to start at the same time to simplify temporal calibration (synchronization), see below. If the duration of time gaps between predefined recording periods on different ARUs cannot be guaranteed to be consistent between different ARUs, it is recommended to plan the observation period within a non-interrupted recording session (one WAV file per ARU). We used cost-efficient Audiomoth ARUs (Model 1.2.0, Open Acoustic Devices, Hill et al., 2019 ) in this study. In a preliminary test for these devices we had found a good detection distance of bird sounds up to 100m facing the sound and 50-100 m facing the opposite direction, dependent on background noise, e.g. wind. Gaps between recording periods as well as recording speed are not consistent to an acceptable degree (between five tested Audiomoths we had found differences of up to 1.3 s per file gap when recording continuously and up to 0.03 s runtime differences of 60 minute recordings). For this reason, the recording period was set to fall in a non-interrupted recording session and a temporal calibration with two calibration signals was performed. After deployment and (simultaneous) start of all ARUs in the study site, a loud sound signal for temporal calibration/synchronization that could be recorded by all ARUs was emitted from a defined position (loud clapping). Alternatively, an external signal (e.g. 
church bells from a known position) can also be used if it can be recognized and temporally located on all recordings with high precision. At the end of the observation period, another calibration signal was given. These signals were used to correct for small differences in starting times and recording speeds of individual ARUs. Temporal calibration of ARU recordings ( Fig. 1.1 ) We used Audacity (free software), to extract the timing of the calibration signals on each ARU recording. The sounds can be visualized, the signals marked and the markers exported as a so-called label file (TXT). The accuracy that can and should ideally be reached with this approach is < 0.003 s (corresponding to a sound travel distance of 1 m). From the times of the calibration signals on each recording, any event on any recording can be calibrated to the average time of all recordings by linear interpolation. The time needed for the sound to travel from the calibration point (where the calibration signal is emitted) to any ARU was considered in the calculation. These calibrated times of any recorded sound denote the standardized time since the first calibration signal. Download figure Open in new tab Fig.1. An automated workflow of bird identification and localization. Red arrows indicate parameters that were varied in this study. 1.: The exact times of the calibration signals are identified on all recordings. 2.: Sets of identifications belonging to the same bird sound are identified from the output of the ID algorithm (here BirdNET), these sets are filtered by confidence thresholds and the calibrated time for each identification is calculated. 3.: Time delays between the same sound signal on different recordings (sound file snippets of 3 seconds) are determined by determining the time shift yielding the highest spectrogram similarity by means of convolution. 
4.: For a grid of potential sound locations (red points as examples) surrounding the sample area, the RMSE between the expected and observed time delays is calculated. This procedure is then repeated around the grid point with smallest RMSE (green point) using a finer grid. Species identification ( Fig. 1.2 ) All recordings were analysed with BirdNET-Analyzer ( Kahl et al., 2021 , Software Version 2022), which produces a file that lists all species identified including the confidence of the identification in three second time slots (e.g. 6-9, 12-15, 33-36 s etc.) for the entire recording (from now on referred to as ‘ID hits’). Other methods for the identification can be used, if the time slot and identification confidence of any sound event are provided. We recommend time intervals of 3 s (as is the default in BirdNET) to cover large parts of a bird song or call but to still reduce the chance of a mixture of several sounds of different individuals or species within time intervals to a large extent. Besides using AI species identification this task can also be performed by experts (in the field or listening to the recordings), in which case the (real) times of the individual observations need to be provided for further analysis. Analysis and determination of delays of sound signals ( Fig. 1.3 ) Sets of ID hits (‘ID sets’) were identified that were found on at least three recordings, i.e. the same species in a 3 s time slot with a calibrated start time of the time slot that is not more different from the others than a threshold (here set to 0.75 s; note: by starting the recordings at the same time, the time slots are largely synchronous so that this threshold should not become relevant). 
Subsequently a two-step filter was applied: first, only ID hits exceeding a certain confidence threshold were considered (parameter IDCT 1, here set to 0.25), and secondly at least one of the ID hits from the ID set needed to exceed another confidence level (parameter IDCT 2, here set to 0.75, also tested: 0.35). Then a sound snippet of the corresponding 3 s interval was created (software R, package TuneR, R Core Team, 2021 , Ligges et al., 2023 ) for all recordings and the calibrated time of the start of each snippet was stored with it. For the automated determination of time delays of a sound signal on different recordings, the different snippets were converted into spectrograms (R package signal, Signal developers, 2023 ). Frequencies below 1000 and above 20000 Hz were excluded eliminating background noise but still retaining the relevant frequencies for bird sounds. Subsequently the phase information was discarded, spectrograms were converted to dB, scaled and centred. Then these spectrograms were compared with respect to similarity (see below). This comparison was done between one reference recording (the recording where the ID hit had the highest ID confidence) and each of the other recordings. To calculate time delays between sound signals, the spectrogram of each recording was shifted relative to the spectrogram of the reference recording (pixel by pixel; for a default recording setting of 48kHz a pixel represents 0.0027 s, which is a good compromise between runtime and accuracy). The temporal overlap for shifting the 3 s spectrograms relative to each other was set to 2.7 s. This means that in the beginning of the shifting procedure 0.3 s at the end of the reference recording and at the beginning of any other recording are not considered and vice versa at the end of the procedure (i.e. only 2.7 s are compared). 
Shifting the spectrograms by up to 0.3 s can detect delays for a situation where the distance of two ARUs to a sound source is different by up to ∼ 100 m. At each shifting step, similarity of the spectrograms was measured as the convolution of the two sound signals (the sum of all pairwise dB products for corresponding time and frequency). The time shift at which the similarity between the compared spectrograms is highest defines the time delay of the sound signal relative to the reference. The relative difference between maximum similarity and average similarity across all time shifts considered was recorded as a general performance measure indicating the quality of the time delay assessment, and could be used to filter the results later on. Together with the calibrated time of each snippet start this gives the total time delay between detections of a sound signal by any pair of ARUs. As alternative to the automated approach, time delays can also be determined manually/optically (e.g. in Audacity) and provided for further analysis. Localization of sound sources ( Fig. 1.4 ) Knowing at least two time delays of a sound signal (from three recordings) allows the localization of the sound origin. One can decide to use only the best ones with respect to the performance measure (see above), or more, up to the maximum number that are available for a specific sound (parameter ‘ND’ for number of time delays used, here set to 3, also tested: maximum available). For any point in a coarse 2D grid (we here used 5 m by 5 m, 100 m around the sample area) the corresponding time delays were calculated based on the speed of sound from this point to the locations of the ARUs (2D distance; note: wind speed can be considered here, see below). These expected time delays were compared to the measured time delays by means of RMSE. This procedure was then repeated for a finer 2D grid (here 0.5 m by 0.5 m) covering 10 m x 10 m around the best location found using the coarse grid. 
Subsequently, 3D localization was performed by sampling a cube (0.5 m by 0.5 m by 0.5 m, 10 m around the best point), considering 3D distances and, if applicable, 3D wind velocities from potential sound locations to the ARUs, to assess the height of the sound origin. The sonic speed (according to conditions, notably temperature) and wind speed as well as direction during the observation period can be specified by the user and are then considered by the algorithm. Wind is specified as a 3D velocity vector with x pointing East, y pointing North and z pointing upwards (we here used 335 m s⁻¹ and (0,0,0) m s⁻¹, for a recording at 7°C without wind). The relevance of incorporating wind for accurate localization of sound sources can be illustrated with a small example: in a situation of wind 20 m s⁻¹ from the West, and a sound location in between two ARUs at 50 m distance to each, all aligned with the wind, the signal arrival times at the ARUs correspond to those expected from a sound location that is shifted approx. 3 m to the West without wind. Our algorithm corrects for this bias by calculating any relevant speed of sound by vector addition of the respective sound velocity vector and the wind velocity vector. Field studies to test the identification and localization workflow In an agricultural field site (Agricultural Research Station Heidfeldhof of the University of Hohenheim, Stuttgart, Germany) we set up six Audiomoth ARUs (Model 1.2.0, Hill et al., 2019 ) in a rectangle of approx. 70 m by 60 m ( Fig. 3 ) in March 2023. In this study sound recordings were done to develop and test the algorithm and to evaluate different parameter settings (‘validation study/site’). Audiomoth ARUs were programmed to start at the same time, recording continuously at 48kHz with medium gain and mounted on metal poles at 1.5m height. We emitted a calibration signal from another defined position in the centre of the area by clapping two small metal boxes 9 times. 
From six defined positions, 1.5 m above ground, we then played bird sounds of five different species (Carrion Crow, Corvus corone, file: XC758247, White Wagtail, Motacilla alba , file: XC712328, Eurasian Skylark, Alauda arvensis , file: XC717648, Eurasian Tree Sparrow, Passer montanus , file: XC737729, Meadow Pipit, Anthus pratensis , file: XC760273; https://xeno-canto.org/ ). Each bird sound was played 13 times over the course of approx. 30 minutes with a smartphone and a Bluetooth speaker (JBL Flip Essential) at 75% of maximum volume in six different defined locations distributed in the rectangle of ARUs. After that a second calibration signal was emitted and the recordings were stopped. All positions were measured with a Trimble GeoX DGPS (differential correction in postprocessing resulted in an accuracy of 78% < 0.15 m, 90% < 0.3 m, 99% < 1 m). Wind was < 2 m s⁻¹ without predominant direction during the study, hence neglected in the analysis. A second set of recordings was collected in a traditional orchard with larger trees adjacent to a forest (at University Hohenheim Research Station Unterer Lindenhof, Ehingen, Germany) to apply the approach in a more closed landscape (‘application study/site’) in April 2023. Here we again used six Audiomoth ARUs arranged in a rectangle of approx. 100 m by 60 m ( Figs 6a,b ). To test the algorithm in an environment with strong wild bird activity, the same bird sounds were again played with the speaker, now 18 times in six defined locations. In this study, however, the total recording period was approx. 1.7 h to also record and subsequently analyse sounds of wild birds. In this study the spatial accuracy of the defined locations was: 40% < 0.15 m, 69% < 0.3 m, 93% < 1 m. Wind speed and direction were again negligible in this study. 
Results The localization algorithm was able to locate the origin of bird sounds (handheld speaker) that were identified by the AI (here: BirdNET) with moderate confidence levels (thresholds: 2 or more recordings > 0.25, one of these > 0.75) for most species with high accuracy. While for Carrion Crow no set of recordings passed these thresholds and for Eurasian Tree Sparrow only approx. 10%, for Eurasian Skylark, White Wagtail and Meadow Pipit all sounds passed the ID confidence thresholds and > 90% of sounds were located with an accuracy of less than 5 m, >85% of less than 2m, > 50% of less than 1m ( Figs 2 , 3 ). Download figure Open in new tab Fig. 2.: Proportion of sounds of different bird species replayed in the validation experiment that were correctly identified by BirdNET and localized with a certain accuracy. Download figure Open in new tab Fig. 3: Setup of the validation experiment in an agricultural landscape, showing the locations of ARUs, the calibration point (calibration signal emitted), the locations of bird sound playback and the localization of these bird sounds by the algorithm (UTM Zone 32U). The presented localization algorithm calculates 2D and 3D positions of the sound source. Since our validation site was flat and ARUs as well as the playback locations of bird sounds were at the same height, considering the 3rd dimension did not impact our results for the 2D locations of sound origins. The resulting z-coordinates were, however, randomly distributed in the scope of values that were given to be tested by the algorithm (20 m). This is not surprising, since ARUs mounted in a plain can gather little information on different heights of a sound source. Two parameters controlling the localization algorithm were found to affect localization accuracy namely the second ID confidence threshold (IDCT2) and the number of time delays used for localization (ND). 
We found that a higher IDCT2, while lowering the number of sound events that enter the actual localization, increases the accuracy of these localizations ( Fig. 4 ). (If a sound is on n recordings, n-1 time delays are available; the quality of their determination is quantified with a performance measure based on the success of the spectrogram overlay analysis using convolution, compare Fig. 1.3 ). Localization accuracy increased further if only the three most clearly defined time delays (ND) were used instead of using all time delays that were available for any sound ( Fig. 4 ). Download figure Open in new tab Fig. 4. Effects of the second ID confidence threshold (IDCT2) and number of time delays used for localization (ND) on the proportion of correctly identified bird sounds that were localized within 2 m of the true location. Note: 100% here denotes the number of sounds that passed IDCT1 and IDCT2. To test how inaccuracies in quantifying delay times affect localization accuracy, we added random noise (n = 250, normal distribution, mean = 0, SD = 0.0025 s resulting in 95% of values within 0.01 s around the true value; this is oriented towards the fine vertical elements in the spectrogram of a Eurasian Skylark sound, compare Fig. 1 ) to calculated delay times for different sound locations, and used these to re-estimate the sound location ( Fig. 5 ). In the area delimited by the ARUs, 95% of localizations were < 2m from the original sound location. This localization accuracy decreased as sound locations moved away from this area ( Fig. 5 ). Download figure Open in new tab Fig. 5: Error propagation from inaccuracies in the determination of time delays to the localization of sound origins in the validation experiment (UTM Zone 32U). In the application study in a more closed landscape (orchard next to a forest) where many birds were actively singing, a lower number of replayed bird sounds passed the ID thresholds ( Fig. 6a ). 
However, of these sounds 93 % were localized with an accuracy of < 5 m (61 % < 2 m). In the application study we also localized all natural bird sounds that BirdNET identified in the 1.7 h recording period. This yielded 202 localizations of 9 bird species ( Fig. 6b ). These can be used to derive habitat utility descriptions for different species or even individuals (based on the known time for each localization), such as the estimation of home ranges (for an example see Fig. 6b ). Download figure Open in new tab Fig. 6: Localization result of the played bird sounds (a) during the application study at the natural field site (orchard close to forest), and bird localizations of bird sounds beyond the manually played ones during the recording period lasting for 1.7 h (b). In (b) the polygons illustrate the 75 % utility density based on the localizations of two species, Marsh Tit and Eurasian Blackcap (function getverticeshr in R package adehabitatHR) as a possible application of localization results (background photo from Bingmaps). Discussion The presented automated workflow can facilitate bird monitoring, conservation and fundamental ecological research by identifying and precisely locating birds based on their sounds. It requires a simple setup in the field and uses comparatively low-cost sound recorders. The localization analysis is coded in a ready-to-use R script which can be executed easily without advanced programming skills. If required, the workflow also enables localization in 3D space and can correct for bias imposed by strong winds. In recent years other studies have presented solutions for different aspects of our workflow. While BirdNET currently seems to be the AI of choice for multi-species systems (compare Toenies & Rich, 2021 , Cole et al., 2022 , Singer et al., 2024 , for a comparison to other methods see Xie et al., 2023 ), the development of AI systems for species identification can be expected to be fast in the next years. 
Any novel output from possibly more advanced AI models in the future can be easily incorporated into our workflow without much adjustment. Some recent studies have used time delays on different recordings for localization of wildlife. However, these have either only done so for single to few species, used comparatively expensive technology or require manual determination of time delays. Hence, these approaches are still far from ready-to-use for multi-species analyses (e.g. Bistel et al. 2022 , Smith et al., 2022 , Barré et al., 2024 ). While our workflow still allows for both alternatives, manual identification of sounds by experts and manual/visual detection of time delays, the complete analysis presented here, using AI for species identification and the automated time delay assessment and subsequent localization, is the first cost-efficient, simple and ready-to-use workflow for identification and localization of individual birds in species-rich systems. Still, AI for identification provided, the approach can also be applied to other taxa that can be detected by sound. The value of automated bird localization The ongoing decline in bird diversity globally ( Rosenberg et al., 2019 , Burns et al., 2021 ), and in European farmland birds specifically ( Traba & Morales, 2019 , Rigal et al., 2023 ), calls for efficient monitoring to quantify trends for different species and understand the underlying drivers. Passive acoustic monitoring (PAM) has considerable potential in this respect but faces the unresolved challenge of how to convert the number of recorded bird calls into an estimate of population density ( Pérez-Granados & Traba, 2021 ). Knowing the exact location and time of any bird call overcomes this challenge by making it possible to apply standard methods of distance sampling and home range mapping ( Bibby et al. 2000 ; Buckland 2006 ). 
Considering that habitat destruction and loss of landscape heterogeneity is a major driver of declining bird abundance ( Xu et al., 2018 , Traba & Morales, 2019 , Rigal et al., 2023 ) understanding fine scale habitat use of individuals is crucial for targeted conservation efforts (e.g. Blackburn & Cresswell 2015 ). Here a method that combines low-disturbance PAM with accurate localization of individuals can significantly improve our understanding of fine-scale resource and habitat use of species and individuals including movement behaviour and how these are affected by landscape changes. Also, fundamental ecological research on biotic interactions in natural communities as well as the assessment of ecosystem services such as seed dispersal ( Schurr et al. 2009 ) or pollination ( Schmid et al., 2015 ) by birds, can substantially benefit from small-scale spatial information of bird activity and movement. Challenges, limitations and extensions of the presented approach The temporal calibration of ARUs is a simple procedure that allows the use of comparatively cheap equipment, but can cause some disturbance for the birds. If available, an external sound signal (like bells from a close-by church tower) can be used to avoid disturbance. If a more advanced ARU system is available, where the start and end times of recordings are synchronous (<0.002 s, e.g. through cable or network connection) and recording speed is comparable, or GPS-equipped ARUs (e.g. GPS board for Audiomoths) can be used, which provide a comparable time stamp for all recordings, the temporal calibration procedure can be omitted. In this case an interrupted recording scheme (e.g. 15 minutes once every hour) can also be used, since calibration signals at the beginning and the end of every non-interrupted recording period would not be required. As in all PAM methods the detection distance for the species - ARU combination of interest needs to be considered and tested before deploying the ARUs. 
If a larger area is to be sampled with a grid of ARUs, the detection distance of the ARUs in all directions especially needs to be considered when choosing a grid resolution that ensures that any sound originating from the sampling area can be identified on at least three recordings. For Audiomoths, our test indicated a detection distance that is reduced by up to ∼ 50% when not facing the sound origin. The fact that even more advanced and expensive ARU systems do not have considerably larger detection distances (compare Darras, Furnas et al., 2018, analysing detection distances, and Darras et al., 2019 , comparing prices of ARU systems) points to the possibility of, and the economic rationale for, using cheap ARUs, especially when larger areas are to be monitored and numerous ARUs are thus required. In our study, we applied a general, rather low threshold for identification confidence (0.25) for all recordings and a second, higher threshold (0.75) for at least one identification of a set of recordings belonging to one bird sound. While the first one simply filters out very unlikely identifications, the latter is a reasonable value for most species (see Pérez-Granados, 2023 , and references therein for a more detailed discussion on confidence thresholds). Still, applying these thresholds had different consequences for sounds of different species, namely that from 0 % to 100 % of sets of identifications (from different ARUs) belonging together were excluded. Within species, lowering this second confidence threshold of identifications entering the localization procedure decreased the accuracy of localizations. For any real field monitoring study, lowering the confidence score threshold (i.e. accepting an identification more easily) moreover increases the risk of false identifications and hence wrong conclusions. 
Our results and those from several recent studies illustrate that confidence levels have to be chosen with care and possibly need to be adjusted (and validated) for different species (e.g. Barré et al., 2019 , Cole et al., 2022 , Metcalf et al., 2022 , Singer et al., 2024 ). Setting different thresholds for different species could be easily incorporated in the analysis (exemplary code is included). Still, the approach to handle and deal with confidence scores is a topic that currently receives a lot of attention and can be expected to soon yield new insights that will help to improve this aspect of our workflow. In the tested setup (ARUs at the same height) the 3D localization did not yield reasonable information on the height of the sound source. A simple example can illustrate the reason: in a situation where two ARUs are 100 m apart and a sound source is in between at distances of 70 m and 30 m, raising the height of the sound source by 15 m changes the distances to the ARUs only slightly (1.5 and 3.6 m) and hence causes a change of the time delay between the recordings of both ARUs of only 0.0058 s, which is very small compared to the resolution of the temporal calibration and the spectrogram analysis. Hence, small inaccuracies in the determined time delays (by spectrogram comparisons) cause comparatively large variation in the z coordinate that is calculated. For reasonable use of 3D localization, ARUs need to be mounted at different heights, and for a specific localization of any sound only cases should be considered where the respective recordings used for this localization also stem from ARUs at different heights. The number of time delays (pairs of ARUs) used for localization also needs to be considered. Ignoring pairs of ARUs for which the time delay could not be determined precisely improved the localization performance ( Fig. 4 ). However, while in principle enough, only using two delays (i.e. 
three recordings) can be problematic if all three ARUs are placed in a row (which can happen if ARUs are set up in a grid array), since then a sound origin could be located at the same distance left or right of this line. Hence, we recommend using at least three time delays. If ARUs are set up at a spacing that is based on their detection distance, it can almost be ruled out that for a specific sound source four ARUs in a row, and not ARUs surrounding the sound, would deliver the clearest time delays and hence would be chosen for the localization. Since during our studies there were no strong winds from a predominant direction, this aspect was not considered (wind velocity vector was set to (0,0,0) m s⁻¹). Still, in situations such as a sample area on a slope in times of downslope (katabatic) winds it can be crucial to specify all three wind velocity dimensions. Conclusions This study presents the first ever ready-to-use workflow that, after a simple setup using affordable equipment, combines species identification with individual localization for birds, both fully automated. The species identification can be performed by experts or using available AI models for the organism of interest. Since this was found to be the most critical aspect for missing bird sounds, expected future advances in the field of AI for species identification will strongly benefit our approach. The subsequent localization approach can in principle be used for any sound source. Application of the workflow in environments where 3D localization is relevant, for example aquatic systems, or in situations with strong wind, is possible without larger adjustments. Being able to identify and localize bird individuals will help to advance monitoring and develop targeted conservation measures for species. 
Also, fundamental ecological research on species interactions, behavioural ecology and ecosystem services and functioning can substantially benefit from a simple and fast workflow providing novel data on individual movements at high spatial and temporal resolution. Author contributions CMB and FMS conceived the ideas for the study and the methodology; CMB led the field study and collected the data; CMB and FMS developed the workflow, CMB analysed the results and led the writing of the manuscript. All authors contributed critically to the drafts and gave final approval for publication. Data availability A zip-archive including R-code of the entire analysis together with exemplary data can be downloaded from: https://tinyurl.com/yhv4tcaj (these will be published on figshare after publication) Acknowledgements We thank S. Klos, M. Kasten and T. Köhler for helping in the field and running preliminary studies. Moreover, we thank the teams of the University Hohenheim research stations Heidfeldhof and Unterer Lindenhof. Footnotes https://tinyurl.com/yhv4tcaj References ↵ Avots , E. , Vecvanags , A. , Filipovs , J. , Brauns , A. , Skudrins , G. , Done , G. , Ozolins , J. , Anbarjafari , G. , & Jakovels , D . ( 2022 ). Towards automated detection and localization of red deer Cervus elaphus using passive acoustic sensors during the rut . Remote Sensing , 14 ( 10 ), 2464 . doi: 10.3390/rs14102464 OpenUrl CrossRef ↵ Barré , K. , Baudouin , A. , Froidevaux , J. S. P. , Chartendrault , V. , & Kerbiriou , C . ( 2024 ). Insectivorous bats alter their flight and feeding behaviour at ground-mounted solar farms . Journal of Applied Ecology , 61 ( 2 ), 328 – 339 . doi: 10.1111/1365-2664.14555 OpenUrl CrossRef ↵ Barré , K. , Le Viol , I. , Julliard , R. , Pauwels , J. , Newson , S. E. , Julien , J. , Claireau , F. , Kerbiriou , C. , & Bas , Y. ( 2019 ). Accounting for automated identification errors in acoustic surveys . Methods in Ecology and Evolution , 10 ( 8 ), 1171 – 1188 . 
doi: 10.1111/2041-210X.13198 OpenUrl CrossRef ↵ Bibby , C. J. , Burgess , N. D. , Hill , D. A. , & Mustoe , S. ( 2000 ). Bird census techniques ( 2 nd edition). Academic Press London . ↵ Bistel , R. A. , Martinez , A. , & Mindlin , G. B . ( 2022 ). Neural networks that locate and identify birds through their songs . The European Physical Journal Special Topics , 231 ( 3 ), 185 – 194 . doi: 10.1140/epjs/s11734-021-00405-5 OpenUrl CrossRef ↵ Blackburn , E. , & Cresswell , W . ( 2015 ). Fine-scale habitat use during the non-breeding season suggests that winter habitat does not limit breeding populations of a declining long-distance Palearctic migrant . Journal of Avian Biology , 46 , 622 – 633 . doi: 10.1111/jav.00738 OpenUrl CrossRef ↵ Buckland , S. T . ( 2006 ). Point-transect surveys for songbirds: Robust methodologies . The Auk , 123 ( 2 ), 345 – 357 . doi: 10.1093/auk/123.2.345 OpenUrl CrossRef ↵ Burns , F. , Eaton , M. A. , Burfield , I. J. , Klvaňová , A. , Šilarová , E. , Staneva , A. , & Gregory , R. D . ( 2021 ). Abundance decline in the avifauna of the European Union reveals cross-continental similarities in biodiversity change . Ecology and Evolution , 11 ( 23 ), 16647 – 16660 . doi: 10.1002/ece3.8282 OpenUrl CrossRef ↵ Cole , J. S. , Michel , N. L. , Emerson , S. A. , & Siegel , R. B . ( 2022 ). Automated bird sound classifications of long-duration recordings produce occupancy model outputs similar to manually annotated data . Ornithological Applications , 124 ( 2 ), duac003 . doi: 10.1093/ornithapp/duac003 OpenUrl CrossRef Darras , K. , Batáry , P. , Furnas , B. , Celis-Murillo , A. , Van Wilgenburg , S. L. , Mulyani , Y. A. , & Tscharntke , T. ( 2018 ). Comparing the sampling performance of sound recorders versus point counts in bird surveys: A meta-analysis . Journal of Applied Ecology , 55 ( 6 ), 2575 – 2586 . doi: 10.1111/1365-2664.13229 OpenUrl CrossRef ↵ Darras , K. , Batáry , P. , Furnas , B. J. , Grass , I. , Mulyani , Y. A. , & Tscharntke , T . 
( 2019 ). Autonomous sound recording outperforms human observation for sampling birds: A systematic map and user guide . Ecological Applications , 29 ( 6 ). doi: 10.1002/eap.1954 OpenUrl CrossRef Darras , K. , Furnas , B. , Fitriawan , I. , Mulyani , Y. , & Tscharntke , T . ( 2018 ). Estimating bird detection distances in sound recordings for standardizing detection ranges and distance sampling . Methods in Ecology and Evolution , 9 ( 9 ), 1928 – 1938 . doi: 10.1111/2041-210X.13031 OpenUrl CrossRef ↵ Hill , A. P. , Prince , P. , Snaddon , J. L. , Doncaster , C. P. , & Rogers , A . ( 2019 ). AudioMoth: A low-cost acoustic device for monitoring biodiversity and the environment . HardwareX , 6 , e00073 . doi: 10.1016/j.ohx.2019.e00073 OpenUrl CrossRef ↵ Kahl , S. , Wood , C. M. , Eibl , M. , & Klinck , H . ( 2021 ). BirdNET: A deep learning solution for avian diversity monitoring . Ecological Informatics , 61 , 101236 . doi: 10.1016/j.ecoinf.2021.101236 OpenUrl CrossRef ↵ Krebs , C. J . ( 1972 ). Ecology: The Experimental Analysis of Distribution and Abundance . Harper & Row . ↵ Lapp , S. , Stahlman , N. , & Kitzes , J . ( 2023 ). A Quantitative Evaluation of the Performance of the Low-Cost AudioMoth Acoustic Recording Unit . Sensors , 23 ( 11 ), 5254 . doi: 10.3390/s23115254 OpenUrl CrossRef ↵ Lauha , P. , Somervuo , P. , Lehikoinen , P. , Geres , L. , Richter , T. , Seibold , S. , & Ovaskainen , O . ( 2022 ). Domain-specific neural networks improve automated bird sound recognition already with small amount of local data . Methods in Ecology and Evolution , 13 ( 12 ), 2799 – 2810 . doi: 10.1111/2041-210X.14003 OpenUrl CrossRef ↵ Ligges , U. , Krey , S. , Mersmann , O. , & Schnackenberg , S. ( 2023 ). tuneR: Analysis of Music and Speech . https://CRAN.R-project.org/package=tuneR ↵ Manzano-Rubio , R. , Bota , G. , Brotons , L. , Soto-Largo , E. , & Pérez-Granados , C . ( 2022 ). 
Low-cost open-source recorders and ready-to-use machine learning approaches provide effective monitoring of threatened species . Ecological Informatics , 72 , 101910 . doi: 10.1016/j.ecoinf.2022.101910 OpenUrl CrossRef ↵ Metcalf , O. C. , Barlow , J. , Bas , Y. , Berenguer , E. , Devenish , C. , França , F. , Marsden , S. , Smith , C. , & Lees , A. C . ( 2022 ). Detecting and reducing heterogeneity of error in acoustic classification . Methods in Ecology and Evolution , 13 ( 11 ), 2559 – 2571 . doi: 10.1111/2041-210X.13967 OpenUrl CrossRef ↵ Michaud , F. , Sueur , J. , Le Cesne , M. , & Haupert , S. ( 2023 ). Unsupervised classification to improve the quality of a bird song recording dataset . Ecological Informatics , 74 , 101952 . doi: 10.1016/j.ecoinf.2022.101952 OpenUrl CrossRef ↵ Müller , J. , Mitesser , O. , Schaefer , H. M. , Seibold , S. , Busse , A. , Kriegel , P. , Rabl , D. , Gelis , R. , Arteaga , A. , Freile , J. , Leite , G. A. , De Melo , T. N. , LeBien , J. , Campos-Cerqueira , M. , Blüthgen , N. , Tremlett , C. J. , Böttger , D. , Feldhaar , H. , Grella , N. , … Buřivalová , Z. ( 2023 ). Soundscapes and deep learning enable tracking biodiversity recovery in tropical forests . Nature Communications , 14 ( 1 ), 6191 . doi: 10.1038/s41467-023-41693-w OpenUrl CrossRef ↵ Pérez-Granados , C . ( 2023 ). BIRDNET: Applications, performance, pitfalls and future opportunities . Ibis , 165 ( 3 ), 1068 – 1075 . doi: 10.1111/ibi.13193 OpenUrl CrossRef ↵ Pérez-Granados , C. & Traba , J . ( 2021 ). Estimating bird density using passive acoustic monitoring: A review of methods and suggestions for further research . Ibis, v . 163 ( 3 ), 765 – 783 . PubAg. doi: 10.1111/ibi.12944 OpenUrl CrossRef ↵ R Core Team . ( 2021 ). R: A language and environment for statistical computing . R Foundation for Statistical Computing . https://www.R-project.org/ ↵ Rhinehart , T. A. , Chronister , L. M. , Devlin , T. , & Kitzes , J . ( 2020 ). 
Acoustic localization of terrestrial wildlife: Current practices and future opportunities . Ecology and Evolution , 10 ( 13 ), 6794 – 6818 . doi: 10.1002/ece3.6216 OpenUrl CrossRef ↵ Rigal , S. , Dakos , V. , Alonso , H. , Auniņš , A. , Benkő , Z. , Brotons , L. , Chodkiewicz , T. , Chylarecki , P. , De Carli , E. , Del Moral , J. C. , Domşa , C. , Escandell , V. , Fontaine , B. , Foppen , R. , Gregory , R. , Harris , S. , Herrando , S. , Husby , M. , Ieronymidou , C. , … Devictor , V. ( 2023 ). Farmland practices are driving bird population decline across Europe . Proceedings of the National Academy of Sciences , 120 ( 21 ), e2216573120 . doi: 10.1073/pnas.2216573120 OpenUrl CrossRef ↵ Rosenberg , K. V. , Dokter , A. M. , Blancher , P. J. , Sauer , J. R. , Smith , A. C. , Smith , P. A. , Stanton , J. C. , Panjabi , A. , Helft , L. , Parr , M. , & Marra , P. P . ( 2019 ). Decline of the North American avifauna . Science , 366 ( 6461 ), 120 – 124 . doi: 10.1126/science.aaw1313 OpenUrl Abstract / FREE Full Text ↵ Ruff , Z. J. , Lesmeister , D. B. , Duchac , L. S. , Padmaraju , B. K. , & Sullivan , C. M . ( 2020 ). Automated identification of avian vocalizations with deep convolutional neural networks . Remote Sensing in Ecology and Conservation , 6 ( 1 ), 79 – 92 . doi: 10.1002/rse2.125 OpenUrl CrossRef ↵ Schmid , B. , Nottebrock , H. , Esler , K. J. , Pagel , J. , Pauw , A. , Böhning-Gaese , K. , Schurr , F. M. , & Schleuning , M . ( 2015 ). Reward quality predicts effects of bird-pollinators on the reproduction of African Protea shrubs . Perspectives in Plant Ecology, Evolution and Systematics , 17 ( 3 ), 209 – 217 . doi: 10.1016/j.ppees.2015.02.007 OpenUrl CrossRef ↵ Schurr , F. M. , Spiegel , O. , Steinitz , O. , Trakhtenbrot , A. , Tsoar , A. , & Nathan , R . ( 2009 ). Long-distance seed dispersal . In Annual Plant Reviews: Vol. Fruit Development and Seed Dispersal (p. 204 – 237 ). doi: 10.1007/978-90-481-2406-0 OpenUrl CrossRef ↵ Signal developers ( 2023 ). 
Signal: Signal processing . https://r-forge.r-project.org/projects/signal/ ↵ Singer , D. , Hagge , J. , Kamp , J. , Hondong , H. , & Schuldt , A . ( 2024 ). Aggregated time-series features boost species-specific differentiation of true and false positives in passive acoustic monitoring of bird assemblages . Remote Sensing in Ecology and Conservation , rse2.385 . doi: 10.1002/rse2.385 OpenUrl CrossRef ↵ Smith , B. R. , Root-Gutteridge , H. , Butkiewicz , H. , Dassow , A. , Fontaine , A. C. , Markham , A. , Owens , J. , Schindler , L. , Wijers , M. , & Kershenbaum , A . ( 2022 ). Acoustic localisation of wildlife with low-cost equipment: Lower sensitivity, but no loss of precision . Wildlife Research , 49 ( 4 ), 372 – 381 . doi: 10.1071/WR21089 OpenUrl CrossRef ↵ Toenies , M. , & Rich , L . ( 2021 ). Advancing bird survey efforts through novel recorder technology and automated species identification . California Fish and Wildlife Journal , 107 ( 2 ), 56 – 70 . doi: 10.51492/cfwj.107.5 OpenUrl CrossRef ↵ Traba , J. , & Morales , M. B . ( 2019 ). The decline of farmland birds in Spain is strongly associated to the loss of fallowland . Scientific Reports , 9 ( 1 ), 9473 . doi: 10.1038/s41598-019-45854-0 OpenUrl CrossRef ↵ Wilson , S. J. , & Bayne , E. M . ( 2018 ). Use of an acoustic location system to understand how presence of conspecifics and canopy cover influence ovenbird (Seiurus aurocapilla) space use near reclaimed wellsites in the boreal forest of alberta . Avian Conservation and Ecology , 13 ( 2 ). doi: 10.5751/ACE-01248-130204 OpenUrl CrossRef ↵ Xie , J. , Zhong , Y. , Zhang , J. , Liu , S. , Ding , C. , & Triantafyllopoulos , A . ( 2023 ). A review of automatic recognition technology for bird vocalizations in the deep learning era . Ecological Informatics , 73 , 101927 . doi: 10.1016/j.ecoinf.2022.101927 OpenUrl CrossRef ↵ Xu , X. , Xie , Y. , Qi , K. , Luo , Z. , & Wang , X . ( 2018 ). 
Detecting the response of bird communities and biodiversity to habitat loss and fragmentation due to urbanization . Science of The Total Environment , 624 , 1561 – 1576 . doi: 10.1016/j.scitotenv.2017.12.143 OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted July 04, 2024. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following A ready-to use workflow for automated, sound-based bird identification and localization Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share A ready-to use workflow for automated, sound-based bird identification and localization Carsten M. Buchmann , Frank M. Schurr bioRxiv 2024.07.02.601711; doi: https://doi.org/10.1101/2024.07.02.601711 Share This Article: Copy Citation Tools A ready-to use workflow for automated, sound-based bird identification and localization Carsten M. Buchmann , Frank M. 
Schurr bioRxiv 2024.07.02.601711; doi: https://doi.org/10.1101/2024.07.02.601711 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Ecology Subject Areas All Articles Animal Behavior and Cognition (7647) Biochemistry (17728) Bioengineering (13921) Bioinformatics (42047) Biophysics (21490) Cancer Biology (18637) Cell Biology (25555) Clinical Trials (138) Developmental Biology (13403) Ecology (19942) Epidemiology (2067) Evolutionary Biology (24368) Genetics (15625) Genomics (22549) Immunology (17764) Microbiology (40475) Molecular Biology (17208) Neuroscience (88761) Paleontology (667) Pathology (2842) Pharmacology and Toxicology (4834) Physiology (7659) Plant Biology (15175) Scientific Communication and Education (2047) Synthetic Biology (4304) Systems Biology (9835) Zoology (2272)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00