---
title: "Bird Recognition"
output:
  html_notebook:
    toc: TRUE
    toc_float: TRUE
---

GitHub repo: https://github.com/UoA-eResearch/bird_recognition

## Load the necessary packages

```{r setup, warning=FALSE}
library(bioacoustics)  # read_audio() for loading wave files
library(tuneR)         # melfcc() for MFCC features
library(seewave)       # oscillo() and spectro() plots
library(dplyr)         # data wrangling
library(tools)         # file_path_sans_ext()
library(randomForest)  # random forest classifier
library(stringr)       # str_split_fixed() for filename parsing
library(keras)         # deep learning
```

## Load the wave files, split the filename metadata into columns, and render some spectrograms

```{r load}
files = list.files("wav_files_playback", "\\.wav$", full.names=TRUE) # pattern is a regex, so escape the dot
files_without_extension = basename(file_path_sans_ext(files))
wavs = setNames(
  lapply(
    files,
    read_audio
  ),
  files_without_extension
)
# Filenames follow a birdid_calltype_idnumber convention - split them into columns
metadata = data.frame(str_split_fixed(files_without_extension, "_", 3), stringsAsFactors = FALSE)
metadata = cbind(metadata, files_without_extension, stringsAsFactors = FALSE)
colnames(metadata) = c("birdid", "calltype", "idnumber", "filename")
metadata$birdid = as.factor(metadata$birdid) # randomForest needs a factor response for classification
head(metadata)
oscillo(wavs[[1]])

birds = unique(metadata$birdid)
filtered = filter(metadata, birdid == birds[1])
for (i in 1:nrow(filtered)) {
  audio = wavs[[filtered$filename[i]]]
  spectro(audio, main=filtered$filename[i])
}
```

Each calltype seems to have a unique spectrogram.
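
For reference, this is how `str_split_fixed` decomposes one of these names into the three metadata columns (a minimal sketch using a hypothetical filename that follows the `birdid_calltype_idnumber` convention above):

```r
library(stringr)
# Hypothetical filename, for illustration only
str_split_fixed("fem1_song_001", "_", 3)
#      [,1]   [,2]   [,3]
# [1,] "fem1" "song" "001"
```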

## Split the dataset into train / test

The test set keeps one recording for each combination of birdid / calltype; all remaining recordings form the training set.

```{r split}
test = metadata %>%
          group_by(birdid, calltype) %>%
          filter(row_number() == 1)
test
train = metadata %>%
          group_by(birdid, calltype) %>%
          filter(row_number() != 1)
train
```
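
A quick sanity check on the split (a sketch; it assumes the two sets should be disjoint and together cover every file):

```r
stopifnot(
  !any(test$filename %in% train$filename),   # no recording appears in both sets
  nrow(test) + nrow(train) == nrow(metadata) # nothing was lost
)
```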

## Train a random forest classifier based on Mel-frequency cepstral coefficients (MFCCs)

```{r mfcc}
mels = sapply(wavs, melfcc, numcep = 20) # Calculate MFCCs; returns a named list of matrices, one per file
head(mels[[1]]) # Each result is a matrix of 20 coefficients across nrow time frames
```

```
          [,1]       [,2]       [,3]     [,4]      [,5]      [,6]      [,7]      [,8]       [,9]     [,10]     [,11]    [,12]      [,13]
[1,]  68.69654  -3.780422   4.476156 5.271380  3.650631  1.741341  4.444163  4.672678  4.5178220 5.6266429  6.415695 4.343187  0.3932380
[2,]  88.96067  -6.927129  -6.818393 6.524440  2.334660 -3.355236 -7.613945  6.056264  4.9538498 0.4898021 -1.436976 9.079844 -0.1176275
[3,] 106.16452 -11.646540 -12.414796 5.262883  3.662236 -4.392132 -6.964880  5.706424  3.9451137 2.5565591 -1.964917 5.797055 -1.4145690
[4,] 105.03493 -12.002984 -16.421228 5.316297  4.980148 -6.378010 -8.123595  6.597439  1.3899451 2.0900014 -1.035095 1.936149 -0.1160323
[5,] 106.57354  -8.992413 -18.565619 5.524081  8.566183 -5.366326 -5.994870  7.726556 -0.3243773 2.2493048  2.440603 2.514020 -2.8800924
[6,] 106.27823  -9.205815 -18.199581 6.648214 10.821699 -5.813343 -6.089672 11.239428  1.9412227 0.1380098  5.415030 4.131425 -6.6722864
          [,14]      [,15]      [,16]        [,17]     [,18]     [,19]     [,20]
[1,]  0.3931017   5.003128  2.3691691   1.32332143  1.554147  2.412560  3.804502
[2,] -6.2854837   1.201570 -2.7528557   1.06447825 11.173515 -2.219323 13.336220
[3,] -6.1204109  -3.322195  0.2802506   0.09545402 14.361013  8.248917 22.918580
[4,]  0.3712420 -10.974840 -9.7360260 -10.41496130  5.833937 10.492120 29.569579
[5,] -1.6816549 -12.407400 -0.7134785  -1.69401514 10.708495 10.874922 25.567692
[6,] -5.2800663 -10.521954  3.9942376   8.10396261 21.555281 13.163049 24.356962
```

```{r mfcc-image}
image(mels[[1]]) # Heatmap of the MFCC matrix: time frames vs coefficients
```
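
The number of rows in each MFCC matrix is set by the recording length. A rough cross-check, assuming `melfcc`'s default 10 ms hop (`hoptime = 0.01`):

```r
wav = wavs[[1]]
dur = length(wav@left) / wav@samp.rate        # duration in seconds
c(expected = floor(dur / 0.01), actual = nrow(mels[[1]]))
```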

```{r train-matrix}
trainMels = mels[train$filename] # Select the MFCCs corresponding to the training dataset
trainM = do.call(rbind, trainMels) # melfcc gives one matrix per file - rbind stacks them into a single frames x coefficients matrix
trainM.labels = rep(train$birdid, sapply(trainMels, nrow)) # One birdid label per MFCC frame
rownames(trainM) = trainM.labels
head(trainM)
```

```
          [,1]       [,2]       [,3]     [,4]      [,5]       [,6]      [,7]     [,8]        [,9]      [,10]     [,11]     [,12]      [,13]
fem1  68.77074  -2.213384   4.628686 5.367107  4.975266   2.363725  1.711655 4.081043  2.15101408  6.2229622  7.754493  8.741295 11.2565395
fem1  91.08736  -6.059811  -9.014764 8.962229  6.072573  -8.355901  1.810023 9.093744  2.30597955  5.0335445  0.207314  3.978590 -0.2043023
fem1 103.19964 -11.542509 -15.596763 5.258514  5.115331 -12.100052 -1.563007 7.470451 -0.56864577  2.8360797 -1.306980 -1.008521 -0.5511411
fem1 104.50974 -11.346757 -15.660340 6.592491  8.500276 -11.932705 -2.245001 9.080727 -0.04320037  2.4075273  2.280807 -2.626861  4.3281720
fem1 106.57821 -10.260614 -19.382625 5.064119  9.259625 -11.346783 -2.406839 8.412054 -2.31515390  0.7805553  4.681734 -1.877691 -0.4420264
fem1 107.95517  -8.147488 -17.902705 6.439735 11.188799 -11.997391 -3.764681 9.622480 -0.24517770 -0.4588778  5.608058 -2.104510 -2.2810293
          [,14]      [,15]       [,16]     [,17]     [,18]      [,19]     [,20]
fem1   6.572777  4.4741195   2.1953349  2.065046  2.129172 -0.6963477 -3.261804
fem1 -11.330215  3.9435470  -4.5559470  3.389776  8.398569  7.7292334 11.494852
fem1 -13.738228  0.5436374  -7.8162985  2.401570 10.051834 15.9195557 23.747543
fem1  -6.853353 -0.8192290 -13.5075895 -2.579033  3.912165 15.0152520 30.905115
fem1 -12.124533 -0.9039051  -4.0965733  4.939002 12.314143 14.9999231 26.944541
fem1 -12.702156 -0.1276930  -0.1820043  9.876577 12.962467  9.0181244 16.009651
```
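
Before training, it is worth confirming that the feature matrix and the label vector line up (a minimal sanity check):

```r
stopifnot(nrow(trainM) == length(trainM.labels))
table(trainM.labels) # frames available per bird
```
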
```{r rf-train}
set.seed(1337) # For reproducibility
rf = randomForest(trainM, trainM.labels, importance = FALSE, proximity = FALSE, replace = TRUE, ntree = 4000, mtry = 4)
rf
```

```
Call:
 randomForest(x = trainM, y = trainM.labels, ntree = 4000, mtry = 4,      replace = TRUE, importance = FALSE, proximity = FALSE) 
               Type of random forest: classification
                     Number of trees: 4000
No. of variables tried at each split: 4

        OOB estimate of  error rate: 22.9%
Confusion matrix:
      fem1 fem11 fem2 fem5 fem8 fem9 mal1 mal11 mal2 mal5 mal8 mal9 class.error
fem1    86     1    3    2    2    0   10     5    3    2    2    1   0.2649573
fem11    0    91    0    0    0    5    1     1    1    4    1    3   0.1495327
fem2     2     0   98    0    0    1    0     3    3    8    0    1   0.1551724
fem5     3     0    4  102    0    1    3     2    7   12    4    0   0.2608696
fem8     0     0    0    0   88    3    4     1   15    7    7    6   0.3282443
fem9     0     2    0    1    0   92   10     6    5    8    5    5   0.3134328
mal1     2     0    0    0    4    0  217    16   19    8    7    6   0.2222222
mal11    0     1    2    0    1    3   17   238   14    7    7   12   0.2119205
mal2     0     2    0    0    3    0   15     7  233   17    4    4   0.1824561
mal5     1     4    4    4    0    1    9     2   18  251    9    5   0.1850649
mal8     1     0    0    1    0    1    6     8   11   24  225    7   0.2077465
mal9     0     3    4    0    0    0   13    23   11   14   13  168   0.3253012
```
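
Which coefficients drive the splits? The model above skips importance for speed; a sketch of how to inspect it (retraining a smaller forest with `importance = TRUE`):

```r
rf_imp = randomForest(trainM, trainM.labels, ntree = 500, importance = TRUE)
varImpPlot(rf_imp) # mean decrease in accuracy / Gini per coefficient
```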

### Check accuracy of the randomForest model

```{r rf-acc}
testMels = mels[test$filename]
testM = do.call(rbind, testMels)
testM.labels = rep(test$birdid, sapply(testMels, nrow))
rownames(testM) = testM.labels
head(testM)
```

```
          [,1]       [,2]       [,3]     [,4]      [,5]      [,6]      [,7]      [,8]       [,9]     [,10]     [,11]    [,12]      [,13]
fem1  68.69654  -3.780422   4.476156 5.271380  3.650631  1.741341  4.444163  4.672678  4.5178220 5.6266429  6.415695 4.343187  0.3932380
fem1  88.96067  -6.927129  -6.818393 6.524440  2.334660 -3.355236 -7.613945  6.056264  4.9538498 0.4898021 -1.436976 9.079844 -0.1176275
fem1 106.16452 -11.646540 -12.414796 5.262883  3.662236 -4.392132 -6.964880  5.706424  3.9451137 2.5565591 -1.964917 5.797055 -1.4145690
fem1 105.03493 -12.002984 -16.421228 5.316297  4.980148 -6.378010 -8.123595  6.597439  1.3899451 2.0900014 -1.035095 1.936149 -0.1160323
fem1 106.57354  -8.992413 -18.565619 5.524081  8.566183 -5.366326 -5.994870  7.726556 -0.3243773 2.2493048  2.440603 2.514020 -2.8800924
fem1 106.27823  -9.205815 -18.199581 6.648214 10.821699 -5.813343 -6.089672 11.239428  1.9412227 0.1380098  5.415030 4.131425 -6.6722864
          [,14]      [,15]      [,16]        [,17]     [,18]     [,19]     [,20]
fem1  0.3931017   5.003128  2.3691691   1.32332143  1.554147  2.412560  3.804502
fem1 -6.2854837   1.201570 -2.7528557   1.06447825 11.173515 -2.219323 13.336220
fem1 -6.1204109  -3.322195  0.2802506   0.09545402 14.361013  8.248917 22.918580
fem1  0.3712420 -10.974840 -9.7360260 -10.41496130  5.833937 10.492120 29.569579
fem1 -1.6816549 -12.407400 -0.7134785  -1.69401514 10.708495 10.874922 25.567692
fem1 -5.2800663 -10.521954  3.9942376   8.10396261 21.555281 13.163049 24.356962
```

```{r rf-confusion}
# Let's make per-frame predictions with our classifier on the test set
confusion = table(testM.labels, predict(rf, testM, type = "response")) # named "confusion" to avoid shadowing base::table
confusion
```

```
            
testM.labels fem1 fem11 fem2 fem5 fem8 fem9 mal1 mal11 mal2 mal5 mal8 mal9
       fem1    31     0    1    4    8    1    4     4    1    1    2    1
       fem11    0    35    4    0    0    2    0     1    3    3    0    2
       fem2     2     2   30    0    1    2    3     3    2    5   10    0
       fem5     3     1    5   32   17    2    0     0    7    6    1    0
       fem8     2     0    0    2   41    2    3     1    6    4    6    0
       fem9     0     0    0    0    1   57    8     2    3    3    2    3
       mal1     0     1    0    0    2    0  107     9    9    3    3    6
       mal11    0     1    1    1    0    1   15   102    3    5    2   10
       mal2     3     0    6    0    1    5    3     4  143    9    3    3
       mal5     0     0    3    6    0    0    4     4    9  123    5    6
       mal8     4     0    0    2    1    0    8     6    1   11   95    2
       mal9     0     2    0    0    0    2   10    14    5   13    4   75
```

```{r rf-frame-acc}
accuracy_pct = round(sum(diag(confusion)) / sum(confusion) * 100, 2)
print(paste("Per-frame accuracy on the test set:", accuracy_pct, "%"))
```

```
[1] "Per-frame accuracy on the test set: 68.91 %"
```
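
Per-class recall can be read straight off the confusion table (a one-liner using the `confusion` object above):

```r
round(diag(confusion) / rowSums(confusion), 3) # fraction of each bird's frames identified correctly
```
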
```{r rf-prob}
# To look at the per-frame class probabilities (first rows shown)
head(predict(rf, testM, type = "prob"))
```

```
         fem1   fem11    fem2    fem5    fem8    fem9    mal1   mal11    mal2    mal5    mal8    mal9
fem1  0.08375 0.00950 0.01100 0.09600 0.08500 0.05725 0.07700 0.06000 0.22000 0.15800 0.04825 0.09425
fem1  0.13925 0.00925 0.01250 0.03100 0.04575 0.04450 0.10325 0.21275 0.09425 0.10575 0.05875 0.14300
fem1  0.38725 0.01925 0.00850 0.18475 0.09050 0.02750 0.06475 0.09550 0.02125 0.02025 0.04300 0.03750
fem1  0.29475 0.00450 0.00100 0.26775 0.12600 0.02075 0.06475 0.10225 0.03075 0.01850 0.03625 0.03275
fem1  0.20050 0.00375 0.00300 0.50525 0.14725 0.01400 0.03850 0.02625 0.01850 0.01250 0.01575 0.01475
fem1  0.16125 0.02950 0.01200 0.38825 0.25225 0.01675 0.05300 0.01575 0.04200 0.01200 0.00550 0.01175
```

```{r rf-file-acc}
# Classify each whole recording by averaging its frame-level probabilities (soft voting)
predictions = character(nrow(test))
for (i in 1:nrow(test)) {
  mel = testMels[[i]]
  prediction = names(which.max(colMeans(predict(rf, mel, type = "prob"))))
  predictions[i] = prediction
}
testWithPredictions = data.frame(test, predictions)
testWithPredictions
correct_predictions = nrow(filter(testWithPredictions, birdid == predictions))
accuracy_pct = round(correct_predictions / nrow(test) * 100, 2)
print(paste(correct_predictions, "/", nrow(test), "wavs in the test dataset correctly identified. Accuracy:", accuracy_pct, "%"))
```

```
[1] "52 / 63 wavs in the test dataset correctly identified. Accuracy: 82.54 %"
```
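
Averaging probabilities is a form of soft voting. A hard majority vote over per-frame predictions is a common alternative (a sketch; results may differ slightly):

```r
hard_vote = function(mel) {
  frame_preds = predict(rf, mel, type = "response")
  names(sort(table(frame_preds), decreasing = TRUE))[1] # most common frame-level class
}
sapply(testMels, hard_vote)
```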

## Deep learning classification with the R interface to Keras, based on MFCCs

```{r keras, warning=FALSE}
X_train = trainM
Y_train = to_categorical(as.integer(trainM.labels) - 1) # one-hot encode the factor levels
X_test = testM
Y_test = to_categorical(as.integer(testM.labels) - 1)

# Build the sequential model
model = keras_model_sequential()
model %>%
  # Reshape each 20-coefficient frame into a 1 x 1 x 20 "image" for the conv layers
  layer_reshape(input_shape = ncol(X_train), target_shape = c(1, 1, ncol(X_train))) %>%
  # First conv 2d layer with 128 filters, kernel size of 8 x 8 and stride of 1 x 1
  layer_conv_2d(128, c(8,8), c(1,1), padding='same') %>%
  layer_batch_normalization() %>%
  layer_activation("relu") %>%
  layer_dropout(0.2) %>%
  # Second conv 2d layer with 256 filters, kernel size of 5 x 5 and stride of 1 x 1
  layer_conv_2d(256, c(5,5), c(1,1), padding='same') %>%
  layer_batch_normalization() %>%
  layer_activation("relu") %>%
  layer_dropout(0.2) %>%
  # Third conv 2d layer with 128 filters, kernel size of 3 x 3 and stride of 1 x 1
  layer_conv_2d(128, c(3,3), c(1,1), padding='same') %>%
  layer_batch_normalization() %>%
  layer_activation("relu") %>%
  layer_dropout(0.2) %>%
  # Global average pooling layer
  layer_global_average_pooling_2d() %>%
  # Softmax output layer with one unit per bird
  layer_dense(units = ncol(Y_train), activation = 'softmax')

# Compile the model
model %>% compile(loss = 'categorical_crossentropy',
                  optimizer = "adam",
                  metrics = "categorical_accuracy")

# Callback to halve the learning rate when the loss plateaus
# (NB: with patience = 50 and only 50 epochs, it will not fire in this run)
reduce_lr <- callback_reduce_lr_on_plateau(monitor = 'loss', factor = 0.5,
                                           patience = 50, min_lr = 0.0001)
# Start learning (the test set is used here only to monitor validation metrics)
history = model %>% fit(X_train, Y_train, batch_size = 32, epochs = 50,
             validation_data = list(X_test, Y_test),
             verbose = 0, callbacks = reduce_lr)
```

```{r keras-plot}
plot(history) # loss and accuracy curves per epoch
```
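
To double-check what the reshape trick produces, print the layer shapes and parameter counts:

```r
summary(model)
```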

### Check accuracy of the keras model

```{r keras-acc}
# Score on the test set (per-frame)
model %>% evaluate(X_test, Y_test, batch_size = 32)
```

```
1264/1264 [==============================] - 0s 95us/sample - loss: 1.0824 - categorical_accuracy: 0.6574
$loss
[1] 1.082426

$categorical_accuracy
[1] 0.6574367
```

```{r keras-file-acc}
# Per-recording predictions, again by averaging frame-level probabilities
predictions = character(nrow(test))
for (i in 1:nrow(test)) {
  mel = testMels[[i]]
  prediction = which.max(colMeans(predict(model, mel))) # predict() returns the softmax probabilities; predict_proba() is deprecated in newer keras releases
  prediction = levels(testM.labels)[prediction]
  predictions[i] = prediction
}
testWithPredictions = data.frame(test, predictions)
testWithPredictions
correct_predictions = nrow(filter(testWithPredictions, birdid == predictions))
accuracy_pct = round(correct_predictions / nrow(test) * 100, 2)
print(paste(correct_predictions, "/", nrow(test), "wavs in the test dataset correctly identified. Accuracy:", accuracy_pct, "%"))
```

```
[1] "47 / 63 wavs in the test dataset correctly identified. Accuracy: 74.6 %"
```
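
To see where the keras model goes wrong, a per-file confusion table mirrors the random forest check above:

```r
table(testWithPredictions$birdid, testWithPredictions$predictions)
```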

randomForest did better: 82.54% vs 74.6% of test recordings correctly identified.
