BOPNWZL4RWF4UGC2LUEVONNQFSKYJ2Q5W747GH3UURZLBJFFEBUQC · Changes

file deletion: Train_delete.jl (----------)

[4.6598]→[4.335579:335618](∅→∅),[4.335618]→[4.329788:329788](∅→∅)

# Train.jl

# https://github.com/FluxML/model-zoo/blob/master/tutorials/transfer_learning/transfer_learning.jl

# This works on my data IT TRAINS best, but only -t 4

# dont forget temp env

using Random: shuffle!

using Random: seed!

import Base: length

import Base: getindex

using Images

using Flux

using CUDA

using Metalhead

using Noise

using Glob

using BSON: @save

using Dates

#using CSV

using DataFrames

using FreqTables

using JLD2

using Logging, LoggingExtras

imgs = glob("2023-09-*/*/*/[N,K]/*.png") #from SSD2

seed!(1234);

shuffle!(imgs)

#CSV.write("files.csv", DataFrame(file=imgs))

device = CUDA.functional() ? gpu : cpu

# Training-side wrapper around a vector stored in `img`.
# The `getindex` method defined in this file for `ImageContainer{Vector{String}}`
# loads the image at the indexed path and applies noise and mask augmentation.
struct ImageContainer{T<:Vector}

img::T

end

# Validation-side wrapper around a vector stored in `img`.
# The `getindex` method defined in this file for
# `ValidationImageContainer{Vector{String}}` loads and resizes the image
# without the noise/mask augmentation used for training.
struct ValidationImageContainer{T<:Vector}

img::T

end

# Both containers are built over the same shuffled path list `imgs`.
data = ImageContainer(imgs)

val_data = ValidationImageContainer(imgs)

# The length of a container is the number of entries in the vector it wraps.
length(data::ImageContainer) = length(data.img)

length(data::ValidationImageContainer) = length(data.img)

# NOTE(review): `im_size` is not referenced in the visible code; the getindex
# methods hard-code 224 instead — confirm whether it is still needed.
const im_size = (224, 224)

# Maps the name of the label directory ("K" or "N") to its 1-based class index.
name_to_idx = Dict{String,Int32}("K" => 1, "N" => 2)

# Load one augmented training example.
#
# Returns `(img, y)`: `img` is the image at `data.img[idx]`, resized to 224×224,
# converted to RGB, perturbed with noise and grey bars, and laid out with the
# channel axis last; `y` is the class index looked up from the parent directory
# name of the file.
function getindex(data::ImageContainer{Vector{String}}, idx::Int)
    path = data.img[idx]

    # Load, resize and force an RGB element type.
    pixels = Images.RGB.(Images.imresize(Images.load(path), 224, 224))

    # Augmentation: noise whose second argument is drawn from [0, 0.2), then grey bars.
    noisy = Noise.add_gauss(pixels, (rand() * 0.2))
    masked = apply_mask(noisy, 3, 3, 12)

    # channelview puts the channel axis first; permutedims reverses the axis order.
    img = permutedims(collect(channelview(float32.(masked))), (3, 2, 1))

    # The label is the name of the directory that directly contains the file.
    y = name_to_idx[(split(path, "/")[end-1])]
    return img, y
end

# Load one validation example without augmentation.
#
# Returns `(img, y)`: `img` is the image at `data.img[idx]`, resized to 224×224,
# converted to RGB and laid out with the channel axis last; `y` is the class
# index looked up from the parent directory name of the file.
function getindex(data::ValidationImageContainer{Vector{String}}, idx::Int)
    path = data.img[idx]

    pixels = Images.RGB.(Images.imresize(Images.load(path), 224, 224))

    # channelview puts the channel axis first; permutedims reverses the axis order.
    img = permutedims(collect(channelview(float32.(pixels))), (3, 2, 1))

    # The label is the name of the directory that directly contains the file.
    y = name_to_idx[(split(path, "/")[end-1])]
    return img, y
end

# assumes 224px square images

# Paint random grey bars over `img` in place and return it.
#
# Up to `max_number` horizontal bars and up to `max_number` vertical bars are
# drawn; their positions and extents come from `get_random_ranges`.
function apply_mask(
    img::Array{RGB{N0f8},2},
    max_number::Int = 3,
    min_size::Int = 3,
    max_size::Int = 22,
)
    grey = RGB{N0f8}(0.7, 0.7, 0.7)

    # horizontal
    foreach(get_random_ranges(max_number, min_size, max_size)) do rows
        img[rows, :] .= grey
    end

    # vertical
    foreach(get_random_ranges(max_number, min_size, max_size)) do cols
        img[:, cols] .= grey
    end

    return img
end

# assumes 224px square images

# Draw between 0 and `max_number` random index ranges that fit inside
# `1:img_size`.
#
# Each range is `start:start+extent` with `extent` drawn from
# `min_size:max_size`, so a range covers `extent + 1` indices. Candidates that
# would run past `img_size` are rejected and redrawn.
#
# `img_size` defaults to 224, the image side length the callers use.
# Throws `ArgumentError` when `min_size >= img_size`, because no candidate could
# ever be accepted and the rejection loop would not terminate.
function get_random_ranges(
    max_number::Int,
    min_size::Int,
    max_size::Int;
    img_size::Int = 224,
)
    min_size < img_size || throw(
        ArgumentError("min_size ($min_size) must be smaller than img_size ($img_size)"),
    )

    number = rand(0:max_number)
    ranges = UnitRange{Int}[]

    while length(ranges) < number
        start = rand(1:img_size)
        extent = rand(min_size:max_size)
        # Reject candidates that would index past the image edge.
        start + extent > img_size && continue
        push!(ranges, start:start+extent)
    end

    return ranges
end

# define DataLoaders

const batch_size = 64

const train_test_split = 0.95

# Largest multiple of batch_size that does not exceed the number of images.
const ceiling = length(data) ÷ batch_size * batch_size

# Number of leading images used for training: a whole number of batches.
const train_test_index =
    Int(round(ceiling ÷ batch_size * train_test_split) * batch_size)

# Augmented training batches.
train = Flux.DataLoader(
    ImageContainer(imgs[1:train_test_index]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
)

if device == gpu
    train = CuIterator(train)
end

# Un-augmented sample of the training images, the same size as the test split.
train_sample = Flux.DataLoader(
    ValidationImageContainer(imgs[1:(ceiling-train_test_index)]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
)

if device == gpu
    train_sample = CuIterator(train_sample)
end

# Held-out images that follow the training range.
test = Flux.DataLoader(
    ValidationImageContainer(imgs[train_test_index+1:ceiling]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
)

if device == gpu
    test = CuIterator(test)
end

fst = Metalhead.ResNet(18, pretrain = true).layers

# BEWARE NUMBER CLASSES
lst = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => 2));

# First element of the pretrained layers followed by a fresh 2-class head.
model = Flux.Chain(fst[1], lst) |> device

# Evaluate model `m` over the batches of `d`.
#
# Returns `(accuracy, confusion_matrix)`: the fraction of correct predictions
# rounded to 4 digits, and a frequency table of targets against predictions.
function eval_f(m, d)
    n_correct = 0
    n_seen = 0
    predicted = []
    targets = []

    for (x, y) in d
        labels = Flux.onecold(m(x))
        n_correct += sum(labels .== y)
        n_seen += length(y)
        append!(predicted, labels)
        append!(targets, y)
    end

    accuracy = round(n_correct / n_seen, digits = 4)

    table = DataFrame(targets = targets, predicts = predicted)
    confusion_matrix = freqtable(table, :targets, :predicts)

    return accuracy, confusion_matrix
end

# BEWARE NUMBER CLASSES

# Run one pass of `Flux.train!` over `train`, updating `model` through `opt`.
#
# The loss is logit cross-entropy against one-hot targets over the classes 1:2
# (BEWARE NUMBER CLASSES: the class range is hard-coded here).
function train_epoch!(model; opt, train)
    Flux.train!(model, train, opt) do m, x, y
        Flux.Losses.logitcrossentropy(m(x), Flux.onehotbatch(y, 1:2))
    end
end

opt = Flux.setup(Flux.Optimisers.Adam(1e-5), model);

# NOTE: the file logger is created but the `with_logger(logger)` wrappers are
# disabled, so the @info records below go to the current global logger.
logger = FileLogger("logfile.txt"; append = true)

# Baseline evaluation before any training.
@time metric_eval, v_confusion_matrix = eval_f(model, test)

@info "eval" accuracy = metric_eval

@info "eval" v_confusion_matrix

# Best test accuracy seen so far.
a = 0.0

for iter in 1:15
    # Without this declaration, assigning to these existing globals inside a
    # top-level loop of a non-interactive file creates new locals, and the
    # `metric_eval > a` comparison then reads an undefined local `a`.
    global a, metric_eval, v_confusion_matrix

    println("")
    println("Epoch: $iter")

    @time train_epoch!(model; opt, train)

    @time metric_train, t_confusion_matrix = eval_f(model, train_sample)

    @info "Epoch: " iter
    @info "train" accuracy = metric_train
    @info "train" t_confusion_matrix

    @time metric_eval, v_confusion_matrix = eval_f(model, test)

    @info "test" accuracy = metric_eval
    @info "test" v_confusion_matrix

    # Save the model state whenever test accuracy improves.
    if metric_eval > a
        a = metric_eval

        let _model = cpu(model)
            jldsave(
                "model_K1-4_CPU_epoch-$iter-$metric_eval-$(today()).jld2";
                model_state = Flux.state(_model),
            )

            @info "Saved a best_model"
        end
    end
end

file deletion: Predict_delete.jl (----------)

[4.6598]→[4.366453:366494](∅→∅),[4.366494]→[4.360107:360107](∅→∅)

# Predict.jl

using WAV, DSP, Images, ThreadsX, Dates, DataFrames, CSV, Flux, CUDA, Metalhead, JLD2, FLAC

export predict

"""

predict(glob_pattern::String, model::String)

This function takes a glob pattern for folders to run over, and a model path. It saves results in a csv for each folder, similar to opensoundscape

Args:

• glob pattern (folder/)

• model path

Returns: Nothing - This function saves csv files.

I use this function to find kiwi from new data gathered on a trip.

Note:

Dont forget temp env, julia -t 4

From Pomona-3/Pomona-3/

Use like:

using Skraak

glob_pattern = "*/2023-10-19/" #from SSD1

model = "/media/david/SSD1/model_K1-3_CPU_epoch-10-0.9965-2023-10-18T17:32:36.747.jld2"

predict(glob_pattern, model)

"""

function predict(glob_pattern::String, model::String)
    # Keep the path and the loaded model in separate variables.
    loaded = load_model(model) |> device
    folders = glob(glob_pattern)
    return _predict_folders(folders, loaded)
end

# Same as the method above, but takes the folders directly instead of a glob pattern.
function predict(folders::Vector{String}, model::String)
    loaded = load_model(model) |> device
    return _predict_folders(folders, loaded)
end

# Shared loop for both `predict` methods: run `predict_folder` on each folder.
function _predict_folders(folders, model)
    @info "Folders: $folders"

    for folder in folders
        @info "Working on: $folder"
        predict_folder(folder, model)
    end

    return nothing
end

#~~~~~ The guts ~~~~~#

device = CUDA.functional() ? gpu : cpu

#= TO DELETE

function get_image_for_inference(sample, f)

S = DSP.spectrogram(sample, 400, 2; fs = f)

i = S.power

if minimum(i) == 0.0

l = i |> vec |> unique |> sort

replace!(i, 0.0 => l[2])

end

image =

#! format: off

DSP.pow2db.(i) |>

x -> x .+ abs(minimum(x)) |>

x -> x ./ maximum(x) |>

x -> reverse(x, dims = 1) |>

x -> RGB.(x) |>

x -> imresize(x, 224, 224) |>

x -> collect(channelview(float32.(x))) |>

x -> permutedims(x, (3, 2, 1))

#! format: on

return image

end

=#

# Turn one audio sample into a Float32 array ready for the model.
#
# The image comes from `get_image_from_sample(sample, f)`; its channel view is
# collected and the axis order reversed so that the channel axis is last.
function get_image_for_inference(sample, f)
    rgb = get_image_from_sample(sample, f)
    planes = collect(channelview(float32.(rgb)))
    return permutedims(planes, (3, 2, 1))
end

# Read an audio file and cut it into overlapping windows, one image per window.
#
# `increment` is the window length in seconds and `increment ÷ divisor` the hop
# (defaults: 5 s windows, 2.5 s hop). Audio sampled above 16 kHz is resampled to
# 16 kHz first. Only the first channel is used.
#
# Returns `(raw_images, n_samples)`.
# Raises an AssertionError for extensions other than wav or flac; the check is
# case-insensitive so it accepts every file `predict_folder`'s wav glob matches.
function get_images(file::String, increment::Int = 5, divisor::Int = 2) #5s sample, 2.5s hop
    ext = lowercase(split(file, ".")[end])

    @assert ext in ("wav", "flac") "Unsupported audio file type, requires wav or flac."

    # Both readers return the signal first and the sample rate second.
    signal, freq = ext == "wav" ? wavread(file) : load(file)

    if freq > 16000
        signal = DSP.resample(signal, 16000.0f0 / freq; dims = 1)
        freq = 16000
    end

    f = convert(Int, freq)
    inc = increment * f
    hop = f * increment ÷ divisor # integer division keeps the hop an Int

    split_signal = DSP.arraysplit(signal[:, 1], inc, hop)
    raw_images = ThreadsX.map(x -> get_image_for_inference(x, f), split_signal)
    n_samples = length(raw_images)

    return raw_images, n_samples
end

# Build the image batch for `file` together with the time span of each window.
#
# Returns `(images, time)` where `time` is a vector of `(start, end)` tuples in
# seconds, one per window. Window `k` starts at `(k - 1) * increment / divisor`
# and ends `increment` seconds later, so `time` always has exactly `n_samples`
# entries whatever the divisor.
function get_images_time_from_wav(file::String, increment::Int = 5, divisor::Int = 2)
    raw_images, n_samples = get_images(file, increment, divisor)
    images = reshape_images(raw_images, n_samples)

    hop = increment / divisor
    start_time = (0:(n_samples-1)) .* hop
    # Deriving the end from the start keeps both sequences the same length;
    # a separately computed end range comes up short when divisor > 2.
    end_time = start_time .+ increment

    time = collect(zip(start_time, end_time))
    return images, time
end

# Stack `n_samples` equally sized image arrays into one batch array whose last
# axis indexes the sample, e.g. 224×224×3 images become 224×224×3×n_samples.
#
# The images are copied one by one into a preallocated array. Concatenating
# along the second axis and reshaping does not work for 3-D inputs: it mixes
# channel planes of different samples into one output slice.
#
# Throws `ArgumentError` for an empty collection and `DimensionMismatch` when
# `n_samples` disagrees with the number of images.
function reshape_images(raw_images, n_samples)
    isempty(raw_images) && throw(ArgumentError("raw_images must not be empty"))
    length(raw_images) == n_samples || throw(
        DimensionMismatch("expected $n_samples images, got $(length(raw_images))"),
    )

    template = first(raw_images)
    images = Array{eltype(template)}(undef, size(template)..., n_samples)

    for (k, img) in enumerate(raw_images)
        # selectdim gives a view of sample k along the last axis.
        selectdim(images, ndims(images), k) .= img
    end

    return images
end

# Predict a label for every window of one audio file.
#
# Returns a DataFrame with columns `file`, `start_time`, `end_time` and `label`,
# one row per window. `folder` is accepted but not used in the body.
function predict_file(file::String, folder::String, model)
    #check form of opensoundscape preds.csv and needed by my make_clips
    @info "File: $file"

    @time images, time = get_images_time_from_wav(file)

    data = images |> device

    @time predictions = Flux.onecold(model(data))

    # One copy of the file name per window.
    file_column = fill("$file", length(time))

    return DataFrame(
        :file => file_column,
        :start_time => first.(time),
        :end_time => last.(time),
        :label => predictions,
    )
end

# Run predictions for every wav and flac file in `folder` and write them to
# `folder/preds-<today>.csv`.
#
# An empty, correctly typed table is written first so the file always has a
# header; each file's predictions are then appended as they are produced.
function predict_folder(folder::String, model)
    wav = glob("$folder/*.[W,w][A,a][V,v]")
    flac = glob("$folder/*.flac")
    files = vcat(wav, flac)

    @info "$(length(files)) files in $folder"

    header = DataFrame(
        file = String[],
        start_time = Float64[],
        end_time = Float64[],
        label = Int[],
    )

    save_path = "$folder/preds-$(today()).csv"
    CSV.write("$save_path", header)

    for file in files
        df = predict_file(file, folder, model)
        CSV.write("$save_path", df, append = true)
    end

    return nothing
end

# see load_model() from train, different input types

# Rebuild the classifier and load saved weights from a JLD2 file.
#
# The file must contain a `model_state` entry. The number of output classes is
# taken from the saved state, so the rebuilt head matches the checkpoint.
function load_model(model_path::String)
    model_state = JLD2.load(model_path, "model_state")

    # NOTE(review): presumably this path reaches a parameter of the final Dense
    # layer whose length equals the class count — confirm against Flux.state.
    model_classes = length(model_state[1][2][1][3][2])

    backbone = Metalhead.ResNet(18, pretrain = false).layers
    head = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => model_classes))

    model = Flux.Chain(backbone[1], head)
    Flux.loadmodel!(model, model_state)

    return model
end

#=

function load_bson(model_path::String)

BSON.@load model_path model

end

=#

############### PYTHON Opensoundscape ################

#=

# Dont forget conda activate opensoundscape

# Dont forget to modify file names and glob pattern

# Run script in Pomona-2, hard code trip date in the glob

# python /media/david/USB/Skraak/src/predict.py

from opensoundscape.torch.models.cnn import load_model

import opensoundscape

import torch

from pathlib import Path

import numpy as np

import pandas as pd

from glob import glob

import os

from datetime import datetime

model = load_model('/home/david/best.model')

# folders = glob('./*/2023-?????/')

folders = glob('./*/*/')

for folder in folders:

os.chdir(folder)

print(folder, ' start: ', datetime.now())

# Beware, secretary island files are .wav

field_recordings = glob('./*.WAV')

scores, preds, unsafe = model.predict(

field_recordings,

binary_preds = 'single_target',

overlap_fraction = 0.5,

batch_size = 128,

num_workers = 12)

scores.to_csv("scores-2023-11-07.csv")

preds.to_csv("preds-2023-11-07.csv")

os.chdir('../..')

print(folder, ' done: ', datetime.now())

print()

=#

edit in src/Utility.jl at line 2

[4.313626]→[4.313626:315017](∅→∅)

using CSV,

DataFrames,

Dates,

DBInterface,

DSP,

DuckDB,

Glob,

HTTP,

Images,

JSON,

PNGFiles,

Random,

SHA,

TimeZones,

WAV,

XMLDict

export move_one_hour!,

check_png_wav_both_present,

file_metadata_to_df,

resize_image!,

twilight_tuple_local_time,

utc_to_nzdt!

"""

move_one_hour!(files::Vector{String}, operator)

This function takes a vector of file paths and renames each file in the

vector by changing the name of the file to the name of the file created one

hour before the original file. The new name format is yyyymmdd_HHMMSS.tmp,

which represents the time stamp of the original file minus (or plus) one hour. This

function avoids force=true with mv, since new file names may already exist

and mv will stacktrace leaving a big mess to tidy up.

Args:

• files (Vector{String}): A vector of strings where each element is

a path to a file.

Returns: Nothing - This function only renames files and saves them.

I use this to turn the clock back at the end of daylight saving.

"""

#Assumes WAV files

function move_one_hour!(files::Vector{String}, operator)

@assert operator == (+) || operator == (-)

fix_extension_of_files = []

for old_file in files

# Extract the date and time of the original file using string chopping

a = chop(old_file, tail = 4)

d, t = split(a, "_")

replacement in src/Utility.jl at line 3

[4.315018]→[4.315018:315222](∅→∅)

ye = parse(Int64, d[1:4])

mo = parse(Int64, d[5:6])

da = parse(Int64, d[7:8])

ho = parse(Int64, t[1:2])

mi = parse(Int64, t[3:4])

se = parse(Int64, t[5:6])

[4.315018]

[4.315222]

export check_png_wav_both_present,

resize_image!, twilight_tuple_local_time, move_one_hour!, utc_to_nzdt!

replacement in src/Utility.jl at line 6

[4.315223]→[4.315223:315269](∅→∅)

dt = DateTime(ye, mo, da, ho, mi, se)

[4.315223]

[4.315269]

using CSV, DataFrames, Dates, Glob, HTTP, Images, JSON, TimeZones, WAV

#XMLDict, DBInterface, DSP, DuckDB, PNGFiles, Random, SHA

edit in src/Utility.jl at line 9

[4.315270]→[4.315270:315940](∅→∅)

#new_date = dt - Dates.Hour(1)

new_date = operator(dt, Dates.Hour(1))

# Must drop the WAV extension to avoiding force=true

# with mv, since the new file name may already exist and mv

# will stacktrace leaving a big mess to tidy up.

base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")

temp_file = base_file * ".tmp"

# Tuple to tidy extensions later

tidy = (temp_file, base_file * ".WAV")

mv(old_file, temp_file)

push!(fix_extension_of_files, tidy)

print(".")

end

for item in fix_extension_of_files

mv(item[1], item[2])

end

print("Tidy\n")

end

edit in src/Utility.jl at line 24

[4.316421]→[4.316421:316422](∅→∅)

edit in src/Utility.jl at line 41

[4.316919]→[4.316919:317670](∅→∅)

#=

used like:

using Glob, Skraak, CSV

folders=glob("*/2023-11-02/")

for folder in folders

cd(folder)

try

df = Skraak.file_metadata_to_df()

CSV.write("/media/david/Pomona-3/Pomona-3/pomona_files_20231102.csv", df; append=true)

catch

@warn "error with $folder"

end

cd("/media/david/Pomona-3/Pomona-3/")

end

Then using duckdb cli from SSD:

duckdb AudioData.duckdb

show tables;

SELECT * FROM pomona_files;

COPY pomona_files FROM '/media/david/Pomona-3/Pomona-3/pomona_files_20231019.csv';

SELECT * FROM pomona_files;

Then backup with:

EXPORT DATABASE 'AudioDataBackup_2023-07-29';

.quit

Then quit and backup using cp on the db file

Then rsync ssd to usb

rsync -avzr --delete /media/david/SSD1/ /media/david/USB/

=#

edit in src/Utility.jl at line 42

[4.317674]→[4.317674:324634](∅→∅)

file_metadata_to_df()

This function takes a file name, extracts wav metadata, gpx location and recording period start/end, and returns a dataframe.

This function needs raw audiomoth wav files and a gpx.

This function needs /media/david/SSD1/dawn_dusk.csv

using DataFrames, Dates, DelimitedFiles, DuckDB, Glob, JSON3, Random, SHA, TimeZones, WAV, XMLDict

"""

function file_metadata_to_df()
    # One row per WAV file in the current directory.
    df = DataFrame(
        disk = String[],
        location = String[],
        trip_date = String[],
        file = String[],
        latitude = Float64[],
        longitude = Float64[],
        start_recording_period_localt = String[],
        finish_recording_period_localt = String[],
        duration = Float64[],
        sample_rate = Int[],
        utc = String[],
        ldt = String[],
        moth_id = String[],
        gain = String[],
        battery = Float64[],
        temperature = Float64[],
        sha2_256 = String[],
        night = Bool[],
    )

    #Get WAV list for folder
    wav_list = glob("*.WAV") |> sort

    #Return empty df if nothing in the folder
    isempty(wav_list) && return df

    #Get path info from file system: the last three path components
    raw_path_vec = split(pwd(), "/")[end-2:end]
    disk, location, trip_date = raw_path_vec

    #Get location, assumes 1 gpx is in the folder
    waypoint = glob("*.gpx")
    length(waypoint) != 1 && @error "no gpx file in $trip_date $location"
    loc = read(waypoint[1], String) |> xml_dict
    latitude = parse(Float64, (loc["gpx"]["wpt"][:lat]))
    longitude = parse(Float64, (loc["gpx"]["wpt"][:lon]))

    stamp_format = "yyyy-mm-dd HH:MM:SSzzzz"

    #Start of recording period: timestamp of the first file
    _, _, _, binary_metadata_start = wavread(wav_list[1])
    zdt1 = _icmt_zoned_datetime(_icmt_words(binary_metadata_start))
    start_recording_period_localt =
        Dates.format(astimezone(zdt1, tz"Pacific/Auckland"), stamp_format)

    #End of recording period: timestamp of the last file. The timezone is read
    #from this file's own comment, not from the first file's.
    _, _, _, binary_metadata_end = wavread(wav_list[end])
    zdt2 = _icmt_zoned_datetime(_icmt_words(binary_metadata_end))
    finish_recording_period_localt =
        Dates.format(astimezone(zdt2, tz"Pacific/Auckland"), stamp_format)

    dict = Skraak.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")

    #So I know what it is doing
    println(raw_path_vec)

    #Loop over file list
    for file in wav_list
        try
            audio_data, sample_rate, _, binary_metadata = wavread(file)
            comment_vector = _icmt_words(binary_metadata)
            duration = Float64(length(audio_data) / sample_rate)

            preformatting_zdt = _icmt_zoned_datetime(comment_vector)
            preformatting_utc = astimezone(preformatting_zdt, tz"UTC")
            utc = Dates.format(preformatting_utc, stamp_format)
            preformatting_ldt = astimezone(preformatting_zdt, tz"Pacific/Auckland")
            ldt = Dates.format(preformatting_ldt, stamp_format)

            moth_id = comment_vector[8]
            gain = comment_vector[10]

            #index back from end because if V > 4.9 the wording changes
            battery = parse(Float64, chop(comment_vector[end-4], tail = 1))
            temperature = parse(Float64, chop(comment_vector[end], tail = 2))

            # Hash the file contents. Passing the name string to sha256 would
            # hash the name itself, giving equal digests for equal file names.
            sha2_256 = bytes2hex(open(sha256, file))

            #assumes 15 minute file and calculates on half way time
            nt = Skraak.night(DateTime(preformatting_ldt + Minute(7) + Second(30)), dict)

            #Populate row to push into df
            row = [
                disk,
                location,
                trip_date,
                file,
                latitude,
                longitude,
                start_recording_period_localt,
                finish_recording_period_localt,
                duration,
                Int(sample_rate),
                utc,
                ldt,
                moth_id,
                gain,
                battery,
                temperature,
                sha2_256,
                nt,
            ]
            push!(df, row)
            print(".")
        catch e
            # Best effort: report the file and carry on with the rest.
            @warn "error with $(pwd()) $file" exception = e
        end
    end

    return df
end

# Split the ICMT comment stored in a WAV file's metadata into words.
# Comments of 22 or more words are cut to their first 19.
function _icmt_words(binary_metadata)
    words = split(wav_info_read(binary_metadata)[:ICMT], " ")
    return length(words) < 22 ? words : words[1:19]
end

# Build a ZonedDateTime from the comment words: word 3 is HH:MM:SS, word 4 is
# dd/mm/yyyy and word 5 carries the UTC offset, read as "+00" when absent.
function _icmt_zoned_datetime(words)
    date = split(words[4], "/")
    time = split(words[3], ":")
    tz = chop(words[5], head = 4, tail = 1)
    offset = isempty(tz) ? "+00" : tz
    stamp = string(
        date[3], "-", date[2], "-", date[1],
        "T", time[1], ":", time[2], ":", time[3],
        ".000", offset,
    )
    return ZonedDateTime(stamp)
end

"""

edit in src/Utility.jl at line 149

[4.328100]

move_one_hour!(files::Vector{String}, operator)

This function takes a vector of file paths and renames each file in the

vector by changing the name of the file to the name of the file created one

hour before the original file. The new name format is yyyymmdd_HHMMSS.tmp,

which represents the time stamp of the original file minus (or plus) one hour. This

function avoids force=true with mv, since new file names may already exist

and mv will stacktrace leaving a big mess to tidy up.

Args:

• files (Vector{String}): A vector of strings where each element is

a path to a file.

Returns: Nothing - This function only renames files and saves them.

I use this to turn the clock back at the end of daylight saving.

Assumes WAV files

"""

function move_one_hour!(files::Vector{String}, operator)
    @assert operator == (+) || operator == (-) "operator must be + or -"

    # (temporary name, final name) pairs for the second renaming pass.
    fix_extension_of_files = Tuple{String,String}[]

    for old_file in files
        # Work on the file name only, so a directory part (which may itself
        # contain underscores) cannot disturb the timestamp parsing, and keep
        # the directory so the renamed file stays next to the original.
        folder = dirname(old_file)
        stem = first(splitext(basename(old_file)))

        # The name starts with yyyymmdd_HHMMSS (15 characters).
        dt = DateTime(first(stem, 15), dateformat"yyyymmdd_HHMMSS")
        new_date = operator(dt, Dates.Hour(1))

        # Must drop the WAV extension to avoid force=true with mv, since the
        # new file name may already exist and mv will stacktrace leaving a big
        # mess to tidy up.
        base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")
        temp_file = joinpath(folder, base_file * ".tmp")

        # Tuple to tidy extensions later
        tidy = (temp_file, joinpath(folder, base_file * ".WAV"))

        mv(old_file, temp_file)
        push!(fix_extension_of_files, tidy)
        print(".")
    end

    # Second pass: every original is out of the way, so give the temporaries
    # their final .WAV names.
    for (temp_file, final_file) in fix_extension_of_files
        mv(temp_file, final_file)
    end

    print("Tidy\n")
end

"""

edit in src/Train.jl at line 2

[4.335632]

[4.335645]

export train #beware Flux.train! is not Skraak.train

edit in src/Train.jl at line 9

[4.335805]→[4.335805:335859](∅→∅)

export train #beware Flux.train! is not Skraak.train

replacement in src/Skraak.jl at line 1

[4.345458]→[4.345459:345473](∅→∅)

module Skraak

[4.345458]

[4.345473]

# Skraak.jl

replacement in src/Skraak.jl at line 3

[4.345474]→[4.345474:345547](∅→∅)

export make_clips, move_clips_to_folders, aggregate_labels, audiodata_db

[4.345474]

[4.345547]

module Skraak

edit in src/Skraak.jl at line 5

[4.345548]

include("ConstructPrimaryDataset.jl")

replacement in src/Skraak.jl at line 8

[4.345590]→[4.345590:345629](∅→∅)

#include("ConstructPrimaryDataset.jl")

[4.345590]

[4.345629]

include("FileMetaData.jl")

include("Clips.jl")

include("Labels.jl")

edit in src/Skraak.jl at line 13

[4.345680]→[4.345680:356019](∅→∅)

using CSV, DataFrames, DataFramesMeta, Dates, DSP, Glob, JSON, Random, TimeZones, WAV, PNGFiles, Images #Plots

#import DataFramesMeta: @transform!, @subset!, @byrow, @passmissing

"""

make_clips(preds_path::String, dawn_dusk_dict::Dict{Dates.Date, Tuple{Dates.DateTime, Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv"))

This function takes a preds.csv files and generates

file names, wav's, spectrograms etc to be reviewed.

it calls night() and may call construct_dawn_dusk_dict() unless the dict is globally defined and passed in

It should be run from Pomona-1/, Pomona-2/ or Pomona-3/, assumes it is, it uses the path

It saves wav and png files to /home/david/Upload/

need to use a try/catch because the 2 assert functions throw an error to short circuit the function

using Glob, Skraak

predictions = glob("*/2023-09-11*/preds*")

predictions = glob("path/to/preds*")

for file in predictions #[1:6][7:12][13:18][19:24]

try

make_clips(file)

catch x

println(x)

end

if needed to change headers in preds csv

shift, control, f in subl

file,start_time,end_time,label

/media/david/Pomona-2,<project filters>, preds-2023-02-27.csv

file,start_time,end_time,absent,present

using Glob, CSV, DataFrames, DataFramesMeta, Dates, DSP, Plots, Random, WAV

"""

# Assumes run on linux

# Assumes function run from Pomona-1 or Pomona-2

#dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv",),

function make_clips(
    preds_path::String,
    label::Int = 1,
    night::Bool = true,
    dawn_dusk_dict = dddict,
)
    # Assumes function run from Pomona-1 or Pomona-2: the first two path
    # components are taken as location and trip date.
    location, trip_date, _ = split(preds_path, "/")

    # Load the predictions, keep rows with the requested label in the requested
    # part of the day, then group them by file. The two assert_* steps raise
    # when their check fails, which ends the function early.
    df = DataFrame(CSV.File(preds_path))
    df = assert_not_empty(df, preds_path)
    df = rename_column!(df, "1.0", "label") #can remove now, needs to be label
    df = assert_detections_present(df, label, location, trip_date)
    df = filter_positives!(df, label)
    df = insert_datetime_column!(df)
    df = night_or_day!(df, dawn_dusk_dict, night) #true=night, false=day
    gdf = group_by_file!(df)

    # Make clip and spectrogram
    for (_, group) in pairs(gdf)
        file_name = path_to_file_string(group.file[1])
        start_times = sort(group[!, :start_time])
        detections = cluster_detections(start_times)
        isempty(detections) && continue

        signal, freq = wavread("$location/$trip_date/$file_name.WAV")
        length_signal = length(signal)

        for detection in detections
            st, en = calculate_clip_start_end(detection, freq, length_signal)
            name = "$location-$trip_date-$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"

            # Output folder is named after today's date and created on demand.
            clip_dir = "Clips_$(today())"
            mkpath(clip_dir)
            outfile = "$clip_dir/$name"

            sample = signal[Int(st):Int(en)]
            wavwrite(sample, "$outfile.wav", Fs = Int(freq))

            image = get_image_from_sample(sample, freq)
            PNGFiles.save("$outfile.png", image)
        end

        print(".")
    end

    println("\ndone $location/$trip_date \n")
end

#######################################################################

# Return `df` unless its size is (0, 0).
#
# For a (0, 0) frame an error is logged and the branch yields the log call's
# result, which fails the `::DataFrame` return conversion and so raises.
function assert_not_empty(df::DataFrame, preds_path::String)::DataFrame
    if size(df) != (0, 0)
        return df
    else
        @error "Empty dataframe at $preds_path"
    end
end

# Rename column `old_name` to `new_name` in place when it exists; return `df`
# either way.
function rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame
    if old_name in names(df)
        rename!(df, old_name => new_name)
    end
    return df
end

# assumes kiwi, binary classifier from opensoundscape

# needed to remove ::String annotation for location, trip_date to make it work

# Return `df` when `label` occurs in its `label` column.
#
# Otherwise an error is logged and the branch yields the log call's result,
# which fails the `::DataFrame` return conversion and so raises.
# `location` and `trip_date` are left untyped on purpose and only used in the
# log message.
function assert_detections_present(
    df::DataFrame,
    label::Int,
    location,
    trip_date,
)::DataFrame
    if label in levels(df.label)
        return df
    else
        @error "No detections for label = $label at $location/$trip_date"
    end
end

# assumes kiwi

# Keep, in place, only the rows whose `label` equals `label`; return `df`.
function filter_positives!(df::DataFrame, label)::DataFrame
    filter!(df) do row
        row.label == label
    end
    return df
end

# Return the file name of `path` up to its first dot: everything after the last
# "/" and before the first ".".
# `path` is left untyped on purpose: callers pass InlineStrings values, which a
# `::String` annotation would reject.
function path_to_file_string(path)
    file_part = last(split(path, "/"))
    return first(split(file_part, "."))
end

# Parse the timestamp encoded in a recording's file name.
#
# Names longer than 13 characters are read as yyyymmdd_HHMMSS. Shorter names
# are treated as ddmmyy_HHMMSS: "20" is inserted after the first four
# characters to make a four-digit year before parsing.
function filename_to_datetime!(file)::DateTime
    file_string = path_to_file_string(file)

    if length(file_string) > 13
        return DateTime(file_string, dateformat"yyyymmdd_HHMMSS")
    end

    expanded = file_string[1:4] * "20" * file_string[5:end]
    return DateTime(expanded, dateformat"ddmmyyyy_HHMMSS")
end

# Add a `DateTime` column to `df` in place and return `df`.
# Each row's value is `filename_to_datetime!` applied to that row's `file`
# entry, converted to a `String` first.
function insert_datetime_column!(df::DataFrame)::DataFrame

@transform!(df, @byrow :DateTime = filename_to_datetime!(String(:file)))

return df

end

# calls night(), needs dawn_dusk_dict in local time format

# Keep, in place, only the night rows (`night_time = true`) or only the day
# rows (`night_time = false`) of `df`, judged by `night` on each row's
# `DateTime` value; return `df`.
function night_or_day!(
    df::DataFrame,
    dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}},
    night_time::Bool = true,
)::DataFrame
    if night_time
        @subset!(df, @byrow night(:DateTime, dawn_dusk_dict))
    else
        @subset!(df, @byrow !night(:DateTime, dawn_dusk_dict))
    end
    return df
end

# Group the rows of `df` by their `file` column and return the grouped frame.
function group_by_file!(df::DataFrame)
    return groupby(df, :file)
end

# Group sorted detection start times into clusters and keep the clusters with
# more than one detection.
#
# A time joins the current cluster when it is at most 15 seconds after that
# cluster's last time; otherwise it starts a new cluster. `start_times` is
# expected in ascending order. An empty input yields an empty result.
function cluster_detections(start_times::Vector{Float64})::Vector{Vector{Float64}}
    clusters = Vector{Float64}[]
    isempty(start_times) && return clusters

    current = Float64[first(start_times)]

    for time in @view start_times[2:end]
        if time - last(current) <= 15.0
            push!(current, time)
        else
            # `current` is rebound below, so the stored vector is never mutated again.
            push!(clusters, current)
            current = Float64[time]
        end
    end

    push!(clusters, current)

    # Isolated single detections are dropped.
    return filter(c -> length(c) > 1, clusters)
end

# assumes it is operating on 5 second clips

# Convert a cluster of detection start times (seconds) into sample positions.
#
# The clip starts at the first detection, or at sample 1 when that time is 0,
# and ends 5 seconds after the last detection, capped at `length_signal`.
function calculate_clip_start_end(
    detection::Vector{Float64},
    freq::Float32,
    length_signal::Int64,
)::Tuple{Float64,Float64}
    st = first(detection) > 0 ? first(detection) * freq : 1

    clip_end = (last(detection) + 5.0) * freq
    en = clip_end <= length_signal ? clip_end : length_signal

    return st, en
end

#= Deprecated use get_image_from_sample()

function plot_spectrogram(

sample::Vector{Float64},

freq::Float32,

)::Plots.Plot{Plots.GRBackend}

S = DSP.spectrogram(sample[:, 1], 400, 2; fs = convert(Int, freq))

plot = Plots.heatmap(

S.time,

S.freq,

pow2db.(S.power),

size = (448, 448),

showaxis = false,

ticks = false,

legend = false,

thickness_scaling = 0,

)

return plot

end

=#

# f needs to be convertible to an Int (it is passed through convert(Int, f))

# Render an audio sample as a 224×224 RGB spectrogram image.
#
# The power spectrogram is converted to dB, shifted by the absolute value of
# its minimum, scaled by its maximum, flipped vertically, converted to RGB and
# resized. Zero power values are first replaced by the smallest positive power
# so the dB conversion stays finite.
#
# Degenerate input is handled explicitly: an all-zero spectrogram, or one that
# is constant after shifting, gives a uniformly black image instead of an
# indexing error or NaN pixels.
function get_image_from_sample(sample, f) #sample::Vector{Float64}
    S = DSP.spectrogram(sample, 400, 2; fs = convert(Int, f))
    power = S.power

    if minimum(power) == 0.0
        positives = filter(>(0.0), power)
        if isempty(positives)
            # Pure silence: there is no positive value to use as a floor.
            return imresize(RGB.(zeros(size(power))), 224, 224)
        end
        replace!(power, 0.0 => minimum(positives))
    end

    db = DSP.pow2db.(power)
    shifted = db .+ abs(minimum(db))

    # Avoid 0/0 when every value is zero after the shift.
    peak = maximum(shifted)
    scaled = peak > 0 ? shifted ./ peak : zero(shifted)

    image = imresize(RGB.(reverse(scaled, dims = 1)), 224, 224)
    return image
end

"""

construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}

sun = DataFrame(CSV.File(file))

Takes dawn_dusk.csv and returns a dict to be consumed by night().

~/dawn_dusk.csv

At present it goes from the start of 2019 to the end of 2024

The csv contains local time sunrise and sunset

I use this to decide if a file with a local time encoded name was recorded at night

dict = construct_dawn_dusk_dict("/Volumes/SSD1/dawn_dusk.csv")

dict = Utility.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")

using CSV, DataFrames

"""

function construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
    sun = DataFrame(CSV.File(file))

    # Pair each date with its (dawn, dusk) tuple straight from the lazy zip
    # iterators, without first building a Tuple with one element per CSV row.
    return Dict(zip(sun.Date, zip(sun.Dawn, sun.Dusk)))
end

"""

night(call_time::DateTime, dict::Dict{Date, Tuple{DateTime, DateTime}})::Bool

Returns true if time is at night, ie between civil twilights, dusk to dawn.

Consumes dict from construct_dawn_dusk_dict

time=DateTime("2021-11-02T21:14:35",dateformat"yyyy-mm-ddTHH:MM:SS")

Utility.night(time, dict)

"""

function night(call_time::DateTime, dict::Dict{Date,Tuple{DateTime,DateTime}})::Bool
    # One lookup gives both twilight times for the calendar date of the call;
    # a date missing from the dict raises a KeyError.
    dawn, dusk = dict[Date(call_time)]

    # Night is everything up to and including dawn, and from dusk onwards.
    return call_time <= dawn || call_time >= dusk
end

#######################################################################

#INBETWEEN STEP: use secondary model to sort clips, move clips into D, F, M, N, and hand classify, generate actual.csv.

"""

move_clips_to_folders(df::DataFrame)

Takes a 2 column dataframe: file, label

file must be list of png images, assumes wav's are there too

will move mp4's from video folder if they are present

"""

function move_clips_to_folders(df::DataFrame)
    p = glob("*.png")
    w = glob("*.[W,w][A,a][V,v]")

    @assert (first(df.file) |> x -> split(x, ".")[end] |> x -> x == "png") "df.file must be a list of png's"
    @assert issetequal(df.file, p) "All png files in dataframe must be present in folder"
    @assert issetequal(chop.(df.file, head = 0, tail = 4), chop.(w, head = 0, tail = 4)) "There must be a wav for every png in the dataframe"

    # mp4 files are only moved when a "video" folder exists in the current directory.
    has_video = isdir("video")

    for row in eachrow(df)
        src = row.file
        dst = "$(row.label)/$(row.file)"
        mkpath("$(row.label)/")

        try
            # Move the png, then the wav that shares its base name.
            mv(src, dst)
            mv(chop(src, tail = 3) * "wav", chop(dst, tail = 3) * "wav")

            if has_video
                mkpath("video/$(row.label)/")
                mv(
                    "video/" * chop(src, tail = 3) * "mp4",
                    "video/" * chop(dst, tail = 3) * "mp4",
                )
            end
        catch e
            # Best effort: log the failure and carry on with the next row.
            @info e
        end
    end
end

#=

actual.csv must be list of qualified png file names:

D/C05-2023-04-15-20230219_223000-380-470.png

using Glob, DataFrames, CSV

a=glob("[M,F,D,N]/*.png")

df = DataFrame(file=a)

CSV.write("actual_mfdn.csv", df)

edit in src/Skraak.jl at line 14

[4.356020]→[4.356020:360059](∅→∅)

make a folder D,F,M,N

mkpath.(["D", "F", "M", "N"])

move wavs to match pngs

df=DataFrame(CSV.File("actual_mfdn.csv"))

for row in eachrow(df)

src=split(row.file, "/")[2]

dst=row.file

mv(src, dst)

mv(chop(src, tail=3)*"wav", chop(dst, tail=3)*"wav")

end

=#

"""
    actual_from_folders(labels::Vector{String})::DataFrame

Collect the png files found in each label folder into a one column (`file`) dataframe.

Run from the parent folder of the label folders. The paths are gathered in the order
of `labels` as "<label>/<name>.png", written to "actual.csv" in the current
directory, and the dataframe is returned.

    labels = ["D", "F", "M", "N"]
"""
function actual_from_folders(labels::Vector{String})::DataFrame
    found = String[]
    foreach(label -> append!(found, glob("$label/*.png")), labels)
    table = DataFrame(file = found)
    CSV.write("actual.csv", table)
    return table
end

"""

aggregate_labels(actual="actual.csv", outfile="labels.csv", hdr=false)

file

[D, F, M, N]/C05-2023-04-15-20230219_223000-380-470.png

This function takes the csv output from my hand classification and outputs a df, and a csv for insertion into AudioData.duckdb using the duckdb cli or using DFto.audiodata_db()

assumes run from Clips_xxxx-xx-xx folder and that "actual.csv" present if not specified.

returns a dataframe

using CSV, DataFrames, DataFramesMeta

"""

#=

df=aggregate_labels()

audiodata_db(df, "pomona_labels_20230418") NOT_WORKING maybe titles

to use cli, need to remove header row

duckdb /media/david/SSD1/AudioData.duckdb

COPY pomona_labels_20230418 FROM 'DB_Labels/pomona_labels_2023-11-02.csv';

COPY pomona_files FROM 'DB_Files/pomona_files_20231102.csv';

Then backup with:

EXPORT DATABASE 'AudioDataBackup_2023-11-14';

.quit

Then quit and backup using cp on the db file, dated copy

Then rsync ssd to usb

rsync -avzr --delete /media/david/SSD1/ /media/david/USB/

note: run on mac

cd skraak.kiwi

julia-1.9

using Franklin

serve()

=#

# New one, without noise and distance, does not do :box anymore therefore requires new db schema

# Turn the hand-classified list in `actual` into a label table, write it to `outfile`
# (with a header row only when `hdr` is true) and return it as a DataFrame.
# Each input row is "<label folder>/<clip name>.png", where the clip name is split on
# "-": field 1 is the location, field 5 the wav stem, and the last two fields are the
# start and end times (the end time still carries ".png", which is chopped off).
function aggregate_labels(
    actual::String = "actual.csv",
    outfile::String = "labels.csv",
    hdr::Bool = false, #header for outfile
)::DataFrame
    # label folder, ie everything before the first "/"
    label_folder(path) = split(path, "/")[1]
    # "-" separated fields of the clip name, ie of the part after the first "/"
    name_fields(path) = split(split(path, "/")[2], "-")

    df = DataFrame(CSV.File(actual))

    # location, f, start_time, end_time
    @transform!(df, @byrow :location = name_fields(:file)[1])
    @transform!(df, @byrow :f = name_fields(:file)[5] * ".WAV")
    @transform!(df, @byrow :start_time = name_fields(:file)[end-1])
    @transform!(df, @byrow :end_time = chop(name_fields(:file)[end], tail = 4))
    # the :box column ("[start, end]") is no longer produced

    # male, female, duet, not
    @transform!(df, @byrow @passmissing :male = label_folder(:file) == "M")
    @transform!(df, @byrow @passmissing :female = label_folder(:file) == "F")
    @transform!(df, @byrow @passmissing :duet = label_folder(:file) == "D")
    @transform!(
        df,
        @byrow @passmissing :not_kiwi = label_folder(:file) in ["KA", "KE", "N", "Q"]
    )

    # other_label: keeps the folder name for the non-kiwi species labels only
    @transform!(
        df,
        @byrow @passmissing :other_label =
            label_folder(:file) in ["KA", "KE", "Q"] ? label_folder(:file) : missing
    )

    # remove unwanted cols, rename f to file
    select!(df, Not([:file]))
    rename!(df, :f => :file)

    CSV.write(outfile, df; header = hdr)
    return df
end

"""
    audiodata_db(df::DataFrame, table::String)

Use to upload labels to AudioData.duckdb.

Takes a dataframe and inserts all of its rows into `table` of the AudioData
database. The database file is opened at "/media/david/SSD1/AudioData.duckdb" on
Linux and at "/Volumes/SSD1/AudioData.duckdb" otherwise. `df` is registered under a
random 6 character name and copied over with `INSERT INTO table SELECT *`, so its
columns must line up with the columns of `table`.

The connection is always closed, also when registering or inserting throws; the
error is then propagated to the caller. Returns `nothing`.

    audiodata_db(df, "pomona_labels_20230418")

Requires: using DataFrames, DBInterface, DuckDB, Random
"""
function audiodata_db(df::DataFrame, table::String)
    db_path = if Sys.islinux()
        "/media/david/SSD1/AudioData.duckdb"
    else
        "/Volumes/SSD1/AudioData.duckdb"
    end
    con = DBInterface.connect(DuckDB.DB, db_path)
    try
        temp_name = randstring(6)
        DuckDB.register_data_frame(con, df, temp_name)
        # NOTE(review): `table` is interpolated straight into the SQL text; only pass
        # trusted table names here.
        DBInterface.execute(
            con,
            """
            INSERT
            INTO $table
            SELECT *
            FROM '$temp_name'
            """,
        )
    finally
        # release the database file even if the insert failed
        DBInterface.close!(con)
    end
    return nothing
end

edit in src/Predict.jl at line 2

[4.366510]

export predict

edit in src/Predict.jl at line 8

[4.366665]→[4.366665:366681](∅→∅)

export predict

edit in src/ConstructPrimaryDataset.jl at line 3

[2.48]→[2.48:71](∅→∅)

# Does not compile yet

replacement in src/ConstructPrimaryDataset.jl at line 4

[2.72]→[4.375126:375170](∅→∅),[4.375126]→[4.375126:375170](∅→∅)

using DataFrames, DataFramesMeta, CSV, Glob

[2.72]

[4.375170]

using DataFrames, CSV, Glob

using DataFramesMeta: @transform!, @byrow #, @subset!, @passmissing

replacement in src/ConstructPrimaryDataset.jl at line 7

[4.375171]→[4.375171:375599](∅→∅)

# Only moves WAVs not already there in dataset

# converts WAVs to flac to save space, file metadata will not survive

# requires columns :location, :file, :start_time, :end_time

# :file is the file name, :location is the actual recorder location eg "C05"

# run where the raw data is

# will find file in folder structure location/trip_date/file

# constructs dataset at output_path

# assumes file name has one . for extension only

[4.375171]

[3.5050]

"""

Only moves WAVs not already there in dataset

converts WAVs to flac to save space, file metadata will not survive

requires columns :location, :file, :start_time, :end_time

:file is the file name, :location is the actual recorder location eg "C05"

run where the raw data is

will find file in folder structure location/trip_date/file

constructs dataset at output_path

assumes file name has one . for extension only

"""

replacement in src/ConstructPrimaryDataset.jl at line 49

[4.376687]→[4.376687:376720](∅→∅)

function save_pngs(df:DataFrame)

[4.376687]

[3.6239]

function save_pngs(df::DataFrame)

refactored Skraak.jl into sub files, tidy now

Dependencies

Change contents