# Train.jl
# https://github.com/FluxML/model-zoo/blob/master/tutorials/transfer_learning/transfer_learning.jl
# This trains best on my data, but only with -t 4
# Don't forget the temp env
using Random: shuffle!
using Random: seed!
import Base: length
import Base: getindex
using Images
using Flux
using CUDA
using Metalhead
using Noise
using Glob
using BSON: @save
using Dates
#using CSV
using DataFrames
using FreqTables
using JLD2
using Logging, LoggingExtras
imgs = glob("2023-09-*/*/*/[N,K]/*.png") #from SSD2
seed!(1234);
shuffle!(imgs)
#CSV.write("files.csv", DataFrame(file=imgs))
device = CUDA.functional() ? gpu : cpu
struct ImageContainer{T<:Vector}
img::T
end
struct ValidationImageContainer{T<:Vector}
img::T
end
data = ImageContainer(imgs)
val_data = ValidationImageContainer(imgs)
length(data::ImageContainer) = length(data.img)
length(data::ValidationImageContainer) = length(data.img)
const im_size = (224, 224)
name_to_idx = Dict{String,Int32}("K" => 1, "N" => 2)
function getindex(data::ImageContainer{Vector{String}}, idx::Int)
path = data.img[idx]
img =
    Images.load(path) |>
    x -> Images.imresize(x, 224, 224) |>
    x -> Images.RGB.(x) |>
    x -> Noise.add_gauss(x, (rand() * 0.2)) |> # random gaussian noise augmentation
    x -> apply_mask(x, 3, 3, 12) |> # random grey cutout masks
    x -> collect(channelview(float32.(x))) |> # CHW Float32 array
    x -> permutedims(x, (3, 2, 1)) # WHC layout for Flux (batches are WHCN)
y = name_to_idx[(split(path, "/")[end-1])]
return img, y
end
function getindex(data::ValidationImageContainer{Vector{String}}, idx::Int)
path = data.img[idx]
img =
    Images.load(path) |>
    x -> Images.imresize(x, 224, 224) |>
    x -> Images.RGB.(x) |>
    x -> collect(channelview(float32.(x))) |>
    x -> permutedims(x, (3, 2, 1)) # no augmentation for validation images
y = name_to_idx[(split(path, "/")[end-1])]
return img, y
end
# assumes 224px square images
function apply_mask(
img::Array{RGB{N0f8},2},
max_number::Int = 3,
min_size::Int = 3,
max_size::Int = 22,
)
# horizontal
for range in get_random_ranges(max_number, min_size, max_size)
img[range, :] .= RGB{N0f8}(0.7, 0.7, 0.7)
end
# vertical
for range in get_random_ranges(max_number, min_size, max_size)
img[:, range] .= RGB{N0f8}(0.7, 0.7, 0.7)
end
return img
end
# assumes 224px square images
function get_random_ranges(max_number::Int, min_size::Int, max_size::Int)
number = rand(0:max_number)
ranges = []
while length(ranges) < number
start = rand(1:224)
size = rand(min_size:max_size)
if start + size > 224
continue
end
push!(ranges, start:start+size)
end
return ranges
end
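# e.g. get_random_ranges(3, 3, 22) might return [17:27, 140:151]
# (zero to three ranges per call; starts and sizes are random, output illustrative)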
# define DataLoaders
const batch_size = 64
const train_test_split = 0.95
const ceiling = length(data) ÷ batch_size * batch_size
const train_test_index =
ceiling ÷ batch_size * train_test_split |> round |> x -> x * batch_size |> Int
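# Worked example (hypothetical counts): with 10_000 images and batch_size = 64,
# ceiling = 156 * 64 = 9_984 and train_test_index = round(156 * 0.95) * 64 = 9_472,
# so 9_472 images go to training and the remaining 512 are held out for testing.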
train = Flux.DataLoader(
ImageContainer(imgs[1:train_test_index]);
batchsize = batch_size,
collate = true,
parallel = true,
)
device == gpu ? train = CuIterator(train) : nothing
train_sample = Flux.DataLoader(
ValidationImageContainer(imgs[1:(ceiling-train_test_index)]);
batchsize = batch_size,
collate = true,
parallel = true,
)
device == gpu ? train_sample = CuIterator(train_sample) : nothing
test = Flux.DataLoader(
ValidationImageContainer(imgs[train_test_index+1:ceiling]);
batchsize = batch_size,
collate = true,
parallel = true,
)
device == gpu ? test = CuIterator(test) : nothing
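# Transfer learning: Metalhead's ResNet(18).layers is a Chain of (convolutional
# backbone, classifier head); fst[1] keeps the pretrained backbone and lst swaps
# the 1000-class ImageNet head for a Dense(512 => 2) over the K/N classes.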
fst = Metalhead.ResNet(18, pretrain = true).layers
# BEWARE NUMBER CLASSES
lst = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => 2));
model = Flux.Chain(fst[1], lst) |> device
function eval_f(m, d)
good = 0
count = 0
pred = []
actual = []
for (x, y) in d
p = Flux.onecold(m(x))
good += sum(p .== y)
count += length(y)
append!(pred, p)
append!(actual, y)
end
accuracy = round(good / count, digits = 4)
confusion_matrix =
freqtable(DataFrame(targets = actual, predicts = pred), :targets, :predicts)
return accuracy, confusion_matrix
end
# BEWARE NUMBER CLASSES
function train_epoch!(model; opt, train)
Flux.train!(model, train, opt) do m, x, y
Flux.Losses.logitcrossentropy(m(x), Flux.onehotbatch(y, 1:2))
end
end
opt = Flux.setup(Flux.Optimisers.Adam(1e-5), model);
logger = FileLogger("logfile.txt"; append = true)
@time metric_eval, v_confusion_matrix = eval_f(model, test)
#with_logger(logger) do
@info "eval" accuracy = metric_eval
@info "eval" v_confusion_matrix
#end
a = 0.0
for iter in 1:15
println("")
println("Epoch: $iter")
@time train_epoch!(model; opt, train)
@time metric_train, t_confusion_matrix = eval_f(model, train_sample)
#with_logger(logger) do
@info "Epoch: " iter
@info "train" accuracy = metric_train
@info "train" t_confusion_matrix
#end
@time metric_eval, v_confusion_matrix = eval_f(model, test)
#with_logger(logger) do
@info "test" accuracy = metric_eval
@info "test" v_confusion_matrix
#end
metric_eval > a && begin
global a = metric_eval # explicit global so the script also works when run non-interactively
let _model = cpu(model)
jldsave(
"model_K1-4_CPU_epoch-$iter-$metric_eval-$(today()).jld2";
model_state = Flux.state(_model),
)
#BSON.@save "model_K1-3_CPU_epoch-$iter-$metric_eval-$(now()).bson" _model
#with_logger(logger) do
@info "Saved a best_model"
#end
end
end
end
# Predict.jl
using WAV, DSP, Images, ThreadsX, Dates, DataFrames, CSV, Flux, CUDA, Metalhead, JLD2, FLAC
export predict
"""
predict(glob_pattern::String, model::String)
This function takes a glob pattern for folders to run over, and a model path.
It saves results in a csv for each folder, similar to opensoundscape.
Args:
• glob pattern (folder/)
• model path
Returns: Nothing - This function saves csv files.
I use this function to find kiwi in new data gathered on a trip.
Note:
Don't forget temp env, julia -t 4
From Pomona-3/Pomona-3/
Use like:
using Skraak
glob_pattern = "*/2023-10-19/" #from SSD1
model = "/media/david/SSD1/model_K1-3_CPU_epoch-10-0.9965-2023-10-18T17:32:36.747.jld2"
predict(glob_pattern, model)
"""
function predict(glob_pattern::String, model::String)
model = load_model(model) |> device
folders = glob(glob_pattern)
@info "Folders: $folders"
for folder in folders
@info "Working on: $folder"
predict_folder(folder, model)
end
end
function predict(folders::Vector{String}, model::String)
model = load_model(model) |> device
@info "Folders: $folders"
for folder in folders
@info "Working on: $folder"
predict_folder(folder, model)
end
end
#~~~~~ The guts ~~~~~#
device = CUDA.functional() ? gpu : cpu
function get_image_for_inference(sample, f)
image =
#! format: off
get_image_from_sample(sample, f) |>
x -> collect(channelview(float32.(x))) |>
x -> permutedims(x, (3, 2, 1))
#! format: on
return image
end
function get_images(file::String, increment::Int = 5, divisor::Int = 2) #5s sample, 2.5s hop
ext = split(file, ".")[end]
@assert ext in ["WAV", "wav", "flac"] "Unsupported audio file type, requires wav or flac."
if ext in ["WAV", "wav"]
signal, freq = wavread(file)
else
signal, freq = load(file)
end
if freq > 16000
signal = DSP.resample(signal, 16000.0f0 / freq; dims = 1)
freq = 16000
end
f = convert(Int, freq)
inc = increment * f
hop = f * increment ÷ divisor #need guaranteed Int, maybe not anymore, refactor
split_signal = DSP.arraysplit(signal[:, 1], inc, hop)
raw_images = ThreadsX.map(x -> get_image_for_inference(x, f), split_signal)
n_samples = length(raw_images)
return raw_images, n_samples
end
function get_images_time_from_wav(file::String, increment::Int = 5, divisor::Int = 2)
raw_images, n_samples = get_images(file::String, increment, divisor)
images = reshape_images(raw_images, n_samples)
start_time = 0:(increment/divisor):(n_samples-1)*(increment/divisor)
end_time = increment:(increment/divisor):(n_samples+1)*(increment/divisor)
time = collect(zip(start_time, end_time))
return images, time
end
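# e.g. increment = 5, divisor = 2, n_samples = 3 gives
# time = [(0.0, 5.0), (2.5, 7.5), (5.0, 10.0)]: 5 s windows with a 2.5 s hop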
function reshape_images(raw_images, n_samples)
images =
#! format: off
hcat(raw_images...) |>
x -> reshape(x, (224, 224, 3, n_samples))
#! format: on
return images
end
function predict_file(file::String, folder::String, model)
#check form of opensoundscape preds.csv and needed by my make_clips
@info "File: $file"
@time images, time = get_images_time_from_wav(file)
data = images |> device
@time predictions = Flux.onecold(model(data))
f = (repeat(["$file"], length(time)))
df = DataFrame(
:file => f,
:start_time => first.(time),
:end_time => last.(time),
:label => predictions,
)
return df
end
function predict_folder(folder::String, model)
wav = glob("$folder/*.[W,w][A,a][V,v]")
flac = glob("$folder/*.flac")
files = cat(wav, flac; dims = 1)
@info "$(length(files)) files in $folder"
df = DataFrame(
file = String[],
start_time = Float64[],
end_time = Float64[],
label = Int[],
)
save_path = "$folder/preds-$(today()).csv"
CSV.write("$save_path", df)
for file in files
df = predict_file(file, folder, model)
CSV.write("$save_path", df, append = true)
end
end
# see load_model() from train, different input types
function load_model(model_path::String)
model_state = JLD2.load(model_path, "model_state")
model_classes = length(model_state[1][2][1][3][2])
f = Metalhead.ResNet(18, pretrain = false).layers
l = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => model_classes))
model = Flux.Chain(f[1], l)
Flux.loadmodel!(model, model_state)
return model
end
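# Usage sketch (the path is illustrative):
# model = load_model("model_K1-4_CPU_epoch-10-0.9965-2023-10-18.jld2") |> device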
#=
function load_bson(model_path::String)
BSON.@load model_path model
end
=#
############### PYTHON Opensoundscape ################
#=
# Don't forget conda activate opensoundscape
# Don't forget to modify file names and glob pattern
# Run script in Pomona-2, hard code trip date in the glob
# python /media/david/USB/Skraak/src/predict.py
from opensoundscape.torch.models.cnn import load_model
import opensoundscape
import torch
from pathlib import Path
import numpy as np
import pandas as pd
from glob import glob
import os
from datetime import datetime
model = load_model('/home/david/best.model')
# folders = glob('./*/2023-?????/')
folders = glob('./*/*/')
for folder in folders:
    os.chdir(folder)
    print(folder, ' start: ', datetime.now())
    # Beware, secretary island files are .wav
    field_recordings = glob('./*.WAV')
    scores, preds, unsafe = model.predict(
        field_recordings,
        binary_preds = 'single_target',
        overlap_fraction = 0.5,
        batch_size = 128,
        num_workers = 12)
    scores.to_csv("scores-2023-11-07.csv")
    preds.to_csv("preds-2023-11-07.csv")
    os.chdir('../..')
    print(folder, ' done: ', datetime.now())
    print()
    print()
=#
using CSV,
DataFrames,
Dates,
DBInterface,
DSP,
DuckDB,
Glob,
HTTP,
Images,
JSON,
PNGFiles,
Random,
SHA,
TimeZones,
WAV,
XMLDict
export move_one_hour!,
check_png_wav_both_present,
file_metadata_to_df,
resize_image!,
twilight_tuple_local_time,
utc_to_nzdt!
"""
move_one_hour!(files::Vector{String}, operator)
This function takes a vector of file paths and renames each file in the
vector by changing the name of the file to the name of the file created one
hour before the original file. The new name format is yyyymmdd_HHMMSS.tmp,
which represents the time stamp of the original file minus (or plus) one hour. This
function avoids force=true with mv, since new file names may already exist
and mv will stacktrace leaving a big mess to tidy up.
Args:
• files (Vector{String}): A vector of strings where each element is
a path to a file.
Returns: Nothing - This function only renames files and saves them.
I use this to turn the clock back at the end of daylight saving.
"""
#Assumes WAV files
function move_one_hour!(files::Vector{String}, operator)
@assert operator == (+) || operator == (-)
fix_extension_of_files = []
for old_file in files
# Extract the date and time of the original file using string chopping
a = chop(old_file, tail = 4)
d, t = split(a, "_")
ye = parse(Int64, d[1:4])
mo = parse(Int64, d[5:6])
da = parse(Int64, d[7:8])
ho = parse(Int64, t[1:2])
mi = parse(Int64, t[3:4])
se = parse(Int64, t[5:6])
export check_png_wav_both_present,
resize_image!, twilight_tuple_local_time, move_one_hour!, utc_to_nzdt!
#new_date = dt - Dates.Hour(1)
new_date = operator(dt, Dates.Hour(1))
# Must drop the WAV extension to avoiding force=true
# with mv, since the new file name may already exist and mv
# will stacktrace leaving a big mess to tidy up.
base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")
temp_file = base_file * ".tmp"
# Tuple to tidy extensions later
tidy = (temp_file, base_file * ".WAV")
mv(old_file, temp_file)
push!(fix_extension_of_files, tidy)
print(".")
end
for item in fix_extension_of_files
mv(item[1], item[2])
end
print("Tidy\n")
end
#=
used like:
using Glob, Skraak, CSV
folders=glob("*/2023-11-02/")
for folder in folders
cd(folder)
try
df = Skraak.file_metadata_to_df()
CSV.write("/media/david/Pomona-3/Pomona-3/pomona_files_20231102.csv", df; append=true)
catch
@warn "error with $folder"
end
cd("/media/david/Pomona-3/Pomona-3/")
end
Then using duckdb cli from SSD:
duckdb AudioData.duckdb
show tables;
SELECT * FROM pomona_files;
COPY pomona_files FROM '/media/david/Pomona-3/Pomona-3/pomona_files_20231019.csv';
SELECT * FROM pomona_files;
Then backup with:
EXPORT DATABASE 'AudioDataBackup_2023-07-29';
.quit
Then quit and backup using cp on the db file
Then rsync ssd to usb
rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
=#
"""
file_metadata_to_df()
This function takes a file name, extracts wav metadata, gpx location, recording period start/end and returns a dataframe.
This function needs raw audiomoth wav files and a gpx.
This function needs /media/david/SSD1/dawn_dusk.csv
using DataFrames, Dates, DelimitedFiles, DuckDB, Glob, JSON3, Random, SHA, TimeZones, WAV, XMLDict
"""
function file_metadata_to_df()
# Initialise dataframe with columns: disk, location, trip_date, file, latitude, longitude, start_recording_period_localt, finish_recording_period_localt, duration, sample_rate, utc, ldt, moth_id, gain, battery, temperature, sha2_256, night
df = DataFrame(
disk = String[],
location = String[],
trip_date = String[],
file = String[],
latitude = Float64[],
longitude = Float64[],
start_recording_period_localt = String[],
finish_recording_period_localt = String[],
duration = Float64[],
sample_rate = Int[],
utc = String[],
ldt = String[],
moth_id = String[],
gain = String[],
battery = Float64[],
temperature = Float64[],
sha2_256 = String[],
night = Bool[],
)
#Get WAV list for folder
wav_list = glob("*.WAV") |> sort
#Return empty df if nothing in the folder
if length(wav_list) == 0
return df
end
#Get path info from file system
raw_path_vec = split(pwd(), "/")[end-2:end]
disk = raw_path_vec[1]
location = raw_path_vec[2]
trip_date = raw_path_vec[3]
#Get location, assumes 1 gpx is in the folder
waypoint = glob("*.gpx")
length(waypoint) != 1 && @error "no gpx file in $trip_date $location"
loc = read(waypoint[1], String) |> xml_dict
latitude = parse(Float64, (loc["gpx"]["wpt"][:lat]))
longitude = parse(Float64, (loc["gpx"]["wpt"][:lon]))
#Start of recording period
_, _, _, binary_metadata_start = wavread(wav_list[1])
c_v_s = split(wav_info_read(binary_metadata_start)[:ICMT], " ")
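# The ICMT comment is split on spaces; an AudioMoth comment looks roughly like
# "Recorded at 22:00:00 15/04/2023 (UTC) by AudioMoth 24526B465DB29641 at medium
# gain setting while battery state was 4.2V and temperature was 12.3C." (illustrative values)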
comment_vector_start = length(c_v_s) < 22 ? c_v_s : c_v_s[1:19]
date_start = split(comment_vector_start[4], "/")
time_start = split(comment_vector_start[3], ":")
tz_start = chop(comment_vector_start[5], head = 4, tail = 1)
time_zone_start = isempty(tz_start) ? "+00" : tz_start
#zdt1 = ZonedDateTime(parse(Int, date_start[3]), parse(Int, date_start[2]), parse(Int, date_start[1]), parse(Int, time_start[1]), parse(Int, time_start[2]), parse(Int, time_start[3]), tz"UTC")
time_string_start = "$(date_start[3])-$(date_start[2])-$(date_start[1])T$(time_start[1]):$(time_start[2]):$(time_start[3]).000$time_zone_start"
zdt1 = ZonedDateTime(time_string_start)
start_recording_period_localt =
Dates.format(astimezone(zdt1, tz"Pacific/Auckland"), "yyyy-mm-dd HH:MM:SSzzzz")
#End of recording period
_, _, _, binary_metadata_end = wavread(wav_list[end])
c_v_e = split(wav_info_read(binary_metadata_end)[:ICMT], " ")
comment_vector_end = length(c_v_e) < 22 ? c_v_e : c_v_e[1:19]
date_end = split(comment_vector_end[4], "/")
time_end = split(comment_vector_end[3], ":")
tz_end = chop(comment_vector_end[5], head = 4, tail = 1)
time_zone_end = isempty(tz_end) ? "+00" : tz_end
#zdt2 = ZonedDateTime(parse(Int, date_end[3]), parse(Int, date_end[2]), parse(Int, date_end[1]),parse(Int, time_end[1]), parse(Int, time_end[2]), parse(Int, time_end[3]), tz"UTC")
time_string_end = "$(date_end[3])-$(date_end[2])-$(date_end[1])T$(time_end[1]):$(time_end[2]):$(time_end[3]).000$time_zone_end"
zdt2 = ZonedDateTime(time_string_end)
finish_recording_period_localt =
Dates.format(astimezone(zdt2, tz"Pacific/Auckland"), "yyyy-mm-dd HH:MM:SSzzzz")
dict = Skraak.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")
#So I know what it is doing
println(raw_path_vec)
#Loop over file list
for file in wav_list
#print(file)
try
audio_data, sample_rate, _, binary_metadata = wavread(file)
c_v = split(wav_info_read(binary_metadata)[:ICMT], " ")
comment_vector = length(c_v) < 22 ? c_v : c_v[1:19]
duration = Float64(length(audio_data) / sample_rate)
date = split(comment_vector[4], "/")
time = split(comment_vector[3], ":")
tz = chop(comment_vector[5], head = 4, tail = 1)
time_zone = isempty(tz) ? "+00" : tz
#preformatting_zdt = ZonedDateTime(parse(Int, date[3]), parse(Int, date[2]), parse(Int, date[1]), parse(Int, time[1]), parse(Int, time[2]), parse(Int, time[3]), tz"UTC")
time_string = "$(date[3])-$(date[2])-$(date[1])T$(time[1]):$(time[2]):$(time[3]).000$time_zone"
preformatting_zdt = ZonedDateTime(time_string)
#zdt = Dates.format(preformatting_zdt, "yyyy-mm-dd HH:MM:SSzzzz")
preformatting_utc = astimezone(preformatting_zdt, tz"UTC")
utc = Dates.format(preformatting_utc, "yyyy-mm-dd HH:MM:SSzzzz")
preformatting_ldt = astimezone(preformatting_zdt, tz"Pacific/Auckland")
ldt = Dates.format(preformatting_ldt, "yyyy-mm-dd HH:MM:SSzzzz")
moth_id = comment_vector[8]
gain = comment_vector[10]
#index back from end because if V > 4.9 the wording changes
battery = parse(Float64, chop(comment_vector[end-4], tail = 1))
temperature = parse(Float64, chop(comment_vector[end], tail = 2))
sha2_256 = bytes2hex(sha256(file))
#assumes a 15 minute file and tests night at the halfway time
nt = Skraak.night(DateTime(preformatting_ldt + Minute(7) + Second(30)), dict)
#Populate row to push into df
row = [
disk,
location,
trip_date,
file,
latitude,
longitude,
start_recording_period_localt,
finish_recording_period_localt,
duration,
Int(sample_rate),
utc,
ldt,
moth_id,
gain,
battery,
temperature,
sha2_256,
nt,
]
push!(df, row)
print(".")
catch
@warn "error with $folder $file"
end
end
return df
end
"""
move_one_hour!(files::Vector{String}, operator)
This function takes a vector of file paths and renames each file in the
vector by changing the name of the file to the name of the file created one
hour before the original file. The new name format is yyyymmdd_HHMMSS.tmp,
which represents the time stamp of the original file minus (or plus) one hour. This
function avoids force=true with mv, since new file names may already exist
and mv will stacktrace leaving a big mess to tidy up.
Args:
• files (Vector{String}): A vector of strings where each element is
a path to a file.
Returns: Nothing - This function only renames files and saves them.
I use this to turn the clock back at the end of daylight saving.
Assumes WAV files
"""
function move_one_hour!(files::Vector{String}, operator)
@assert operator == (+) || operator == (-)
fix_extension_of_files = []
for old_file in files
# Extract the date and time of the original file using string chopping
a = chop(old_file, tail = 4)
d, t = split(a, "_")
ye = parse(Int64, d[1:4])
mo = parse(Int64, d[5:6])
da = parse(Int64, d[7:8])
ho = parse(Int64, t[1:2])
mi = parse(Int64, t[3:4])
se = parse(Int64, t[5:6])
dt = DateTime(ye, mo, da, ho, mi, se)
#new_date = dt - Dates.Hour(1)
new_date = operator(dt, Dates.Hour(1))
# Must drop the WAV extension to avoid force=true
# with mv, since the new file name may already exist and mv
# will stacktrace leaving a big mess to tidy up.
base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")
temp_file = base_file * ".tmp"
# Tuple to tidy extensions later
tidy = (temp_file, base_file * ".WAV")
mv(old_file, temp_file)
push!(fix_extension_of_files, tidy)
print(".")
end
for item in fix_extension_of_files
mv(item[1], item[2])
end
print("Tidy\n")
end
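#=
Usage sketch, assuming AudioMoth-style names like 20230415_213000.WAV:
using Glob
files = glob("*.WAV")
move_one_hour!(files, -) # turn the clock back one hour at the end of daylight saving
=#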
"""
using CSV, DataFrames, DataFramesMeta, Dates, DSP, Glob, JSON, Random, TimeZones, WAV, PNGFiles, Images #Plots
#import DataFramesMeta: @transform!, @subset!, @byrow, @passmissing
"""
make_clips(preds_path::String, dawn_dusk_dict::Dict{Dates.Date, Tuple{Dates.DateTime, Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv"))
This function takes a preds.csv file and generates
file names, wav's, spectrograms etc to be reviewed.
It calls night() and may call construct_dawn_dusk_dict() unless the dict is globally defined and passed in.
It should be run from Pomona-1/, Pomona-2/ or Pomona-3/; it assumes this, since it uses the path.
It saves wav and png files to /home/david/Upload/
Need to use a try/catch because the 2 assert functions throw an error to short circuit the function.
using Glob, Skraak
predictions = glob("*/2023-09-11*/preds*")
predictions = glob("path/to/preds*")
for file in predictions #[1:6][7:12][13:18][19:24]
try
make_clips(file)
catch x
println(x)
end
end
If you need to change headers in a preds csv:
shift, control, f in subl
file,start_time,end_time,label
/media/david/Pomona-2,<project filters>, preds-2023-02-27.csv
file,start_time,end_time,absent,present
using Glob, CSV, DataFrames, DataFramesMeta, Dates, DSP, Plots, Random, WAV
"""
# Assumes run on linux
# Assumes function run from Pomona-1 or Pomona-2
#dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv",),
function make_clips(
preds_path::String,
label::Int = 1,
night::Bool = true,
dawn_dusk_dict = dddict,
)
# Assumes function run from Pomona-1 or Pomona-2
location, trip_date, _ = split(preds_path, "/")
# Load and group data frame by file
gdf =
#! format: off
DataFrame(CSV.File(preds_path)) |>
x -> assert_not_empty(x, preds_path) |>
x -> rename_column!(x, "1.0", "label") |> #can remove now, needs to be label
x -> assert_detections_present(x, label, location, trip_date) |>
x -> filter_positives!(x, label) |>
insert_datetime_column! |>
x -> night_or_day!(x, dawn_dusk_dict, night) |> #true=night, false=day
group_by_file!
#! format: on
# Make clip and spectrogram
for (k, v) in pairs(gdf)
#file_name = chop(v.file[1], head = 2, tail = 4)
file_name = path_to_file_string(v.file[1])
start_times = v[!, :start_time] |> sort
detections = cluster_detections(start_times)
isempty(detections) && continue
signal, freq = wavread("$location/$trip_date/$file_name.WAV")
length_signal = length(signal)
for detection in detections
st, en = calculate_clip_start_end(detection, freq, length_signal)
name = "$location-$trip_date-$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"
f = "Clips_$(today())"
mkpath(f)
outfile = "$f/$name"
sample = signal[Int(st):Int(en)]
wavwrite(sample, "$outfile.wav", Fs = Int(freq))
#plot = plot_spectrogram(sample, freq)
#savefig(plot, "$outfile.png")
image = get_image_from_sample(sample, freq)
PNGFiles.save("$outfile.png", image)
end
print(".")
end
println("\ndone $location/$trip_date \n")
end
#######################################################################
function assert_not_empty(df::DataFrame, preds_path::String)::DataFrame
size(df) != (0, 0) ? (return df) : @error "Empty dataframe at $preds_path"
#return df
end
function rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame
old_name in names(df) && rename!(df, old_name => new_name)
return df
end
# assumes kiwi, binary classifier from opensoundscape
# needed to remove ::String annotation for location, trip_date to make it work
function assert_detections_present(
df::DataFrame,
label::Int,
location,
trip_date,
)::DataFrame
label in levels(df.label) ? (return df) :
@error "No detections for label = $label at $location/$trip_date"
end
# assumes kiwi
function filter_positives!(df::DataFrame, label)::DataFrame
#filter!(row -> row.kiwi > 0, df)
filter!(row -> row.label == label, df)
return df
end
function path_to_file_string(path) # be careful: path::String won't work, no method matching path_to_file_string(::InlineStrings.String31)
f = split(path, "/")[end] |> x -> split(x, ".") |> first
#f = chop(file, head = 2, tail = 4)
return f
end
function filename_to_datetime!(file)::DateTime
#file_string = chop(file, head = 2, tail = 4)
file_string = path_to_file_string(file)
date_time =
length(file_string) > 13 ? DateTime(file_string, dateformat"yyyymmdd_HHMMSS") :
DateTime(
(file_string[1:4] * "20" * file_string[5:end]),
dateformat"ddmmyyyy_HHMMSS",
)
return date_time
end
function insert_datetime_column!(df::DataFrame)::DataFrame
@transform!(df, @byrow :DateTime = filename_to_datetime!(String(:file)))
return df
end
# calls night(), needs dawn_dusk_dict in local time format
function night_or_day!(
df::DataFrame,
dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}},
night_time::Bool = true,
)::DataFrame
night_time ? @subset!(df, @byrow night(:DateTime, dawn_dusk_dict)) :
@subset!(df, @byrow !night(:DateTime, dawn_dusk_dict))
return df
end
function group_by_file!(df::DataFrame)
gdf = groupby(df, :file)
return gdf
end
function cluster_detections(start_times::Vector{Float64})::Vector{Vector{Float64}}
s = Vector{Float64}[]
t = Float64[start_times[1]]
for time in start_times[2:end]
if time - last(t) <= 15.0
push!(t, time)
else
push!(s, copy(t))
t = Float64[time]
end
end
push!(s, copy(t))
detections = filter(x -> length(x) > 1, s)
return detections
end
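# e.g. cluster_detections([0.0, 2.5, 5.0, 60.0, 120.0, 122.5]) groups start times
# within 15 s of each other and drops singleton clusters, returning
# [[0.0, 2.5, 5.0], [120.0, 122.5]]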
# assumes it is operating on 5 second clips
function calculate_clip_start_end(
detection::Vector{Float64},
freq::Float32,
length_signal::Int64,
)::Tuple{Float64,Float64}
first(detection) > 0 ? st = first(detection) * freq : st = 1
(last(detection) + 5.0) * freq <= length_signal ? en = (last(detection) + 5.0) * freq :
en = length_signal
return st, en
end
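# e.g. detection = [380.0, 385.0], freq = 16000.0f0, length_signal = 14_400_000
# (a 15 minute file) gives st = 6_080_000.0 and en = 6_240_000.0,
# i.e. samples spanning 380 s to 390 s of the recording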
#= Deprecated use get_image_from_sample()
function plot_spectrogram(
sample::Vector{Float64},
freq::Float32,
)::Plots.Plot{Plots.GRBackend}
S = DSP.spectrogram(sample[:, 1], 400, 2; fs = convert(Int, freq))
plot = Plots.heatmap(
S.time,
S.freq,
pow2db.(S.power),
size = (448, 448),
showaxis = false,
ticks = false,
legend = false,
thickness_scaling = 0,
)
return plot
end
=#
# f needs to be an Int
function get_image_from_sample(sample, f) #sample::Vector{Float64}
S = DSP.spectrogram(sample, 400, 2; fs = convert(Int, f))
i = S.power
if minimum(i) == 0.0
l = i |> vec |> unique |> sort
replace!(i, 0.0 => l[2])
end
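# pow2db -> shift to non-negative -> scale to [0, 1] -> flip so low frequencies
# sit at the bottom -> greyscale RGB -> 224 x 224 for the ResNet input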
image =
#! format: off
DSP.pow2db.(i) |>
x -> x .+ abs(minimum(x)) |>
x -> x ./ maximum(x) |>
x -> reverse(x, dims = 1) |>
x -> RGB.(x) |>
x -> imresize(x, 224, 224)
#! format: on
return image
end
"""
construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
sun = DataFrame(CSV.File(file))
Takes dawn_dusk.csv and returns a dict to be consumed by night().
~/dawn_dusk.csv
At present it goes from the start of 2019 to the end of 2024
The csv contains local time sunrise and sunset
I use this to decide if a file with a local time encoded name was recorded at night
dict = construct_dawn_dusk_dict("/Volumes/SSD1/dawn_dusk.csv")
dict = Utility.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")
using CSV, DataFrames
"""
function construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
sun = DataFrame(CSV.File(file))
x = Tuple(zip(sun.Dawn, sun.Dusk))
y = Dict(zip(sun.Date, x))
return y
end
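# e.g. dict[Date("2023-04-15")] returns a (dawn, dusk) tuple such as
# (DateTime("2023-04-15T06:52:00"), DateTime("2023-04-15T18:05:00")) (illustrative times)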
"""
night(call_time::DateTime, dict::Dict{Date, Tuple{DateTime, DateTime}})::Bool
Returns true if time is at night, ie between civil twilights, dusk to dawn.
Consumes dict from construct_dawn_dusk_dict
time=DateTime("2021-11-02T21:14:35",dateformat"yyyy-mm-ddTHH:MM:SS")
Utility.night(time, dict)
"""
function night(call_time::DateTime, dict::Dict{Date,Tuple{DateTime,DateTime}})::Bool
dawn = dict[Date(call_time)][1]
dusk = dict[Date(call_time)][2]
if call_time <= dawn || call_time >= dusk
return true
else
return false
end
end
#######################################################################
#INBETWEEN STEP: use secondary model to sort clips, move clips into D, F, M, N, and hand classify, generate actual.csv.
"""
move_clips_to_folders(df::DataFrame)
Takes a 2 column dataframe: file, label
file must be list of png images, assumes wav's are there too
will move mp4's from video folder if they are present
"""
function move_clips_to_folders(df::DataFrame)
p = glob("*.png")
w = glob("*.[W,w][A,a][V,v]")
@assert (first(df.file) |> x -> split(x, ".")[end] |> x -> x == "png") "df.file must be a list of png's"
@assert issetequal(df.file, p) "All png files in dataframe must be present in folder"
@assert issetequal(chop.(df.file, head = 0, tail = 4), chop.(w, head = 0, tail = 4)) "There must be a wav for every png in the dataframe"
for row in eachrow(df)
src = row.file
dst = "$(row.label)/$(row.file)"
mkpath("$(row.label)/")
try
mv(src, dst)
mv(chop(src, tail = 3) * "wav", chop(dst, tail = 3) * "wav")
if isdir("video")
mkpath("video/$(row.label)/")
mv(
"video/" * chop(src, tail = 3) * "mp4",
"video/" * chop(dst, tail = 3) * "mp4",
)
end
catch e
@info e
end
end
end
#=
actual.csv must be list of qualified png file names:
D/C05-2023-04-15-20230219_223000-380-470.png
using Glob, DataFrames, CSV
a=glob("[M,F,D,N]/*.png")
df = DataFrame(file=a)
CSV.write("actual_mfdn.csv", df)
make a folder D,F,M,N
mkpath.(["D", "F", "M", "N"])
move wavs to match pngs
df=DataFrame(CSV.File("actual_mfdn.csv"))
for row in eachrow(df)
src=split(row.file, "/")[2]
dst=row.file
mv(src, dst)
mv(chop(src, tail=3)*"wav", chop(dst, tail=3)*"wav")
end
=#
#run from parent folder of label folders
#saves actual.csv and returns a df
#labels=["D", "F", "M", "N"]
function actual_from_folders(labels::Vector{String})::DataFrame
paths=String[]
for l in labels
paths=append!(paths, glob("$l/*.png"))
end
df = DataFrame(file=paths)
CSV.write("actual.csv", df)
return df
end
"""
aggregate_labels(actual="actual.csv", outfile="labels.csv")
file
[D, F, M, N]/C05-2023-04-15-20230219_223000-380-470.png
This function takes the csv output from my hand classification and outputs a df, and a csv for insertion into AudioData.duckdb using the duckdb cli or DFto.audiodata_db()
assumes it is run from a Clips_xxxx-xx-xx folder and that "actual.csv" is present if not specified.
returns a dataframe
using CSV, DataFrames, DataFramesMeta
"""
#=
df=aggregate_labels()
audiodata_db(df, "pomona_labels_20230418") NOT_WORKING maybe titles
to use cli, need to remove header row
duckdb /media/david/SSD1/AudioData.duckdb
COPY pomona_labels_20230418 FROM 'DB_Labels/pomona_labels_2023-11-02.csv';
COPY pomona_files FROM 'DB_Files/pomona_files_20231102.csv';
Then backup with:
EXPORT DATABASE 'AudioDataBackup_2023-11-14';
.quit
Then quit and backup using cp on the db file, dated copy
Then rsync ssd to usb
rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
note: run on mac
cd skraak.kiwi
julia-1.9
using Franklin
serve()
=#
# New one, without noise and distance, does not do :box anymore therefore requires new db schema
function aggregate_labels(
actual::String = "actual.csv",
outfile::String = "labels.csv",
hdr::Bool = false #header for outfile
)::DataFrame
df = DataFrame(CSV.File(actual))
# location, f, start_time, end_time
@transform!(df, @byrow :location = split(split(:file, "/")[2], "-")[1])
@transform!(df, @byrow :f = split(split(:file, "/")[2], "-")[5] * ".WAV")
@transform!( df, @byrow :start_time = split(split(:file, "/")[2], "-")[end-1])
@transform!( df, @byrow :end_time = chop(split(split(:file, "/")[2], "-")[end], tail=4))
#@transform!( df, @byrow :box = "[$(split(split(:file, "/")[2], "-")[end-1]), $(chop(split(split(:file, "/")[2], "-")[end], tail=4))]")
# male, female, duet, not
@transform!(df, @byrow @passmissing :male = split(:file, "/")[1] == "M" ? true : false)
@transform!(
df,
@byrow @passmissing :female = split(:file, "/")[1] == "F" ? true : false
)
@transform!(df, @byrow @passmissing :duet = split(:file, "/")[1] == "D" ? true : false)
@transform!(
df,
@byrow @passmissing :not_kiwi =
split(:file, "/")[1] in ["KA", "KE", "N", "Q"] ? true : false
)
# other_label
@transform!(
df,
@byrow @passmissing :other_label =
split(:file, "/")[1] in ["KA", "KE", "Q"] ? split(:file, "/")[1] : missing
)
# remove unwanted cols, rename f to file
select!(df, Not([:file]))
rename!(df, :f => :file)
CSV.write(outfile, df; header=hdr)
return df
end
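#=
Worked example: a row with file = "D/C05-2023-04-15-20230219_223000-380-470.png"
yields location = "C05", file = "20230219_223000.WAV", start_time = "380",
end_time = "470", duet = true, and the other label columns false/missing.
=#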
"""
audiodata_db(df::DataFrame, table::String)
Use to upload labels to AudioData.duckdb
Takes a dataframe and inserts into AudioData.db table.
audiodata_db(df, "pomona_labels_20230418")
using DataFrames, DBInterface, DuckDB, Random
"""
function audiodata_db(df::DataFrame, table::String)
if Sys.islinux()
con = DBInterface.connect(DuckDB.DB, "/media/david/SSD1/AudioData.duckdb")
else
con = DBInterface.connect(DuckDB.DB, "/Volumes/SSD1/AudioData.duckdb")
end
temp_name = randstring(6)
DuckDB.register_data_frame(con, df, temp_name)
DBInterface.execute(
con,
"""
INSERT
INTO $table
SELECT *
FROM '$temp_name'
""",
)
DBInterface.close!(con)
end
"""
Only moves WAVs not already there in dataset
converts WAVs to flac to save space, file metadata will not survive
requires columns :location, :file, :start_time, :end_time
:file is the file name, :location is the actual recorder location eg "C05"
run where the raw data is
will find file in folder structure location/trip_date/file
constructs dataset at output_path
assumes file name has one . for extension only
"""