# BOPNWZL4RWF4UGC2LUEVONNQFSKYJ2Q5W747GH3UURZLBJFFEBUQC
# NOTE(review): the token above was a bare identifier in code position (it would raise
# UndefVarError when the script runs). It looks like a version-control change hash, so it
# is kept here as a comment only.

# Train.jl
# https://github.com/FluxML/model-zoo/blob/master/tutorials/transfer_learning/transfer_learning.jl
# This works on my data IT TRAINS best, but only -t 4
# dont forget temp env

using Random: shuffle!
using Random: seed!
import Base: length
import Base: getindex
using Images
using Flux
using CUDA
using Metalhead
using Noise
using Glob
using BSON: @save
using Dates
#using CSV
using DataFrames
using FreqTables
using JLD2
using Logging, LoggingExtras

# Collect every labelled png (parent folder "N" or "K") and shuffle reproducibly.
imgs = glob("2023-09-*/*/*/[N,K]/*.png") #from SSD2
seed!(1234);
shuffle!(imgs)
#CSV.write("files.csv", DataFrame(file=imgs))

# Run on the GPU when CUDA is usable, otherwise on the CPU.
device = CUDA.functional() ? gpu : cpu

# Training container: indexing loads the image and applies augmentation.
struct ImageContainer{T<:Vector}
    img::T
end

# Validation container: indexing loads the image without augmentation.
struct ValidationImageContainer{T<:Vector}
    img::T
end

data = ImageContainer(imgs)
val_data = ValidationImageContainer(imgs)

length(data::ImageContainer) = length(data.img)
length(data::ValidationImageContainer) = length(data.img)

const im_size = (224, 224)
# Class label is taken from the name of the folder that holds the png.
name_to_idx = Dict{String,Int32}("K" => 1, "N" => 2)

"""
    getindex(data::ImageContainer{Vector{String}}, idx::Int)

Load the image at `data.img[idx]`, resize it to `im_size`, add gaussian noise and random
grey bars, and return `(img, y)` where `img` is the channel view permuted with
`(3, 2, 1)` and `y` is the class index looked up from the parent folder name.
"""
function getindex(data::ImageContainer{Vector{String}}, idx::Int)
    path = data.img[idx]
    img =
        Images.load(path) |>
        x -> Images.imresize(x, im_size) |>
        x -> Images.RGB.(x) |>
        x -> Noise.add_gauss(x, (rand() * 0.2)) |>
        x -> apply_mask(x, 3, 3, 12) |>
        x -> collect(channelview(float32.(x))) |>
        x -> permutedims(x, (3, 2, 1))
    y = name_to_idx[(split(path, "/")[end-1])]
    return img, y
end

"""
    getindex(data::ValidationImageContainer{Vector{String}}, idx::Int)

Same as the training method but without noise or masking.
"""
function getindex(data::ValidationImageContainer{Vector{String}}, idx::Int)
    path = data.img[idx]
    img =
        Images.load(path) |>
        x -> Images.imresize(x, im_size) |>
        x -> Images.RGB.(x) |>
        x -> collect(channelview(float32.(x))) |>
        x -> permutedims(x, (3, 2, 1))
    y = name_to_idx[(split(path, "/")[end-1])]
    return img, y
end

"""
    apply_mask(img, max_number = 3, min_size = 3, max_size = 22)

Overwrite up to `max_number` random horizontal bands and up to `max_number` random
vertical bands of `img` with mid grey, in place, and return `img`. Band extents are drawn
by `get_random_ranges` using the actual image dimensions (224 for the images used here).
"""
function apply_mask(
    img::Array{RGB{N0f8},2},
    max_number::Int = 3,
    min_size::Int = 3,
    max_size::Int = 22,
)
    fill_colour = RGB{N0f8}(0.7, 0.7, 0.7)
    # horizontal
    for range in get_random_ranges(max_number, min_size, max_size, size(img, 1))
        img[range, :] .= fill_colour
    end
    # vertical
    for range in get_random_ranges(max_number, min_size, max_size, size(img, 2))
        img[:, range] .= fill_colour
    end
    return img
end

"""
    get_random_ranges(max_number, min_size, max_size, extent = 224)

Return between 0 and `max_number` random ranges `start:start+len` that fit inside
`1:extent`, with `len` drawn from `min_size:max_size`. Candidates that would run past
`extent` are rejected and redrawn.

Throws `ArgumentError` when no range of the smallest size can fit, because the rejection
loop could otherwise never terminate.
"""
function get_random_ranges(
    max_number::Int,
    min_size::Int,
    max_size::Int,
    extent::Int = 224,
)
    number = rand(0:max_number)
    ranges = UnitRange{Int}[]
    if number > 0 && 1 + min_size > extent
        throw(ArgumentError("min_size = $min_size does not fit in an extent of $extent"))
    end
    while length(ranges) < number
        start = rand(1:extent)
        len = rand(min_size:max_size)
        if start + len > extent
            continue
        end
        push!(ranges, start:start+len)
    end
    return ranges
end

# define DataLoaders
const batch_size = 64
const train_test_split = 0.95

# Largest multiple of batch_size that fits in the data set.
const ceiling = length(data) ÷ batch_size * batch_size

# Index of the last training image, rounded to a whole number of batches.
const train_test_index =
    ceiling ÷ batch_size * train_test_split |> round |> x -> x * batch_size |> Int

train = Flux.DataLoader(
    ImageContainer(imgs[1:train_test_index]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
)
if device == gpu
    train = CuIterator(train)
end

# Un-augmented sample of training images, the same size as the test set, used to report
# training accuracy.
train_sample = Flux.DataLoader(
    ValidationImageContainer(imgs[1:(ceiling-train_test_index)]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
)
if device == gpu
    train_sample = CuIterator(train_sample)
end

test = Flux.DataLoader(
    ValidationImageContainer(imgs[train_test_index+1:ceiling]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
)
if device == gpu
    test = CuIterator(test)
end

# Pretrained ResNet18 backbone with a fresh 2-class head.
fst = Metalhead.ResNet(18, pretrain = true).layers
# BEWARE NUMBER CLASSES
lst = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => 2));
model = Flux.Chain(fst[1], lst) |> device

"""
    eval_f(m, d)

Run model `m` over every `(x, y)` batch of `d` and return `(accuracy, confusion_matrix)`:
accuracy is the fraction of `onecold` predictions equal to `y`, rounded to 4 digits, and
the confusion matrix is a frequency table of targets against predictions.
"""
function eval_f(m, d)
    good = 0
    count = 0
    pred = []
    actual = []
    for (x, y) in d
        p = Flux.onecold(m(x))
        good += sum(p .== y)
        count += length(y)
        append!(pred, p)
        append!(actual, y)
    end
    accuracy = round(good / count, digits = 4)
    confusion_matrix =
        freqtable(DataFrame(targets = actual, predicts = pred), :targets, :predicts)
    return accuracy, confusion_matrix
end

# BEWARE NUMBER CLASSES
"""
    train_epoch!(model; opt, train)

Train `model` for one pass over `train` with optimiser state `opt`, using logit
cross-entropy against a 2-class one-hot target.
"""
function train_epoch!(model; opt, train)
    Flux.train!(model, train, opt) do m, x, y
        Flux.Losses.logitcrossentropy(m(x), Flux.onehotbatch(y, 1:2))
    end
end

opt = Flux.setup(Flux.Optimisers.Adam(1e-5), model);
logger = FileLogger("logfile.txt"; append = true)

# Baseline evaluation before any fine tuning.
@time metric_eval, v_confusion_matrix = eval_f(model, test)
#with_logger(logger) do
@info "eval" accuracy = metric_eval
@info "eval" v_confusion_matrix
#end

# Best test accuracy seen so far; a checkpoint is written whenever it improves.
a = 0.0
for iter in 1:15
    # This loop is at top level of a script. Without the declaration below, the
    # assignments to `a` (and `metric_eval`, `v_confusion_matrix`) inside the loop make
    # them new loop-local variables when the file is run non-interactively, and the
    # comparison `metric_eval > a` then fails with UndefVarError.
    global a, metric_eval, v_confusion_matrix

    println("")
    println("Epoch: $iter")

    @time train_epoch!(model; opt, train)

    @time metric_train, t_confusion_matrix = eval_f(model, train_sample)
    #with_logger(logger) do
    @info "Epoch: " iter
    @info "train" accuracy = metric_train
    @info "train" t_confusion_matrix
    #end

    @time metric_eval, v_confusion_matrix = eval_f(model, test)
    #with_logger(logger) do
    @info "test" accuracy = metric_eval
    @info "test" v_confusion_matrix
    #end

    metric_eval > a && begin
        a = metric_eval
        # Save the model state from a CPU copy of the model.
        let _model = cpu(model)
            jldsave(
                "model_K1-4_CPU_epoch-$iter-$metric_eval-$(today()).jld2";
                model_state = Flux.state(_model),
            )
            #BSON.@save "model_K1-3_CPU_epoch-$iter-$metric_eval-$(now()).bson" _model
            #with_logger(logger) do
            @info "Saved a best_model"
            #end
        end
    end
end
# Predict.jl

using WAV, DSP, Images, ThreadsX, Dates, DataFrames, CSV, Flux, CUDA, Metalhead, JLD2, FLAC

export predict

"""
    predict(glob_pattern::String, model::String)
    predict(folders::Vector{String}, model::String)

Run the classifier stored at path `model` over every wav/flac file in each folder, given
either a glob pattern for the folders or an explicit list of folders. Results are saved
as one csv per folder (`preds-<today>.csv`), similar to opensoundscape.

Args:

•  glob pattern (folder/) or a vector of folder paths
•  model path

Returns: Nothing - This function saves csv files.

I use this function to find kiwi from new data gathered on a trip.

Note:
Dont forget temp env,  julia -t 4
From Pomona-3/Pomona-3/

Use like:
using Skraak
glob_pattern = "*/2023-10-19/" #from SSD1
model = "/media/david/SSD1/model_K1-3_CPU_epoch-10-0.9965-2023-10-18T17:32:36.747.jld2"
predict(glob_pattern, model)
"""
function predict(glob_pattern::String, model::String)
    # Expand the pattern, then share the implementation with the folder-list method.
    # NOTE(review): `glob` is not brought in by this file's `using` line; it is assumed
    # to be in scope from elsewhere in the enclosing module - confirm.
    predict(glob(glob_pattern), model)
    return nothing
end

function predict(folders::Vector{String}, model::String)
    model = load_model(model) |> device
    @info "Folders: $folders"
    for folder in folders
        @info "Working on: $folder"
        predict_folder(folder, model)
    end
    return nothing
end

#~~~~~ The guts ~~~~~#

# Run on the GPU when CUDA is usable, otherwise on the CPU.
device = CUDA.functional() ? gpu : cpu

#= TO DELETE
function get_image_for_inference(sample, f)
    S = DSP.spectrogram(sample, 400, 2; fs = f)
    i = S.power
    if minimum(i) == 0.0
        l = i |> vec |> unique |> sort
        replace!(i, 0.0 => l[2])
    end
    image =
    #! format: off
        DSP.pow2db.(i) |>
        x -> x .+ abs(minimum(x)) |>
        x -> x ./ maximum(x) |>
        x -> reverse(x, dims = 1) |>
        x -> RGB.(x) |>
        x -> imresize(x, 224, 224) |>
        x -> collect(channelview(float32.(x))) |>
        x -> permutedims(x, (3, 2, 1))
    #! format: on
    return image
end
=#

"""
    get_image_for_inference(sample, f)

Turn one audio window into a model input: build the spectrogram image with
`get_image_from_sample`, take its Float32 channel view and permute it with `(3, 2, 1)`.
"""
function get_image_for_inference(sample, f)
    rgb_image = get_image_from_sample(sample, f)
    channels = collect(channelview(float32.(rgb_image)))
    return permutedims(channels, (3, 2, 1))
end

"""
    get_images(file::String, increment::Int = 5, divisor::Int = 2)

Read a wav or flac file, resample to 16 kHz when the sample rate is higher, split the
first channel into windows of `increment` seconds with a hop of `increment ÷ divisor`
(defaults: 5 s windows, 2.5 s hop) and convert each window to an image.

Returns `(raw_images, n_samples)`.
"""
function get_images(file::String, increment::Int = 5, divisor::Int = 2) #5s sample, 2.5s hop
    ext = split(file, ".")[end]
    @assert ext in ["WAV", "wav", "flac"] "Unsupported audio file type, requires wav or flac."
    if ext in ["WAV", "wav"]
        signal, freq = wavread(file)
    else
        signal, freq = load(file)
    end
    if freq > 16000
        signal = DSP.resample(signal, 16000.0f0 / freq; dims = 1)
        freq = 16000
    end
    f = convert(Int, freq)
    inc = increment * f
    hop = f * increment ÷ divisor #need guarunteed Int, maybe not anymore, refactor
    split_signal = DSP.arraysplit(signal[:, 1], inc, hop)
    raw_images = ThreadsX.map(x -> get_image_for_inference(x, f), split_signal)
    n_samples = length(raw_images)
    return raw_images, n_samples
end

"""
    get_images_time_from_wav(file::String, increment::Int = 5, divisor::Int = 2)

Return `(images, time)` for `file`: the batched image array from `reshape_images` and a
vector of `(start_time, end_time)` tuples in seconds, one per window.
"""
function get_images_time_from_wav(file::String, increment::Int = 5, divisor::Int = 2)
    raw_images, n_samples = get_images(file, increment, divisor)
    images = reshape_images(raw_images, n_samples)
    hop = increment / divisor
    start_time = 0:hop:(n_samples-1)*hop
    end_time = increment:hop:(n_samples+1)*hop
    time = collect(zip(start_time, end_time))
    return images, time
end

"""
    reshape_images(raw_images, n_samples)

Stack `n_samples` images of size 224×224×3 into one 224×224×3×`n_samples` Float32 array,
with image `k` stored at `[:, :, :, k]`.

The previous implementation used `hcat` followed by `reshape`. `hcat` joins 3-d arrays
along dimension 2, which leaves the channel dimension slowest in memory, so the reshape
mixed channels from different windows whenever there was more than one image.

Throws `DimensionMismatch` when the number of images is not `n_samples` or an image does
not have the expected size.
"""
function reshape_images(raw_images, n_samples)
    if length(raw_images) != n_samples
        throw(
            DimensionMismatch("expected $n_samples images, got $(length(raw_images))"),
        )
    end
    images = Array{Float32}(undef, 224, 224, 3, n_samples)
    for (k, img) in enumerate(raw_images)
        images[:, :, :, k] .= img
    end
    return images
end

"""
    predict_file(file::String, folder::String, model)

Predict a label for every window of `file` and return a DataFrame with columns
`file`, `start_time`, `end_time`, `label`. `folder` is accepted but not used.
"""
function predict_file(file::String, folder::String, model)
    #check form of opensoundscape preds.csv and needed by my make_clips
    @info "File: $file"
    @time images, time = get_images_time_from_wav(file)
    data = images |> device
    @time predictions = Flux.onecold(model(data))
    f = (repeat(["$file"], length(time)))
    df = DataFrame(
        :file => f,
        :start_time => first.(time),
        :end_time => last.(time),
        :label => predictions,
    )
    return df
end

"""
    predict_folder(folder::String, model)

Predict every wav and flac file in `folder`, appending the rows for each file to
`folder/preds-<today>.csv`. The csv is first written with the header only.
"""
function predict_folder(folder::String, model)
    wav = glob("$folder/*.[W,w][A,a][V,v]")
    flac = glob("$folder/*.flac")
    files = vcat(wav, flac)
    @info "$(length(files)) files in $folder"
    df = DataFrame(
        file = String[],
        start_time = Float64[],
        end_time = Float64[],
        label = Int[],
    )
    save_path = "$folder/preds-$(today()).csv"
    CSV.write("$save_path", df)
    for file in files
        df = predict_file(file, folder, model)
        CSV.write("$save_path", df, append = true)
    end
end

# see load_model() from train, different input types
"""
    load_model(model_path::String)

Rebuild the ResNet18 + classification head architecture and load the saved
`model_state` from the jld2 file at `model_path` into it. The number of classes is read
from the saved state of the final dense layer.
"""
function load_model(model_path::String)
    model_state = JLD2.load(model_path, "model_state")
    model_classes = length(model_state[1][2][1][3][2])
    f = Metalhead.ResNet(18, pretrain = false).layers
    l = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => model_classes))
    model = Flux.Chain(f[1], l)
    Flux.loadmodel!(model, model_state)
    return model
end

#=
function load_bson(model_path::String)
    BSON.@load model_path model
end
=#

################ PYTHON Opensoundscape ################
#=
# Dont forget conda activate opensoundscape
# Dont forget to modify file names and glob pattern
# Run script in Pomona-2, hard code trip date in the glob
# python /media/david/USB/Skraak/src/predict.py

from opensoundscape.torch.models.cnn import load_model
import opensoundscape
import torch
from pathlib import Path
import numpy as np
import pandas as pd
from glob import glob
import os
from datetime import datetime

model = load_model('/home/david/best.model')

# folders = glob('./*/2023-?????/')
folders = glob('./*/*/')
for folder in folders:
    os.chdir(folder)
    print(folder, ' start: ', datetime.now())
    # Beware, secretary island files are .wav
    field_recordings = glob('./*.WAV')
    scores, preds, unsafe = model.predict(
        field_recordings,
        binary_preds = 'single_target',
        overlap_fraction = 0.5,
        batch_size =  128,
        num_workers = 12)
    scores.to_csv("scores-2023-11-07.csv")
    preds.to_csv("preds-2023-11-07.csv")
    os.chdir('../..')
    print(folder, ' done: ', datetime.now())
    print()
    print()
=#
using CSV,
    DataFrames,
    Dates,
    DBInterface,
    DSP,
    DuckDB,
    Glob,
    HTTP,
    Images,
    JSON,
    PNGFiles,
    Random,
    SHA,
    TimeZones,
    WAV,
    XMLDict

export move_one_hour!,
    check_png_wav_both_present,
    file_metadata_to_df,
    resize_image!,
    twilight_tuple_local_time,
    utc_to_nzdt!

# This second export statement was embedded in the middle of the function body below,
# where it is a syntax error. It repeats names already exported above and is kept here
# unchanged.
export check_png_wav_both_present,
    resize_image!, twilight_tuple_local_time, move_one_hour!, utc_to_nzdt!

"""
    move_one_hour!(files::Vector{String}, operator)

Rename each file in `files` so that the timestamp encoded in its name is moved by one
hour: back with `operator = -`, forward with `operator = +`.

File names must look like `yyyymmdd_HHMMSS.WAV`. Every file is first renamed to
`yyyymmdd_HHMMSS.tmp` and only afterwards to `.WAV`. This avoids `force=true` with `mv`,
since a new file name may already exist and `mv` would stacktrace, leaving a big mess to
tidy up. Renamed files stay in the folder they were found in.

Args:

•  files (Vector{String}): A vector of strings where each element is
   a path to a file.
•  operator: `+` or `-`.

Returns: Nothing - This function only renames files and saves them.

I use this to turn the clock back at the end of daylight saving.

Assumes WAV files
"""
function move_one_hour!(files::Vector{String}, operator)
    @assert operator == (+) || operator == (-)
    fix_extension_of_files = Tuple{String,String}[]
    for old_file in files
        # Parse only the file name, so that an "_" in a directory name cannot confuse
        # the timestamp parsing.
        folder = dirname(old_file)
        # Extract the date and time of the original file using string chopping
        a = chop(basename(old_file), tail = 4)
        d, t = split(a, "_")
        ye = parse(Int64, d[1:4])
        mo = parse(Int64, d[5:6])
        da = parse(Int64, d[7:8])
        ho = parse(Int64, t[1:2])
        mi = parse(Int64, t[3:4])
        se = parse(Int64, t[5:6])
        dt = DateTime(ye, mo, da, ho, mi, se)
        #new_date = dt - Dates.Hour(1)
        new_date = operator(dt, Dates.Hour(1))
        # Must drop the WAV extension to avoiding force=true
        # with mv, since the new file name may already exist and mv
        # will stacktrace leaving a big mess to tidy up.
        base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")
        temp_file = joinpath(folder, base_file * ".tmp")
        # Tuple to tidy extensions later
        tidy = (temp_file, joinpath(folder, base_file * ".WAV"))
        mv(old_file, temp_file)
        push!(fix_extension_of_files, tidy)
        print(".")
    end
    # Second pass: every original name is out of the way now, restore the extension.
    for item in fix_extension_of_files
        mv(item[1], item[2])
    end
    print("Tidy\n")
end
#=
used like:
using Glob, Skraak, CSV
folders=glob("*/2023-11-02/")
for folder in folders
    cd(folder)
    try
        df = Skraak.file_metadata_to_df()
        CSV.write("/media/david/Pomona-3/Pomona-3/pomona_files_20231102.csv", df; append=true)
    catch
        @warn "error with $folder"
    end
    cd("/media/david/Pomona-3/Pomona-3/")
end

Then using duckdb cli from SSD:
duckdb AudioData.duckdb
show tables;
SELECT * FROM pomona_files;
COPY pomona_files FROM '/media/david/Pomona-3/Pomona-3/pomona_files_20231019.csv';
SELECT * FROM pomona_files;

Then backup with:
EXPORT DATABASE 'AudioDataBackup_2023-07-29';
.quit
Then quit and backup using cp on the db file
Then rsync ssd to usb
rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
=#
# Split the ICMT comment of a wav info chunk on spaces. Comments with 22 or more words are
# cut to the first 19, as in the original code.
function _audiomoth_comment_vector(binary_metadata)
    c_v = split(wav_info_read(binary_metadata)[:ICMT], " ")
    return length(c_v) < 22 ? c_v : c_v[1:19]
end

# Build a ZonedDateTime from words 3 (HH:MM:SS), 4 (dd/mm/yyyy) and 5 (time zone) of a
# comment vector. An empty time-zone offset is read as "+00".
function _audiomoth_zoned_datetime(comment_vector)
    date = split(comment_vector[4], "/")
    time = split(comment_vector[3], ":")
    tz = chop(comment_vector[5], head = 4, tail = 1)
    time_zone = isempty(tz) ? "+00" : tz
    #zdt = ZonedDateTime(parse(Int, date[3]), parse(Int, date[2]), parse(Int, date[1]), parse(Int, time[1]), parse(Int, time[2]), parse(Int, time[3]), tz"UTC")
    time_string =
        date[3] * "-" * date[2] * "-" * date[1] * "T" *
        time[1] * ":" * time[2] * ":" * time[3] * "." * "000" * time_zone
    return ZonedDateTime(time_string)
end

"""
    file_metadata_to_df()

Build a dataframe with one row per `*.WAV` file in the current working directory,
holding the wav metadata, the gpx location and the start/end of the recording period.
Takes no arguments: disk, location and trip date are the last three components of
`pwd()`.

This function needs raw audiomoth wav files and a gpx.
This function needs /media/david/SSD1/dawn_dusk.csv

Returns an empty dataframe (columns only) when the folder holds no WAV files. Throws an
`ErrorException` when the folder holds no gpx file. A file whose metadata cannot be read
is skipped with a warning.

using DataFrames, Dates, DelimitedFiles, DuckDB, Glob, JSON3, Random, SHA, TimeZones, WAV, XMLDict
"""
function file_metadata_to_df()
    # Initialise dataframe with columns: disk, location, trip_date, file, lattitude, longitude, start_recording_period_localt, finish_recording_period_localt, duration, sample_rate, zdt, ldt, moth_id, gain, battery, temperature
    df = DataFrame(
        disk = String[],
        location = String[],
        trip_date = String[],
        file = String[],
        latitude = Float64[],
        longitude = Float64[],
        start_recording_period_localt = String[],
        finish_recording_period_localt = String[],
        duration = Float64[],
        sample_rate = Int[],
        utc = String[],
        ldt = String[],
        moth_id = String[],
        gain = String[],
        battery = Float64[],
        temperature = Float64[],
        sha2_256 = String[],
        night = Bool[],
    )

    #Get WAV list for folder
    wav_list = glob("*.WAV") |> sort

    #Return empty df if nothing in the folder
    if length(wav_list) == 0
        return df
    end

    #Get path info from file system
    raw_path_vec = split(pwd(), "/")[end-2:end]
    disk = raw_path_vec[1]
    location = raw_path_vec[2]
    trip_date = raw_path_vec[3]

    #Get location, assumes 1 gpx is in the follder
    waypoint = glob("*.gpx")
    length(waypoint) != 1 && @error "no gpx file in $trip_date $location"
    # With no gpx at all there is nothing to read; fail with a clear message instead of
    # a BoundsError on waypoint[1]. With more than one, the first is used as before.
    isempty(waypoint) && error("no gpx file in $trip_date $location")
    loc = read(waypoint[1], String) |> xml_dict
    latitude = parse(Float64, (loc["gpx"]["wpt"][:lat]))
    longitude = parse(Float64, (loc["gpx"]["wpt"][:lon]))

    #Start of recording period
    _, _, _, binary_metadata_start = wavread(wav_list[1])
    zdt1 = _audiomoth_zoned_datetime(_audiomoth_comment_vector(binary_metadata_start))
    start_recording_period_localt =
        Dates.format(astimezone(zdt1, tz"Pacific/Auckland"), "yyyy-mm-dd HH:MM:SSzzzz")

    #End of recording period
    # The time zone is now taken from the LAST file's comment; it used to be read from
    # the first file's comment by mistake.
    _, _, _, binary_metadata_end = wavread(wav_list[end])
    zdt2 = _audiomoth_zoned_datetime(_audiomoth_comment_vector(binary_metadata_end))
    finish_recording_period_localt =
        Dates.format(astimezone(zdt2, tz"Pacific/Auckland"), "yyyy-mm-dd HH:MM:SSzzzz")

    dict = Skraak.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")

    #So I know what it is doing
    println(raw_path_vec)

    #Loop over file list
    for file in wav_list
        #print(file)
        try
            audio_data, sample_rate, _, binary_metadata = wavread(file)
            comment_vector = _audiomoth_comment_vector(binary_metadata)
            duration = Float64(length(audio_data) / sample_rate)

            preformatting_zdt = _audiomoth_zoned_datetime(comment_vector)
            #zdt = Dates.format(preformatting_zdt, "yyyy-mm-dd HH:MM:SSzzzz")
            preformatting_utc = astimezone(preformatting_zdt, tz"UTC")
            utc = Dates.format(preformatting_utc, "yyyy-mm-dd HH:MM:SSzzzz")
            preformatting_ldt = astimezone(preformatting_zdt, tz"Pacific/Auckland")
            ldt = Dates.format(preformatting_ldt, "yyyy-mm-dd HH:MM:SSzzzz")

            moth_id = comment_vector[8]
            gain = comment_vector[10]
            #index back from end because if V > 4.9 the wording chaaanges
            battery = parse(Float64, chop(comment_vector[end-4], tail = 1))
            temperature = parse(Float64, chop(comment_vector[end], tail = 2))

            # NOTE(review): this hashes the file NAME string, not the file contents
            # (that would be `open(sha256, file)`). Left unchanged because rows already
            # stored were presumably produced this way - confirm before changing.
            sha2_256 = bytes2hex(sha256(file))

            #assumes 15 minute file and calculates on half way time
            nt = Skraak.night(DateTime(preformatting_ldt + Minute(7) + Second(30)), dict)

            #Populate row to push into df
            row = [
                disk,
                location,
                trip_date,
                file,
                latitude,
                longitude,
                start_recording_period_localt,
                finish_recording_period_localt,
                duration,
                Int(sample_rate),
                utc,
                ldt,
                moth_id,
                gain,
                battery,
                temperature,
                sha2_256,
                nt,
            ]
            push!(df, row)
            print(".")
        catch e
            # The message used to interpolate `folder`, which does not exist in this
            # function, so any failing file turned into an UndefVarError.
            @warn "error with $location/$trip_date $file" exception = e
        end
    end
    return df
end

"""
    move_one_hour!(files::Vector{String}, operator)

Rename each file in `files` so that the timestamp encoded in its name is moved by one
hour: back with `operator = -`, forward with `operator = +`.

File names must look like `yyyymmdd_HHMMSS.WAV`. Every file is first renamed to
`yyyymmdd_HHMMSS.tmp` and only afterwards to `.WAV`. This avoids `force=true` with `mv`,
since a new file name may already exist and `mv` would stacktrace, leaving a big mess to
tidy up. Renamed files stay in the folder they were found in.

Args:

•  files (Vector{String}): A vector of strings where each element is
   a path to a file.
•  operator: `+` or `-`.

Returns: Nothing - This function only renames files and saves them.

I use this to turn the clock back at the end of daylight saving.

Assumes WAV files
"""
function move_one_hour!(files::Vector{String}, operator)
    @assert operator == (+) || operator == (-)
    fix_extension_of_files = Tuple{String,String}[]
    for old_file in files
        # Parse only the file name, so that an "_" in a directory name cannot confuse
        # the timestamp parsing.
        folder = dirname(old_file)
        # Extract the date and time of the original file using string chopping
        a = chop(basename(old_file), tail = 4)
        d, t = split(a, "_")
        ye = parse(Int64, d[1:4])
        mo = parse(Int64, d[5:6])
        da = parse(Int64, d[7:8])
        ho = parse(Int64, t[1:2])
        mi = parse(Int64, t[3:4])
        se = parse(Int64, t[5:6])
        dt = DateTime(ye, mo, da, ho, mi, se)
        #new_date = dt - Dates.Hour(1)
        new_date = operator(dt, Dates.Hour(1))
        # Must drop the WAV extension to avoiding force=true
        # with mv, since the new file name may already exist and mv
        # will stacktrace leaving a big mess to tidy up.
        base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")
        temp_file = joinpath(folder, base_file * ".tmp")
        # Tuple to tidy extensions later
        tidy = (temp_file, joinpath(folder, base_file * ".WAV"))
        mv(old_file, temp_file)
        push!(fix_extension_of_files, tidy)
        print(".")
    end
    # Second pass: every original name is out of the way now, restore the extension.
    for item in fix_extension_of_files
        mv(item[1], item[2])
    end
    print("Tidy\n")
end
using CSV, DataFrames, DataFramesMeta, Dates, DSP, Glob, JSON, Random, TimeZones, WAV, PNGFiles, Images #Plots
#import DataFramesMeta: @transform!, @subset!, @byrow, @passmissing

# Assumes run on linux
# Assumes function run from Pomona-1 or Pomona-2
#dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv",),
"""
    make_clips(preds_path::String, label::Int = 1, night::Bool = true, dawn_dusk_dict = dddict)

This function takes a preds.csv file and generates
file names, wav's, spectrograms etc to be reviewed.
It keeps rows whose `label` column equals `label`, keeps night-time files when
`night = true` (day-time files when `false`), clusters the detections of each file and
writes one wav and one png per cluster.
It calls night(); the dawn/dusk dict defaults to the global `dddict`.

It should be run from Pomona-1/, Pomona-2/ or Pomona-3/, assumes it is, it uses the path:
`preds_path` must look like location/trip_date/preds-file.
It saves wav and png files to a folder `Clips_<today>` in the current directory.
need to use a try/catch because the 2 assert functions throw an error to short circuit the function

using Glob, Skraak
predictions = glob("*/2023-09-11*/preds*")
predictions = glob("path/to/preds*")
for file in predictions #[1:6][7:12][13:18][19:24]
    try
        make_clips(file)
    catch x
        println(x)
    end
end

if needed to change headers in preds csv
shift, control, f in subl
file,start_time,end_time,label
/media/david/Pomona-2,<project filters>, preds-2023-02-27.csv
file,start_time,end_time,absent,present

using Glob, CSV, DataFrames, DataFramesMeta, Dates, DSP, Plots, Random, WAV
"""
function make_clips(
    preds_path::String,
    label::Int = 1,
    night::Bool = true,
    dawn_dusk_dict = dddict,
)
    # Assumes function run from Pomona-1 or Pomona-2
    location, trip_date, _ = split(preds_path, "/")

    # Load, validate and filter the predictions, then group the data frame by file
    df = DataFrame(CSV.File(preds_path))
    df = assert_not_empty(df, preds_path)
    df = rename_column!(df, "1.0", "label") #can remove now, needs to be label
    df = assert_detections_present(df, label, location, trip_date)
    df = filter_positives!(df, label)
    df = insert_datetime_column!(df)
    df = night_or_day!(df, dawn_dusk_dict, night) #true=night, false=day
    gdf = group_by_file!(df)

    # Output folder is the same for every clip of this run
    f = "Clips_$(today())"

    # Make clip and spectrogram
    for (k, v) in pairs(gdf)
        #file_name = chop(v.file[1], head = 2, tail = 4)
        file_name = path_to_file_string(v.file[1])
        start_times = v[!, :start_time] |> sort
        detections = cluster_detections(start_times)
        isempty(detections) && continue

        signal, freq = wavread("$location/$trip_date/$file_name.WAV")
        length_signal = length(signal)

        for detection in detections
            st, en = calculate_clip_start_end(detection, freq, length_signal)
            name = "$location-$trip_date-$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"
            mkpath(f)
            outfile = "$f/$name"

            sample = signal[Int(st):Int(en)]
            wavwrite(sample, "$outfile.wav", Fs = Int(freq))

            #plot = plot_spectrogram(sample, freq)
            #savefig(plot, "$outfile.png")
            image = get_image_from_sample(sample, freq)
            PNGFiles.save("$outfile.png", image)
        end
        print(".")
    end
    println("\ndone $location/$trip_date \n")
end

#######################################################################

"""
    assert_not_empty(df::DataFrame, preds_path::String)::DataFrame

Return `df` unless it has size `(0, 0)`; in that case log an error and throw
`ArgumentError`.
"""
function assert_not_empty(df::DataFrame, preds_path::String)::DataFrame
    if size(df) == (0, 0)
        msg = "Empty dataframe at $preds_path"
        @error msg
        # Previously the abort happened only because `nothing` could not be converted
        # to the declared DataFrame return type; now it is explicit.
        throw(ArgumentError(msg))
    end
    return df
end

"""
    rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame

Rename column `old_name` to `new_name` when it exists; return `df` either way.
"""
function rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame
    if old_name in names(df)
        rename!(df, old_name => new_name)
    end
    return df
end

# assumes kiwi, binary classifier from opensoundscape
# needed to remove ::String annotation for location, trip_date to make it work
"""
    assert_detections_present(df::DataFrame, label::Int, location, trip_date)::DataFrame

Return `df` when `label` occurs in `df.label`; otherwise log an error and throw
`ArgumentError`.
"""
function assert_detections_present(
    df::DataFrame,
    label::Int,
    location,
    trip_date,
)::DataFrame
    if !(label in levels(df.label))
        msg = "No detections for label = $label at $location/$trip_date"
        @error msg
        throw(ArgumentError(msg))
    end
    return df
end

# assumes kiwi
"""
    filter_positives!(df::DataFrame, label)::DataFrame

Keep only the rows of `df` whose `label` column equals `label` (in place).
"""
function filter_positives!(df::DataFrame, label)::DataFrame
    #filter!(row -> row.kiwi > 0, df)
    filter!(row -> row.label == label, df)
    return df
end

"""
    path_to_file_string(path)

Return the last `/`-separated component of `path`, cut at its first `.`.
"""
function path_to_file_string(path) #becareful path::String won't work: no method matching path_to_file_string(::InlineStrings.String31) line 70
    file_part = split(path, "/")[end]
    #f = chop(file, head = 2, tail = 4)
    return first(split(file_part, "."))
end

"""
    filename_to_datetime!(file)::DateTime

Parse the timestamp encoded in a file name. Names longer than 13 characters are read as
`yyyymmdd_HHMMSS`; shorter names are read as `ddmmyy_HHMMSS` with "20" inserted in front
of the two-digit year.
"""
function filename_to_datetime!(file)::DateTime
    #file_string = chop(file, head = 2, tail = 4)
    file_string = path_to_file_string(file)
    if length(file_string) > 13
        return DateTime(file_string, dateformat"yyyymmdd_HHMMSS")
    end
    return DateTime(
        (file_string[1:4] * "20" * file_string[5:end]),
        dateformat"ddmmyyyy_HHMMSS",
    )
end

"""
    insert_datetime_column!(df::DataFrame)::DataFrame

Add a `:DateTime` column parsed from the `:file` column of each row.
"""
function insert_datetime_column!(df::DataFrame)::DataFrame
    @transform!(df, @byrow :DateTime = filename_to_datetime!(String(:file)))
    return df
end

# calls night(), needs dawn_dusk_dict in local time format
"""
    night_or_day!(df, dawn_dusk_dict, night_time = true)::DataFrame

Keep only the night-time rows of `df` when `night_time` is true, only the day-time rows
otherwise, judged by `night(:DateTime, dawn_dusk_dict)`.
"""
function night_or_day!(
    df::DataFrame,
    dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}},
    night_time::Bool = true,
)::DataFrame
    if night_time
        @subset!(df, @byrow night(:DateTime, dawn_dusk_dict))
    else
        @subset!(df, @byrow !night(:DateTime, dawn_dusk_dict))
    end
    return df
end

"""
    group_by_file!(df::DataFrame)

Group `df` by its `:file` column.
"""
function group_by_file!(df::DataFrame)
    return groupby(df, :file)
end

"""
    cluster_detections(start_times)::Vector{Vector{Float64}}

Group sorted window start times into clusters in which consecutive times are at most
15 seconds apart, and return only the clusters with more than one detection. An empty
input gives an empty result.
"""
function cluster_detections(
    start_times::AbstractVector{<:Real},
)::Vector{Vector{Float64}}
    clusters = Vector{Float64}[]
    isempty(start_times) && return clusters
    current = Float64[first(start_times)]
    for time in Iterators.drop(start_times, 1)
        if time - last(current) <= 15.0
            push!(current, time)
        else
            push!(clusters, current)
            current = Float64[time]
        end
    end
    push!(clusters, current)
    return filter(x -> length(x) > 1, clusters)
end

# assumes it is operating on 5 second clips
"""
    calculate_clip_start_end(detection, freq, length_signal)::Tuple{Float64,Float64}

Return the first and last sample index of the clip covering `detection`: from the first
start time (sample 1 when that time is 0) to 5 seconds after the last start time,
limited to `length_signal`.
"""
function calculate_clip_start_end(
    detection::Vector{Float64},
    freq::Float32,
    length_signal::Int64,
)::Tuple{Float64,Float64}
    st = first(detection) > 0 ? first(detection) * freq : 1
    clip_end = (last(detection) + 5.0) * freq
    en = clip_end <= length_signal ? clip_end : length_signal
    return st, en
end

#= Deprecated use get_image_from_sample()
function plot_spectrogram(
    sample::Vector{Float64},
    freq::Float32,
)::Plots.Plot{Plots.GRBackend}
    S = DSP.spectrogram(sample[:, 1], 400, 2; fs = convert(Int, freq))
    plot = Plots.heatmap(
        S.time,
        S.freq,
        pow2db.(S.power),
        size = (448, 448),
        showaxis = false,
        ticks = false,
        legend = false,
        thickness_scaling = 0,
    )
    return plot
end
=#

# f neeeds to be an Int
"""
    get_image_from_sample(sample, f)

Return a 224×224 greyscale-valued RGB spectrogram image of `sample` at sample rate `f`:
power in dB, shifted and scaled to 0..1, flipped along the first dimension.
Zero power values are replaced by the smallest non-zero power before taking the log.
"""
function get_image_from_sample(sample, f) #sample::Vector{Float64}
    S = DSP.spectrogram(sample, 400, 2; fs = convert(Int, f))
    i = S.power
    if minimum(i) == 0.0
        l = i |> vec |> unique |> sort
        replace!(i, 0.0 => l[2])
    end
    db = DSP.pow2db.(i)
    shifted = db .+ abs(minimum(db))
    scaled = shifted ./ maximum(shifted)
    flipped = reverse(scaled, dims = 1)
    return imresize(RGB.(flipped), 224, 224)
end

"""
    construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}

Takes dawn dusk.csv and returns a dict to be consumed by night(), mapping each `Date`
column value to the `(Dawn, Dusk)` pair of the same row.
~/dawn_dusk.csv
At present it goes from the start of 2019 to the end of 2024
The csv contains local time sunrise and sunset
I use this to decide if a file with a local time encoded name was recorded at night

dict = construct_dawn_dusk_dict("/Volumes/SSD1/dawn_dusk.csv")
dict = Utility.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")

using CSV, DataFrames
"""
function construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
    sun = DataFrame(CSV.File(file))
    # Zip lazily rather than materialising one tuple with an element per row.
    return Dict(zip(sun.Date, zip(sun.Dawn, sun.Dusk)))
end

"""
    night(call_time::DateTime, dict::Dict{Date, Tuple{DateTime, DateTime}})::Bool

Returns true if time is at night, ie between civil twilights, dusk to dawn.
Consumes dict from construct_dawn_dusk_dict

time=DateTime("2021-11-02T21:14:35",dateformat"yyyy-mm-ddTHH:MM:SS")
Utility.night(time, dict)
"""
function night(call_time::DateTime, dict::Dict{Date,Tuple{DateTime,DateTime}})::Bool
    dawn, dusk = dict[Date(call_time)]
    return call_time <= dawn || call_time >= dusk
end

#######################################################################
#INBETWEEN STEP: use secondary model to sort clips, move clips into D, F, M, N, and hand classify, generate actual.csv.

"""
    move_clips_to_folders(df::DataFrame)

Takes a 2 column dataframe: file, label
file must be list of png images, assumes wav's are there too
will move mp4's from video folder if they are present

Each png and its wav are moved into a folder named after the row's label. A failure for
one row is logged and the remaining rows are still processed.
"""
function move_clips_to_folders(df::DataFrame)
    p = glob("*.png")
    w = glob("*.[W,w][A,a][V,v]")
    @assert (split(first(df.file), ".")[end] == "png") "df.file must be a list of png's"
    @assert issetequal(df.file, p) "All png files in dataframe must be present in folder"
    @assert issetequal(chop.(df.file, head = 0, tail = 4), chop.(w, head = 0, tail = 4)) "There must be a wav for every png in the dataframe"
    for row in eachrow(df)
        src = row.file
        dst = "$(row.label)/$(row.file)"
        mkpath("$(row.label)/")
        try
            mv(src, dst)
            mv(chop(src, tail = 3) * "wav", chop(dst, tail = 3) * "wav")
            # Was `isdir(video)`: an undefined variable, so the video branch always
            # raised UndefVarError (caught below) and mp4's were never moved.
            if isdir("video")
                mkpath("video/$(row.label)/")
                mv(
                    "video/" * chop(src, tail = 3) * "mp4",
                    "video/" * chop(dst, tail = 3) * "mp4",
                )
            end
        catch e
            @info e
        end
    end
end

#=
actual.csv must be list of qualified png file names:
D/C05-2023-04-15-20230219_223000-380-470.png

using Glob, DataFrames, CSV
a=glob("[M,F,D,N]/*.png")
df = DataFrame(file=a)
CSV.write("actual_mfdn.csv", df)

make a folder D,F,M,N
mkpath.(["D", "F", "M", "N"])

move wavs to match pngs
df=DataFrame(CSV.File("actual_mfdn.csv"))
for row in eachrow(df)
    src=split(row.file, "/")[2]
    dst=row.file
    mv(src, dst)
    mv(chop(src, tail=3)*"wav", chop(dst, tail=3)*"wav")
end
=#

#run from parent folder of label folders
#saves actual.csv and returns a df
#labels=["D", "F", "M", "N"]
"""
    actual_from_folders(labels::Vector{String})::DataFrame

Collect `label/*.png` for every label, write the list to actual.csv in the current
directory and return it as a one-column (`file`) dataframe.
"""
function actual_from_folders(labels::Vector{String})::DataFrame
    paths = String[]
    for l in labels
        append!(paths, glob("$l/*.png"))
    end
    df = DataFrame(file = paths)
    CSV.write("actual.csv", df)
    return df
end

#=
df=aggregate_labels()
audiodata_db(df, "pomona_labels_20230418") NOT_WORKING maybe titles

to use cli, need to remove header row
duckdb /media/david/SSD1/AudioData.duckdb
COPY pomona_labels_20230418 FROM 'DB_Labels/pomona_labels_2023-11-02.csv';
COPY pomona_files FROM 'DB_Files/pomona_files_20231102.csv';

Then backup with:
EXPORT DATABASE 'AudioDataBackup_2023-11-14';
.quit
Then quit and backup using cp on the db file, dated copy
Then rsync ssd to usb
rsync -avzr --delete /media/david/SSD1/ /media/david/USB/

note: run on mac
cd skraak.kiwi
julia-1.9
using Franklin
serve()
=#

# New one, without noise and distance, does not do :box anymore therefore requires new db schema
"""
    aggregate_labels(actual="actual.csv", outfile="labels.csv", hdr=false)

file
[D, F, M, N]/C05-2023-04-15-20230219_223000-380-470.png

This function takes the csv output from my hand classification and ouputs a df, and csv for insertion into AudioData.duckdb using the duckdb cli or using DFto.audiodata_db()

assumes run from Clips_xxxx-xx-xx folder and that "actual.csv" present if not specified.
`hdr` controls whether the outfile gets a header row.
returns a dataframe

using CSV, DataFrames, DataFramesMeta
"""
function aggregate_labels(
    actual::String = "actual.csv",
    outfile::String = "labels.csv",
    hdr::Bool = false, #header for outfile
)::DataFrame
    df = DataFrame(CSV.File(actual))

    # location, f, start_time, end_time
    @transform!(df, @byrow :location = split(split(:file, "/")[2], "-")[1])
    @transform!(df, @byrow :f = split(split(:file, "/")[2], "-")[5] * ".WAV")
    @transform!(df, @byrow :start_time = split(split(:file, "/")[2], "-")[end-1])
    @transform!(df, @byrow :end_time = chop(split(split(:file, "/")[2], "-")[end], tail=4))
    #@transform!( df, @byrow :box = "[$(split(split(:file, "/")[2], "-")[end-1]), $(chop(split(split(:file, "/")[2], "-")[end], tail=4))]")

    # male, female, duet, not
    @transform!(df, @byrow @passmissing :male = split(:file, "/")[1] == "M" ? true : false)
    @transform!(
        df,
        @byrow @passmissing :female = split(:file, "/")[1] == "F" ? true : false
    )
    @transform!(df, @byrow @passmissing :duet = split(:file, "/")[1] == "D" ? true : false)
    @transform!(
        df,
        @byrow @passmissing :not_kiwi =
            split(:file, "/")[1] in ["KA", "KE", "N", "Q"] ? true : false
    )

    # other_label
    @transform!(
        df,
        @byrow @passmissing :other_label =
            split(:file, "/")[1] in ["KA", "KE", "Q"] ? split(:file, "/")[1] : missing
    )

    # remove unwanted cols, rename f to file
    select!(df, Not([:file]))
    rename!(df, :f => :file)

    CSV.write(outfile, df; header=hdr)
    return df
end

"""
    audiodata_db(df::DataFrame, table::String)

Use to upload labels to AudioData.duckdb
Takes a dataframe and inserts into AudioData.db table.
The database is opened at /media/david/SSD1/AudioData.duckdb on linux and
/Volumes/SSD1/AudioData.duckdb otherwise; the connection is closed even when the insert
fails.

audiodata_db(df, "pomona_labels_20230418")

using DataFrames, DBInterface, DuckDB, Random
"""
function audiodata_db(df::DataFrame, table::String)
    db_path = if Sys.islinux()
        "/media/david/SSD1/AudioData.duckdb"
    else
        "/Volumes/SSD1/AudioData.duckdb"
    end
    con = DBInterface.connect(DuckDB.DB, db_path)
    try
        temp_name = randstring(6)
        DuckDB.register_data_frame(con, df, temp_name)
        # NOTE(review): `table` is interpolated into the SQL text, so it must come from
        # a trusted caller.
        DBInterface.execute(
            con,
            """
            INSERT
            INTO $table
            SELECT *
            FROM '$temp_name'
            """,
        )
    finally
        DBInterface.close!(con)
    end
end
# Only moves WAVs not already there in dataset
# converts WAVs to flac to save space, file metadata will not survive
# requires columns :location, :file, :start_time, :end_time
# :file is the file name, :location is the actual recorder location eg "C05"
# run where the raw data is
# will find file in folder structure location/trip_date/file
# constructs dataset at output_path
# assumes file name has one . for extension only
"""
Only moves WAVs not already there in dataset
converts WAVs to flac to save space, file metadata will not survive
requires columns :location, :file, :start_time, :end_time
:file is the file name, :location is the actual recorder location eg "C05"
run where the raw data is
will find file in folder structure location/trip_date/file
constructs dataset at output_path
assumes file name has one . for extension only
"""