# Train.jl
# https://github.com/FluxML/model-zoo/blob/master/tutorials/transfer_learning/transfer_learning.jl
# This trains best on my data, but only with -t 4
# Don't forget the temp env
using Random: shuffle!
using Random: seed!
import Base: length
import Base: getindex
using Images
using Flux
using CUDA
using Metalhead
using Noise
using Glob
using BSON: @save
using Dates
#using CSV
using DataFrames
using FreqTables
using JLD2
using Logging, LoggingExtras
imgs = glob("2023-09-*/*/*/[N,K]/*.png") #from SSD2
seed!(1234);
shuffle!(imgs)
#CSV.write("files.csv", DataFrame(file=imgs))
device = CUDA.functional() ? gpu : cpu
struct ImageContainer{T<:Vector}
img::T
end
struct ValidationImageContainer{T<:Vector}
img::T
end
data = ImageContainer(imgs)
val_data = ValidationImageContainer(imgs)
length(data::ImageContainer) = length(data.img)
length(data::ValidationImageContainer) = length(data.img)
const im_size = (224, 224)
name_to_idx = Dict{String,Int32}("K" => 1, "N" => 2)
function getindex(data::ImageContainer{Vector{String}}, idx::Int)
path = data.img[idx]
img =
    Images.load(path) |>
    x -> Images.imresize(x, 224, 224) |>
    x -> Images.RGB.(x) |>
    x -> Noise.add_gauss(x, (rand() * 0.2)) |> # random gaussian noise augmentation
    x -> apply_mask(x, 3, 3, 12) |> # random grey cutout masks
    x -> collect(channelview(float32.(x))) |> # CHW Float32 array
    x -> permutedims(x, (3, 2, 1)) # WHC layout for Flux (batches are WHCN)
y = name_to_idx[(split(path, "/")[end-1])]
return img, y
end
function getindex(data::ValidationImageContainer{Vector{String}}, idx::Int)
path = data.img[idx]
img =
    Images.load(path) |>
    x -> Images.imresize(x, 224, 224) |>
    x -> Images.RGB.(x) |>
    x -> collect(channelview(float32.(x))) |>
    x -> permutedims(x, (3, 2, 1)) # no augmentation for validation images
y = name_to_idx[(split(path, "/")[end-1])]
return img, y
end
# assumes 224px square images
function apply_mask(
img::Array{RGB{N0f8},2},
max_number::Int = 3,
min_size::Int = 3,
max_size::Int = 22,
)
# horizontal
for range in get_random_ranges(max_number, min_size, max_size)
img[range, :] .= RGB{N0f8}(0.7, 0.7, 0.7)
end
# vertical
for range in get_random_ranges(max_number, min_size, max_size)
img[:, range] .= RGB{N0f8}(0.7, 0.7, 0.7)
end
return img
end
# assumes 224px square images
function get_random_ranges(max_number::Int, min_size::Int, max_size::Int)
number = rand(0:max_number)
ranges = []
while length(ranges) < number
start = rand(1:224)
size = rand(min_size:max_size)
if start + size > 224
continue
end
push!(ranges, start:start+size)
end
return ranges
end
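# e.g. get_random_ranges(3, 3, 22) might return [17:27, 140:151]
# (zero to three ranges per call; starts and sizes are random, output illustrative)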
# define DataLoaders
const batch_size = 64
const train_test_split = 0.95
const ceiling = length(data) ÷ batch_size * batch_size
const train_test_index =
ceiling ÷ batch_size * train_test_split |> round |> x -> x * batch_size |> Int
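# Worked example (hypothetical counts): with 10_000 images and batch_size = 64,
# ceiling = 156 * 64 = 9_984 and train_test_index = round(156 * 0.95) * 64 = 9_472,
# so 9_472 images go to training and the remaining 512 are held out for testing.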
train = Flux.DataLoader(
ImageContainer(imgs[1:train_test_index]);
batchsize = batch_size,
collate = true,
parallel = true,
)
device == gpu ? train = CuIterator(train) : nothing
train_sample = Flux.DataLoader(
ValidationImageContainer(imgs[1:(ceiling-train_test_index)]);
batchsize = batch_size,
collate = true,
parallel = true,
)
device == gpu ? train_sample = CuIterator(train_sample) : nothing
test = Flux.DataLoader(
ValidationImageContainer(imgs[train_test_index+1:ceiling]);
batchsize = batch_size,
collate = true,
parallel = true,
)
device == gpu ? test = CuIterator(test) : nothing
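# Transfer learning: Metalhead's ResNet(18).layers is a Chain of (convolutional
# backbone, classifier head); fst[1] keeps the pretrained backbone and lst swaps
# the 1000-class ImageNet head for a Dense(512 => 2) over the K/N classes.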
fst = Metalhead.ResNet(18, pretrain = true).layers
# BEWARE NUMBER CLASSES
lst = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => 2));
model = Flux.Chain(fst[1], lst) |> device
function eval_f(m, d)
good = 0
count = 0
pred = []
actual = []
for (x, y) in d
p = Flux.onecold(m(x))
good += sum(p .== y)
count += length(y)
append!(pred, p)
append!(actual, y)
end
accuracy = round(good / count, digits = 4)
confusion_matrix =
freqtable(DataFrame(targets = actual, predicts = pred), :targets, :predicts)
return accuracy, confusion_matrix
end
# BEWARE NUMBER CLASSES
function train_epoch!(model; opt, train)
Flux.train!(model, train, opt) do m, x, y
Flux.Losses.logitcrossentropy(m(x), Flux.onehotbatch(y, 1:2))
end
end
opt = Flux.setup(Flux.Optimisers.Adam(1e-5), model);
logger = FileLogger("logfile.txt"; append = true)
@time metric_eval, v_confusion_matrix = eval_f(model, test)
#with_logger(logger) do
@info "eval" accuracy = metric_eval
@info "eval" v_confusion_matrix
#end
a = 0.0
for iter in 1:15
println("")
println("Epoch: $iter")
@time train_epoch!(model; opt, train)
@time metric_train, t_confusion_matrix = eval_f(model, train_sample)
#with_logger(logger) do
@info "Epoch: " iter
@info "train" accuracy = metric_train
@info "train" t_confusion_matrix
#end
@time metric_eval, v_confusion_matrix = eval_f(model, test)
#with_logger(logger) do
@info "test" accuracy = metric_eval
@info "test" v_confusion_matrix
#end
metric_eval > a && begin
global a = metric_eval # explicit global so the script also works when run non-interactively
let _model = cpu(model)
jldsave(
"model_K1-4_CPU_epoch-$iter-$metric_eval-$(today()).jld2";
model_state = Flux.state(_model),
)
#BSON.@save "model_K1-3_CPU_epoch-$iter-$metric_eval-$(now()).bson" _model
#with_logger(logger) do
@info "Saved a best_model"
#end
end
end
end
# Predict.jl
using WAV, DSP, Images, ThreadsX, Dates, DataFrames, CSV, Flux, CUDA, Metalhead, JLD2, FLAC
export predict
"""
predict(glob_pattern::String, model::String)
This function takes a glob pattern for folders to run over, and a model path.
It saves results in a csv for each folder, similar to opensoundscape.
Args:
• glob pattern (folder/)
• model path
Returns: Nothing - This function saves csv files.
I use this function to find kiwi in new data gathered on a trip.
Note:
Don't forget temp env, julia -t 4
From Pomona-3/Pomona-3/
Use like:
using Skraak
glob_pattern = "*/2023-10-19/" #from SSD1
model = "/media/david/SSD1/model_K1-3_CPU_epoch-10-0.9965-2023-10-18T17:32:36.747.jld2"
predict(glob_pattern, model)
"""
function predict(glob_pattern::String, model::String)
model = load_model(model) |> device
folders = glob(glob_pattern)
@info "Folders: $folders"
for folder in folders
@info "Working on: $folder"
predict_folder(folder, model)
end
end
function predict(folders::Vector{String}, model::String)
model = load_model(model) |> device
@info "Folders: $folders"
for folder in folders
@info "Working on: $folder"
predict_folder(folder, model)
end
end
#~~~~~ The guts ~~~~~#
device = CUDA.functional() ? gpu : cpu
function get_image_for_inference(sample, f)
image =
#! format: off
get_image_from_sample(sample, f) |>
x -> collect(channelview(float32.(x))) |>
x -> permutedims(x, (3, 2, 1))
#! format: on
return image
end
function get_images(file::String, increment::Int = 5, divisor::Int = 2) #5s sample, 2.5s hop
ext = split(file, ".")[end]
@assert ext in ["WAV", "wav", "flac"] "Unsupported audio file type, requires wav or flac."
if ext in ["WAV", "wav"]
signal, freq = wavread(file)
else
signal, freq = load(file)
end
if freq > 16000
signal = DSP.resample(signal, 16000.0f0 / freq; dims = 1)
freq = 16000
end
f = convert(Int, freq)
inc = increment * f
hop = f * increment ÷ divisor #need guaranteed Int, maybe not anymore, refactor
split_signal = DSP.arraysplit(signal[:, 1], inc, hop)
raw_images = ThreadsX.map(x -> get_image_for_inference(x, f), split_signal)
n_samples = length(raw_images)
return raw_images, n_samples
end
function get_images_time_from_wav(file::String, increment::Int = 5, divisor::Int = 2)
raw_images, n_samples = get_images(file::String, increment, divisor)
images = reshape_images(raw_images, n_samples)
start_time = 0:(increment/divisor):(n_samples-1)*(increment/divisor)
end_time = increment:(increment/divisor):(n_samples+1)*(increment/divisor)
time = collect(zip(start_time, end_time))
return images, time
end
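# e.g. increment = 5, divisor = 2, n_samples = 3 gives
# time = [(0.0, 5.0), (2.5, 7.5), (5.0, 10.0)]: 5 s windows with a 2.5 s hop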
function reshape_images(raw_images, n_samples)
images =
#! format: off
hcat(raw_images...) |>
x -> reshape(x, (224, 224, 3, n_samples))
#! format: on
return images
end
function predict_file(file::String, folder::String, model)
#check form of opensoundscape preds.csv and needed by my make_clips
@info "File: $file"
@time images, time = get_images_time_from_wav(file)
data = images |> device
@time predictions = Flux.onecold(model(data))
f = (repeat(["$file"], length(time)))
df = DataFrame(
:file => f,
:start_time => first.(time),
:end_time => last.(time),
:label => predictions,
)
return df
end
function predict_folder(folder::String, model)
wav = glob("$folder/*.[W,w][A,a][V,v]")
flac = glob("$folder/*.flac")
files = cat(wav, flac; dims = 1)
@info "$(length(files)) files in $folder"
df = DataFrame(
file = String[],
start_time = Float64[],
end_time = Float64[],
label = Int[],
)
save_path = "$folder/preds-$(today()).csv"
CSV.write("$save_path", df)
for file in files
df = predict_file(file, folder, model)
CSV.write("$save_path", df, append = true)
end
end
# see load_model() from train, different input types
function load_model(model_path::String)
model_state = JLD2.load(model_path, "model_state")
model_classes = length(model_state[1][2][1][3][2])
f = Metalhead.ResNet(18, pretrain = false).layers
l = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => model_classes))
model = Flux.Chain(f[1], l)
Flux.loadmodel!(model, model_state)
return model
end
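# Usage sketch (the path is illustrative):
# model = load_model("model_K1-4_CPU_epoch-10-0.9965-2023-10-18.jld2") |> device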
#=
function load_bson(model_path::String)
BSON.@load model_path model
end
=#
############### PYTHON Opensoundscape ################
#=
# Don't forget conda activate opensoundscape
# Don't forget to modify file names and glob pattern
# Run script in Pomona-2, hard code trip date in the glob
# python /media/david/USB/Skraak/src/predict.py
from opensoundscape.torch.models.cnn import load_model
import opensoundscape
import torch
from pathlib import Path
import numpy as np
import pandas as pd
from glob import glob
import os
from datetime import datetime
model = load_model('/home/david/best.model')
# folders = glob('./*/2023-?????/')
folders = glob('./*/*/')
for folder in folders:
    os.chdir(folder)
    print(folder, ' start: ', datetime.now())
    # Beware, secretary island files are .wav
    field_recordings = glob('./*.WAV')
    scores, preds, unsafe = model.predict(
        field_recordings,
        binary_preds = 'single_target',
        overlap_fraction = 0.5,
        batch_size = 128,
        num_workers = 12)
    scores.to_csv("scores-2023-11-07.csv")
    preds.to_csv("preds-2023-11-07.csv")
    os.chdir('../..')
    print(folder, ' done: ', datetime.now())
    print()
    print()
=#
using CSV,
DataFrames,
Dates,
DBInterface,
DSP,
DuckDB,
Glob,
HTTP,
Images,
JSON,
PNGFiles,
Random,
SHA,
TimeZones,
WAV,
XMLDict
export move_one_hour!,
check_png_wav_both_present,
file_metadata_to_df,
resize_image!,
twilight_tuple_local_time,
utc_to_nzdt!
"""
move_one_hour!(files::Vector{String}, operator)
This function takes a vector of file paths and renames each file in the
vector by changing the name of the file to the name of the file created one
hour before the original file. The new name format is yyyymmdd_HHMMSS.tmp,
which represents the time stamp of the original file minus (or plus) one hour. This
function avoids force=true with mv, since new file names may already exist
and mv will stacktrace leaving a big mess to tidy up.
Args:
• files (Vector{String}): A vector of strings where each element is
a path to a file.
Returns: Nothing - This function only renames files and saves them.
I use this to turn the clock back at the end of daylight saving.
"""
#Assumes WAV files
function move_one_hour!(files::Vector{String}, operator)
@assert operator == (+) || operator == (-)
fix_extension_of_files = []
for old_file in files
# Extract the date and time of the original file using string chopping
a = chop(old_file, tail = 4)
d, t = split(a, "_")
ye = parse(Int64, d[1:4])
mo = parse(Int64, d[5:6])
da = parse(Int64, d[7:8])
ho = parse(Int64, t[1:2])
mi = parse(Int64, t[3:4])
se = parse(Int64, t[5:6])
export check_png_wav_both_present,
resize_image!, twilight_tuple_local_time, move_one_hour!, utc_to_nzdt!
#new_date = dt - Dates.Hour(1)
new_date = operator(dt, Dates.Hour(1))
# Must drop the WAV extension to avoiding force=true
# with mv, since the new file name may already exist and mv
# will stacktrace leaving a big mess to tidy up.
base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")
temp_file = base_file * ".tmp"
# Tuple to tidy extensions later
tidy = (temp_file, base_file * ".WAV")
mv(old_file, temp_file)
push!(fix_extension_of_files, tidy)
print(".")
end
for item in fix_extension_of_files
mv(item[1], item[2])
end
print("Tidy\n")
end
#=
used like:
using Glob, Skraak, CSV
folders=glob("*/2023-11-02/")
for folder in folders
cd(folder)
try
df = Skraak.file_metadata_to_df()
CSV.write("/media/david/Pomona-3/Pomona-3/pomona_files_20231102.csv", df; append=true)
catch
@warn "error with $folder"
end
cd("/media/david/Pomona-3/Pomona-3/")
end
Then using duckdb cli from SSD:
duckdb AudioData.duckdb
show tables;
SELECT * FROM pomona_files;
COPY pomona_files FROM '/media/david/Pomona-3/Pomona-3/pomona_files_20231019.csv';
SELECT * FROM pomona_files;
Then backup with:
EXPORT DATABASE 'AudioDataBackup_2023-07-29';
.quit
Then quit and backup using cp on the db file
Then rsync ssd to usb
rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
=#
"""
file_metadata_to_df()
This function takes a file name, extracts wav metadata, gpx location, recording period start/end and returns a dataframe.
This function needs raw audiomoth wav files and a gpx.
This function needs /media/david/SSD1/dawn_dusk.csv
using DataFrames, Dates, DelimitedFiles, DuckDB, Glob, JSON3, Random, SHA, TimeZones, WAV, XMLDict
"""
function file_metadata_to_df()
# Initialise dataframe with columns: disk, location, trip_date, file, latitude, longitude, start_recording_period_localt, finish_recording_period_localt, duration, sample_rate, utc, ldt, moth_id, gain, battery, temperature, sha2_256, night
df = DataFrame(
disk = String[],
location = String[],
trip_date = String[],
file = String[],
latitude = Float64[],
longitude = Float64[],
start_recording_period_localt = String[],
finish_recording_period_localt = String[],
duration = Float64[],
sample_rate = Int[],
utc = String[],
ldt = String[],
moth_id = String[],
gain = String[],
battery = Float64[],
temperature = Float64[],
sha2_256 = String[],
night = Bool[],
)
#Get WAV list for folder
wav_list = glob("*.WAV") |> sort
#Return empty df if nothing in the folder
if length(wav_list) == 0
return df
end
#Get path info from file system
raw_path_vec = split(pwd(), "/")[end-2:end]
disk = raw_path_vec[1]
location = raw_path_vec[2]
trip_date = raw_path_vec[3]
#Get location, assumes 1 gpx is in the folder
waypoint = glob("*.gpx")
length(waypoint) != 1 && @error "no gpx file in $trip_date $location"
loc = read(waypoint[1], String) |> xml_dict
latitude = parse(Float64, (loc["gpx"]["wpt"][:lat]))
longitude = parse(Float64, (loc["gpx"]["wpt"][:lon]))
#Start of recording period
_, _, _, binary_metadata_start = wavread(wav_list[1])
c_v_s = split(wav_info_read(binary_metadata_start)[:ICMT], " ")
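# The ICMT comment is split on spaces; an AudioMoth comment looks roughly like
# "Recorded at 22:00:00 15/04/2023 (UTC) by AudioMoth 24526B465DB29641 at medium
# gain setting while battery state was 4.2V and temperature was 12.3C." (illustrative values)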
comment_vector_start = length(c_v_s) < 22 ? c_v_s : c_v_s[1:19]
date_start = split(comment_vector_start[4], "/")
time_start = split(comment_vector_start[3], ":")
tz_start = chop(comment_vector_start[5], head = 4, tail = 1)
time_zone_start = isempty(tz_start) ? "+00" : tz_start
#zdt1 = ZonedDateTime(parse(Int, date_start[3]), parse(Int, date_start[2]), parse(Int, date_start[1]), parse(Int, time_start[1]), parse(Int, time_start[2]), parse(Int, time_start[3]), tz"UTC")
time_string_start = "$(date_start[3])-$(date_start[2])-$(date_start[1])T$(time_start[1]):$(time_start[2]):$(time_start[3]).000$time_zone_start"
zdt1 = ZonedDateTime(time_string_start)
start_recording_period_localt =
Dates.format(astimezone(zdt1, tz"Pacific/Auckland"), "yyyy-mm-dd HH:MM:SSzzzz")
#End of recording period
_, _, _, binary_metadata_end = wavread(wav_list[end])
c_v_e = split(wav_info_read(binary_metadata_end)[:ICMT], " ")
comment_vector_end = length(c_v_e) < 22 ? c_v_e : c_v_e[1:19]
date_end = split(comment_vector_end[4], "/")
time_end = split(comment_vector_end[3], ":")
tz_end = chop(comment_vector_end[5], head = 4, tail = 1)
time_zone_end = isempty(tz_end) ? "+00" : tz_end
#zdt2 = ZonedDateTime(parse(Int, date_end[3]), parse(Int, date_end[2]), parse(Int, date_end[1]),parse(Int, time_end[1]), parse(Int, time_end[2]), parse(Int, time_end[3]), tz"UTC")
time_string_end = "$(date_end[3])-$(date_end[2])-$(date_end[1])T$(time_end[1]):$(time_end[2]):$(time_end[3]).000$time_zone_end"
zdt2 = ZonedDateTime(time_string_end)
finish_recording_period_localt =
Dates.format(astimezone(zdt2, tz"Pacific/Auckland"), "yyyy-mm-dd HH:MM:SSzzzz")
dict = Skraak.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")
#So I know what it is doing
println(raw_path_vec)
#Loop over file list
for file in wav_list
#print(file)
try
audio_data, sample_rate, _, binary_metadata = wavread(file)
c_v = split(wav_info_read(binary_metadata)[:ICMT], " ")
comment_vector = length(c_v) < 22 ? c_v : c_v[1:19]
duration = Float64(length(audio_data) / sample_rate)
date = split(comment_vector[4], "/")
time = split(comment_vector[3], ":")
tz = chop(comment_vector[5], head = 4, tail = 1)
time_zone = isempty(tz) ? "+00" : tz
#preformatting_zdt = ZonedDateTime(parse(Int, date[3]), parse(Int, date[2]), parse(Int, date[1]), parse(Int, time[1]), parse(Int, time[2]), parse(Int, time[3]), tz"UTC")
time_string = "$(date[3])-$(date[2])-$(date[1])T$(time[1]):$(time[2]):$(time[3]).000$time_zone"
preformatting_zdt = ZonedDateTime(time_string)
#zdt = Dates.format(preformatting_zdt, "yyyy-mm-dd HH:MM:SSzzzz")
preformatting_utc = astimezone(preformatting_zdt, tz"UTC")
utc = Dates.format(preformatting_utc, "yyyy-mm-dd HH:MM:SSzzzz")
preformatting_ldt = astimezone(preformatting_zdt, tz"Pacific/Auckland")
ldt = Dates.format(preformatting_ldt, "yyyy-mm-dd HH:MM:SSzzzz")
moth_id = comment_vector[8]
gain = comment_vector[10]
#index back from end because if V > 4.9 the wording changes
battery = parse(Float64, chop(comment_vector[end-4], tail = 1))
temperature = parse(Float64, chop(comment_vector[end], tail = 2))
sha2_256 = bytes2hex(sha256(file))
#assumes a 15 minute file and tests night at the halfway time
nt = Skraak.night(DateTime(preformatting_ldt + Minute(7) + Second(30)), dict)
#Populate row to push into df
row = [
disk,
location,
trip_date,
file,
latitude,
longitude,
start_recording_period_localt,
finish_recording_period_localt,
duration,
Int(sample_rate),
utc,
ldt,
moth_id,
gain,
battery,
temperature,
sha2_256,
nt,
]
push!(df, row)
print(".")
catch
@warn "error with $folder $file"
end
end
return df
end
"""
move_one_hour!(files::Vector{String}, operator)
This function takes a vector of file paths and renames each file in the
vector by changing the name of the file to the name of the file created one
hour before the original file. The new name format is yyyymmdd_HHMMSS.tmp,
which represents the time stamp of the original file minus (or plus) one hour. This
function avoids force=true with mv, since new file names may already exist
and mv will stacktrace leaving a big mess to tidy up.
Args:
• files (Vector{String}): A vector of strings where each element is
a path to a file.
Returns: Nothing - This function only renames files and saves them.
I use this to turn the clock back at the end of daylight saving.
Assumes WAV files
"""
function move_one_hour!(files::Vector{String}, operator)
@assert operator == (+) || operator == (-)
fix_extension_of_files = []
for old_file in files
# Extract the date and time of the original file using string chopping
a = chop(old_file, tail = 4)
d, t = split(a, "_")
ye = parse(Int64, d[1:4])
mo = parse(Int64, d[5:6])
da = parse(Int64, d[7:8])
ho = parse(Int64, t[1:2])
mi = parse(Int64, t[3:4])
se = parse(Int64, t[5:6])
dt = DateTime(ye, mo, da, ho, mi, se)
#new_date = dt - Dates.Hour(1)
new_date = operator(dt, Dates.Hour(1))
# Must drop the WAV extension to avoid force=true
# with mv, since the new file name may already exist and mv
# will stacktrace leaving a big mess to tidy up.
base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")
temp_file = base_file * ".tmp"
# Tuple to tidy extensions later
tidy = (temp_file, base_file * ".WAV")
mv(old_file, temp_file)
push!(fix_extension_of_files, tidy)
print(".")
end
for item in fix_extension_of_files
mv(item[1], item[2])
end
print("Tidy\n")
end
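#=
Usage sketch, assuming AudioMoth-style names like 20230415_213000.WAV:
using Glob
files = glob("*.WAV")
move_one_hour!(files, -) # turn the clock back one hour at the end of daylight saving
=#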
"""
using CSV, DataFrames, DataFramesMeta, Dates, DSP, Glob, JSON, Random, TimeZones, WAV, PNGFiles, Images #Plots
#import DataFramesMeta: @transform!, @subset!, @byrow, @passmissing
"""
make_clips(preds_path::String, dawn_dusk_dict::Dict{Dates.Date, Tuple{Dates.DateTime, Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv"))
This function takes a preds.csv file and generates
file names, wav's, spectrograms etc to be reviewed.
It calls night() and may call construct_dawn_dusk_dict() unless the dict is globally defined and passed in.
It should be run from Pomona-1/, Pomona-2/ or Pomona-3/; it assumes this, since it uses the path.
It saves wav and png files to /home/david/Upload/
Need to use a try/catch because the 2 assert functions throw an error to short circuit the function.
using Glob, Skraak
predictions = glob("*/2023-09-11*/preds*")
predictions = glob("path/to/preds*")
for file in predictions #[1:6][7:12][13:18][19:24]
try
make_clips(file)
catch x
println(x)
end
end
If you need to change headers in a preds csv:
shift, control, f in subl
file,start_time,end_time,label
/media/david/Pomona-2,<project filters>, preds-2023-02-27.csv
file,start_time,end_time,absent,present
using Glob, CSV, DataFrames, DataFramesMeta, Dates, DSP, Plots, Random, WAV
"""
# Assumes run on linux
# Assumes function run from Pomona-1 or Pomona-2
#dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv",),
function make_clips(
preds_path::String,
label::Int = 1,
night::Bool = true,
dawn_dusk_dict = dddict,
)
# Assumes function run from Pomona-1 or Pomona-2
location, trip_date, _ = split(preds_path, "/")
# Load and group data frame by file
gdf =
#! format: off
DataFrame(CSV.File(preds_path)) |>
x -> assert_not_empty(x, preds_path) |>
x -> rename_column!(x, "1.0", "label") |> #can remove now, needs to be label
x -> assert_detections_present(x, label, location, trip_date) |>
x -> filter_positives!(x, label) |>
insert_datetime_column! |>
x -> night_or_day!(x, dawn_dusk_dict, night) |> #true=night, false=day
group_by_file!
#! format: on
# Make clip and spectrogram
for (k, v) in pairs(gdf)
#file_name = chop(v.file[1], head = 2, tail = 4)
file_name = path_to_file_string(v.file[1])
start_times = v[!, :start_time] |> sort
detections = cluster_detections(start_times)
isempty(detections) && continue
signal, freq = wavread("$location/$trip_date/$file_name.WAV")
length_signal = length(signal)
for detection in detections
st, en = calculate_clip_start_end(detection, freq, length_signal)
name = "$location-$trip_date-$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"
f = "Clips_$(today())"
mkpath(f)
outfile = "$f/$name"
sample = signal[Int(st):Int(en)]
wavwrite(sample, "$outfile.wav", Fs = Int(freq))
#plot = plot_spectrogram(sample, freq)
#savefig(plot, "$outfile.png")
image = get_image_from_sample(sample, freq)
PNGFiles.save("$outfile.png", image)
end
print(".")
end
println("\ndone $location/$trip_date \n")
end
#######################################################################
function assert_not_empty(df::DataFrame, preds_path::String)::DataFrame
size(df) != (0, 0) ? (return df) : @error "Empty dataframe at $preds_path"
#return df
end
function rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame
old_name in names(df) && rename!(df, old_name => new_name)
return df
end
# assumes kiwi, binary classifier from opensoundscape
# needed to remove ::String annotation for location, trip_date to make it work
function assert_detections_present(
df::DataFrame,
label::Int,
location,
trip_date,
)::DataFrame
label in levels(df.label) ? (return df) :
@error "No detections for label = $label at $location/$trip_date"
end
# assumes kiwi
function filter_positives!(df::DataFrame, label)::DataFrame
#filter!(row -> row.kiwi > 0, df)
filter!(row -> row.label == label, df)
return df
end
function path_to_file_string(path) # be careful: path::String won't work, no method matching path_to_file_string(::InlineStrings.String31)
f = split(path, "/")[end] |> x -> split(x, ".") |> first
#f = chop(file, head = 2, tail = 4)
return f
end
function filename_to_datetime!(file)::DateTime
#file_string = chop(file, head = 2, tail = 4)
file_string = path_to_file_string(file)
date_time =
length(file_string) > 13 ? DateTime(file_string, dateformat"yyyymmdd_HHMMSS") :
DateTime(
(file_string[1:4] * "20" * file_string[5:end]),
dateformat"ddmmyyyy_HHMMSS",
)
return date_time
end
function insert_datetime_column!(df::DataFrame)::DataFrame
@transform!(df, @byrow :DateTime = filename_to_datetime!(String(:file)))
return df
end
# calls night(), needs dawn_dusk_dict in local time format
function night_or_day!(
df::DataFrame,
dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}},
night_time::Bool = true,
)::DataFrame
night_time ? @subset!(df, @byrow night(:DateTime, dawn_dusk_dict)) :
@subset!(df, @byrow !night(:DateTime, dawn_dusk_dict))
return df
end
function group_by_file!(df::DataFrame)
gdf = groupby(df, :file)
return gdf
end
function cluster_detections(start_times::Vector{Float64})::Vector{Vector{Float64}}
s = Vector{Float64}[]
t = Float64[start_times[1]]
for time in start_times[2:end]
if time - last(t) <= 15.0
push!(t, time)
else
push!(s, copy(t))
t = Float64[time]
end
end
push!(s, copy(t))
detections = filter(x -> length(x) > 1, s)
return detections
end
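# e.g. cluster_detections([0.0, 2.5, 5.0, 60.0, 120.0, 122.5]) groups start times
# within 15 s of each other and drops singleton clusters, returning
# [[0.0, 2.5, 5.0], [120.0, 122.5]]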
# assumes it is operating on 5 second clips
function calculate_clip_start_end(
detection::Vector{Float64},
freq::Float32,
length_signal::Int64,
)::Tuple{Float64,Float64}
first(detection) > 0 ? st = first(detection) * freq : st = 1
(last(detection) + 5.0) * freq <= length_signal ? en = (last(detection) + 5.0) * freq :
en = length_signal
return st, en
end
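# e.g. detection = [380.0, 385.0], freq = 16000.0f0, length_signal = 14_400_000
# (a 15 minute file) gives st = 6_080_000.0 and en = 6_240_000.0,
# i.e. samples spanning 380 s to 390 s of the recording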
#= Deprecated use get_image_from_sample()
function plot_spectrogram(
sample::Vector{Float64},
freq::Float32,
)::Plots.Plot{Plots.GRBackend}
S = DSP.spectrogram(sample[:, 1], 400, 2; fs = convert(Int, freq))
plot = Plots.heatmap(
S.time,
S.freq,
pow2db.(S.power),
size = (448, 448),
showaxis = false,
ticks = false,
legend = false,
thickness_scaling = 0,
)
return plot
end
=#
# f needs to be an Int
function get_image_from_sample(sample, f) #sample::Vector{Float64}
S = DSP.spectrogram(sample, 400, 2; fs = convert(Int, f))
i = S.power
if minimum(i) == 0.0
l = i |> vec |> unique |> sort
replace!(i, 0.0 => l[2])
end
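# pow2db -> shift to non-negative -> scale to [0, 1] -> flip so low frequencies
# sit at the bottom -> greyscale RGB -> 224 x 224 for the ResNet input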
image =
#! format: off
DSP.pow2db.(i) |>
x -> x .+ abs(minimum(x)) |>
x -> x ./ maximum(x) |>
x -> reverse(x, dims = 1) |>
x -> RGB.(x) |>
x -> imresize(x, 224, 224)
#! format: on
return image
end
"""
construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
sun = DataFrame(CSV.File(file))
Takes dawn_dusk.csv and returns a dict to be consumed by night().
~/dawn_dusk.csv
At present it goes from the start of 2019 to the end of 2024
The csv contains local time sunrise and sunset
I use this to decide if a file with a local time encoded name was recorded at night
dict = construct_dawn_dusk_dict("/Volumes/SSD1/dawn_dusk.csv")
dict = Utility.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")
using CSV, DataFrames
"""
function construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
sun = DataFrame(CSV.File(file))
x = Tuple(zip(sun.Dawn, sun.Dusk))
y = Dict(zip(sun.Date, x))
return y
end
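# e.g. dict[Date("2023-04-15")] returns a (dawn, dusk) tuple such as
# (DateTime("2023-04-15T06:52:00"), DateTime("2023-04-15T18:05:00")) (illustrative times)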
"""
night(call_time::DateTime, dict::Dict{Date, Tuple{DateTime, DateTime}})::Bool
Returns true if time is at night, ie between civil twilights, dusk to dawn.
Consumes dict from construct_dawn_dusk_dict
time=DateTime("2021-11-02T21:14:35",dateformat"yyyy-mm-ddTHH:MM:SS")
Utility.night(time, dict)
"""
function night(call_time::DateTime, dict::Dict{Date,Tuple{DateTime,DateTime}})::Bool
dawn = dict[Date(call_time)][1]
dusk = dict[Date(call_time)][2]
if call_time <= dawn || call_time >= dusk
return true
else
return false
end
end
#######################################################################
#INBETWEEN STEP: use secondary model to sort clips, move clips into D, F, M, N, and hand classify, generate actual.csv.
"""
move_clips_to_folders(df::DataFrame)
Takes a 2 column dataframe: file, label
file must be list of png images, assumes wav's are there too
will move mp4's from video folder if they are present
"""
function move_clips_to_folders(df::DataFrame)
p = glob("*.png")
w = glob("*.[W,w][A,a][V,v]")
@assert (first(df.file) |> x -> split(x, ".")[end] |> x -> x == "png") "df.file must be a list of png's"
@assert issetequal(df.file, p) "All png files in dataframe must be present in folder"
@assert issetequal(chop.(df.file, head = 0, tail = 4), chop.(w, head = 0, tail = 4)) "There must be a wav for every png in the dataframe"
for row in eachrow(df)
src = row.file
dst = "$(row.label)/$(row.file)"
mkpath("$(row.label)/")
try
mv(src, dst)
mv(chop(src, tail = 3) * "wav", chop(dst, tail = 3) * "wav")
if isdir("video")
mkpath("video/$(row.label)/")
mv(
"video/" * chop(src, tail = 3) * "mp4",
"video/" * chop(dst, tail = 3) * "mp4",
)
end
catch e
@info e
end
end
end
#=
actual.csv must be list of qualified png file names:
D/C05-2023-04-15-20230219_223000-380-470.png
using Glob, DataFrames, CSV
a=glob("[M,F,D,N]/*.png")
df = DataFrame(file=a)
CSV.write("actual_mfdn.csv", df)
make a folder D,F,M,N
mkpath.(["D", "F", "M", "N"])
move wavs to match pngs
df=DataFrame(CSV.File("actual_mfdn.csv"))
for row in eachrow(df)
src=split(row.file, "/")[2]
dst=row.file
mv(src, dst)
mv(chop(src, tail=3)*"wav", chop(dst, tail=3)*"wav")
end
=#
#run from parent folder of label folders
#saves actual.csv and returns a df
#labels=["D", "F", "M", "N"]
function actual_from_folders(labels::Vector{String})::DataFrame
paths=String[]
for l in labels
paths=append!(paths, glob("$l/*.png"))
end
df = DataFrame(file=paths)
CSV.write("actual.csv", df)
return df
end
"""
aggregate_labels(actual="actual.csv", outfile="labels.csv")
file
[D, F, M, N]/C05-2023-04-15-20230219_223000-380-470.png
This function takes the csv output from my hand classification and outputs a df, and a csv for insertion into AudioData.duckdb using the duckdb cli or DFto.audiodata_db()
assumes it is run from a Clips_xxxx-xx-xx folder and that "actual.csv" is present if not specified.
returns a dataframe
using CSV, DataFrames, DataFramesMeta
"""
#=
df=aggregate_labels()
audiodata_db(df, "pomona_labels_20230418") NOT_WORKING maybe titles
to use cli, need to remove header row
duckdb /media/david/SSD1/AudioData.duckdb
COPY pomona_labels_20230418 FROM 'DB_Labels/pomona_labels_2023-11-02.csv';
COPY pomona_files FROM 'DB_Files/pomona_files_20231102.csv';
Then backup with:
EXPORT DATABASE 'AudioDataBackup_2023-11-14';
.quit
Then quit and backup using cp on the db file, dated copy
Then rsync ssd to usb
rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
note: run on mac
cd skraak.kiwi
julia-1.9
using Franklin
serve()
=#
# New one, without noise and distance, does not do :box anymore therefore requires new db schema
function aggregate_labels(
actual::String = "actual.csv",
outfile::String = "labels.csv",
hdr::Bool = false #header for outfile
)::DataFrame
df = DataFrame(CSV.File(actual))
# location, f, start_time, end_time
@transform!(df, @byrow :location = split(split(:file, "/")[2], "-")[1])
@transform!(df, @byrow :f = split(split(:file, "/")[2], "-")[5] * ".WAV")
@transform!( df, @byrow :start_time = split(split(:file, "/")[2], "-")[end-1])
@transform!( df, @byrow :end_time = chop(split(split(:file, "/")[2], "-")[end], tail=4))
#@transform!( df, @byrow :box = "[$(split(split(:file, "/")[2], "-")[end-1]), $(chop(split(split(:file, "/")[2], "-")[end], tail=4))]")
# male, female, duet, not
@transform!(df, @byrow @passmissing :male = split(:file, "/")[1] == "M" ? true : false)
@transform!(
df,
@byrow @passmissing :female = split(:file, "/")[1] == "F" ? true : false
)
@transform!(df, @byrow @passmissing :duet = split(:file, "/")[1] == "D" ? true : false)
@transform!(
df,
@byrow @passmissing :not_kiwi =
split(:file, "/")[1] in ["KA", "KE", "N", "Q"] ? true : false
)
# other_label
@transform!(
df,
@byrow @passmissing :other_label =
split(:file, "/")[1] in ["KA", "KE", "Q"] ? split(:file, "/")[1] : missing
)
# remove unwanted cols, rename f to file
select!(df, Not([:file]))
rename!(df, :f => :file)
CSV.write(outfile, df; header=hdr)
return df
end
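#=
Worked example: a row with file = "D/C05-2023-04-15-20230219_223000-380-470.png"
yields location = "C05", file = "20230219_223000.WAV", start_time = "380",
end_time = "470", duet = true, and the other label columns false/missing.
=#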
"""
audiodata_db(df::DataFrame, table::String)
Use to upload labels to AudioData.duckdb
Takes a dataframe and inserts into AudioData.db table.
audiodata_db(df, "pomona_labels_20230418")
using DataFrames, DBInterface, DuckDB, Random
"""
function audiodata_db(df::DataFrame, table::String)
if Sys.islinux()
con = DBInterface.connect(DuckDB.DB, "/media/david/SSD1/AudioData.duckdb")
else
con = DBInterface.connect(DuckDB.DB, "/Volumes/SSD1/AudioData.duckdb")
end
temp_name = randstring(6)
DuckDB.register_data_frame(con, df, temp_name)
DBInterface.execute(
con,
"""
INSERT
INTO $table
SELECT *
FROM '$temp_name'
""",
)
DBInterface.close!(con)
end
"""
Only moves WAVs not already there in dataset
converts WAVs to flac to save space, file metadata will not survive
requires columns :location, :file, :start_time, :end_time
:file is the file name, :location is the actual recorder location eg "C05"
run where the raw data is
will find file in folder structure location/trip_date/file
constructs dataset at output_path
assumes file name has one . for extension only
"""