refactored Skraak.jl into sub files, tidy now

[?]
AEj8dahVWy718uSSFPe9VSRJ5qX5G8pC2zvFzJJ8yzBd
Dec 9, 2023, 8:01 AM
BOPNWZL4RWF4UGC2LUEVONNQFSKYJ2Q5W747GH3UURZLBJFFEBUQC

Dependencies

  • [2] U46LDPL7 added model, CUDA works again now on ubuntu
  • [3] EDYR5C55 fixed ConstructPrimaryDataSet, licence date, Readme, re-arranged Predict.jl
  • [4] NV7FXZ5Q first commit

Change contents

  • file deletion: Train_delete.jl (----------)
    [4.6598][4.335579:335618](),[4.335618][4.329788:329788]()
    # Train.jl
    # https://github.com/FluxML/model-zoo/blob/master/tutorials/transfer_learning/transfer_learning.jl
    # This works on my data IT TRAINS best, but only -t 4
    # dont forget temp env
    using Random: shuffle!
    using Random: seed!
    import Base: length
    import Base: getindex
    using Images
    using Flux
    using CUDA
    using Metalhead
    using Noise
    using Glob
    using BSON: @save
    using Dates
    #using CSV
    using DataFrames
    using FreqTables
    using JLD2
    using Logging, LoggingExtras
    imgs = glob("2023-09-*/*/*/[N,K]/*.png") #from SSD2
    seed!(1234);
    shuffle!(imgs)
    #CSV.write("files.csv", DataFrame(file=imgs))
    device = CUDA.functional() ? gpu : cpu
    struct ImageContainer{T<:Vector}
    img::T
    end
    struct ValidationImageContainer{T<:Vector}
    img::T
    end
    data = ImageContainer(imgs)
    val_data = ValidationImageContainer(imgs)
    length(data::ImageContainer) = length(data.img)
    length(data::ValidationImageContainer) = length(data.img)
    const im_size = (224, 224)
    name_to_idx = Dict{String,Int32}("K" => 1, "N" => 2)
    # Load one training sample: read the image at `data.img[idx]`, resize it to
    # 224x224, convert it to RGB, add Gaussian noise of random strength, paint
    # random grey bands over it and return it as a Float32 array with its label.
    # The label is looked up in `name_to_idx` from the name of the parent folder.
    function getindex(data::ImageContainer{Vector{String}}, idx::Int)
        path = data.img[idx]
        resized = Images.imresize(Images.load(path), 224, 224)
        noisy = Noise.add_gauss(Images.RGB.(resized), (rand() * 0.2))
        masked = apply_mask(noisy, 3, 3, 12)
        # channelview is channel-first; permuting (3, 2, 1) moves the channel last
        img = permutedims(collect(channelview(float32.(masked))), (3, 2, 1))
        class_folder = split(path, "/")[end-1]
        y = name_to_idx[class_folder]
        return img, y
    end
    # Load one validation sample: same resize / RGB / Float32 conversion as the
    # training container, but without noise or masking augmentation.
    function getindex(data::ValidationImageContainer{Vector{String}}, idx::Int)
        path = data.img[idx]
        resized = Images.imresize(Images.load(path), 224, 224)
        rgb = Images.RGB.(resized)
        # channelview is channel-first; permuting (3, 2, 1) moves the channel last
        img = permutedims(collect(channelview(float32.(rgb))), (3, 2, 1))
        class_folder = split(path, "/")[end-1]
        y = name_to_idx[class_folder]
        return img, y
    end
    # assumes 224px square images
    # Overwrite a random number of horizontal and then vertical bands of `img`
    # with a fixed grey, in place, and return the same array. Band positions
    # and widths come from `get_random_ranges`.
    function apply_mask(
        img::Array{RGB{N0f8},2},
        max_number::Int = 3,
        min_size::Int = 3,
        max_size::Int = 22,
    )
        grey = RGB{N0f8}(0.7, 0.7, 0.7)
        # horizontal bands (row ranges are drawn before the column ranges)
        row_bands = get_random_ranges(max_number, min_size, max_size)
        foreach(rows -> (img[rows, :] .= grey), row_bands)
        # vertical bands
        col_bands = get_random_ranges(max_number, min_size, max_size)
        foreach(cols -> (img[:, cols] .= grey), col_bands)
        return img
    end
    # assumes 224px square images
    # Draw between 0 and `max_number` random index ranges that fit inside
    # `1:image_size` (224 by default, matching the square training images).
    #
    # Each range is `start:start+size` with `size` drawn from `min_size:max_size`,
    # so it covers `size + 1` indices. Candidates that would run past
    # `image_size` are rejected and redrawn.
    #
    # Returns a `Vector{UnitRange{Int}}` (possibly empty).
    # Throws `ArgumentError` when ranges were requested but even the smallest one
    # cannot fit, because the rejection loop would otherwise never terminate.
    function get_random_ranges(
        max_number::Int,
        min_size::Int,
        max_size::Int,
        image_size::Int = 224,
    )
        number = rand(0:max_number)
        ranges = UnitRange{Int}[]
        if number > 0 && min_size >= image_size
            throw(
                ArgumentError(
                    "min_size ($min_size) must be smaller than image_size ($image_size)",
                ),
            )
        end
        while length(ranges) < number
            start = rand(1:image_size)
            size = rand(min_size:max_size)
            # reject candidates that would run off the edge of the image
            start + size > image_size && continue
            push!(ranges, start:start+size)
        end
        return ranges
    end
    # define DataLoaders
    const batch_size = 64
    const train_test_split = 0.95
    const ceiling = length(data) ÷ batch_size * batch_size
    const train_test_index =
    ceiling ÷ batch_size * train_test_split |> round |> x -> x * batch_size |> Int
    train = Flux.DataLoader(
    ImageContainer(imgs[1:train_test_index]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
    )
    device == gpu ? train = CuIterator(train) : nothing
    train_sample = Flux.DataLoader(
    ValidationImageContainer(imgs[1:(ceiling-train_test_index)]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
    )
    device == gpu ? train_sample = CuIterator(train_sample) : nothing
    test = Flux.DataLoader(
    ValidationImageContainer(imgs[train_test_index+1:ceiling]);
    batchsize = batch_size,
    collate = true,
    parallel = true,
    )
    device == gpu ? test = CuIterator(test) : nothing
    fst = Metalhead.ResNet(18, pretrain = true).layers
    # BEWARE NUMBER CLASSES
    lst = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => 2));
    model = Flux.Chain(fst[1], lst) |> device
    # Evaluate model `m` on every `(x, y)` batch of `d`.
    # Returns the accuracy (rounded to 4 digits) and a frequency table of
    # targets against predictions.
    function eval_f(m, d)
        pred = []
        actual = []
        n_correct = 0
        n_seen = 0
        for (x, y) in d
            p = Flux.onecold(m(x))
            n_correct += sum(p .== y)
            n_seen += length(y)
            append!(pred, p)
            append!(actual, y)
        end
        accuracy = round(n_correct / n_seen, digits = 4)
        outcomes = DataFrame(targets = actual, predicts = pred)
        return accuracy, freqtable(outcomes, :targets, :predicts)
    end
    # BEWARE NUMBER CLASSES
    # Run one pass of `Flux.train!` over `train`, using logit cross-entropy
    # against one-hot targets over the two classes 1:2.
    function train_epoch!(model; opt, train)
        loss(m, x, y) = Flux.Losses.logitcrossentropy(m(x), Flux.onehotbatch(y, 1:2))
        return Flux.train!(loss, model, train, opt)
    end
    opt = Flux.setup(Flux.Optimisers.Adam(1e-5), model);
    logger = FileLogger("logfile.txt"; append = true)
    @time metric_eval, v_confusion_matrix = eval_f(model, test)
    #with_logger(logger) do
    @info "eval" accuracy = metric_eval
    @info "eval" v_confusion_matrix
    #end
    a = 0.0
    for iter in 1:15
    println("")
    println("Epoch: $iter")
    @time train_epoch!(model; opt, train)
    @time metric_train, t_confusion_matrix = eval_f(model, train_sample)
    #with_logger(logger) do
    @info "Epoch: " iter
    @info "train" accuracy = metric_train
    @info "train" t_confusion_matrix
    #end
    @time metric_eval, v_confusion_matrix = eval_f(model, test)
    #with_logger(logger) do
    @info "test" accuracy = metric_eval
    @info "test" v_confusion_matrix
    #end
    metric_eval > a && begin
    a = metric_eval
    let _model = cpu(model)
    jldsave(
    "model_K1-4_CPU_epoch-$iter-$metric_eval-$(today()).jld2";
    model_state = Flux.state(_model),
    )
    #BSON.@save "model_K1-3_CPU_epoch-$iter-$metric_eval-$(now()).bson" _model
    #with_logger(logger) do
    @info "Saved a best_model"
    #end
    end
    end
    end
  • file deletion: Predict_delete.jl (----------)
    [4.6598][4.366453:366494](),[4.366494][4.360107:360107]()
    # Predict.jl
    using WAV, DSP, Images, ThreadsX, Dates, DataFrames, CSV, Flux, CUDA, Metalhead, JLD2, FLAC
    export predict
    """
    predict(glob_pattern::String, model::String)
    This function takes a glob pattern for folders to run over, and a model path. It saves results in a csv for each folder, similar to opensoundscape
    Args:
    • glob pattern (folder/)
    • model path
    Returns: Nothing - This function saves csv files.
    I use this function to find kiwi from new data gathered on a trip.
    Note:
    Dont forget temp env, julia -t 4
    From Pomona-3/Pomona-3/
    Use like:
    using Skraak
    glob_pattern = "*/2023-10-19/" #from SSD1
    model = "/media/david/SSD1/model_K1-3_CPU_epoch-10-0.9965-2023-10-18T17:32:36.747.jld2"
    predict(glob_pattern, model)
    """
    # Glob-pattern method: expand the pattern to folders and hand over to the
    # vector method, so both entry points share one implementation.
    function predict(glob_pattern::String, model::String)
        return predict(glob(glob_pattern), model)
    end
    # Vector method: load the model stored at path `model` once, move it to
    # `device`, then run `predict_folder` on every folder in turn.
    # Returns nothing; results are written by `predict_folder`.
    function predict(folders::Vector{String}, model::String)
        # keep the path argument and the loaded network in separate variables
        net = load_model(model) |> device
        @info "Folders: $folders"
        for folder in folders
            @info "Working on: $folder"
            predict_folder(folder, net)
        end
    end
    #~~~~~ The guts ~~~~~#
    device = CUDA.functional() ? gpu : cpu
    #= TO DELETE
    function get_image_for_inference(sample, f)
    S = DSP.spectrogram(sample, 400, 2; fs = f)
    i = S.power
    if minimum(i) == 0.0
    l = i |> vec |> unique |> sort
    replace!(i, 0.0 => l[2])
    end
    image =
    #! format: off
    DSP.pow2db.(i) |>
    x -> x .+ abs(minimum(x)) |>
    x -> x ./ maximum(x) |>
    x -> reverse(x, dims = 1) |>
    x -> RGB.(x) |>
    x -> imresize(x, 224, 224) |>
    x -> collect(channelview(float32.(x))) |>
    x -> permutedims(x, (3, 2, 1))
    #! format: on
    return image
    end
    =#
    # Turn one audio sample into a model input: build the image with
    # `get_image_from_sample`, convert it to a Float32 channel array and
    # permute the dimensions with (3, 2, 1) so the channel comes last.
    function get_image_for_inference(sample, f)
        rgb = get_image_from_sample(sample, f)
        channels_first = collect(channelview(float32.(rgb)))
        return permutedims(channels_first, (3, 2, 1))
    end
    # Read a wav/flac file, resample it down to 16 kHz when it is sampled faster,
    # cut channel 1 into windows of `increment` seconds that start every
    # `increment / divisor` seconds, and turn each window into a model input image.
    # Returns the vector of images and its length.
    function get_images(file::String, increment::Int = 5, divisor::Int = 2) #5s sample, 2.5s hop
        ext = split(file, ".")[end]
        @assert ext in ["WAV", "wav", "flac"] "Unsupported audio file type, requires wav or flac."
        if ext in ["WAV", "wav"]
            signal, freq = wavread(file)
        else
            signal, freq = load(file)
        end
        if freq > 16000
            signal = DSP.resample(signal, 16000.0f0 / freq; dims = 1)
            freq = 16000
        end
        f = convert(Int, freq)
        inc = increment * f
        hop = f * increment ÷ divisor
        # DSP.arraysplit takes the overlap between consecutive windows, not the
        # hop. For divisor == 2 the two coincide; for any other divisor passing
        # the hop would give windows that disagree with the reported time stamps.
        split_signal = DSP.arraysplit(signal[:, 1], inc, inc - hop)
        raw_images = ThreadsX.map(x -> get_image_for_inference(x, f), split_signal)
        n_samples = length(raw_images)
        return raw_images, n_samples
    end
    # Build the image batch for `file` together with the (start, end) time in
    # seconds of every window. Window k starts at (k - 1) * increment / divisor
    # and lasts `increment` seconds.
    function get_images_time_from_wav(file::String, increment::Int = 5, divisor::Int = 2)
        raw_images, n_samples = get_images(file, increment, divisor)
        images = reshape_images(raw_images, n_samples)
        hop_seconds = increment / divisor
        start_time = (0:n_samples-1) .* hop_seconds
        # derive the end from the start so both always have n_samples entries
        end_time = start_time .+ increment
        time = collect(zip(start_time, end_time))
        return images, time
    end
    # Stack `n_samples` images of size 224x224x3 into one 224x224x3xn_samples
    # batch, with image k stored at `[:, :, :, k]`.
    #
    # The images are concatenated along a new 4th dimension. Concatenating them
    # side by side (hcat) and reshaping afterwards interleaves the channels of
    # different images as soon as there is more than one image.
    function reshape_images(raw_images, n_samples)
        # nothing to concatenate: return an empty batch of the right shape
        isempty(raw_images) && return zeros(Float32, 224, 224, 3, 0)
        batch = cat(raw_images...; dims = 4)
        # the reshape does not move data, it only checks the expected shape
        return reshape(batch, (224, 224, 3, n_samples))
    end
    # Run the model over every window of one audio file and return a DataFrame
    # with columns file, start_time, end_time and label (the `onecold` class).
    # `folder` is accepted but not used here.
    function predict_file(file::String, folder::String, model)
        #check form of opensoundscape preds.csv and needed by my make_clips
        @info "File: $file"
        @time images, spans = get_images_time_from_wav(file)
        batch = images |> device
        @time predictions = Flux.onecold(model(batch))
        file_column = fill("$file", length(spans))
        return DataFrame(
            :file => file_column,
            :start_time => first.(spans),
            :end_time => last.(spans),
            :label => predictions,
        )
    end
    # Predict every wav and flac file found directly in `folder` and write the
    # results to `preds-<today>.csv` inside that folder: first an empty table
    # carrying the header, then one appended chunk per audio file.
    function predict_folder(folder::String, model)
        files = vcat(glob("$folder/*.[W,w][A,a][V,v]"), glob("$folder/*.flac"))
        @info "$(length(files)) files in $folder"
        save_path = "$folder/preds-$(today()).csv"
        header_only = DataFrame(
            file = String[],
            start_time = Float64[],
            end_time = Float64[],
            label = Int[],
        )
        CSV.write(save_path, header_only)
        for audio_file in files
            CSV.write(save_path, predict_file(audio_file, folder, model), append = true)
        end
    end
    # see load_model() from train, different input types
    # Rebuild a ResNet-18 based classifier and load saved weights into it.
    # `model_path` is a JLD2 file containing a "model_state" entry.
    # Returns the Flux model with the saved state loaded (not moved to any device here).
    function load_model(model_path::String)
    model_state = JLD2.load(model_path, "model_state")
    # Number of output classes is read from the saved state.
    # NOTE(review): the index chain presumably reaches a parameter of the final
    # Dense layer whose length equals the class count — confirm against Flux.state layout.
    model_classes = length(model_state[1][2][1][3][2])
    # Untrained ResNet-18 layers; the weights come from model_state below.
    f = Metalhead.ResNet(18, pretrain = false).layers
    # Replacement head: pooling, flatten, Dense sized to the saved class count.
    l = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => model_classes))
    model = Flux.Chain(f[1], l)
    Flux.loadmodel!(model, model_state)
    return model
    end
    #=
    function load_bson(model_path::String)
    BSON.@load model_path model
    end
    =#
    ############### PYTHON Opensoundscape ################
    #=
    # Dont forget conda activate opensoundscape
    # Dont forget to modify file names and glob pattern
    # Run script in Pomona-2, hard code trip date in the glob
    # python /media/david/USB/Skraak/src/predict.py
    from opensoundscape.torch.models.cnn import load_model
    import opensoundscape
    import torch
    from pathlib import Path
    import numpy as np
    import pandas as pd
    from glob import glob
    import os
    from datetime import datetime
    model = load_model('/home/david/best.model')
    # folders = glob('./*/2023-?????/')
    folders = glob('./*/*/')
    for folder in folders:
    os.chdir(folder)
    print(folder, ' start: ', datetime.now())
    # Beware, secretary island files are .wav
    field_recordings = glob('./*.WAV')
    scores, preds, unsafe = model.predict(
    field_recordings,
    binary_preds = 'single_target',
    overlap_fraction = 0.5,
    batch_size = 128,
    num_workers = 12)
    scores.to_csv("scores-2023-11-07.csv")
    preds.to_csv("preds-2023-11-07.csv")
    os.chdir('../..')
    print(folder, ' done: ', datetime.now())
    print()
    print()
    =#
  • edit in src/Utility.jl at line 2
    [4.313626][4.313626:315017]()
    using CSV,
    DataFrames,
    Dates,
    DBInterface,
    DSP,
    DuckDB,
    Glob,
    HTTP,
    Images,
    JSON,
    PNGFiles,
    Random,
    SHA,
    TimeZones,
    WAV,
    XMLDict
    export move_one_hour!,
    check_png_wav_both_present,
    file_metadata_to_df,
    resize_image!,
    twilight_tuple_local_time,
    utc_to_nzdt!
    """
    move_one_hour!(files::Vector{String}, operator)
    This function takes a vector of file paths and renames each file in the
    vector by changing the name of the file to the name of the file created one
    hour before the original file. The new name format is yyyymmdd_HHMMSS.tmp,
    which represents the time stamp of the original file minus (or plus) one hour. This
    function avoids force=true with mv, since new file names may already exist
    and mv will stacktrace leaving a big mess to tidy up.
    Args:
    • files (Vector{String}): A vector of strings where each element is
    a path to a file.
    Returns: Nothing - This function only renames files and saves them.
    I use this to turn the clock back at the end of daylight saving.
    """
    #Assumes WAV files
    function move_one_hour!(files::Vector{String}, operator)
    @assert operator == (+) || operator == (-)
    fix_extension_of_files = []
    for old_file in files
    # Extract the date and time of the original file using string chopping
    a = chop(old_file, tail = 4)
    d, t = split(a, "_")
  • replacement in src/Utility.jl at line 3
    [4.315018][4.315018:315222]()
    ye = parse(Int64, d[1:4])
    mo = parse(Int64, d[5:6])
    da = parse(Int64, d[7:8])
    ho = parse(Int64, t[1:2])
    mi = parse(Int64, t[3:4])
    se = parse(Int64, t[5:6])
    [4.315018]
    [4.315222]
    export check_png_wav_both_present,
    resize_image!, twilight_tuple_local_time, move_one_hour!, utc_to_nzdt!
  • replacement in src/Utility.jl at line 6
    [4.315223][4.315223:315269]()
    dt = DateTime(ye, mo, da, ho, mi, se)
    [4.315223]
    [4.315269]
    using CSV, DataFrames, Dates, Glob, HTTP, Images, JSON, TimeZones, WAV
    #XMLDict, DBInterface, DSP, DuckDB, PNGFiles, Random, SHA
  • edit in src/Utility.jl at line 9
    [4.315270][4.315270:315940]()
    #new_date = dt - Dates.Hour(1)
    new_date = operator(dt, Dates.Hour(1))
    # Must drop the WAV extension to avoiding force=true
    # with mv, since the new file name may already exist and mv
    # will stacktrace leaving a big mess to tidy up.
    base_file = Dates.format(new_date, "yyyymmdd_HHMMSS")
    temp_file = base_file * ".tmp"
    # Tuple to tidy extensions later
    tidy = (temp_file, base_file * ".WAV")
    mv(old_file, temp_file)
    push!(fix_extension_of_files, tidy)
    print(".")
    end
    for item in fix_extension_of_files
    mv(item[1], item[2])
    end
    print("Tidy\n")
    end
  • edit in src/Utility.jl at line 24
    [4.316421][4.316421:316422]()
  • edit in src/Utility.jl at line 41
    [4.316919][4.316919:317670]()
    #=
    used like:
    using Glob, Skraak, CSV
    folders=glob("*/2023-11-02/")
    for folder in folders
    cd(folder)
    try
    df = Skraak.file_metadata_to_df()
    CSV.write("/media/david/Pomona-3/Pomona-3/pomona_files_20231102.csv", df; append=true)
    catch
    @warn "error with $folder"
    end
    cd("/media/david/Pomona-3/Pomona-3/")
    end
    Then using duckdb cli from SSD:
    duckdb AudioData.duckdb
    show tables;
    SELECT * FROM pomona_files;
    COPY pomona_files FROM '/media/david/Pomona-3/Pomona-3/pomona_files_20231019.csv';
    SELECT * FROM pomona_files;
    Then backup with:
    EXPORT DATABASE 'AudioDataBackup_2023-07-29';
    .quit
    Then quit and backup using cp on the db file
    Then rsync ssd to usb
    rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
    =#
  • edit in src/Utility.jl at line 42
    [4.317674][4.317674:324634]()
    file_metadata_to_df()
    This function takes a file name, extracts wav metadata, gpx location, recording period start/end and returns a dataframe.
    This function needs raw audiomoth wav files and a gpx.
    This function needs /media/david/SSD1/dawn_dusk.csv
    using DataFrames, Dates, DelimitedFiles, DuckDB, Glob, JSON3, Random, SHA, TimeZones, WAV, XMLDict
    """
    # Split the ICMT comment of a WAV metadata chunk into its space-separated
    # fields. Comments with 22 or more fields are cut back to the first 19.
    function _icmt_fields(binary_metadata)
        fields = split(wav_info_read(binary_metadata)[:ICMT], " ")
        return length(fields) < 22 ? fields : fields[1:19]
    end

    # Build the recording timestamp from ICMT fields: field 3 is HH:MM:SS,
    # field 4 is dd/mm/yyyy and field 5 carries the UTC offset once its first
    # 4 characters and last character are chopped off (empty offset means "+00").
    function _icmt_zoned_datetime(fields)
        date = split(fields[4], "/")
        time = split(fields[3], ":")
        tz = chop(fields[5], head = 4, tail = 1)
        time_zone = isempty(tz) ? "+00" : tz
        time_string = "$(date[3])-$(date[2])-$(date[1])T$(time[1]):$(time[2]):$(time[3]).000$(time_zone)"
        return ZonedDateTime(time_string)
    end

    # Collect metadata for every *.WAV file in the current working directory
    # into a DataFrame, one row per file.
    #
    # Disk, location and trip date are the last three components of `pwd()`.
    # Latitude and longitude come from the gpx file in the folder. The recording
    # period start/finish come from the first and last WAV (sorted by name).
    # Per-file values (timestamps, moth id, gain, battery, temperature) are
    # parsed from the ICMT comment in each WAV header.
    #
    # Returns the DataFrame; it is empty when the folder holds no *.WAV files.
    # Files that fail to parse are skipped with a warning.
    function file_metadata_to_df()
        df = DataFrame(
            disk = String[],
            location = String[],
            trip_date = String[],
            file = String[],
            latitude = Float64[],
            longitude = Float64[],
            start_recording_period_localt = String[],
            finish_recording_period_localt = String[],
            duration = Float64[],
            sample_rate = Int[],
            utc = String[],
            ldt = String[],
            moth_id = String[],
            gain = String[],
            battery = Float64[],
            temperature = Float64[],
            sha2_256 = String[],
            night = Bool[],
        )
        #Get WAV list for folder
        wav_list = glob("*.WAV") |> sort
        #Return empty df if nothing in the folder
        isempty(wav_list) && return df
        #Get path info from file system
        raw_path_vec = split(pwd(), "/")[end-2:end]
        disk, location, trip_date = raw_path_vec
        #Get location, assumes 1 gpx is in the folder
        waypoint = glob("*.gpx")
        length(waypoint) != 1 && @error "no gpx file in $trip_date $location"
        loc = read(waypoint[1], String) |> xml_dict
        latitude = parse(Float64, (loc["gpx"]["wpt"][:lat]))
        longitude = parse(Float64, (loc["gpx"]["wpt"][:lon]))
        timestamp_format = "yyyy-mm-dd HH:MM:SSzzzz"
        #Start of recording period
        _, _, _, binary_metadata_start = wavread(wav_list[1])
        zdt1 = _icmt_zoned_datetime(_icmt_fields(binary_metadata_start))
        start_recording_period_localt =
            Dates.format(astimezone(zdt1, tz"Pacific/Auckland"), timestamp_format)
        #End of recording period; the offset is read from the last file's own
        #comment (it used to be taken from the first file's comment)
        _, _, _, binary_metadata_end = wavread(wav_list[end])
        zdt2 = _icmt_zoned_datetime(_icmt_fields(binary_metadata_end))
        finish_recording_period_localt =
            Dates.format(astimezone(zdt2, tz"Pacific/Auckland"), timestamp_format)
        dict = Skraak.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")
        #So I know what it is doing
        println(raw_path_vec)
        #Loop over file list
        for file in wav_list
            try
                audio_data, sample_rate, _, binary_metadata = wavread(file)
                comment_vector = _icmt_fields(binary_metadata)
                duration = Float64(length(audio_data) / sample_rate)
                preformatting_zdt = _icmt_zoned_datetime(comment_vector)
                preformatting_utc = astimezone(preformatting_zdt, tz"UTC")
                utc = Dates.format(preformatting_utc, timestamp_format)
                preformatting_ldt = astimezone(preformatting_zdt, tz"Pacific/Auckland")
                ldt = Dates.format(preformatting_ldt, timestamp_format)
                moth_id = comment_vector[8]
                gain = comment_vector[10]
                #index back from end because if V > 4.9 the wording changes
                battery = parse(Float64, chop(comment_vector[end-4], tail = 1))
                temperature = parse(Float64, chop(comment_vector[end], tail = 2))
                # NOTE(review): `file` is the file name, so this hashes the name
                # string rather than the file contents. Left unchanged so new rows
                # stay comparable with rows already stored; confirm intent.
                sha2_256 = bytes2hex(sha256(file))
                #assumes 15 minute file and calculates on half way time
                nt = Skraak.night(DateTime(preformatting_ldt + Minute(7) + Second(30)), dict)
                #Populate row to push into df
                row = [
                    disk,
                    location,
                    trip_date,
                    file,
                    latitude,
                    longitude,
                    start_recording_period_localt,
                    finish_recording_period_localt,
                    duration,
                    Int(sample_rate),
                    utc,
                    ldt,
                    moth_id,
                    gain,
                    battery,
                    temperature,
                    sha2_256,
                    nt,
                ]
                push!(df, row)
                print(".")
            catch err
                # the message used to interpolate `folder`, which is not defined
                # in this function, so the handler itself raised UndefVarError
                @warn "error with $location $trip_date $file" exception = err
            end
        end
        return df
    end
    """
  • edit in src/Utility.jl at line 149
    [4.328100]
    [4.328100]
    move_one_hour!(files::Vector{String}, operator)
    This function takes a vector of file paths and renames each file in the
    vector by changing the name of the file to the name of the file created one
    hour before the original file. The new name format is yyyymmdd_HHMMSS.tmp,
    which represents the time stamp of the original file minus (or plus) one hour. This
    function avoids force=true with mv, since new file names may already exist
    and mv will stacktrace leaving a big mess to tidy up.
    Args:
    • files (Vector{String}): A vector of strings where each element is
    a path to a file.
    Returns: Nothing - This function only renames files and saves them.
    I use this to turn the clock back at the end of daylight saving.
    Assumes WAV files
    """
    # Rename every file in `files` to the time stamp one hour later
    # (`operator == +`) or one hour earlier (`operator == -`).
    #
    # File names must look like yyyymmdd_HHMMSS.WAV (a 4-character extension is
    # chopped off). A directory part is allowed: each file is renamed inside
    # its own directory.
    #
    # Renaming happens in two passes. Every file is first moved to a .tmp name,
    # and only then to its final .WAV name, so a new name that equals the
    # current name of another file in the list never forces an overwrite.
    #
    # Returns nothing. Errors from parsing or from `mv` are not caught.
    function move_one_hour!(files::Vector{String}, operator)
        @assert operator == (+) || operator == (-)
        fix_extension_of_files = Tuple{String,String}[]
        for old_file in files
            # Work on the bare file name so directories (which may themselves
            # contain "_") do not confuse the date parsing below.
            folder = dirname(old_file)
            stem = chop(basename(old_file), tail = 4)
            d, t = split(stem, "_")
            dt = DateTime(
                parse(Int64, d[1:4]),
                parse(Int64, d[5:6]),
                parse(Int64, d[7:8]),
                parse(Int64, t[1:2]),
                parse(Int64, t[3:4]),
                parse(Int64, t[5:6]),
            )
            new_date = operator(dt, Dates.Hour(1))
            # Must drop the WAV extension to avoid force=true with mv, since the
            # new file name may already exist.
            base_file = joinpath(folder, Dates.format(new_date, "yyyymmdd_HHMMSS"))
            temp_file = base_file * ".tmp"
            mv(old_file, temp_file)
            # Remember the pair so the extension can be restored in the second pass
            push!(fix_extension_of_files, (temp_file, base_file * ".WAV"))
            print(".")
        end
        for (temp_file, final_file) in fix_extension_of_files
            mv(temp_file, final_file)
        end
        print("Tidy\n")
    end
    """
  • edit in src/Train.jl at line 2
    [4.335632]
    [4.335645]
    export train #beware Flux.train! is not Skraak.train
  • edit in src/Train.jl at line 9
    [4.335805][4.335805:335859]()
    export train #beware Flux.train! is not Skraak.train
  • replacement in src/Skraak.jl at line 1
    [4.345458][4.345459:345473]()
    module Skraak
    [4.345458]
    [4.345473]
    # Skraak.jl
  • replacement in src/Skraak.jl at line 3
    [4.345474][4.345474:345547]()
    export make_clips, move_clips_to_folders, aggregate_labels, audiodata_db
    [4.345474]
    [4.345547]
    module Skraak
  • edit in src/Skraak.jl at line 5
    [4.345548]
    [4.345548]
    include("ConstructPrimaryDataset.jl")
  • replacement in src/Skraak.jl at line 8
    [4.345590][4.345590:345629]()
    #include("ConstructPrimaryDataset.jl")
    [4.345590]
    [4.345629]
    include("FileMetaData.jl")
    include("Clips.jl")
    include("Labels.jl")
  • edit in src/Skraak.jl at line 13
    [4.345680][4.345680:356019]()
    using CSV, DataFrames, DataFramesMeta, Dates, DSP, Glob, JSON, Random, TimeZones, WAV, PNGFiles, Images #Plots
    #import DataFramesMeta: @transform!, @subset!, @byrow, @passmissing
    """
    make_clips(preds_path::String, dawn_dusk_dict::Dict{Dates.Date, Tuple{Dates.DateTime, Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv"))
    This function takes a preds.csv files and generates
    file names, wav's, spectrograms etc to be reviewed.
    it calls night() and may call construct_dawn_dusk_dict() unless the dict is globally defined and passed in
    It should be run from Pomona-1/, Pomona-2/ or Pomona-3/, assumes it is, it uses the path
    It saves wav and png files to /home/david/Upload/
    need to use a try/catch because the 2 assert functions throw an error to short circuit the function
    using Glob, Skraak
    predictions = glob("*/2023-09-11*/preds*")
    predictions = glob("path/to/preds*")
    for file in predictions #[1:6][7:12][13:18][19:24]
    try
    make_clips(file)
    catch x
    println(x)
    end
    end
    if needed to change headers in preds csv
    shift, control, f in subl
    file,start_time,end_time,label
    /media/david/Pomona-2,<project filters>, preds-2023-02-27.csv
    file,start_time,end_time,absent,present
    using Glob, CSV, DataFrames, DataFramesMeta, Dates, DSP, Plots, Random, WAV
    """
    # Assumes run on linux
    # Assumes function run from Pomona-1 or Pomona-2
    #dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv",),
    # Cut audio clips and spectrogram images for the detections listed in a preds csv.
    #
    # Arguments:
    #   preds_path     - path of the form location/trip_date/<file>; the first two
    #                    components are reused to locate the WAV files.
    #   label          - class label that counts as a detection.
    #   night          - true keeps night-time rows, false keeps day-time rows.
    #   dawn_dusk_dict - dawn/dusk lookup passed on to night_or_day!; the default
    #                    `dddict` is not defined in this block (presumably a
    #                    module-level dictionary — TODO confirm).
    #
    # Side effects: writes <name>.wav and <name>.png into "Clips_<today>" relative
    # to the current directory. Returns nothing of use (the value of println).
    function make_clips(
    preds_path::String,
    label::Int = 1,
    night::Bool = true,
    dawn_dusk_dict = dddict,
    )
    # Assumes function run from Pomona-1 or Pomona-2
    location, trip_date, _ = split(preds_path, "/")
    # Load and group data frame by file
    # Each stage hands the data frame to the next; the two assert_* stages are
    # what stop the pipeline when there is nothing to do.
    gdf =
    #! format: off
    DataFrame(CSV.File(preds_path)) |>
    x -> assert_not_empty(x, preds_path) |>
    x -> rename_column!(x, "1.0", "label") |> #can remove now, needs to be label
    x -> assert_detections_present(x, label, location, trip_date) |>
    x -> filter_positives!(x, label) |>
    insert_datetime_column! |>
    x -> night_or_day!(x, dawn_dusk_dict, night) |> #true=night, false=day
    group_by_file!
    #! format: on
    # Make clip and spectrogram
    # One iteration per source audio file (one group of the grouped data frame).
    for (k, v) in pairs(gdf)
    #file_name = chop(v.file[1], head = 2, tail = 4)
    file_name = path_to_file_string(v.file[1])
    start_times = v[!, :start_time] |> sort
    detections = cluster_detections(start_times)
    isempty(detections) && continue
    # The source audio is always looked up with an upper-case .WAV extension.
    signal, freq = wavread("$location/$trip_date/$file_name.WAV")
    length_signal = length(signal)
    for detection in detections
    # st / en are sample indices returned as Float64
    st, en = calculate_clip_start_end(detection, freq, length_signal)
    # Clip name carries location, trip date, source file and start/end in whole seconds.
    name = "$location-$trip_date-$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"
    f = "Clips_$(today())"
    mkpath(f)
    outfile = "$f/$name"
    sample = signal[Int(st):Int(en)]
    wavwrite(sample, "$outfile.wav", Fs = Int(freq))
    #plot = plot_spectrogram(sample, freq)
    #savefig(plot, "$outfile.png")
    image = get_image_from_sample(sample, freq)
    PNGFiles.save("$outfile.png", image)
    end
    print(".")
    end
    println("\ndone $location/$trip_date \n")
    end
    #######################################################################
    # Pass `df` through unchanged when it holds data.
    # When it has no rows or no columns, log the problem and throw, so the
    # caller's pipeline stops with a message that names the real cause.
    # (Previously the abort only happened as a side effect of failing to convert
    # `nothing` to a DataFrame, and a header-only csv slipped through.)
    function assert_not_empty(df::DataFrame, preds_path::String)::DataFrame
        if isempty(df)
            message = "Empty dataframe at $preds_path"
            @error message
            error(message)
        end
        return df
    end
    # Rename column `old_name` to `new_name` in place when it exists; otherwise
    # leave `df` untouched. Returns `df` either way.
    function rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame
        if old_name in names(df)
            rename!(df, old_name => new_name)
        end
        return df
    end
    # assumes kiwi, binary classifier from opensoundscape
    # needed to remove ::String annotation for location, trip_date to make it work
    # Pass `df` through unchanged when at least one row carries `label`.
    # Otherwise log the problem and throw, so the caller's pipeline stops with
    # a message that names the real cause (previously the abort was a side
    # effect of failing to convert `nothing` to a DataFrame).
    # `location` and `trip_date` are only used in the message and are left
    # untyped because callers pass substrings.
    function assert_detections_present(
        df::DataFrame,
        label::Int,
        location,
        trip_date,
    )::DataFrame
        if !(label in levels(df.label))
            message = "No detections for label = $label at $location/$trip_date"
            @error message
            error(message)
        end
        return df
    end
    # assumes kiwi
    # Keep, in place, only the rows whose `label` column equals `label`.
    function filter_positives!(df::DataFrame, label)::DataFrame
        filter!(df) do row
            row.label == label
        end
        return df
    end
    # Return the file name of `path` up to its first "." (no directory, no
    # extension). `path` is left untyped on purpose: csv columns arrive as
    # InlineStrings, which a ::String annotation would reject.
    function path_to_file_string(path)
        file_name = last(split(path, "/"))
        return first(split(file_name, "."))
    end
    # Parse the time stamp encoded in a recording's file name.
    # Names longer than 13 characters are read as yyyymmdd_HHMMSS. Shorter names
    # get "20" inserted after the 4th character and are read as ddmmyyyy_HHMMSS.
    function filename_to_datetime!(file)::DateTime
        file_string = path_to_file_string(file)
        if length(file_string) > 13
            return DateTime(file_string, dateformat"yyyymmdd_HHMMSS")
        end
        with_century = file_string[1:4] * "20" * file_string[5:end]
        return DateTime(with_century, dateformat"ddmmyyyy_HHMMSS")
    end
    # Add a :DateTime column to `df` in place, computed row by row from the
    # :file column with filename_to_datetime!. Returns the same data frame.
    function insert_datetime_column!(df::DataFrame)::DataFrame
    # String(:file) converts each file entry to a String before it is parsed
    @transform!(df, @byrow :DateTime = filename_to_datetime!(String(:file)))
    return df
    end
    # calls night(), needs dawn_dusk_dict in local time format
    # Keep, in place, only the rows recorded at night (`night_time == true`) or
    # only those recorded during the day (`night_time == false`), as decided by
    # `night` on each row's :DateTime with the supplied dawn/dusk dictionary.
    function night_or_day!(
        df::DataFrame,
        dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}},
        night_time::Bool = true,
    )::DataFrame
        if night_time
            @subset!(df, @byrow night(:DateTime, dawn_dusk_dict))
        else
            @subset!(df, @byrow !night(:DateTime, dawn_dusk_dict))
        end
        return df
    end
    # Group the rows of `df` by their :file column and return the grouped frame.
    function group_by_file!(df::DataFrame)
        return groupby(df, :file)
    end
    # Group sorted detection start times (seconds) into clusters.
    # A time joins the current cluster when it lies at most `max_gap` seconds
    # after the previous time; otherwise it starts a new cluster.
    #
    # Arguments:
    #   start_times - detection start times, expected in ascending order.
    #   max_gap     - largest gap in seconds inside one cluster (default 15.0).
    #
    # Returns the clusters that contain more than one detection; lone
    # detections are dropped. Empty input gives an empty result.
    function cluster_detections(
        start_times::AbstractVector{<:Real};
        max_gap::Real = 15.0,
    )::Vector{Vector{Float64}}
        clusters = Vector{Float64}[]
        isempty(start_times) && return clusters
        current = Float64[first(start_times)]
        for time in Iterators.drop(start_times, 1)
            if time - last(current) <= max_gap
                push!(current, time)
            else
                # `current` is rebound to a fresh vector below, so no copy is needed
                push!(clusters, current)
                current = Float64[time]
            end
        end
        push!(clusters, current)
        return filter(cluster -> length(cluster) > 1, clusters)
    end
    # assumes it is operating on 5 second clips
    # Convert a cluster of detection start times (seconds) into the first and
    # last sample index of the clip to cut.
    # The clip starts at the first detection (index 1 when that time is not
    # positive) and ends 5 seconds after the last detection starts, capped at
    # the length of the signal.
    function calculate_clip_start_end(
        detection::Vector{Float64},
        freq::Float32,
        length_signal::Int64,
    )::Tuple{Float64,Float64}
        first_hit = first(detection)
        st = first_hit > 0 ? first_hit * freq : 1
        wanted_end = (last(detection) + 5.0) * freq
        en = wanted_end <= length_signal ? wanted_end : length_signal
        return st, en
    end
    #= Deprecated: use get_image_from_sample() instead.
    function plot_spectrogram(
    sample::Vector{Float64},
    freq::Float32,
    )::Plots.Plot{Plots.GRBackend}
    S = DSP.spectrogram(sample[:, 1], 400, 2; fs = convert(Int, freq))
    plot = Plots.heatmap(
    S.time,
    S.freq,
    pow2db.(S.power),
    size = (448, 448),
    showaxis = false,
    ticks = false,
    legend = false,
    thickness_scaling = 0,
    )
    return plot
    end
    =#
    # f needs to be integer-valued: it is passed to convert(Int, f)
    """
        get_image_from_sample(sample, f)

    Build a 224x224 RGB spectrogram image from `sample`.

    `f` is converted with `convert(Int, f)` and passed as `fs` to
    `DSP.spectrogram`, which is called with 400 and 2 as its second and third
    positional arguments. Zero power values are replaced by the second-smallest
    distinct power value before conversion with `DSP.pow2db`. The result is then
    shifted by the absolute value of its minimum, divided by its maximum, flipped
    along the first dimension, converted to `RGB` and resized.

    `sample` was previously annotated as `Vector{Float64}`.
    """
    function get_image_from_sample(sample, f)
        spec = DSP.spectrogram(sample, 400, 2; fs = convert(Int, f))
        power = spec.power
        if minimum(power) == 0.0
            # Replace exact zeros with the next smallest distinct value.
            distinct = sort(unique(vec(power)))
            replace!(power, 0.0 => distinct[2])
        end
        db = DSP.pow2db.(power)
        shifted = db .+ abs(minimum(db))
        scaled = shifted ./ maximum(shifted)
        flipped = reverse(scaled, dims = 1)
        image = imresize(RGB.(flipped), 224, 224)
        return image
    end
    """
    construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
    sun = DataFrame(CSV.File(file))
    Takes dawn_dusk.csv and returns a dict to be consumed by night().
    ~/dawn_dusk.csv
    At present it goes from the start of 2019 to the end of 2024
    The csv contains local time sunrise and sunset
    I use this to decide if a file with a local time encoded name was recorded at night
    dict = construct_dawn_dusk_dict("/Volumes/SSD1/dawn_dusk.csv")
    dict = Utility.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")
    using CSV, DataFrames
    """
    function construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
        # Reads the CSV at `file`; its Date, Dawn and Dusk columns are used below.
        sun = DataFrame(CSV.File(file))
        # Build the typed Dict straight from the zipped columns, one
        # Date => (Dawn, Dusk) entry per row. This avoids materialising a single
        # Tuple with one element per row, as Tuple(zip(...)) did.
        return Dict{Date,Tuple{DateTime,DateTime}}(
            zip(sun.Date, zip(sun.Dawn, sun.Dusk)),
        )
    end
    """
    night(call_time::DateTime, dict::Dict{Date, Tuple{DateTime, DateTime}})::Bool
    Returns true if time is at night, ie between civil twilights, dusk to dawn.
    Consumes dict from construct_dawn_dusk_dict
    time=DateTime("2021-11-02T21:14:35",dateformat"yyyy-mm-ddTHH:MM:SS")
    Utility.night(time, dict)
    """
    function night(call_time::DateTime, dict::Dict{Date,Tuple{DateTime,DateTime}})::Bool
        # Entry for the calendar day of the call: (dawn, dusk).
        # A date missing from `dict` raises a KeyError from the lookup.
        dawn, dusk = dict[Date(call_time)]
        # Night is everything not strictly between dawn and dusk.
        return !(dawn < call_time < dusk)
    end
    #######################################################################
    #INBETWEEN STEP: use secondary model to sort clips, move clips into D, F, M, N, and hand classify, generate actual.csv.
    """
    move_clips_to_folders(df::DataFrame)
    Takes a 2 column dataframe: file, label
    file must be list of png images, assumes wav's are there too
    will move mp4's from video folder if they are present
    """
    function move_clips_to_folders(df::DataFrame)
        # Files present in the current working directory, used for the checks below.
        pngs = glob("*.png")
        wavs = glob("*.[W,w][A,a][V,v]")
        @assert split(first(df.file), ".")[end] == "png" "df.file must be a list of png's"
        @assert issetequal(df.file, pngs) "All png files in dataframe must be present in folder"
        @assert issetequal(chop.(df.file, head = 0, tail = 4), chop.(wavs, head = 0, tail = 4)) "There must be a wav for every png in the dataframe"
        for row in eachrow(df)
            src = row.file
            dst = "$(row.label)/$(row.file)"
            mkpath("$(row.label)/")
            # Best effort per row: a failed move is logged and the loop continues.
            try
                mv(src, dst)
                # NOTE(review): the wav is moved with a lower-case "wav" extension
                # although the glob above also accepts upper-case; confirm this on
                # case-sensitive file systems.
                mv(chop(src, tail = 3) * "wav", chop(dst, tail = 3) * "wav")
                # Fixed: this was `isdir(video)`, an undefined variable, so the
                # resulting UndefVarError was caught below and videos never moved.
                if isdir("video")
                    mkpath("video/$(row.label)/")
                    mv(
                        "video/" * chop(src, tail = 3) * "mp4",
                        "video/" * chop(dst, tail = 3) * "mp4",
                    )
                end
            catch e
                @info e
            end
        end
    end
    #=
    actual.csv must be list of qualified png file names:
    D/C05-2023-04-15-20230219_223000-380-470.png
    using Glob, DataFrames, CSV
    a=glob("[M,F,D,N]/*.png")
    df = DataFrame(file=a)
    CSV.write("actual_mfdn.csv", df)
  • edit in src/Skraak.jl at line 14
    [4.356020][4.356020:360059]()
    make a folder D,F,M,N
    mkpath.(["D", "F", "M", "N"])
    move wavs to match pngs
    df=DataFrame(CSV.File("actual_mfdn.csv"))
    for row in eachrow(df)
    src=split(row.file, "/")[2]
    dst=row.file
    mv(src, dst)
    mv(chop(src, tail=3)*"wav", chop(dst, tail=3)*"wav")
    end
    =#
    #run from parent folder of label folders
    #saves actual.csv and returns a df
    #labels=["D", "F", "M", "N"]
    """
        actual_from_folders(labels::Vector{String})::DataFrame

    Collect every `<label>/*.png` path for the given `labels`, in label order, into
    a one-column DataFrame (`file`). The DataFrame is written to "actual.csv" in
    the current directory and returned.
    """
    function actual_from_folders(labels::Vector{String})::DataFrame
        png_paths = String[p for label in labels for p in glob("$label/*.png")]
        table = DataFrame(file = png_paths)
        CSV.write("actual.csv", table)
        return table
    end
    """
    aggregate_labels(actual="actual.csv", outfile="labels.csv")
    file
    [D, F, M, N]/C05-2023-04-15-20230219_223000-380-470.png
    This function takes the csv output from my hand classification and outputs a df, and a csv for insertion into AudioData.duckdb using the duckdb cli or using DFto.audiodata_db()
    assumes run from Clips_xxxx-xx-xx folder and that "actual.csv" present if not specified.
    returns a dataframe
    using CSV, DataFrames, DataFramesMeta
    """
    #=
    df=aggregate_labels()
    audiodata_db(df, "pomona_labels_20230418") NOT_WORKING maybe titles
    to use cli, need to remove header row
    duckdb /media/david/SSD1/AudioData.duckdb
    COPY pomona_labels_20230418 FROM 'DB_Labels/pomona_labels_2023-11-02.csv';
    COPY pomona_files FROM 'DB_Files/pomona_files_20231102.csv';
    Then backup with:
    EXPORT DATABASE 'AudioDataBackup_2023-11-14';
    .quit
    Then quit and backup using cp on the db file, dated copy
    Then rsync ssd to usb
    rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
    note: run on mac
    cd skraak.kiwi
    julia-1.9
    using Franklin
    serve()
    =#
    # New one, without noise and distance, does not do :box anymore therefore requires new db schema
    function aggregate_labels(
        actual::String = "actual.csv",
        outfile::String = "labels.csv",
        hdr::Bool = false, # header for outfile
    )::DataFrame
        # Paths look like "<label folder>/<location>-...-<recording>-<start>-<end>.png".
        # label_dir returns the folder part; name_parts returns the "-"-separated
        # fields of the file name.
        label_dir(path) = split(path, "/")[1]
        name_parts(path) = split(split(path, "/")[2], "-")

        df = DataFrame(CSV.File(actual))

        # location, f, start_time, end_time
        @transform!(df, @byrow :location = name_parts(:file)[1])
        @transform!(df, @byrow :f = name_parts(:file)[5] * ".WAV")
        @transform!(df, @byrow :start_time = name_parts(:file)[end-1])
        # chop removes the trailing ".png" from the last field.
        @transform!(df, @byrow :end_time = chop(name_parts(:file)[end], tail = 4))
        # The :box column is no longer generated.

        # male, female, duet, not: flags derived from the label folder
        @transform!(df, @byrow @passmissing :male = label_dir(:file) == "M")
        @transform!(df, @byrow @passmissing :female = label_dir(:file) == "F")
        @transform!(df, @byrow @passmissing :duet = label_dir(:file) == "D")
        @transform!(
            df,
            @byrow @passmissing :not_kiwi = label_dir(:file) in ["KA", "KE", "N", "Q"]
        )

        # other_label: the folder name for KA/KE/Q, otherwise missing
        @transform!(
            df,
            @byrow @passmissing :other_label =
                label_dir(:file) in ["KA", "KE", "Q"] ? label_dir(:file) : missing
        )

        # Drop the original path column and let :f take over the :file name.
        select!(df, Not([:file]))
        rename!(df, :f => :file)
        CSV.write(outfile, df; header = hdr)
        return df
    end
    """
    audiodata_db(df::DataFrame, table::String)
    Use to upload labels to AudioData.duckdb
    Takes a dataframe and inserts into AudioData.db table.
    audiodata_db(df, "pomona_labels_20230418")
    using DataFrames, DBInterface, DuckDB, Random
    """
    function audiodata_db(df::DataFrame, table::String)
        # The database location depends on the platform: Linux mount point,
        # otherwise the /Volumes path.
        db_path = if Sys.islinux()
            "/media/david/SSD1/AudioData.duckdb"
        else
            "/Volumes/SSD1/AudioData.duckdb"
        end
        con = DBInterface.connect(DuckDB.DB, db_path)
        try
            # Register `df` under a random 6-character name so the INSERT can
            # select from it.
            temp_name = randstring(6)
            DuckDB.register_data_frame(con, df, temp_name)
            # NOTE(review): `table` is interpolated into the SQL text unescaped;
            # only pass trusted table names.
            DBInterface.execute(
                con,
                """
                INSERT
                INTO $table
                SELECT *
                FROM '$temp_name'
                """,
            )
        finally
            # Close the connection even when registration or the INSERT throws;
            # previously it was left open on error.
            DBInterface.close!(con)
        end
        return nothing
    end
  • edit in src/Predict.jl at line 2
    [4.366510]
    [4.366510]
    export predict
  • edit in src/Predict.jl at line 8
    [4.366665][4.366665:366681]()
    export predict
  • edit in src/ConstructPrimaryDataset.jl at line 3
    [2.48][2.48:71]()
    # Does not compile yet
  • replacement in src/ConstructPrimaryDataset.jl at line 4
    [2.72][4.375126:375170](),[4.375126][4.375126:375170]()
    using DataFrames, DataFramesMeta, CSV, Glob
    [2.72]
    [4.375170]
    using DataFrames, CSV, Glob
    using DataFramesMeta: @transform!, @byrow #, @subset!, @passmissing
  • replacement in src/ConstructPrimaryDataset.jl at line 7
    [4.375171][4.375171:375599]()
    # Only moves WAVs not already there in dataset
    # converts WAVs to flac to save space, file metadata will not survive
    # requires columns :location, :file, :start_time, :end_time
    # :file is the file name, :location is the actual recorder location eg "C05"
    # run where the raw data is
    # will find file in folder structure location/trip_date/file
    # constructs dataset at output_path
    # assumes file name has one . for extension only
    [4.375171]
    [3.5050]
    """
    Only moves WAVs not already there in dataset
    converts WAVs to flac to save space, file metadata will not survive
    requires columns :location, :file, :start_time, :end_time
    :file is the file name, :location is the actual recorder location eg "C05"
    run where the raw data is
    will find file in folder structure location/trip_date/file
    constructs dataset at output_path
    assumes file name has one . for extension only
    """
  • replacement in src/ConstructPrimaryDataset.jl at line 49
    [4.376687][4.376687:376720]()
    function save_pngs(df:DataFrame)
    [4.376687]
    [3.6239]
    function save_pngs(df::DataFrame)