new files tracked

[?]
AEj8dahVWy718uSSFPe9VSRJ5qX5G8pC2zvFzJJ8yzBd
Dec 9, 2023, 8:06 AM
2UBDFCJH2BG6U6SY2YDJ7QK4JOLUJAHOYZS3YRQ7E7U4UGP4YR5QC

Dependencies

Change contents

  • file addition: Labels.jl (----------)
    [2.6598]
    # Labels.jl
    export aggregate_labels, audiodata_db
    using CSV, DataFrames, Glob, Random, DBInterface, DuckDB
    using DataFramesMeta: @transform!, @subset!, @byrow, @passmissing
    #=
    actual.csv must be list of qualified png file names:
    D/C05-2023-04-15-20230219_223000-380-470.png
    using Glob, DataFrames, CSV
    a=glob("[M,F,D,N]/*.png")
    df = DataFrame(file=a)
    CSV.write("actual_mfdn.csv", df)
    make a folder D,F,M,N
    mkpath.(["D", "F", "M", "N"])
    move wavs to match pngs
    df=DataFrame(CSV.File("actual_mfdn.csv"))
    for row in eachrow(df)
    src=split(row.file, "/")[2]
    dst=row.file
    mv(src, dst)
    mv(chop(src, tail=3)*"wav", chop(dst, tail=3)*"wav")
    end
    =#
    """
        actual_from_folders(labels::Vector{String})::DataFrame

    Collect every `<label>/*.png` path for each entry of `labels` (in the order
    given), write the list to `actual.csv` in the current directory as a single
    `file` column, and return the same table as a `DataFrame`.

    Run from the parent folder of the label folders, e.g.
    `actual_from_folders(["D", "F", "M", "N"])`.
    """
    function actual_from_folders(labels::Vector{String})::DataFrame
        # One pass over the label folders, keeping the glob order inside each folder.
        png_paths = String[png for label in labels for png in glob("$label/*.png")]
        listing = DataFrame(file = png_paths)
        CSV.write("actual.csv", listing)
        return listing
    end
    """
    aggregate_labels(actual="actual.csv", outfile="labels.csv")
    file
    [D, F, M, N]/C05-2023-04-15-20230219_223000-380-470.png
    This function takes the csv output from my hand classification and outputs a df and a csv for insertion into AudioData.duckdb, using the duckdb cli or using DFto.audiodata_db()
    assumes run from Clips_xxxx-xx-xx folder and that "actual.csv" present if not specified.
    returns a dataframe
    using CSV, DataFrames, DataFramesMeta
    """
    #=
    df=aggregate_labels()
    audiodata_db(df, "pomona_labels_20230418") NOT_WORKING maybe titles
    to use cli, need to remove header row
    duckdb /media/david/SSD1/AudioData.duckdb
    COPY pomona_labels_20230418 FROM 'DB_Labels/pomona_labels_2023-11-02.csv';
    COPY pomona_files FROM 'DB_Files/pomona_files_20231102.csv';
    Then backup with:
    EXPORT DATABASE 'AudioDataBackup_2023-11-14';
    .quit
    Then quit and backup using cp on the db file, dated copy
    Then rsync ssd to usb
    rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
    note: run on mac
    cd skraak.kiwi
    julia-1.9
    using Franklin
    serve()
    =#
    # New one, without noise and distance, does not do :box anymore therefore requires new db schema
    # aggregate_labels(actual, outfile, hdr)
    # Reads the csv `actual`, whose `file` column holds qualified clip names of the
    # form "<label folder>/<location>-...-<wav stem>-<start>-<end>.png", and derives
    # one row per clip with the columns
    #   location, file (wav stem * ".WAV"), start_time, end_time,
    #   male, female, duet, not_kiwi, other_label
    # in that order. The table is written to `outfile` (header row only if `hdr`)
    # and returned.
    function aggregate_labels(
        actual::String = "actual.csv",
        outfile::String = "labels.csv",
        hdr::Bool = false, #header for outfile
    )::DataFrame
        # Folder part of the qualified name, i.e. the hand-assigned label.
        label_of(path) = split(path, "/")[1]
        # Dash-separated pieces of the clip name (second path component).
        clip_parts(path) = split(split(path, "/")[2], "-")

        df = DataFrame(CSV.File(actual))

        # location, f, start_time, end_time
        @transform!(df, @byrow :location = clip_parts(:file)[1])
        @transform!(df, @byrow :f = clip_parts(:file)[5] * ".WAV")
        @transform!(df, @byrow :start_time = clip_parts(:file)[end-1])
        # tail = 4 strips the ".png" extension from the last piece
        @transform!(df, @byrow :end_time = chop(clip_parts(:file)[end], tail = 4))

        # male, female, duet, not
        @transform!(df, @byrow @passmissing :male = label_of(:file) == "M")
        @transform!(df, @byrow @passmissing :female = label_of(:file) == "F")
        @transform!(df, @byrow @passmissing :duet = label_of(:file) == "D")
        @transform!(
            df,
            @byrow @passmissing :not_kiwi = label_of(:file) in ["KA", "KE", "N", "Q"]
        )

        # other_label: keep the folder name for the non-kiwi labels except "N"
        @transform!(
            df,
            @byrow @passmissing :other_label =
                label_of(:file) in ["KA", "KE", "Q"] ? label_of(:file) : missing
        )

        # drop the qualified png name, then expose the wav name under :file
        select!(df, Not([:file]))
        rename!(df, :f => :file)

        CSV.write(outfile, df; header = hdr)
        return df
    end
    """
        audiodata_db(df::DataFrame, table::String)

    Append every row of `df` to the existing table `table` of AudioData.duckdb
    (used to upload labels).

    The database is opened at `/media/david/SSD1/AudioData.duckdb` on Linux and at
    `/Volumes/SSD1/AudioData.duckdb` otherwise. `df` is registered under a random
    6-character name and copied across with `INSERT INTO table SELECT *`, so the
    column order of `df` must match the table. The connection is closed even when
    the insert fails. Returns `nothing`.

    `table` is interpolated into the SQL text unescaped: pass trusted names only.

        audiodata_db(df, "pomona_labels_20230418")
    """
    function audiodata_db(df::DataFrame, table::String)
        db_path = if Sys.islinux()
            "/media/david/SSD1/AudioData.duckdb"
        else
            "/Volumes/SSD1/AudioData.duckdb"
        end
        con = DBInterface.connect(DuckDB.DB, db_path)
        try
            temp_name = randstring(6)
            DuckDB.register_data_frame(con, df, temp_name)
            DBInterface.execute(
                con,
                """
                INSERT
                INTO $table
                SELECT *
                FROM '$temp_name'
                """,
            )
        finally
            # Always release the database file, also when the insert throws.
            DBInterface.close!(con)
        end
        return nothing
    end
  • file addition: FileMetaData.jl (----------)
    [2.6598]
    # FileMetaData.jl
    export file_metadata_to_df
    using DataFrames, Dates, Glob, Random, SHA, TimeZones, WAV, XMLDict
    #DelimitedFiles, DuckDB, JSON3
    #=
    used like:
    using Glob, Skraak, CSV
    folders=glob("*/2023-11-02/")
    for folder in folders
    cd(folder)
    try
    df = Skraak.file_metadata_to_df()
    CSV.write("/media/david/Pomona-3/Pomona-3/pomona_files_20231102.csv", df; append=true)
    catch
    @warn "error with $folder"
    end
    cd("/media/david/Pomona-3/Pomona-3/")
    end
    Then using duckdb cli from SSD:
    duckdb AudioData.duckdb
    show tables;
    SELECT * FROM pomona_files;
    COPY pomona_files FROM '/media/david/Pomona-3/Pomona-3/pomona_files_20231019.csv';
    SELECT * FROM pomona_files;
    Then backup with:
    EXPORT DATABASE 'AudioDataBackup_2023-07-29';
    .quit
    Then quit and backup using cp on the db file
    Then rsync ssd to usb
    rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
    =#
    # Read one AudioMoth recording: return its samples, its sample rate and the
    # space-separated fields of its ICMT comment chunk.
    function _read_audiomoth_wav(wav_path)
        audio_data, sample_rate, _, binary_metadata = wavread(wav_path)
        fields = split(wav_info_read(binary_metadata)[:ICMT], " ")
        # Comments with 22 or more fields are cut back to their first 19 fields.
        comment_vector = length(fields) < 22 ? fields : fields[1:19]
        return audio_data, sample_rate, comment_vector
    end

    # Build the recording timestamp from the ICMT comment fields: field 3 is split
    # on ":" (time), field 4 on "/" (day/month/year) and field 5 carries the offset.
    function _audiomoth_comment_to_zdt(comment_vector)
        dmy = split(comment_vector[4], "/")
        hms = split(comment_vector[3], ":")
        # Drop 4 leading and 1 trailing character (presumably "(UTC+13)" -> "+13");
        # nothing left over is treated as plain UTC.
        offset = chop(comment_vector[5], head = 4, tail = 1)
        time_zone = isempty(offset) ? "+00" : offset
        time_string = string(
            dmy[3], "-", dmy[2], "-", dmy[1],
            "T", hms[1], ":", hms[2], ":", hms[3],
            ".000", time_zone,
        )
        return ZonedDateTime(time_string)
    end

    """
        file_metadata_to_df()

    Describe every `*.WAV` file of the current working directory in a `DataFrame`,
    one row per file.

    The last three components of `pwd()` are taken as `disk`, `location` and
    `trip_date`. Latitude and longitude come from the single `*.gpx` file of the
    folder. The recording period start/finish are the timestamps of the first and
    last wav (sorted by name), expressed in Pacific/Auckland local time. Per file
    the wav comment supplies the timestamp (stored as `utc` and `ldt`), moth id,
    gain, battery and temperature; `duration` is samples / sample rate;
    `sha2_256` is the SHA-256 of the file contents; `night` is evaluated at the
    local start time + 7.5 minutes (assumes 15 minute files).

    Needs raw AudioMoth wav files, a gpx file and `/media/david/SSD1/dawn_dusk.csv`.

    Returns the empty, fully typed `DataFrame` when the folder has no wav files.
    Throws an `ArgumentError` when there is no gpx file. A wav that cannot be
    processed is reported with a warning and skipped.
    """
    function file_metadata_to_df()
        # Column layout of the result (also the layout of the pomona_files csv).
        df = DataFrame(
            disk = String[],
            location = String[],
            trip_date = String[],
            file = String[],
            latitude = Float64[],
            longitude = Float64[],
            start_recording_period_localt = String[],
            finish_recording_period_localt = String[],
            duration = Float64[],
            sample_rate = Int[],
            utc = String[],
            ldt = String[],
            moth_id = String[],
            gain = String[],
            battery = Float64[],
            temperature = Float64[],
            sha2_256 = String[],
            night = Bool[],
        )
        timestamp_format = "yyyy-mm-dd HH:MM:SSzzzz"

        #Get WAV list for folder
        wav_list = glob("*.WAV") |> sort
        #Return empty df if nothing in the folder
        isempty(wav_list) && return df

        #Get path info from file system
        raw_path_vec = split(pwd(), "/")[end-2:end]
        disk = raw_path_vec[1]
        location = raw_path_vec[2]
        trip_date = raw_path_vec[3]

        #Get location, assumes 1 gpx is in the folder
        waypoint = glob("*.gpx")
        if isempty(waypoint)
            # Fail with a clear message instead of a BoundsError on waypoint[1].
            @error "no gpx file in $trip_date $location"
            throw(ArgumentError("no gpx file in $trip_date $location"))
        elseif length(waypoint) > 1
            @warn "expected 1 gpx file in $trip_date $location, found $(length(waypoint)); using $(first(waypoint))"
        end
        loc = read(waypoint[1], String) |> xml_dict
        latitude = parse(Float64, (loc["gpx"]["wpt"][:lat]))
        longitude = parse(Float64, (loc["gpx"]["wpt"][:lon]))

        #Start of recording period: timestamp of the first file
        _, _, comment_vector_start = _read_audiomoth_wav(wav_list[1])
        zdt1 = _audiomoth_comment_to_zdt(comment_vector_start)
        start_recording_period_localt =
            Dates.format(astimezone(zdt1, tz"Pacific/Auckland"), timestamp_format)

        #End of recording period: timestamp of the last file, using that file's
        #own UTC offset (previously the offset of the first file was reused)
        _, _, comment_vector_end = _read_audiomoth_wav(wav_list[end])
        zdt2 = _audiomoth_comment_to_zdt(comment_vector_end)
        finish_recording_period_localt =
            Dates.format(astimezone(zdt2, tz"Pacific/Auckland"), timestamp_format)

        dict = Skraak.construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")

        #So I know what it is doing
        println(raw_path_vec)

        #Loop over file list
        for file in wav_list
            try
                audio_data, sample_rate, comment_vector = _read_audiomoth_wav(file)
                # Rows are samples, so this stays correct for multi-channel files.
                duration = Float64(size(audio_data, 1) / sample_rate)

                preformatting_zdt = _audiomoth_comment_to_zdt(comment_vector)
                preformatting_utc = astimezone(preformatting_zdt, tz"UTC")
                utc = Dates.format(preformatting_utc, timestamp_format)
                preformatting_ldt = astimezone(preformatting_zdt, tz"Pacific/Auckland")
                ldt = Dates.format(preformatting_ldt, timestamp_format)

                moth_id = comment_vector[8]
                gain = comment_vector[10]
                #index back from end because if V > 4.9 the wording changes
                battery = parse(Float64, chop(comment_vector[end-4], tail = 1))
                temperature = parse(Float64, chop(comment_vector[end], tail = 2))

                # NOTE(review): hash the file CONTENTS. sha256(file) on the name
                # string only hashed the file name, so rows produced before this
                # fix hold name hashes.
                sha2_256 = bytes2hex(open(sha256, file))

                #assumes 15 minute file and calculates on half way time
                nt = Skraak.night(DateTime(preformatting_ldt + Minute(7) + Second(30)), dict)

                #Populate row to push into df
                row = [
                    disk,
                    location,
                    trip_date,
                    file,
                    latitude,
                    longitude,
                    start_recording_period_localt,
                    finish_recording_period_localt,
                    duration,
                    Int(sample_rate),
                    utc,
                    ldt,
                    moth_id,
                    gain,
                    battery,
                    temperature,
                    sha2_256,
                    nt,
                ]
                push!(df, row)
                print(".")
            catch e
                # Let Ctrl-C through; everything else is best effort per file.
                e isa InterruptException && rethrow()
                @warn "error with $location/$trip_date $file" exception =
                    (e, catch_backtrace())
            end
        end
        return df
    end
  • file addition: Clips.jl (----------)
    [2.6598]
    # Clips.jl
    export make_clips, move_clips_to_folders
    using CSV, DataFrames, Dates, DSP, Glob, JSON, Random, TimeZones, WAV, PNGFiles, Images
    using DataFramesMeta: @transform!, @subset!, @byrow, @passmissing
    """
    make_clips(preds_path::String, dawn_dusk_dict::Dict{Dates.Date, Tuple{Dates.DateTime, Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv"))
    This function takes a preds.csv files and generates
    file names, wav's, spectrograms etc to be reviewed.
    it calls night() and may call construct_dawn_dusk_dict() unless the dict is globally defined and passed in
    It should be run from Pomona-1/, Pomona-2/ or Pomona-3/, assumes it is, it uses the path
    It saves wav and png files to /home/david/Upload/
    need to use a try/catch because the 2 assert functions throw an error to short circuit the function
    using Glob, Skraak
    predictions = glob("*/2023-09-11*/preds*")
    predictions = glob("path/to/preds*")
    for file in predictions #[1:6][7:12][13:18][19:24]
    try
    make_clips(file)
    catch x
    println(x)
    end
    end
    if needed to change headers in preds csv
    shift, control, f in subl
    file,start_time,end_time,label
    /media/david/Pomona-2,<project filters>, preds-2023-02-27.csv
    file,start_time,end_time,absent,present
    using Glob, CSV, DataFrames, DataFramesMeta, Dates, DSP, Plots, Random, WAV
    """
    # Assumes run on linux
    # Assumes function run from Pomona-1 or Pomona-2
    #dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv",),
    # make_clips(preds_path, label, night, dawn_dusk_dict)
    # Reads the predictions csv at `preds_path` ("<location>/<trip_date>/<csv>"),
    # keeps the rows whose label equals `label` and that fall at night
    # (`night = true`) or during the day (`night = false`), clusters the detections
    # of each recording and writes one wav plus one spectrogram png per cluster into
    # "Clips_<today>/". The recordings are read from
    # "<location>/<trip_date>/<file>.WAV" relative to the working directory.
    # `dawn_dusk_dict` defaults to the global `dddict`.
    function make_clips(
        preds_path::String,
        label::Int = 1,
        night::Bool = true,
        dawn_dusk_dict = dddict,
    )
        # Assumes function run from Pomona-1 or Pomona-2
        location, trip_date, _ = split(preds_path, "/")

        # Load the predictions and narrow them down step by step; the two assert
        # helpers abort the whole call when there is nothing to do.
        preds = DataFrame(CSV.File(preds_path))
        preds = assert_not_empty(preds, preds_path)
        preds = rename_column!(preds, "1.0", "label") #can remove now, needs to be label
        preds = assert_detections_present(preds, label, location, trip_date)
        preds = filter_positives!(preds, label)
        preds = insert_datetime_column!(preds)
        preds = night_or_day!(preds, dawn_dusk_dict, night) #true=night, false=day
        gdf = group_by_file!(preds)

        # Make clip and spectrogram, one recording at a time
        for recording in gdf
            file_name = path_to_file_string(recording.file[1])
            start_times = sort(recording[!, :start_time])
            detections = cluster_detections(start_times)
            isempty(detections) && continue

            signal, freq = wavread("$location/$trip_date/$file_name.WAV")
            length_signal = length(signal)

            for detection in detections
                st, en = calculate_clip_start_end(detection, freq, length_signal)
                # whole seconds covered by the clip, used in the clip name
                first_second = Int(floor(st / freq))
                last_second = Int(ceil(en / freq))
                name = "$location-$trip_date-$file_name-$first_second-$last_second"

                f = "Clips_$(today())"
                mkpath(f)
                outfile = "$f/$name"

                sample = signal[Int(st):Int(en)]
                wavwrite(sample, "$outfile.wav", Fs = Int(freq))

                image = get_image_from_sample(sample, freq)
                PNGFiles.save("$outfile.png", image)
            end
            print(".")
        end
        println("\ndone $location/$trip_date \n")
    end
    #######################################################################
    """
        assert_not_empty(df::DataFrame, preds_path::String)::DataFrame

    Return `df` unchanged unless it has neither rows nor columns (size `(0, 0)`).
    In that case log an error naming `preds_path` and throw an `ArgumentError`, so
    the caller's pipeline stops here.
    """
    function assert_not_empty(df::DataFrame, preds_path::String)::DataFrame
        size(df) != (0, 0) && return df
        @error "Empty dataframe at $preds_path"
        # Abort explicitly instead of relying on the failed conversion of the
        # logging macro's `nothing` to the declared DataFrame return type.
        throw(ArgumentError("Empty dataframe at $preds_path"))
    end
    """
        rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame

    Rename column `old_name` of `df` to `new_name` in place when such a column
    exists; otherwise leave `df` untouched. Returns `df` either way.
    """
    function rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame
        if old_name in names(df)
            rename!(df, old_name => new_name)
        end
        return df
    end
    # assumes kiwi, binary classifier from opensoundscape
    # needed to remove ::String annotation for location, trip_date to make it work
    """
        assert_detections_present(df::DataFrame, label::Int, location, trip_date)::DataFrame

    Return `df` unchanged when `label` occurs among the levels of `df.label`.
    Otherwise log an error naming `location/trip_date` and throw an
    `ArgumentError`, so the caller's pipeline stops here.

    `location` and `trip_date` are deliberately left untyped (they arrive as
    substrings from `split`).
    """
    function assert_detections_present(
        df::DataFrame,
        label::Int,
        location,
        trip_date,
    )::DataFrame
        label in levels(df.label) && return df
        @error "No detections for label = $label at $location/$trip_date"
        # Abort explicitly instead of relying on the failed conversion of the
        # logging macro's `nothing` to the declared DataFrame return type.
        throw(ArgumentError("No detections for label = $label at $location/$trip_date"))
    end
    # assumes kiwi
    """
        filter_positives!(df::DataFrame, label)::DataFrame

    Keep, in place, only the rows of `df` whose `label` column equals `label`.
    Returns `df`.
    """
    function filter_positives!(df::DataFrame, label)::DataFrame
        filter!(:label => ==(label), df)
        return df
    end
    """
        path_to_file_string(path)

    Return the last `/`-separated component of `path`, cut at its first `.`.

    `path` is left untyped on purpose: annotating it as `String` fails with
    "no method matching path_to_file_string(::InlineStrings.String31)".
    """
    function path_to_file_string(path)
        base_name = last(split(path, "/"))
        stem = first(split(base_name, "."))
        return stem
    end
    """
        filename_to_datetime!(file)::DateTime

    Parse the timestamp encoded in the name of `file` (directory and extension
    are ignored). Names longer than 13 characters are read as `yyyymmdd_HHMMSS`;
    shorter ones are read as `ddmmyy_HHMMSS`, with "20" inserted in front of the
    two-digit year.
    """
    function filename_to_datetime!(file)::DateTime
        stem = path_to_file_string(file)
        if length(stem) > 13
            return DateTime(stem, dateformat"yyyymmdd_HHMMSS")
        end
        # ddmmyy_HHMMSS -> ddmm20yy_HHMMSS
        expanded = stem[1:4] * "20" * stem[5:end]
        return DateTime(expanded, dateformat"ddmmyyyy_HHMMSS")
    end
    """
        insert_datetime_column!(df::DataFrame)::DataFrame

    Add a `DateTime` column to `df` in place, computed row by row from the `file`
    column with `filename_to_datetime!`. Returns `df`.
    """
    function insert_datetime_column!(df::DataFrame)::DataFrame
        # String(...) because the file column may hold inline/sub strings
        to_datetime = ByRow(name -> filename_to_datetime!(String(name)))
        transform!(df, :file => to_datetime => :DateTime)
        return df
    end
    # calls night(), needs dawn_dusk_dict in local time format
    """
        night_or_day!(df, dawn_dusk_dict, night_time = true)::DataFrame

    Filter `df` in place on its `DateTime` column: keep the rows for which
    `night(:DateTime, dawn_dusk_dict)` is true when `night_time` is true, and the
    rows for which it is false otherwise. Returns `df`.
    """
    function night_or_day!(
        df::DataFrame,
        dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}},
        night_time::Bool = true,
    )::DataFrame
        # A row is kept when its night/day status matches the requested one.
        wanted(timestamp) = night(timestamp, dawn_dusk_dict) == night_time
        subset!(df, :DateTime => ByRow(wanted); skipmissing = true)
        return df
    end
    """
        group_by_file!(df::DataFrame)

    Return `df` grouped by its `file` column.
    """
    group_by_file!(df::DataFrame) = groupby(df, :file)
    """
        cluster_detections(start_times; max_gap = 15.0)::Vector{Vector{Float64}}

    Split `start_times` (expected in ascending order) into runs in which each
    value is at most `max_gap` after the previous one, and return only the runs
    holding more than one value. Isolated detections are therefore dropped.

    Returns an empty vector for empty input.
    """
    function cluster_detections(
        start_times::AbstractVector{<:Real};
        max_gap::Real = 15.0,
    )::Vector{Vector{Float64}}
        clusters = Vector{Float64}[]
        # Nothing to cluster: avoid indexing the first element of an empty vector.
        isempty(start_times) && return clusters

        current = Float64[first(start_times)]
        for time in Iterators.drop(start_times, 1)
            if time - last(current) <= max_gap
                push!(current, time)
            else
                # Gap too large: close the running cluster and start a new one.
                push!(clusters, current)
                current = Float64[time]
            end
        end
        push!(clusters, current)

        return filter(cluster -> length(cluster) > 1, clusters)
    end
    # assumes it is operating on 5 second clips
    """
        calculate_clip_start_end(detection, freq, length_signal)::Tuple{Float64,Float64}

    Convert a cluster of detection start times (seconds) into a sample range.
    The start is `first(detection) * freq`, or 1 when the first detection is at
    0 or earlier. The end is `(last(detection) + 5.0) * freq`, capped at
    `length_signal` (the 5.0 assumes 5 second prediction windows).
    """
    function calculate_clip_start_end(
        detection::Vector{Float64},
        freq::Float32,
        length_signal::Int64,
    )::Tuple{Float64,Float64}
        first_window = first(detection)
        st = first_window > 0 ? first_window * freq : 1

        requested_end = (last(detection) + 5.0) * freq
        en = requested_end <= length_signal ? requested_end : length_signal

        return st, en
    end
    # f needs to be an integer-valued sample rate (it is passed through convert(Int, f))
    """
        get_image_from_sample(sample, f)

    Turn the audio `sample` (sample rate `f`, converted with `convert(Int, f)`)
    into a 224x224 image of its spectrogram: power is converted to dB, shifted by
    the absolute value of its minimum, divided by its maximum, flipped along the
    first dimension, mapped to `RGB` and resized.

    Exact zeros in the power matrix are first replaced by its second smallest
    distinct value.
    """
    function get_image_from_sample(sample, f) #sample::Vector{Float64}
        spec = DSP.spectrogram(sample, 400, 2; fs = convert(Int, f))
        pw = spec.power

        # Swap exact zeros for the smallest non-zero power before the dB step.
        if minimum(pw) == 0.0
            distinct_values = sort(unique(vec(pw)))
            replace!(pw, 0.0 => distinct_values[2])
        end

        decibels = DSP.pow2db.(pw)
        lifted = decibels .+ abs(minimum(decibels))
        normalised = lifted ./ maximum(lifted)
        upright = reverse(normalised, dims = 1)
        image = imresize(RGB.(upright), 224, 224)
        return image
    end
    """
        construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}

    Read the csv `file` (columns `Date`, `Dawn`, `Dusk`) and return a dict mapping
    each date to its `(dawn, dusk)` pair, to be consumed by `night()`.

    The csv contains local time sunrise and sunset; at present it goes from the
    start of 2019 to the end of 2024. It is used to decide whether a file with a
    local time encoded name was recorded at night.

        dict = construct_dawn_dusk_dict("/Volumes/SSD1/dawn_dusk.csv")
        dict = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")
    """
    function construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
        sun = DataFrame(CSV.File(file))
        twilight_pairs = zip(sun.Dawn, sun.Dusk)
        return Dict(zip(sun.Date, twilight_pairs))
    end
    """
        night(call_time::DateTime, dict::Dict{Date,Tuple{DateTime,DateTime}})::Bool

    Return `true` when `call_time` is at or before the dawn, or at or after the
    dusk, stored in `dict` for the date of `call_time`; `false` otherwise.
    `dict` is the output of `construct_dawn_dusk_dict`.

        time = DateTime("2021-11-02T21:14:35", dateformat"yyyy-mm-ddTHH:MM:SS")
        night(time, dict)
    """
    function night(call_time::DateTime, dict::Dict{Date,Tuple{DateTime,DateTime}})::Bool
        dawn, dusk = dict[Date(call_time)]
        return call_time <= dawn || call_time >= dusk
    end
    #######################################################################
    #INBETWEEN STEP: use secondary model to sort clips, move clips into D, F, M, N, and hand classify, generate actual.csv.
    """
        move_clips_to_folders(df::DataFrame)

    Sort the clips of the current directory into one sub-folder per label.

    `df` has two columns, `file` and `label`. `file` must list exactly the png
    files present in the current directory, and every png must have a wav with
    the same stem. For each row the png and its `.wav` are moved to
    `<label>/`; when `video/<stem>.mp4` exists it is moved to `video/<label>/`.

    A failure while moving one clip is logged and the remaining rows are still
    processed.
    """
    function move_clips_to_folders(df::DataFrame)
        p = glob("*.png")
        w = glob("*.[W,w][A,a][V,v]")
        @assert (first(df.file) |> x -> split(x, ".")[end] |> x -> x == "png") "df.file must be a list of png's"
        @assert issetequal(df.file, p) "All png files in dataframe must be present in folder"
        @assert issetequal(chop.(df.file, head = 0, tail = 4), chop.(w, head = 0, tail = 4)) "There must be a wav for every png in the dataframe"
        for row in eachrow(df)
            src = row.file
            dst = "$(row.label)/$(row.file)"
            mkpath("$(row.label)/")
            try
                mv(src, dst)
                # tail = 3 strips "png", leaving the stem plus the dot
                mv(chop(src, tail = 3) * "wav", chop(dst, tail = 3) * "wav")
                # Videos are optional: only move an mp4 that is really there.
                # (The previous isdir(video) referenced an undefined variable, so
                # this branch always ended in the catch below.)
                video_src = "video/" * chop(src, tail = 3) * "mp4"
                if isfile(video_src)
                    mkpath("video/$(row.label)/")
                    mv(video_src, "video/" * chop(dst, tail = 3) * "mp4")
                end
            catch e
                @info e
            end
        end
    end