Labels.jl
# Labels.jl
#=
actual.csv must be a list of label-qualified png file names (label folder/file name), e.g.:
D/C05-2023-04-15-20230219_223000-380-470.png
using Glob, DataFrames, CSV
a=Glob.glob("[M,F,D,N]/*.png")
df = DataFrames.DataFrame(file=a)
CSV.write("actual_mfdn.csv", df)
make the label folders D, F, M, N
mkpath.(["D", "F", "M", "N"])
move wavs to match pngs
df=DataFrames.DataFrame(CSV.File("actual_mfdn.csv"))
for row in eachrow(df)
src=split(row.file, "/")[2]
dst=row.file
mv(src, dst)
mv(chop(src, tail=3)*"wav", chop(dst, tail=3)*"wav")
end
=#
"""
actual_from_folders(labels::Vector{String})::DataFrame
run from parent folder of label folders
saves actual.csv and returns a df
labels=["D", "F", "M", "N"]
"""
function actual_from_folders(labels::Vector{String})::DataFrame
    # Gather "<label>/*.png" for every label folder, relative to the current directory.
    pngs = String[]
    foreach(label -> append!(pngs, Glob.glob("$label/*.png")), labels)

    # One-column table (`file`), saved next to the label folders and handed back.
    listing = DataFrames.DataFrame(; file = pngs)
    CSV.write("actual.csv", listing)
    return listing
end
#=
df=aggregate_labels()
audiodata_db(df, "pomona_labels_20230418") NOT_WORKING maybe titles
To load with the duckdb cli instead, the header row must first be removed from the csv:
duckdb /media/david/SSD1/AudioData.duckdb
COPY pomona_labels_20230418 FROM 'DB_Labels/pomona_labels_2023-12-28.csv';
COPY pomona_files FROM 'DB_Files/pomona_files_20231228.csv';
Then backup with:
EXPORT DATABASE 'AudioDataBackup_2023-11-14';
.quit
Then quit and backup using cp on the db file, dated copy
Then rsync ssd to usb
rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
note: run on mac
cd skraak.kiwi
julia-1.9
using Franklin
serve()
=#
# New one, without noise and distance. It no longer produces a :box column and therefore requires the new db schema.
"""
    aggregate_labels(actual="actual.csv", outfile="labels.csv", hdr=false)

Take the csv output from hand classification and produce a label table, both as a
returned DataFrame and as a csv (`outfile`) for insertion into AudioData.duckdb
using the duckdb cli or using audiodata_db().

`actual` must contain a `file` column with entries of the form

    [D, F, M, N]/C05-2023-04-15-20230219_223000-380-470.png

i.e. label folder, "/", then a dash-separated clip name.

Columns of the result, in order:
- `location`: first dash-separated field of the clip name
- `file`: fifth dash-separated field of the clip name with ".WAV" appended
- `start_time`, `end_time`: the last two dash-separated fields (the trailing 4
  characters, e.g. ".png", are chopped from the last one); kept as text, not parsed
- `male`, `female`, `duet`: true when the label folder is "M", "F", "D" respectively
- `not_kiwi`: true when the label folder is one of "KA", "KE", "N", "Q"
- `other_label`: the label folder when it is "KA", "KE" or "Q", otherwise missing

`hdr` controls whether `outfile` is written with a header row (default: no header).

Assumes it is run from the Clips_xxxx-xx-xx folder and that "actual.csv" is present
if `actual` is not specified.

Requires: using CSV, DataFrames, DataFramesMeta
"""
function aggregate_labels(
actual::String = "actual.csv",
outfile::String = "labels.csv",
hdr::Bool = false, #header for outfile
)::DataFrame
df = DataFrames.DataFrame(CSV.File(actual))
# location, f, start_time, end_time
# split(:file, "/")[2] is the clip name; its dash-separated fields carry the metadata
@transform!(df, @byrow :location = split(split(:file, "/")[2], "-")[1])
@transform!(df, @byrow :f = split(split(:file, "/")[2], "-")[5] * ".WAV")
@transform!(df, @byrow :start_time = split(split(:file, "/")[2], "-")[end-1])
@transform!(
df,
@byrow :end_time = chop(split(split(:file, "/")[2], "-")[end], tail = 4)
)
# retired :box column, kept for reference with the old db schema:
#@transform!( df, @byrow :box = "[$(split(split(:file, "/")[2], "-")[end-1]), $(chop(split(split(:file, "/")[2], "-")[end], tail=4))]")
# male, female, duet, not
# split(:file, "/")[1] is the label folder the clip was sorted into
@transform!(df, @byrow @passmissing :male = split(:file, "/")[1] == "M" ? true : false)
@transform!(
df,
@byrow @passmissing :female = split(:file, "/")[1] == "F" ? true : false
)
@transform!(df, @byrow @passmissing :duet = split(:file, "/")[1] == "D" ? true : false)
@transform!(
df,
@byrow @passmissing :not_kiwi =
split(:file, "/")[1] in ["KA", "KE", "N", "Q"] ? true : false
)
# other_label
# only the KA, KE and Q folders are carried over as a label; everything else is missing
@transform!(
df,
@byrow @passmissing :other_label =
split(:file, "/")[1] in ["KA", "KE", "Q"] ? split(:file, "/")[1] : missing
)
# remove unwanted cols, rename f to file
# the original png path is dropped and the derived WAV name takes over the :file column
select!(df, Not([:file]))
rename!(df, :f => :file)
CSV.write(outfile, df; header = hdr)
return df
end
"""
audiodata_db(df::DataFrame, table::String)
Use to upload labels to AudioData.duckdb
Takes a dataframe and inserts into AudioData.db table.
audiodata_db(df, "pomona_labels_20230418")
using DataFrames, DBInterface, DuckDB, Random
"""
function audiodata_db(df::DataFrame, table::String)
    # Append every row of `df` to `table` in AudioData.duckdb.
    #
    # The database path depends on the host: the Linux mount point, otherwise the
    # /Volumes mount point. `df` is registered under a random 6-character name and
    # copied with INSERT ... SELECT *, so its columns must line up with `table`.
    # Returns nothing.
    #
    # NOTE: `table` is interpolated straight into the SQL text; only pass trusted
    # table names.
    db_path = if Sys.islinux()
        "/media/david/SSD1/AudioData.duckdb"
    else
        "/Volumes/SSD1/AudioData.duckdb"
    end
    con = DBInterface.connect(DuckDB.DB, db_path)
    try
        temp_name = randstring(6)
        DuckDB.register_data_frame(con, df, temp_name)
        DBInterface.execute(
            con,
            """
            INSERT
            INTO $table
            SELECT *
            FROM '$temp_name'
            """,
        )
    finally
        # Always release the connection, even when registration or the INSERT fails,
        # so a failed upload does not leave the database file open.
        DBInterface.close!(con)
    end
    return nothing
end
# Serialise `payload` as JSON into the file `f`.
# When `f` already exists it is first copied to "<f>.backup"; that copy is not forced,
# so `cp` throws if the backup file is already there and `f` is then left untouched.
function avianz_file_of_dict(f, payload)
    if isfile(f)
        cp(f, "$f.backup")
    end
    return open(io -> JSON3.write(io, payload), f, "w")
end
# Raven selections.txt to AviaNZ .data
# ASSUMES 60 second files unless `duration` is given, also wav and txt file in same folder
# using CSV, DataFrames, JSON3
# a=Glob.glob("*/*/*.Table.1.selections.txt")
# map(x -> avianz_of_raven(x), a)
"""
    avianz_of_raven(f::String; duration::Real = 60.0)

Convert the Raven selection table `f` ("<name>.Table.1.selections.txt") into an
AviaNZ sidecar file written next to it.

Every selection row becomes one AviaNZ segment
`[begin, end, low_freq, high_freq, [species dict]]`, with the low frequency rounded
down and the high frequency rounded up to whole Hz. The metadata entry records
operator/reviewer "D" and `duration` seconds.

The sidecar is named "<name>.WAV.data" when "<name>.WAV" exists, otherwise
"<name>.wav.data". Writing goes through `avianz_file_of_dict`, so an existing
sidecar is backed up first. The Raven file itself is left in place.
"""
function avianz_of_raven(f::String; duration::Real = 60.0)
    selections = DataFrames.DataFrame(CSV.File(f))
    data = Any[Dict([("Operator", "D"), ("Reviewer", "D"), ("Duration", duration)])]
    for row in eachrow(selections)
        segment = Any[
            row."Begin Time (s)",
            row."End Time (s)",
            floor(Int, row."Low Freq (Hz)"),
            ceil(Int, row."High Freq (Hz)"),
            [Dict([("filter", "M"), ("species", row.Species), ("certainty", 100)])],
        ]
        push!(data, segment)
    end
    # Stripping the suffix leaves the trailing "." in place ("rec.Table.1..." -> "rec."),
    # so the extension is appended WITHOUT another dot. Prefixing ".WAV" tested
    # "rec..WAV", which never exists, and upper-case recordings ended up with a
    # ".wav.data" sidecar.
    basef = replace(f, "Table.1.selections.txt" => "")
    avianzf = isfile(basef * "WAV") ? basef * "WAV.data" : basef * "wav.data"
    return avianz_file_of_dict(avianzf, data)
end
# Read the whole of file `f` as text and return it parsed by JSON3.
dict_of_avianz_file(f) = JSON3.read(read(f, String))
# Flatten parsed AviaNZ data into a table with one row per (segment, species) pair.
# data[1] holds the metadata (its "Duration" is repeated on every row); each later
# entry is a segment [start, end, low_f, high_f, [species entries...]].
function df_of_avianz_dict(data)
    table = DataFrames.DataFrame(;
        duration = Float64[],
        start_time = Float64[],
        end_time = Float64[],
        low_f = Float64[],
        high_f = Float64[],
        Species = String[],
    )
    total = data[1]["Duration"]
    for segment in data[2:end]
        t_start, t_end, f_low, f_high = segment[1:4]
        # Collect all species names first, then emit one row per name.
        species_names = map(entry -> entry[:species], segment[5])
        for name in species_names
            push!(
                table,
                (
                    duration = total,
                    start_time = t_start,
                    end_time = t_end,
                    low_f = f_low,
                    high_f = f_high,
                    Species = name,
                ),
            )
        end
    end
    return table
end
# Re-shape a label table (start_time, end_time, low_f, high_f, Species columns) into
# the column layout of a Raven selection table: rows are numbered from 1 and every row
# gets view "Spectrogram 1" and channel 1.
function prepare_df_for_raven(data)
    n = length(data.start_time)
    return DataFrames.DataFrame(
        "Selection" => collect(1:n),
        "View" => fill("Spectrogram 1", n),
        "Channel" => fill(1, n),
        "Begin Time (s)" => data.start_time,
        "End Time (s)" => data.end_time,
        "Low Freq (Hz)" => data.low_f,
        "High Freq (Hz)" => data.high_f,
        "Species" => data.Species,
    )
end
# using CSV, DataFrames, JSON3
# AviaNZ .data file -> Raven selection table written alongside it
# ("x.WAV.data" / "x.wav.data" -> "x.Table.1.selections.txt", tab separated).
# A .data file holding nothing but its metadata entry is skipped.
function raven_of_avianz(file::String)
    data = dict_of_avianz_file(file)
    length(data) >= 2 || return nothing #ignores empty .data files
    selections = prepare_df_for_raven(df_of_avianz_dict(data))
    suffix = ".Table.1.selections.txt"
    outfile = replace(file, ".WAV.data" => suffix, ".wav.data" => suffix)
    return CSV.write(outfile, selections, delim = '\t')
end
# Check then mutate all AviaNZ .data labels (Species column)
# using JSON3
# a=Glob.glob("*.WAV.data")
# check_change_avianz_species!(a, "something", "something else")
#
# For every file in `list`, each species entry equal to `wrong` is replaced by `right`.
# Only files in which something changed are rewritten (through avianz_file_of_dict,
# which backs the file up first); a failure to write is reported as a warning and the
# remaining files are still processed.
function check_change_avianz_species!(list::Vector{String}, wrong::String, right::String)
    for file in list
        parsed = dict_of_avianz_file(file)
        length(parsed) >= 2 || continue #ignores empty .data files

        # Edit a copy of the parsed data.
        # NOTE(review): relies on copy() giving mutable nested containers - confirm.
        payload = copy(parsed)
        changed = false
        for segment in payload[2:end], entry in segment[5]
            if entry[:species] == wrong
                entry[:species] = right
                changed = true
            end
        end
        changed || continue

        try
            avianz_file_of_dict(file, payload)
        catch
            @warn "Could not write file or backup: $file"
        end
    end
end
# if specified folder must include trailing /, can be "" for current folder
"""
    label_summary(folder::String, avianz::Bool = true)

Collect every label found in `folder` into one table, save it as a tab-separated
csv in that folder and return it.

- `avianz == true`: reads all `*.wav.data` files (any letter case of "wav") and
  writes `label_summary_avianz-<today>.csv` with the columns
  File, duration, start_time, end_time, low_f, high_f, Species.
- `avianz == false`: reads all `*.Table.1.selections.txt` Raven files and writes
  `label_summary_raven-<today>.csv` with the columns
  File, start_time, end_time, low_f, high_f, Species.

`File` is the file name without its folder and without the ".data" /
".Table.1.selections.txt" suffix.
"""
function label_summary(folder::String, avianz::Bool = true)
    if avianz
        # Plain character classes: the quotes and commas previously inside the
        # brackets were themselves treated as characters to match.
        files = Glob.glob("$(folder)*.[Ww][Aa][Vv].data")
        df = DataFrames.DataFrame(
            duration = Float64[],
            start_time = Float64[],
            end_time = Float64[],
            low_f = Float64[],
            high_f = Float64[],
            Species = String[],
            File = String[],
        )
        for file in files
            df1 = df_of_avianz_dict(dict_of_avianz_file(file))
            f = replace(split(file, "/")[end], ".data" => "")
            df1.File = fill(f, length(df1.start_time))
            # Grow the summary in place instead of re-copying it for every file.
            append!(df, df1)
        end
        select!(df, [:File, :duration, :start_time, :end_time, :low_f, :high_f, :Species])
        CSV.write("$(folder)label_summary_avianz-$(Dates.today()).csv", df, delim = '\t')
        return df
    else
        # `folder` already carries its trailing "/" (or is ""), exactly as in the
        # AviaNZ branch; inserting another "/" produced "dir//*..." and, for the
        # current folder, a pattern starting with "/".
        files = Glob.glob("$(folder)*.Table.1.selections.txt")
        df = DataFrames.DataFrame(
            File = String[],
            start_time = Float64[],
            end_time = Float64[],
            low_f = Float64[],
            high_f = Float64[],
            Species = String[],
        )
        for file in files
            df1 = DataFrames.DataFrame(CSV.File(file))
            # The name is the same for every row of this file, so derive it once.
            f = replace(split(file, "/")[end], ".Table.1.selections.txt" => "")
            for d in eachrow(df1)
                push!(
                    df,
                    (
                        File = f,
                        start_time = d."Begin Time (s)",
                        end_time = d."End Time (s)",
                        low_f = d."Low Freq (Hz)",
                        high_f = d."High Freq (Hz)",
                        Species = d."Species",
                    ),
                )
            end
        end
        CSV.write("$(folder)label_summary_raven-$(Dates.today()).csv", df, delim = '\t')
        return df
    end
end
# List every "*.data" file one folder level below the current directory.
# Uses the module-qualified Glob.glob like the rest of this file, so it also works
# when Glob is brought in with `import Glob`.
function folder_summary()
    return Glob.glob("*/*.data")
end
#allow 0.2 overlap each side
#snap end_time to the 5 second grid: an end time at most 0.2 past a multiple of 5 is
#rounded down to that multiple, anything further is rounded up to the next multiple
function et(end_time::Float64)
    clips = end_time / 5
    snapped = end_time % 5 > 0.2 ? ceil(clips) : floor(clips)
    return snapped * 5
end
#allow 0.2 overlap each side
#snap start_time to the 5 second grid: rounded down to the previous multiple of 5,
#unless it lies within the last 0.2 before the next multiple, then rounded up to that
function st(start_time::Float64)
    clips = start_time / 5
    snapped = start_time % 5 < 4.8 ? floor(clips) : ceil(clips)
    return snapped * 5
end
#labels must be a df loaded from label_summary run over avianz data (not raven)
"""
    one_hot_labels(labels::DataFrame)

Turn a label summary into one row per 5 second clip of every file, with one Bool
column per species that is `true` for each clip covered by a label of that species.

`labels` needs the columns `File`, `duration`, `start_time`, `end_time` and
`Species`. Label boundaries are snapped to the 5 second grid with `st`/`et`, and an
end time snapped past the file duration is pulled back by one clip.

The result has the columns `file`, `start_time`, `end_time` plus the species
columns; a species that never occurs in a file is `false` for all of its clips.
An empty `labels` table gives an empty DataFrame.
"""
function one_hot_labels(labels::DataFrame)
    vdf = DataFrame[]
    for group in groupby(labels, :File)
        dur = first(group.duration)
        nrows = dur ÷ 5
        seil = nrows * 5
        df = DataFrame(
            file = [first(group.File) for x = 1:nrows],
            start_time = collect(0:5:seil-1),
            end_time = collect(5:5:seil),
        )
        for row in eachrow(group)
            fst = st(row.start_time)
            @assert fst >= 0
            lst0 = et(row.end_time)
            #end time must not be greater than duration
            lst = lst0 > dur ? lst0 - 5 : lst0
            @assert lst <= dur
            f_idx = Int(fst ÷ 5 + 1)
            l_idx = Int(lst ÷ 5)
            vect = fill(false, Int(nrows))
            vect[f_idx:l_idx] .= true
            col_name = row.Species
            if col_name in names(df)
                # The species was already labelled earlier in this file: merge the
                # clips instead of replacing the column, which dropped those labels.
                df[!, Symbol(col_name)] = df[!, Symbol(col_name)] .| vect
            else
                df[!, Symbol(col_name)] = vect
            end
        end
        push!(vdf, df)
    end
    # Nothing to join when there were no labels at all.
    isempty(vdf) && return DataFrame()
    # Join on the shared columns; species missing from a file come out as `missing`
    # and are turned into `false` below.
    cdf1 = reduce(
        (x, y) ->
            outerjoin(x, y, matchmissing = :equal, on = intersect(names(x), names(y))),
        vdf,
    )
    return coalesce.(cdf1, false)
end
#using DataFrames, CSV, WAV
"""
    avianz_of_scores(scores::String)

Write an AviaNZ ".data" sidecar for every audio file listed in the csv `scores`
(e.g. scores.csv).

`scores` needs the columns `file`, `start_time`, `end_time` and `prediction`. Rows
are grouped per file; every row whose prediction is not "nothing" becomes a segment
spanning 0-8000 Hz with that prediction as species. The audio file is read to
obtain its duration for the metadata entry. Files without any such row get no
sidecar. The sidecar is "<file>.data", written through `avianz_file_of_dict`.
"""
function avianz_of_scores(scores::String) #scores.csv
    df = CSV.read(scores, DataFrame)
    sort!(df, [:file, :start_time, :end_time])
    for group in groupby(df, :file)
        filename = first(group.file)
        audio_data, sample_rate, _, _ = WAV.wavread(filename)
        # Count rows (samples per channel), not all elements: length() also counts
        # the channels and inflated the duration of multi-channel recordings.
        duration = Float64(size(audio_data, 1) / sample_rate)
        data = Any[Dict([("Operator", "D"), ("Reviewer", "D"), ("Duration", duration)])]
        for row in eachrow(group)
            row.prediction != "nothing" || continue
            label = [
                row.start_time,
                row.end_time,
                0,
                8000,
                [
                    Dict([
                        ("filter", "M"),
                        ("species", row.prediction),
                        ("certainty", 100),
                    ]),
                ],
            ]
            push!(data, label)
        end
        avianzf = filename * ".data"
        # Only the metadata entry present means there is nothing to label.
        length(data) > 1 && avianz_file_of_dict(avianzf, data)
    end
end