#unfinished
using DataFrames, CSV, Glob, DataFramesMeta
"""
    make_skraak_dataset(input_file, output_path)

Build the dataset in two stages: `move_files` is run on the label csv `input_file`,
then the DataFrame it returns is handed to `save_pngs`. Both stages receive the same
`output_path`. Returns whatever `save_pngs` returns.
"""
function make_skraak_dataset(
    input_file::String,
    output_path::String = "/media/david/SSD2/PrimaryDataset/kiwi_set/",
)
    @info "This could take some time.\nFirst we move audio files.\nThen we make the spectrogram images."
    labels = move_files(input_file, output_path)
    return save_pngs(labels, output_path)
end
"""
Only moves WAVs not already there in dataset
converts WAVs to flac to save space, file metadata will not survive
requires columns :location, :file, :start_time, :end_time
:file is the file name, :location is the actual recorder location eg "C05"
run where the raw data is
will find file in folder structure location/trip_date/file
constructs dataset at output_path
assumes file name has one . for extension only
"""
function move_files(input_file::String, output_path::String)
df = DataFrames.DataFrame(CSV.File(input_file))
@assert nrow(df) > 0 "Empty csv therefore dataframe"
if "box" in names(df)
@transform!(df, @byrow :start_time = first(eval(Meta.parse(:box))))
@transform!(df, @byrow :end_time = last(eval(Meta.parse(:box))))
end
for col_name in ["location", "file", "start_time", "end_time"]
@assert col_name in names(df) "Column $col_name not present in csv"
end
select!(df, :location, :file, :start_time, :end_time)
@transform!(df, @byrow :key = :location * "-" * :file)
@info "Moving files..."
k = levels(df.key) #Vector{String}:
for item in k
fldr = split(item, ".")[end-1]
outf = replace(item, ".wav" => ".flac", ".WAV" => ".flac")
if !isfile("$output_path$(fldr)/$outf")
println(item)
l, f = split(item, "-")
b = Glob.glob("$l/*/$f")
@assert length(b) == 1
mkpath("$fldr")
signal, freq = load_audio_file(first(b))
save("$output_path$(fldr)/$outf", signal, freq)
end
end
return df
end
"""
    save_pngs(df::DataFrame, output_path::String)

Write 5 second spectrogram images for every file in `df`, sorted into a `K` folder
(the window overlaps at least one labelled second) or an `N` folder (it does not).

`df` needs the columns `:key`, `:start_time` and `:end_time`; it is reduced in place
to those three columns. For each key the audio is read from
`<output_path><folder>/<file>` and the images are written to
`<output_path><folder>/K|N/<folder>-<first>-<last>.png`, where `<file>` is the key with
`.wav`/`.WAV` replaced by `.flac` and `<folder>` is `<file>` without its extension.
`output_path` is concatenated as-is, so it must end with a path separator.

Windows cover the whole seconds `start:start+4`, beginning at second 1. After a `K`
window the start advances by 2 seconds, after an `N` window by 5. One more window,
`duration-4:duration`, covers the end of the recording unless the loop has already
produced exactly that window. Recordings shorter than 5 seconds cannot hold a window
and are skipped with a warning. Returns `nothing`.
"""
function save_pngs(df::DataFrame, output_path::String)
    @info "$(length(levels(df.key))) files"
    @info "$(length(df.key)) labels"
    select!(df, :key, :start_time, :end_time)
    @info "Making spectrogram images..."
    for f in DataFrames.groupby(df, :key) # f holds every label row of one file
        file = replace(first(f.key), ".wav" => ".flac", ".WAV" => ".flac")
        folder = first(splitext(file))
        signal, freq = load_audio_file("$output_path$folder/$file")
        length_signal = length(signal)
        duration = length_signal ÷ freq # whole seconds only
        mkpath("$output_path$folder/K")
        mkpath("$output_path$folder/N")
        kiwi = [[s, e] for (s, e) in zip(f.start_time, f.end_time)]
        @info (folder, duration, kiwi)
        if duration < 5
            @warn "Skipping $folder: $duration s is shorter than one 5 s window"
            continue
        end

        # is_kiwi[s] is true when second s lies inside any labelled clip. Clips are
        # clamped to 1:duration; floor/ceil leave integer times unchanged and widen
        # fractional ones to whole seconds.
        # NOTE(review): assumes label times are in seconds from the start of the file.
        is_kiwi = falses(duration)
        for (s, e) in zip(f.start_time, f.end_time)
            lo = max(floor(Int, s), 1)
            hi = min(ceil(Int, e), duration)
            is_kiwi[lo:hi] .= true # empty range when the clip lies outside the file
        end

        start = 1
        last_start = 0
        while start + 4 <= duration
            has_kiwi = any(@view is_kiwi[start:start+4])
            _save_window_png(
                signal, freq, length_signal, start, start + 4, has_kiwi, output_path, folder,
            )
            last_start = start
            # Overlap windows around calls, step past stretches without any.
            start += has_kiwi ? 2 : 5
        end

        # Final window flush with the end of the recording; skipped when the loop's
        # last window was this very one, since it would write the same file again.
        tail_start = duration - 4
        if tail_start != last_start
            has_kiwi = any(@view is_kiwi[tail_start:duration])
            _save_window_png(
                signal, freq, length_signal, tail_start, duration, has_kiwi, output_path, folder,
            )
        end
    end
    return nothing
end

# Render seconds `first_sec:last_sec` of `signal` and save the image under `K` or `N`.
function _save_window_png(
    signal, freq, length_signal, first_sec, last_sec, has_kiwi, output_path, folder,
)
    st, en = calculate_clip(first_sec, last_sec, freq, length_signal)
    img = get_image_from_sample(signal[Int(st):Int(en)], freq)
    label = has_kiwi ? "K" : "N"
    PNGFiles.save("$output_path$folder/$label/$folder-$first_sec-$last_sec.png", img)
    return nothing
end
"""
    calculate_clip(st::Int, en::Int, freq::Integer, len::Int)

Convert the 1-based, inclusive range of seconds `st:en` into sample indices for a
signal of `len` samples recorded at `freq` samples per second.

Returns `(s, e)`: `s = (st - 1) * freq + 1` is the first sample of second `st`, and
`e = en * freq` is the last sample of second `en`, capped at `len`. `freq` may be any
integer type (it was previously restricted to `Int32`).
"""
function calculate_clip(st::Int, en::Int, freq::Integer, len::Int)
    rate = Int(freq)
    first_sample = (st - 1) * rate + 1
    last_sample = min(en * rate, len)
    return first_sample, last_sample
end