# FileMetaData.jl
#DelimitedFiles, DuckDB, JSON3
#=
# needs SSD1 present for dawn_dusk.csv (not anymore, gets from dddict)
used like:
using Glob, Skraak, CSV, DataFrames
folders=Glob.glob("*/2025-05-18/")
for folder in folders
cd(folder)
try
df = file_metadata_to_df()
CSV.write("/media/david/Pomona-4/Pomona/pomona_files_20250522_new.csv", df; append=true)
#CSV.write("/media/david/Pomona-4/Pomona/pomona_files_20241126_new.csv", df; append=true)
catch x
@warn "$x error with $folder"
end
cd("/media/david/Pomona-4/Pomona/")
end
Then using duckdb cli from SSD:
duckdb AudioData.duckdb
show tables;
SELECT * FROM pomona_files;
COPY pomona_files FROM '/media/david/Pomona-3/Pomona-3/pomona_files_20241018.csv';
SELECT * FROM pomona_files;
Then backup with:
EXPORT DATABASE 'AudioDataBackup_2023-07-29';
.quit
Then quit and backup using cp on the db file
Then rsync ssd to usb
rsync -avzr --delete /media/david/SSD1/ /media/david/USB/
To restore from backup:
duckdb my_database.duckdb
IMPORT DATABASE 'AudioDataBackup_2024-07-10';
=#
"""
file_metadata_to_df()
This function takes a file name, extracts wav metadata, gpx location, recording period start/end and returnes a dataframe.
This function needs raw audiomoth wav files and a gpx.
This function needs /media/david/SSD1/dawn_dusk.csv
"""
function file_metadata_to_df()
# Initialise dataframe with columns: disk, location, trip_date, file, lattitude, longitude, start_recording_period_localt, finish_recording_period_localt, duration, sample_rate, zdt, ldt, moth_id, gain, battery, temperature
df = DataFrames.DataFrame(
disk = String[],
location = String[],
trip_date = String[],
file = String[],
latitude = Float64[],
longitude = Float64[],
start_recording_period_localt = String[],
finish_recording_period_localt = String[],
duration = Float64[],
sample_rate = Int[],
utc = String[],
ldt = String[],
moth_id = String[],
gain = String[],
battery = Float64[],
temperature = Float64[],
sha2_256 = String[],
night = Bool[],
xxh64 = String[],
)
#Get WAV list for folder
wav_list = Glob.glob("*.WAV") |> sort
#Return empty df if nothing in the folder
if length(wav_list) == 0
return df
end
#Get path info from file system
raw_path_vec = split(pwd(), "/")[end-2:end]
disk = raw_path_vec[1]
location = raw_path_vec[2]
trip_date = raw_path_vec[3]
#Get location, assumes 1 gpx is in the follder
waypoint = Glob.glob("*.gpx")
length(waypoint) != 1 && @error "no gpx file in $trip_date $location"
loc = read(waypoint[1], String) |> xml_dict
latitude = parse(Float64, (loc["gpx"]["wpt"][:lat]))
longitude = parse(Float64, (loc["gpx"]["wpt"][:lon]))
#Start of recording period
_, _, _, binary_metadata_start = WAV.wavread(wav_list[1])
c_v_s = split(wav_info_read(binary_metadata_start)[:ICMT], " ")
comment_vector_start = length(c_v_s) < 22 ? c_v_s : c_v_s[1:19]
date_start = split(comment_vector_start[4], "/")
time_start = split(comment_vector_start[3], ":")
tz_start = chop(comment_vector_start[5], head = 4, tail = 1)
time_zone_start = isempty(tz_start) ? "+00" : tz_start
#zdt1 = ZonedDateTime(parse(Int, date_start[3]), parse(Int, date_start[2]), parse(Int, date_start[1]), parse(Int, time_start[1]), parse(Int, time_start[2]), parse(Int, time_start[3]), tz"UTC")
time_string_start =
date_start[3] *
"-" *
date_start[2] *
"-" *
date_start[1] *
"T" *
time_start[1] *
":" *
time_start[2] *
":" *
time_start[3] *
"." *
"000" *
time_zone_start
zdt1 = ZonedDateTime(time_string_start)
start_recording_period_localt =
Dates.format(astimezone(zdt1, tz"Pacific/Auckland"), "yyyy-mm-dd HH:MM:SSzzzz")
#End of recording period
_, _, _, binary_metadata_end = WAV.wavread(wav_list[end])
c_v_e = split(wav_info_read(binary_metadata_end)[:ICMT], " ")
comment_vector_end = length(c_v_e) < 22 ? c_v_e : c_v_e[1:19]
date_end = split(comment_vector_end[4], "/")
time_end = split(comment_vector_end[3], ":")
tz_end = chop(comment_vector_start[5], head = 4, tail = 1)
time_zone_end = isempty(tz_end) ? "+00" : tz_end
#zdt2 = ZonedDateTime(parse(Int, date_end[3]), parse(Int, date_end[2]), parse(Int, date_end[1]),parse(Int, time_end[1]), parse(Int, time_end[2]), parse(Int, time_end[3]), tz"UTC")
time_string_end =
date_end[3] *
"-" *
date_end[2] *
"-" *
date_end[1] *
"T" *
time_end[1] *
":" *
time_end[2] *
":" *
time_end[3] *
"." *
"000" *
time_zone_end
zdt2 = ZonedDateTime(time_string_end)
finish_recording_period_localt =
Dates.format(astimezone(zdt2, tz"Pacific/Auckland"), "yyyy-mm-dd HH:MM:SSzzzz")
#dict = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv")
dict = dddict
#So I know what it is doing
println(raw_path_vec)
#Loop over file list
for file in wav_list
#print(file)
try
audio_data, sample_rate, _, binary_metadata = WAV.wavread(file)
c_v = split(wav_info_read(binary_metadata)[:ICMT], " ")
comment_vector = length(c_v) < 22 ? c_v : c_v[1:19]
duration = Float64(length(audio_data) / sample_rate)
date = split(comment_vector[4], "/")
time = split(comment_vector[3], ":")
tz = chop(comment_vector[5], head = 4, tail = 1)
time_zone = isempty(tz) ? "+00" : tz
#preformatting_zdt = ZonedDateTime(parse(Int, date[3]), parse(Int, date[2]), parse(Int, date[1]), parse(Int, time[1]), parse(Int, time[2]), parse(Int, time[3]), tz"UTC")
time_string =
date[3] *
"-" *
date[2] *
"-" *
date[1] *
"T" *
time[1] *
":" *
time[2] *
":" *
time[3] *
"." *
"000" *
time_zone
preformatting_zdt = ZonedDateTime(time_string)
#zdt = Dates.format(preformatting_zdt, "yyyy-mm-dd HH:MM:SSzzzz")
preformatting_utc = astimezone(preformatting_zdt, tz"UTC")
utc = Dates.format(preformatting_utc, "yyyy-mm-dd HH:MM:SSzzzz")
preformatting_ldt = astimezone(preformatting_zdt, tz"Pacific/Auckland")
ldt = Dates.format(preformatting_ldt, "yyyy-mm-dd HH:MM:SSzzzz")
moth_id = comment_vector[8]
gain = comment_vector[10]
#index back from end because if V > 4.9 the wording chaaanges
battery = parse(Float64, chop(comment_vector[end-4], tail = 1))
temperature = parse(Float64, chop(comment_vector[end], tail = 2))
sha2_256 = bytes2hex(sha256(file))
#assumes 15 minute file and calculates on half way time
nt = night(DateTime(preformatting_ldt + Minute(7) + Second(30)), dict)
xxh64 = readchomp(`xxh-hash $file`) #run(`xxh-hash $file`)
#Populate row to push into df
row = [
disk,
location,
trip_date,
file,
latitude,
longitude,
start_recording_period_localt,
finish_recording_period_localt,
duration,
Int(sample_rate),
utc,
ldt,
moth_id,
gain,
battery,
temperature,
sha2_256,
nt,
xxh64,
]
push!(df, row)
print(".")
catch
@warn "error with $folder $file"
end
end
return df
end