# Clips.jl

#using DataFramesMeta : @transform!, @subset!, @byrow, @passmissing

#= something here causes a problem with Dates when enclosed by """
using Glob, Skraak
predictions = Glob.glob("*/2024-10-18/preds*")
for file in predictions
try
make_clips(file)
catch x
println(x)
end
end

if needed to change headers in preds csv
shift, control, f in subl
file,start_time,end_time,label
/media/david/Pomona-2,<project filters>, preds-2023-02-27.csv
file,start_time,end_time,absent,present

using Glob, CSV, DataFrames, DataFramesMeta, Dates, DSP, Plots, Random, WAV

# Assumes run on linux
# Assumes function run from Pomona-1 or Pomona-2
#dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}} = construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv",),
=#
"""
    make_clips(preds_path::String, label::Int = 1, night::Bool = true, dawn_dusk_dict = dddict)

This function takes a preds.csv file and generates
file names, wav's, spectrograms etc to be reviewed.
It calls night() through night_or_day!(). The dawn/dusk dict defaults to the global dddict;
a dict built with construct_dawn_dusk_dict("/media/david/SSD1/dawn_dusk.csv") can be passed in instead.

It should be run from Pomona-1/, Pomona-2/ or Pomona-3/; it assumes it is, and it uses the path.
It saves wav and png files to a Clips_<date> folder in the current working directory, ie Pomona-3.
Callers need to use a try/catch because the 2 assert functions throw an error to short circuit the function.
"""
function make_clips(
    preds_path::String,
    label::Int = 1,
    night::Bool = true,
    dawn_dusk_dict = dddict,
)
    # Assumes function run from Pomona-1 or Pomona-2, so that preds_path reads
    # location/trip_date/<preds file>
    location, trip_date, _ = split(preds_path, "/")
    ##location, h, trip_date, _ = split(preds_path, "/")

    # Load and group data frame by file
    gdf =
        #! format: off
        DataFrames.DataFrame(CSV.File(preds_path)) |>
        x -> assert_not_empty(x, preds_path) |>
        x -> rename_column!(x, "1.0", "label") |> #can remove now, for old opensounscape model needs to be label
        x -> rename_column!(x, "Kiwi", "label") |> #for new model Kiwi needs to read as label
        x -> assert_detections_present(x, label, location, trip_date) |>
        x -> filter_positives!(x, label) |>
        insert_datetime_column! |>
        x -> night_or_day!(x, dawn_dusk_dict, night) |> #true=night, false=day
        group_by_file!
        #! format: on

    # Make clip and spectrogram, one wav file (group) at a time
    for (_, v) in pairs(gdf)
        file_name, extension = path_to_file_string(v.file[1])
        start_times = v[!, :start_time] |> sort

        detections = cluster_detections(start_times)
        isempty(detections) && continue

        signal, freq = WAV.wavread("$location/$trip_date/$(file_name).$(extension)")
        ##signal, freq = WAV.wavread("$location/$h/$trip_date/$(file_name).$(extension)")
        if size(signal, 2) == 2
            # stereo: average the two channels down to mono
            signal = (signal[:, 1] + signal[:, 2]) / 2
        end
        if freq > 8000
            signal, freq = resample_to_8000hz(signal, freq)
        end
        freq = freq |> Float32
        length_signal = length(signal)

        # detections is not empty here, so the output folder is always needed;
        # create it once per wav file rather than once per clip
        out_dir = "Clips_$(today())"
        mkpath(out_dir)

        for detection in detections
            st, en = calculate_clip_start_end(detection, freq, length_signal)
            name = "$location-$trip_date-$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"
            ##name = "$location-$h-$trip_date-$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"
            outfile = "$out_dir/$name"

            # st and en are Float64 sample positions: round instead of Int(), which
            # throws InexactError when seconds * freq is not a whole number, and
            # never index below the first sample
            first_sample = max(1, round(Int, st))
            last_sample = round(Int, en)
            sample = signal[first_sample:last_sample]
            WAV.wavwrite(sample, "$outfile.wav", Fs = Int(freq))

            image = get_image_from_sample(sample, freq)
            PNGFiles.save("$outfile.png", image)
        end
        print(".")
    end
    println("\ndone $location/$trip_date \n")
end

#=
using Opus, PNGFiles, Skraak
function image_from_opus(file)
    signal, freq = Opus.load(file)
    if size(signal, 2) == 2
        signal = (signal[:, 1] + signal[:, 2]) / 2
    end
    if freq > 8000
        signal, freq = Skraak.resample_to_8000hz(signal, freq)
    end
    freq = freq |> Float32
    sample = signal[:]
    image = Skraak.get_image_from_sample(sample, freq)
    PNGFiles.save("$(file).png", image)
    print(".")
end
=#

#######################################################################

"""
    assert_not_empty(df::DataFrame, preds_path::String)::DataFrame

Return `df` unchanged when it has at least one row and one column, otherwise throw
an `ErrorException` that names `preds_path`.

A csv that holds only a header row (columns but no rows) also counts as empty.
"""
function assert_not_empty(df::DataFrame, preds_path::String)::DataFrame
    # `@error` only logs and returns `nothing`; the function used to fail through the
    # implicit convert(DataFrame, nothing). Throw explicitly so the message reaches
    # the caller's catch block.
    isempty(df) && error("Empty dataframe at $preds_path")
    return df
end

"""
    rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame

Rename column `old_name` to `new_name` in place when `df` has a column called
`old_name`; otherwise leave `df` untouched. `df` is returned either way.
"""
function rename_column!(df::DataFrame, old_name::String, new_name::String)::DataFrame
    if old_name in names(df)
        rename!(df, old_name => new_name)
    end
    return df
end

# assumes kiwi, binary classifier from opensoundscape
# location and trip_date carry no ::String annotation: the annotation had to be
# removed to make it work (make_clips passes in the pieces returned by split())
"""
    assert_detections_present(df::DataFrame, label::Int, location, trip_date)::DataFrame

Return `df` unchanged when `label` occurs in the `label` column of `df`, otherwise
throw an `ErrorException` naming `label`, `location` and `trip_date`.
"""
function assert_detections_present(
    df::DataFrame,
    label::Int,
    location,
    trip_date,
)::DataFrame
    # `@error` only logs and returns `nothing`; throw explicitly instead of relying
    # on the failing convert(DataFrame, nothing) to stop the caller
    label in levels(df.label) ||
        error("No detections for label = $label at $location/$trip_date")
    return df
end

##################################################################
"""
    filter_positives!(df::DataFrame, label)::DataFrame

Keep, in place, only the rows of `df` whose `label` column equals `label`, and
return `df`.

An integer `label` is used as the value to keep. Any other `label` (eg a column
header string) falls back to keeping rows equal to 1, which is what this function
did for every input before `label` was honoured.
"""
function filter_positives!(df::DataFrame, label)::DataFrame
    # the label argument used to be ignored and 1 was hard-coded
    wanted = label isa Integer ? label : 1
    filter!(row -> row.label == wanted, df)
    return df
end

"""
    path_to_file_string(path)

Split the last `/`-separated component of `path` into `(file_name, extension)`.

Only the last `.` separates the extension, so `"a/rec.v2.wav"` gives
`("rec.v2", "wav")`. A name without any `.` is returned as both elements.

Be careful: `path` must stay untyped, `path::String` won't work because csv columns
can arrive as `InlineStrings.String31`.
"""
function path_to_file_string(path)
    file = last(split(path, "/"))
    # limit = 2 splits on the last dot only, so dots inside the stem are kept
    parts = rsplit(file, "."; limit = 2)
    return first(parts), last(parts)
end

"""
    filename_to_datetime!(file)::DateTime

Parse the date and time encoded in the file name of `file`.

Names longer than 13 characters are read as `yyyymmdd_HHMMSS`. Shorter names are
read as `ddmmyy_HHMMSS`: "20" is inserted after the fourth character to make a four
digit year before parsing as `ddmmyyyy_HHMMSS`.
"""
function filename_to_datetime!(file)::DateTime
    stem = first(path_to_file_string(file))
    if length(stem) > 13
        return DateTime(stem, dateformat"yyyymmdd_HHMMSS")
    end
    # short form: expand the two digit year before parsing
    expanded = stem[1:4] * "20" * stem[5:end]
    return DateTime(expanded, dateformat"ddmmyyyy_HHMMSS")
end

"""
    insert_datetime_column!(df::DataFrame)::DataFrame

Add a `:DateTime` column to `df` in place and return `df`.

The column is computed row by row by passing each `:file` value, converted to
`String`, to `filename_to_datetime!`.
"""
function insert_datetime_column!(df::DataFrame)::DataFrame
    # String(:file) because filename_to_datetime! is handed a plain String here
    @transform!(df, @byrow :DateTime = filename_to_datetime!(String(:file)))
    return df
end

"""
    night_or_day!(df, dawn_dusk_dict, night_time = true)::DataFrame

Subset `df` in place on its `:DateTime` column and return it.

With `night_time = true` only rows for which `night()` is true are kept; with
`night_time = false` only the rows for which it is false. Calls `night()`, so
`dawn_dusk_dict` needs to be in local time format.
"""
function night_or_day!(
    df::DataFrame,
    dawn_dusk_dict::Dict{Dates.Date,Tuple{Dates.DateTime,Dates.DateTime}},
    night_time::Bool = true,
)::DataFrame
    if night_time
        @subset!(df, @byrow night(:DateTime, dawn_dusk_dict))
    else
        @subset!(df, @byrow !night(:DateTime, dawn_dusk_dict))
    end
    return df
end

"""
    group_by_file!(df::DataFrame)

Group `df` by its `:file` column and return the grouped data frame.
"""
function group_by_file!(df::DataFrame)
    return DataFrames.groupby(df, :file)
end

"""
    cluster_detections(start_times; max_gap = 15.0, min_detections = 2)

Group sorted detection start times into clusters.

A start time joins the current cluster when it is no more than `max_gap` after the
previous start time, otherwise it starts a new cluster. Clusters holding fewer than
`min_detections` start times are dropped. An empty `start_times` gives an empty
result.

# Arguments
- `start_times`: detection start times, expected in ascending order.
- `max_gap`: largest gap allowed inside one cluster, in the units of `start_times`.
- `min_detections`: smallest number of start times a cluster needs to be kept.
"""
function cluster_detections(
    start_times::Vector{Float64};
    max_gap::Real = 15.0,
    min_detections::Integer = 2,
)::Vector{Vector{Float64}}
    clusters = Vector{Float64}[]
    # start_times[1] used to throw a BoundsError on empty input
    isempty(start_times) && return clusters

    current = Float64[first(start_times)]
    for t in Iterators.drop(start_times, 1)
        if t - last(current) <= max_gap
            push!(current, t)
        else
            push!(clusters, current)
            current = Float64[t]
        end
    end
    push!(clusters, current)

    return filter(c -> length(c) >= min_detections, clusters)
end

"""
    calculate_clip_start_end(detection, freq, length_signal; clip_length = 5.0)

Return `(st, en)`, the first and last sample position of the clip covering
`detection`, as `Float64`s.

`st` is the first start time multiplied by `freq`, but never less than 1. `en` is
the last start time plus `clip_length`, multiplied by `freq`, but never more than
`length_signal`.

# Arguments
- `detection`: start times of one cluster of detections.
- `freq`: sample rate of the signal.
- `length_signal`: number of samples in the signal.
- `clip_length`: length added after the last start time; the default assumes the
  model is operating on 5 second clips.
"""
function calculate_clip_start_end(
    detection::Vector{Float64},
    freq::Float32,
    length_signal::Int64;
    clip_length::Real = 5.0,
)::Tuple{Float64,Float64}
    # max() also covers a start time that lands before the first sample, which
    # used to give an index of 0
    st = max(1.0, first(detection) * freq)
    en = min((last(detection) + clip_length) * freq, length_signal)
    return st, en
end

"""
    get_image_from_sample(sample, f)

Turn an audio `sample` into a 224 x 224 spectrogram image with `Float32` elements.

`f` is the sample rate; it is converted to `Int` for `DSP.spectrogram`, so it needs
to hold a whole number. Zero power values are replaced by the second smallest
distinct power value before converting to dB. The dB values are then shifted by the
absolute value of their minimum, divided by their maximum, flipped along the first
dimension, coloured with colour map "L4" and resized.
"""
function get_image_from_sample(sample, f) #sample::Vector{Float64}
    spec = DSP.spectrogram(sample, 400, 2; fs = convert(Int, f))
    power = spec.power
    if minimum(power) == 0.0
        # swap zeros for the second smallest distinct value before taking the dB
        distinct = sort(unique(vec(power)))
        replace!(power, 0.0 => distinct[2])
    end

    db = DSP.pow2db.(power)
    shifted = db .+ abs(minimum(db))
    scaled = shifted ./ maximum(shifted)
    flipped = reverse(scaled, dims = 1)
    coloured = PerceptualColourMaps.applycolourmap(flipped, cmap("L4"))
    #coloured = RGB.(coloured)
    resized = ImageTransformations.imresize(coloured, 224, 224)
    return Float32.(resized)
end

#=
dict = construct_dawn_dusk_dict("/Volumes/SSD1/dawn_dusk.csv")

df=DataFrames.DataFrame(dict)
df=CSV.File("dawn_dusk.csv") |> DataFrame
open("dict.jl", "w") do file
    write(file, "const dddict = Dict(")
    for row in eachrow(df)
        line="\tDates.Date(\"$(row.Date)\") =>
            (Dates.DateTime(\"$(row.Dawn)\"), Dates.DateTime(\"$(row.Dusk)\")),\n"
        write(file, line)
    end
    write(file, ")")
end

using CSV, DataFrames
=#
"""
    construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}

Takes dawn_dusk.csv and returns a dict to be consumed by night().
The csv is read with `sun = DataFrames.DataFrame(CSV.File(file))`.
~/dawn_dusk.csv
At present it goes from the start of 2019 to the end of 2026.
The csv contains local time sunrise and sunset.
I use this to decide if a file with a local time encoded name was recorded at night.
"""
function construct_dawn_dusk_dict(file::String)::Dict{Date,Tuple{DateTime,DateTime}}
    sun = DataFrames.DataFrame(CSV.File(file))
    # Pair each Date with its (Dawn, Dusk) tuple straight from the lazy zip; there
    # is no need to first build one Tuple holding every row of the csv.
    return Dict(zip(sun.Date, zip(sun.Dawn, sun.Dusk)))
end

"""
night(call_time::DateTime, dict::Dict{Date, Tuple{DateTime, DateTime}})::Bool

Returns true if time is at night, ie between civil twilights, dusk to dawn.
Consumes dict from dawn_dusk_dict.jl (or construct_dawn_dusk_dict)

time=DateTime("2021-11-02T21:14:35",dateformat"yyyy-mm-ddTHH:MM:SS")
Utility.night(time, dict)
"""
function night(call_time::DateTime, dict::Dict{Date,Tuple{DateTime,DateTime}})::Bool
    # one lookup gives both ends of the day; a date missing from dict throws KeyError
    dawn, dusk = dict[Date(call_time)]
    # night is everything up to and including dawn, and from dusk onwards
    return call_time <= dawn || call_time >= dusk
end

#######################################################################

# IN-BETWEEN STEP: use the secondary model to sort clips, move the clips into D, F, M, N, hand classify them, then generate actual.csv.

"""
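    move_clips_to_folders(df::DataFrame, label::String)

Move the png clips listed in `df`, and their wav's, from the current folder into
sub-folders chosen by each row's label.

df=CSV.read("preds.csv", DataFrame)
Takes a 2 column dataframe: file, label
file must be a list of png images, assumes wav's are there too
will move mp4's from video folder if they are present (that part is currently commented out)
`label` is the name of the top level folder, eg label = "Rowi"
"""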
function move_clips_to_folders(df::DataFrame, label::String)
    p = Glob.glob("*.png")
    w = Glob.glob("*.[W,w][A,a][V,v]")
    @assert (first(df.file) |> x -> split(x, ".")[end] |> x -> x == "png") "df.file must be a list of png's"
    @assert issetequal(df.file, p) "All png files in dataframe must be present in folder"
    @assert issetequal(chop.(df.file, head = 0, tail = 4), chop.(w, head = 0, tail = 4)) "There must be a wav for every png in the dataframe"

    # Row labels 1-3 map to a call type folder below `label`; 4 goes to "Don't Know".
    call_map = Dict(1 => "Duet", 2 => "Female - Solo", 3 => "Male - Solo")

    # Look wav's up by stem so that .WAV / .Wav files, which the check above accepts,
    # are moved as well; only a lower-case .wav used to be found.
    wav_by_stem = Dict(String(chop(f, tail = 4)) => f for f in w)

    for row in eachrow(df)
        src = row.file
        dst_dir = row.label == 4 ? "Don't Know" : "$label/$(call_map[row.label])"
        mkpath(dst_dir)
        dst = "$dst_dir/$(row.file)"
        #dst = "$(row.label)/$(row.file)"
        #mkpath("$(row.label)/")
        try
            mv(src, dst)
            wav = get(wav_by_stem, String(chop(src, tail = 4)), nothing)
            wav === nothing || mv(wav, "$dst_dir/$wav")
            #=if isdir(video)
                mkpath("video/$(row.label)/")
                mv(
                    "video/" * chop(src, tail = 3) * "mp4",
                    "video/" * chop(dst, tail = 3) * "mp4",
                )
            end=#
        catch e
            # best effort: report the failed move and carry on with the next row
            @info e
        end
    end
end

#=
using Glob, Skraak, CSV, DataFrames, Dates, PNGFiles

predictions = glob("*/*/preds-2024-10-21.csv")

For Kahurangi (1) Data
a=glob("Manu*/['m','X']*/*/preds_Kahurangi_1-2_ST_2024-10-29.csv")
b=glob("Manu*/['m','X']*/*/*/preds_Kahurangi_1-2_ST_2024-10-29.csv")
c=glob("Manu*/['m','X']*/*/*/*/preds_Kahurangi_1-2_ST_2024-10-29.csv")
predictions = [a ; b ; c]

a=glob("Manu*/['m','X']*/*/preds4_Kahurangi_1-2_2024-10-29.csv")
b=glob("Manu*/['m','X']*/*/*/preds4_Kahurangi_1-2_2024-10-29.csv")
c=glob("Manu*/['m','X']*/*/*/*/preds4_Kahurangi_1-2_2024-10-29.csv")
predictions = [a ; b ; c]

a=glob("Manu*/['m','X']*/*/preds3_Kahurangi_1-2_2024-10-29.csv")
b=glob("Manu*/['m','X']*/*/*/preds3_Kahurangi_1-2_2024-10-29.csv")
c=glob("Manu*/['m','X']*/*/*/*/preds3_Kahurangi_1-2_2024-10-29.csv")
predictions = [a ; b ; c]

For Kahurangi (2) Data
predictions=glob("Manu*/m*/preds_Kahurangi_1-2_ST_2024-10-29.csv")
predictions=glob("Manu*/m*/preds4_Kahurangi_1-2_2024-10-29.csv")
predictions=glob("Manu*/m*/preds3_Kahurangi_1-2_2024-10-29.csv")

For Cobb
predictions=glob("Friends*/preds_Kahurangi_1-2_ST_2024-10-29.csv")
predictions=glob("Friends*/preds4_Kahurangi_1-2_2024-10-29.csv")
predictions=glob("Friends*/preds3_Kahurangi_1-2_2024-10-29.csv")

Flora
a=glob("*/*/preds_Kahurangi_1-2_ST_2024-10-31.csv")
b=glob("*/*/*/preds_Kahurangi_1-2_ST_2024-10-31.csv")
c=glob("*/*/*/*/preds_Kahurangi_1-2_ST_2024-10-31.csv")
predictions=[a;b;c]

Flora
a=glob("../Misc-1/Friends of Flora (1)/Flora */*/preds3_Kahurangi_1-5_2025-02-14.csv")
b=glob("../Misc-1/Friends of Flora (1)/Flora */*/*/preds3_Kahurangi_1-5_2025-02-1*.csv")
c=glob("../Misc-1/Friends of Flora (1)/Flora */*/*/*/preds3_Kahurangi_1-5_2025-02-1*.csv")
predictions=[a;b;c]

# to delete empty preds.csv files, should be ok now with improved python script
for file in predictions
    size = stat(file).size
    if size < 10
        println("Deleting $file - $size")
        rm(file)
    end
end

for file in predictions
try
make_clips_generic(file, 1, "MOK_202505-_K1-5_T3", true) #####false
catch x
println(x)
end
end
=#
# Note on the `false` option above: run from a folder close to the wavs to minimise the length of the file name; nothing will be saved if ".." appears in the file name.
#=
function make_clips_generic(
    preds_path::String,
    label::Int, ##= 1, needs looking at here TODO
    id::String,
    unique_file_names = true,
)
    # Assumes function run from Kahurangi Data
    #pth = replace(preds_path, "preds-2024-10-21.csv" => "")
    pth0 = split(preds_path, "/")
    length(pth0) > 1 ? (pth = joinpath(pth0[1:end-1]) * "/") : pth = ""

    function assert_detections_present_(df::DataFrame, label::Int, preds_path)::DataFrame
        label in levels(df.label) ? (return df) :
        @error "No detections for label = $label at $preds_path"
    end

    # Load and group data frame by file
    gdf =
        #! format: off
        DataFrames.DataFrame(CSV.File(preds_path)) |>
        x -> assert_not_empty(x, preds_path) |>
        x -> rename_column!(x, "1.0", "label") |> #can remove now, for old opensoundscape kiwi model needs to be label
        x -> rename_column!(x, "Kiwi", "label") |> #for new kiwi model needs to be label
        x -> assert_detections_present_(x, label, preds_path) |>
        x -> filter_positives!(x, label) |>
        group_by_file!
        #! format: on
    # Make clip and spectrogram
    for (k, v) in pairs(gdf)
        #file_name = chop(v.file[1], head = 2, tail = 4)
        file_name, extension = path_to_file_string(v.file[1])
        #@info (file_name, extension)
        start_times = v.start_time |> x ->
            convert(Vector{Float64}, x) |>
            #dropmissing(x, disallowmissing = true) |> ######CHECK used to make cobb work. not working anymore, but convert works fine. This happens because the col type of dataframe is Float64? even though no missings, seems to ony happen with doc recorders
            sort
        detections = cluster_detections(start_times)
        isempty(detections) && continue

        signal, freq = WAV.wavread("$pth$(file_name).$(extension)")
        if size(signal, 2) == 2
            signal = (signal[:, 1] + signal[:, 2]) / 2
        end
        if freq > 8000
            signal, freq = resample_to_8000hz(signal, freq)
        end
        freq = freq |> Float32
        length_signal = length(signal)

        for detection in detections
            st, en = calculate_clip_start_end(detection, freq, length_signal)
            if unique_file_names == true
                name = "$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))" #leave off path, not necesaray if unique file names
            else
                p = replace(pth, "/" => "--") #replace / with -- including trailing /
                name = "$p$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"
            end
            f = "Clips_$(id)_$(today())"
            mkpath(f)
            outfile = "$f/$name"

            sample = signal[Int(st):Int(en)]
            WAV.wavwrite(sample, "$outfile.wav", Fs = Int(freq))

            image = get_image_from_sample(sample, freq)
            PNGFiles.save("$outfile.png", image)
        end
        print(".")
    end
    print(".")
end
=#

#make_clips_clusters(glob("*/"), "preds3_opensoundscape-kiwi-1.2_2025-07-09.csv")
#make_clips_clusters(glob("*/"), "predsST_opensoundscape-kiwi-1.5_2025-07-09.csv")
# Valid file names follow this pattern: preds3_opensoundscape-kiwi-1.2_2025-07-09.csv

"""
    make_clips_clusters(clusters::Vector{String}, csv_name::String)::Nothing

Run `make_clips_generic` for every label column of every `*/csv_name` preds file
found inside each folder listed in `clusters`.

`csv_name` must follow the pattern `preds3_opensoundscape-kiwi-1.2_2025-07-09.csv`:
three `_` separated parts, where the first holds "preds" followed by the
sensitivity, the second holds the model name, and the third is a 14 character
`yyyy-mm-dd.csv`. An `ArgumentError` is thrown otherwise.

The columns `file`, `start_time` and `end_time` are not treated as labels. Errors
raised while clipping one label are logged with `@info` and the loop carries on.
"""
function make_clips_clusters(
    clusters::Vector{String},
    csv_name::String,
)::Nothing
    parts = split(csv_name, "_")
    valid =
        length(parts) == 3 &&
        occursin("preds", parts[1]) &&
        occursin("opensoundscape", parts[2]) &&
        length(parts[3]) == 14 &&
        occursin(".csv", parts[3])
    valid || throw(
        ArgumentError(
            "csv_name must look like preds3_opensoundscape-kiwi-1.2_2025-07-09.csv, got $csv_name",
        ),
    )

    # make_clips_generic wants plain Strings, not the SubStrings returned by split
    model = String(parts[2])
    sensitivity = String(replace(first(parts), "preds" => ""))

    metadata_columns = ("file", "start_time", "end_time")
    for cluster in clusters
        # the do-block form restores the working directory even if something throws
        cd(cluster) do
            for pred in Glob.glob("*/$csv_name")
                labels = filter(!in(metadata_columns), names(CSV.read(pred, DataFrame)))
                for ebird in labels
                    try
                        make_clips_generic(pred, ebird, model, sensitivity, true)
                    catch e
                        @info e
                    end
                end
            end
        end
    end
    return nothing
end

"""
    make_clips_generic(preds_path, label, model_name, sensitivity, unique_file_names = true)

Write a wav clip and a png spectrogram for every cluster of detections of `label`
listed in the preds csv at `preds_path`.

# Arguments
- `preds_path`: path to the preds csv; the wav files named in its `file` column are
  read from the folder that holds the csv.
- `label`: header of the csv column holding the predictions, ie ebird code or
  "Kiwi"; rows where that column equals 1 are treated as detections.
- `model_name`: used in the output folder name, `Clips_<model_name>_<today>`.
- `sensitivity`: currently unused. NOTE(review): presumably meant to become part of
  the output folder name - confirm.
- `unique_file_names`: when `true` clips are named after the wav file only;
  otherwise the folder path, with `/` replaced by `--`, is put in front.

Throws when the csv is empty, has no `label` column or holds no detections, so wrap
calls in a try/catch.
"""
function make_clips_generic(
    preds_path::String,
    label::String, ##column header, ie ebird or "Kiwi"
    model_name::String,
    sensitivity::String,
    unique_file_names = true,
)
    # Folder holding the csv, with a trailing "/", or "" when the csv is in the
    # current folder. dirname keeps the leading "/" of an absolute path.
    dir = dirname(preds_path)
    pth = isempty(dir) ? "" : dir * "/"

    # Load the data frame, keep the detections for `label` and group them by file
    df = DataFrames.DataFrame(CSV.File(preds_path))
    assert_not_empty(df, preds_path)
    label in names(df) || error("No column named $label in $preds_path")
    1 in levels(df[!, label]) ||
        error("No detections for label = $label at $preds_path")
    filter!(label => isequal(1), df)
    gdf = group_by_file!(df)

    # NOTE(review): the output folder does not depend on `label`, so clips made for
    # different labels of the same csv share one folder - confirm this is intended.
    out_dir = "Clips_$(model_name)_$(today())"

    # Make clip and spectrogram, one wav file (group) at a time
    for (_, v) in pairs(gdf)
        file_name, extension = path_to_file_string(v.file[1])
        # convert: the column can be typed Float64? even though it holds no missings,
        # this seems to only happen with doc recorders
        start_times = sort(convert(Vector{Float64}, v.start_time))
        detections = cluster_detections(start_times)
        isempty(detections) && continue

        signal, freq = WAV.wavread("$pth$(file_name).$(extension)")
        if size(signal, 2) == 2
            # stereo: average the two channels down to mono
            signal = (signal[:, 1] + signal[:, 2]) / 2
        end
        if freq > 8000
            signal, freq = resample_to_8000hz(signal, freq)
        end
        freq = freq |> Float32
        length_signal = length(signal)

        # detections is not empty here, so the folder is always needed
        mkpath(out_dir)

        for detection in detections
            st, en = calculate_clip_start_end(detection, freq, length_signal)
            if unique_file_names == true
                name = "$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))" #leave off path, not necessary if unique file names
            else
                p = replace(pth, "/" => "--") #replace / with -- including trailing /
                name = "$p$file_name-$(Int(floor(st/freq)))-$(Int(ceil(en/freq)))"
            end
            outfile = "$out_dir/$name"

            # st and en are Float64 sample positions: round instead of Int(), which
            # throws InexactError when seconds * freq is not a whole number, and
            # never index below the first sample
            first_sample = max(1, round(Int, st))
            last_sample = round(Int, en)
            sample = signal[first_sample:last_sample]
            WAV.wavwrite(sample, "$outfile.wav", Fs = Int(freq))

            image = get_image_from_sample(sample, freq)
            PNGFiles.save("$outfile.png", image)
        end
        print(".")
    end
    print(".")
end